Refactoring extractors
parent 2862a0c5dd
commit c043ba198c
@@ -28,7 +28,7 @@ from src.agenda_culturel.import_tasks.custom_extractors import *

 if __name__ == "__main__":

-    u2e = URL2Events(SimpleDownloader(), LaComedieExtractor())
+    u2e = URL2Events(SimpleDownloader(), lacomedie.CExtractor())
     url = "https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php?action=load_dates_existantes"
     url_human = "https://lacomediedeclermont.com/saison23-24/"
@@ -28,7 +28,7 @@ from src.agenda_culturel.import_tasks.custom_extractors import *

 if __name__ == "__main__":

-    u2e = URL2Events(SimpleDownloader(), LaCoopeExtractor())
+    u2e = URL2Events(SimpleDownloader(), lacoope.CExtractor())
     url = "https://www.lacoope.org/concerts-calendrier/"
     url_human = "https://www.lacoope.org/concerts-calendrier/"
@@ -28,7 +28,7 @@ from src.agenda_culturel.import_tasks.custom_extractors import *

 if __name__ == "__main__":

-    u2e = URL2Events(SimpleDownloader(), LaPuceALOreilleExtractor())
+    u2e = URL2Events(SimpleDownloader(), lapucealoreille.CExtractor())
     url = "https://www.lapucealoreille63.fr/programmation/"
     url_human = "https://www.lapucealoreille63.fr/programmation/"
@@ -28,7 +28,7 @@ from src.agenda_culturel.import_tasks.custom_extractors import *

 if __name__ == "__main__":

-    u2e = URL2Events(SimpleDownloader(), LeFotomatExtractor())
+    u2e = URL2Events(SimpleDownloader(), lefotomat.CExtractor())
     url = "https://www.lefotomat.com/feed"
     url_human = "https://www.lefotomat.com/"
@@ -102,13 +102,13 @@ def run_recurrent_import(self, pk):
     elif rimport.processor == RecurrentImport.PROCESSOR.ICALNOVC:
         extractor = ICALNoVCExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.LACOOPE:
-        extractor = LaCoopeExtractor()
+        extractor = lacoope.CExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.LACOMEDIE:
-        extractor = LaComedieExtractor()
+        extractor = lacomedie.CExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.LEFOTOMAT:
-        extractor = LeFotomatExtractor()
+        extractor = lefotomat.CExtractor()
    elif rimport.processor == RecurrentImport.PROCESSOR.LAPUCEALOREILLE:
-        extractor = LaPuceALOreilleExtractor()
+        extractor = lapucealoreille.CExtractor()
     else:
         extractor = None
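Since the refactoring gives every venue extractor the same class name, CExtractor, the branch-per-venue dispatch above could also be written as a table lookup. A minimal sketch of that alternative (not what this commit does; the CUSTOM_EXTRACTORS dict is hypothetical, and rimport is the variable from the surrounding function):

    # Hypothetical table-driven dispatch; assumes the modules added in this commit.
    from src.agenda_culturel.import_tasks.custom_extractors import (
        lacoope, lacomedie, lefotomat, lapucealoreille,
    )

    CUSTOM_EXTRACTORS = {
        RecurrentImport.PROCESSOR.LACOOPE: lacoope.CExtractor,
        RecurrentImport.PROCESSOR.LACOMEDIE: lacomedie.CExtractor,
        RecurrentImport.PROCESSOR.LEFOTOMAT: lefotomat.CExtractor,
        RecurrentImport.PROCESSOR.LAPUCEALOREILLE: lapucealoreille.CExtractor,
    }

    # Falls back to None exactly like the else branch above.
    extractor_class = CUSTOM_EXTRACTORS.get(rimport.processor)
    extractor = extractor_class() if extractor_class else None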
@ -1,266 +0,0 @@
|
||||
|
||||
from .generic_extractors import *
|
||||
import re
|
||||
import json5
|
||||
from datetime import timedelta
|
||||
|
||||
# A class dedicated to get events from La Coopérative de Mai:
|
||||
# URL: https://www.lacoope.org/concerts-calendrier/
|
||||
class LaCoopeExtractor(TwoStepsExtractor):
|
||||
|
||||
nom_lieu = "La Coopérative de Mai"
|
||||
|
||||
def build_event_url_list(self, content):
|
||||
soup = BeautifulSoup(content, "html.parser")
|
||||
script = soup.find('div', class_="js-filter__results").findChildren('script')
|
||||
if len(script) == 0:
|
||||
raise Exception("Cannot find events in the first page")
|
||||
script = script[0]
|
||||
search = re.search(r"window.fullCalendarContent = (.*)</script>", str(script), re.S)
|
||||
if search:
|
||||
data = json5.loads(search.group(1))
|
||||
for e in data['events']:
|
||||
self.add_event_url(e['url'])
|
||||
if e['tag'] == "Gratuit":
|
||||
self.add_event_tag(e['url'], 'gratuit')
|
||||
|
||||
else:
|
||||
raise Exception('Cannot extract events from javascript')
|
||||
|
||||
|
||||
def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
|
||||
soup = BeautifulSoup(event_content, "html.parser")
|
||||
|
||||
title = soup.find("h1").contents[0]
|
||||
category = "Concert"
|
||||
image = soup.find("meta", property="og:image")
|
||||
if image:
|
||||
image = image["content"]
|
||||
|
||||
description = soup.find("div", class_="grid-concert-content")
|
||||
if description:
|
||||
description = description.find('div', class_="content-striped")
|
||||
if description:
|
||||
description = description.find('div', class_='wysiwyg')
|
||||
if description:
|
||||
description = description.get_text()
|
||||
if description is None:
|
||||
description = ""
|
||||
|
||||
tags = []
|
||||
|
||||
link_calendar = soup.select('a[href^="https://calendar.google.com/calendar/"]')
|
||||
if len(link_calendar) == 0:
|
||||
raise Exception('Cannot find the google calendar url')
|
||||
|
||||
gg_cal = GGCalendar(link_calendar[0]["href"])
|
||||
start_day = gg_cal.start_day
|
||||
start_time = gg_cal.start_time
|
||||
end_day = gg_cal.end_day
|
||||
end_time = gg_cal.end_time
|
||||
location = LaCoopeExtractor.nom_lieu
|
||||
url_human = event_url
|
||||
|
||||
self.add_event_with_props(event_url, title, category, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
|
||||
|
||||
|
||||
# A class dedicated to get events from La Coopérative de Mai:
|
||||
# URL: https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php?action=load_dates_existantes
|
||||
# URL pour les humains: https://lacomediedeclermont.com/saison23-24/
|
||||
class LaComedieExtractor(TwoStepsExtractor):
|
||||
|
||||
nom_lieu = "La Comédie de Clermont"
|
||||
|
||||
def category_comedie2agenda(self, category):
|
||||
mapping = { "Théâtre": "Théâtre", "Danse": "Danse", "Rencontre": "Autre", "Sortie de résidence": "Autre", "PopCorn Live": "Autre"}
|
||||
if category in mapping:
|
||||
return mapping[category]
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
|
||||
def build_event_url_list(self, content):
|
||||
dates = json5.loads(content)["data"][0]
|
||||
|
||||
url = self.url.split("?")[0]
|
||||
for d in list(set(dates)):
|
||||
if not self.only_future or self.now <= datetime.date.fromisoformat(d):
|
||||
events = self.downloader.get_content(url, post={'action': "load_evenements_jour", "jour": d})
|
||||
if events:
|
||||
events = json5.loads(events)
|
||||
if "data" in events:
|
||||
events = events["data"][0]
|
||||
soup = BeautifulSoup(events, "html.parser")
|
||||
events = soup.select("div.unedatedev")
|
||||
for e in events:
|
||||
e_url = e.select('a')[0]["href"] + "#" + d # a "fake" url specific for each day of this show
|
||||
self.add_event_url(e_url)
|
||||
self.add_event_start_day(e_url, d)
|
||||
t = str(e.select('div#datecal')[0]).split(' ')[-1].split('<')[0]
|
||||
self.add_event_start_time(e_url, t)
|
||||
title = e.select('a')[0].contents[0]
|
||||
self.add_event_title(e_url, title)
|
||||
category = e.select("div#lieuevtcal span")
|
||||
if len(category) > 0:
|
||||
category = self.category_comedie2agenda(category[-1].contents[0])
|
||||
if category is not None:
|
||||
self.add_event_category(e_url, category)
|
||||
location = e.select("div#lieuevtcal")[0].contents[-1].split("•")[-1]
|
||||
self.add_event_location(e_url, location)
|
||||
|
||||
|
||||
|
||||
|
||||
def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
|
||||
soup = BeautifulSoup(event_content, "html.parser")
|
||||
|
||||
image = soup.select("#imgspec img")
|
||||
if image:
|
||||
image = image[0]["src"]
|
||||
else:
|
||||
image = None
|
||||
|
||||
description = soup.select("#descspec")[0].get_text().replace("Lire plus...", "")
|
||||
|
||||
url_human = event_url
|
||||
|
||||
self.add_event_with_props(event_url, None, None, None, None, description, [], recurrences=None, uuid=event_url, url_human=url_human, published=published, image=image)
|
||||
|
||||
|
||||
|
||||
|
||||
# A class dedicated to get events from Le Fotomat'
|
||||
# URL: https://www.lefotomat.com/
|
||||
class LeFotomatExtractor(TwoStepsExtractor):
|
||||
|
||||
nom_lieu = "Le Fotomat'"
|
||||
|
||||
def category_fotomat2agenda(self, category):
|
||||
if not category:
|
||||
return None
|
||||
mapping = { "Concerts": "Concert"}
|
||||
if category in mapping:
|
||||
return mapping[category]
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
def build_event_url_list(self, content):
|
||||
soup = BeautifulSoup(content, "xml")
|
||||
|
||||
events = soup.select("item")
|
||||
for e in events:
|
||||
e_url = e.find("link").contents[0]
|
||||
self.add_event_url(e_url)
|
||||
|
||||
title = e.find("title").contents[0]
|
||||
self.add_event_title(e_url, title)
|
||||
|
||||
category = self.category_fotomat2agenda(e.find("category").contents[0])
|
||||
if category:
|
||||
self.add_event_category(e_url, category)
|
||||
|
||||
|
||||
|
||||
def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
|
||||
soup = BeautifulSoup(event_content, "html.parser")
|
||||
image = soup.select("div.post-content img.wp-post-image")
|
||||
if image:
|
||||
image = image[0]["src"]
|
||||
else:
|
||||
image = None
|
||||
desc = soup.select("head meta[name=description]")[0]["content"]
|
||||
start_day = self.parse_french_date(desc.split("-")[0])
|
||||
start_time = self.parse_french_time(desc.split("-")[1])
|
||||
end_time = self.parse_french_time(desc.split("-")[2])
|
||||
end_day = self.guess_end_day(start_day, start_time, end_time)
|
||||
|
||||
location = self.nom_lieu
|
||||
descriptions = soup.select("div.vce-col-content")
|
||||
if descriptions:
|
||||
descriptions = [d.get_text() for d in descriptions]
|
||||
description = max(descriptions, key=len)
|
||||
else:
|
||||
description = None
|
||||
|
||||
article = soup.select("article.post")
|
||||
tags = []
|
||||
for c in article[0]["class"]:
|
||||
if c.startswith("category-"):
|
||||
tag = '-'.join(c.split("-")[1:])
|
||||
if tag != "concerts":
|
||||
tags.append(tag)
|
||||
|
||||
url_human = event_url
|
||||
|
||||
self.add_event_with_props(event_url, None, None, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
|
||||
|
||||
|
||||
# A class dedicated to get events from La puce à l'oreille
|
||||
# URL: https://www.lapucealoreille63.fr/
|
||||
class LaPuceALOreilleExtractor(TwoStepsExtractor):
|
||||
|
||||
nom_lieu = "La Puce à l'Oreille"
|
||||
|
||||
def build_event_url_list(self, content):
|
||||
soup = BeautifulSoup(content, "html.parser")
|
||||
|
||||
events = soup.select("div.SPY_vo div[data-testid=mesh-container-content]")
|
||||
for e in events:
|
||||
e_url = e.find("a")
|
||||
if e_url:
|
||||
if self.add_event_url(e_url["href"]):
|
||||
title = e.select("div[data-testid=richTextElement] h1.font_0 span")
|
||||
if title:
|
||||
title = title[0].contents[0].get_text().replace("\n", " ")
|
||||
title = re.sub(" +", " ", title)
|
||||
self.add_event_title(e_url["href"], title)
|
||||
|
||||
|
||||
def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
|
||||
soup = BeautifulSoup(event_content, "html.parser")
|
||||
|
||||
start_day = self.parse_french_date(soup.find("h2").get_text()) # pas parfait, mais bordel que ce site est mal construit
|
||||
|
||||
spans = soup.select("div[data-testid=richTextElement] span")
|
||||
start_time = None
|
||||
end_time = None
|
||||
location = None
|
||||
|
||||
for span in spans:
|
||||
txt = span.get_text()
|
||||
if txt.lstrip().startswith("DÉBUT"):
|
||||
start_time = self.parse_french_time(txt.split(":")[-1])
|
||||
end_time = None
|
||||
elif txt.lstrip().startswith("HORAIRES :"):
|
||||
hs = txt.split(":")[-1].split("-")
|
||||
start_time = self.parse_french_time(hs[0])
|
||||
if len(hs) > 1:
|
||||
end_time = self.parse_french_time(hs[1])
|
||||
else:
|
||||
end_time = None
|
||||
elif txt.lstrip().startswith("LIEU :") and not location:
|
||||
location = txt.split(":")[-1].lstrip()
|
||||
|
||||
if not location:
|
||||
location = self.nom_lieu
|
||||
end_day = self.guess_end_day(start_day, start_time, end_time)
|
||||
|
||||
url_human = event_url
|
||||
tags = []
|
||||
|
||||
image = soup.select("wow-image img[fetchpriority=high]")
|
||||
if image:
|
||||
image = image[0]["src"]
|
||||
else:
|
||||
image = None
|
||||
|
||||
descriptions = soup.select("div[data-testid=mesh-container-content] div[data-testid=inline-content] div[data-testid=mesh-container-content] div[data-testid=richTextElement]")
|
||||
if descriptions:
|
||||
descriptions = [d.get_text() for d in descriptions]
|
||||
description = max(descriptions, key=len)
|
||||
else:
|
||||
description = None
|
||||
|
||||
self.add_event_with_props(event_url, None, "Concert", start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
|
@@ -0,0 +1,4 @@
+from os.path import dirname, basename, isfile, join
+import glob
+modules = glob.glob(join(dirname(__file__), "*.py"))
+__all__ = [basename(f)[:-3] for f in modules if isfile(f) and not f.endswith('__init__.py')]
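This new __init__.py makes the package self-enumerating: __all__ lists every sibling .py module, so a star import exposes each venue module under its file name. A minimal usage sketch, assuming the package path used by the test scripts above:

    # The glob-based __all__ lets callers discover venue modules by file name.
    from src.agenda_culturel.import_tasks.custom_extractors import *

    extractor = lacoope.CExtractor()  # 'lacoope' comes from lacoope.py via __all__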
@@ -0,0 +1,69 @@
+from ..generic_extractors import *
+import re
+import json5
+from datetime import timedelta
+
+
+# A class dedicated to get events from La Comédie de Clermont:
+# URL: https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php?action=load_dates_existantes
+# Human-readable URL: https://lacomediedeclermont.com/saison23-24/
+class CExtractor(TwoStepsExtractor):
+
+    nom_lieu = "La Comédie de Clermont"
+
+    def category_comedie2agenda(self, category):
+        mapping = {"Théâtre": "Théâtre", "Danse": "Danse", "Rencontre": "Autre", "Sortie de résidence": "Autre", "PopCorn Live": "Autre"}
+        if category in mapping:
+            return mapping[category]
+        else:
+            return None
+
+    def build_event_url_list(self, content):
+        dates = json5.loads(content)["data"][0]
+
+        url = self.url.split("?")[0]
+        for d in list(set(dates)):
+            if not self.only_future or self.now <= datetime.date.fromisoformat(d):
+                events = self.downloader.get_content(url, post={'action': "load_evenements_jour", "jour": d})
+                if events:
+                    events = json5.loads(events)
+                    if "data" in events:
+                        events = events["data"][0]
+                        soup = BeautifulSoup(events, "html.parser")
+                        events = soup.select("div.unedatedev")
+                        for e in events:
+                            e_url = e.select('a')[0]["href"] + "#" + d  # a "fake" url specific for each day of this show
+                            self.add_event_url(e_url)
+                            self.add_event_start_day(e_url, d)
+                            t = str(e.select('div#datecal')[0]).split(' ')[-1].split('<')[0]
+                            self.add_event_start_time(e_url, t)
+                            title = e.select('a')[0].contents[0]
+                            self.add_event_title(e_url, title)
+                            category = e.select("div#lieuevtcal span")
+                            if len(category) > 0:
+                                category = self.category_comedie2agenda(category[-1].contents[0])
+                                if category is not None:
+                                    self.add_event_category(e_url, category)
+                            location = e.select("div#lieuevtcal")[0].contents[-1].split("•")[-1]
+                            self.add_event_location(e_url, location)
+
+    def add_event_from_content(self, event_content, event_url, url_human=None, default_values=None, published=False):
+        soup = BeautifulSoup(event_content, "html.parser")
+
+        image = soup.select("#imgspec img")
+        if image:
+            image = image[0]["src"]
+        else:
+            image = None
+
+        description = soup.select("#descspec")[0].get_text().replace("Lire plus...", "")
+
+        url_human = event_url
+
+        self.add_event_with_props(event_url, None, None, None, None, description, [], recurrences=None, uuid=event_url, url_human=url_human, published=published, image=image)
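This extractor drives the theatre's WordPress AJAX endpoint in two steps: one request lists the dates that have shows, then one POST per date returns that day's HTML fragment. A standalone sketch of the same flow with requests; the endpoint comes from the URLs in this commit, and the response shapes are inferred from the parsing code above:

    import json5
    import requests

    # Response shapes inferred from the extractor: both steps wrap their
    # payload in {"data": [...]} and the useful part is data[0].
    AJAX = "https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php"

    dates = json5.loads(
        requests.get(AJAX, params={"action": "load_dates_existantes"}).text
    )["data"][0]

    for d in sorted(set(dates)):
        r = requests.post(AJAX, data={"action": "load_evenements_jour", "jour": d})
        day_html = json5.loads(r.text)["data"][0]  # HTML with one div.unedatedev per show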
@@ -0,0 +1,64 @@
+from ..generic_extractors import *
+import re
+import json5
+from datetime import timedelta
+
+
+# A class dedicated to get events from La Coopérative de Mai:
+# URL: https://www.lacoope.org/concerts-calendrier/
+class CExtractor(TwoStepsExtractor):
+
+    nom_lieu = "La Coopérative de Mai"
+
+    def build_event_url_list(self, content):
+        soup = BeautifulSoup(content, "html.parser")
+        script = soup.find('div', class_="js-filter__results").findChildren('script')
+        if len(script) == 0:
+            raise Exception("Cannot find events in the first page")
+        script = script[0]
+        search = re.search(r"window.fullCalendarContent = (.*)</script>", str(script), re.S)
+        if search:
+            data = json5.loads(search.group(1))
+            for e in data['events']:
+                self.add_event_url(e['url'])
+                if e['tag'] == "Gratuit":
+                    self.add_event_tag(e['url'], 'gratuit')
+        else:
+            raise Exception('Cannot extract events from javascript')
+
+    def add_event_from_content(self, event_content, event_url, url_human=None, default_values=None, published=False):
+        soup = BeautifulSoup(event_content, "html.parser")
+
+        title = soup.find("h1").contents[0]
+        category = "Concert"
+        image = soup.find("meta", property="og:image")
+        if image:
+            image = image["content"]
+
+        description = soup.find("div", class_="grid-concert-content")
+        if description:
+            description = description.find('div', class_="content-striped")
+            if description:
+                description = description.find('div', class_='wysiwyg')
+                if description:
+                    description = description.get_text()
+        if description is None:
+            description = ""
+
+        tags = []
+
+        link_calendar = soup.select('a[href^="https://calendar.google.com/calendar/"]')
+        if len(link_calendar) == 0:
+            raise Exception('Cannot find the google calendar url')
+
+        gg_cal = GGCalendar(link_calendar[0]["href"])
+        start_day = gg_cal.start_day
+        start_time = gg_cal.start_time
+        end_day = gg_cal.end_day
+        end_time = gg_cal.end_time
+        location = CExtractor.nom_lieu
+        url_human = event_url
+
+        self.add_event_with_props(event_url, title, category, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
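The regex in build_event_url_list captures the JavaScript object literal assigned to window.fullCalendarContent; json5 is used rather than json because such literals may carry unquoted keys, single quotes, or trailing commas. A self-contained illustration with made-up data:

    import re
    import json5

    # Made-up script content; unquoted keys, single quotes and the trailing
    # comma are all valid JSON5 but would be rejected by the json module.
    script = ("<script>window.fullCalendarContent = "
              "{events: [{url: 'https://example.org/a', tag: 'Gratuit'},]}</script>")

    search = re.search(r"window.fullCalendarContent = (.*)</script>", script, re.S)
    data = json5.loads(search.group(1))
    assert data["events"][0]["tag"] == "Gratuit"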
@@ -0,0 +1,73 @@
+from ..generic_extractors import *
+import re
+import json5
+from datetime import timedelta
+
+
+# A class dedicated to get events from La puce à l'oreille
+# URL: https://www.lapucealoreille63.fr/
+class CExtractor(TwoStepsExtractor):
+
+    nom_lieu = "La Puce à l'Oreille"
+
+    def build_event_url_list(self, content):
+        soup = BeautifulSoup(content, "html.parser")
+
+        events = soup.select("div.SPY_vo div[data-testid=mesh-container-content]")
+        for e in events:
+            e_url = e.find("a")
+            if e_url:
+                if self.add_event_url(e_url["href"]):
+                    title = e.select("div[data-testid=richTextElement] h1.font_0 span")
+                    if title:
+                        title = title[0].contents[0].get_text().replace("\n", " ")
+                        title = re.sub(" +", " ", title)
+                        self.add_event_title(e_url["href"], title)
+
+    def add_event_from_content(self, event_content, event_url, url_human=None, default_values=None, published=False):
+        soup = BeautifulSoup(event_content, "html.parser")
+
+        start_day = self.parse_french_date(soup.find("h2").get_text())  # not perfect, but damn, this website is badly built
+
+        spans = soup.select("div[data-testid=richTextElement] span")
+        start_time = None
+        end_time = None
+        location = None
+
+        for span in spans:
+            txt = span.get_text()
+            if txt.lstrip().startswith("DÉBUT"):
+                start_time = self.parse_french_time(txt.split(":")[-1])
+                end_time = None
+            elif txt.lstrip().startswith("HORAIRES :"):
+                hs = txt.split(":")[-1].split("-")
+                start_time = self.parse_french_time(hs[0])
+                if len(hs) > 1:
+                    end_time = self.parse_french_time(hs[1])
+                else:
+                    end_time = None
+            elif txt.lstrip().startswith("LIEU :") and not location:
+                location = txt.split(":")[-1].lstrip()
+
+        if not location:
+            location = self.nom_lieu
+        end_day = self.guess_end_day(start_day, start_time, end_time)
+
+        url_human = event_url
+        tags = []
+
+        image = soup.select("wow-image img[fetchpriority=high]")
+        if image:
+            image = image[0]["src"]
+        else:
+            image = None
+
+        descriptions = soup.select("div[data-testid=mesh-container-content] div[data-testid=inline-content] div[data-testid=mesh-container-content] div[data-testid=richTextElement]")
+        if descriptions:
+            descriptions = [d.get_text() for d in descriptions]
+            description = max(descriptions, key=len)
+        else:
+            description = None
+
+        self.add_event_with_props(event_url, None, "Concert", start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
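The span loop above keys on the French labels DÉBUT, HORAIRES and LIEU, then delegates to the project's parse_french_time helper. For reference, a self-contained sketch of parsing the "20H30" style times those labels typically carry (the exact input format is an assumption; parse_hhmm is a hypothetical stand-in, not the project helper):

    import re
    from datetime import time

    def parse_hhmm(txt):
        """Parse French-style times such as '20H30' or '19h' into datetime.time."""
        m = re.search(r"(\d{1,2})\s*[Hh]\s*(\d{2})?", txt)
        if m is None:
            return None
        return time(int(m.group(1)), int(m.group(2) or 0))

    assert parse_hhmm(" 20H30 ") == time(20, 30)
    # Same slicing as the HORAIRES branch above: label split, then range split.
    assert parse_hhmm("HORAIRES : 19h - 22h".split(":")[-1].split("-")[0]) == time(19, 0)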
@@ -0,0 +1,72 @@
+from ..generic_extractors import *
+import re
+import json5
+from datetime import timedelta
+
+
+# A class dedicated to get events from Le Fotomat'
+# URL: https://www.lefotomat.com/
+class CExtractor(TwoStepsExtractor):
+
+    nom_lieu = "Le Fotomat'"
+
+    def category_fotomat2agenda(self, category):
+        if not category:
+            return None
+        mapping = {"Concerts": "Concert"}
+        if category in mapping:
+            return mapping[category]
+        else:
+            return None
+
+    def build_event_url_list(self, content):
+        soup = BeautifulSoup(content, "xml")
+
+        events = soup.select("item")
+        for e in events:
+            e_url = e.find("link").contents[0]
+            self.add_event_url(e_url)
+
+            title = e.find("title").contents[0]
+            self.add_event_title(e_url, title)
+
+            category = self.category_fotomat2agenda(e.find("category").contents[0])
+            if category:
+                self.add_event_category(e_url, category)
+
+    def add_event_from_content(self, event_content, event_url, url_human=None, default_values=None, published=False):
+        soup = BeautifulSoup(event_content, "html.parser")
+        image = soup.select("div.post-content img.wp-post-image")
+        if image:
+            image = image[0]["src"]
+        else:
+            image = None
+        desc = soup.select("head meta[name=description]")[0]["content"]
+        start_day = self.parse_french_date(desc.split("-")[0])
+        start_time = self.parse_french_time(desc.split("-")[1])
+        end_time = self.parse_french_time(desc.split("-")[2])
+        end_day = self.guess_end_day(start_day, start_time, end_time)
+
+        location = self.nom_lieu
+        descriptions = soup.select("div.vce-col-content")
+        if descriptions:
+            descriptions = [d.get_text() for d in descriptions]
+            description = max(descriptions, key=len)
+        else:
+            description = None
+
+        article = soup.select("article.post")
+        tags = []
+        for c in article[0]["class"]:
+            if c.startswith("category-"):
+                tag = '-'.join(c.split("-")[1:])
+                if tag != "concerts":
+                    tags.append(tag)
+
+        url_human = event_url
+
+        self.add_event_with_props(event_url, None, None, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
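The small category-mapping helpers repeated across these modules (category_fotomat2agenda, category_comedie2agenda) all follow the same look-up-or-None pattern, which dict.get expresses directly. A minimal equivalent sketch:

    # Equivalent to the if/else chain in category_fotomat2agenda above:
    # dict.get returns None for unknown categories, matching the explicit branches.
    def category_fotomat2agenda(category):
        return {"Concerts": "Concert"}.get(category) if category else None

    assert category_fotomat2agenda("Concerts") == "Concert"
    assert category_fotomat2agenda("Expo") is None
    assert category_fotomat2agenda(None) is None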
@@ -3,6 +3,7 @@
     <tr>
         <th rowspan="2">Identifiant</th>
         <th rowspan="2">Date</th>
         <th rowspan="2">Source</th>
         <th rowspan="2">Status</th>
+        <th rowspan="2">Action</th>
         <th colspan="4">événements</th>
@@ -19,6 +20,7 @@
     <tr>
         <td>{{ obj.id }}</td>
         <td>{{ obj.created_date }}</td>
         <td>{% if obj.recurrentImport %}<a href="{{ obj.recurrentImport.get_absolute_url }}">{{ obj.recurrentImport.name }}</a>{% else %}-{% endif %}</td>
         <td><span{% if obj.status == "failed" %} data-tooltip="{{ obj.error_message }}"{% endif %}>{{ obj.status }}</span></td>
+        <td>{% if obj.status == "running" %}<a href="{% url 'cancel_import' obj.id %}">Annuler</a>{% endif %}</td>
         <td>{% if obj.status == "success" %}{{ obj.nb_initial }}{% endif %}</td>
Loading…
Reference in New Issue
Block a user