Refactoring extractors

commit c043ba198c (parent 2862a0c5dd)

Each venue-specific extractor moves out of the monolithic custom_extractors.py into its own module of the custom_extractors package (lacoope, lacomedie, lefotomat, lapucealoreille), each exposing a CExtractor class; call sites are updated accordingly.
@@ -28,7 +28,7 @@ from src.agenda_culturel.import_tasks.custom_extractors import *
 
 if __name__ == "__main__":
 
-    u2e = URL2Events(SimpleDownloader(), LaComedieExtractor())
+    u2e = URL2Events(SimpleDownloader(), lacomedie.CExtractor())
     url = "https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php?action=load_dates_existantes"
     url_human = "https://lacomediedeclermont.com/saison23-24/"
 
@@ -28,7 +28,7 @@ from src.agenda_culturel.import_tasks.custom_extractors import *
 
 if __name__ == "__main__":
 
-    u2e = URL2Events(SimpleDownloader(), LaCoopeExtractor())
+    u2e = URL2Events(SimpleDownloader(), lacoope.CExtractor())
     url = "https://www.lacoope.org/concerts-calendrier/"
     url_human = "https://www.lacoope.org/concerts-calendrier/"
 
@@ -28,7 +28,7 @@ from src.agenda_culturel.import_tasks.custom_extractors import *
 
 if __name__ == "__main__":
 
-    u2e = URL2Events(SimpleDownloader(), LaPuceALOreilleExtractor())
+    u2e = URL2Events(SimpleDownloader(), lapucealoreille.CExtractor())
     url = "https://www.lapucealoreille63.fr/programmation/"
     url_human = "https://www.lapucealoreille63.fr/programmation/"
 
@@ -28,7 +28,7 @@ from src.agenda_culturel.import_tasks.custom_extractors import *
 
 if __name__ == "__main__":
 
-    u2e = URL2Events(SimpleDownloader(), LeFotomatExtractor())
+    u2e = URL2Events(SimpleDownloader(), lefotomat.CExtractor())
     url = "https://www.lefotomat.com/feed"
     url_human = "https://www.lefotomat.com/"
 
@@ -102,13 +102,13 @@ def run_recurrent_import(self, pk):
     elif rimport.processor == RecurrentImport.PROCESSOR.ICALNOVC:
         extractor = ICALNoVCExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.LACOOPE:
-        extractor = LaCoopeExtractor()
+        extractor = lacoope.CExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.LACOMEDIE:
-        extractor = LaComedieExtractor()
+        extractor = lacomedie.CExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.LEFOTOMAT:
-        extractor = LeFotomatExtractor()
+        extractor = lefotomat.CExtractor()
    elif rimport.processor == RecurrentImport.PROCESSOR.LAPUCEALOREILLE:
-        extractor = LaPuceALOreilleExtractor()
+        extractor = lapucealoreille.CExtractor()
     else:
         extractor = None
 
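The dispatch above maps each `RecurrentImport.PROCESSOR` value to the `CExtractor` of the matching venue module. For illustration only, a minimal table-driven sketch of the same logic (`build_extractor` is a hypothetical helper, not part of this commit):

```python
# Hypothetical sketch: behaviorally equivalent to the elif chain above.
def build_extractor(processor):
    mapping = {
        RecurrentImport.PROCESSOR.ICALNOVC: ICALNoVCExtractor,
        RecurrentImport.PROCESSOR.LACOOPE: lacoope.CExtractor,
        RecurrentImport.PROCESSOR.LACOMEDIE: lacomedie.CExtractor,
        RecurrentImport.PROCESSOR.LEFOTOMAT: lefotomat.CExtractor,
        RecurrentImport.PROCESSOR.LAPUCEALOREILLE: lapucealoreille.CExtractor,
    }
    cls = mapping.get(processor)
    return cls() if cls is not None else None
```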
@@ -1,266 +0,0 @@
-from .generic_extractors import *
-import re
-import json5
-from datetime import timedelta
-
-
-# A class dedicated to get events from La Coopérative de Mai:
-# URL: https://www.lacoope.org/concerts-calendrier/
-class LaCoopeExtractor(TwoStepsExtractor):
-
-    nom_lieu = "La Coopérative de Mai"
-
-    def build_event_url_list(self, content):
-        soup = BeautifulSoup(content, "html.parser")
-        script = soup.find('div', class_="js-filter__results").findChildren('script')
-        if len(script) == 0:
-            raise Exception("Cannot find events in the first page")
-        script = script[0]
-        search = re.search(r"window.fullCalendarContent = (.*)</script>", str(script), re.S)
-        if search:
-            data = json5.loads(search.group(1))
-            for e in data['events']:
-                self.add_event_url(e['url'])
-                if e['tag'] == "Gratuit":
-                    self.add_event_tag(e['url'], 'gratuit')
-        else:
-            raise Exception('Cannot extract events from javascript')
-
-    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
-        soup = BeautifulSoup(event_content, "html.parser")
-
-        title = soup.find("h1").contents[0]
-        category = "Concert"
-        image = soup.find("meta", property="og:image")
-        if image:
-            image = image["content"]
-
-        description = soup.find("div", class_="grid-concert-content")
-        if description:
-            description = description.find('div', class_="content-striped")
-            if description:
-                description = description.find('div', class_='wysiwyg')
-                if description:
-                    description = description.get_text()
-        if description is None:
-            description = ""
-
-        tags = []
-
-        link_calendar = soup.select('a[href^="https://calendar.google.com/calendar/"]')
-        if len(link_calendar) == 0:
-            raise Exception('Cannot find the google calendar url')
-
-        gg_cal = GGCalendar(link_calendar[0]["href"])
-        start_day = gg_cal.start_day
-        start_time = gg_cal.start_time
-        end_day = gg_cal.end_day
-        end_time = gg_cal.end_time
-        location = LaCoopeExtractor.nom_lieu
-        url_human = event_url
-
-        self.add_event_with_props(event_url, title, category, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
-
-
-# A class dedicated to get events from La Comédie de Clermont:
-# URL: https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php?action=load_dates_existantes
-# Human-facing URL: https://lacomediedeclermont.com/saison23-24/
-class LaComedieExtractor(TwoStepsExtractor):
-
-    nom_lieu = "La Comédie de Clermont"
-
-    def category_comedie2agenda(self, category):
-        mapping = { "Théâtre": "Théâtre", "Danse": "Danse", "Rencontre": "Autre", "Sortie de résidence": "Autre", "PopCorn Live": "Autre"}
-        if category in mapping:
-            return mapping[category]
-        else:
-            return None
-
-    def build_event_url_list(self, content):
-        dates = json5.loads(content)["data"][0]
-
-        url = self.url.split("?")[0]
-        for d in list(set(dates)):
-            if not self.only_future or self.now <= datetime.date.fromisoformat(d):
-                events = self.downloader.get_content(url, post={'action': "load_evenements_jour", "jour": d})
-                if events:
-                    events = json5.loads(events)
-                    if "data" in events:
-                        events = events["data"][0]
-                        soup = BeautifulSoup(events, "html.parser")
-                        events = soup.select("div.unedatedev")
-                        for e in events:
-                            e_url = e.select('a')[0]["href"] + "#" + d  # a "fake" url specific for each day of this show
-                            self.add_event_url(e_url)
-                            self.add_event_start_day(e_url, d)
-                            t = str(e.select('div#datecal')[0]).split(' ')[-1].split('<')[0]
-                            self.add_event_start_time(e_url, t)
-                            title = e.select('a')[0].contents[0]
-                            self.add_event_title(e_url, title)
-                            category = e.select("div#lieuevtcal span")
-                            if len(category) > 0:
-                                category = self.category_comedie2agenda(category[-1].contents[0])
-                                if category is not None:
-                                    self.add_event_category(e_url, category)
-                            location = e.select("div#lieuevtcal")[0].contents[-1].split("•")[-1]
-                            self.add_event_location(e_url, location)
-
-    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
-        soup = BeautifulSoup(event_content, "html.parser")
-
-        image = soup.select("#imgspec img")
-        if image:
-            image = image[0]["src"]
-        else:
-            image = None
-
-        description = soup.select("#descspec")[0].get_text().replace("Lire plus...", "")
-
-        url_human = event_url
-
-        self.add_event_with_props(event_url, None, None, None, None, description, [], recurrences=None, uuid=event_url, url_human=url_human, published=published, image=image)
-
-
-# A class dedicated to get events from Le Fotomat'
-# URL: https://www.lefotomat.com/
-class LeFotomatExtractor(TwoStepsExtractor):
-
-    nom_lieu = "Le Fotomat'"
-
-    def category_fotomat2agenda(self, category):
-        if not category:
-            return None
-        mapping = { "Concerts": "Concert"}
-        if category in mapping:
-            return mapping[category]
-        else:
-            return None
-
-    def build_event_url_list(self, content):
-        soup = BeautifulSoup(content, "xml")
-
-        events = soup.select("item")
-        for e in events:
-            e_url = e.find("link").contents[0]
-            self.add_event_url(e_url)
-
-            title = e.find("title").contents[0]
-            self.add_event_title(e_url, title)
-
-            category = self.category_fotomat2agenda(e.find("category").contents[0])
-            if category:
-                self.add_event_category(e_url, category)
-
-    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
-        soup = BeautifulSoup(event_content, "html.parser")
-        image = soup.select("div.post-content img.wp-post-image")
-        if image:
-            image = image[0]["src"]
-        else:
-            image = None
-        desc = soup.select("head meta[name=description]")[0]["content"]
-        start_day = self.parse_french_date(desc.split("-")[0])
-        start_time = self.parse_french_time(desc.split("-")[1])
-        end_time = self.parse_french_time(desc.split("-")[2])
-        end_day = self.guess_end_day(start_day, start_time, end_time)
-
-        location = self.nom_lieu
-        descriptions = soup.select("div.vce-col-content")
-        if descriptions:
-            descriptions = [d.get_text() for d in descriptions]
-            description = max(descriptions, key=len)
-        else:
-            description = None
-
-        article = soup.select("article.post")
-        tags = []
-        for c in article[0]["class"]:
-            if c.startswith("category-"):
-                tag = '-'.join(c.split("-")[1:])
-                if tag != "concerts":
-                    tags.append(tag)
-
-        url_human = event_url
-
-        self.add_event_with_props(event_url, None, None, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
-
-
-# A class dedicated to get events from La puce à l'oreille
-# URL: https://www.lapucealoreille63.fr/
-class LaPuceALOreilleExtractor(TwoStepsExtractor):
-
-    nom_lieu = "La Puce à l'Oreille"
-
-    def build_event_url_list(self, content):
-        soup = BeautifulSoup(content, "html.parser")
-
-        events = soup.select("div.SPY_vo div[data-testid=mesh-container-content]")
-        for e in events:
-            e_url = e.find("a")
-            if e_url:
-                if self.add_event_url(e_url["href"]):
-                    title = e.select("div[data-testid=richTextElement] h1.font_0 span")
-                    if title:
-                        title = title[0].contents[0].get_text().replace("\n", " ")
-                        title = re.sub(" +", " ", title)
-                        self.add_event_title(e_url["href"], title)
-
-    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
-        soup = BeautifulSoup(event_content, "html.parser")
-
-        start_day = self.parse_french_date(soup.find("h2").get_text())  # not perfect, but this site really is badly built
-
-        spans = soup.select("div[data-testid=richTextElement] span")
-        start_time = None
-        end_time = None
-        location = None
-
-        for span in spans:
-            txt = span.get_text()
-            if txt.lstrip().startswith("DÉBUT"):
-                start_time = self.parse_french_time(txt.split(":")[-1])
-                end_time = None
-            elif txt.lstrip().startswith("HORAIRES :"):
-                hs = txt.split(":")[-1].split("-")
-                start_time = self.parse_french_time(hs[0])
-                if len(hs) > 1:
-                    end_time = self.parse_french_time(hs[1])
-                else:
-                    end_time = None
-            elif txt.lstrip().startswith("LIEU :") and not location:
-                location = txt.split(":")[-1].lstrip()
-
-        if not location:
-            location = self.nom_lieu
-        end_day = self.guess_end_day(start_day, start_time, end_time)
-
-        url_human = event_url
-        tags = []
-
-        image = soup.select("wow-image img[fetchpriority=high]")
-        if image:
-            image = image[0]["src"]
-        else:
-            image = None
-
-        descriptions = soup.select("div[data-testid=mesh-container-content] div[data-testid=inline-content] div[data-testid=mesh-container-content] div[data-testid=richTextElement]")
-        if descriptions:
-            descriptions = [d.get_text() for d in descriptions]
-            description = max(descriptions, key=len)
-        else:
-            description = None
-
-        self.add_event_with_props(event_url, None, "Concert", start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
@@ -0,0 +1,4 @@
+from os.path import dirname, basename, isfile, join
+import glob
+modules = glob.glob(join(dirname(__file__), "*.py"))
+__all__ = [ basename(f)[:-3] for f in modules if isfile(f) and not f.endswith('__init__.py')]
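This new `__init__.py` builds `__all__` from whatever `*.py` files sit next to it, so the `from src.agenda_culturel.import_tasks.custom_extractors import *` used by the scripts above automatically picks up future venue modules. A short illustration of the effect (module list taken from this commit; ordering depends on the filesystem, and `URL2Events`/`SimpleDownloader` come from the project's import_tasks machinery as in the test scripts above):

```python
# After the star import, each venue module is bound as a name in scope;
# __all__ is e.g. ['lacomedie', 'lacoope', 'lapucealoreille', 'lefotomat'].
from src.agenda_culturel.import_tasks.custom_extractors import *

u2e = URL2Events(SimpleDownloader(), lefotomat.CExtractor())
```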
@@ -0,0 +1,69 @@
+from ..generic_extractors import *
+import re
+import json5
+from datetime import timedelta
+
+
+# A class dedicated to get events from La Comédie de Clermont:
+# URL: https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php?action=load_dates_existantes
+# Human-facing URL: https://lacomediedeclermont.com/saison23-24/
+class CExtractor(TwoStepsExtractor):
+
+    nom_lieu = "La Comédie de Clermont"
+
+    def category_comedie2agenda(self, category):
+        mapping = { "Théâtre": "Théâtre", "Danse": "Danse", "Rencontre": "Autre", "Sortie de résidence": "Autre", "PopCorn Live": "Autre"}
+        if category in mapping:
+            return mapping[category]
+        else:
+            return None
+
+    def build_event_url_list(self, content):
+        dates = json5.loads(content)["data"][0]
+
+        url = self.url.split("?")[0]
+        for d in list(set(dates)):
+            if not self.only_future or self.now <= datetime.date.fromisoformat(d):
+                events = self.downloader.get_content(url, post={'action': "load_evenements_jour", "jour": d})
+                if events:
+                    events = json5.loads(events)
+                    if "data" in events:
+                        events = events["data"][0]
+                        soup = BeautifulSoup(events, "html.parser")
+                        events = soup.select("div.unedatedev")
+                        for e in events:
+                            e_url = e.select('a')[0]["href"] + "#" + d  # a "fake" url specific for each day of this show
+                            self.add_event_url(e_url)
+                            self.add_event_start_day(e_url, d)
+                            t = str(e.select('div#datecal')[0]).split(' ')[-1].split('<')[0]
+                            self.add_event_start_time(e_url, t)
+                            title = e.select('a')[0].contents[0]
+                            self.add_event_title(e_url, title)
+                            category = e.select("div#lieuevtcal span")
+                            if len(category) > 0:
+                                category = self.category_comedie2agenda(category[-1].contents[0])
+                                if category is not None:
+                                    self.add_event_category(e_url, category)
+                            location = e.select("div#lieuevtcal")[0].contents[-1].split("•")[-1]
+                            self.add_event_location(e_url, location)
+
+    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
+        soup = BeautifulSoup(event_content, "html.parser")
+
+        image = soup.select("#imgspec img")
+        if image:
+            image = image[0]["src"]
+        else:
+            image = None
+
+        description = soup.select("#descspec")[0].get_text().replace("Lire plus...", "")
+
+        url_human = event_url
+
+        self.add_event_with_props(event_url, None, None, None, None, description, [], recurrences=None, uuid=event_url, url_human=url_human, published=published, image=image)
@@ -0,0 +1,64 @@
+from ..generic_extractors import *
+import re
+import json5
+from datetime import timedelta
+
+
+# A class dedicated to get events from La Coopérative de Mai:
+# URL: https://www.lacoope.org/concerts-calendrier/
+class CExtractor(TwoStepsExtractor):
+
+    nom_lieu = "La Coopérative de Mai"
+
+    def build_event_url_list(self, content):
+        soup = BeautifulSoup(content, "html.parser")
+        script = soup.find('div', class_="js-filter__results").findChildren('script')
+        if len(script) == 0:
+            raise Exception("Cannot find events in the first page")
+        script = script[0]
+        search = re.search(r"window.fullCalendarContent = (.*)</script>", str(script), re.S)
+        if search:
+            data = json5.loads(search.group(1))
+            for e in data['events']:
+                self.add_event_url(e['url'])
+                if e['tag'] == "Gratuit":
+                    self.add_event_tag(e['url'], 'gratuit')
+        else:
+            raise Exception('Cannot extract events from javascript')
+
+    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
+        soup = BeautifulSoup(event_content, "html.parser")
+
+        title = soup.find("h1").contents[0]
+        category = "Concert"
+        image = soup.find("meta", property="og:image")
+        if image:
+            image = image["content"]
+
+        description = soup.find("div", class_="grid-concert-content")
+        if description:
+            description = description.find('div', class_="content-striped")
+            if description:
+                description = description.find('div', class_='wysiwyg')
+                if description:
+                    description = description.get_text()
+        if description is None:
+            description = ""
+
+        tags = []
+
+        link_calendar = soup.select('a[href^="https://calendar.google.com/calendar/"]')
+        if len(link_calendar) == 0:
+            raise Exception('Cannot find the google calendar url')
+
+        gg_cal = GGCalendar(link_calendar[0]["href"])
+        start_day = gg_cal.start_day
+        start_time = gg_cal.start_time
+        end_day = gg_cal.end_day
+        end_time = gg_cal.end_time
+        location = CExtractor.nom_lieu
+        url_human = event_url
+
+        self.add_event_with_props(event_url, title, category, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
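One note on `build_event_url_list` above: the calendar data is a JavaScript object literal scraped from an inline `<script>`, so it is decoded with `json5`, which unlike the stdlib `json` tolerates single quotes, unquoted keys, and trailing commas. A self-contained illustration (the literal below is invented):

```python
import json5

# A JS-style literal that json.loads would reject:
data = json5.loads("{events: [{url: 'https://example.org/e/1', tag: 'Gratuit'},]}")
for e in data["events"]:
    print(e["url"], e["tag"])  # -> https://example.org/e/1 Gratuit
```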
@@ -0,0 +1,73 @@
+from ..generic_extractors import *
+import re
+import json5
+from datetime import timedelta
+
+
+# A class dedicated to get events from La puce à l'oreille
+# URL: https://www.lapucealoreille63.fr/
+class CExtractor(TwoStepsExtractor):
+
+    nom_lieu = "La Puce à l'Oreille"
+
+    def build_event_url_list(self, content):
+        soup = BeautifulSoup(content, "html.parser")
+
+        events = soup.select("div.SPY_vo div[data-testid=mesh-container-content]")
+        for e in events:
+            e_url = e.find("a")
+            if e_url:
+                if self.add_event_url(e_url["href"]):
+                    title = e.select("div[data-testid=richTextElement] h1.font_0 span")
+                    if title:
+                        title = title[0].contents[0].get_text().replace("\n", " ")
+                        title = re.sub(" +", " ", title)
+                        self.add_event_title(e_url["href"], title)
+
+    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
+        soup = BeautifulSoup(event_content, "html.parser")
+
+        start_day = self.parse_french_date(soup.find("h2").get_text())  # not perfect, but this site really is badly built
+
+        spans = soup.select("div[data-testid=richTextElement] span")
+        start_time = None
+        end_time = None
+        location = None
+
+        for span in spans:
+            txt = span.get_text()
+            if txt.lstrip().startswith("DÉBUT"):
+                start_time = self.parse_french_time(txt.split(":")[-1])
+                end_time = None
+            elif txt.lstrip().startswith("HORAIRES :"):
+                hs = txt.split(":")[-1].split("-")
+                start_time = self.parse_french_time(hs[0])
+                if len(hs) > 1:
+                    end_time = self.parse_french_time(hs[1])
+                else:
+                    end_time = None
+            elif txt.lstrip().startswith("LIEU :") and not location:
+                location = txt.split(":")[-1].lstrip()
+
+        if not location:
+            location = self.nom_lieu
+        end_day = self.guess_end_day(start_day, start_time, end_time)
+
+        url_human = event_url
+        tags = []
+
+        image = soup.select("wow-image img[fetchpriority=high]")
+        if image:
+            image = image[0]["src"]
+        else:
+            image = None
+
+        descriptions = soup.select("div[data-testid=mesh-container-content] div[data-testid=inline-content] div[data-testid=mesh-container-content] div[data-testid=richTextElement]")
+        if descriptions:
+            descriptions = [d.get_text() for d in descriptions]
+            description = max(descriptions, key=len)
+        else:
+            description = None
+
+        self.add_event_with_props(event_url, None, "Concert", start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
@@ -0,0 +1,72 @@
+from ..generic_extractors import *
+import re
+import json5
+from datetime import timedelta
+
+
+# A class dedicated to get events from Le Fotomat'
+# URL: https://www.lefotomat.com/
+class CExtractor(TwoStepsExtractor):
+
+    nom_lieu = "Le Fotomat'"
+
+    def category_fotomat2agenda(self, category):
+        if not category:
+            return None
+        mapping = { "Concerts": "Concert"}
+        if category in mapping:
+            return mapping[category]
+        else:
+            return None
+
+    def build_event_url_list(self, content):
+        soup = BeautifulSoup(content, "xml")
+
+        events = soup.select("item")
+        for e in events:
+            e_url = e.find("link").contents[0]
+            self.add_event_url(e_url)
+
+            title = e.find("title").contents[0]
+            self.add_event_title(e_url, title)
+
+            category = self.category_fotomat2agenda(e.find("category").contents[0])
+            if category:
+                self.add_event_category(e_url, category)
+
+    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
+        soup = BeautifulSoup(event_content, "html.parser")
+        image = soup.select("div.post-content img.wp-post-image")
+        if image:
+            image = image[0]["src"]
+        else:
+            image = None
+        desc = soup.select("head meta[name=description]")[0]["content"]
+        start_day = self.parse_french_date(desc.split("-")[0])
+        start_time = self.parse_french_time(desc.split("-")[1])
+        end_time = self.parse_french_time(desc.split("-")[2])
+        end_day = self.guess_end_day(start_day, start_time, end_time)
+
+        location = self.nom_lieu
+        descriptions = soup.select("div.vce-col-content")
+        if descriptions:
+            descriptions = [d.get_text() for d in descriptions]
+            description = max(descriptions, key=len)
+        else:
+            description = None
+
+        article = soup.select("article.post")
+        tags = []
+        for c in article[0]["class"]:
+            if c.startswith("category-"):
+                tag = '-'.join(c.split("-")[1:])
+                if tag != "concerts":
+                    tags.append(tag)
+
+        url_human = event_url
+
+        self.add_event_with_props(event_url, None, None, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
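In this extractor the URL list is built from the venue's RSS feed, parsed with BeautifulSoup's `xml` parser (which requires `lxml`). A self-contained sketch of the same `item`/`link`/`title`/`category` walk over a synthetic feed:

```python
from bs4 import BeautifulSoup  # the "xml" parser needs lxml installed

feed = """<rss><channel>
  <item>
    <title>Some concert</title>
    <link>https://www.lefotomat.com/events/1</link>
    <category>Concerts</category>
  </item>
</channel></rss>"""
soup = BeautifulSoup(feed, "xml")
for e in soup.select("item"):
    # the xml parser keeps <link> contents, unlike html.parser
    print(e.find("link").contents[0], e.find("title").contents[0], e.find("category").contents[0])
```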
@@ -3,6 +3,7 @@
 <tr>
     <th rowspan="2">Identifiant</th>
     <th rowspan="2">Date</th>
+    <th rowspan="2">Source</th>
     <th rowspan="2">Status</th>
     <th rowspan="2">Action</th>
     <th colspan="4">événements</th>
@@ -19,6 +20,7 @@
 <tr>
     <td>{{ obj.id }}</a></td>
     <td>{{ obj.created_date }}</td>
+    <td>{% if obj.recurrentImport %}<a href="{{ obj.recurrentImport.get_absolute_url }}">{{ obj.recurrentImport.name }}</a>{% else %}-{% endif %} </td>
     <td><span{% if obj.status == "failed" %} data-tooltip="{{ obj.error_message }}"{% endif %}>{{ obj.status }}</span></td>
     <td>{% if obj.status == "running" %}<a href="{% url 'cancel_import' obj.id %}">Annuler</a>{% endif %}</td>
     <td>{% if obj.status == "success" %}{{ obj.nb_initial }}{% endif %}</td>