Refactoring extracteurs

This commit is contained in:
Jean-Marie Favreau 2024-04-22 09:42:23 +02:00
parent 2862a0c5dd
commit c043ba198c
12 changed files with 292 additions and 274 deletions

View File

@ -28,7 +28,7 @@ from src.agenda_culturel.import_tasks.custom_extractors import *
if __name__ == "__main__": if __name__ == "__main__":
u2e = URL2Events(SimpleDownloader(), LaComedieExtractor()) u2e = URL2Events(SimpleDownloader(), lacomedie.CExtractor())
url = "https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php?action=load_dates_existantes" url = "https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php?action=load_dates_existantes"
url_human = "https://lacomediedeclermont.com/saison23-24/" url_human = "https://lacomediedeclermont.com/saison23-24/"

View File

@ -28,7 +28,7 @@ from src.agenda_culturel.import_tasks.custom_extractors import *
if __name__ == "__main__": if __name__ == "__main__":
u2e = URL2Events(SimpleDownloader(), LaCoopeExtractor()) u2e = URL2Events(SimpleDownloader(), lacoope.CExtractor())
url = "https://www.lacoope.org/concerts-calendrier/" url = "https://www.lacoope.org/concerts-calendrier/"
url_human = "https://www.lacoope.org/concerts-calendrier/" url_human = "https://www.lacoope.org/concerts-calendrier/"

View File

@ -28,7 +28,7 @@ from src.agenda_culturel.import_tasks.custom_extractors import *
if __name__ == "__main__": if __name__ == "__main__":
u2e = URL2Events(SimpleDownloader(), LaPuceALOreilleExtractor()) u2e = URL2Events(SimpleDownloader(), lapucealoreille.CExtractor())
url = "https://www.lapucealoreille63.fr/programmation/" url = "https://www.lapucealoreille63.fr/programmation/"
url_human = "https://www.lapucealoreille63.fr/programmation/" url_human = "https://www.lapucealoreille63.fr/programmation/"

View File

@ -28,7 +28,7 @@ from src.agenda_culturel.import_tasks.custom_extractors import *
if __name__ == "__main__": if __name__ == "__main__":
u2e = URL2Events(SimpleDownloader(), LeFotomatExtractor()) u2e = URL2Events(SimpleDownloader(), lefotomat.CExtractor())
url = "https://www.lefotomat.com/feed" url = "https://www.lefotomat.com/feed"
url_human = "https://www.lefotomat.com/" url_human = "https://www.lefotomat.com/"

View File

@ -102,13 +102,13 @@ def run_recurrent_import(self, pk):
elif rimport.processor == RecurrentImport.PROCESSOR.ICALNOVC: elif rimport.processor == RecurrentImport.PROCESSOR.ICALNOVC:
extractor = ICALNoVCExtractor() extractor = ICALNoVCExtractor()
elif rimport.processor == RecurrentImport.PROCESSOR.LACOOPE: elif rimport.processor == RecurrentImport.PROCESSOR.LACOOPE:
extractor = LaCoopeExtractor() extractor = lacoope.CExtractor()
elif rimport.processor == RecurrentImport.PROCESSOR.LACOMEDIE: elif rimport.processor == RecurrentImport.PROCESSOR.LACOMEDIE:
extractor = LaComedieExtractor() extractor = lacomedie.CExtractor()
elif rimport.processor == RecurrentImport.PROCESSOR.LEFOTOMAT: elif rimport.processor == RecurrentImport.PROCESSOR.LEFOTOMAT:
extractor = LeFotomatExtractor() extractor = lefotomat.CExtractor()
elif rimport.processor == RecurrentImport.PROCESSOR.LAPUCEALOREILLE: elif rimport.processor == RecurrentImport.PROCESSOR.LAPUCEALOREILLE:
extractor = LaPuceALOreilleExtractor() extractor = lapucealoreille.CExtractor()
else: else:
extractor = None extractor = None

View File

@ -1,266 +0,0 @@
from .generic_extractors import *
import re
import json5
from datetime import timedelta
# A class dedicated to get events from La Coopérative de Mai:
# URL: https://www.lacoope.org/concerts-calendrier/
class LaCoopeExtractor(TwoStepsExtractor):
    """Extract events from La Coopérative de Mai.

    Listing URL: https://www.lacoope.org/concerts-calendrier/

    The listing page embeds its calendar as a JavaScript value assigned to
    ``window.fullCalendarContent``; every event page is then parsed on its own.
    """

    nom_lieu = "La Coopérative de Mai"

    def build_event_url_list(self, content):
        """Register the URL of every event found in the listing page.

        Args:
            content: HTML of the calendar listing page.

        Raises:
            Exception: if the results container or its script is missing, or
                if the calendar assignment cannot be located in that script.
        """
        soup = BeautifulSoup(content, "html.parser")
        results = soup.find("div", class_="js-filter__results")
        # find() returns None when the container is absent; report that with
        # the same message as a missing script rather than an AttributeError.
        scripts = results.findChildren("script") if results is not None else []
        if not scripts:
            raise Exception("Cannot find events in the first page")

        search = re.search(
            r"window.fullCalendarContent = (.*)</script>", str(scripts[0]), re.S
        )
        if not search:
            raise Exception('Cannot extract events from javascript')

        data = json5.loads(search.group(1))
        for event in data['events']:
            event_url = event['url']
            self.add_event_url(event_url)
            # .get(): an event without a "tag" key is simply left untagged.
            if event.get('tag') == "Gratuit":
                self.add_event_tag(event_url, 'gratuit')

    def add_event_from_content(self, event_content, event_url, url_human=None, default_values=None, published=False):
        """Parse one event page and record the event.

        Args:
            event_content: HTML of the event page.
            event_url: URL of the event page; also used as uuid.
            url_human: ignored, it is always replaced by ``event_url``.
            default_values: not used by this extractor.
            published: forwarded to ``add_event_with_props``.

        Raises:
            Exception: if the page has no Google Calendar link, which is the
                only source used here for the start/end day and time.
        """
        soup = BeautifulSoup(event_content, "html.parser")

        title = soup.find("h1").contents[0]
        category = "Concert"

        image = soup.find("meta", property="og:image")
        if image:
            image = image["content"]

        # Walk down grid-concert-content > content-striped > wysiwyg; any
        # missing level yields an empty description.
        node = soup.find("div", class_="grid-concert-content")
        for css_class in ("content-striped", "wysiwyg"):
            if node is None:
                break
            node = node.find("div", class_=css_class)
        description = node.get_text() if node is not None else ""

        tags = []

        link_calendar = soup.select('a[href^="https://calendar.google.com/calendar/"]')
        if not link_calendar:
            raise Exception('Cannot find the google calendar url')
        gg_cal = GGCalendar(link_calendar[0]["href"])

        # Read the venue through self, like the sibling extractors do.
        location = self.nom_lieu
        url_human = event_url

        self.add_event_with_props(
            event_url,
            title,
            category,
            gg_cal.start_day,
            location,
            description,
            tags,
            recurrences=None,
            uuid=event_url,
            url_human=url_human,
            start_time=gg_cal.start_time,
            end_day=gg_cal.end_day,
            end_time=gg_cal.end_time,
            published=published,
            image=image,
        )
# A class dedicated to get events from La Comédie de Clermont:
# URL: https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php?action=load_dates_existantes
# Human-facing URL: https://lacomediedeclermont.com/saison23-24/
class LaComedieExtractor(TwoStepsExtractor):
    """Extract events from La Comédie de Clermont.

    The first step reads a JSON list of dates, then queries the same endpoint
    once per date to list that day's shows. The second step reads each show
    page for its image and description.
    """

    nom_lieu = "La Comédie de Clermont"

    def category_comedie2agenda(self, category):
        """Map a category label of the site to an agenda category.

        Returns None when the label has no known equivalent.
        """
        mapping = {
            "Théâtre": "Théâtre",
            "Danse": "Danse",
            "Rencontre": "Autre",
            "Sortie de résidence": "Autre",
            "PopCorn Live": "Autre",
        }
        return mapping.get(category)

    def build_event_url_list(self, content):
        """Register one event per (show, day) pair found through the date list.

        Args:
            content: JSON text whose ``data[0]`` entry holds ISO dates.
        """
        dates = json5.loads(content)["data"][0]
        # Same endpoint as self.url, without its query string.
        endpoint = self.url.split("?")[0]

        for d in set(dates):
            if self.only_future and not (self.now <= datetime.date.fromisoformat(d)):
                continue

            payload = self.downloader.get_content(
                endpoint, post={'action': "load_evenements_jour", "jour": d}
            )
            if not payload:
                continue
            payload = json5.loads(payload)
            if "data" not in payload:
                continue

            day_soup = BeautifulSoup(payload["data"][0], "html.parser")
            for e in day_soup.select("div.unedatedev"):
                link = e.select('a')[0]
                # a "fake" url specific for each day of this show
                e_url = link["href"] + "#" + d
                self.add_event_url(e_url)
                self.add_event_start_day(e_url, d)

                # Last space-separated token of the rendered tag, cut at the
                # first "<" that follows it.
                t = str(e.select('div#datecal')[0]).split(' ')[-1].split('<')[0]
                self.add_event_start_time(e_url, t)

                title = link.contents[0]
                self.add_event_title(e_url, title)

                labels = e.select("div#lieuevtcal span")
                if len(labels) > 0:
                    category = self.category_comedie2agenda(labels[-1].contents[0])
                    if category is not None:
                        self.add_event_category(e_url, category)

                # NOTE(review): the separator literal is empty in this view of
                # the file, and str.split("") raises ValueError. A character was
                # probably lost in rendering; confirm against the repository.
                location = e.select("div#lieuevtcal")[0].contents[-1].split("")[-1]
                self.add_event_location(e_url, location)

    def add_event_from_content(self, event_content, event_url, url_human=None, default_values=None, published=False):
        """Complete an event with the image and description of its page.

        Title, category, start day and location are passed as None here; the
        values registered by ``build_event_url_list`` are presumably merged in
        by the base class (to be confirmed). ``url_human`` is always replaced
        by ``event_url`` and ``default_values`` is not used.
        """
        soup = BeautifulSoup(event_content, "html.parser")

        pictures = soup.select("#imgspec img")
        image = pictures[0]["src"] if pictures else None

        raw_text = soup.select("#descspec")[0].get_text()
        description = raw_text.replace("Lire plus...", "")

        url_human = event_url

        self.add_event_with_props(
            event_url,
            None,
            None,
            None,
            None,
            description,
            [],
            recurrences=None,
            uuid=event_url,
            url_human=url_human,
            published=published,
            image=image,
        )
# A class dedicated to get events from Le Fotomat'
# URL: https://www.lefotomat.com/
class LeFotomatExtractor(TwoStepsExtractor):
    """Extract events from Le Fotomat'.

    Site: https://www.lefotomat.com/

    The first step reads the items of an XML feed; the second step parses each
    linked page for its date, times, description, image and tags.
    """

    nom_lieu = "Le Fotomat'"

    def category_fotomat2agenda(self, category):
        """Map a feed category label to an agenda category.

        Returns None for an empty/missing label or an unknown one.
        """
        if not category:
            return None
        mapping = {"Concerts": "Concert"}
        return mapping.get(category)

    def build_event_url_list(self, content):
        """Register URL, title and (when known) category of every feed item.

        Args:
            content: XML text of the feed.
        """
        soup = BeautifulSoup(content, "xml")
        for item in soup.select("item"):
            e_url = item.find("link").contents[0]
            self.add_event_url(e_url)

            title = item.find("title").contents[0]
            self.add_event_title(e_url, title)

            # An item may carry no <category> (or an empty one). Resolve that
            # to None here so the guard in category_fotomat2agenda is actually
            # reachable instead of crashing on None.contents / an empty list.
            category_node = item.find("category")
            raw_category = None
            if category_node is not None and category_node.contents:
                raw_category = category_node.contents[0]

            category = self.category_fotomat2agenda(raw_category)
            if category:
                self.add_event_category(e_url, category)

    def add_event_from_content(self, event_content, event_url, url_human=None, default_values=None, published=False):
        """Parse one event page and record the event.

        The meta description is split on "-" and its first three parts are
        read as start day, start time and end time.

        Args:
            event_content: HTML of the event page.
            event_url: URL of the event page; also used as uuid.
            url_human: ignored, it is always replaced by ``event_url``.
            default_values: not used by this extractor.
            published: forwarded to ``add_event_with_props``.

        Raises:
            IndexError: if the page has no meta description or if it has
                fewer than three "-"-separated parts.
        """
        soup = BeautifulSoup(event_content, "html.parser")

        images = soup.select("div.post-content img.wp-post-image")
        image = images[0]["src"] if images else None

        desc = soup.select("head meta[name=description]")[0]["content"]
        parts = desc.split("-")
        start_day = self.parse_french_date(parts[0])
        start_time = self.parse_french_time(parts[1])
        end_time = self.parse_french_time(parts[2])
        end_day = self.guess_end_day(start_day, start_time, end_time)

        location = self.nom_lieu

        # Several content columns may exist; keep the longest text.
        descriptions = [d.get_text() for d in soup.select("div.vce-col-content")]
        description = max(descriptions, key=len) if descriptions else None

        # Tags come from the "category-*" CSS classes of the article; a page
        # without article.post simply yields no tag.
        article = soup.select("article.post")
        css_classes = article[0]["class"] if article else []
        tags = []
        for css_class in css_classes:
            if not css_class.startswith("category-"):
                continue
            tag = css_class.partition("-")[2]
            # "concerts" is skipped, presumably because it is already covered
            # by the event category.
            if tag != "concerts":
                tags.append(tag)

        url_human = event_url

        self.add_event_with_props(
            event_url,
            None,
            None,
            start_day,
            location,
            description,
            tags,
            recurrences=None,
            uuid=event_url,
            url_human=url_human,
            start_time=start_time,
            end_day=end_day,
            end_time=end_time,
            published=published,
            image=image,
        )
# A class dedicated to get events from La puce à l'oreille
# URL: https://www.lapucealoreille63.fr/
class LaPuceALOreilleExtractor(TwoStepsExtractor):
    """Extract events from La Puce à l'Oreille.

    Site: https://www.lapucealoreille63.fr/
    """

    nom_lieu = "La Puce à l'Oreille"

    def build_event_url_list(self, content):
        """Register the link and title of each event block of the programme.

        The title is only looked up when ``add_event_url`` returns a truthy
        value for the link.
        """
        soup = BeautifulSoup(content, "html.parser")
        blocks = soup.select("div.SPY_vo div[data-testid=mesh-container-content]")
        for block in blocks:
            anchor = block.find("a")
            if not anchor:
                continue
            if not self.add_event_url(anchor["href"]):
                continue
            headings = block.select("div[data-testid=richTextElement] h1.font_0 span")
            if not headings:
                continue
            # Flatten line breaks, then collapse runs of spaces.
            raw_title = headings[0].contents[0].get_text().replace("\n", " ")
            title = re.sub(" +", " ", raw_title)
            self.add_event_title(anchor["href"], title)

    def add_event_from_content(self, event_content, event_url, url_human=None, default_values=None, published=False):
        """Parse one event page and record it as a "Concert".

        Times and place are read from the text spans starting with "DÉBUT",
        "HORAIRES :" and "LIEU :". ``url_human`` is always replaced by
        ``event_url`` and ``default_values`` is not used.
        """
        soup = BeautifulSoup(event_content, "html.parser")

        # not perfect, but the markup of this site is poorly structured
        start_day = self.parse_french_date(soup.find("h2").get_text())

        start_time = None
        end_time = None
        location = None
        for span in soup.select("div[data-testid=richTextElement] span"):
            txt = span.get_text()
            head = txt.lstrip()
            if head.startswith("DÉBUT"):
                start_time = self.parse_french_time(txt.split(":")[-1])
                end_time = None
            elif head.startswith("HORAIRES :"):
                hs = txt.split(":")[-1].split("-")
                start_time = self.parse_french_time(hs[0])
                end_time = self.parse_french_time(hs[1]) if len(hs) > 1 else None
            elif head.startswith("LIEU :") and not location:
                # Only the first non-empty place is kept.
                location = txt.split(":")[-1].lstrip()

        if not location:
            location = self.nom_lieu
        end_day = self.guess_end_day(start_day, start_time, end_time)

        url_human = event_url
        tags = []

        pictures = soup.select("wow-image img[fetchpriority=high]")
        image = pictures[0]["src"] if pictures else None

        candidates = soup.select("div[data-testid=mesh-container-content] div[data-testid=inline-content] div[data-testid=mesh-container-content] div[data-testid=richTextElement]")
        if candidates:
            # Keep the longest rich-text element as the description.
            description = max([c.get_text() for c in candidates], key=len)
        else:
            description = None

        self.add_event_with_props(
            event_url,
            None,
            "Concert",
            start_day,
            location,
            description,
            tags,
            recurrences=None,
            uuid=event_url,
            url_human=url_human,
            start_time=start_time,
            end_day=end_day,
            end_time=end_time,
            published=published,
            image=image,
        )

View File

@ -0,0 +1,4 @@
from os.path import dirname, basename, isfile, join
import glob
# Every regular "*.py" file located next to this file, except __init__.py
# itself, is listed in __all__ under its name without the ".py" suffix.
modules = glob.glob(join(dirname(__file__), "*.py"))
__all__ = [
    basename(module_path)[: -len(".py")]
    for module_path in modules
    if isfile(module_path) and not module_path.endswith("__init__.py")
]

View File

@ -0,0 +1,69 @@
from ..generic_extractors import *
import re
import json5
from datetime import timedelta
# A class dedicated to get events from La Comédie de Clermont:
# URL: https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php?action=load_dates_existantes
# Human-facing URL: https://lacomediedeclermont.com/saison23-24/
class CExtractor(TwoStepsExtractor):
    """Extract events from La Comédie de Clermont.

    The first step reads a JSON list of dates, then queries the same endpoint
    once per date to list that day's shows. The second step reads each show
    page for its image and description.
    """

    nom_lieu = "La Comédie de Clermont"

    def category_comedie2agenda(self, category):
        """Map a category label of the site to an agenda category.

        Returns None when the label has no known equivalent.
        """
        mapping = {
            "Théâtre": "Théâtre",
            "Danse": "Danse",
            "Rencontre": "Autre",
            "Sortie de résidence": "Autre",
            "PopCorn Live": "Autre",
        }
        return mapping.get(category)

    def build_event_url_list(self, content):
        """Register one event per (show, day) pair found through the date list.

        Args:
            content: JSON text whose ``data[0]`` entry holds ISO dates.
        """
        dates = json5.loads(content)["data"][0]
        # Same endpoint as self.url, without its query string.
        endpoint = self.url.split("?")[0]

        for d in set(dates):
            if self.only_future and not (self.now <= datetime.date.fromisoformat(d)):
                continue

            payload = self.downloader.get_content(
                endpoint, post={'action': "load_evenements_jour", "jour": d}
            )
            if not payload:
                continue
            payload = json5.loads(payload)
            if "data" not in payload:
                continue

            day_soup = BeautifulSoup(payload["data"][0], "html.parser")
            for e in day_soup.select("div.unedatedev"):
                link = e.select('a')[0]
                # a "fake" url specific for each day of this show
                e_url = link["href"] + "#" + d
                self.add_event_url(e_url)
                self.add_event_start_day(e_url, d)

                # Last space-separated token of the rendered tag, cut at the
                # first "<" that follows it.
                t = str(e.select('div#datecal')[0]).split(' ')[-1].split('<')[0]
                self.add_event_start_time(e_url, t)

                title = link.contents[0]
                self.add_event_title(e_url, title)

                labels = e.select("div#lieuevtcal span")
                if len(labels) > 0:
                    category = self.category_comedie2agenda(labels[-1].contents[0])
                    if category is not None:
                        self.add_event_category(e_url, category)

                # NOTE(review): the separator literal is empty in this view of
                # the file, and str.split("") raises ValueError. A character was
                # probably lost in rendering; confirm against the repository.
                location = e.select("div#lieuevtcal")[0].contents[-1].split("")[-1]
                self.add_event_location(e_url, location)

    def add_event_from_content(self, event_content, event_url, url_human=None, default_values=None, published=False):
        """Complete an event with the image and description of its page.

        Title, category, start day and location are passed as None here; the
        values registered by ``build_event_url_list`` are presumably merged in
        by the base class (to be confirmed). ``url_human`` is always replaced
        by ``event_url`` and ``default_values`` is not used.
        """
        soup = BeautifulSoup(event_content, "html.parser")

        pictures = soup.select("#imgspec img")
        image = pictures[0]["src"] if pictures else None

        raw_text = soup.select("#descspec")[0].get_text()
        description = raw_text.replace("Lire plus...", "")

        url_human = event_url

        self.add_event_with_props(
            event_url,
            None,
            None,
            None,
            None,
            description,
            [],
            recurrences=None,
            uuid=event_url,
            url_human=url_human,
            published=published,
            image=image,
        )

View File

@ -0,0 +1,64 @@
from ..generic_extractors import *
import re
import json5
from datetime import timedelta
# A class dedicated to get events from La Coopérative de Mai:
# URL: https://www.lacoope.org/concerts-calendrier/
class CExtractor(TwoStepsExtractor):
    """Extract events from La Coopérative de Mai.

    Listing URL: https://www.lacoope.org/concerts-calendrier/

    The listing page embeds its calendar as a JavaScript value assigned to
    ``window.fullCalendarContent``; every event page is then parsed on its own.
    """

    nom_lieu = "La Coopérative de Mai"

    def build_event_url_list(self, content):
        """Register the URL of every event found in the listing page.

        Args:
            content: HTML of the calendar listing page.

        Raises:
            Exception: if the results container or its script is missing, or
                if the calendar assignment cannot be located in that script.
        """
        soup = BeautifulSoup(content, "html.parser")
        results = soup.find("div", class_="js-filter__results")
        # find() returns None when the container is absent; report that with
        # the same message as a missing script rather than an AttributeError.
        scripts = results.findChildren("script") if results is not None else []
        if not scripts:
            raise Exception("Cannot find events in the first page")

        search = re.search(
            r"window.fullCalendarContent = (.*)</script>", str(scripts[0]), re.S
        )
        if not search:
            raise Exception('Cannot extract events from javascript')

        data = json5.loads(search.group(1))
        for event in data['events']:
            event_url = event['url']
            self.add_event_url(event_url)
            # .get(): an event without a "tag" key is simply left untagged.
            if event.get('tag') == "Gratuit":
                self.add_event_tag(event_url, 'gratuit')

    def add_event_from_content(self, event_content, event_url, url_human=None, default_values=None, published=False):
        """Parse one event page and record the event.

        Args:
            event_content: HTML of the event page.
            event_url: URL of the event page; also used as uuid.
            url_human: ignored, it is always replaced by ``event_url``.
            default_values: not used by this extractor.
            published: forwarded to ``add_event_with_props``.

        Raises:
            Exception: if the page has no Google Calendar link, which is the
                only source used here for the start/end day and time.
        """
        soup = BeautifulSoup(event_content, "html.parser")

        title = soup.find("h1").contents[0]
        category = "Concert"

        image = soup.find("meta", property="og:image")
        if image:
            image = image["content"]

        # Walk down grid-concert-content > content-striped > wysiwyg; any
        # missing level yields an empty description.
        node = soup.find("div", class_="grid-concert-content")
        for css_class in ("content-striped", "wysiwyg"):
            if node is None:
                break
            node = node.find("div", class_=css_class)
        description = node.get_text() if node is not None else ""

        tags = []

        link_calendar = soup.select('a[href^="https://calendar.google.com/calendar/"]')
        if not link_calendar:
            raise Exception('Cannot find the google calendar url')
        gg_cal = GGCalendar(link_calendar[0]["href"])

        # Read the venue through self, like the sibling extractors do.
        location = self.nom_lieu
        url_human = event_url

        self.add_event_with_props(
            event_url,
            title,
            category,
            gg_cal.start_day,
            location,
            description,
            tags,
            recurrences=None,
            uuid=event_url,
            url_human=url_human,
            start_time=gg_cal.start_time,
            end_day=gg_cal.end_day,
            end_time=gg_cal.end_time,
            published=published,
            image=image,
        )

View File

@ -0,0 +1,73 @@
from ..generic_extractors import *
import re
import json5
from datetime import timedelta
# A class dedicated to get events from La puce à l'oreille
# URL: https://www.lapucealoreille63.fr/
class CExtractor(TwoStepsExtractor):
    """Extract events from La Puce à l'Oreille.

    Site: https://www.lapucealoreille63.fr/
    """

    nom_lieu = "La Puce à l'Oreille"

    def build_event_url_list(self, content):
        """Register the link and title of each event block of the programme.

        The title is only looked up when ``add_event_url`` returns a truthy
        value for the link.
        """
        soup = BeautifulSoup(content, "html.parser")
        blocks = soup.select("div.SPY_vo div[data-testid=mesh-container-content]")
        for block in blocks:
            anchor = block.find("a")
            if not anchor:
                continue
            if not self.add_event_url(anchor["href"]):
                continue
            headings = block.select("div[data-testid=richTextElement] h1.font_0 span")
            if not headings:
                continue
            # Flatten line breaks, then collapse runs of spaces.
            raw_title = headings[0].contents[0].get_text().replace("\n", " ")
            title = re.sub(" +", " ", raw_title)
            self.add_event_title(anchor["href"], title)

    def add_event_from_content(self, event_content, event_url, url_human=None, default_values=None, published=False):
        """Parse one event page and record it as a "Concert".

        Times and place are read from the text spans starting with "DÉBUT",
        "HORAIRES :" and "LIEU :". ``url_human`` is always replaced by
        ``event_url`` and ``default_values`` is not used.
        """
        soup = BeautifulSoup(event_content, "html.parser")

        # not perfect, but the markup of this site is poorly structured
        start_day = self.parse_french_date(soup.find("h2").get_text())

        start_time = None
        end_time = None
        location = None
        for span in soup.select("div[data-testid=richTextElement] span"):
            txt = span.get_text()
            head = txt.lstrip()
            if head.startswith("DÉBUT"):
                start_time = self.parse_french_time(txt.split(":")[-1])
                end_time = None
            elif head.startswith("HORAIRES :"):
                hs = txt.split(":")[-1].split("-")
                start_time = self.parse_french_time(hs[0])
                end_time = self.parse_french_time(hs[1]) if len(hs) > 1 else None
            elif head.startswith("LIEU :") and not location:
                # Only the first non-empty place is kept.
                location = txt.split(":")[-1].lstrip()

        if not location:
            location = self.nom_lieu
        end_day = self.guess_end_day(start_day, start_time, end_time)

        url_human = event_url
        tags = []

        pictures = soup.select("wow-image img[fetchpriority=high]")
        image = pictures[0]["src"] if pictures else None

        candidates = soup.select("div[data-testid=mesh-container-content] div[data-testid=inline-content] div[data-testid=mesh-container-content] div[data-testid=richTextElement]")
        if candidates:
            # Keep the longest rich-text element as the description.
            description = max([c.get_text() for c in candidates], key=len)
        else:
            description = None

        self.add_event_with_props(
            event_url,
            None,
            "Concert",
            start_day,
            location,
            description,
            tags,
            recurrences=None,
            uuid=event_url,
            url_human=url_human,
            start_time=start_time,
            end_day=end_day,
            end_time=end_time,
            published=published,
            image=image,
        )

View File

@ -0,0 +1,72 @@
from ..generic_extractors import *
import re
import json5
from datetime import timedelta
# A class dedicated to get events from Le Fotomat'
# URL: https://www.lefotomat.com/
class CExtractor(TwoStepsExtractor):
    """Extract events from Le Fotomat'.

    Site: https://www.lefotomat.com/

    The first step reads the items of an XML feed; the second step parses each
    linked page for its date, times, description, image and tags.
    """

    nom_lieu = "Le Fotomat'"

    def category_fotomat2agenda(self, category):
        """Map a feed category label to an agenda category.

        Returns None for an empty/missing label or an unknown one.
        """
        if not category:
            return None
        mapping = {"Concerts": "Concert"}
        return mapping.get(category)

    def build_event_url_list(self, content):
        """Register URL, title and (when known) category of every feed item.

        Args:
            content: XML text of the feed.
        """
        soup = BeautifulSoup(content, "xml")
        for item in soup.select("item"):
            e_url = item.find("link").contents[0]
            self.add_event_url(e_url)

            title = item.find("title").contents[0]
            self.add_event_title(e_url, title)

            # An item may carry no <category> (or an empty one). Resolve that
            # to None here so the guard in category_fotomat2agenda is actually
            # reachable instead of crashing on None.contents / an empty list.
            category_node = item.find("category")
            raw_category = None
            if category_node is not None and category_node.contents:
                raw_category = category_node.contents[0]

            category = self.category_fotomat2agenda(raw_category)
            if category:
                self.add_event_category(e_url, category)

    def add_event_from_content(self, event_content, event_url, url_human=None, default_values=None, published=False):
        """Parse one event page and record the event.

        The meta description is split on "-" and its first three parts are
        read as start day, start time and end time.

        Args:
            event_content: HTML of the event page.
            event_url: URL of the event page; also used as uuid.
            url_human: ignored, it is always replaced by ``event_url``.
            default_values: not used by this extractor.
            published: forwarded to ``add_event_with_props``.

        Raises:
            IndexError: if the page has no meta description or if it has
                fewer than three "-"-separated parts.
        """
        soup = BeautifulSoup(event_content, "html.parser")

        images = soup.select("div.post-content img.wp-post-image")
        image = images[0]["src"] if images else None

        desc = soup.select("head meta[name=description]")[0]["content"]
        parts = desc.split("-")
        start_day = self.parse_french_date(parts[0])
        start_time = self.parse_french_time(parts[1])
        end_time = self.parse_french_time(parts[2])
        end_day = self.guess_end_day(start_day, start_time, end_time)

        location = self.nom_lieu

        # Several content columns may exist; keep the longest text.
        descriptions = [d.get_text() for d in soup.select("div.vce-col-content")]
        description = max(descriptions, key=len) if descriptions else None

        # Tags come from the "category-*" CSS classes of the article; a page
        # without article.post simply yields no tag.
        article = soup.select("article.post")
        css_classes = article[0]["class"] if article else []
        tags = []
        for css_class in css_classes:
            if not css_class.startswith("category-"):
                continue
            tag = css_class.partition("-")[2]
            # "concerts" is skipped, presumably because it is already covered
            # by the event category.
            if tag != "concerts":
                tags.append(tag)

        url_human = event_url

        self.add_event_with_props(
            event_url,
            None,
            None,
            start_day,
            location,
            description,
            tags,
            recurrences=None,
            uuid=event_url,
            url_human=url_human,
            start_time=start_time,
            end_day=end_day,
            end_time=end_time,
            published=published,
            image=image,
        )

View File

@ -3,6 +3,7 @@
<tr> <tr>
<th rowspan="2">Identifiant</th> <th rowspan="2">Identifiant</th>
<th rowspan="2">Date</th> <th rowspan="2">Date</th>
<th rowspan="2">Source</th>
<th rowspan="2">Status</th> <th rowspan="2">Status</th>
<th rowspan="2">Action</th> <th rowspan="2">Action</th>
<th colspan="4">événements</th> <th colspan="4">événements</th>
@ -19,6 +20,7 @@
<tr> <tr>
<td>{{ obj.id }}</a></td> <td>{{ obj.id }}</a></td>
<td>{{ obj.created_date }}</td> <td>{{ obj.created_date }}</td>
<td>{% if obj.recurrentImport %}<a href="{{ obj.recurrentImport.get_absolute_url }}">{{ obj.recurrentImport.name }}</a>{% else %}-{% endif %} </td>
<td><span{% if obj.status == "failed" %} data-tooltip="{{ obj.error_message }}"{% endif %}>{{ obj.status }}</span></td> <td><span{% if obj.status == "failed" %} data-tooltip="{{ obj.error_message }}"{% endif %}>{{ obj.status }}</span></td>
<td>{% if obj.status == "running" %}<a href="{% url 'cancel_import' obj.id %}">Annuler</a>{% endif %}</td> <td>{% if obj.status == "running" %}<a href="{% url 'cancel_import' obj.id %}">Annuler</a>{% endif %}</td>
<td>{% if obj.status == "success" %}{{ obj.nb_initial }}{% endif %}</td> <td>{% if obj.status == "success" %}{{ obj.nb_initial }}{% endif %}</td>