On intègre les nouvelles catégories aux outils d'import

This commit is contained in:
Jean-Marie Favreau 2024-11-02 15:20:12 +01:00
parent 40ce9a9cba
commit 4186b70e7e
11 changed files with 78 additions and 40 deletions

View File

@ -33,7 +33,7 @@ if __name__ == "__main__":
url_human = "https://www.lacoope.org/concerts-calendrier/" url_human = "https://www.lacoope.org/concerts-calendrier/"
try: try:
events = u2e.process(url, url_human, cache = "cache-lacoope.html", default_values = {"category": "Concert", "location": "La Coopérative"}, published = True) events = u2e.process(url, url_human, cache = "cache-lacoope.html", default_values = {"category": "Fêtes & Concerts", "location": "La Coopérative"}, published = True)
exportfile = "events-lacoope.json" exportfile = "events-lacoope.json"
print("Saving events to file {}".format(exportfile)) print("Saving events to file {}".format(exportfile))

View File

@ -69,13 +69,16 @@ class CExtractor(TwoStepsExtractorNoPause):
first_cat = Extractor.remove_accents(category.split(",")[0].lower()) first_cat = Extractor.remove_accents(category.split(",")[0].lower())
tags = [] tags = []
if first_cat in ["grand spectacle"]: if first_cat in ["grand spectacle"]:
category = "Danse" category = "Spectacles"
tags.append("danse")
elif first_cat in ["theatre", "humour / one man show"]: elif first_cat in ["theatre", "humour / one man show"]:
category = "Theatre" category = "Spectacles"
tags.append("théâtre")
elif first_cat in ["chanson francaise", "musique du monde", "pop / rock", "rap", "rnb", "raggae", "variete"]: elif first_cat in ["chanson francaise", "musique du monde", "pop / rock", "rap", "rnb", "raggae", "variete"]:
category = "Concert" category = "Fêtes & Concerts"
tags.append("concert")
elif first_cat in ["comedie musicale", "humour / one man show", "spectacle equestre"]: elif first_cat in ["comedie musicale", "humour / one man show", "spectacle equestre"]:
category = "Art du spectacle" category = "Spectacles"
elif first_cat in ["spectacle pour enfant"]: elif first_cat in ["spectacle pour enfant"]:
tags = ["jeune public"] tags = ["jeune public"]
category = None category = None

View File

@ -10,11 +10,12 @@ class CExtractor(TwoStepsExtractor):
def category_c3c2agenda(self, category):
    """Translate a source-site theme label into an agenda (category, tag) pair.

    Args:
        category: Theme label as read from the page (e.g. "Concert").

    Returns:
        A 2-tuple ``(agenda_category, tag)``. Either element may be
        ``None``; both are ``None`` when the label is empty or unknown.
    """
    # A single table keeps each category next to its tag, so the two can
    # never get out of sync the way two parallel dicts can.
    mapping = {
        "Théâtre": ("Spectacles", "théâtre"),
        "Concert": ("Fêtes & Concerts", "concert"),
        "Projection": ("Cinéma", None),
    }
    # Always hand back a pair: the caller unpacks the result into two
    # names, so a bare None would raise TypeError.
    if not category:
        return None, None
    return mapping.get(category, (None, None))
def build_event_url_list(self, content): def build_event_url_list(self, content):
soup = BeautifulSoup(content, "html.parser") soup = BeautifulSoup(content, "html.parser")
@ -49,20 +50,23 @@ class CExtractor(TwoStepsExtractor):
description = soup.select_one(".presentation").get_text() description = soup.select_one(".presentation").get_text()
duration = soup.select_one("#criteres .DUREE-V .valeur-critere li") duration = soup.select_one("#criteres .DUREE-V .valeur-critere li")
if duration is not None: if not duration is None:
duration = Extractor.parse_french_time(duration.text) duration = Extractor.parse_french_time(duration.text)
location = self.nom_lieu location = self.nom_lieu
categories = [] categories = []
tags = []
for t in soup.select(".sous-titre span"): for t in soup.select(".sous-titre span"):
classes = t.get("class") classes = t.get("class")
if classes and len(classes) > 0: if classes and len(classes) > 0:
if classes[0].startswith("LIEU-"): if classes[0].startswith("LIEU-"):
location = t.text location = t.text
elif classes[0].startswith("THEMATIQUE-"): elif classes[0].startswith("THEMATIQUE-"):
cat = self.category_c3c2agenda(t.text) cat, tag = self.category_c3c2agenda(t.text)
if cat is not None: if cat:
categories.append(cat) categories.append(cat)
if tag:
tags.append(tag)
# TODO: parser les dates, récupérer les heures () # TODO: parser les dates, récupérer les heures ()
dates = [o.get("value") for o in soup.select("select.datedleb_resa option")] dates = [o.get("value") for o in soup.select("select.datedleb_resa option")]
@ -120,7 +124,7 @@ class CExtractor(TwoStepsExtractor):
dt[0], dt[0],
location, location,
description, description,
[], tags,
recurrences=None, recurrences=None,
uuids=[event_url], uuids=[event_url],
url_human=url_human, url_human=url_human,

View File

@ -11,16 +11,23 @@ class CExtractor(TwoStepsExtractor):
def category_comedie2agenda(self, category):
    """Translate a source-site category label into an agenda (category, tag) pair.

    Args:
        category: Category label as read from the event listing.

    Returns:
        A 2-tuple ``(agenda_category, tag)``. ``tag`` is ``None`` for labels
        that carry no tag; both elements are ``None`` for unknown labels.
    """
    # Category and tag live in one table so that adding a label cannot
    # leave one of two parallel dicts without the matching key.
    mapping = {
        "Théâtre": ("Spectacles", "théâtre"),
        "Danse": ("Spectacles", "danse"),
        "Rencontre": ("Rencontres & Débats", None),
        "Sortie de résidence": ("Sans catégorie", "sortie de résidence"),
        "PopCorn Live": ("Sans catégorie", None),
    }
    return mapping.get(category, (None, None))
def build_event_url_list(self, content): def build_event_url_list(self, content):
dates = json5.loads(content)["data"][0] dates = json5.loads(content)["data"][0]
@ -55,11 +62,13 @@ class CExtractor(TwoStepsExtractor):
self.add_event_title(e_url, title) self.add_event_title(e_url, title)
category = e.select("div#lieuevtcal span") category = e.select("div#lieuevtcal span")
if len(category) > 0: if len(category) > 0:
category = self.category_comedie2agenda( category, tag = self.category_comedie2agenda(
category[-1].contents[0] category[-1].contents[0]
) )
if category is not None: if category:
self.add_event_category(e_url, category) self.add_event_category(e_url, category)
if tag:
self.add_event_tag(e_url, tag)
location = ( location = (
e.select("div#lieuevtcal")[0] e.select("div#lieuevtcal")[0]
.contents[-1] .contents[-1]

View File

@ -38,7 +38,7 @@ class CExtractor(TwoStepsExtractor):
soup = BeautifulSoup(event_content, "html.parser") soup = BeautifulSoup(event_content, "html.parser")
title = soup.find("h1").contents[0] title = soup.find("h1").contents[0]
category = "Concert" category = "Fêtes & Concerts"
image = soup.find("meta", property="og:image") image = soup.find("meta", property="og:image")
if image: if image:
image = image["content"] image = image["content"]
@ -53,7 +53,7 @@ class CExtractor(TwoStepsExtractor):
if description is None: if description is None:
description = "" description = ""
tags = [] tags = ["concert"]
link_calendar = soup.select('a[href^="https://calendar.google.com/calendar/"]') link_calendar = soup.select('a[href^="https://calendar.google.com/calendar/"]')
if len(link_calendar) == 0: if len(link_calendar) == 0:

View File

@ -58,7 +58,7 @@ class CExtractor(TwoStepsExtractor):
end_day = Extractor.guess_end_day(start_day, start_time, end_time) end_day = Extractor.guess_end_day(start_day, start_time, end_time)
url_human = event_url url_human = event_url
tags = [] tags = ["concert"]
image = soup.select("wow-image img[fetchpriority=high]") image = soup.select("wow-image img[fetchpriority=high]")
if image: if image:
@ -79,7 +79,7 @@ class CExtractor(TwoStepsExtractor):
default_values, default_values,
event_url, event_url,
title, title,
"Concert", "Fêtes & Concerts",
start_day, start_day,
location, location,
description, description,

View File

@ -9,11 +9,12 @@ class CExtractor(TwoStepsExtractor):
def category_fotomat2agenda(self, category):
    """Translate a feed category label into an agenda (category, tag) pair.

    Args:
        category: Category label as read from the feed entry.

    Returns:
        A 2-tuple ``(agenda_category, tag)``; both elements are ``None``
        when the label is empty or not recognised.
    """
    mapping = {"Concerts": ("Fêtes & Concerts", "concert")}
    # Always return a pair: the caller unpacks the result into two names,
    # so a bare None would raise TypeError.
    if not category:
        return None, None
    # Return the tag string for this label, never the lookup table itself.
    return mapping.get(category, (None, None))
def build_event_url_list(self, content): def build_event_url_list(self, content):
soup = BeautifulSoup(content, "xml") soup = BeautifulSoup(content, "xml")
@ -26,9 +27,11 @@ class CExtractor(TwoStepsExtractor):
title = e.find("title").contents[0] title = e.find("title").contents[0]
self.add_event_title(e_url, title) self.add_event_title(e_url, title)
category = self.category_fotomat2agenda(e.find("category").contents[0]) category, tag = self.category_fotomat2agenda(e.find("category").contents[0])
if category: if category:
self.add_event_category(e_url, category) self.add_event_category(e_url, category)
if tag:
self.add_event_tag(e_url, tag)
def add_event_from_content( def add_event_from_content(
self, self,

View File

@ -8,16 +8,26 @@ class CExtractor(TwoStepsExtractor):
def local2agendaCategory(self, category):
    """Translate a source-site category label into an agenda (category, tag) pair.

    Args:
        category: Category label as read from the event listing.

    Returns:
        A 2-tuple ``(agenda_category, tag)``. ``tag`` is ``None`` for labels
        that carry no tag; both elements are ``None`` for unknown labels.
    """
    # Category and tag share one table so they cannot drift apart.
    mapping = {
        "Musique": ("Fêtes & Concerts", "concert"),
        "CONCERT": ("Fêtes & Concerts", "concert"),
        "VISITE": ("Visites & Expositions", None),
        "Spectacle": ("Spectacles", "théâtre"),
        "Rencontre": ("Rencontres & Débats", None),
        "Atelier": ("Animations & Ateliers", "atelier"),
        "Projection": ("Cinéma", None),
    }
    # Unknown labels still yield a pair: the caller unpacks two values,
    # so a bare None would raise TypeError.
    return mapping.get(category, (None, None))
@ -39,9 +49,11 @@ class CExtractor(TwoStepsExtractor):
if len(categories) == 0: if len(categories) == 0:
categories = e.select(".mec-category") categories = e.select(".mec-category")
if len(categories) > 0:
    # Only the first category element found is used.
    converted = self.local2agendaCategory(categories[0].get_text())
    # Tolerate a bare None from the converter so unpacking cannot fail
    # on a label it does not recognise.
    category, tag = converted if converted else (None, None)
    if category:
        self.add_event_category(url, category)
    if tag:
        # Register the tag as a tag, not as a second category.
        self.add_event_tag(url, tag)
def add_event_from_content( def add_event_from_content(

View File

@ -49,7 +49,7 @@ class GoogleCalendarLinkEventExtractor(Extractor):
start_day=start_day, start_day=start_day,
location=location, location=location,
description=description, description=description,
tags=None, tags=[],
uuids=[url], uuids=[url],
recurrences=None, recurrences=None,
url_human=url_human, url_human=url_human,

View File

@ -144,7 +144,7 @@ def update_database(apps, cats):
if e.category and e.category.name in convert.keys(): if e.category and e.category.name in convert.keys():
cat, tag = convert[e.category.name].get_transfered_to_object(apps, e) cat, tag = convert[e.category.name].get_transfered_to_object(apps, e)
e.category = cat e.category = cat
if not tag is None: if tag:
if e.tags is None: if e.tags is None:
e.tags = [tag] e.tags = [tag]
else: else:

View File

@ -1108,7 +1108,14 @@ class Event(models.Model):
return events[0] return events[0]
def update(self, other): def update(self, other):
# TODO: what about category, tags?
# we do not modify the category (local categories are more important)
# however, we add supplementary tags
if other.tags:
if not self.tags:
self.tags = []
self.tags += [t for t in other.tags if not t in self.tags]
# set attributes # set attributes
for attr in Event.data_fields(): for attr in Event.data_fields():
setattr(self, attr, getattr(other, attr)) setattr(self, attr, getattr(other, attr))