From 4186b70e7eab85b8ee05b5fccf5d79ed7fc6b771 Mon Sep 17 00:00:00 2001 From: Jean-Marie Favreau Date: Sat, 2 Nov 2024 15:20:12 +0100 Subject: [PATCH] =?UTF-8?q?On=20int=C3=A8gre=20les=20nouvelles=20cat=C3=A9?= =?UTF-8?q?gories=20aux=20outils=20d'import?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- experimentations/get_lacoope_events.py | 2 +- .../custom_extractors/arachnee.py | 11 ++++--- .../import_tasks/custom_extractors/c3c.py | 18 +++++++---- .../custom_extractors/lacomedie.py | 23 +++++++++---- .../import_tasks/custom_extractors/lacoope.py | 4 +-- .../custom_extractors/lapucealoreille.py | 4 +-- .../custom_extractors/lefotomat.py | 11 ++++--- .../custom_extractors/wordpress_mec.py | 32 +++++++++++++------ .../import_tasks/extractor_ggcal_link.py | 2 +- .../migrations/0099_update_categories.py | 2 +- src/agenda_culturel/models.py | 9 +++++- 11 files changed, 78 insertions(+), 40 deletions(-) diff --git a/experimentations/get_lacoope_events.py b/experimentations/get_lacoope_events.py index c699f0b..aede1c5 100755 --- a/experimentations/get_lacoope_events.py +++ b/experimentations/get_lacoope_events.py @@ -33,7 +33,7 @@ if __name__ == "__main__": url_human = "https://www.lacoope.org/concerts-calendrier/" try: - events = u2e.process(url, url_human, cache = "cache-lacoope.html", default_values = {"category": "Concert", "location": "La Coopérative"}, published = True) + events = u2e.process(url, url_human, cache = "cache-lacoope.html", default_values = {"category": "Fêtes & Concerts", "location": "La Coopérative"}, published = True) exportfile = "events-lacoope.json" print("Saving events to file {}".format(exportfile)) diff --git a/src/agenda_culturel/import_tasks/custom_extractors/arachnee.py b/src/agenda_culturel/import_tasks/custom_extractors/arachnee.py index efa7422..f2b046d 100644 --- a/src/agenda_culturel/import_tasks/custom_extractors/arachnee.py +++ b/src/agenda_culturel/import_tasks/custom_extractors/arachnee.py @@ -69,13 +69,16 @@ class CExtractor(TwoStepsExtractorNoPause): first_cat = Extractor.remove_accents(category.split(",")[0].lower()) tags = [] if first_cat in ["grand spectacle"]: - category = "Danse" + category = "Spectacles" + tags.append("danse") elif first_cat in ["theatre", "humour / one man show"]: - category = "Theatre" + category = "Spectacles" + tags.append("théâtre") elif first_cat in ["chanson francaise", "musique du monde", "pop / rock", "rap", "rnb", "raggae", "variete"]: - category = "Concert" + category = "Fêtes & Concerts" + tags.append("concert") elif first_cat in ["comedie musicale", "humour / one man show", "spectacle equestre"]: - category = "Art du spectacle" + category = "Spectacles" elif first_cat in ["spectacle pour enfant"]: tags = ["jeune public"] category = None diff --git a/src/agenda_culturel/import_tasks/custom_extractors/c3c.py b/src/agenda_culturel/import_tasks/custom_extractors/c3c.py index b41254d..978227e 100644 --- a/src/agenda_culturel/import_tasks/custom_extractors/c3c.py +++ b/src/agenda_culturel/import_tasks/custom_extractors/c3c.py @@ -10,11 +10,12 @@ class CExtractor(TwoStepsExtractor): def category_c3c2agenda(self, category): if not category: return None - mapping = {"Théâtre": "Théâtre", "Concert": "Concert", "Projection": "Cinéma"} + mapping = {"Théâtre": "Spectacles", "Concert": "Fêtes & Concerts", "Projection": "Cinéma"} + mapping_tag = {"Théâtre": "théâtre", "Concert": "concert", "Projection": None} if category in mapping: - return mapping[category] + return mapping[category], mapping_tag[category] else: - return None + return None, None def build_event_url_list(self, content): soup = BeautifulSoup(content, "html.parser") @@ -49,20 +50,23 @@ class CExtractor(TwoStepsExtractor): description = soup.select_one(".presentation").get_text() duration = soup.select_one("#criteres .DUREE-V .valeur-critere li") - if duration is not None: + if not duration is None: duration = Extractor.parse_french_time(duration.text) location = self.nom_lieu categories = [] + tags = [] for t in soup.select(".sous-titre span"): classes = t.get("class") if classes and len(classes) > 0: if classes[0].startswith("LIEU-"): location = t.text elif classes[0].startswith("THEMATIQUE-"): - cat = self.category_c3c2agenda(t.text) - if cat is not None: + cat, tag = self.category_c3c2agenda(t.text) + if cat: categories.append(cat) + if tag: + tags.append(tag) # TODO: parser les dates, récupérer les heures () dates = [o.get("value") for o in soup.select("select.datedleb_resa option")] @@ -120,7 +124,7 @@ class CExtractor(TwoStepsExtractor): dt[0], location, description, - [], + tags, recurrences=None, uuids=[event_url], url_human=url_human, diff --git a/src/agenda_culturel/import_tasks/custom_extractors/lacomedie.py b/src/agenda_culturel/import_tasks/custom_extractors/lacomedie.py index bbefd97..a148417 100644 --- a/src/agenda_culturel/import_tasks/custom_extractors/lacomedie.py +++ b/src/agenda_culturel/import_tasks/custom_extractors/lacomedie.py @@ -11,16 +11,23 @@ class CExtractor(TwoStepsExtractor): def category_comedie2agenda(self, category): mapping = { - "Théâtre": "Théâtre", - "Danse": "Danse", - "Rencontre": "Sans catégorie", + "Théâtre": "Spectacles", + "Danse": "Spectacles", + "Rencontre": "Rencontres & Débats", "Sortie de résidence": "Sans catégorie", "PopCorn Live": "Sans catégorie", } + mapping_tag = { + "Théâtre": "théâtre", + "Danse": "danse", + "Rencontre": None, + "Sortie de résidence": "sortie de résidence", + "PopCorn Live": None, + } if category in mapping: - return mapping[category] + return mapping[category], mapping_tag[category] else: - return None + return None, None def build_event_url_list(self, content): dates = json5.loads(content)["data"][0] @@ -55,11 +62,13 @@ class CExtractor(TwoStepsExtractor): self.add_event_title(e_url, title) category = e.select("div#lieuevtcal span") if len(category) > 0: - category = self.category_comedie2agenda( + category, tag = self.category_comedie2agenda( category[-1].contents[0] ) - if category is not None: + if category: self.add_event_category(e_url, category) + if tag: + self.add_event_tag(e_url, tag) location = ( e.select("div#lieuevtcal")[0] .contents[-1] diff --git a/src/agenda_culturel/import_tasks/custom_extractors/lacoope.py b/src/agenda_culturel/import_tasks/custom_extractors/lacoope.py index 9e52856..9f7ff8a 100644 --- a/src/agenda_culturel/import_tasks/custom_extractors/lacoope.py +++ b/src/agenda_culturel/import_tasks/custom_extractors/lacoope.py @@ -38,7 +38,7 @@ class CExtractor(TwoStepsExtractor): soup = BeautifulSoup(event_content, "html.parser") title = soup.find("h1").contents[0] - category = "Concert" + category = "Fêtes & Concerts" image = soup.find("meta", property="og:image") if image: image = image["content"] @@ -53,7 +53,7 @@ class CExtractor(TwoStepsExtractor): if description is None: description = "" - tags = [] + tags = ["concert"] link_calendar = soup.select('a[href^="https://calendar.google.com/calendar/"]') if len(link_calendar) == 0: diff --git a/src/agenda_culturel/import_tasks/custom_extractors/lapucealoreille.py b/src/agenda_culturel/import_tasks/custom_extractors/lapucealoreille.py index 8383210..614d53c 100644 --- a/src/agenda_culturel/import_tasks/custom_extractors/lapucealoreille.py +++ b/src/agenda_culturel/import_tasks/custom_extractors/lapucealoreille.py @@ -58,7 +58,7 @@ class CExtractor(TwoStepsExtractor): end_day = Extractor.guess_end_day(start_day, start_time, end_time) url_human = event_url - tags = [] + tags = ["concert"] image = soup.select("wow-image img[fetchpriority=high]") if image: @@ -79,7 +79,7 @@ class CExtractor(TwoStepsExtractor): default_values, event_url, title, - "Concert", + "Fêtes & Concerts", start_day, location, description, diff --git a/src/agenda_culturel/import_tasks/custom_extractors/lefotomat.py b/src/agenda_culturel/import_tasks/custom_extractors/lefotomat.py index 75539a2..fc85431 100644 --- a/src/agenda_culturel/import_tasks/custom_extractors/lefotomat.py +++ b/src/agenda_culturel/import_tasks/custom_extractors/lefotomat.py @@ -9,11 +9,12 @@ class CExtractor(TwoStepsExtractor): def category_fotomat2agenda(self, category): if not category: return None - mapping = {"Concerts": "Concert"} + mapping = {"Concerts": "Fêtes & Concerts"} + mapping_tag = {"Concerts": "concert"} if category in mapping: - return mapping[category] + return mapping[category], mapping_tag else: - return None + return None, None def build_event_url_list(self, content): soup = BeautifulSoup(content, "xml") @@ -26,9 +27,11 @@ class CExtractor(TwoStepsExtractor): title = e.find("title").contents[0] self.add_event_title(e_url, title) - category = self.category_fotomat2agenda(e.find("category").contents[0]) + category, tag = self.category_fotomat2agenda(e.find("category").contents[0]) if category: self.add_event_category(e_url, category) + if tag: + self.add_event_tag(e_url, tag) def add_event_from_content( self, diff --git a/src/agenda_culturel/import_tasks/custom_extractors/wordpress_mec.py b/src/agenda_culturel/import_tasks/custom_extractors/wordpress_mec.py index 14170ef..782a8a0 100644 --- a/src/agenda_culturel/import_tasks/custom_extractors/wordpress_mec.py +++ b/src/agenda_culturel/import_tasks/custom_extractors/wordpress_mec.py @@ -8,16 +8,26 @@ class CExtractor(TwoStepsExtractor): def local2agendaCategory(self, category): mapping = { - "Musique": "Concert", - "CONCERT": "Concert", - "VISITE": "Sans catégorie", - "Spectacle": "Théâtre", - "Rencontre": "Sans catégorie", - "Atelier": "Sans catégorie", - "Projection": "Sans catégorie", + "Musique": "Fêtes & Concerts", + "CONCERT": "Fêtes & Concerts", + "VISITE": "Visites & Expositions", + "Spectacle": "Spectacles", + "Rencontre": "Rencontres & Débats", + "Atelier": "Animations & Ateliers", + "Projection": "Cinéma", } + mapping_tag = { + "Musique": "concert", + "CONCERT": "concert", + "VISITE": None, + "Spectacle": "rhéâtre", + "Rencontre": None, + "Atelier": "atelier", + "Projection": None, + } + if category in mapping: - return mapping[category] + return mapping[category], mapping_tag[category] else: return None @@ -39,9 +49,11 @@ class CExtractor(TwoStepsExtractor): if len(categories) == 0: categories = e.select(".mec-category") if len(categories) > 0: - category = self.local2agendaCategory(categories[0].get_text()) - if category is not None: + category, tag = self.local2agendaCategory(categories[0].get_text()) + if category: self.add_event_category(url, category) + if tag: + self.add_event_category(url, tag) def add_event_from_content( diff --git a/src/agenda_culturel/import_tasks/extractor_ggcal_link.py b/src/agenda_culturel/import_tasks/extractor_ggcal_link.py index 3ccdac3..da5344e 100644 --- a/src/agenda_culturel/import_tasks/extractor_ggcal_link.py +++ b/src/agenda_culturel/import_tasks/extractor_ggcal_link.py @@ -49,7 +49,7 @@ class GoogleCalendarLinkEventExtractor(Extractor): start_day=start_day, location=location, description=description, - tags=None, + tags=[], uuids=[url], recurrences=None, url_human=url_human, diff --git a/src/agenda_culturel/migrations/0099_update_categories.py b/src/agenda_culturel/migrations/0099_update_categories.py index 601df35..f6cb4f2 100644 --- a/src/agenda_culturel/migrations/0099_update_categories.py +++ b/src/agenda_culturel/migrations/0099_update_categories.py @@ -144,7 +144,7 @@ def update_database(apps, cats): if e.category and e.category.name in convert.keys(): cat, tag = convert[e.category.name].get_transfered_to_object(apps, e) e.category = cat - if not tag is None: + if tag: if e.tags is None: e.tags = [tag] else: diff --git a/src/agenda_culturel/models.py b/src/agenda_culturel/models.py index 969ed3b..dce5282 100644 --- a/src/agenda_culturel/models.py +++ b/src/agenda_culturel/models.py @@ -1108,7 +1108,14 @@ class Event(models.Model): return events[0] def update(self, other): - # TODO: what about category, tags? + + # we do not modify the category (local categories are more important) + # however, we add supplementary tags + if other.tags: + if not self.tags: + self.tags = [] + self.tags += [t for t in other.tags if not t in self.tags] + # set attributes for attr in Event.data_fields(): setattr(self, attr, getattr(other, attr))