From 9b898d26da66e0eb487d160dadcec8768d355597 Mon Sep 17 00:00:00 2001
From: Jean-Marie Favreau
Date: Sat, 1 Feb 2025 14:48:52 +0100
Subject: [PATCH] =?UTF-8?q?Int=C3=A9gration=20Graine=20de=20spectacles?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix #278
---
Review notes (ignored by git am, not part of the commit message):
 * Migration 0141: each direction of the data step is now a single filtered
   UPDATE instead of loading and saving every RecurrentImport row.
 * The blob hash on the "index" line of migration 0141 predates this revision.

 experimentations/get_c3c_events.py          |   2 +-
 experimentations/get_gds_events.py          |  44 ++++++
 src/agenda_culturel/celery.py               |   4 +-
 .../import_tasks/custom_extractors/c3c.py   | 136 ------------------
 .../0141_alter_recurrentimport_processor.py |  40 ++++++
 src/agenda_culturel/models.py               |   2 +-
 6 files changed, 88 insertions(+), 140 deletions(-)
 create mode 100644 experimentations/get_gds_events.py
 delete mode 100644 src/agenda_culturel/import_tasks/custom_extractors/c3c.py
 create mode 100644 src/agenda_culturel/migrations/0141_alter_recurrentimport_processor.py

diff --git a/experimentations/get_c3c_events.py b/experimentations/get_c3c_events.py
index f115b99..b09c13e 100755
--- a/experimentations/get_c3c_events.py
+++ b/experimentations/get_c3c_events.py
@@ -29,7 +29,7 @@ from src.agenda_culturel.import_tasks.custom_extractors import *
 
 if __name__ == "__main__":
 
-    u2e = URL2Events(ChromiumHeadlessDownloader(), c3c.CExtractor())
+    u2e = URL2Events(ChromiumHeadlessDownloader(), billetterie_cf.CExtractor())
     url = "https://billetterie-c3c.clermont-ferrand.fr/"
     url_human = "https://billetterie-c3c.clermont-ferrand.fr/"
 
diff --git a/experimentations/get_gds_events.py b/experimentations/get_gds_events.py
new file mode 100644
index 0000000..5ef11a9
--- /dev/null
+++ b/experimentations/get_gds_events.py
@@ -0,0 +1,44 @@
+#!/usr/bin/python3
+# coding: utf-8
+
+import os
+import json
+import sys
+
+# getting the name of the directory
+# where this file is present.
+current = os.path.dirname(os.path.realpath(__file__))
+
+# Getting the parent directory name
+# where the current directory is present.
+parent = os.path.dirname(current)
+
+# adding the parent directory to
+# the sys.path.
+sys.path.append(parent)
+sys.path.append(parent + "/src")
+
+from src.agenda_culturel.import_tasks.downloader import *
+from src.agenda_culturel.import_tasks.extractor import *
+from src.agenda_culturel.import_tasks.importer import *
+from src.agenda_culturel.import_tasks.custom_extractors import *
+
+
+
+
+
+if __name__ == "__main__":
+
+    u2e = URL2Events(ChromiumHeadlessDownloader(), billetterie_cf.CExtractor())
+    url = "https://billetterie-gds.clermont-ferrand.fr/"
+    url_human = "https://billetterie-gds.clermont-ferrand.fr/"
+
+    try:
+        events = u2e.process(url, url_human, cache = "cache-gds.html", default_values = {}, published = True)
+
+        exportfile = "events-gds.json"
+        print("Saving events to file {}".format(exportfile))
+        with open(exportfile, "w") as f:
+            json.dump(events, f, indent=4, default=str)
+    except Exception as e:
+        print("Exception: " + str(e))
diff --git a/src/agenda_culturel/celery.py b/src/agenda_culturel/celery.py
index ff8e749..90f0839 100644
--- a/src/agenda_culturel/celery.py
+++ b/src/agenda_culturel/celery.py
@@ -144,8 +144,8 @@ def run_recurrent_import_internal(rimport, downloader, req_id):
         extractor = wordpress_mec.CExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.FBEVENTS:
         extractor = fbevents.CExtractor()
-    elif rimport.processor == RecurrentImport.PROCESSOR.C3C:
-        extractor = c3c.CExtractor()
+    elif rimport.processor == RecurrentImport.PROCESSOR.BILLETTERIECF:
+        extractor = billetterie_cf.CExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.ARACHNEE:
         extractor = arachnee.CExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.LERIO:
diff --git a/src/agenda_culturel/import_tasks/custom_extractors/c3c.py b/src/agenda_culturel/import_tasks/custom_extractors/c3c.py
deleted file mode 100644
index 6ac9de4..0000000
--- a/src/agenda_culturel/import_tasks/custom_extractors/c3c.py
+++ /dev/null
@@ -1,136 +0,0 @@
-from ..generic_extractors import *
-from bs4 import BeautifulSoup
-from datetime import timedelta
-
-# A class dedicated to get events from La Cour des 3 Coquins
-# URL: https://billetterie-c3c.clermont-ferrand.fr//
-class CExtractor(TwoStepsExtractor):
-    nom_lieu = "La Cour des 3 Coquins"
-
-    def category_c3c2agenda(self, category):
-        if not category:
-            return None
-        mapping = {"Théâtre": "Spectacles", "Concert": "Fêtes & Concerts", "Projection": "Cinéma"}
-        mapping_tag = {"Théâtre": "🎭 théâtre", "Concert": "🎵 concert", "Projection": None}
-        if category in mapping:
-            return mapping[category], mapping_tag[category]
-        else:
-            return None, None
-
-    def build_event_url_list(self, content):
-        soup = BeautifulSoup(content, "html.parser")
-
-        events = soup.select("div.fiche-info")
-
-        for e in events:
-            e_url = e.select_one("a.btn.lien_savoir_plus")["href"]
-            if e_url != "":
-                e_url = self.url + "/" + e_url
-                self.add_event_url(e_url)
-
-    def add_event_from_content(
-        self,
-        event_content,
-        event_url,
-        url_human=None,
-        default_values=None,
-        published=False,
-    ):
-        soup = BeautifulSoup(event_content, "html.parser")
-
-        title = soup.select_one("h1")
-        if title:
-            title = title.text
-
-        image = soup.select_one("#media .swiper-slide img")
-        if image:
-            image = image["src"]
-        else:
-            image = None
-
-        description = soup.select_one(".presentation").get_text()
-        duration = soup.select_one("#criteres .DUREE-V .valeur-critere li")
-        if not duration is None:
-            duration = Extractor.parse_french_time(duration.text)
-
-        location = self.nom_lieu
-        categories = []
-        tags = []
-        for t in soup.select(".sous-titre span"):
-            classes = t.get("class")
-            if classes and len(classes) > 0:
-                if classes[0].startswith("LIEU-"):
-                    location = t.text
-                elif classes[0].startswith("THEMATIQUE-"):
-                    cat, tag = self.category_c3c2agenda(t.text)
-                    if cat:
-                        categories.append(cat)
-                    if tag:
-                        tags.append(tag)
-
-        # TODO: parser les dates, récupérer les heures ()
-        dates = [o.get("value") for o in soup.select("select.datedleb_resa option")]
-
-        patternCodeSite = re.compile(r'.*gsw_vars\["CODEPRESTATAIRE"\] = "(.*?)";.*', flags=re.DOTALL)
-        patternCodeObject = re.compile(r'.*gsw_vars\["CODEPRESTATION"\] = "(.*?)";.*', flags=re.DOTALL)
-        scripts = soup.find_all('script')
-        codeSite = ""
-        idObject = ""
-        for script in scripts:
-            if(patternCodeSite.match(str(script.string))):
-                data = patternCodeSite.match(script.string)
-                codeSite = data.groups()[0]
-            if(patternCodeObject.match(str(script.string))):
-                data = patternCodeObject.match(script.string)
-                idObject = data.groups()[0]
-
-
-        pause = self.downloader.pause
-        self.downloader.pause = False
-        # get exact schedule need two supplementary requests
-        datetimes = []
-        if codeSite != "" and idObject != "":
-            for date in dates:
-                # the first page is required such that the server knows the selected date
-                page1 = self.downloader.get_content("https://billetterie-c3c.clermont-ferrand.fr/booking?action=searchAjax&cid=2&afficheDirectDispo=" + date + "&type_prestataire=V&cle_fiche=PRESTATION-V-" + codeSite + "-" + idObject + "&datedeb=" + date)
-                # then we get the form with hours
-                page2 = self.downloader.get_content("https://billetterie-c3c.clermont-ferrand.fr/booking?action=detailTarifsPrestationAjax&prestation=V-" + codeSite + "-" + idObject)
-                soup2 = BeautifulSoup(page2, "html.parser")
-                times = [o.text for o in soup2.select("#quart_en_cours_spec option")]
-                for t in times:
-                    startdate = Extractor.parse_french_date(date)
-                    starttime = Extractor.parse_french_time(t)
-                    start = datetime.datetime.combine(startdate, starttime)
-                    enddate = None
-                    endtime = None
-                    if duration is not None:
-                        end = start + timedelta(hours=duration.hour, minutes=duration.minute, seconds=duration.second)
-                        enddate = end.date()
-                        endtime = end.time()
-                    datetimes.append((startdate, starttime, enddate, endtime))
-        self.downloader.pause = pause
-
-        category = None
-        if len(categories) > 0:
-            category = categories[0]
-
-        for dt in datetimes:
-
-            self.add_event_with_props(
-                default_values,
-                event_url,
-                title,
-                category,
-                dt[0],
-                location,
-                description,
-                tags,
-                recurrences=None,
-                uuids=[event_url],
-                url_human=url_human,
-                start_time=dt[1],
-                end_day=dt[2],
-                end_time=dt[3],
-                published=published,
-                image=image,
-            )
diff --git a/src/agenda_culturel/migrations/0141_alter_recurrentimport_processor.py b/src/agenda_culturel/migrations/0141_alter_recurrentimport_processor.py
new file mode 100644
index 0000000..10b5631
--- /dev/null
+++ b/src/agenda_culturel/migrations/0141_alter_recurrentimport_processor.py
@@ -0,0 +1,40 @@
+# Generated by Django 4.2.9 on 2025-02-01 14:32
+
+from django.db import migrations, models
+
+
+def rename_c3c(apps, schema_editor):
+    """Store "Billetterie CF" on every import whose processor is "cour3coquins"."""
+    RecurrentImport = apps.get_model("agenda_culturel", "RecurrentImport")
+    # One filtered UPDATE instead of loading and saving each row.
+    RecurrentImport.objects.filter(processor="cour3coquins").update(
+        processor="Billetterie CF"
+    )
+
+
+def rename_c3c_backward(apps, schema_editor):
+    """Reverse step: store "cour3coquins" on every "Billetterie CF" import."""
+    RecurrentImport = apps.get_model("agenda_culturel", "RecurrentImport")
+    # Every row holding the new value is rewritten, whatever its origin.
+    RecurrentImport.objects.filter(processor="Billetterie CF").update(
+        processor="cour3coquins"
+    )
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('agenda_culturel', '0140_alter_event_created_by_user_and_more'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='recurrentimport',
+            name='processor',
+            field=models.CharField(choices=[('ical', 'ical'), ('icalnobusy', 'ical no busy'), ('icalnovc', 'ical no VC'), ('lacoope', 'lacoope.org'), ('lacomedie', 'la comédie'), ('lefotomat', 'le fotomat'), ('lapucealoreille', "la puce à l'oreille"), ('Plugin wordpress MEC', 'Plugin wordpress MEC'), ('Facebook events', "Événements d'une page FB"), ('Billetterie CF', 'Billetterie Clermont-Ferrand'), ('arachnee', 'Arachnée concert'), ('rio', 'Le Rio'), ('raymonde', 'La Raymonde'), ('apidae', 'Agenda apidae tourisme'), ('iguana', 'Agenda iguana (médiathèques)')], default='ical', max_length=20, verbose_name='Processor'),
+        ),
+        migrations.RunPython(
+            rename_c3c,
+            rename_c3c_backward,
+        ),
+    ]
diff --git a/src/agenda_culturel/models.py b/src/agenda_culturel/models.py
index a56226c..9018114 100644
--- a/src/agenda_culturel/models.py
+++ b/src/agenda_culturel/models.py
@@ -2098,7 +2098,7 @@ class RecurrentImport(models.Model):
         LAPUCEALOREILLE = "lapucealoreille", _("la puce à l'oreille")
         MECWORDPRESS = "Plugin wordpress MEC", _("Plugin wordpress MEC")
         FBEVENTS = "Facebook events", _("Événements d'une page FB")
-        C3C = "cour3coquins", _("la cour des 3 coquins")
+        BILLETTERIECF = "Billetterie CF", _("Billetterie Clermont-Ferrand")
         ARACHNEE = "arachnee", _("Arachnée concert")
         LERIO = "rio", _('Le Rio')
         LARAYMONDE = "raymonde", _('La Raymonde')