From ac3d6796cff7713b82d204aabc06bbdd1280884f Mon Sep 17 00:00:00 2001
From: Jean-Marie Favreau
Date: Fri, 29 Nov 2024 14:57:29 +0100
Subject: [PATCH] Ajout de l'import Rio

Fix #187
---
 Review note: line structure restored; get_le_rio.py and lerio.py were
 amended during review, so their "index" hashes below are stale.

 experimentations/get_le_rio.py                |  43 +++++++
 src/agenda_culturel/celery.py                 |   2 +
 .../import_tasks/custom_extractors/lerio.py   | 107 ++++++++++++++++++
 .../0122_alter_recurrentimport_processor.py   |  18 +++
 src/agenda_culturel/models.py                 |   1 +
 5 files changed, 171 insertions(+)
 create mode 100755 experimentations/get_le_rio.py
 create mode 100644 src/agenda_culturel/import_tasks/custom_extractors/lerio.py
 create mode 100644 src/agenda_culturel/migrations/0122_alter_recurrentimport_processor.py

diff --git a/experimentations/get_le_rio.py b/experimentations/get_le_rio.py
new file mode 100755
index 0000000..35e0364
--- /dev/null
+++ b/experimentations/get_le_rio.py
@@ -0,0 +1,43 @@
+#!/usr/bin/python3
+# coding: utf-8
+
+import os
+import json
+import sys
+
+# getting the name of the directory
+# where this file is present.
+current = os.path.dirname(os.path.realpath(__file__))
+
+# Getting the parent directory name
+# where the current directory is present.
+parent = os.path.dirname(current)
+
+# adding the parent directory to
+# the sys.path.
+sys.path.append(parent)
+
+from src.agenda_culturel.import_tasks.downloader import *
+from src.agenda_culturel.import_tasks.extractor import *
+from src.agenda_culturel.import_tasks.importer import *
+from src.agenda_culturel.import_tasks.custom_extractors import *
+
+
+
+
+
+if __name__ == "__main__":
+
+    u2e = URL2Events(SimpleDownloader(), lerio.CExtractor())
+    url = "https://www.cinemalerio.com/evenements/"
+    url_human = "https://www.cinemalerio.com/evenements/"
+
+    try:
+        events = u2e.process(url, url_human, cache = "cache-le-rio.html", default_values = {"location": "Cinéma le Rio", "category": "Cinéma"}, published = True)
+
+        exportfile = "events-le-rio.json"
+        print("Saving events to file {}".format(exportfile))
+        with open(exportfile, "w") as f:
+            json.dump(events, f, indent=4, default=str)
+    except Exception as e:
+        print("Exception: " + str(e))
diff --git a/src/agenda_culturel/celery.py b/src/agenda_culturel/celery.py
index b435900..f3ffaeb 100644
--- a/src/agenda_culturel/celery.py
+++ b/src/agenda_culturel/celery.py
@@ -147,6 +147,8 @@ def run_recurrent_import_internal(rimport, downloader, req_id):
         extractor = c3c.CExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.ARACHNEE:
         extractor = arachnee.CExtractor()
+    elif rimport.processor == RecurrentImport.PROCESSOR.LERIO:
+        extractor = lerio.CExtractor()
     else:
         extractor = None
 
diff --git a/src/agenda_culturel/import_tasks/custom_extractors/lerio.py b/src/agenda_culturel/import_tasks/custom_extractors/lerio.py
new file mode 100644
index 0000000..117693c
--- /dev/null
+++ b/src/agenda_culturel/import_tasks/custom_extractors/lerio.py
@@ -0,0 +1,107 @@
+from ..generic_extractors import *
+from bs4 import BeautifulSoup
+from datetime import datetime
+
+# A class dedicated to get events from Cinéma Le Rio (Clermont-Ferrand)
+# URL: https://www.cinemalerio.com/evenements/
+class CExtractor(TwoStepsExtractorNoPause):
+    """Extract the events announced on the Cinéma Le Rio website."""
+
+    def __init__(self):
+        super().__init__()
+        self.possible_dates = {}
+        self.theater = None
+
+    def build_event_url_list(self, content, infuture_days=180):
+        """Register every event link found in the listing page `content`."""
+
+        soup = BeautifulSoup(content, "html.parser")
+
+        for link in soup.select("td.seance_link a"):
+            href = link.get("href")
+            # an anchor without href cannot be followed: skip it
+            if href:
+                self.add_event_url(href)
+
+    @staticmethod
+    def to_text_select_one(soup, filter):
+        """Return the text of the first element of `soup` matching the CSS
+        selector `filter`, or None if there is no such element."""
+        e = soup.select_one(filter)
+        if e is None:
+            return None
+        else:
+            return e.text
+
+    def add_event_from_content(
+        self,
+        event_content,
+        event_url,
+        url_human=None,
+        default_values=None,
+        published=False,
+    ):
+        """Build one event from the HTML of an event page and add it.
+
+        The page is skipped when it has no title, when its ".alerte_date"
+        block is not of the form "<date> à <time>", or when its
+        ".alerte_text" block is missing or only a partnership notice.
+        """
+
+        soup = BeautifulSoup(event_content, "html.parser")
+
+        # without a title there is nothing meaningful to publish
+        title = CExtractor.to_text_select_one(soup, "h1")
+        if title is None:
+            return
+
+        alerte_date = CExtractor.to_text_select_one(soup, ".alerte_date")
+        if alerte_date is None:
+            return
+        dh = alerte_date.split("à")
+        # if date is not found, we skip
+        if len(dh) != 2:
+            return
+
+        date = Extractor.parse_french_date(dh[0], default_year=datetime.now().year)
+        time = Extractor.parse_french_time(dh[1])
+
+        synopsis = CExtractor.to_text_select_one(soup, ".synopsis_bloc")
+        special_titre = CExtractor.to_text_select_one(soup, ".alerte_titre")
+        special = CExtractor.to_text_select_one(soup, ".alerte_text")
+
+        # it's not a specific event: we skip it
+        if special is None:
+            return
+        special_lines = special.split("\n")
+        if len(special_lines) == 1 and special_lines[0].strip().startswith("En partenariat"):
+            return
+
+        description = "\n\n".join([x for x in [synopsis, special_titre, special] if x is not None])
+
+        image = soup.select_one(".col1 img")
+        image_alt = None
+        if image is not None:
+            # .get() returns None instead of raising when the attribute is absent
+            image_alt = image.get("alt")
+            image = image.get("src")
+
+        self.add_event_with_props(
+            default_values,
+            event_url,
+            title,
+            None,
+            date,
+            None,
+            description,
+            [],
+            recurrences=None,
+            uuids=[event_url],
+            url_human=event_url,
+            start_time=time,
+            end_day=None,
+            end_time=None,
+            published=published,
+            image=image,
+            image_alt=image_alt
+        )
diff --git a/src/agenda_culturel/migrations/0122_alter_recurrentimport_processor.py b/src/agenda_culturel/migrations/0122_alter_recurrentimport_processor.py
new file mode 100644
index 0000000..0758649
--- /dev/null
+++ b/src/agenda_culturel/migrations/0122_alter_recurrentimport_processor.py
@@ -0,0 +1,18 @@
+# Generated by Django 4.2.9 on 2024-11-29 13:44
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('agenda_culturel', '0121_contactmessage_related_event'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='recurrentimport',
+            name='processor',
+            field=models.CharField(choices=[('ical', 'ical'), ('icalnobusy', 'ical no busy'), ('icalnovc', 'ical no VC'), ('lacoope', 'lacoope.org'), ('lacomedie', 'la comédie'), ('lefotomat', 'le fotomat'), ('lapucealoreille', "la puce à l'oreille"), ('Plugin wordpress MEC', 'Plugin wordpress MEC'), ('Facebook events', "Événements d'une page FB"), ('cour3coquins', 'la cour des 3 coquins'), ('arachnee', 'Arachnée concert'), ('rio', 'Le Rio')], default='ical', max_length=20, verbose_name='Processor'),
+        ),
+    ]
diff --git a/src/agenda_culturel/models.py b/src/agenda_culturel/models.py
index 9554190..03aeba3 100644
--- a/src/agenda_culturel/models.py
+++ b/src/agenda_culturel/models.py
@@ -1767,6 +1767,7 @@ class RecurrentImport(models.Model):
         FBEVENTS = "Facebook events", _("Événements d'une page FB")
         C3C = "cour3coquins", _("la cour des 3 coquins")
         ARACHNEE = "arachnee", _("Arachnée concert")
+        LERIO = "rio", _('Le Rio')
 
     class DOWNLOADER(models.TextChoices):
         SIMPLE = "simple", _("simple")