From 995aa4b2d3108a0b2bb29afbe34fcefa709a9db6 Mon Sep 17 00:00:00 2001
From: Jean-Marie Favreau
Date: Fri, 19 Apr 2024 23:59:59 +0200
Subject: [PATCH] =?UTF-8?q?Ajout=20de=20l'import=20de=20la=20programmation?=
 =?UTF-8?q?=20de=20la=20Com=C3=A9die?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 experimentations/get_lacomedie_events.py     | 43 ++++++++++++
 experimentations/get_lacoope_events.py       |  2 +-
 src/agenda_culturel/celery.py                |  2 +
 .../import_tasks/custom_extractors.py        | 67 ++++++++++++++++++-
 .../import_tasks/downloader.py               | 23 ++++---
 .../import_tasks/generic_extractors.py       | 48 +++++++++++--
 .../0050_alter_recurrentimport_processor.py  | 18 +++++
 src/agenda_culturel/models.py                |  6 +-
 8 files changed, 194 insertions(+), 15 deletions(-)
 create mode 100755 experimentations/get_lacomedie_events.py
 create mode 100644 src/agenda_culturel/migrations/0050_alter_recurrentimport_processor.py

diff --git a/experimentations/get_lacomedie_events.py b/experimentations/get_lacomedie_events.py
new file mode 100755
index 0000000..411938d
--- /dev/null
+++ b/experimentations/get_lacomedie_events.py
@@ -0,0 +1,43 @@
+#!/usr/bin/python3
+# coding: utf-8
+
+import os
+import json
+import sys
+
+# getting the name of the directory
+# where this file is present.
+current = os.path.dirname(os.path.realpath(__file__))
+
+# getting the name of the parent directory
+# of the current directory.
+parent = os.path.dirname(current)
+
+# adding the parent directory to
+# the sys.path.
+sys.path.append(parent)
+
+from src.agenda_culturel.import_tasks.downloader import *
+from src.agenda_culturel.import_tasks.extractor import *
+from src.agenda_culturel.import_tasks.importer import *
+from src.agenda_culturel.import_tasks.custom_extractors import *
+
+
+
+
+
+if __name__ == "__main__":
+
+    u2e = URL2Events(SimpleDownloader(), LaComedieExtractor())
+    url = "https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php?action=load_dates_existantes"
+    url_human = "https://lacomediedeclermont.com/saison23-24/"
+
+    try:
+        events = u2e.process(url, url_human, cache = "cache-lacomedie.html", default_values = {"location": "La Comédie de Clermont"}, published = True)
+
+        exportfile = "events-lacomedie.json"
+        print("Saving events to file {}".format(exportfile))
+        with open(exportfile, "w") as f:
+            json.dump(events, f, indent=4, default=str)
+    except Exception as e:
+        print("Exception: " + str(e))
diff --git a/experimentations/get_lacoope_events.py b/experimentations/get_lacoope_events.py
index b98dddb..c76dedf 100755
--- a/experimentations/get_lacoope_events.py
+++ b/experimentations/get_lacoope_events.py
@@ -33,7 +33,7 @@ if __name__ == "__main__":
     url_human = "https://www.lacoope.org/concerts-calendrier/"
 
     try:
-        events = u2e.process(url, url_human, cache = "cache-lacoope.ical", default_values = {"category": "Concert", "location": "La Coopérative"}, published = True)
+        events = u2e.process(url, url_human, cache = "cache-lacoope.html", default_values = {"category": "Concert", "location": "La Coopérative"}, published = True)
 
         exportfile = "events-lacoope.json"
         print("Saving events to file {}".format(exportfile))
diff --git a/src/agenda_culturel/celery.py b/src/agenda_culturel/celery.py
index 16e72c7..386d7ab 100644
--- a/src/agenda_culturel/celery.py
+++ b/src/agenda_culturel/celery.py
@@ -103,6 +103,8 @@ def run_recurrent_import(self, pk):
         extractor = ICALNoVCExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.LACOOPE:
         extractor = LaCoopeExtractor()
+    elif rimport.processor == RecurrentImport.PROCESSOR.LACOMEDIE:
+        extractor = LaComedieExtractor()
     else:
         extractor = None
 
diff --git a/src/agenda_culturel/import_tasks/custom_extractors.py b/src/agenda_culturel/import_tasks/custom_extractors.py
index 473d83a..6b10dc4 100644
--- a/src/agenda_culturel/import_tasks/custom_extractors.py
+++ b/src/agenda_culturel/import_tasks/custom_extractors.py
@@ -61,4 +61,69 @@ class LaCoopeExtractor(TwoStepsExtractor):
         location = LaCoopeExtractor.nom_lieu
         url_human = event_url
 
-        self.add_event_with_props(event_url, title, category, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
\ No newline at end of file
+        self.add_event_with_props(event_url, title, category, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
+
+
+# A class dedicated to getting events from La Comédie de Clermont:
+# URL: https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php?action=load_dates_existantes
+# Human-readable URL: https://lacomediedeclermont.com/saison23-24/
+class LaComedieExtractor(TwoStepsExtractor):
+
+    nom_lieu = "La Comédie de Clermont"
+
+    def category_comedie2agenda(self, category):
+        mapping = { "Théâtre": "Théâtre", "Danse": "Danse", "Rencontre": "Autre", "Sortie de résidence": "Autre", "PopCorn Live": "Autre"}
+        if category in mapping:
+            return mapping[category]
+        else:
+            return None
+
+
+
+    def build_event_url_list(self, content):
+        self.event_urls = []
+        dates = json5.loads(content)["data"][0]
+
+        url = self.url.split("?")[0]
+        for d in dates:
+            if not self.only_future or self.now <= datetime.date.fromisoformat(d):
+                events = self.downloader.get_content(url, post={'action': "load_evenements_jour", "jour": d})
+                if events:
+                    events = json5.loads(events)
+                    if "data" in events:
+                        events = events["data"][0]
+                        soup = BeautifulSoup(events, "html.parser")
+                        events = soup.select("div.unedatedev")
+                        for e in events:
+                            e_url = e.select('a')[0]["href"] + "#" + d  # a "fake" url specific for each day of this show
+                            self.event_urls.append(e_url)
+                            self.add_event_start_day(e_url, d)
+                            t = str(e.select('div#datecal')[0]).split(' ')[-1].split('<')[0]
+                            self.add_event_start_time(e_url, t)
+                            title = e.select('a')[0].contents[0]
+                            self.add_event_title(e_url, title)
+                            category = e.select("div#lieuevtcal span")
+                            if len(category) > 0:
+                                category = self.category_comedie2agenda(category[-1].contents[0])
+                                if category is not None:
+                                    self.add_event_category(e_url, category)
+                            location = e.select("div#lieuevtcal")[0].contents[-1].split("•")[-1]
+                            self.add_event_location(e_url, location)
+
+
+
+
+    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
+        soup = BeautifulSoup(event_content, "html.parser")
+
+        image = soup.select("#imgspec img")
+        if image:
+            image = image[0]["src"]
+        else:
+            image = None
+
+        description = soup.select("#descspec")[0].get_text().replace("Lire plus...", "")
+
+        url_human = event_url
+
+        self.add_event_with_props(event_url, None, None, None, None, description, [], recurrences=None, uuid=event_url, url_human=url_human, published=published, image=image)
diff --git a/src/agenda_culturel/import_tasks/downloader.py b/src/agenda_culturel/import_tasks/downloader.py
index d5333a9..785e4f3 100644
---
a/src/agenda_culturel/import_tasks/downloader.py +++ b/src/agenda_culturel/import_tasks/downloader.py @@ -1,4 +1,4 @@ -from urllib.parse import urlparse +from urllib.parse import urlparse, urlencode import urllib.request import os from selenium import webdriver @@ -13,16 +13,16 @@ class Downloader(ABC): pass @abstractmethod - def download(self, url): + def download(self, url, post=None): pass - def get_content(self, url, cache = None): + def get_content(self, url, cache = None, post = None): if cache and os.path.exists(cache): print("Loading cache ({})".format(cache)) with open(cache) as f: content = "\n".join(f.readlines()) else: - content = self.download(url) + content = self.download(url, post) if cache: print("Saving cache ({})".format(cache)) @@ -40,14 +40,19 @@ class SimpleDownloader(Downloader): super().__init__() - def download(self, url): + def download(self, url, post=None): print("Downloading {}".format(url)) try: - resource = urllib.request.urlopen(url) + if post: + post_args = urlencode(post).encode() + resource = urllib.request.urlopen(url, post_args) + else: + resource = urllib.request.urlopen(url) data = resource.read().decode(resource.headers.get_content_charset()) return data - except: + except Exception as e: + print(e) return None @@ -63,7 +68,9 @@ class ChromiumHeadlessDownloader(Downloader): self.service = Service("/usr/bin/chromedriver") - def download(self, url): + def download(self, url, post=None): + if post: + raise Exception('POST method with Chromium headless not yet implemented') print("Download {}".format(url)) self.driver = webdriver.Chrome(service=self.service, options=self.options) diff --git a/src/agenda_culturel/import_tasks/generic_extractors.py b/src/agenda_culturel/import_tasks/generic_extractors.py index 0a8182e..d3b5c81 100644 --- a/src/agenda_culturel/import_tasks/generic_extractors.py +++ b/src/agenda_culturel/import_tasks/generic_extractors.py @@ -2,6 +2,7 @@ from abc import abstractmethod from urllib.parse import urlparse from urllib.parse import parse_qs + from .extractor import * from django.utils.translation import gettext_lazy as _ from dateutil import parser @@ -56,6 +57,21 @@ class TwoStepsExtractor(Extractor): def clean_url(url): return url + def add_event_start_day(self, url, start_day): + if not url in self.event_properties: + self.event_properties[url] = {} + self.event_properties[url]["start_day"] = start_day + + def add_event_start_time(self, url, start_time): + if not url in self.event_properties: + self.event_properties[url] = {} + self.event_properties[url]["start_time"] = start_time + + def add_event_title(self, url, title): + if not url in self.event_properties: + self.event_properties[url] = {} + self.event_properties[url]["title"] = title + def add_event_tag(self, url, tag): if not url in self.event_properties: self.event_properties[url] = {} @@ -63,11 +79,32 @@ class TwoStepsExtractor(Extractor): self.event_properties[url]["tags"] = [] self.event_properties[url]["tags"].append(tag) + def add_event_category(self, url, cat): + if not url in self.event_properties: + self.event_properties[url] = {} + self.event_properties[url]["category"] = cat + + def add_event_location(self, url, loc): + if not url in self.event_properties: + self.event_properties[url] = {} + self.event_properties[url]["location"] = loc + def add_event_with_props(self, event_url, title, category, start_day, location, description, tags, uuid, recurrences=None, url_human=None, start_time=None, end_day=None, end_time=None, last_modified=None, published=False, 
image=None, image_alt=None): - if event_url in self.event_properties and 'tags' in self.event_properties[event_url]: - tags = tags + self.event_properties[event_url]['tags'] - + if event_url in self.event_properties: + if 'tags' in self.event_properties[event_url]: + tags = tags + self.event_properties[event_url]['tags'] + if 'start_day' in self.event_properties[event_url]: + start_day = self.event_properties[event_url]['start_day'] + if 'start_time' in self.event_properties[event_url]: + start_time = self.event_properties[event_url]['start_time'] + if 'title' in self.event_properties[event_url]: + title = self.event_properties[event_url]['title'] + if 'category' in self.event_properties[event_url]: + category = self.event_properties[event_url]['category'] + if 'location' in self.event_properties[event_url]: + location = self.event_properties[event_url]['location'] + self.add_event(title, category, start_day, location, description, tags, uuid, recurrences, url_human, start_time, end_day, end_time, last_modified, published, image, image_alt) @@ -80,10 +117,13 @@ class TwoStepsExtractor(Extractor): pass - def extract(self, content, url, url_human = None, default_values = None, published = False): + def extract(self, content, url, url_human = None, default_values = None, published = False, only_future=True): + self.only_future = only_future + self.now = datetime.datetime.now().date() self.set_header(url) self.clear_events() + self.url = url self.event_urls = None self.event_properties.clear() diff --git a/src/agenda_culturel/migrations/0050_alter_recurrentimport_processor.py b/src/agenda_culturel/migrations/0050_alter_recurrentimport_processor.py new file mode 100644 index 0000000..248e4e9 --- /dev/null +++ b/src/agenda_culturel/migrations/0050_alter_recurrentimport_processor.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.7 on 2024-04-19 21:44 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('agenda_culturel', '0049_alter_recurrentimport_processor'), + ] + + operations = [ + migrations.AlterField( + model_name='recurrentimport', + name='processor', + field=models.CharField(choices=[('ical', 'ical'), ('icalnobusy', 'ical no busy'), ('icalnovc', 'ical no VC'), ('lacoope', 'lacoope.org'), ('lacomedie', 'la comédie')], default='ical', max_length=20, verbose_name='Processor'), + ), + ] diff --git a/src/agenda_culturel/models.py b/src/agenda_culturel/models.py index bbd3677..94493b6 100644 --- a/src/agenda_culturel/models.py +++ b/src/agenda_culturel/models.py @@ -604,7 +604,10 @@ class Event(models.Model): # for each event, check if it's a new one, or a one to be updated for event in events: sdate = date.fromisoformat(event.start_day) - edate = date.fromisoformat(event.end_day) + if event.end_day: + edate = date.fromisoformat(event.end_day) + else: + edate = sdate if min_date is None or min_date > sdate: min_date = sdate if max_date is None or max_date < sdate: @@ -755,6 +758,7 @@ class RecurrentImport(models.Model): ICALNOBUSY = "icalnobusy", _("ical no busy") ICALNOVC = "icalnovc", _("ical no VC") LACOOPE = "lacoope", _('lacoope.org') + LACOMEDIE = "lacomedie", _('la comédie') class DOWNLOADER(models.TextChoices): SIMPLE = "simple", _("simple")
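The POST support added to SimpleDownloader above is what lets LaComedieExtractor query the admin-ajax.php endpoint once per date returned by load_dates_existantes. A minimal standalone sketch of the same request pattern follows; the date value is only a placeholder (real values come from the load_dates_existantes response):

    from urllib.parse import urlencode
    import urllib.request

    # Endpoint and form fields mirror LaComedieExtractor.build_event_url_list()
    url = "https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php"
    post = {"action": "load_evenements_jour", "jour": "2024-04-20"}  # placeholder date

    # Passing a request body to urlopen switches it to POST, which is what
    # SimpleDownloader.download(url, post=...) now does internally.
    data = urlencode(post).encode()
    with urllib.request.urlopen(url, data) as resource:
        charset = resource.headers.get_content_charset() or "utf-8"
        print(resource.read().decode(charset)[:200])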