From aa878b8fb3e89499be66b9e9d2223f5e0bb98464 Mon Sep 17 00:00:00 2001 From: Jean-Marie Favreau Date: Sat, 20 Apr 2024 12:11:39 +0200 Subject: [PATCH] Ajout support le photomat --- experimentations/get_lephotomat.py | 43 +++++++++ src/agenda_culturel/celery.py | 2 + .../import_tasks/custom_extractors.py | 76 +++++++++++++++- src/agenda_culturel/import_tasks/extractor.py | 90 ++++++++++++++++++- .../import_tasks/generic_extractors.py | 7 +- .../0051_alter_recurrentimport_processor.py | 18 ++++ src/agenda_culturel/models.py | 1 + 7 files changed, 230 insertions(+), 7 deletions(-) create mode 100755 experimentations/get_lephotomat.py create mode 100644 src/agenda_culturel/migrations/0051_alter_recurrentimport_processor.py diff --git a/experimentations/get_lephotomat.py b/experimentations/get_lephotomat.py new file mode 100755 index 0000000..7a9c641 --- /dev/null +++ b/experimentations/get_lephotomat.py @@ -0,0 +1,43 @@ +#!/usr/bin/python3 +# coding: utf-8 + +import os +import json +import sys + +# getting the name of the directory +# where the this file is present. +current = os.path.dirname(os.path.realpath(__file__)) + +# Getting the parent directory name +# where the current directory is present. +parent = os.path.dirname(current) + +# adding the parent directory to +# the sys.path. +sys.path.append(parent) + +from src.agenda_culturel.import_tasks.downloader import * +from src.agenda_culturel.import_tasks.extractor import * +from src.agenda_culturel.import_tasks.importer import * +from src.agenda_culturel.import_tasks.custom_extractors import * + + + + + +if __name__ == "__main__": + + u2e = URL2Events(SimpleDownloader(), LePhotomatExtractor()) + url = "https://www.lefotomat.com/feed" + url_human = "https://www.lefotomat.com/" + + try: + events = u2e.process(url, url_human, cache = "cache-lephotomat.xml", default_values = {"location": "Le Photomat'"}, published = True) + + exportfile = "events-lephotomat.json" + print("Saving events to file {}".format(exportfile)) + with open(exportfile, "w") as f: + json.dump(events, f, indent=4, default=str) + except Exception as e: + print("Exception: " + str(e)) diff --git a/src/agenda_culturel/celery.py b/src/agenda_culturel/celery.py index 386d7ab..3e8986f 100644 --- a/src/agenda_culturel/celery.py +++ b/src/agenda_culturel/celery.py @@ -105,6 +105,8 @@ def run_recurrent_import(self, pk): extractor = LaCoopeExtractor() elif rimport.processor == RecurrentImport.PROCESSOR.LACOMEDIE: extractor = LaComedieExtractor() + elif rimport.processor == RecurrentImport.PROCESSOR.LEPHOTOMAT: + extractor = LePhotomatExtractor() else: extractor = None diff --git a/src/agenda_culturel/import_tasks/custom_extractors.py b/src/agenda_culturel/import_tasks/custom_extractors.py index e8e8031..95e01dc 100644 --- a/src/agenda_culturel/import_tasks/custom_extractors.py +++ b/src/agenda_culturel/import_tasks/custom_extractors.py @@ -2,7 +2,7 @@ from .generic_extractors import * import re import json5 - +from datetime import timedelta # A class dedicated to get events from La Coopérative de Mai: # URL: https://www.lacoope.org/concerts-calendrier/ @@ -19,8 +19,8 @@ class LaCoopeExtractor(TwoStepsExtractor): search = re.search(r"window.fullCalendarContent = (.*)", str(script), re.S) if search: data = json5.loads(search.group(1)) - self.event_urls = [e['url'] for e in data['events']] for e in data['events']: + self.add_event_url(e['url']) if e['tag'] == "Gratuit": self.add_event_tag(e['url'], 'gratuit') @@ -81,7 +81,6 @@ class LaComedieExtractor(TwoStepsExtractor): def build_event_url_list(self, content): - self.event_urls = [] dates = json5.loads(content)["data"][0] url = self.url.split("?")[0] @@ -96,7 +95,7 @@ class LaComedieExtractor(TwoStepsExtractor): events = soup.select("div.unedatedev") for e in events: e_url = e.select('a')[0]["href"] + "#" + d # a "fake" url specific for each day of this show - self.event_urls.append(e_url) + self.add_event_url(e_url) self.add_event_start_day(e_url, d) t = str(e.select('div#datecal')[0]).split(' ')[-1].split('<')[0] self.add_event_start_time(e_url, t) @@ -127,3 +126,72 @@ class LaComedieExtractor(TwoStepsExtractor): url_human = event_url self.add_event_with_props(event_url, None, None, None, None, description, [], recurrences=None, uuid=event_url, url_human=url_human, published=published, image=image) + + + + +# A class dedicated to get events from Le Photomat' +# URL: https://www.lefotomat.com/ +class LePhotomatExtractor(TwoStepsExtractor): + + nom_lieu = "Le Photomat'" + + def category_photomat2agenda(self, category): + if not category: + return None + mapping = { "Concerts": "Concert"} + if category in mapping: + return mapping[category] + else: + return None + + + def build_event_url_list(self, content): + soup = BeautifulSoup(content, "xml") + + events = soup.select("item") + for e in events: + e_url = e.find("link").contents[0] + self.add_event_url(e_url) + + title = e.find("title").contents[0] + self.add_event_title(e_url, title) + + category = self.category_photomat2agenda(e.find("category").contents[0]) + if category: + self.add_event_category(e_url, category) + + + + def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False): + soup = BeautifulSoup(event_content, "html.parser") + image = soup.select("div.post-content img.wp-post-image") + if image: + image = image[0]["src"] + else: + image = None + desc = soup.select("head meta[name=description]")[0]["content"] + start_day = self.parse_french_date(desc.split("-")[0]) + start_time = self.parse_french_time(desc.split("-")[1]) + end_time = self.parse_french_time(desc.split("-")[2]) + end_day = self.guess_end_day(start_day, start_time, end_time) + + location = self.nom_lieu + descriptions = soup.select("div.vce-col-content") + if descriptions: + descriptions = [d.get_text() for d in descriptions] + description = max(descriptions, key=len) + else: + description = None + + article = soup.select("article.post") + tags = [] + for c in article[0]["class"]: + if c.startswith("category-"): + tag = '-'.join(c.split("-")[1:]) + if tag != "concerts": + tags.append(tag) + + url_human = event_url + + self.add_event_with_props(event_url, None, None, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image) diff --git a/src/agenda_culturel/import_tasks/extractor.py b/src/agenda_culturel/import_tasks/extractor.py index 6da3936..546e739 100644 --- a/src/agenda_culturel/import_tasks/extractor.py +++ b/src/agenda_culturel/import_tasks/extractor.py @@ -1,8 +1,14 @@ from abc import ABC, abstractmethod from bs4 import BeautifulSoup -from datetime import datetime +from datetime import datetime, time, date, timedelta +import re +import unicodedata +def remove_accents(input_str): + nfkd_form = unicodedata.normalize('NFKD', input_str) + return u"".join([c for c in nfkd_form if not unicodedata.combining(c)]) + class Extractor(ABC): def __init__(self): @@ -10,6 +16,88 @@ class Extractor(ABC): self.events = [] self.downloader = None + def guess_end_day(self, start_day, start_time, end_time): + if end_time: + if end_time > start_time: + return start_day + else: + return start_day + timedelta(days=1) + + def guess_month(self, text): + mths = ["jan", "fe", "mar", "av", "mai", "juin", "juill", "ao", "sep", "oct", "nov", "dec"] + t = remove_accents(text).lower() + for i, m in enumerate(mths): + if t.startswith(m): + return i + 1 + return None + + def parse_french_date(self, text): + # format NomJour Numero Mois Année + m = re.search('[a-zA-Z:.]+[ ]*([0-9]+)[ ]*([a-zA-Z:.]+)[ ]*([0-9]+)', text) + if m: + day = m.group(1) + month = self.guess_month(m.group(2)) + year = m.group(3) + else: + # format Numero Mois Annee + m = re.search('([0-9]+)[ ]*([a-zA-Z:.]+)[ ]*([0-9]+)', text) + if m: + day = m.group(1) + month = self.guess_month(m.group(2)) + year = m.group(3) + else: + # TODO: consolider les cas non satisfaits + return None + + if month is None: + return None + try: + day = int(day) + year = int(year) + except: + return None + if year < 100: + year = 2000 + year + if day >= 32: + return None + return date(year, month, day) + + def parse_french_time(self, text): + # format heures minutes secondes + m = re.search('([0-9]+)[ a-zA-Z:.]+([0-9]+)[ a-zA-Z:.]+([0-9]+)', text) + if m: + h = m.group(1) + m = m.group(2) + s = m.group(3) + else: + # format heures minutes + m = re.search('([0-9]+)[ h:.]+([0-9]+)', text) + if m: + h = m.group(1) + m = m.group(2) + s = "0" + else: + # format heures + m = re.search('([0-9]+)[ h:.]', text) + if m: + h = m.group(1) + m = "0" + s = "0" + else: + return None + + try: + h = int(h) + m = int(m) + s = int(s) + except: + return None + if h >= 24 or m >= 60 or s >= 60: + return None + return time(h, m, s) + + + @abstractmethod def extract(self, content, url, url_human = None, default_values = None, published = False): pass diff --git a/src/agenda_culturel/import_tasks/generic_extractors.py b/src/agenda_culturel/import_tasks/generic_extractors.py index d3b5c81..8f3bda4 100644 --- a/src/agenda_culturel/import_tasks/generic_extractors.py +++ b/src/agenda_culturel/import_tasks/generic_extractors.py @@ -57,6 +57,9 @@ class TwoStepsExtractor(Extractor): def clean_url(url): return url + def add_event_url(self, url): + self.event_urls.append(url) + def add_event_start_day(self, url, start_day): if not url in self.event_properties: self.event_properties[url] = {} @@ -124,9 +127,9 @@ class TwoStepsExtractor(Extractor): self.clear_events() self.url = url - self.event_urls = None + self.event_urls = [] self.event_properties.clear() - + # first build the event list self.build_event_url_list(content) diff --git a/src/agenda_culturel/migrations/0051_alter_recurrentimport_processor.py b/src/agenda_culturel/migrations/0051_alter_recurrentimport_processor.py new file mode 100644 index 0000000..adc302b --- /dev/null +++ b/src/agenda_culturel/migrations/0051_alter_recurrentimport_processor.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.7 on 2024-04-20 10:03 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('agenda_culturel', '0050_alter_recurrentimport_processor'), + ] + + operations = [ + migrations.AlterField( + model_name='recurrentimport', + name='processor', + field=models.CharField(choices=[('ical', 'ical'), ('icalnobusy', 'ical no busy'), ('icalnovc', 'ical no VC'), ('lacoope', 'lacoope.org'), ('lacomedie', 'la comédie'), ('lephotomat', 'le photomat')], default='ical', max_length=20, verbose_name='Processor'), + ), + ] diff --git a/src/agenda_culturel/models.py b/src/agenda_culturel/models.py index 01d486c..19b2ec2 100644 --- a/src/agenda_culturel/models.py +++ b/src/agenda_culturel/models.py @@ -759,6 +759,7 @@ class RecurrentImport(models.Model): ICALNOVC = "icalnovc", _("ical no VC") LACOOPE = "lacoope", _('lacoope.org') LACOMEDIE = "lacomedie", _('la comédie') + LEPHOTOMAT = "lephotomat", _('le photomat') class DOWNLOADER(models.TextChoices): SIMPLE = "simple", _("simple")