diff --git a/experimentations/get_milleformes_events.py b/experimentations/get_milleformes_events.py
new file mode 100755
index 0000000..c9a7264
--- /dev/null
+++ b/experimentations/get_milleformes_events.py
@@ -0,0 +1,42 @@
+#!/usr/bin/python3
+# coding: utf-8
+
+import os
+import json
+import sys
+
+# Directory that contains this script.
+current = os.path.dirname(os.path.realpath(__file__))
+
+# Parent of that directory.
+parent = os.path.dirname(current)
+
+# The "src.agenda_culturel" imports below need the parent directory (and its
+# "src" subdirectory) on sys.path when the script is run directly.
+sys.path.append(parent)
+sys.path.append(parent + "/src")
+
+from src.agenda_culturel.import_tasks.downloader import *
+from src.agenda_culturel.import_tasks.extractor import *
+from src.agenda_culturel.import_tasks.importer import *
+from src.agenda_culturel.import_tasks.custom_extractors import *
+
+
+if __name__ == "__main__":
+
+    u2e = URL2Events(SimpleDownloader(), mille_formes.CExtractor())
+    url = "https://www.milleformes.fr/programme"
+    url_human = url
+
+    try:
+        events = u2e.process(url, url_human, cache="cache-1000formes.html", default_values={}, published=True)
+
+        exportfile = "events-1000formes.json"
+        print("Saving events to file {}".format(exportfile))
+        # JSON is UTF-8 by specification; do not depend on the locale encoding
+        with open(exportfile, "w", encoding="utf-8") as f:
+            json.dump(events, f, indent=4, default=str)
+    except Exception as e:
+        # top-level boundary of a command-line script: report and exit non-zero
+        print("Exception: " + str(e), file=sys.stderr)
+        sys.exit(1)
diff --git a/src/agenda_culturel/celery.py b/src/agenda_culturel/celery.py
index 90f0839..2c7090e 100644
--- a/src/agenda_culturel/celery.py
+++ b/src/agenda_culturel/celery.py
@@ -156,6 +156,8 @@ def run_recurrent_import_internal(rimport, downloader, req_id):
         extractor = apidae_tourisme.CExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.IGUANA:
         extractor = iguana_agenda.CExtractor()
+    elif rimport.processor == RecurrentImport.PROCESSOR.MILLEFORMES:
+        extractor = mille_formes.CExtractor()
     else:
         extractor = None
 
diff --git a/src/agenda_culturel/import_tasks/custom_extractors/mille_formes.py b/src/agenda_culturel/import_tasks/custom_extractors/mille_formes.py
new file mode 100644
index 0000000..3859cb7
--- /dev/null
+++ b/src/agenda_culturel/import_tasks/custom_extractors/mille_formes.py
@@ -0,0 +1,191 @@
+from ..generic_extractors import *
+from bs4 import BeautifulSoup
+from datetime import datetime, date
+from urllib.parse import urljoin
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+# A class dedicated to get events from Mille formes
+# URL: https://www.milleformes.fr/programme
+class CExtractor(TwoStepsExtractorNoPause):
+    """Two-step extractor for the Mille formes programme.
+
+    ``build_event_url_list`` collects the event links of the programme page
+    and ``add_event_from_content`` turns one event page into one event per
+    day/hour occurrence found in its free-text date field.
+    """
+
+    def extract(
+        self,
+        content,
+        url,
+        url_human=None,
+        default_values=None,
+        published=False,
+        only_future=True,
+        ignore_404=True,
+    ):
+        """Record the site root and today's date, then run the generic extraction."""
+        # Relative links and image sources are resolved against this root.
+        self.root_address = "https://" + urlparse(url).netloc + "/"
+        # Reference day used to guess the year of dates written without one.
+        self.today = date.today()
+        return super().extract(content, url, url_human, default_values, published, only_future, ignore_404)
+
+    def parse_category(self, cat):
+        """Map the category label of an event page to an agenda category.
+
+        Matching is case-insensitive; labels that match nothing fall back to
+        'Sans catégorie'.
+        """
+        cat = cat.replace("\n", "").strip().lower()
+        if "exposition" in cat or "dispositif artistique interactif" in cat:
+            result = 'Visites & Expositions'
+        elif "atelier" in cat:
+            result = 'Animations & Ateliers'
+        elif cat in ["buffet"]:
+            result = 'Rendez-vous locaux'
+        elif "ciné" in cat:
+            result = 'Cinéma'
+        elif "concert" in cat:
+            result = 'Fêtes & Concerts'
+        elif "rencontre" in cat:
+            result = 'Rencontres & Débats'
+        elif "spectacle" in cat:
+            result = 'Spectacles'
+        else:
+            result = 'Sans catégorie'
+
+        return result
+
+    # this method is not perfect, but dates and hours are not structured
+    def parse_dates(self, date):
+        """Parse the free-text date field of an event page.
+
+        Returns a list of ``[day, hours]`` entries, one per date found. Every
+        hour is attached to all the days parsed before it. When the text
+        contains both "De" and " à ", ``True`` is appended to each entry to
+        flag its hours as a start/end pair.
+        """
+        result = []
+
+        for line in date.replace(' à ', '\n').split('\n'):
+            # only lines with a digit can hold a date or an hour
+            if not any(c.isdigit() for c in line):
+                continue
+            # split subparts
+            for part in line.replace(' et ', ', ').split(', '):
+                part = part.strip()
+                day = Extractor.parse_french_date(part, default_year_by_proximity=self.today)
+                if day is not None:
+                    result.append([day, []])
+                    continue
+                hour = Extractor.parse_french_time(part)
+                # a fragment that is neither a date nor an hour is ignored
+                if hour is None:
+                    continue
+                for entry in result:
+                    entry[1].append(hour)
+
+        if "De" in date and " à " in date:
+            for entry in result:
+                entry.append(True)
+
+        return result
+
+    def build_event_url_list(self, content, infuture_days=180):
+        """Register the URL of every event linked from the programme page."""
+        soup = BeautifulSoup(content, "html.parser")
+        for link in soup.select('.cell a.evenement'):
+            href = link.get("href")
+            if href:
+                # urljoin copes with absolute, root-relative and relative links
+                self.add_event_url(urljoin(self.root_address, href))
+
+    @staticmethod
+    def _node_text(soup, selector):
+        """Return the stripped text of the first node matching selector, or None."""
+        node = soup.select_one(selector)
+        return None if node is None else node.text.strip()
+
+    def add_event_from_content(
+        self,
+        event_content,
+        event_url,
+        url_human=None,
+        default_values=None,
+        published=False,
+    ):
+        """Parse one event page and register an event per dated occurrence.
+
+        Pages without a title, a date or a location are skipped with a
+        warning. Pages whose date text describes a period or a recurrence
+        (" au ", "Du", "Les", "en continu") are skipped as well.
+        """
+        soup = BeautifulSoup(event_content, "html.parser")
+
+        title = self._node_text(soup, 'h1')
+        when = self._node_text(soup, '.champ.date-libre')
+        location = self._node_text(soup, '.champ .taxo.espace')
+        if title is None or when is None or location is None:
+            logger.warning("Mille formes: unexpected page structure, skipping %s", event_url)
+            return
+        title = title.replace("\n", "").title()
+
+        img = soup.select_one('.slide img')
+        if img is None or not img.get("src"):
+            image = None
+            image_alt = ''
+        else:
+            image = urljoin(self.root_address, img["src"])
+            image_alt = img.get("alt", '')
+
+        category = self.parse_category(self._node_text(soup, '.champ.categorie') or '')
+
+        parts = [
+            self._node_text(soup, '.sous-titre'),
+            self._node_text(soup, '.texte-full'),
+            when,
+            self._node_text(soup, '.champ .infos'),
+        ]
+        description = '\n\n'.join(p for p in parts if p)
+
+        # periods, recurrences and permanent events are not imported
+        if " au " in when or when.startswith(("Du", "Les")) or when.lower() == "en continu":
+            return
+
+        for occurrence in self.parse_dates(when):
+            start_day, hours = occurrence[0], occurrence[1]
+            if len(occurrence) == 3 and len(hours) == 2:
+                # "De ... à ...": the two hours are the start and the end
+                slots = [(hours[0], hours[1])]
+            elif hours:
+                slots = [(hour, None) for hour in hours]
+            else:
+                slots = [(None, None)]
+
+            for start_time, end_time in slots:
+                uuid = event_url + "?date=" + str(start_day)
+                if start_time is not None:
+                    uuid += "&hour=" + str(start_time)
+                self.add_event_with_props(
+                    default_values,
+                    event_url,
+                    title,
+                    category,
+                    start_day,
+                    location,
+                    description,
+                    [],
+                    recurrences=None,
+                    uuids=[uuid],
+                    url_human=event_url,
+                    start_time=start_time,
+                    end_day=start_day,
+                    end_time=end_time,
+                    published=published,
+                    image=image,
+                    image_alt=image_alt,
+                )
diff --git a/src/agenda_culturel/import_tasks/extractor.py b/src/agenda_culturel/import_tasks/extractor.py
index 450a3cb..4efd504 100644
--- a/src/agenda_culturel/import_tasks/extractor.py
+++ b/src/agenda_culturel/import_tasks/extractor.py
@@ -54,7 +54,7 @@ class Extractor(ABC):
                 return i + 1
         return None
 
-    def parse_french_date(text, default_year=None):
+    def parse_french_date(text, default_year=None, default_year_by_proximity=None):
         # format NomJour Numero Mois Année
         m = re.search(
             "[a-zA-ZéÉûÛ:.]+[ ]*([0-9]+)[er]*[ ]*([a-zA-ZéÉûÛ:.]+)[ ]*([0-9]+)", text
@@ -92,13 +92,25 @@ class Extractor(ABC):
                 return None
         try:
             day = int(day)
-            year = int(year)
+            if not year is None:
+                year = int(year)
         except:
             return None
-        if year < 100:
-            year = 2000 + year
         if day >= 32:
             return None
+
+        # by proximity
+        if year is None and not default_year_by_proximity is None:
+            dates = [date(default_year_by_proximity.year + x, month, day) for x in [-1, 0, 1]]
+            dates = [(abs((d - default_year_by_proximity).days), d) for d in dates]
+            d = min(dates, key=lambda x: x[0])
+            return d[1]
+
+        if year is None:
+            return None
+
+        if year < 100:
+            year = 2000 + year
         return date(year, month, day)
 
     def parse_french_time(text):
diff --git a/src/agenda_culturel/migrations/0142_alter_recurrentimport_processor.py b/src/agenda_culturel/migrations/0142_alter_recurrentimport_processor.py
new file mode 100644
index 0000000..df78845
--- /dev/null
+++ b/src/agenda_culturel/migrations/0142_alter_recurrentimport_processor.py
@@ -0,0 +1,18 @@
+# Generated by Django 4.2.9 on 2025-02-02 14:18
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('agenda_culturel', '0141_alter_recurrentimport_processor'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='recurrentimport',
+            name='processor',
+            field=models.CharField(choices=[('ical', 'ical'), ('icalnobusy', 'ical no busy'), ('icalnovc', 'ical no VC'), ('lacoope', 'lacoope.org'), ('lacomedie', 'la comédie'), ('lefotomat', 'le fotomat'), ('lapucealoreille', "la puce à l'oreille"), ('Plugin wordpress MEC', 'Plugin wordpress MEC'), ('Facebook events', "Événements d'une page FB"), ('Billetterie CF', 'Billetterie Clermont-Ferrand'), ('arachnee', 'Arachnée concert'), ('rio', 'Le Rio'), ('raymonde', 'La Raymonde'), ('apidae', 'Agenda apidae tourisme'), ('iguana', 'Agenda iguana (médiathèques)'), ('Mille formes', 'Mille formes')], default='ical', max_length=20, verbose_name='Processor'),
+        ),
+    ]
diff --git a/src/agenda_culturel/models.py b/src/agenda_culturel/models.py
index 9018114..0938a0f 100644
--- a/src/agenda_culturel/models.py
+++ b/src/agenda_culturel/models.py
@@ -2104,6 +2104,7 @@ class RecurrentImport(models.Model):
         LARAYMONDE = "raymonde", _('La Raymonde')
         APIDAE = 'apidae', _('Agenda apidae tourisme')
         IGUANA = 'iguana', _('Agenda iguana (médiathèques)')
+        MILLEFORMES = 'Mille formes', _('Mille formes')
 
     class DOWNLOADER(models.TextChoices):
         SIMPLE = "simple", _("simple")