Ajout Mille formes

Fix #274
2025-02-02 14:21:03 +01:00 · 2025-02-02 14:21:03 +01:00 · 2fe2611788
commit 2fe2611788
parent 55a0094e2f
6 changed files with 274 additions and 4 deletions
--- a/experimentations/get_milleformes_events.py
+++ b/experimentations/get_milleformes_events.py
@ -0,0 +1,44 @@
+#!/usr/bin/python3
+# coding: utf-8
+
+import os
+import json
+import sys
+
+# Get the name of the directory
+# where this file is located.
+current = os.path.dirname(os.path.realpath(__file__))
+
+# Getting the parent directory name
+# where the current directory is present.
+parent = os.path.dirname(current)
+
+# adding the parent directory to
+# the sys.path.
+sys.path.append(parent)
+sys.path.append(parent + "/src")
+
+from src.agenda_culturel.import_tasks.downloader import *
+from src.agenda_culturel.import_tasks.extractor import *
+from src.agenda_culturel.import_tasks.importer import *
+from src.agenda_culturel.import_tasks.custom_extractors import *
+
+
+
+
+
+if __name__ == "__main__":
+    # The programme page is both the page that is scraped and the page
+    # presented as the human-readable source.
+    programme_url = "https://www.milleformes.fr/programme"
+    pipeline = URL2Events(SimpleDownloader(), mille_formes.CExtractor())
+
+    try:
+        extracted = pipeline.process(
+            programme_url,
+            programme_url,
+            cache="cache-1000formes.html",
+            default_values={},
+            published=True,
+        )
+
+        output_path = "events-1000formes.json"
+        print(f"Saving events to file {output_path}")
+        with open(output_path, "w") as out:
+            # default=str: values json cannot encode natively are written as strings
+            json.dump(extracted, out, indent=4, default=str)
+    except Exception as err:
+        print("Exception: " + str(err))
--- a/src/agenda_culturel/celery.py
+++ b/src/agenda_culturel/celery.py
@ -156,6 +156,8 @@ def run_recurrent_import_internal(rimport, downloader, req_id):
        extractor = apidae_tourisme.CExtractor()
    elif rimport.processor == RecurrentImport.PROCESSOR.IGUANA:
        extractor = iguana_agenda.CExtractor()
+    elif rimport.processor == RecurrentImport.PROCESSOR.MILLEFORMES:
+        extractor = mille_formes.CExtractor()
    else:
        extractor = None

--- a/src/agenda_culturel/import_tasks/custom_extractors/mille_formes.py
+++ b/src/agenda_culturel/import_tasks/custom_extractors/mille_formes.py
@ -0,0 +1,193 @@
+from ..generic_extractors import *
+from bs4 import BeautifulSoup
+from datetime import datetime, date
+from urllib.parse import urljoin
+
+# A class dedicated to extracting events from Mille formes
+# URL: https://www.milleformes.fr/programme
+class CExtractor(TwoStepsExtractorNoPause):
+
+    def extract(
+        self,
+        content,
+        url,
+        url_human=None,
+        default_values=None,
+        published=False,
+        only_future=True,
+        ignore_404=True):
+        """Record the site root and today's date, then run the inherited extraction.
+
+        ``root_address`` ("https://<host>/", derived from ``url``) is stored on
+        the instance and used by the other methods to turn the links and image
+        sources found in the pages into full URLs. ``today`` is the reference
+        date handed to the French date parser for dates without a year.
+
+        All arguments are forwarded unchanged, in order, to the parent
+        ``extract`` and its result is returned.
+        """
+        host = urlparse(url).netloc
+        self.root_address = f"https://{host}/"
+        self.today = date.today()
+        return super().extract(
+            content,
+            url,
+            url_human,
+            default_values,
+            published,
+            only_future,
+            ignore_404,
+        )
+
+
+    def parse_category(self, cat):
+        """Map the category label of a Mille formes page to an agenda category.
+
+        Line breaks and surrounding whitespace are removed and the label is
+        lower-cased before matching, so that a capitalised label such as
+        "Atelier" is recognised as well. The first matching rule wins.
+
+        Args:
+            cat: raw text of the category element of the event page.
+
+        Returns:
+            The agenda category name, or 'Sans catégorie' when no rule matches.
+        """
+        label = cat.replace("\n", "").strip().lower()
+
+        if "exposition" in label or "dispositif artistique interactif" in label:
+            return 'Visites & Expositions'
+        if "atelier" in label:
+            return 'Animations & Ateliers'
+        # "buffet" is matched exactly (not as a substring), as in the original rule
+        if label == "buffet":
+            return 'Rendez-vous locaux'
+        if "ciné" in label:
+            return 'Cinéma'
+        if "concert" in label:
+            return 'Fêtes & Concerts'
+        if "rencontre" in label:
+            return 'Rencontres & Débats'
+        if "spectacle" in label:
+            return 'Spectacles'
+        return 'Sans catégorie'
+
+    # this method is not perfect, but dates and hours are not structured
+    def parse_dates(self, date):
+        """Parse the free-text date block of an event page.
+
+        The text is split on line breaks and on " à "; only pieces containing
+        a digit are considered, and each of them is split again on ", " and
+        " et ". A fragment that parses as a French date (the year is chosen
+        by proximity to ``self.today``) opens a new entry; any other fragment
+        is parsed as a time and attached to every entry collected so far.
+
+        Args:
+            date: raw text of the date element.
+
+        Returns:
+            A list of ``[day, times]`` entries, ``times`` being the list of
+            parsed times for that day. When the text contains both "De" and
+            " à ", ``True`` is appended to every entry to flag its times as
+            the bounds of a time range.
+        """
+        entries = []
+
+        for line in date.replace(' à ', '\n').split('\n'):
+            # only lines with a digit can hold a date or an hour
+            if not any(c.isdigit() for c in line):
+                continue
+            for fragment in line.replace(' et ', ', ').split(', '):
+                fragment = fragment.strip()
+                day = Extractor.parse_french_date(fragment, default_year_by_proximity=self.today)
+                if day is not None:
+                    entries.append([day, []])
+                    continue
+                hour = Extractor.parse_french_time(fragment)
+                if hour is None:
+                    # NOTE(review): assumes parse_french_time returns None for
+                    # text it cannot read. Such a fragment is neither a date
+                    # nor a time and must not be recorded as a start time.
+                    continue
+                for entry in entries:
+                    entry[1].append(hour)
+
+        if "De" in date and " à " in date:
+            for entry in entries:
+                entry.append(True)
+
+        return entries
+
+    def build_event_url_list(self, content, infuture_days=180):
+        """Register the URL of every event linked from the programme page.
+
+        Args:
+            content: HTML of the programme page.
+            infuture_days: not used by this extractor.
+
+        Each ``a.evenement`` link found inside a ``.cell`` element is resolved
+        against ``self.root_address`` and passed to ``add_event_url``.
+        """
+        soup = BeautifulSoup(content, "html.parser")
+        for link in soup.select('.cell a.evenement'):
+            href = link.get("href")
+            if not href:
+                # an anchor without a target cannot be followed
+                continue
+            # root_address ends with "/": urljoin avoids the "//" that plain
+            # concatenation gives for root-relative hrefs, and leaves
+            # absolute hrefs untouched
+            self.add_event_url(urljoin(self.root_address, href))
+
+
+    def add_event_from_content(
+        self,
+        event_content,
+        event_url,
+        url_human=None,
+        default_values=None,
+        published=False,
+    ):
+        """Create the agenda event(s) described by a single event page.
+
+        The title, optional image, optional subtitle, description, optional
+        extra information, location, category and free-text date are read
+        from the page with CSS selectors. The event description is the
+        subtitle, description, date text and extra information joined by
+        blank lines.
+
+        Pages whose date text contains " au ", starts with "Du" or "Les", or
+        is "en continu" are skipped (presumably periods and recurring
+        schedules, which this extractor does not handle).
+
+        For every day returned by ``parse_dates``:
+          * a day flagged as a range with exactly two times yields one event
+            running from the first time to the second;
+          * otherwise one event is created per listed time, or a single event
+            without a time when none is listed.
+
+        ``event_url`` is used both as the event URL and as its human-facing
+        URL; the ``url_human`` argument is not used.
+
+        Raises:
+            AttributeError: when the page lacks one of the mandatory elements
+                (``h1``, ``.texte-full``, ``.champ .taxo.espace``,
+                ``.champ.categorie``, ``.champ.date-libre``).
+        """
+        soup = BeautifulSoup(event_content, "html.parser")
+        title = soup.select_one('h1').text.replace("\n", "").strip().title()
+
+        image = soup.select_one('.slide img')
+        if image is None:
+            image_alt = ''
+        else:
+            # a missing alt attribute must not abort the import
+            image_alt = image.get("alt", '')
+            # urljoin copes with relative, root-relative and absolute sources
+            image = urljoin(self.root_address, image["src"])
+
+        soustitre = soup.select_one('.sous-titre')
+        if soustitre is not None:
+            soustitre = soustitre.text.strip()
+
+        description = soup.select_one('.texte-full').text.strip()
+        infos = soup.select_one('.champ .infos')
+        if infos is not None:
+            infos = infos.text
+
+        location = soup.select_one('.champ .taxo.espace').text.strip()
+        category = self.parse_category(soup.select_one('.champ.categorie').text)
+        date_text = soup.select_one('.champ.date-libre').text
+
+        if (
+            " au " in date_text
+            or date_text.startswith(("Du", "Les"))
+            or date_text.lower().strip() == "en continu"
+        ):
+            return
+
+        description = '\n\n'.join(
+            x for x in [soustitre, description, date_text, infos] if x is not None
+        )
+
+        for d in self.parse_dates(date_text):
+            if len(d) < 2:
+                continue
+            start_day, times = d[0], d[1]
+
+            # Each slot is a (start_time, end_time) pair giving one event.
+            if len(d) == 3 and len(times) == 2:
+                # flagged as a range: the two times are its bounds
+                slots = [(times[0], times[1])]
+            elif times:
+                slots = [(t, None) for t in times]
+            else:
+                slots = [(None, None)]
+
+            for start_time, end_time in slots:
+                # the uuid distinguishes the occurrences of the same page
+                uuid = event_url + "?date=" + str(start_day)
+                if times:
+                    uuid += "&hour=" + str(start_time)
+                self.add_event_with_props(
+                    default_values,
+                    event_url,
+                    title,
+                    category,
+                    start_day,
+                    location,
+                    description,
+                    [],
+                    recurrences=None,
+                    uuids=[uuid],
+                    url_human=event_url,
+                    start_time=start_time,
+                    end_day=start_day,
+                    end_time=end_time,
+                    published=published,
+                    image=image,
+                    image_alt=image_alt,
+                )
+
--- a/src/agenda_culturel/import_tasks/extractor.py
+++ b/src/agenda_culturel/import_tasks/extractor.py
@ -54,7 +54,7 @@ class Extractor(ABC):
                return i + 1
        return None

-    def parse_french_date(text, default_year=None):
+    def parse_french_date(text, default_year=None, default_year_by_proximity=None):
        # format NomJour Numero Mois Année
        m = re.search(
            "[a-zA-ZéÉûÛ:.]+[ ]*([0-9]+)[er]*[ ]*([a-zA-ZéÉûÛ:.]+)[ ]*([0-9]+)", text
@ -92,13 +92,25 @@ class Extractor(ABC):
            return None
        try:
            day = int(day)
-            year = int(year)
+            if not year is None:
+                year = int(year)
        except:
            return None
-        if year < 100:
-            year = 2000 + year
        if day >= 32:
            return None
+
+        # by proximity
+        if year is None and not default_year_by_proximity is None:
+            dates = [date(default_year_by_proximity.year + x, month, day) for x in [-1, 0, 1]]
+            dates = [(abs((d - default_year_by_proximity).days), d) for d in dates]
+            d = min(dates, key=lambda x: x[0])
+            return d[1]
+
+        if year is None:
+            return None
+
+        if year < 100:
+            year = 2000 + year
        return date(year, month, day)

    def parse_french_time(text):
--- a/src/agenda_culturel/migrations/0142_alter_recurrentimport_processor.py
+++ b/src/agenda_culturel/migrations/0142_alter_recurrentimport_processor.py
@ -0,0 +1,18 @@
+# Generated by Django 4.2.9 on 2025-02-02 14:18
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    """Set the choices of ``RecurrentImport.processor`` to the list below,
+    which includes the 'Mille formes' processor."""
+
+    dependencies = [
+        ('agenda_culturel', '0141_alter_recurrentimport_processor'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='recurrentimport',
+            name='processor',
+            field=models.CharField(
+                choices=[
+                    ('ical', 'ical'),
+                    ('icalnobusy', 'ical no busy'),
+                    ('icalnovc', 'ical no VC'),
+                    ('lacoope', 'lacoope.org'),
+                    ('lacomedie', 'la comédie'),
+                    ('lefotomat', 'le fotomat'),
+                    ('lapucealoreille', "la puce à l'oreille"),
+                    ('Plugin wordpress MEC', 'Plugin wordpress MEC'),
+                    ('Facebook events', "Événements d'une page FB"),
+                    ('Billetterie CF', 'Billetterie Clermont-Ferrand'),
+                    ('arachnee', 'Arachnée concert'),
+                    ('rio', 'Le Rio'),
+                    ('raymonde', 'La Raymonde'),
+                    ('apidae', 'Agenda apidae tourisme'),
+                    ('iguana', 'Agenda iguana (médiathèques)'),
+                    ('Mille formes', 'Mille formes'),
+                ],
+                default='ical',
+                max_length=20,
+                verbose_name='Processor',
+            ),
+        ),
+    ]
--- a/src/agenda_culturel/models.py
+++ b/src/agenda_culturel/models.py
@ -2104,6 +2104,7 @@ class RecurrentImport(models.Model):
        LARAYMONDE = "raymonde", _('La Raymonde')
        APIDAE = 'apidae', _('Agenda apidae tourisme')
        IGUANA = 'iguana', _('Agenda iguana (médiathèques)')
+        MILLEFORMES = 'Mille formes', _('Mille formes')

    class DOWNLOADER(models.TextChoices):
        SIMPLE = "simple", _("simple")