#!/usr/bin/python3
# coding: utf-8

"""Experimentation script: import nature events of the Puy-de-Dôme.

Downloads the Apidae-tourisme widget backing
https://ens.puy-de-dome.fr/agenda.html, extracts its events with the
``apidae_tourisme`` custom extractor and dumps them to a JSON file.
"""

import json
import os
import sys
import traceback

# Add the repository root (and its src/ subdirectory) to sys.path so the
# agenda_culturel import machinery can be loaded from a plain checkout,
# without installing the package.
current = os.path.dirname(os.path.realpath(__file__))
parent = os.path.dirname(current)
sys.path.append(parent)
sys.path.append(parent + "/src")

from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *


if __name__ == "__main__":

    u2e = URL2Events(SimpleDownloader(), apidae_tourisme.CExtractor())
    url = "https://widgets.apidae-tourisme.com/filter.js?widget[id]=48"
    url_human = "https://ens.puy-de-dome.fr/agenda.html"

    try:
        events = u2e.process(
            url,
            url_human,
            cache="cache-puydedome.html",
            default_values={},
            published=True,
        )

        exportfile = "events-puydedome.json"
        print("Saving events to file {}".format(exportfile))
        with open(exportfile, "w") as f:
            # default=str deliberately stringifies dates and other
            # non-JSON-native values for this throwaway export.
            json.dump(events, f, indent=4, default=str)
    except Exception as e:
        # Report the failure on stderr WITH its traceback: the previous
        # behaviour (printing only str(e) on stdout) hid where the
        # extractor actually failed.
        print("Exception: " + str(e), file=sys.stderr)
        traceback.print_exc()
# A class dedicated to getting events from apidae-tourisme widgets.
class CExtractor(TwoStepsExtractorNoPause):
    """Two-step extractor for the Apidae-tourisme javascript widget.

    The widget is javascript that injects HTML via ``innerHTML = "..."``
    assignments; both steps therefore locate the relevant assignment
    line, unescape the string it assigns (``\\"``, ``\\n``, ``\\/``) and
    parse the resulting HTML with BeautifulSoup.
    """

    def build_event_url_list(self, content, infuture_days=180):
        """Register one "url" (the widget's data-w-href token) per result.

        Finds the line that fills ``.results_agenda``, unescapes the HTML
        it assigns, and adds the ``data-w-href`` of every result link.
        """
        for line in content.split("\n"):
            if line.startswith('wrapper.querySelector(".results_agenda").innerHTML = "'):
                # Keep everything after the `= "` (3rd quote), then unescape.
                html = ('"'.join(line.split('"')[3:])).replace('\\"', '"').replace('\\n', "\n").replace('\\/', '/')
                soup = BeautifulSoup(html, "html.parser")
                for link in soup.select("a.widgit_result"):
                    self.add_event_url(link["data-w-href"])
                break

    def add_event_from_content(
        self,
        event_content,
        event_url,
        url_human=None,
        default_values=None,
        published=False,
    ):
        """Parse one event detail page (javascript) and register the event.

        Extracts title, image, description, date/hours and address from
        the HTML assigned to ``detailsWrapper.innerHTML``, then calls
        ``add_event_with_props``. Only the first matching assignment is
        processed.
        """
        # Fragment identifier of the event (set by the widget through
        # window.location.hash). Default to "" instead of raising a
        # NameError further down when the line is absent.
        ref = ""
        for line in event_content.split("\n"):
            if line.strip().startswith("window.location.hash"):
                ref = line.split('"')[1]
                break

        for line in event_content.split("\n"):
            if not line.startswith('detailsWrapper.innerHTML ='):
                continue
            html = ('"'.join(line.split('"')[1:])).replace('\\"', '"').replace('\\n', "\n").replace('\\/', '/')
            soup = BeautifulSoup(html, "html.parser")

            title_tag = soup.select_one('h2.widgit_title')
            title = title_tag.text.strip() if title_tag else None

            # Some events have no illustration: guard against a missing <img>
            # instead of crashing on the subscript.
            image_tag = soup.select_one('img')
            image = image_tag["src"] if image_tag else None
            image_alt = image_tag["alt"] if image_tag else None

            description = '\n'.join(d.text for d in soup.select('div.desc'))

            # First line of the opening hours, e.g.
            # "samedi 18 janvier de 14h à 16h".
            openings = soup.select_one('.openings .mts').text.strip().split("\n")[0]
            start_time = None
            end_time = None
            if "tous les" in openings:
                # Recurring events ("tous les ...") carry no single start day.
                start_day = None
            else:
                start_day = Extractor.parse_french_date(openings)
                # NOTE(review): splitting on the substring "de" is fragile if
                # it appears inside a word — confirm against real widget data.
                details = openings.split("de")
                if len(details) > 1:
                    hours = details[1].split("à")
                    start_time = Extractor.parse_french_time(hours[0])
                    if len(hours) > 1:
                        end_time = Extractor.parse_french_time(hours[1])

            # The address is the run of <p> elements between the
            # <h2>Adresse</h2> heading and the next <h2>.
            location = []
            contact = soup.select_one(".contact")
            if contact is not None:
                in_address = False
                for child in contact.children:
                    if child.name == 'h2':
                        if child.text.strip() == "Adresse":
                            in_address = True
                        elif in_address:
                            break
                    elif child.name == 'p' and in_address:
                        text = child.text.strip()
                        if text != "":
                            location.append(text)
            location = ', '.join(location)

            # Point humans at the agenda page anchored on this event.
            event_url = url_human + "#" + ref

            self.add_event_with_props(
                default_values,
                event_url,
                title,
                None,
                start_day,
                location,
                description,
                [],
                recurrences=None,
                uuids=[event_url],
                url_human=event_url,
                start_time=start_time,
                end_day=start_day,
                end_time=end_time,
                published=published,
                image=image,
                image_alt=image_alt,
            )
            return