Import of nature events from Puy-de-Dôme
This commit is contained in:
parent 20040268e7
commit 280f04d22f
44 experimentations/get_puydedome.py Executable file
@ -0,0 +1,44 @@
#!/usr/bin/python3
# coding: utf-8

import os
import json
import sys

# getting the name of the directory
# where this file is present.
current = os.path.dirname(os.path.realpath(__file__))

# Getting the parent directory name
# where the current directory is present.
parent = os.path.dirname(current)

# adding the parent directory to
# the sys.path.
sys.path.append(parent)
sys.path.append(parent + "/src")

from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *


if __name__ == "__main__":
    u2e = URL2Events(SimpleDownloader(), apidae_tourisme.CExtractor())
    url = "https://widgets.apidae-tourisme.com/filter.js?widget[id]=48"
    url_human = "https://ens.puy-de-dome.fr/agenda.html"

    try:
        events = u2e.process(url, url_human, cache="cache-puydedome.html", default_values={}, published=True)

        exportfile = "events-puydedome.json"
        print("Saving events to file {}".format(exportfile))
        with open(exportfile, "w") as f:
            json.dump(events, f, indent=4, default=str)
    except Exception as e:
        print("Exception: " + str(e))
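As a quick check after running the script above, the exported file can be reloaded as plain JSON. A minimal sketch; the per-event structure depends on URL2Events and is not assumed here:

```python
import json

# Reload the file written by get_puydedome.py and count its entries.
with open("events-puydedome.json") as f:
    events = json.load(f)
print("{} events in export".format(len(events)))
```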
@ -152,6 +152,8 @@ def run_recurrent_import_internal(rimport, downloader, req_id):
        extractor = lerio.CExtractor()
    elif rimport.processor == RecurrentImport.PROCESSOR.LARAYMONDE:
        extractor = laraymonde.CExtractor()
    elif rimport.processor == RecurrentImport.PROCESSOR.APIDAE:
        extractor = apidae_tourisme.CExtractor()
    else:
        extractor = None
@ -0,0 +1,103 @@
from ..generic_extractors import *
from bs4 import BeautifulSoup
from datetime import datetime


# A class dedicated to getting events from apidae-tourisme widgets
class CExtractor(TwoStepsExtractorNoPause):

    def build_event_url_list(self, content, infuture_days=180):
        # Get the line starting with
        # wrapper.querySelector(".results_agenda").innerHTML = "
        # keep everything after the third quote,
        # undo the JavaScript string escapes,
        # and parse the contained html
        for line in content.split("\n"):
            if line.startswith('wrapper.querySelector(".results_agenda").innerHTML = "'):
                html = ('"'.join(line.split('"')[3:])).replace('\\"', '"').replace('\\n', "\n").replace('\\/', '/')
                soup = BeautifulSoup(html, "html.parser")
                links = soup.select('a.widgit_result')
                for l in links:
                    self.add_event_url(l["data-w-href"])
                break

    def add_event_from_content(
        self,
        event_content,
        event_url,
        url_human=None,
        default_values=None,
        published=False,
    ):
        # check for the hash tag identifying the event
        ref = None
        for line in event_content.split("\n"):
            if line.strip().startswith("window.location.hash"):
                ref = line.split('"')[1]
                break

        # check for content
        for line in event_content.split("\n"):
            if line.startswith('detailsWrapper.innerHTML ='):
                html = ('"'.join(line.split('"')[1:])).replace('\\"', '"').replace('\\n', "\n").replace('\\/', '/')

                soup = BeautifulSoup(html, "html.parser")
                title = soup.select_one('h2.widgit_title').text.strip()
                image = soup.select_one('img')
                image_alt = image["alt"]
                image = image["src"]
                description = soup.select('div.desc')
                description = '\n'.join([d.text for d in description])
                openings = soup.select_one('.openings .mts').text.strip().split("\n")[0]
                start_time = None
                end_time = None
                if "tous les" in openings:
                    start_day = None
                else:
                    start_day = Extractor.parse_french_date(openings)
                    details = openings.split("de")
                    if len(details) > 1:
                        hours = details[1].split("à")
                        start_time = Extractor.parse_french_time(hours[0])
                        if len(hours) > 1:
                            end_time = Extractor.parse_french_time(hours[1])

                # collect the paragraphs between the "Adresse" heading
                # and the next heading
                contact = soup.select_one(".contact")
                sa = False
                location = []
                for c in contact.children:
                    if c.name == 'h2' and c.text.strip() == "Adresse":
                        sa = True
                    else:
                        if c.name == 'h2' and sa:
                            break
                        if c.name == 'p' and sa:
                            e = c.text.strip()
                            if e != "":
                                location.append(e)

                location = ', '.join(location)

                # websites are extracted but not yet used
                websites = soup.select("a.website")
                event_url = url_human + "#" + ref

                self.add_event_with_props(
                    default_values,
                    event_url,
                    title,
                    None,
                    start_day,
                    location,
                    description,
                    [],
                    recurrences=None,
                    uuids=[event_url],
                    url_human=event_url,
                    start_time=start_time,
                    end_day=start_day,
                    end_time=end_time,
                    published=published,
                    image=image,
                    image_alt=image_alt
                )
                return
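The recovery trick in build_event_url_list can be checked in isolation. A minimal sketch on a hypothetical one-line excerpt of the widget JavaScript; the selectors match the code above, but the payload itself is invented:

```python
from bs4 import BeautifulSoup

# Hypothetical one-line excerpt of the widget JavaScript; a real payload
# comes from https://widgets.apidae-tourisme.com/filter.js?widget[id]=48.
line = ('wrapper.querySelector(".results_agenda").innerHTML = '
        '"<a class=\\"widgit_result\\" data-w-href=\\"event-1\\">'
        'Sortie nature<\\/a>";')

# Same recovery steps as in build_event_url_list: keep everything after
# the third quote, then undo the JavaScript string escapes.
html = '"'.join(line.split('"')[3:]).replace('\\"', '"').replace('\\n', "\n").replace('\\/', '/')

soup = BeautifulSoup(html, "html.parser")
print([a["data-w-href"] for a in soup.select("a.widgit_result")])
# expected: ['event-1']
```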
@ -2010,6 +2010,7 @@ class RecurrentImport(models.Model):
        ARACHNEE = "arachnee", _("Arachnée concert")
        LERIO = "rio", _('Le Rio')
        LARAYMONDE = "raymonde", _('La Raymonde')
        APIDAE = 'apidae', _('Agenda apidae tourisme')

    class DOWNLOADER(models.TextChoices):
        SIMPLE = "simple", _("simple")
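With the new choice in place, a recurrent import for this source can presumably be registered through the ORM. A minimal sketch, assuming the import path and a hypothetical `source` field; only `processor` and `downloader` appear in this diff:

```python
from agenda_culturel.models import RecurrentImport

# Hypothetical registration of the Apidae widget as a recurrent import;
# the `source` field name is an assumption, only the `processor` and
# `downloader` choices are shown in this diff.
rimport = RecurrentImport.objects.create(
    processor=RecurrentImport.PROCESSOR.APIDAE,
    downloader=RecurrentImport.DOWNLOADER.SIMPLE,
    source="https://widgets.apidae-tourisme.com/filter.js?widget[id]=48",
)
```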