#!/usr/bin/python3
# coding: utf-8

"""Experimentation script: import nature events of the Puy-de-Dôme.

Downloads the Apidae-tourisme widget backing
https://ens.puy-de-dome.fr/agenda.html, extracts its events with the
``apidae_tourisme`` custom extractor and dumps them to a JSON file.
"""

import json
import os
import sys
import traceback

# Add the repository root (and its src/ subdirectory) to sys.path so the
# agenda_culturel import machinery can be loaded from a plain checkout,
# without installing the package.
current = os.path.dirname(os.path.realpath(__file__))
parent = os.path.dirname(current)
sys.path.append(parent)
sys.path.append(parent + "/src")

from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *


if __name__ == "__main__":

    u2e = URL2Events(SimpleDownloader(), apidae_tourisme.CExtractor())
    url = "https://widgets.apidae-tourisme.com/filter.js?widget[id]=48"
    url_human = "https://ens.puy-de-dome.fr/agenda.html"

    try:
        events = u2e.process(
            url,
            url_human,
            cache="cache-puydedome.html",
            default_values={},
            published=True,
        )

        exportfile = "events-puydedome.json"
        print("Saving events to file {}".format(exportfile))
        with open(exportfile, "w") as f:
            # default=str deliberately stringifies dates and other
            # non-JSON-native values for this throwaway export.
            json.dump(events, f, indent=4, default=str)
    except Exception as e:
        # Report the failure on stderr WITH its traceback: the previous
        # behaviour (printing only str(e) on stdout) hid where the
        # extractor actually failed.
        print("Exception: " + str(e), file=sys.stderr)
        traceback.print_exc()
# A class dedicated to getting events from apidae-tourisme widgets.
class CExtractor(TwoStepsExtractorNoPause):
    """Two-step extractor for the Apidae-tourisme javascript widget.

    The widget is javascript that injects HTML via ``innerHTML = "..."``
    assignments; both steps therefore locate the relevant assignment
    line, unescape the string it assigns (``\\"``, ``\\n``, ``\\/``) and
    parse the resulting HTML with BeautifulSoup.
    """

    def build_event_url_list(self, content, infuture_days=180):
        """Register one "url" (the widget's data-w-href token) per result.

        Finds the line that fills ``.results_agenda``, unescapes the HTML
        it assigns, and adds the ``data-w-href`` of every result link.
        """
        for line in content.split("\n"):
            if line.startswith('wrapper.querySelector(".results_agenda").innerHTML = "'):
                # Keep everything after the `= "` (3rd quote), then unescape.
                html = ('"'.join(line.split('"')[3:])).replace('\\"', '"').replace('\\n', "\n").replace('\\/', '/')
                soup = BeautifulSoup(html, "html.parser")
                for link in soup.select("a.widgit_result"):
                    self.add_event_url(link["data-w-href"])
                break

    def add_event_from_content(
        self,
        event_content,
        event_url,
        url_human=None,
        default_values=None,
        published=False,
    ):
        """Parse one event detail page (javascript) and register the event.

        Extracts title, image, description, date/hours and address from
        the HTML assigned to ``detailsWrapper.innerHTML``, then calls
        ``add_event_with_props``. Only the first matching assignment is
        processed.
        """
        # Fragment identifier of the event (set by the widget through
        # window.location.hash). Default to "" instead of raising a
        # NameError further down when the line is absent.
        ref = ""
        for line in event_content.split("\n"):
            if line.strip().startswith("window.location.hash"):
                ref = line.split('"')[1]
                break

        for line in event_content.split("\n"):
            if not line.startswith('detailsWrapper.innerHTML ='):
                continue
            html = ('"'.join(line.split('"')[1:])).replace('\\"', '"').replace('\\n', "\n").replace('\\/', '/')
            soup = BeautifulSoup(html, "html.parser")

            title_tag = soup.select_one('h2.widgit_title')
            title = title_tag.text.strip() if title_tag else None

            # Some events have no illustration: guard against a missing <img>
            # instead of crashing on the subscript.
            image_tag = soup.select_one('img')
            image = image_tag["src"] if image_tag else None
            image_alt = image_tag["alt"] if image_tag else None

            description = '\n'.join(d.text for d in soup.select('div.desc'))

            # First line of the opening hours, e.g.
            # "samedi 18 janvier de 14h à 16h".
            openings = soup.select_one('.openings .mts').text.strip().split("\n")[0]
            start_time = None
            end_time = None
            if "tous les" in openings:
                # Recurring events ("tous les ...") carry no single start day.
                start_day = None
            else:
                start_day = Extractor.parse_french_date(openings)
                # NOTE(review): splitting on the substring "de" is fragile if
                # it appears inside a word — confirm against real widget data.
                details = openings.split("de")
                if len(details) > 1:
                    hours = details[1].split("à")
                    start_time = Extractor.parse_french_time(hours[0])
                    if len(hours) > 1:
                        end_time = Extractor.parse_french_time(hours[1])

            # The address is the run of <p> elements between the
            # <h2>Adresse</h2> heading and the next <h2>.
            location = []
            contact = soup.select_one(".contact")
            if contact is not None:
                in_address = False
                for child in contact.children:
                    if child.name == 'h2':
                        if child.text.strip() == "Adresse":
                            in_address = True
                        elif in_address:
                            break
                    elif child.name == 'p' and in_address:
                        text = child.text.strip()
                        if text != "":
                            location.append(text)
            location = ', '.join(location)

            # Point humans at the agenda page anchored on this event.
            event_url = url_human + "#" + ref

            self.add_event_with_props(
                default_values,
                event_url,
                title,
                None,
                start_day,
                location,
                description,
                [],
                recurrences=None,
                uuids=[event_url],
                url_human=event_url,
                start_time=start_time,
                end_day=start_day,
                end_time=end_time,
                published=published,
                image=image,
                image_alt=image_alt,
            )
            return