Importation des événements nature du Puy-de-Dôme

This commit is contained in:
Jean-Marie Favreau 2025-01-18 14:58:06 +01:00
parent 20040268e7
commit 280f04d22f
4 changed files with 150 additions and 0 deletions

View File

@ -0,0 +1,44 @@
#!/usr/bin/python3
# coding: utf-8

"""Fetch nature events from the Puy-de-Dôme apidae-tourisme widget and
dump them to a local JSON file (events-puydedome.json)."""

import os
import json
import sys

# Make the project root and its src/ directory importable when this
# script is executed directly from its own directory.
current = os.path.dirname(os.path.realpath(__file__))
parent = os.path.dirname(current)
sys.path.append(parent)
sys.path.append(parent + "/src")

from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *

if __name__ == "__main__":
    u2e = URL2Events(SimpleDownloader(), apidae_tourisme.CExtractor())
    # Widget endpoint that serves the event list as JavaScript,
    # and the human-readable agenda page it corresponds to.
    url = "https://widgets.apidae-tourisme.com/filter.js?widget[id]=48"
    url_human = "https://ens.puy-de-dome.fr/agenda.html"

    try:
        events = u2e.process(url, url_human, cache = "cache-puydedome.html", default_values = {}, published = True)

        exportfile = "events-puydedome.json"
        print("Saving events to file {}".format(exportfile))
        with open(exportfile, "w") as f:
            # default=str: event objects may contain dates/project types;
            # stringify anything json cannot serialize natively.
            json.dump(events, f, indent=4, default=str)
    except Exception as e:
        # Report failures on stderr with a full traceback and a non-zero
        # exit code, instead of silently printing to stdout and exiting 0.
        import traceback

        print("Exception: " + str(e), file=sys.stderr)
        traceback.print_exc()
        sys.exit(1)

View File

@ -152,6 +152,8 @@ def run_recurrent_import_internal(rimport, downloader, req_id):
extractor = lerio.CExtractor()
elif rimport.processor == RecurrentImport.PROCESSOR.LARAYMONDE:
extractor = laraymonde.CExtractor()
elif rimport.processor == RecurrentImport.PROCESSOR.APIDAE:
extractor = apidae_tourisme.CExtractor()
else:
extractor = None

View File

@ -0,0 +1,103 @@
from ..generic_extractors import *
from bs4 import BeautifulSoup
from datetime import datetime
# A class dedicated to getting events from apidae-tourisme widgets.
# The widget endpoint returns JavaScript that assigns escaped HTML
# fragments to DOM nodes; both methods below locate the relevant
# assignment line, unescape the HTML and parse it with BeautifulSoup.
class CExtractor(TwoStepsExtractorNoPause):

    def build_event_url_list(self, content, infuture_days=180):
        """Collect per-event URLs from the widget's list page.

        Finds the line starting with
        'wrapper.querySelector(".results_agenda").innerHTML = "',
        keeps everything after the assignment, strips the JS string
        escapes, then registers each result link's data-w-href.
        """
        for line in content.split("\n"):
            if line.startswith('wrapper.querySelector(".results_agenda").innerHTML = "'):
                # Drop the JS prefix (everything up to the 3rd quote),
                # then unescape \" \n \/ to recover plain HTML.
                html = ('"'.join(line.split('"')[3:])).replace('\\"', '"').replace('\\n', "\n").replace('\\/', '/')
                soup = BeautifulSoup(html, "html.parser")
                for link in soup.select('a.widgit_result'):
                    self.add_event_url(link["data-w-href"])
                break

    def add_event_from_content(
        self,
        event_content,
        event_url,
        url_human=None,
        default_values=None,
        published=False,
    ):
        """Parse one event detail page (JS payload) and register the event.

        The hash fragment assigned to window.location.hash identifies the
        event on the human-readable agenda page; the detail HTML is taken
        from the detailsWrapper.innerHTML assignment.
        """
        # Fix: ref was previously used without initialization, raising a
        # NameError when no window.location.hash line is present.
        ref = None
        for line in event_content.split("\n"):
            if line.strip().startswith("window.location.hash"):
                ref = line.split('"')[1]
                break

        # Look for the detail content.
        for line in event_content.split("\n"):
            if line.startswith('detailsWrapper.innerHTML ='):
                html = ('"'.join(line.split('"')[1:])).replace('\\"', '"').replace('\\n', "\n").replace('\\/', '/')
                soup = BeautifulSoup(html, "html.parser")

                title = soup.select_one('h2.widgit_title').text.strip()

                # Fix: the image may be absent; guard before subscripting.
                # add_event_with_props presumably accepts image=None —
                # TODO confirm against the Extractor base class.
                img = soup.select_one('img')
                image = img["src"] if img is not None else None
                image_alt = img["alt"] if img is not None else None

                description = '\n'.join(d.text for d in soup.select('div.desc'))

                # First line of the openings block, e.g.
                # "mercredi 5 février 2025 de 14h à 16h".
                openings = soup.select_one('.openings .mts').text.strip().split("\n")[0]

                start_time = None
                end_time = None
                if "tous les" in openings:
                    # Recurring schedule ("tous les ...") — no single date.
                    start_day = None
                else:
                    start_day = Extractor.parse_french_date(openings)
                    details = openings.split("de")
                    if len(details) > 1:
                        hours = details[1].split("à")
                        start_time = Extractor.parse_french_time(hours[0])
                        if len(hours) > 1:
                            end_time = Extractor.parse_french_time(hours[1])

                # Gather the <p> lines that follow the "Adresse" heading
                # inside the contact block, up to the next heading.
                contact = soup.select_one(".contact")
                in_address = False
                location = []
                for c in contact.children:
                    if c.name == 'h2' and c.text.strip() == "Adresse":
                        in_address = True
                    else:
                        if c.name == 'h2' and in_address:
                            break
                        if c.name == 'p' and in_address:
                            e = c.text.strip()
                            if e != "":
                                location.append(e)
                location = ', '.join(location)

                # Fix: fall back to the bare human URL when no hash
                # fragment was found (ref stayed None).
                event_url = url_human + "#" + ref if ref is not None else url_human

                self.add_event_with_props(
                    default_values,
                    event_url,
                    title,
                    None,
                    start_day,
                    location,
                    description,
                    [],
                    recurrences=None,
                    uuids=[event_url],
                    url_human=event_url,
                    start_time=start_time,
                    end_day=start_day,
                    end_time=end_time,
                    published=published,
                    image=image,
                    image_alt=image_alt
                )

        return

View File

@ -2010,6 +2010,7 @@ class RecurrentImport(models.Model):
ARACHNEE = "arachnee", _("Arachnée concert")
LERIO = "rio", _('Le Rio')
LARAYMONDE = "raymonde", _('La Raymonde')
APIDAE = 'apidae', _('Agenda apidae tourisme')
class DOWNLOADER(models.TextChoices):
SIMPLE = "simple", _("simple")