Import of nature events from Puy-de-Dôme
This commit is contained in:
parent 20040268e7
commit 280f04d22f
experimentations/get_puydedome.py (new executable file, 44 additions)
@@ -0,0 +1,44 @@
#!/usr/bin/python3
# coding: utf-8

import os
import json
import sys

# getting the name of the directory
# where this file is present.
current = os.path.dirname(os.path.realpath(__file__))

# getting the parent directory name
# where the current directory is present.
parent = os.path.dirname(current)

# adding the parent directory to
# the sys.path.
sys.path.append(parent)
sys.path.append(parent + "/src")

from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *


if __name__ == "__main__":

    u2e = URL2Events(SimpleDownloader(), apidae_tourisme.CExtractor())
    url = "https://widgets.apidae-tourisme.com/filter.js?widget[id]=48"
    url_human = "https://ens.puy-de-dome.fr/agenda.html"

    try:
        events = u2e.process(url, url_human, cache="cache-puydedome.html", default_values={}, published=True)

        exportfile = "events-puydedome.json"
        print("Saving events to file {}".format(exportfile))
        with open(exportfile, "w") as f:
            json.dump(events, f, indent=4, default=str)
    except Exception as e:
        print("Exception: " + str(e))
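Run stand-alone, the script above writes events-puydedome.json via json.dump. A minimal sketch of inspecting that export afterwards; the structure of events depends on URL2Events.process, so this only previews whatever was serialized:

# Minimal check of the export produced by the script above.
# The shape of `events` is whatever URL2Events.process returned.
import json

with open("events-puydedome.json") as f:
    events = json.load(f)

print(type(events))
print(json.dumps(events, indent=2, default=str)[:500])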
@@ -152,6 +152,8 @@ def run_recurrent_import_internal(rimport, downloader, req_id):
        extractor = lerio.CExtractor()
    elif rimport.processor == RecurrentImport.PROCESSOR.LARAYMONDE:
        extractor = laraymonde.CExtractor()
    elif rimport.processor == RecurrentImport.PROCESSOR.APIDAE:
        extractor = apidae_tourisme.CExtractor()
    else:
        extractor = None

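The dispatch above grows by one elif per processor. A sketch of an equivalent mapping-based dispatch, assuming the same module and enum names as in the hunk; this is an alternative design, not what the diff does:

# Hypothetical lookup table keyed by RecurrentImport.PROCESSOR values
# (names assumed from the diff above).
EXTRACTORS = {
    RecurrentImport.PROCESSOR.LERIO: lerio.CExtractor,
    RecurrentImport.PROCESSOR.LARAYMONDE: laraymonde.CExtractor,
    RecurrentImport.PROCESSOR.APIDAE: apidae_tourisme.CExtractor,
}

def get_extractor(processor):
    cls = EXTRACTORS.get(processor)
    return cls() if cls is not None else None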
@@ -0,0 +1,103 @@
from ..generic_extractors import *
from bs4 import BeautifulSoup
from datetime import datetime


# A class dedicated to getting events from Apidae Tourisme widgets
class CExtractor(TwoStepsExtractorNoPause):

    def build_event_url_list(self, content, infuture_days=180):
        # Find the line that assigns the results HTML:
        #   wrapper.querySelector(".results_agenda").innerHTML = "..."
        # keep everything after the third double quote, undo the
        # JavaScript escaping, and parse the contained html
        for line in content.split("\n"):
            if line.startswith('wrapper.querySelector(".results_agenda").innerHTML = "'):
                html = ('"'.join(line.split('"')[3:])).replace('\\"', '"').replace('\\n', "\n").replace('\\/', '/')
                soup = BeautifulSoup(html, "html.parser")
                links = soup.select('a.widgit_result')
                for l in links:
                    self.add_event_url(l["data-w-href"])
                break
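The widget response is a JavaScript file that assigns escaped HTML to innerHTML. A standalone sketch of that extraction step, using a made-up sample line in the same format:

# Hypothetical sample of the widget's JS payload (format assumed from the code above).
line = 'wrapper.querySelector(".results_agenda").innerHTML = "<a class=\\"widgit_result\\" data-w-href=\\"event-1\\">Sortie nature<\\/a>";'

# Keep everything after the third double quote, then undo the JS escaping.
html = '"'.join(line.split('"')[3:]).replace('\\"', '"').replace('\\n', "\n").replace('\\/', '/')

from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "html.parser")
print([a["data-w-href"] for a in soup.select("a.widgit_result")])  # ['event-1']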

    def add_event_from_content(
        self,
        event_content,
        event_url,
        url_human=None,
        default_values=None,
        published=False,
    ):
        # check for the hash tag identifying the event
        ref = None  # default if no hash line is found
        for line in event_content.split("\n"):
            if line.strip().startswith("window.location.hash"):
                ref = line.split('"')[1]
                break

        # check for content
        for line in event_content.split("\n"):
            if line.startswith('detailsWrapper.innerHTML ='):
                html = ('"'.join(line.split('"')[1:])).replace('\\"', '"').replace('\\n', "\n").replace('\\/', '/')

                soup = BeautifulSoup(html, "html.parser")
                title = soup.select_one('h2.widgit_title').text.strip()
                image = soup.select_one('img')
                image_alt = image["alt"]
                image = image["src"]
                description = soup.select('div.desc')
                description = '\n'.join([d.text for d in description])
                openings = soup.select_one('.openings .mts').text.strip().split("\n")[0]
                start_time = None
                end_time = None
                if "tous les" in openings:
                    start_day = None
                else:
                    start_day = Extractor.parse_french_date(openings)
                    details = openings.split("de")
                    if len(details) > 1:
                        hours = details[1].split("à")
                        start_time = Extractor.parse_french_time(hours[0])
                        if len(hours) > 1:
                            end_time = Extractor.parse_french_time(hours[1])
                # walk the contact block: collect the <p> lines that follow
                # the "Adresse" heading, and stop at the next heading
                contact = soup.select_one(".contact")
                sa = False
                location = []
                for c in contact.children:
                    if c.name == 'h2' and c.text.strip() == "Adresse":
                        sa = True
                    else:
                        if c.name == 'h2' and sa:
                            break
                        if c.name == 'p' and sa:
                            e = c.text.strip()
                            if e != "":
                                location.append(e)

                location = ', '.join(location)
                websites = soup.select("a.website")
                event_url = url_human + "#" + ref

                self.add_event_with_props(
                    default_values,
                    event_url,
                    title,
                    None,
                    start_day,
                    location,
                    description,
                    [],
                    recurrences=None,
                    uuids=[event_url],
                    url_human=event_url,
                    start_time=start_time,
                    end_day=start_day,
                    end_time=end_time,
                    published=published,
                    image=image,
                    image_alt=image_alt
                )
                return
@@ -2010,6 +2010,7 @@ class RecurrentImport(models.Model):
        ARACHNEE = "arachnee", _("Arachnée concert")
        LERIO = "rio", _('Le Rio')
        LARAYMONDE = "raymonde", _('La Raymonde')
        APIDAE = 'apidae', _('Agenda apidae tourisme')

    class DOWNLOADER(models.TextChoices):
        SIMPLE = "simple", _("simple")
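The new APIDAE entry is a Django TextChoices member, so the stored value and the human-readable label come apart as usual. A quick sketch of the standard semantics, with the label left untranslated here (the real code wraps it in _()):

# Standard Django TextChoices behavior, not part of this diff.
from django.db import models

class PROCESSOR(models.TextChoices):
    APIDAE = 'apidae', 'Agenda apidae tourisme'

print(PROCESSOR.APIDAE.value)        # 'apidae' (what is stored in the database)
print(PROCESSOR.APIDAE.label)        # 'Agenda apidae tourisme' (shown in the admin)
print(PROCESSOR.APIDAE == 'apidae')  # True, which enables the elif comparison above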