Ajout (pas finalisé) de l'import Cour des 3 Coquins

This commit is contained in:
Jean-Marie Favreau 2024-09-04 11:42:31 +02:00
parent 0a5470e73d
commit 9bb3373f99
3 changed files with 152 additions and 2 deletions

View File

@ -0,0 +1,43 @@
#!/usr/bin/python3
# coding: utf-8
import os
import json
import sys
# Getting the name of the directory
# where this file is present.
current = os.path.dirname(os.path.realpath(__file__))
# Getting the parent directory name
# where the current directory is present.
parent = os.path.dirname(current)
# adding the parent directory to
# the sys.path.
sys.path.append(parent)
from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *
if __name__ == "__main__":
    # The ticketing home page is passed twice to the pipeline: once as the
    # page to process and once as its human-facing counterpart.
    site_url = "https://billetterie-c3c.clermont-ferrand.fr/"
    output_path = "events-c3c.json"

    # Pair the plain downloader with the Cour des 3 Coquins extractor.
    pipeline = URL2Events(SimpleDownloader(), c3c.CExtractor())

    try:
        extracted = pipeline.process(
            site_url,
            site_url,
            cache="cache-c3c.html",
            default_values={"location": "La Cour des 3 Coquins"},
            published=True,
        )
        print("Saving events to file {}".format(output_path))
        # default=str stringifies any value json cannot serialize natively.
        with open(output_path, "w") as out:
            json.dump(extracted, out, indent=4, default=str)
    except Exception as err:
        # Top-level boundary of an experimentation script: report the
        # failure on stdout instead of letting the traceback escape.
        print("Exception: " + str(err))

View File

@ -0,0 +1,100 @@
from ..generic_extractors import *
from bs4 import BeautifulSoup
# A class dedicated to get events from La Cour des 3 Coquins
# URL: https://billetterie-c3c.clermont-ferrand.fr//
class CExtractor(TwoStepsExtractor):
    """Extractor for the ticketing site of La Cour des 3 Coquins.

    First step (``build_event_url_list``): collect the detail-page URL of
    every event card found on the listing page.
    Second step (``add_event_from_content``): read one detail page.

    NOTE(review): work in progress -- the dates of an event are not parsed
    yet, so the second step only dumps what it found and registers nothing.
    """

    # Venue name used when the detail page gives no explicit location.
    nom_lieu = "La Cour des 3 Coquins"

    def category_c3c2agenda(self, category):
        """Translate a category label of the site into the agenda's label.

        Returns None when the label is empty or has no known equivalent.
        """
        if not category:
            return None
        mapping = {"Théâtre": "Théâtre", "Concert": "Concert", "Projection": "Cinéma"}
        return mapping.get(category)

    def build_event_url_list(self, content):
        """Register the detail-page URL of each event card in ``content``.

        ``content`` is the HTML of the listing page. Cards without a
        "more info" link, or whose link has an empty href, are skipped.
        """
        soup = BeautifulSoup(content, "html.parser")
        for card in soup.select("div.fiche-info"):
            link = card.select_one("a.btn.lien_savoir_plus")
            # select_one returns None when the card has no such link;
            # subscripting it would abort the whole listing.
            href = link.get("href") if link is not None else None
            if not href:
                continue
            self.add_event_url(self.url + "/" + href)

    def add_event_from_content(
        self,
        event_content,
        event_url,
        url_human=None,
        default_values=None,
        published=False,
    ):
        """Read one event detail page.

        Parameters:
            event_content: HTML of the detail page.
            event_url: URL of that page; also used as the event's uuid and
                as its human-facing URL (the ``url_human`` argument is
                overridden by it).
            url_human: ignored, see ``event_url``.
            default_values: not used by this extractor.
            published: forwarded as-is when the event is registered.

        Returns None. As long as no start day is available (date parsing
        is still to be written) the method prints a debug summary and
        registers no event.
        """
        soup = BeautifulSoup(event_content, "html.parser")

        title = soup.select_one("h1")
        if title is not None:
            title = title.text

        image = soup.select_one("#media .swiper-slide img")
        # .get avoids a KeyError on an <img> that carries no src attribute.
        image = image.get("src") if image is not None else None

        # NOTE(review): this is the parsed node, not its text -- confirm
        # what add_event_with_props expects before enabling registration.
        description = soup.select_one(".presentation")

        duree = soup.select_one("#criteres .DUREE-V .valeur-critere li")
        if duree is not None:
            duree = self.parse_french_time(duree.text)

        location = self.nom_lieu
        tags = []
        for t in soup.select(".sous-titre span"):
            classes = t.get("class")
            if not classes:
                continue
            # Only the first CSS class tells what the span describes.
            if classes[0].startswith("LIEU-"):
                location = t.text
            elif classes[0].startswith("THEMATIQUE-"):
                tag = self.category_c3c2agenda(t.text)
                if tag is not None:
                    tags.append(tag)

        # TODO: parse the dates and retrieve the times. Until then these
        # are placeholders so that the code below never reads an undefined
        # name.
        start_day = start_time = end_day = end_time = None

        # Debug summary of what was extracted from the page.
        print("EVENT ", event_url)
        print("- ", title)
        print("- ", image)
        print("- ", len(description) if description is not None else None)
        print("- ", duree)
        print("- ", location)
        print("- ", tags)
        print("- ", (start_day, start_time, end_day, end_time))

        if start_day is None:
            # Without a start day there is nothing meaningful to register.
            return

        url_human = event_url
        self.add_event_with_props(
            event_url,
            None,
            None,
            start_day,
            location,
            description,
            tags,
            recurrences=None,
            uuids=[event_url],
            url_human=url_human,
            start_time=start_time,
            end_day=end_day,
            end_time=end_time,
            published=published,
            image=image,
        )

View File

@ -97,13 +97,20 @@ class Extractor(ABC):
s = "0"
else:
# format heures
m = re.search("([0-9]+)[ Hh:.]", text)
m = re.search("([0-9]+) [Hh:.]", text)
if m:
h = m.group(1)
m = "0"
s = "0"
else:
return None
# format minutes
m = re.search("([0-9]+)[ ]*(?:mn|min|Min|Mn)", text)
if m:
h = "0"
m = m.group(1)
s = "0"
else:
return None
try:
h = int(h)