Ajout (pas finalisé) de l'import Cour des 3 Coquins
This commit is contained in:
parent
0a5470e73d
commit
9bb3373f99
43
experimentations/get_c3c_events.py
Executable file
43
experimentations/get_c3c_events.py
Executable file
@ -0,0 +1,43 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
# coding: utf-8
|
||||||
|
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Absolute, symlink-resolved location of this script.
_script_path = os.path.realpath(__file__)

# Directory that contains this file.
current = os.path.dirname(_script_path)

# Directory that contains the current directory.
parent = os.path.split(current)[0]

# Append the parent directory to sys.path so that the `src.` package
# imports that follow can be resolved when this file is run as a script.
sys.path.append(parent)
|
||||||
|
|
||||||
|
from src.agenda_culturel.import_tasks.downloader import *
|
||||||
|
from src.agenda_culturel.import_tasks.extractor import *
|
||||||
|
from src.agenda_culturel.import_tasks.importer import *
|
||||||
|
from src.agenda_culturel.import_tasks.custom_extractors import *
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Download the listing page of La Cour des 3 Coquins, run the dedicated
    # extractor on it and dump the resulting events to a JSON file.
    u2e = URL2Events(SimpleDownloader(), c3c.CExtractor())

    # The page to download and the page shown to humans are the same here.
    url = "https://billetterie-c3c.clermont-ferrand.fr/"
    url_human = "https://billetterie-c3c.clermont-ferrand.fr/"

    try:
        events = u2e.process(
            url,
            url_human,
            cache="cache-c3c.html",
            default_values={"location": "La Cour des 3 Coquins"},
            published=True,
        )

        exportfile = "events-c3c.json"
        print("Saving events to file {}".format(exportfile))
        with open(exportfile, "w") as out:
            # default=str stringifies any value json cannot serialise natively.
            json.dump(events, out, indent=4, default=str)
    except Exception as err:
        # Top-level boundary of an experimentation script: report and stop.
        print("Exception: " + str(err))
|
100
src/agenda_culturel/import_tasks/custom_extractors/c3c.py
Normal file
100
src/agenda_culturel/import_tasks/custom_extractors/c3c.py
Normal file
@ -0,0 +1,100 @@
|
|||||||
|
from ..generic_extractors import *
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
# A class dedicated to get events from La Cour des 3 Coquins
|
||||||
|
# URL: https://billetterie-c3c.clermont-ferrand.fr/
|
||||||
|
class CExtractor(TwoStepsExtractor):
    """Extract events from the ticketing site of La Cour des 3 Coquins.

    Work in progress: date and time parsing is not implemented yet, so
    ``add_event_from_content`` only prints what it scraped from an event
    page and returns without registering the event.
    """

    # Venue name used when the event page does not provide its own location.
    nom_lieu = "La Cour des 3 Coquins"

    def category_c3c2agenda(self, category):
        """Translate a category label from the site into an agenda tag.

        Returns the mapped tag, or None when ``category`` is empty or has
        no known equivalent.
        """
        if not category:
            return None
        mapping = {"Théâtre": "Théâtre", "Concert": "Concert", "Projection": "Cinéma"}
        return mapping.get(category)

    def build_event_url_list(self, content):
        """Register the detail-page URL of every event card in ``content``.

        ``content`` is the HTML of the listing page. Cards without a
        "more info" link, or whose link has no or an empty ``href``, are
        skipped instead of aborting the whole import.
        """
        soup = BeautifulSoup(content, "html.parser")

        for card in soup.select("div.fiche-info"):
            link = card.select_one("a.btn.lien_savoir_plus")
            # select_one returns None when nothing matches; indexing that
            # result directly would raise TypeError.
            href = link.get("href") if link is not None else None
            if not href:
                continue
            # NOTE(review): yields a double slash when self.url already ends
            # with "/" -- confirm the site accepts it.
            self.add_event_url(self.url + "/" + href)

    def add_event_from_content(
        self,
        event_content,
        event_url,
        url_human=None,
        default_values=None,
        published=False,
    ):
        """Scrape one event page and (eventually) register the event.

        Args:
            event_content: HTML of the event page.
            event_url: URL of the event page; also used as the event uuid
                and human URL in the not-yet-reachable registration call.
            url_human: currently ignored (overwritten by ``event_url`` in
                the registration part).
            default_values: not used by this method.
            published: forwarded to ``add_event_with_props`` once the
                registration part is enabled.

        Returns:
            None. For now the method prints the scraped fields and stops
            before registering anything, because dates are not parsed yet.
        """
        soup = BeautifulSoup(event_content, "html.parser")

        title = soup.select_one("h1")
        if title:
            title = title.text

        image_tag = soup.select_one("#media .swiper-slide img")
        # .get avoids a KeyError when the <img> has no src attribute.
        image = image_tag.get("src") if image_tag else None

        description = soup.select_one(".presentation")

        duree = soup.select_one("#criteres .DUREE-V .valeur-critere li")
        if duree is not None:
            duree = self.parse_french_time(duree.text)

        location = self.nom_lieu
        tags = []
        for span in soup.select(".sous-titre span"):
            classes = span.get("class")
            if not classes:
                continue
            # Only the first CSS class decides what the span describes.
            kind = classes[0]
            if kind.startswith("LIEU-"):
                location = span.text
            elif kind.startswith("THEMATIQUE-"):
                tag = self.category_c3c2agenda(span.text)
                if tag is not None:
                    tags.append(tag)

        # TODO: parse the dates, retrieve the times ()
        # Placeholder so the debug dump below does not raise NameError while
        # date parsing is still missing.
        dates = None

        print("EVENT ", event_url)
        print("- ", title)
        print("- ", image)
        # The description block may be absent from the page.
        print("- ", len(description) if description is not None else None)
        print("- ", duree)
        print("- ", location)
        print("- ", tags)
        print("- ", dates)

        # Stop here until date parsing exists: the registration below needs
        # start_day, start_time, end_day and end_time, which are not
        # computed yet.
        return

        url_human = event_url

        self.add_event_with_props(
            event_url,
            None,
            None,
            start_day,
            location,
            description,
            tags,
            recurrences=None,
            uuids=[event_url],
            url_human=url_human,
            start_time=start_time,
            end_day=end_day,
            end_time=end_time,
            published=published,
            image=image,
        )
|
@ -97,13 +97,20 @@ class Extractor(ABC):
|
|||||||
s = "0"
|
s = "0"
|
||||||
else:
|
else:
|
||||||
# format heures
|
# format heures
|
||||||
m = re.search("([0-9]+)[ Hh:.]", text)
|
m = re.search("([0-9]+) [Hh:.]", text)
|
||||||
if m:
|
if m:
|
||||||
h = m.group(1)
|
h = m.group(1)
|
||||||
m = "0"
|
m = "0"
|
||||||
s = "0"
|
s = "0"
|
||||||
else:
|
else:
|
||||||
return None
|
# format minutes
|
||||||
|
m = re.search("([0-9]+)[ ]*(?:mn|min|Min|Mn)", text)
|
||||||
|
if m:
|
||||||
|
h = "0"
|
||||||
|
m = m.group(1)
|
||||||
|
s = "0"
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
h = int(h)
|
h = int(h)
|
||||||
|
Loading…
Reference in New Issue
Block a user