Ajout Mille formes

Fix #274
This commit is contained in:
Jean-Marie Favreau 2025-02-02 14:21:03 +01:00
parent 55a0094e2f
commit 2fe2611788
6 changed files with 274 additions and 4 deletions

View File

@ -0,0 +1,44 @@
#!/usr/bin/python3
# coding: utf-8
import os
import json
import sys
# Make the project importable when this script is launched directly:
# `current` is the directory holding this file and `parent` is the directory
# above it. Both `parent` and its "src" sub-folder are appended to the module
# search path, in that order.
current, _ = os.path.split(os.path.realpath(__file__))
parent, _ = os.path.split(current)
sys.path.extend([parent, parent + "/src"])
from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *
if __name__ == "__main__":
    # Manual run: process the Mille formes programme page and dump the
    # resulting events to a JSON file in the working directory.
    programme_url = "https://www.milleformes.fr/programme"
    exportfile = "events-1000formes.json"

    u2e = URL2Events(SimpleDownloader(), mille_formes.CExtractor())

    try:
        # The same address is used both as source URL and as human-facing URL.
        events = u2e.process(
            programme_url,
            programme_url,
            cache="cache-1000formes.html",
            default_values={},
            published=True,
        )

        print("Saving events to file {}".format(exportfile))
        with open(exportfile, "w") as f:
            # default=str: anything json cannot encode natively is written
            # as its string representation.
            json.dump(events, f, indent=4, default=str)
    except Exception as e:
        # Best-effort script: report the failure instead of a traceback.
        print("Exception: " + str(e))

View File

@ -156,6 +156,8 @@ def run_recurrent_import_internal(rimport, downloader, req_id):
extractor = apidae_tourisme.CExtractor()
elif rimport.processor == RecurrentImport.PROCESSOR.IGUANA:
extractor = iguana_agenda.CExtractor()
elif rimport.processor == RecurrentImport.PROCESSOR.MILLEFORMES:
extractor = mille_formes.CExtractor()
else:
extractor = None

View File

@ -0,0 +1,193 @@
# NOTE: the star import stays first so that the explicit imports below win
# on any name clash.
from ..generic_extractors import *
from bs4 import BeautifulSoup
from datetime import datetime, date
from urllib.parse import urljoin
# A class dedicated to get events from Mille formes
# URL: https://www.milleformes.fr/programme
class CExtractor(TwoStepsExtractorNoPause):
    """Two-step extractor for the Mille formes programme.

    URL: https://www.milleformes.fr/programme

    Step one (``build_event_url_list``) collects the event links found on the
    programme page; step two (``add_event_from_content``) parses one event
    page and registers one event per parsed day/hour.
    """

    def extract(
        self,
        content,
        url,
        url_human=None,
        default_values=None,
        published=False,
        only_future=True,
        ignore_404=True,
    ):
        """Record the site root and today's date, then run the parent extraction.

        ``root_address`` is the base against which page links are resolved;
        ``today`` is the reference day used to guess the year of dates that
        are written without one. All arguments are forwarded unchanged to the
        parent class, whose return value is returned as is.
        """
        self.root_address = "https://" + urlparse(url).netloc + "/"
        self.today = date.today()
        return super().extract(
            content, url, url_human, default_values, published, only_future, ignore_404
        )

    def parse_category(self, cat):
        """Map the category label of an event page to an agenda category name.

        The label is matched case-insensitively by keyword; anything that
        matches no keyword yields 'Sans catégorie'.
        """
        cat = cat.replace("\n", "").strip().lower()

        if "exposition" in cat or "dispositif artistique interactif" in cat:
            result = 'Visites & Expositions'
        elif "atelier" in cat:
            result = 'Animations & Ateliers'
        elif cat == "buffet":
            result = 'Rendez-vous locaux'
        elif "ciné" in cat:
            result = 'Cinéma'
        elif "concert" in cat:
            result = 'Fêtes & Concerts'
        elif "rencontre" in cat:
            result = 'Rencontres & Débats'
        elif "spectacle" in cat:
            result = 'Spectacles'
        else:
            result = 'Sans catégorie'

        return result

    # this method is not perfect, but dates and hours are not structured
    def parse_dates(self, date):
        """Parse the free-text date field into a list of day entries.

        Each entry is a list ``[day, hours]``. When the text contains both
        "De" and " à ", a trailing ``True`` is appended to every entry; the
        caller reads that as "the two hours are a start and an end time".

        The text is cut on newlines and on " à "; pieces without any digit
        are ignored. Each remaining piece is split on ", " and " et ". A
        fragment that parses as a date opens a new entry; otherwise it is
        tried as a time, which is attached to every entry collected so far.
        Fragments that parse as neither are dropped.

        Relies on ``self.today`` (set by ``extract``) to choose the year of
        dates written without one.
        """
        result = []
        for line in date.replace(' à ', '\n').split('\n'):
            # only lines with a digit
            if not any(c.isdigit() for c in line):
                continue
            # split subparts
            for part in line.replace(' et ', ', ').split(', '):
                part = part.strip()
                day = Extractor.parse_french_date(
                    part, default_year_by_proximity=self.today
                )
                if day is not None:
                    result.append([day, []])
                    continue
                hour = Extractor.parse_french_time(part)
                if hour is None:
                    # Neither a date nor a time: do not store a None hour,
                    # it would later produce an event with an "&hour=None" uuid.
                    continue
                for entry in result:
                    entry[1].append(hour)

        if "De" in date and " à " in date:
            for entry in result:
                entry.append(True)

        return result

    def build_event_url_list(self, content, infuture_days=180):
        """Register the URL of every event linked from the programme page.

        Links are the ``a.evenement`` anchors found inside ``.cell`` elements.
        Each href is resolved against ``self.root_address``, so root-relative,
        relative and absolute hrefs all give a well-formed URL. Anchors
        without an href are skipped. ``infuture_days`` is not used here.
        """
        soup = BeautifulSoup(content, "html.parser")

        for link in soup.select('.cell a.evenement'):
            href = link.get("href")
            if href:
                self.add_event_url(urljoin(self.root_address, href))

    def add_event_from_content(
        self,
        event_content,
        event_url,
        url_human=None,
        default_values=None,
        published=False,
    ):
        """Parse one event page and register its occurrences.

        Nothing is registered when the date text describes a period or a
        recurring slot (contains " au ", starts with "Du" or "Les", or is
        "en continu"), because such texts cannot be turned into single days.

        Otherwise one event is registered per parsed day:
        - a day flagged as a range with exactly two hours becomes one event
          with a start and an end time;
        - a day without any hour becomes one event without time;
        - in all other cases one event is registered per hour, without end time.

        The title, description, location and date elements are required: a
        page lacking one of them raises ``AttributeError``. The subtitle,
        extra information, image and category are optional. ``event_url`` is
        always used as the human-facing URL; the ``url_human`` argument is
        not used.
        """
        soup = BeautifulSoup(event_content, "html.parser")
        title = soup.select_one('h1').text.replace("\n", "").strip().title()

        image = soup.select_one('.slide img')
        if image is None:
            image_alt = ''
        else:
            # alt is optional in HTML: fall back to an empty description.
            image_alt = image.get("alt", "")
            image = urljoin(self.root_address, image["src"])

        soustitre = soup.select_one('.sous-titre')
        if soustitre is not None:
            soustitre = soustitre.text.strip()

        description = soup.select_one('.texte-full').text.strip()
        infos = soup.select_one('.champ .infos')
        if infos is not None:
            infos = infos.text

        location = soup.select_one('.champ .taxo.espace').text.strip()

        # A page without category tag falls back to the default category.
        category_tag = soup.select_one('.champ.categorie')
        category = self.parse_category(
            category_tag.text if category_tag is not None else ""
        )

        date_text = soup.select_one('.champ.date-libre').text

        description = '\n\n'.join(
            x for x in [soustitre, description, date_text, infos] if x is not None
        )

        # Periods and recurring slots are not handled. The prefix tests run on
        # the stripped text so that leading whitespace coming from the markup
        # does not defeat them.
        schedule = date_text.strip()
        if (
            " au " in date_text
            or schedule.startswith(("Du", "Les"))
            or schedule.lower() == "en continu"
        ):
            return

        def register(start_day, start_time=None, end_time=None):
            # One occurrence = one event; the uuid is made unique per day and,
            # when known, per starting hour.
            uuid = event_url + "?date=" + str(start_day)
            if start_time is not None:
                uuid += "&hour=" + str(start_time)
            self.add_event_with_props(
                default_values,
                event_url,
                title,
                category,
                start_day,
                location,
                description,
                [],
                recurrences=None,
                uuids=[uuid],
                url_human=event_url,
                start_time=start_time,
                end_day=start_day,
                end_time=end_time,
                published=published,
                image=image,
                image_alt=image_alt,
            )

        for entry in self.parse_dates(date_text):
            if len(entry) < 2:
                continue
            start_day, hours = entry[0], entry[1]

            if len(entry) == 3 and len(hours) == 2:
                # "De ... à ...": the two hours are the start and the end.
                register(start_day, hours[0], hours[1])
            elif not hours:
                register(start_day)
            else:
                for hour in hours:
                    register(start_day, hour)

View File

@ -54,7 +54,7 @@ class Extractor(ABC):
return i + 1
return None
def parse_french_date(text, default_year=None):
def parse_french_date(text, default_year=None, default_year_by_proximity=None):
# format NomJour Numero Mois Année
m = re.search(
"[a-zA-ZéÉûÛ:.]+[ ]*([0-9]+)[er]*[ ]*([a-zA-ZéÉûÛ:.]+)[ ]*([0-9]+)", text
@ -92,13 +92,25 @@ class Extractor(ABC):
return None
try:
day = int(day)
year = int(year)
if not year is None:
year = int(year)
except:
return None
if year < 100:
year = 2000 + year
if day >= 32:
return None
# by proximity
if year is None and not default_year_by_proximity is None:
dates = [date(default_year_by_proximity.year + x, month, day) for x in [-1, 0, 1]]
dates = [(abs((d - default_year_by_proximity).days), d) for d in dates]
d = min(dates, key=lambda x: x[0])
return d[1]
if year is None:
return None
if year < 100:
year = 2000 + year
return date(year, month, day)
def parse_french_time(text):

View File

@ -0,0 +1,18 @@
# Generated by Django 4.2.9 on 2025-02-02 14:18
from django.db import migrations, models
class Migration(migrations.Migration):
    """Alter ``RecurrentImport.processor`` to use the choice list below.

    The last entry of the list is the 'Mille formes' processor.
    """

    dependencies = [
        ("agenda_culturel", "0141_alter_recurrentimport_processor"),
    ]

    operations = [
        migrations.AlterField(
            model_name="recurrentimport",
            name="processor",
            field=models.CharField(
                # (stored value, human-readable label)
                choices=[
                    ("ical", "ical"),
                    ("icalnobusy", "ical no busy"),
                    ("icalnovc", "ical no VC"),
                    ("lacoope", "lacoope.org"),
                    ("lacomedie", "la comédie"),
                    ("lefotomat", "le fotomat"),
                    ("lapucealoreille", "la puce à l'oreille"),
                    ("Plugin wordpress MEC", "Plugin wordpress MEC"),
                    ("Facebook events", "Événements d'une page FB"),
                    ("Billetterie CF", "Billetterie Clermont-Ferrand"),
                    ("arachnee", "Arachnée concert"),
                    ("rio", "Le Rio"),
                    ("raymonde", "La Raymonde"),
                    ("apidae", "Agenda apidae tourisme"),
                    ("iguana", "Agenda iguana (médiathèques)"),
                    ("Mille formes", "Mille formes"),
                ],
                default="ical",
                max_length=20,
                verbose_name="Processor",
            ),
        ),
    ]

View File

@ -2104,6 +2104,7 @@ class RecurrentImport(models.Model):
LARAYMONDE = "raymonde", _('La Raymonde')
APIDAE = 'apidae', _('Agenda apidae tourisme')
IGUANA = 'iguana', _('Agenda iguana (médiathèques)')
MILLEFORMES = 'Mille formes', _('Mille formes')
class DOWNLOADER(models.TextChoices):
SIMPLE = "simple", _("simple")