parent 55a0094e2f
commit 2fe2611788

experimentations/get_milleformes_events.py (new executable file, 44 lines)
@@ -0,0 +1,44 @@
#!/usr/bin/python3
# coding: utf-8

import os
import json
import sys

# getting the name of the directory
# where this file is present.
current = os.path.dirname(os.path.realpath(__file__))

# Getting the parent directory name
# where the current directory is present.
parent = os.path.dirname(current)

# adding the parent directory to
# the sys.path.
sys.path.append(parent)
sys.path.append(parent + "/src")

from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *





if __name__ == "__main__":

    u2e = URL2Events(SimpleDownloader(), mille_formes.CExtractor())
    url = "https://www.milleformes.fr/programme"
    url_human = "https://www.milleformes.fr/programme"

    try:
        events = u2e.process(url, url_human, cache = "cache-1000formes.html", default_values = {}, published = True)

        exportfile = "events-1000formes.json"
        print("Saving events to file {}".format(exportfile))
        with open(exportfile, "w") as f:
            json.dump(events, f, indent=4, default=str)
    except Exception as e:
        print("Exception: " + str(e))
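Note: as a quick check of what the script above writes, the export can be reloaded with the standard library. A minimal sketch, assuming the events returned by u2e.process are JSON-serialisable (their exact structure is not fixed by this diff):

import json

# Reload the export produced by get_milleformes_events.py and print a short summary.
with open("events-1000formes.json") as f:
    events = json.load(f)

print("Loaded an export of type {}".format(type(events).__name__))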
@@ -156,6 +156,8 @@ def run_recurrent_import_internal(rimport, downloader, req_id):
         extractor = apidae_tourisme.CExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.IGUANA:
         extractor = iguana_agenda.CExtractor()
+    elif rimport.processor == RecurrentImport.PROCESSOR.MILLEFORMES:
+        extractor = mille_formes.CExtractor()
     else:
         extractor = None
 
@@ -0,0 +1,193 @@
from ..generic_extractors import *
from bs4 import BeautifulSoup
from datetime import datetime, date

# A class dedicated to getting events from Mille formes
# URL: https://www.milleformes.fr/programme
class CExtractor(TwoStepsExtractorNoPause):

    def extract(
        self,
        content,
        url,
        url_human=None,
        default_values=None,
        published=False,
        only_future=True,
        ignore_404=True):
        self.root_address = "https://" + urlparse(url).netloc + "/"
        self.today = date.today()
        return super().extract(content, url, url_human, default_values, published, only_future, ignore_404)


    def parse_category(self, cat):
        cat = cat.replace("\n", "").strip()
        if "exposition" in cat or "dispositif artistique interactif" in cat:
            result = 'Visites & Expositions'
        elif "atelier" in cat:
            result = 'Animations & Ateliers'
        elif cat in ["buffet"]:
            result = 'Rendez-vous locaux'
        elif "ciné" in cat:
            result = 'Cinéma'
        elif "concert" in cat:
            result = 'Fêtes & Concerts'
        elif "rencontre" in cat:
            result = 'Rencontres & Débats'
        elif "spectacle" in cat:
            result = 'Spectacles'
        else:
            result = 'Sans catégorie'

        return result

    # this method is not perfect, but dates and hours are not structured
    def parse_dates(self, date):
        dl = date.replace(' à ', '\n').split('\n')
        result = []

        for d in dl:
            # only lines with a digit
            if sum(c.isdigit() for c in d) != 0:
                # split subparts
                for d2 in d.replace(' et ', ', ').split(', '):
                    d2 = d2.strip()
                    dd = Extractor.parse_french_date(d2, default_year_by_proximity=self.today)
                    if dd is None:
                        hh = Extractor.parse_french_time(d2)
                        for i, r in enumerate(result):
                            result[i][1].append(hh)
                    else:
                        result.append([dd, []])

        if "De" in date and " à " in date:
            for i, r in enumerate(result):
                result[i].append(True)

        return result

    def build_event_url_list(self, content, infuture_days=180):

        soup = BeautifulSoup(content, "html.parser")
        links = soup.select('.cell a.evenement')
        for l in links:
            self.add_event_url(self.root_address + l["href"])


    def add_event_from_content(
        self,
        event_content,
        event_url,
        url_human=None,
        default_values=None,
        published=False,
    ):
        soup = BeautifulSoup(event_content, "html.parser")
        title = soup.select_one('h1').text.replace("\n", "").strip().title()

        image = soup.select_one('.slide img')
        if image is None:
            image_alt = ''
        else:
            image_alt = image["alt"]
            image = self.root_address + image["src"]

        soustitre = soup.select_one('.sous-titre')
        if not soustitre is None:
            soustitre = soustitre.text.strip()

        description = soup.select_one('.texte-full').text.strip()
        infos = soup.select_one('.champ .infos')
        if not infos is None:
            infos = infos.text

        location = soup.select_one('.champ .taxo.espace').text.strip()

        age = soup.select_one('.champ.taxo-age').text
        category = self.parse_category(soup.select_one('.champ.categorie').text)


        date = soup.select_one('.champ.date-libre').text

        description = '\n\n'.join([x for x in [soustitre, description, date, infos] if not x is None])

        if " au " in date or date.startswith("Du") or date.lower().strip() == "en continu" or date.startswith("Les"):
            return

        dates = self.parse_dates(date)
        end_day = None

        for d in dates:
            if len(d) >= 2:
                start_day = d[0]

                if len(d) == 3 and len(d[1]) == 2:
                    start_time = d[1][0]
                    end_time = d[1][1]
                    uuid = event_url + "?date=" + str(start_day) + "&hour=" + str(start_time)
                    self.add_event_with_props(
                        default_values,
                        event_url,
                        title,
                        category,
                        start_day,
                        location,
                        description,
                        [],
                        recurrences=None,
                        uuids=[uuid],
                        url_human=event_url,
                        start_time=start_time,
                        end_day=start_day,
                        end_time=end_time,
                        published=published,
                        image=image,
                        image_alt=image_alt
                    )
                else:
                    end_time = None
                    if len(d[1]) == 0:
                        start_time = None
                        uuid = event_url + "?date=" + str(start_day)
                        self.add_event_with_props(
                            default_values,
                            event_url,
                            title,
                            category,
                            start_day,
                            location,
                            description,
                            [],
                            recurrences=None,
                            uuids=[uuid],
                            url_human=event_url,
                            start_time=start_time,
                            end_day=start_day,
                            end_time=end_time,
                            published=published,
                            image=image,
                            image_alt=image_alt
                        )
                    for t in d[1]:
                        start_time = t
                        uuid = event_url + "?date=" + str(start_day) + "&hour=" + str(start_time)
                        self.add_event_with_props(
                            default_values,
                            event_url,
                            title,
                            category,
                            start_day,
                            location,
                            description,
                            [],
                            recurrences=None,
                            uuids=[uuid],
                            url_human=event_url,
                            start_time=start_time,
                            end_day=start_day,
                            end_time=end_time,
                            published=published,
                            image=image,
                            image_alt=image_alt
                        )
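Note: parse_dates returns a list of [day, [times]] entries, with an optional trailing True when the text gives a start and an end hour. A hedged illustration of the expected shapes, assuming parse_french_date yields datetime.date values (year resolved by proximity to a reference day in early 2025) and parse_french_time yields datetime.time values; neither helper is fully shown in this diff:

from datetime import date, time

# "Mercredi 5 mars à 10h30 et 15h" -> one day with two independent start times;
# add_event_from_content then creates one event per time, with no end time.
expected_simple = [[date(2025, 3, 5), [time(10, 30), time(15, 0)]]]

# "Samedi 8 mars\nDe 10h à 12h" -> the trailing True marks the two times as a
# start/end pair, so a single event from 10:00 to 12:00 is created.
expected_range = [[date(2025, 3, 8), [time(10, 0), time(12, 0)], True]]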
@@ -54,7 +54,7 @@ class Extractor(ABC):
                 return i + 1
         return None
 
-    def parse_french_date(text, default_year=None):
+    def parse_french_date(text, default_year=None, default_year_by_proximity=None):
         # format NomJour Numero Mois Année
         m = re.search(
             "[a-zA-ZéÉûÛ:.]+[ ]*([0-9]+)[er]*[ ]*([a-zA-ZéÉûÛ:.]+)[ ]*([0-9]+)", text
@@ -92,13 +92,25 @@ class Extractor(ABC):
             return None
         try:
             day = int(day)
-            year = int(year)
+            if not year is None:
+                year = int(year)
         except:
             return None
-        if year < 100:
-            year = 2000 + year
         if day >= 32:
             return None
+
+        # by proximity
+        if year is None and not default_year_by_proximity is None:
+            dates = [date(default_year_by_proximity.year + x, month, day) for x in [-1, 0, 1]]
+            dates = [(abs((d - default_year_by_proximity).days), d) for d in dates]
+            d = min(dates, key=lambda x: x[0])
+            return d[1]
+
+        if year is None:
+            return None
+
+        if year < 100:
+            year = 2000 + year
         return date(year, month, day)
 
     def parse_french_time(text):
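Note: the new default_year_by_proximity branch resolves a missing year by building the date in the previous, current and next year of the reference day and keeping the closest candidate. A minimal standalone sketch of that selection rule (the helper name closest_year is illustrative, not part of the codebase):

from datetime import date

def closest_year(day, month, reference):
    # Same rule as the new branch: try the reference year and its two
    # neighbours, keep the candidate with the smallest distance in days.
    candidates = [date(reference.year + x, month, day) for x in (-1, 0, 1)]
    return min(candidates, key=lambda d: abs((d - reference).days))

reference = date(2025, 2, 2)
print(closest_year(5, 3, reference))    # 2025-03-05: "5 mars" is 31 days ahead
print(closest_year(20, 12, reference))  # 2024-12-20: 44 days back beats 321 days ahead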
@@ -0,0 +1,18 @@
# Generated by Django 4.2.9 on 2025-02-02 14:18

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('agenda_culturel', '0141_alter_recurrentimport_processor'),
    ]

    operations = [
        migrations.AlterField(
            model_name='recurrentimport',
            name='processor',
            field=models.CharField(choices=[('ical', 'ical'), ('icalnobusy', 'ical no busy'), ('icalnovc', 'ical no VC'), ('lacoope', 'lacoope.org'), ('lacomedie', 'la comédie'), ('lefotomat', 'le fotomat'), ('lapucealoreille', "la puce à l'oreille"), ('Plugin wordpress MEC', 'Plugin wordpress MEC'), ('Facebook events', "Événements d'une page FB"), ('Billetterie CF', 'Billetterie Clermont-Ferrand'), ('arachnee', 'Arachnée concert'), ('rio', 'Le Rio'), ('raymonde', 'La Raymonde'), ('apidae', 'Agenda apidae tourisme'), ('iguana', 'Agenda iguana (médiathèques)'), ('Mille formes', 'Mille formes')], default='ical', max_length=20, verbose_name='Processor'),
        ),
    ]
@@ -2104,6 +2104,7 @@ class RecurrentImport(models.Model):
         LARAYMONDE = "raymonde", _('La Raymonde')
         APIDAE = 'apidae', _('Agenda apidae tourisme')
         IGUANA = 'iguana', _('Agenda iguana (médiathèques)')
+        MILLEFORMES = 'Mille formes', _('Mille formes')
 
     class DOWNLOADER(models.TextChoices):
         SIMPLE = "simple", _("simple")
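Note: the AlterField migration above is the direct consequence of adding MILLEFORMES to the PROCESSOR choices. A small hedged check of the new value (the import path agenda_culturel.models is assumed from the migration's app label):

from agenda_culturel.models import RecurrentImport

# The stored database value is the first element of the choice pair,
# and it fits within the field's max_length of 20 characters.
assert RecurrentImport.PROCESSOR.MILLEFORMES == "Mille formes"
assert len(RecurrentImport.PROCESSOR.MILLEFORMES) <= 20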