Ajout de la puce à l'oreille

This commit is contained in:
Jean-Marie Favreau 2024-04-20 15:54:48 +02:00
parent 44b6458d73
commit 551c919a9f
7 changed files with 144 additions and 5 deletions

View File

@ -0,0 +1,43 @@
#!/usr/bin/python3
# coding: utf-8
"""Fetch events from La Puce à l'Oreille and export them to a JSON file.

Standalone helper script: downloads the programmation page, extracts the
events with LaPuceALOreilleExtractor and dumps the result to
``events-lapucealoreille.json``.
"""

import json
import os
import sys
import traceback

# Make the project root importable: this script lives one directory below it.
current = os.path.dirname(os.path.realpath(__file__))
parent = os.path.dirname(current)
sys.path.append(parent)

from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *


if __name__ == "__main__":
    u2e = URL2Events(SimpleDownloader(), LaPuceALOreilleExtractor())
    url = "https://www.lapucealoreille63.fr/programmation/"
    url_human = "https://www.lapucealoreille63.fr/programmation/"

    try:
        events = u2e.process(url, url_human, cache="cache-lapucealoreille.xml", default_values={}, published=True)

        exportfile = "events-lapucealoreille.json"
        print(f"Saving events to file {exportfile}")
        # default=str makes dates/UUIDs serializable (deliberate stringification)
        with open(exportfile, "w") as f:
            json.dump(events, f, indent=4, default=str)
    except Exception as e:
        # Report the failure on stderr with a traceback (instead of a bare
        # message on stdout) and exit non-zero so cron/CI notice the failure.
        print(f"Exception: {e}", file=sys.stderr)
        traceback.print_exc()
        sys.exit(1)

View File

@ -107,6 +107,8 @@ def run_recurrent_import(self, pk):
extractor = LaComedieExtractor() extractor = LaComedieExtractor()
elif rimport.processor == RecurrentImport.PROCESSOR.LEFOTOMAT: elif rimport.processor == RecurrentImport.PROCESSOR.LEFOTOMAT:
extractor = LeFotomatExtractor() extractor = LeFotomatExtractor()
elif rimport.processor == RecurrentImport.PROCESSOR.LAPUCEALOREILLE:
extractor = LaPuceALOreilleExtractor()
else: else:
extractor = None extractor = None

View File

@ -195,3 +195,72 @@ class LeFotomatExtractor(TwoStepsExtractor):
url_human = event_url url_human = event_url
self.add_event_with_props(event_url, None, None, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image) self.add_event_with_props(event_url, None, None, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
# A class dedicated to get events from La puce à l'oreille
# URL: https://www.lapucealoreille63.fr/
class LaPuceALOreilleExtractor(TwoStepsExtractor):

    # Default venue name, used when the event page does not give a "LIEU :" line.
    nom_lieu = "La Puce à l'Oreille"

    def build_event_url_list(self, content):
        """Collect per-event URLs (and their titles) from the listing page.

        ``content`` is the HTML of the programmation page. Each event card is
        a Wix mesh container; the first <a> inside it links to the event page.
        """
        soup = BeautifulSoup(content, "html.parser")

        events = soup.select("div.SPY_vo div[data-testid=mesh-container-content]")
        for e in events:
            e_url = e.find("a")
            if e_url:
                # add_event_url is falsy for URLs already registered,
                # so the title is only set once per event.
                if self.add_event_url(e_url["href"]):
                    title = e.select("div[data-testid=richTextElement] h1.font_0 span")
                    if title:
                        # Flatten newlines then collapse repeated spaces.
                        title = title[0].contents[0].get_text().replace("\n", " ")
                        title = re.sub(" +", " ", title)
                        self.add_event_title(e_url["href"], title)

    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
        """Parse one event page and register the event.

        Scans the rich-text spans for "DÉBUT", "HORAIRES :" and "LIEU :"
        markers to recover times and location; everything else (image,
        description) is pulled from the Wix page structure.
        """
        soup = BeautifulSoup(event_content, "html.parser")

        # The first <h2> holds the date; not perfect, but this site is badly structured.
        start_day = self.parse_french_date(soup.find("h2").get_text())

        spans = soup.select("div[data-testid=richTextElement] span")
        start_time = None
        end_time = None
        location = None

        for span in spans:
            txt = span.get_text()
            if txt.lstrip().startswith("DÉBUT"):
                # "DÉBUT : 20h30" — keep only the part after the last colon.
                start_time = self.parse_french_time(txt.split(":")[-1])
                end_time = None
            elif txt.lstrip().startswith("HORAIRES :"):
                # "HORAIRES : 20h30 - 23h00" — split the range on "-".
                hs = txt.split(":")[-1].split("-")
                start_time = self.parse_french_time(hs[0])
                if len(hs) > 1:
                    end_time = self.parse_french_time(hs[1])
                else:
                    end_time = None
            elif txt.lstrip().startswith("LIEU :") and not location:
                location = txt.split(":")[-1].lstrip()

        if not location:
            # Fall back to the venue itself.
            location = self.nom_lieu
        end_day = self.guess_end_day(start_day, start_time, end_time)

        url_human = event_url
        tags = []

        # The main visual is the eagerly-loaded wow-image.
        image = soup.select("wow-image img[fetchpriority=high]")
        if image:
            image = image[0]["src"]
        else:
            image = None

        # Several rich-text containers may match; keep the longest text,
        # assuming it is the real description.
        descriptions = soup.select("div[data-testid=mesh-container-content] div[data-testid=inline-content] div[data-testid=mesh-container-content] div[data-testid=richTextElement]")
        if descriptions:
            descriptions = [d.get_text() for d in descriptions]
            description = max(descriptions, key=len)
        else:
            description = None

        self.add_event_with_props(event_url, None, "Concert", start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)

View File

@ -22,6 +22,8 @@ class Extractor(ABC):
return start_day return start_day
else: else:
return start_day + timedelta(days=1) return start_day + timedelta(days=1)
else:
return start_day
def guess_month(self, text): def guess_month(self, text):
mths = ["jan", "fe", "mar", "av", "mai", "juin", "juill", "ao", "sep", "oct", "nov", "dec"] mths = ["jan", "fe", "mar", "av", "mai", "juin", "juill", "ao", "sep", "oct", "nov", "dec"]
@ -33,14 +35,14 @@ class Extractor(ABC):
def parse_french_date(self, text): def parse_french_date(self, text):
# format NomJour Numero Mois Année # format NomJour Numero Mois Année
m = re.search('[a-zA-Z:.]+[ ]*([0-9]+)[ ]*([a-zA-Z:.]+)[ ]*([0-9]+)', text) m = re.search('[a-zA-ZéÉûÛ:.]+[ ]*([0-9]+)[er]*[ ]*([a-zA-ZéÉûÛ:.]+)[ ]*([0-9]+)', text)
if m: if m:
day = m.group(1) day = m.group(1)
month = self.guess_month(m.group(2)) month = self.guess_month(m.group(2))
year = m.group(3) year = m.group(3)
else: else:
# format Numero Mois Annee # format Numero Mois Annee
m = re.search('([0-9]+)[ ]*([a-zA-Z:.]+)[ ]*([0-9]+)', text) m = re.search('([0-9]+)[er]*[ ]*([a-zA-ZéÉûÛ:.]+)[ ]*([0-9]+)', text)
if m: if m:
day = m.group(1) day = m.group(1)
month = self.guess_month(m.group(2)) month = self.guess_month(m.group(2))
@ -71,14 +73,14 @@ class Extractor(ABC):
s = m.group(3) s = m.group(3)
else: else:
# format heures minutes # format heures minutes
m = re.search('([0-9]+)[ h:.]+([0-9]+)', text) m = re.search('([0-9]+)[ hH:.]+([0-9]+)', text)
if m: if m:
h = m.group(1) h = m.group(1)
m = m.group(2) m = m.group(2)
s = "0" s = "0"
else: else:
# format heures # format heures
m = re.search('([0-9]+)[ h:.]', text) m = re.search('([0-9]+)[ Hh:.]', text)
if m: if m:
h = m.group(1) h = m.group(1)
m = "0" m = "0"

View File

@ -58,7 +58,11 @@ class TwoStepsExtractor(Extractor):
return url return url
def add_event_url(self, url): def add_event_url(self, url):
if url in self.event_urls:
return False
else:
self.event_urls.append(url) self.event_urls.append(url)
return True
def add_event_start_day(self, url, start_day): def add_event_start_day(self, url, start_day):
if not url in self.event_properties: if not url in self.event_properties:

View File

@ -0,0 +1,18 @@
# Generated by Django 4.2.7 on 2024-04-20 13:51
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add the 'lapucealoreille' choice to RecurrentImport.processor.

    Auto-generated by Django; the choices list must stay in sync with
    RecurrentImport.PROCESSOR in models.py.
    """

    dependencies = [
        ('agenda_culturel', '0053_alter_recurrentimport_processor'),
    ]

    operations = [
        migrations.AlterField(
            model_name='recurrentimport',
            name='processor',
            # NOTE(review): the label reads "la puce à loreille" (missing
            # apostrophe); it matches models.py as committed — fix both together.
            field=models.CharField(choices=[('ical', 'ical'), ('icalnobusy', 'ical no busy'), ('icalnovc', 'ical no VC'), ('lacoope', 'lacoope.org'), ('lacomedie', 'la comédie'), ('lefotomat', 'le fotomat'), ('lapucealoreille', 'la puce à loreille')], default='ical', max_length=20, verbose_name='Processor'),
        ),
    ]

View File

@ -760,6 +760,7 @@ class RecurrentImport(models.Model):
LACOOPE = "lacoope", _('lacoope.org') LACOOPE = "lacoope", _('lacoope.org')
LACOMEDIE = "lacomedie", _('la comédie') LACOMEDIE = "lacomedie", _('la comédie')
LEFOTOMAT = "lefotomat", _('le fotomat') LEFOTOMAT = "lefotomat", _('le fotomat')
LAPUCEALOREILLE = "lapucealoreille", _('la puce à l''oreille')
class DOWNLOADER(models.TextChoices): class DOWNLOADER(models.TextChoices):
SIMPLE = "simple", _("simple") SIMPLE = "simple", _("simple")