Add support for Le Photomat'

This commit is contained in:
Jean-Marie Favreau 2024-04-20 12:11:39 +02:00
parent a7f5645cf1
commit aa878b8fb3
7 changed files with 230 additions and 7 deletions

View File

@@ -0,0 +1,43 @@
#!/usr/bin/python3
# coding: utf-8
import os
import json
import sys
# Getting the name of the directory
# where this file is present.
current = os.path.dirname(os.path.realpath(__file__))
# Getting the parent directory name
# where the current directory is present.
parent = os.path.dirname(current)
# adding the parent directory to
# the sys.path.
sys.path.append(parent)
from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *
if __name__ == "__main__":

    u2e = URL2Events(SimpleDownloader(), LePhotomatExtractor())
    url = "https://www.lefotomat.com/feed"
    url_human = "https://www.lefotomat.com/"

    try:
        events = u2e.process(url, url_human, cache = "cache-lephotomat.xml", default_values = {"location": "Le Photomat'"}, published = True)

        exportfile = "events-lephotomat.json"
        print("Saving events to file {}".format(exportfile))
        with open(exportfile, "w") as f:
            json.dump(events, f, indent=4, default=str)
    except Exception as e:
        print("Exception: " + str(e))

View File

@@ -105,6 +105,8 @@ def run_recurrent_import(self, pk):
        extractor = LaCoopeExtractor()
    elif rimport.processor == RecurrentImport.PROCESSOR.LACOMEDIE:
        extractor = LaComedieExtractor()
    elif rimport.processor == RecurrentImport.PROCESSOR.LEPHOTOMAT:
        extractor = LePhotomatExtractor()
    else:
        extractor = None

View File

@@ -2,7 +2,7 @@
from .generic_extractors import *
import re
import json5
from datetime import timedelta

# A class dedicated to get events from La Coopérative de Mai:
# URL: https://www.lacoope.org/concerts-calendrier/
@@ -19,8 +19,8 @@ class LaCoopeExtractor(TwoStepsExtractor):
        search = re.search(r"window.fullCalendarContent = (.*)</script>", str(script), re.S)
        if search:
            data = json5.loads(search.group(1))
            self.event_urls = [e['url'] for e in data['events']]
            for e in data['events']:
                self.add_event_url(e['url'])
                if e['tag'] == "Gratuit":
                    self.add_event_tag(e['url'], 'gratuit')
@@ -81,7 +81,6 @@ class LaComedieExtractor(TwoStepsExtractor):
    def build_event_url_list(self, content):
        self.event_urls = []

        dates = json5.loads(content)["data"][0]
        url = self.url.split("?")[0]
@@ -96,7 +95,7 @@ class LaComedieExtractor(TwoStepsExtractor):
            events = soup.select("div.unedatedev")
            for e in events:
                e_url = e.select('a')[0]["href"] + "#" + d  # a "fake" url specific for each day of this show
                self.event_urls.append(e_url)
                self.add_event_url(e_url)
                self.add_event_start_day(e_url, d)
                t = str(e.select('div#datecal')[0]).split(' ')[-1].split('<')[0]
                self.add_event_start_time(e_url, t)
@@ -127,3 +126,72 @@ class LaComedieExtractor(TwoStepsExtractor):
        url_human = event_url
        self.add_event_with_props(event_url, None, None, None, None, description, [], recurrences=None, uuid=event_url, url_human=url_human, published=published, image=image)


# A class dedicated to get events from Le Photomat'
# URL: https://www.lefotomat.com/
class LePhotomatExtractor(TwoStepsExtractor):

    nom_lieu = "Le Photomat'"

    def category_photomat2agenda(self, category):
        if not category:
            return None
        mapping = { "Concerts": "Concert"}
        if category in mapping:
            return mapping[category]
        else:
            return None

    def build_event_url_list(self, content):
        soup = BeautifulSoup(content, "xml")

        events = soup.select("item")
        for e in events:
            e_url = e.find("link").contents[0]
            self.add_event_url(e_url)

            title = e.find("title").contents[0]
            self.add_event_title(e_url, title)

            category = self.category_photomat2agenda(e.find("category").contents[0])
            if category:
                self.add_event_category(e_url, category)

    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
        soup = BeautifulSoup(event_content, "html.parser")

        image = soup.select("div.post-content img.wp-post-image")
        if image:
            image = image[0]["src"]
        else:
            image = None

        # the meta description is expected to read "<french date> - <start time> - <end time>"
        desc = soup.select("head meta[name=description]")[0]["content"]
        start_day = self.parse_french_date(desc.split("-")[0])
        start_time = self.parse_french_time(desc.split("-")[1])
        end_time = self.parse_french_time(desc.split("-")[2])
        end_day = self.guess_end_day(start_day, start_time, end_time)

        location = self.nom_lieu

        descriptions = soup.select("div.vce-col-content")
        if descriptions:
            descriptions = [d.get_text() for d in descriptions]
            description = max(descriptions, key=len)
        else:
            description = None

        # derive tags from the "category-..." CSS classes of the article element
        article = soup.select("article.post")
        tags = []
        for c in article[0]["class"]:
            if c.startswith("category-"):
                tag = '-'.join(c.split("-")[1:])
                if tag != "concerts":
                    tags.append(tag)

        url_human = event_url

        self.add_event_with_props(event_url, None, None, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)

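The extractor above assumes the venue publishes a standard WordPress RSS feed in which every <item> carries <link>, <title> and <category> children; build_event_url_list reads exactly those three elements. A minimal sketch of that assumption, using a hand-written item rather than the real https://www.lefotomat.com/feed content (it needs lxml for BeautifulSoup's "xml" parser):

# Illustrative only: sample_feed is made up, not taken from the real feed.
from bs4 import BeautifulSoup

sample_feed = """<?xml version="1.0"?>
<rss version="2.0"><channel>
  <item>
    <title>Example Band</title>
    <link>https://www.lefotomat.com/example-event</link>
    <category>Concerts</category>
  </item>
</channel></rss>"""

soup = BeautifulSoup(sample_feed, "xml")
for item in soup.select("item"):
    print(item.find("link").contents[0])      # event URL, registered via add_event_url
    print(item.find("title").contents[0])     # event title
    print(item.find("category").contents[0])  # mapped by category_photomat2agenda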
View File

@@ -1,8 +1,14 @@
from abc import ABC, abstractmethod
from bs4 import BeautifulSoup
from datetime import datetime
from datetime import datetime, time, date, timedelta
import re
import unicodedata


def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    return u"".join([c for c in nfkd_form if not unicodedata.combining(c)])


class Extractor(ABC):

    def __init__(self):
@@ -10,6 +16,88 @@ class Extractor(ABC):
        self.events = []
        self.downloader = None

    def guess_end_day(self, start_day, start_time, end_time):
        # an end time earlier than the start time means the event ends the next day
        if end_time:
            if end_time > start_time:
                return start_day
            else:
                return start_day + timedelta(days=1)

    def guess_month(self, text):
        # match the accent-stripped text against French month-name prefixes (1-based index)
        mths = ["jan", "fe", "mar", "av", "mai", "juin", "juill", "ao", "sep", "oct", "nov", "dec"]
        t = remove_accents(text).lower()
        for i, m in enumerate(mths):
            if t.startswith(m):
                return i + 1
        return None
    def parse_french_date(self, text):
        # format: weekday day month year
        m = re.search('[a-zA-Z:.]+[ ]*([0-9]+)[ ]*([a-zA-Z:.]+)[ ]*([0-9]+)', text)
        if m:
            day = m.group(1)
            month = self.guess_month(m.group(2))
            year = m.group(3)
        else:
            # format: day month year
            m = re.search('([0-9]+)[ ]*([a-zA-Z:.]+)[ ]*([0-9]+)', text)
            if m:
                day = m.group(1)
                month = self.guess_month(m.group(2))
                year = m.group(3)
            else:
                # TODO: consolidate the cases that are not matched yet
                return None

        if month is None:
            return None
        try:
            day = int(day)
            year = int(year)
        except:
            return None
        if year < 100:
            year = 2000 + year
        if day >= 32:
            return None
        return date(year, month, day)
    def parse_french_time(self, text):
        # format: hours minutes seconds
        m = re.search('([0-9]+)[ a-zA-Z:.]+([0-9]+)[ a-zA-Z:.]+([0-9]+)', text)
        if m:
            h = m.group(1)
            s = m.group(3)
            # read the seconds before m is reused to hold the minutes
            m = m.group(2)
        else:
            # format: hours minutes
            m = re.search('([0-9]+)[ h:.]+([0-9]+)', text)
            if m:
                h = m.group(1)
                m = m.group(2)
                s = "0"
            else:
                # format: hours only
                m = re.search('([0-9]+)[ h:.]', text)
                if m:
                    h = m.group(1)
                    m = "0"
                    s = "0"
                else:
                    return None

        try:
            h = int(h)
            m = int(m)
            s = int(s)
        except:
            return None
        if h >= 24 or m >= 60 or s >= 60:
            return None
        return time(h, m, s)

    @abstractmethod
    def extract(self, content, url, url_human = None, default_values = None, published = False):
        pass

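The date and time helpers above are only exercised indirectly by the extractors, so here is a quick sketch of the behaviour they are expected to have. The inputs are made up (they are not taken from the project's tests), and the import path assumes the repository root is on sys.path, as in the standalone script at the top of this commit:

# Illustrative only: hand-written inputs in the formats the regexes accept.
from datetime import date, time
from src.agenda_culturel.import_tasks.custom_extractors import LePhotomatExtractor

e = LePhotomatExtractor()  # any concrete Extractor subclass exposes the helpers
assert e.parse_french_date("vendredi 26 avril 2024") == date(2024, 4, 26)
assert e.parse_french_date("26 avril 2024") == date(2024, 4, 26)
assert e.parse_french_time("20h30") == time(20, 30)
assert e.parse_french_time("20h") == time(20, 0)
# an end time smaller than the start time is interpreted as "the next day"
assert e.guess_end_day(date(2024, 4, 26), time(21, 0), time(1, 0)) == date(2024, 4, 27)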
View File

@@ -57,6 +57,9 @@ class TwoStepsExtractor(Extractor):
    def clean_url(url):
        return url

    def add_event_url(self, url):
        self.event_urls.append(url)

    def add_event_start_day(self, url, start_day):
        if not url in self.event_properties:
            self.event_properties[url] = {}
@@ -124,7 +127,7 @@ class TwoStepsExtractor(Extractor):
        self.clear_events()
        self.url = url
        self.event_urls = None
        self.event_urls = []
        self.event_properties.clear()

        # first build the event list

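With add_event_url now centralizing the URL bookkeeping (and the extraction entry point resetting event_urls to an empty list, as shown above), a site-specific extractor only has to implement the two steps. A hypothetical minimal subclass, for illustration only: ExampleExtractor and its CSS selectors are made up, and it assumes the same star imports as custom_extractors.py, which provide TwoStepsExtractor and BeautifulSoup.

# Hypothetical sketch, not part of the project.
class ExampleExtractor(TwoStepsExtractor):

    def build_event_url_list(self, content):
        # step 1: register one URL (and optional early metadata) per event on the listing page
        soup = BeautifulSoup(content, "html.parser")
        for a in soup.select("a.event-link"):
            self.add_event_url(a["href"])
            self.add_event_title(a["href"], a.get_text())

    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
        # step 2: called once per registered URL with the downloaded event page
        soup = BeautifulSoup(event_content, "html.parser")
        description = soup.get_text()
        self.add_event_with_props(event_url, None, None, None, None, description, [], recurrences=None, uuid=event_url, url_human=event_url, published=published, image=None)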
View File

@@ -0,0 +1,18 @@
# Generated by Django 4.2.7 on 2024-04-20 10:03

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('agenda_culturel', '0050_alter_recurrentimport_processor'),
    ]

    operations = [
        migrations.AlterField(
            model_name='recurrentimport',
            name='processor',
            field=models.CharField(choices=[('ical', 'ical'), ('icalnobusy', 'ical no busy'), ('icalnovc', 'ical no VC'), ('lacoope', 'lacoope.org'), ('lacomedie', 'la comédie'), ('lephotomat', 'le photomat')], default='ical', max_length=20, verbose_name='Processor'),
        ),
    ]

View File

@@ -759,6 +759,7 @@ class RecurrentImport(models.Model):
        ICALNOVC = "icalnovc", _("ical no VC")
        LACOOPE = "lacoope", _('lacoope.org')
        LACOMEDIE = "lacomedie", _('la comédie')
        LEPHOTOMAT = "lephotomat", _('le photomat')

    class DOWNLOADER(models.TextChoices):
        SIMPLE = "simple", _("simple")
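
The migration shown above is the counterpart of this new choice: changing the choices list on RecurrentImport.processor is enough for Django's makemigrations to regenerate the AlterField operation, which is presumably how that file was produced.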