Add support for Le Photomat'
This commit is contained in:
parent
a7f5645cf1
commit
aa878b8fb3
experimentations/get_lephotomat.py (new executable file, +43)
@@ -0,0 +1,43 @@
+#!/usr/bin/python3
+# coding: utf-8
+
+import os
+import json
+import sys
+
+# getting the name of the directory
+# where this file is present.
+current = os.path.dirname(os.path.realpath(__file__))
+
+# getting the parent directory name
+# where the current directory is present.
+parent = os.path.dirname(current)
+
+# adding the parent directory to
+# the sys.path.
+sys.path.append(parent)
+
+from src.agenda_culturel.import_tasks.downloader import *
+from src.agenda_culturel.import_tasks.extractor import *
+from src.agenda_culturel.import_tasks.importer import *
+from src.agenda_culturel.import_tasks.custom_extractors import *
+
+
+if __name__ == "__main__":
+
+    u2e = URL2Events(SimpleDownloader(), LePhotomatExtractor())
+    url = "https://www.lefotomat.com/feed"
+    url_human = "https://www.lefotomat.com/"
+
+    try:
+        events = u2e.process(url, url_human, cache="cache-lephotomat.xml", default_values={"location": "Le Photomat'"}, published=True)
+
+        exportfile = "events-lephotomat.json"
+        print("Saving events to file {}".format(exportfile))
+        with open(exportfile, "w") as f:
+            json.dump(events, f, indent=4, default=str)
+    except Exception as e:
+        print("Exception: " + str(e))
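To sanity-check the export, the file can be reloaded right away (a minimal sketch; it only assumes the script above completed and wrote valid JSON):

    import json

    # reload the freshly written export to make sure it is valid JSON
    with open("events-lephotomat.json") as f:
        events = json.load(f)
    print(events)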
@@ -105,6 +105,8 @@ def run_recurrent_import(self, pk):
             extractor = LaCoopeExtractor()
         elif rimport.processor == RecurrentImport.PROCESSOR.LACOMEDIE:
             extractor = LaComedieExtractor()
+        elif rimport.processor == RecurrentImport.PROCESSOR.LEPHOTOMAT:
+            extractor = LePhotomatExtractor()
         else:
             extractor = None
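As more venues are added, the elif chain above could become a lookup table; a hypothetical refactor, not part of this commit:

    # map each RecurrentImport.PROCESSOR value to its extractor class
    EXTRACTORS = {
        RecurrentImport.PROCESSOR.LACOOPE: LaCoopeExtractor,
        RecurrentImport.PROCESSOR.LACOMEDIE: LaComedieExtractor,
        RecurrentImport.PROCESSOR.LEPHOTOMAT: LePhotomatExtractor,
    }
    extractor_class = EXTRACTORS.get(rimport.processor)
    extractor = extractor_class() if extractor_class else None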
@@ -2,7 +2,7 @@
 from .generic_extractors import *
 import re
 import json5
+from datetime import timedelta
 
 # A class dedicated to get events from La Coopérative de Mai:
 # URL: https://www.lacoope.org/concerts-calendrier/
@@ -19,8 +19,8 @@ class LaCoopeExtractor(TwoStepsExtractor):
         search = re.search(r"window.fullCalendarContent = (.*)</script>", str(script), re.S)
         if search:
             data = json5.loads(search.group(1))
-            self.event_urls = [e['url'] for e in data['events']]
             for e in data['events']:
+                self.add_event_url(e['url'])
                 if e['tag'] == "Gratuit":
                     self.add_event_tag(e['url'], 'gratuit')
@@ -81,7 +81,6 @@ class LaComedieExtractor(TwoStepsExtractor):
 
 
     def build_event_url_list(self, content):
-        self.event_urls = []
         dates = json5.loads(content)["data"][0]
 
         url = self.url.split("?")[0]
@@ -96,7 +95,7 @@ class LaComedieExtractor(TwoStepsExtractor):
             events = soup.select("div.unedatedev")
             for e in events:
                 e_url = e.select('a')[0]["href"] + "#" + d  # a "fake" url specific for each day of this show
-                self.event_urls.append(e_url)
+                self.add_event_url(e_url)
                 self.add_event_start_day(e_url, d)
                 t = str(e.select('div#datecal')[0]).split(' ')[-1].split('<')[0]
                 self.add_event_start_time(e_url, t)
@@ -127,3 +126,72 @@ class LaComedieExtractor(TwoStepsExtractor):
         url_human = event_url
 
         self.add_event_with_props(event_url, None, None, None, None, description, [], recurrences=None, uuid=event_url, url_human=url_human, published=published, image=image)
+
+
+
+# A class dedicated to get events from Le Photomat'
+# URL: https://www.lefotomat.com/
+class LePhotomatExtractor(TwoStepsExtractor):
+
+    nom_lieu = "Le Photomat'"
+
+    def category_photomat2agenda(self, category):
+        if not category:
+            return None
+        mapping = {"Concerts": "Concert"}
+        if category in mapping:
+            return mapping[category]
+        else:
+            return None
+
+    def build_event_url_list(self, content):
+        # the events are listed as <item> entries in the venue's RSS feed
+        soup = BeautifulSoup(content, "xml")
+
+        events = soup.select("item")
+        for e in events:
+            e_url = e.find("link").contents[0]
+            self.add_event_url(e_url)
+
+            title = e.find("title").contents[0]
+            self.add_event_title(e_url, title)
+
+            category = self.category_photomat2agenda(e.find("category").contents[0])
+            if category:
+                self.add_event_category(e_url, category)
+
+    def add_event_from_content(self, event_content, event_url, url_human=None, default_values=None, published=False):
+        soup = BeautifulSoup(event_content, "html.parser")
+        image = soup.select("div.post-content img.wp-post-image")
+        if image:
+            image = image[0]["src"]
+        else:
+            image = None
+
+        # the meta description is expected to hold "<date> - <start time> - <end time>"
+        desc = soup.select("head meta[name=description]")[0]["content"]
+        start_day = self.parse_french_date(desc.split("-")[0])
+        start_time = self.parse_french_time(desc.split("-")[1])
+        end_time = self.parse_french_time(desc.split("-")[2])
+        end_day = self.guess_end_day(start_day, start_time, end_time)
+
+        location = self.nom_lieu
+        descriptions = soup.select("div.vce-col-content")
+        if descriptions:
+            descriptions = [d.get_text() for d in descriptions]
+            description = max(descriptions, key=len)
+        else:
+            description = None
+
+        # WordPress exposes post categories as CSS classes ("category-..."); reuse them as tags
+        article = soup.select("article.post")
+        tags = []
+        for c in article[0]["class"]:
+            if c.startswith("category-"):
+                tag = '-'.join(c.split("-")[1:])
+                if tag != "concerts":
+                    tags.append(tag)
+
+        url_human = event_url
+
+        self.add_event_with_props(event_url, None, None, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
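For reference, the feed parsing in build_event_url_list boils down to this standalone sketch (the sample XML below is illustrative, not taken from the real feed):

    from bs4 import BeautifulSoup

    sample = """<rss><channel><item>
    <title>Sample concert</title>
    <link>https://www.lefotomat.com/sample-event</link>
    <category>Concerts</category>
    </item></channel></rss>"""

    soup = BeautifulSoup(sample, "xml")  # the "xml" parser requires lxml
    for item in soup.select("item"):
        print(item.find("link").contents[0], "->", item.find("title").contents[0])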
@@ -1,8 +1,14 @@
 from abc import ABC, abstractmethod
 from bs4 import BeautifulSoup
-from datetime import datetime
+from datetime import datetime, time, date, timedelta
+import re
+import unicodedata
+
+
+def remove_accents(input_str):
+    nfkd_form = unicodedata.normalize('NFKD', input_str)
+    return u"".join([c for c in nfkd_form if not unicodedata.combining(c)])
 
 
 class Extractor(ABC):
 
     def __init__(self):
@@ -10,6 +16,88 @@ class Extractor(ABC):
         self.events = []
         self.downloader = None
 
+    def guess_end_day(self, start_day, start_time, end_time):
+        # an end time earlier than the start time means the event ends the next day
+        if end_time and start_time:
+            if end_time > start_time:
+                return start_day
+            else:
+                return start_day + timedelta(days=1)
+        return None
+
+    def guess_month(self, text):
+        mths = ["jan", "fe", "mar", "av", "mai", "juin", "juill", "ao", "sep", "oct", "nov", "dec"]
+        t = remove_accents(text).lower()
+        for i, m in enumerate(mths):
+            if t.startswith(m):
+                return i + 1
+        return None
+
+    def parse_french_date(self, text):
+        # format: weekday day month year
+        m = re.search('[a-zA-Z:.]+[ ]*([0-9]+)[ ]*([a-zA-Z:.]+)[ ]*([0-9]+)', text)
+        if m:
+            day = m.group(1)
+            month = self.guess_month(m.group(2))
+            year = m.group(3)
+        else:
+            # format: day month year
+            m = re.search('([0-9]+)[ ]*([a-zA-Z:.]+)[ ]*([0-9]+)', text)
+            if m:
+                day = m.group(1)
+                month = self.guess_month(m.group(2))
+                year = m.group(3)
+            else:
+                # TODO: handle the formats not covered above
+                return None
+
+        if month is None:
+            return None
+        try:
+            day = int(day)
+            year = int(year)
+        except ValueError:
+            return None
+        if year < 100:
+            year = 2000 + year
+        if day >= 32:
+            return None
+        return date(year, month, day)
+
+    def parse_french_time(self, text):
+        # format: hours minutes seconds
+        m = re.search('([0-9]+)[ a-zA-Z:.]+([0-9]+)[ a-zA-Z:.]+([0-9]+)', text)
+        if m:
+            # read all groups before overwriting m
+            h = m.group(1)
+            s = m.group(3)
+            m = m.group(2)
+        else:
+            # format: hours minutes
+            m = re.search('([0-9]+)[ h:.]+([0-9]+)', text)
+            if m:
+                h = m.group(1)
+                m = m.group(2)
+                s = "0"
+            else:
+                # format: hours only
+                m = re.search('([0-9]+)[ h:.]', text)
+                if m:
+                    h = m.group(1)
+                    m = "0"
+                    s = "0"
+                else:
+                    return None
+
+        try:
+            h = int(h)
+            m = int(m)
+            s = int(s)
+        except ValueError:
+            return None
+        if h >= 24 or m >= 60 or s >= 60:
+            return None
+        return time(h, m, s)
+
+
 
     @abstractmethod
     def extract(self, content, url, url_human = None, default_values = None, published = False):
         pass
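Expected behaviour of the new date/time helpers, worked out from the regexes above (a sketch; any concrete Extractor subclass such as LePhotomatExtractor can stand in for ex):

    from datetime import date, time

    ex = LePhotomatExtractor()

    ex.parse_french_date("vendredi 26 avril 2024")  # -> date(2024, 4, 26)
    ex.parse_french_time("20h30")                   # -> time(20, 30)
    ex.parse_french_time("no time here")            # -> None
    # the end time is before the start time, so the event spills into the next day
    ex.guess_end_day(date(2024, 4, 26), time(20, 30), time(1, 0))  # -> date(2024, 4, 27)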
@@ -57,6 +57,9 @@ class TwoStepsExtractor(Extractor):
     def clean_url(url):
         return url
 
+    def add_event_url(self, url):
+        self.event_urls.append(url)
+
     def add_event_start_day(self, url, start_day):
         if not url in self.event_properties:
             self.event_properties[url] = {}
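Routing every URL through add_event_url also gives a single place to hook in later improvements; for instance, deduplication could be added there (a hypothetical variant, not in this commit):

    def add_event_url(self, url):
        # skip URLs already collected during this import
        if url not in self.event_urls:
            self.event_urls.append(url)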
@@ -124,7 +127,7 @@ class TwoStepsExtractor(Extractor):
         self.clear_events()
 
         self.url = url
-        self.event_urls = None
+        self.event_urls = []
         self.event_properties.clear()
 
         # first build the event list
@@ -0,0 +1,18 @@
+# Generated by Django 4.2.7 on 2024-04-20 10:03
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('agenda_culturel', '0050_alter_recurrentimport_processor'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='recurrentimport',
+            name='processor',
+            field=models.CharField(choices=[('ical', 'ical'), ('icalnobusy', 'ical no busy'), ('icalnovc', 'ical no VC'), ('lacoope', 'lacoope.org'), ('lacomedie', 'la comédie'), ('lephotomat', 'le photomat')], default='ical', max_length=20, verbose_name='Processor'),
+        ),
+    ]
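This migration is the standard output of Django's makemigrations after the new PROCESSOR choice below; it only widens the set of values accepted by the processor field.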
@@ -759,6 +759,7 @@ class RecurrentImport(models.Model):
         ICALNOVC = "icalnovc", _("ical no VC")
         LACOOPE = "lacoope", _('lacoope.org')
         LACOMEDIE = "lacomedie", _('la comédie')
+        LEPHOTOMAT = "lephotomat", _('le photomat')
 
     class DOWNLOADER(models.TextChoices):
         SIMPLE = "simple", _("simple")