Add import of events from La Coopé

Jean-Marie Favreau 2024-04-19 14:45:14 +02:00
parent 8d8fa39c93
commit c1629f4692
12 changed files with 296 additions and 22 deletions

View File

@ -0,0 +1,43 @@
#!/usr/bin/python3
# coding: utf-8

import os
import json
import sys

# getting the name of the directory where this file is located
current = os.path.dirname(os.path.realpath(__file__))
# getting the parent directory of the current directory
parent = os.path.dirname(current)
# adding the parent directory to sys.path so that src can be imported
sys.path.append(parent)

from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *


if __name__ == "__main__":
    u2e = URL2Events(SimpleDownloader(), LaCoopeExtractor())
    url = "https://www.lacoope.org/concerts-calendrier/"
    url_human = "https://www.lacoope.org/concerts-calendrier/"

    try:
        events = u2e.process(url, url_human, cache = "cache-lacoope.ical", default_values = {"category": "Concert", "location": "La Coopérative"}, published = True)

        exportfile = "events-lacoope.json"
        print("Saving events to file {}".format(exportfile))
        with open(exportfile, "w") as f:
            json.dump(events, f, indent=4, default=str)
    except Exception as e:
        print("Exception: " + str(e))

View File

@ -0,0 +1,29 @@
# Notes on sources for the Clermont area

## La Comédie de Clermont

URL of the dates that have events: https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php?action=load_dates_existantes

URL of the details for a given date, with POST parameters:
```curl --data "action=load_evenements_jour" --data "jour=2024-04-19" "https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php"```

The returned data is fairly terse HTML, with the essential information in it.

## La coopé

The source of https://www.lacoope.org/concerts-calendrier/ contains a JavaScript array with the URLs of the events. This array may carry "Gratuit" (free) as a tag. The time of the event is not included.

Each event page contains:

- a meta name="description"
- a https://calendar.google.com/calendar/ URL holding most of the data

## Le caveau de la michodière

The address https://www.lecaveaudelamichodiere.com/concerts/ lists the concerts of the current month.

The page is populated by a JavaScript request that seems hard to replay independently, since it answers with a 403 error (proprietary eventon plugin).

If we grab the identifier of an event (of the form event_11377_0), we can forge a URL such as
```https://www.lecaveaudelamichodiere.com/wp-admin/admin-ajax.php?action=eventon_ics_download&event_id=11377&ri=0``` to retrieve an ical file of the event.
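
A sketch of this forging step in Python (the `event_<id>_<ri>` identifier format is inferred from the example above):

```python
import re

def eventon_ics_url(identifier):
    # Build the ics download URL from an identifier such as "event_11377_0".
    m = re.fullmatch(r"event_(\d+)_(\d+)", identifier)
    if m is None:
        raise ValueError("unexpected identifier: {}".format(identifier))
    event_id, ri = m.groups()
    return ("https://www.lecaveaudelamichodiere.com/wp-admin/admin-ajax.php"
            "?action=eventon_ics_download&event_id={}&ri={}".format(event_id, ri))

# eventon_ics_url("event_11377_0")
# -> ".../admin-ajax.php?action=eventon_ics_download&event_id=11377&ri=0"
```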
## La petite gaillarde

The RSS feed https://lapetitegaillarde.fr/?feed=rss2 is reasonably well structured.
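
A minimal sketch of reading that feed with the standard library (the exact tags available would need to be checked against the feed itself):

```python
import urllib.request
import xml.etree.ElementTree as ET

with urllib.request.urlopen("https://lapetitegaillarde.fr/?feed=rss2") as f:
    root = ET.fromstring(f.read())

# RSS 2.0 layout: items live under channel/item
for item in root.findall("./channel/item"):
    print(item.findtext("title"), item.findtext("pubDate"), item.findtext("link"))
```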

View File

@ -9,6 +9,7 @@ from .import_tasks.downloader import *
 from .import_tasks.extractor import *
 from .import_tasks.importer import *
 from .import_tasks.extractor_ical import *
+from .import_tasks.custom_extractors import *

@ -100,6 +101,8 @@ def run_recurrent_import(self, pk):
         extractor = ICALNoBusyExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.ICALNOVC:
         extractor = ICALNoVCExtractor()
+    elif rimport.processor == RecurrentImport.PROCESSOR.LACOOPE:
+        extractor = LaCoopeExtractor()
     else:
         extractor = None

View File

@ -36,6 +36,7 @@ class DBImporterEvents:
         return self.nb_removed

     def import_events(self, json_structure):
+        print(json_structure)
         self.init_result_properties()

         try:

View File

@ -0,0 +1,64 @@
from .generic_extractors import *
import re
import json5

# A class dedicated to extracting events from La Coopérative de Mai
# URL: https://www.lacoope.org/concerts-calendrier/
class LaCoopeExtractor(TwoStepsExtractor):

    nom_lieu = "La Coopérative de Mai"

    def build_event_url_list(self, content):
        soup = BeautifulSoup(content, "html.parser")
        script = soup.find('div', class_="js-filter__results").findChildren('script')
        if len(script) == 0:
            raise Exception("Cannot find events in the first page")
        script = script[0]
        # the event list is embedded in the page as a javascript array
        search = re.search(r"window\.fullCalendarContent = (.*)</script>", str(script), re.S)
        if search:
            data = json5.loads(search.group(1))
            self.event_urls = [e['url'] for e in data['events']]
            for e in data['events']:
                if e['tag'] == "Gratuit":
                    self.add_event_tag(e['url'], 'gratuit')
        else:
            raise Exception('Cannot extract events from javascript')

    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
        soup = BeautifulSoup(event_content, "html.parser")
        title = soup.find("h1").contents[0]
        category = "Concert"
        image = soup.find("meta", property="og:image")
        if image:
            image = image["content"]

        description = soup.find("div", class_="grid-concert-content")
        if description:
            description = description.find('div', class_="content-striped")
            if description:
                description = description.find('div', class_='wysiwyg')
                if description:
                    description = description.get_text()
        if description is None:
            description = ""

        tags = []

        link_calendar = soup.select('a[href^="https://calendar.google.com/calendar/"]')
        if len(link_calendar) == 0:
            raise Exception('Cannot find the google calendar url')

        gg_cal = GGCalendar(link_calendar[0]["href"])
        start_day = gg_cal.start_day
        start_time = gg_cal.start_time
        end_day = gg_cal.end_day
        end_time = gg_cal.end_time
        location = LaCoopeExtractor.nom_lieu
        url_human = event_url

        self.add_event_with_props(event_url, title, category, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)

View File

@ -1,5 +1,6 @@
 from urllib.parse import urlparse
 import urllib.request
+import os
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.chrome.options import Options

@ -15,6 +16,24 @@ class Downloader(ABC):
     def download(self, url):
         pass

+    def get_content(self, url, cache = None):
+        if cache and os.path.exists(cache):
+            print("Loading cache ({})".format(cache))
+            with open(cache) as f:
+                content = f.read()
+        else:
+            content = self.download(url)
+            if cache:
+                print("Saving cache ({})".format(cache))
+                dir = os.path.dirname(cache)
+                if dir != "" and not os.path.exists(dir):
+                    os.makedirs(dir)
+                with open(cache, "w") as text_file:
+                    text_file.write(content)
+        return content
 class SimpleDownloader(Downloader):
     def __init__(self):
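
A usage sketch of this caching layer (assuming the SimpleDownloader subclass listed above; the cache path is hypothetical):

```python
d = SimpleDownloader()
# first call: downloads the page and writes cache/lacoope.html
content = d.get_content("https://www.lacoope.org/concerts-calendrier/", cache="cache/lacoope.html")
# later calls with the same cache path read the file back instead of downloading
content = d.get_content("https://www.lacoope.org/concerts-calendrier/", cache="cache/lacoope.html")
```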

View File

@ -8,11 +8,15 @@ class Extractor(ABC):
     def __init__(self):
         self.header = {}
         self.events = []
+        self.downloader = None

     @abstractmethod
-    def extract(self, content, url, url_human = None):
+    def extract(self, content, url, url_human = None, default_values = None, published = False):
         pass

+    def set_downloader(self, downloader):
+        self.downloader = downloader

     @abstractmethod
     def clean_url(url):
         pass

View File

@ -0,0 +1,109 @@
from abc import abstractmethod
from urllib.parse import urlparse
from urllib.parse import parse_qs

from .extractor import *

from django.utils.translation import gettext_lazy as _

from dateutil import parser
import datetime


class GGCalendar:
    def __init__(self, url):
        self.url = url
        self.extract_info()

    def extract_info(self):
        parsed_url = urlparse(self.url.replace("#", "%23"))
        params = parse_qs(parsed_url.query)

        self.location = params['location'][0] if 'location' in params else None
        self.title = params['text'][0] if 'text' in params else None

        if 'dates' in params:
            dates = [x.replace(" ", "+") for x in params['dates'][0].split("/")]
            if len(dates) > 0:
                date = parser.parse(dates[0])
                self.start_day = date.date()
                self.start_time = date.time()
                if len(dates) == 2:
                    date = parser.parse(dates[1])
                    self.end_day = date.date()
                    self.end_time = date.time()
                else:
                    self.end_day = None
                    self.end_time = None
        else:
            raise Exception("Unable to find a date in the google calendar URL")
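
# Behaviour sketch (the URL below is a hypothetical example following the
# dates=<start>/<end> format of google calendar template links):
#
#   cal = GGCalendar("https://calendar.google.com/calendar/render?action=TEMPLATE"
#                    "&text=Concert&dates=20240419T190000/20240419T230000"
#                    "&location=La+Coop%C3%A9rative+de+Mai")
#   cal.title      -> "Concert"
#   cal.start_day  -> datetime.date(2024, 4, 19)
#   cal.start_time -> datetime.time(19, 0)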
# A class to extract events from a URL in two steps:
# - first build a list of the urls where the events will be found
# - then build the events from each document downloaded from these urls
# This class is abstract
class TwoStepsExtractor(Extractor):

    def __init__(self):
        super().__init__()
        self.event_urls = None
        self.event_properties = {}

    def clean_url(url):
        return url

    def add_event_tag(self, url, tag):
        if url not in self.event_properties:
            self.event_properties[url] = {}
        if "tags" not in self.event_properties[url]:
            self.event_properties[url]["tags"] = []
        self.event_properties[url]["tags"].append(tag)

    def add_event_with_props(self, event_url, title, category, start_day, location, description, tags, uuid, recurrences=None, url_human=None, start_time=None, end_day=None, end_time=None, last_modified=None, published=False, image=None, image_alt=None):
        # merge the tags collected per url (see add_event_tag) with the given ones
        if event_url in self.event_properties and 'tags' in self.event_properties[event_url]:
            tags = tags + self.event_properties[event_url]['tags']
        self.add_event(title, category, start_day, location, description, tags, uuid, recurrences, url_human, start_time, end_day, end_time, last_modified, published, image, image_alt)

    @abstractmethod
    def build_event_url_list(self, content):
        pass

    @abstractmethod
    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
        pass

    def extract(self, content, url, url_human = None, default_values = None, published = False):
        self.set_header(url)
        self.clear_events()
        self.event_urls = None
        self.event_properties.clear()

        # first build the list of event urls
        self.build_event_url_list(content)
        if self.event_urls is None:
            raise Exception('Unable to find the event list in the main document')
        if self.downloader is None:
            raise Exception('The downloader is not defined')

        # then process each element of the list
        for event_url in self.event_urls:
            # first download the content associated with this link
            content_event = self.downloader.get_content(event_url)
            if content_event is None:
                raise Exception(_('Cannot extract event from url {}').format(event_url))
            # then extract event information from this html document
            self.add_event_from_content(content_event, event_url, url_human, default_values, published)

        return self.get_structure()
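
# A minimal subclass sketch, for illustration only (the site and values below
# are hypothetical; see LaCoopeExtractor in custom_extractors for a real one):
#
#   class ExampleExtractor(TwoStepsExtractor):
#       def build_event_url_list(self, content):
#           # parse the main document and fill self.event_urls
#           self.event_urls = ["https://example.org/event/1"]
#
#       def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
#           # parse one event page and register the event
#           self.add_event_with_props(event_url, "Title", "Concert", datetime.date(2024, 4, 19), "Venue", "", [], uuid=event_url, published=published)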

View File

@ -12,40 +12,22 @@ class URL2Events:
         self.extractor = extractor
         self.single_event = single_event

-    def get_content(self, url, cache = None):
-        if cache and os.path.exists(cache):
-            print("Loading cache ({})".format(cache))
-            with open(cache) as f:
-                content = "\n".join(f.readlines())
-        else:
-            content = self.downloader.download(url)
-            if cache:
-                print("Saving cache ({})".format(cache))
-                dir = os.path.dirname(cache)
-                if dir != "" and not os.path.exists(dir):
-                    os.makedirs(dir)
-                with open(cache, "w") as text_file:
-                    text_file.write(content)
-        return content

     def process(self, url, url_human = None, cache = None, default_values = None, published = False):
-        content = self.get_content(url, cache)
+        content = self.downloader.get_content(url, cache)
         if content is None:
             return None

         if self.extractor is not None:
+            self.extractor.set_downloader(self.downloader)
             return self.extractor.extract(content, url, url_human, default_values, published)
         else:
             # if the extractor is not defined, use a list of default extractors
             for e in Extractor.get_default_extractors(self.single_event):
-                #try:
+                e.set_downloader(self.downloader)
                 events = e.extract(content, url, url_human, default_values, published)
                 if events is not None:
                     return events
-                #except:
-                #    continue
         return None

View File

@ -0,0 +1,18 @@
# Generated by Django 4.2.7 on 2024-04-19 12:07

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('agenda_culturel', '0048_auto_20240417_1212'),
    ]

    operations = [
        migrations.AlterField(
            model_name='recurrentimport',
            name='processor',
            field=models.CharField(choices=[('ical', 'ical'), ('icalnobusy', 'ical no busy'), ('icalnovc', 'ical no VC'), ('lacoope', 'lacoope.org')], default='ical', max_length=20, verbose_name='Processor'),
        ),
    ]

View File

@ -754,6 +754,7 @@ class RecurrentImport(models.Model):
ICAL = "ical", _("ical") ICAL = "ical", _("ical")
ICALNOBUSY = "icalnobusy", _("ical no busy") ICALNOBUSY = "icalnobusy", _("ical no busy")
ICALNOVC = "icalnovc", _("ical no VC") ICALNOVC = "icalnovc", _("ical no VC")
LACOOPE = "lacoope", _('lacoope.org')
class DOWNLOADER(models.TextChoices): class DOWNLOADER(models.TextChoices):
SIMPLE = "simple", _("simple") SIMPLE = "simple", _("simple")

View File

@ -36,4 +36,5 @@ django-recurrence==1.11.1
 icalendar==5.0.11
 lxml==5.1.0
 bbcode==1.1.0
+json5==0.9.25