Add import of events from la Coopé

Jean-Marie Favreau 2024-04-19 14:45:14 +02:00
parent 8d8fa39c93
commit c1629f4692
12 changed files with 296 additions and 22 deletions

View File

@ -0,0 +1,43 @@
#!/usr/bin/python3
# coding: utf-8
import os
import json
import sys
# get the directory where this file is located
current = os.path.dirname(os.path.realpath(__file__))
# get its parent directory
parent = os.path.dirname(current)
# add the parent directory to sys.path so that src.agenda_culturel can be imported
sys.path.append(parent)
from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *
if __name__ == "__main__":
    u2e = URL2Events(SimpleDownloader(), LaCoopeExtractor())
    url = "https://www.lacoope.org/concerts-calendrier/"
    url_human = "https://www.lacoope.org/concerts-calendrier/"

    try:
        events = u2e.process(url, url_human, cache="cache-lacoope.ical", default_values={"category": "Concert", "location": "La Coopérative"}, published=True)

        exportfile = "events-lacoope.json"
        print("Saving events to file {}".format(exportfile))
        with open(exportfile, "w") as f:
            json.dump(events, f, indent=4, default=str)
    except Exception as e:
        print("Exception: " + str(e))

View File

@ -0,0 +1,29 @@
# Notes on event sources for the Clermont area

## La Comédie de Clermont

URL listing the dates that have events: https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php?action=load_dates_existantes

URL giving the details of a date, with POST parameters:
```curl --data "action=load_evenements_jour" --data "jour=2024-04-19" "https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php"```
The returned data is fairly terse HTML, with the essential information in it.
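For reference, the same request can be replayed in Python. A minimal sketch, assuming the requests library (which is not among this commit's dependencies):

```
import requests

# replay the curl command above: ask for the events of a given day
url = "https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php"
data = {"action": "load_evenements_jour", "jour": "2024-04-19"}
response = requests.post(url, data=data)
print(response.text)  # terse HTML describing the events of that day
```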
## La Coopé

In the source of https://www.lacoope.org/concerts-calendrier/ there is a JavaScript array containing the URLs of the events. An entry of this array may carry "Gratuit" (free) as a tag. The time of the event is not given there.

On each event page, there are:

- a meta name="description"
- a https://calendar.google.com/calendar/ URL with most of the data
## Le caveau de la michodière

The address https://www.lecaveaudelamichodiere.com/concerts/ gives the concerts of the current month.

The page is populated by a JavaScript request that seems hard to replay independently, since it answers with a 403 error (proprietary eventon plugin).

If we retrieve the identifier of an event (of the form event_11377_0), we can forge a URL such as
```https://www.lecaveaudelamichodiere.com/wp-admin/admin-ajax.php?action=eventon_ics_download&event_id=11377&ri=0``` to get an ical version of the event.
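A minimal sketch of this forging step, assuming the identifier has already been scraped from the page (the event_11377_0 value below is only an illustration):

```
import urllib.request

event_id = "event_11377_0"  # hypothetical identifier taken from the page
_, eid, ri = event_id.split("_")
url = ("https://www.lecaveaudelamichodiere.com/wp-admin/admin-ajax.php"
       "?action=eventon_ics_download&event_id={}&ri={}".format(eid, ri))
with urllib.request.urlopen(url) as f:
    ical = f.read().decode("utf-8")  # the ical description of the event
```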
## La petite gaillarde

The RSS feed https://lapetitegaillarde.fr/?feed=rss2 is reasonably well structured.
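A minimal sketch of reading it, assuming the feedparser library (also not among this commit's dependencies):

```
import feedparser

feed = feedparser.parse("https://lapetitegaillarde.fr/?feed=rss2")
for entry in feed.entries:
    # each entry carries at least a title and a link
    print(entry.title, entry.link, entry.get("published", ""))
```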

View File

@ -9,6 +9,7 @@ from .import_tasks.downloader import *
from .import_tasks.extractor import *
from .import_tasks.importer import *
from .import_tasks.extractor_ical import *
from .import_tasks.custom_extractors import *
@ -100,6 +101,8 @@ def run_recurrent_import(self, pk):
        extractor = ICALNoBusyExtractor()
    elif rimport.processor == RecurrentImport.PROCESSOR.ICALNOVC:
        extractor = ICALNoVCExtractor()
    elif rimport.processor == RecurrentImport.PROCESSOR.LACOOPE:
        extractor = LaCoopeExtractor()
    else:
        extractor = None

View File

@ -36,6 +36,7 @@ class DBImporterEvents:
        return self.nb_removed

    def import_events(self, json_structure):
        print(json_structure)

        self.init_result_properties()

        try:

View File

@ -0,0 +1,64 @@
from .generic_extractors import *
import re
import json5


# A class dedicated to getting events from La Coopérative de Mai:
# URL: https://www.lacoope.org/concerts-calendrier/
class LaCoopeExtractor(TwoStepsExtractor):

    nom_lieu = "La Coopérative de Mai"

    def build_event_url_list(self, content):
        soup = BeautifulSoup(content, "html.parser")

        script = soup.find('div', class_="js-filter__results").findChildren('script')
        if len(script) == 0:
            raise Exception("Cannot find events in the first page")
        script = script[0]

        # the event list is embedded as a javascript array, parsed with json5
        search = re.search(r"window.fullCalendarContent = (.*)</script>", str(script), re.S)
        if search:
            data = json5.loads(search.group(1))
            self.event_urls = [e['url'] for e in data['events']]
            for e in data['events']:
                if e['tag'] == "Gratuit":
                    self.add_event_tag(e['url'], 'gratuit')
        else:
            raise Exception('Cannot extract events from javascript')

    def add_event_from_content(self, event_content, event_url, url_human=None, default_values=None, published=False):
        soup = BeautifulSoup(event_content, "html.parser")

        title = soup.find("h1").contents[0]
        category = "Concert"

        image = soup.find("meta", property="og:image")
        if image:
            image = image["content"]

        description = soup.find("div", class_="grid-concert-content")
        if description:
            description = description.find('div', class_="content-striped")
        if description:
            description = description.find('div', class_='wysiwyg')
        if description:
            description = description.get_text()
        if description is None:
            description = ""

        tags = []

        link_calendar = soup.select('a[href^="https://calendar.google.com/calendar/"]')
        if len(link_calendar) == 0:
            raise Exception('Cannot find the google calendar url')

        gg_cal = GGCalendar(link_calendar[0]["href"])
        start_day = gg_cal.start_day
        start_time = gg_cal.start_time
        end_day = gg_cal.end_day
        end_time = gg_cal.end_time
        location = LaCoopeExtractor.nom_lieu
        url_human = event_url

        self.add_event_with_props(event_url, title, category, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)

View File

@ -1,5 +1,6 @@
from urllib.parse import urlparse
import urllib.request
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
@ -15,6 +16,24 @@ class Downloader(ABC):
    def download(self, url):
        pass

    def get_content(self, url, cache=None):
        if cache and os.path.exists(cache):
            print("Loading cache ({})".format(cache))
            with open(cache) as f:
                content = f.read()
        else:
            content = self.download(url)

            if cache:
                print("Saving cache ({})".format(cache))
                dir = os.path.dirname(cache)
                if dir != "" and not os.path.exists(dir):
                    os.makedirs(dir)
                with open(cache, "w") as text_file:
                    text_file.write(content)
        return content


class SimpleDownloader(Downloader):

    def __init__(self):

View File

@ -8,11 +8,15 @@ class Extractor(ABC):
    def __init__(self):
        self.header = {}
        self.events = []
        self.downloader = None

    @abstractmethod
    def extract(self, content, url, url_human = None):
    def extract(self, content, url, url_human = None, default_values = None, published = False):
        pass

    def set_downloader(self, downloader):
        self.downloader = downloader

    @abstractmethod
    def clean_url(url):
        pass

View File

@ -0,0 +1,109 @@
from abc import abstractmethod
from urllib.parse import urlparse
from urllib.parse import parse_qs
from .extractor import *
from django.utils.translation import gettext_lazy as _
from dateutil import parser
import datetime


# A class that extracts event information from a google calendar URL
class GGCalendar:

    def __init__(self, url):
        self.url = url
        self.extract_info()

    def extract_info(self):
        parsed_url = urlparse(self.url.replace("#", "%23"))
        params = parse_qs(parsed_url.query)

        self.location = params['location'][0] if 'location' in params else None
        self.title = params['text'][0] if 'text' in params else None
        if 'dates' in params:
            # the "dates" parameter holds one or two dates separated by "/"
            dates = [x.replace(" ", "+") for x in params['dates'][0].split("/")]
            if len(dates) > 0:
                date = parser.parse(dates[0])
                self.start_day = date.date()
                self.start_time = date.time()
            if len(dates) == 2:
                date = parser.parse(dates[1])
                self.end_day = date.date()
                self.end_time = date.time()
            else:
                self.end_day = None
                self.end_time = None
        else:
            raise Exception("Unable to find a date in google calendar URL")
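
# Usage sketch of GGCalendar (the URL below is hypothetical, for illustration
# only; any such link found on an event page is parsed the same way):
#   cal = GGCalendar("https://calendar.google.com/calendar/r/eventedit?text=Concert&dates=20240419T203000/20240419T230000&location=La+Coope")
#   cal.title      -> "Concert"
#   cal.start_day  -> datetime.date(2024, 4, 19)
#   cal.start_time -> datetime.time(20, 30)
#   cal.end_time   -> datetime.time(23, 0)
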
# A class to extract events from a URL in two steps:
# - first build a list of urls where the events will be found
# - then, for each document downloaded from these urls, build the events
# This class is an abstract class
class TwoStepsExtractor(Extractor):

    def __init__(self):
        super().__init__()
        self.event_urls = None
        self.event_properties = {}

    def clean_url(url):
        return url

    def add_event_tag(self, url, tag):
        if url not in self.event_properties:
            self.event_properties[url] = {}
        if "tags" not in self.event_properties[url]:
            self.event_properties[url]["tags"] = []
        self.event_properties[url]["tags"].append(tag)

    def add_event_with_props(self, event_url, title, category, start_day, location, description, tags, uuid, recurrences=None, url_human=None, start_time=None, end_day=None, end_time=None, last_modified=None, published=False, image=None, image_alt=None):
        if event_url in self.event_properties and 'tags' in self.event_properties[event_url]:
            tags = tags + self.event_properties[event_url]['tags']

        self.add_event(title, category, start_day, location, description, tags, uuid, recurrences, url_human, start_time, end_day, end_time, last_modified, published, image, image_alt)

    @abstractmethod
    def build_event_url_list(self, content):
        pass

    @abstractmethod
    def add_event_from_content(self, event_content, event_url, url_human=None, default_values=None, published=False):
        pass

    def extract(self, content, url, url_human=None, default_values=None, published=False):
        self.set_header(url)
        self.clear_events()
        self.event_urls = None
        self.event_properties.clear()

        # first build the event list
        self.build_event_url_list(content)
        if self.event_urls is None:
            raise Exception('Unable to find the event list from the main document')

        if self.downloader is None:
            raise Exception('The downloader is not defined')

        # then process each element of the list
        for i, event_url in enumerate(self.event_urls):
            # first download the content associated with this link
            content_event = self.downloader.get_content(event_url)
            if content_event is None:
                raise Exception(_('Cannot extract event from url {}').format(event_url))
            # then extract event information from this html document
            self.add_event_from_content(content_event, event_url, url_human, default_values, published)

        return self.get_structure()

View File

@ -12,40 +12,22 @@ class URL2Events:
        self.extractor = extractor
        self.single_event = single_event

    def get_content(self, url, cache = None):
        if cache and os.path.exists(cache):
            print("Loading cache ({})".format(cache))
            with open(cache) as f:
                content = "\n".join(f.readlines())
        else:
            content = self.downloader.download(url)

            if cache:
                print("Saving cache ({})".format(cache))
                dir = os.path.dirname(cache)
                if dir != "" and not os.path.exists(dir):
                    os.makedirs(dir)
                with open(cache, "w") as text_file:
                    text_file.write(content)
        return content

    def process(self, url, url_human = None, cache = None, default_values = None, published = False):
        content = self.get_content(url, cache)
        content = self.downloader.get_content(url, cache)
        if content is None:
            return None

        if self.extractor is not None:
            self.extractor.set_downloader(self.downloader)
            return self.extractor.extract(content, url, url_human, default_values, published)
        else:
            # if the extractor is not defined, use a list of default extractors
            for e in Extractor.get_default_extractors(self.single_event):
                #try:
                e.set_downloader(self.downloader)
                events = e.extract(content, url, url_human, default_values, published)
                if events is not None:
                    return events
                #except:
                #    continue

        return None

View File

@ -0,0 +1,18 @@
# Generated by Django 4.2.7 on 2024-04-19 12:07

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('agenda_culturel', '0048_auto_20240417_1212'),
    ]

    operations = [
        migrations.AlterField(
            model_name='recurrentimport',
            name='processor',
            field=models.CharField(choices=[('ical', 'ical'), ('icalnobusy', 'ical no busy'), ('icalnovc', 'ical no VC'), ('lacoope', 'lacoope.org')], default='ical', max_length=20, verbose_name='Processor'),
        ),
    ]

View File

@ -754,6 +754,7 @@ class RecurrentImport(models.Model):
ICAL = "ical", _("ical")
ICALNOBUSY = "icalnobusy", _("ical no busy")
ICALNOVC = "icalnovc", _("ical no VC")
LACOOPE = "lacoope", _('lacoope.org')
class DOWNLOADER(models.TextChoices):
SIMPLE = "simple", _("simple")

View File

@ -36,4 +36,5 @@ django-recurrence==1.11.1
icalendar==5.0.11
lxml==5.1.0
bbcode==1.1.0
json5==0.9.25