Add import of La Coopé events
parent 8d8fa39c93
commit c1629f4692
experimentations/get_lacoope_events.py (Executable file, 43 lines added)

@@ -0,0 +1,43 @@
```python
#!/usr/bin/python3
# coding: utf-8

import os
import json
import sys

# getting the name of the directory
# where this file is present.
current = os.path.dirname(os.path.realpath(__file__))

# Getting the parent directory name
# where the current directory is present.
parent = os.path.dirname(current)

# adding the parent directory to
# the sys.path.
sys.path.append(parent)

from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *


if __name__ == "__main__":

    u2e = URL2Events(SimpleDownloader(), LaCoopeExtractor())
    url = "https://www.lacoope.org/concerts-calendrier/"
    url_human = "https://www.lacoope.org/concerts-calendrier/"

    try:
        events = u2e.process(url, url_human, cache = "cache-lacoope.ical", default_values = {"category": "Concert", "location": "La Coopérative"}, published = True)

        exportfile = "events-lacoope.json"
        print("Saving events to file {}".format(exportfile))
        with open(exportfile, "w") as f:
            json.dump(events, f, indent=4, default=str)
    except Exception as e:
        print("Exception: " + str(e))
```
experimentations/notes-sources.md (Normal file, 29 lines added)

@@ -0,0 +1,29 @@
# Notes on sources for the Clermont area

## La Comédie de Clermont

URL listing the dates that have events: https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php?action=load_dates_existantes

URL for the details of a given date, with parameters passed as POST data:

```curl --data "action=load_evenements_jour" --data "jour=2024-04-19" "https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php"```

The returned data is fairly terse HTML that carries the essential information.
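The same call from Python, as a minimal sketch (it assumes the `requests` package, which is not otherwise a dependency of this project):

```python
import requests

# Replay of the curl call above: fetch the HTML describing one day's events.
response = requests.post(
    "https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php",
    data={"action": "load_evenements_jour", "jour": "2024-04-19"},
)
print(response.text)  # terse HTML with the essential information
```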

## La Coopé

In the source of https://www.lacoope.org/concerts-calendrier/ there is a JavaScript array containing the URLs of the events (see the sketch below). This array may carry "Gratuit" (free) as a tag. The time of the event is not given there.

On each event page there are:
- a meta name="description"
- a https://calendar.google.com/calendar/ URL carrying most of the data
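A minimal sketch of pulling that array out of the page, along the lines of the `LaCoopeExtractor` added by this commit (it assumes `requests`; the extractor itself goes through the project's downloader):

```python
import re
import json5
import requests

html = requests.get("https://www.lacoope.org/concerts-calendrier/").text
# The calendar data is assigned to window.fullCalendarContent in an inline script.
match = re.search(r"window.fullCalendarContent = (.*?)</script>", html, re.S)
if match:
    data = json5.loads(match.group(1).strip().rstrip(";"))
    urls = [e["url"] for e in data["events"]]
    free = [e["url"] for e in data["events"] if e.get("tag") == "Gratuit"]
```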

## Le caveau de la michodière

The address https://www.lecaveaudelamichodiere.com/concerts/ lists the concerts of the current month.

The page is populated by a JavaScript request that seems hard to replay on its own: it gets rejected with a 403 error (the proprietary eventon plugin).

If you recover the identifier of an event (of the form event_11377_0), you can forge a URL such as ```https://www.lecaveaudelamichodiere.com/wp-admin/admin-ajax.php?action=eventon_ics_download&event_id=11377&ri=0``` to retrieve an iCal version of the event, as sketched below.
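A minimal sketch of that forging step (the helper name is made up, and it assumes `requests`):

```python
import requests

def caveau_ical_url(identifier):
    # "event_11377_0" -> event_id=11377 and ri=0
    _, event_id, ri = identifier.split("_")
    return ("https://www.lecaveaudelamichodiere.com/wp-admin/admin-ajax.php"
            "?action=eventon_ics_download&event_id={}&ri={}".format(event_id, ri))

ics = requests.get(caveau_ical_url("event_11377_0")).text  # one event, iCal format
```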

## La petite gaillarde

The RSS feed https://lapetitegaillarde.fr/?feed=rss2 is reasonably well structured.
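A minimal sketch of reading it, assuming the `feedparser` package (not currently in requirements.txt):

```python
import feedparser

feed = feedparser.parse("https://lapetitegaillarde.fr/?feed=rss2")
for entry in feed.entries:
    # available fields depend on the feed; title and published are common
    print(entry.title, entry.get("published", ""))
```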
```diff
@@ -9,6 +9,7 @@ from .import_tasks.downloader import *
 from .import_tasks.extractor import *
 from .import_tasks.importer import *
 from .import_tasks.extractor_ical import *
+from .import_tasks.custom_extractors import *
 
 
@@ -100,6 +101,8 @@ def run_recurrent_import(self, pk):
         extractor = ICALNoBusyExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.ICALNOVC:
         extractor = ICALNoVCExtractor()
+    elif rimport.processor == RecurrentImport.PROCESSOR.LACOOPE:
+        extractor = LaCoopeExtractor()
     else:
         extractor = None
```
```diff
@@ -36,6 +36,7 @@ class DBImporterEvents:
         return self.nb_removed
 
     def import_events(self, json_structure):
+        print(json_structure)
         self.init_result_properties()
 
         try:
```
src/agenda_culturel/import_tasks/custom_extractors.py (Normal file, 64 lines added)

@@ -0,0 +1,64 @@
```python
from .generic_extractors import *
import re
import json5


# A class dedicated to get events from La Coopérative de Mai:
# URL: https://www.lacoope.org/concerts-calendrier/
class LaCoopeExtractor(TwoStepsExtractor):

    nom_lieu = "La Coopérative de Mai"

    def build_event_url_list(self, content):
        soup = BeautifulSoup(content, "html.parser")
        script = soup.find('div', class_="js-filter__results").findChildren('script')
        if len(script) == 0:
            raise Exception("Cannot find events in the first page")
        script = script[0]
        search = re.search(r"window.fullCalendarContent = (.*)</script>", str(script), re.S)
        if search:
            data = json5.loads(search.group(1))
            self.event_urls = [e['url'] for e in data['events']]
            for e in data['events']:
                if e['tag'] == "Gratuit":
                    self.add_event_tag(e['url'], 'gratuit')
        else:
            raise Exception('Cannot extract events from javascript')

    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
        soup = BeautifulSoup(event_content, "html.parser")

        title = soup.find("h1").contents[0]
        category = "Concert"
        image = soup.find("meta", property="og:image")
        if image:
            image = image["content"]

        description = soup.find("div", class_="grid-concert-content")
        if description:
            description = description.find('div', class_="content-striped")
        if description:
            description = description.find('div', class_='wysiwyg')
        if description:
            description = description.get_text()
        if description is None:
            description = ""

        tags = []

        link_calendar = soup.select('a[href^="https://calendar.google.com/calendar/"]')
        if len(link_calendar) == 0:
            raise Exception('Cannot find the google calendar url')

        gg_cal = GGCalendar(link_calendar[0]["href"])
        start_day = gg_cal.start_day
        start_time = gg_cal.start_time
        end_day = gg_cal.end_day
        end_time = gg_cal.end_time
        location = LaCoopeExtractor.nom_lieu
        url_human = event_url

        self.add_event_with_props(event_url, title, category, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
```
```diff
@@ -1,5 +1,6 @@
 from urllib.parse import urlparse
 import urllib.request
+import os
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.chrome.options import Options
@@ -15,6 +16,24 @@ class Downloader(ABC):
     def download(self, url):
         pass
 
+    def get_content(self, url, cache = None):
+        if cache and os.path.exists(cache):
+            print("Loading cache ({})".format(cache))
+            with open(cache) as f:
+                content = f.read()
+        else:
+            content = self.download(url)
+
+            if cache:
+                print("Saving cache ({})".format(cache))
+                dir = os.path.dirname(cache)
+                if dir != "" and not os.path.exists(dir):
+                    os.makedirs(dir)
+                with open(cache, "w") as text_file:
+                    text_file.write(content)
+        return content
+
 
 class SimpleDownloader(Downloader):
 
     def __init__(self):
```
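With this change, any downloader can transparently cache what it fetches. A minimal usage sketch (the cache file name is arbitrary):

```python
downloader = SimpleDownloader()
# The first call downloads the page and writes cache.html;
# later calls read the file back instead of hitting the network.
content = downloader.get_content("https://www.lacoope.org/concerts-calendrier/",
                                 cache="cache.html")
```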
```diff
@@ -8,11 +8,15 @@ class Extractor(ABC):
     def __init__(self):
         self.header = {}
         self.events = []
+        self.downloader = None
 
     @abstractmethod
-    def extract(self, content, url, url_human = None):
+    def extract(self, content, url, url_human = None, default_values = None, published = False):
         pass
 
+    def set_downloader(self, downloader):
+        self.downloader = downloader
+
     @abstractmethod
     def clean_url(url):
         pass
```
src/agenda_culturel/import_tasks/generic_extractors.py (Normal file, 109 lines added)

@@ -0,0 +1,109 @@
```python
from abc import abstractmethod
from urllib.parse import urlparse
from urllib.parse import parse_qs

from .extractor import *
from django.utils.translation import gettext_lazy as _
from dateutil import parser
import datetime


# Parses the data encoded in a google calendar URL (title, location, dates)
class GGCalendar:

    def __init__(self, url):
        self.url = url
        self.extract_info()

    def extract_info(self):
        parsed_url = urlparse(self.url.replace("#", "%23"))
        params = parse_qs(parsed_url.query)

        self.location = params['location'][0] if 'location' in params else None
        self.title = params['text'][0] if 'text' in params else None
        if 'dates' in params:
            dates = [x.replace(" ", "+") for x in params['dates'][0].split("/")]
            if len(dates) > 0:
                date = parser.parse(dates[0])
                self.start_day = date.date()
                self.start_time = date.time()
                if len(dates) == 2:
                    date = parser.parse(dates[1])
                    self.end_day = date.date()
                    self.end_time = date.time()
                else:
                    self.end_day = None
                    self.end_time = None
        else:
            raise Exception("Unable to find a date in google calendar URL")


# A class to extract events from URL with two steps:
# - first build a list of urls where the events will be found
# - then for each document downloaded from these urls, build the events
# This class is an abstract class
class TwoStepsExtractor(Extractor):

    def __init__(self):
        super().__init__()
        self.event_urls = None
        self.event_properties = {}

    def clean_url(url):
        return url

    def add_event_tag(self, url, tag):
        if url not in self.event_properties:
            self.event_properties[url] = {}
        if "tags" not in self.event_properties[url]:
            self.event_properties[url]["tags"] = []
        self.event_properties[url]["tags"].append(tag)

    def add_event_with_props(self, event_url, title, category, start_day, location, description, tags, uuid, recurrences=None, url_human=None, start_time=None, end_day=None, end_time=None, last_modified=None, published=False, image=None, image_alt=None):

        if event_url in self.event_properties and 'tags' in self.event_properties[event_url]:
            tags = tags + self.event_properties[event_url]['tags']

        self.add_event(title, category, start_day, location, description, tags, uuid, recurrences, url_human, start_time, end_day, end_time, last_modified, published, image, image_alt)

    @abstractmethod
    def build_event_url_list(self, content):
        pass

    @abstractmethod
    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
        pass

    def extract(self, content, url, url_human = None, default_values = None, published = False):
        self.set_header(url)
        self.clear_events()

        self.event_urls = None
        self.event_properties.clear()

        # first build the event list
        self.build_event_url_list(content)

        if self.event_urls is None:
            raise Exception('Unable to find the event list from the main document')

        if self.downloader is None:
            raise Exception('The downloader is not defined')

        # then process each element of the list
        for i, event_url in enumerate(self.event_urls):
            # first download the content associated with this link
            content_event = self.downloader.get_content(event_url)
            if content_event is None:
                raise Exception(_('Cannot extract event from url {}').format(event_url))
            # then extract event information from this html document
            self.add_event_from_content(content_event, event_url, url_human, default_values, published)

        return self.get_structure()
```
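A quick sanity check of `GGCalendar` against a hand-built Google Calendar template URL (the URL below is a made-up example, not one scraped from lacoope.org):

```python
from src.agenda_culturel.import_tasks.generic_extractors import GGCalendar

url = ("https://calendar.google.com/calendar/render?action=TEMPLATE"
       "&text=Example+concert"
       "&dates=20240419T200000/20240419T230000"
       "&location=La+Coop%C3%A9rative+de+Mai")
cal = GGCalendar(url)
print(cal.title, cal.location)        # Example concert La Coopérative de Mai
print(cal.start_day, cal.start_time)  # 2024-04-19 20:00:00
print(cal.end_day, cal.end_time)      # 2024-04-19 23:00:00
```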
```diff
@@ -12,40 +12,22 @@ class URL2Events:
         self.extractor = extractor
         self.single_event = single_event
 
-    def get_content(self, url, cache = None):
-        if cache and os.path.exists(cache):
-            print("Loading cache ({})".format(cache))
-            with open(cache) as f:
-                content = "\n".join(f.readlines())
-        else:
-            content = self.downloader.download(url)
-
-            if cache:
-                print("Saving cache ({})".format(cache))
-                dir = os.path.dirname(cache)
-                if dir != "" and not os.path.exists(dir):
-                    os.makedirs(dir)
-                with open(cache, "w") as text_file:
-                    text_file.write(content)
-        return content
-
-
     def process(self, url, url_human = None, cache = None, default_values = None, published = False):
-        content = self.get_content(url, cache)
+        content = self.downloader.get_content(url, cache)
 
         if content is None:
             return None
 
         if self.extractor is not None:
             self.extractor.set_downloader(self.downloader)
             return self.extractor.extract(content, url, url_human, default_values, published)
         else:
             # if the extractor is not defined, use a list of default extractors
             for e in Extractor.get_default_extractors(self.single_event):
                 #try:
                 e.set_downloader(self.downloader)
                 events = e.extract(content, url, url_human, default_values, published)
                 if events is not None:
                     return events
                 #except:
                 #    continue
         return None
```
@@ -0,0 +1,18 @@
```python
# Generated by Django 4.2.7 on 2024-04-19 12:07

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('agenda_culturel', '0048_auto_20240417_1212'),
    ]

    operations = [
        migrations.AlterField(
            model_name='recurrentimport',
            name='processor',
            field=models.CharField(choices=[('ical', 'ical'), ('icalnobusy', 'ical no busy'), ('icalnovc', 'ical no VC'), ('lacoope', 'lacoope.org')], default='ical', max_length=20, verbose_name='Processor'),
        ),
    ]
```
```diff
@@ -754,6 +754,7 @@ class RecurrentImport(models.Model):
         ICAL = "ical", _("ical")
         ICALNOBUSY = "icalnobusy", _("ical no busy")
         ICALNOVC = "icalnovc", _("ical no VC")
+        LACOOPE = "lacoope", _('lacoope.org')
 
     class DOWNLOADER(models.TextChoices):
         SIMPLE = "simple", _("simple")
```
```diff
@@ -36,4 +36,5 @@ django-recurrence==1.11.1
 icalendar==5.0.11
 lxml==5.1.0
 bbcode==1.1.0
+json5==0.9.25
```