Add import of the events of la Coopé

Parent: 8d8fa39c93
Commit: c1629f4692

experimentations/get_lacoope_events.py (new executable file, 43 lines)
@@ -0,0 +1,43 @@
#!/usr/bin/python3
# coding: utf-8

import os
import json
import sys

# getting the name of the directory
# where this file is present.
current = os.path.dirname(os.path.realpath(__file__))

# Getting the parent directory name
# where the current directory is present.
parent = os.path.dirname(current)

# adding the parent directory to
# the sys.path.
sys.path.append(parent)

from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *


if __name__ == "__main__":

    u2e = URL2Events(SimpleDownloader(), LaCoopeExtractor())
    url = "https://www.lacoope.org/concerts-calendrier/"
    url_human = "https://www.lacoope.org/concerts-calendrier/"

    try:
        events = u2e.process(url, url_human, cache = "cache-lacoope.ical", default_values = {"category": "Concert", "location": "La Coopérative"}, published = True)

        exportfile = "events-lacoope.json"
        print("Saving events to file {}".format(exportfile))
        with open(exportfile, "w") as f:
            json.dump(events, f, indent=4, default=str)
    except Exception as e:
        print("Exception: " + str(e))
experimentations/notes-sources.md (new file, 29 lines)
@@ -0,0 +1,29 @@
# Notes on event sources in the Clermont area

## La Comédie de Clermont

URL listing the dates that have events: https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php?action=load_dates_existantes
URL giving the details of one date, with POST parameters:
```curl --data "action=load_evenements_jour" --data "jour=2024-04-19" "https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php"```
The returned data is fairly terse HTML, with the essential information in it.
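For illustration, both calls can be replayed with a short script (a sketch assuming the requests package; only the endpoint and parameters above come from these notes, and the format of the dates response still has to be inspected):

```python
import requests

AJAX = "https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php"

# Dates that have at least one event (GET, action passed in the query string).
dates = requests.get(AJAX, params={"action": "load_dates_existantes"}).text

# Terse HTML describing the events of one day (POST, as in the curl call above).
html = requests.post(AJAX, data={"action": "load_evenements_jour", "jour": "2024-04-19"}).text
print(html)
```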

## La coopé

In the source of https://www.lacoope.org/concerts-calendrier/ there is a javascript array containing the URLs of the events. This array may carry "Gratuit" as a tag. The time of the event is not given.
On each event page there are:
- a meta name="description"
- a https://calendar.google.com/calendar/ URL holding most of the data

## Le caveau de la michodière

The address https://www.lecaveaudelamichodiere.com/concerts/ lists the concerts of the current month.
The page is populated by a javascript request that seems difficult to replay independently, since it answers with a 403 error (the proprietary eventon plugin).

If we pick up the identifier of an event (of the form event_11377_0), we can forge a URL such as
```https://www.lecaveaudelamichodiere.com/wp-admin/admin-ajax.php?action=eventon_ics_download&event_id=11377&ri=0``` to retrieve an ical version of the event.
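That forging step could look like the following sketch (stdlib only; the id pattern and URL template are the ones above, the helper name is made up):

```python
import re

def forge_ics_url(event_id):
    # "event_11377_0" -> numeric id 11377 and occurrence index 0
    m = re.fullmatch(r"event_(\d+)_(\d+)", event_id)
    if m is None:
        raise ValueError("unexpected event identifier: " + event_id)
    return ("https://www.lecaveaudelamichodiere.com/wp-admin/admin-ajax.php"
            "?action=eventon_ics_download&event_id={}&ri={}".format(m.group(1), m.group(2)))

print(forge_ics_url("event_11377_0"))
```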

## La petite gaillarde

The RSS feed https://lapetitegaillarde.fr/?feed=rss2 is reasonably well structured.
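A minimal reading of that feed, using only the standard library (the field names are standard RSS 2.0; what each item actually contains has to be checked against the feed itself):

```python
import urllib.request
import xml.etree.ElementTree as ET

with urllib.request.urlopen("https://lapetitegaillarde.fr/?feed=rss2") as f:
    root = ET.fromstring(f.read())

for item in root.iter("item"):
    print(item.findtext("title"), "|", item.findtext("pubDate"))
```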

@@ -9,6 +9,7 @@ from .import_tasks.downloader import *
 from .import_tasks.extractor import *
 from .import_tasks.importer import *
 from .import_tasks.extractor_ical import *
+from .import_tasks.custom_extractors import *
@@ -100,6 +101,8 @@ def run_recurrent_import(self, pk):
         extractor = ICALNoBusyExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.ICALNOVC:
         extractor = ICALNoVCExtractor()
+    elif rimport.processor == RecurrentImport.PROCESSOR.LACOOPE:
+        extractor = LaCoopeExtractor()
     else:
         extractor = None
@@ -36,6 +36,7 @@ class DBImporterEvents:
         return self.nb_removed

     def import_events(self, json_structure):
+        print(json_structure)
         self.init_result_properties()

         try:
src/agenda_culturel/import_tasks/custom_extractors.py (new file, 64 lines)
@@ -0,0 +1,64 @@
from .generic_extractors import *
import re
import json5


# A class dedicated to getting events from La Coopérative de Mai:
# URL: https://www.lacoope.org/concerts-calendrier/
class LaCoopeExtractor(TwoStepsExtractor):

    nom_lieu = "La Coopérative de Mai"

    def build_event_url_list(self, content):
        soup = BeautifulSoup(content, "html.parser")
        script = soup.find('div', class_="js-filter__results").findChildren('script')
        if len(script) == 0:
            raise Exception("Cannot find events in the first page")
        script = script[0]
        search = re.search(r"window.fullCalendarContent = (.*)</script>", str(script), re.S)
        if search:
            data = json5.loads(search.group(1))
            self.event_urls = [e['url'] for e in data['events']]
            for e in data['events']:
                if e['tag'] == "Gratuit":
                    self.add_event_tag(e['url'], 'gratuit')
        else:
            raise Exception('Cannot extract events from javascript')

    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
        soup = BeautifulSoup(event_content, "html.parser")

        title = soup.find("h1").contents[0]
        category = "Concert"
        image = soup.find("meta", property="og:image")
        if image:
            image = image["content"]

        description = soup.find("div", class_="grid-concert-content")
        if description:
            description = description.find('div', class_="content-striped")
        if description:
            description = description.find('div', class_='wysiwyg')
        if description:
            description = description.get_text()
        if description is None:
            description = ""

        tags = []

        link_calendar = soup.select('a[href^="https://calendar.google.com/calendar/"]')
        if len(link_calendar) == 0:
            raise Exception('Cannot find the google calendar url')

        gg_cal = GGCalendar(link_calendar[0]["href"])
        start_day = gg_cal.start_day
        start_time = gg_cal.start_time
        end_day = gg_cal.end_day
        end_time = gg_cal.end_time
        location = LaCoopeExtractor.nom_lieu
        url_human = event_url

        self.add_event_with_props(event_url, title, category, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
@@ -1,5 +1,6 @@
 from urllib.parse import urlparse
 import urllib.request
+import os
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.chrome.options import Options
@@ -15,6 +16,24 @@ class Downloader(ABC):
     def download(self, url):
         pass

+    def get_content(self, url, cache = None):
+        if cache and os.path.exists(cache):
+            print("Loading cache ({})".format(cache))
+            with open(cache) as f:
+                content = "\n".join(f.readlines())
+        else:
+            content = self.download(url)
+
+        if cache:
+            print("Saving cache ({})".format(cache))
+            dir = os.path.dirname(cache)
+            if dir != "" and not os.path.exists(dir):
+                os.makedirs(dir)
+            with open(cache, "w") as text_file:
+                text_file.write(content)
+        return content
+
+
 class SimpleDownloader(Downloader):

     def __init__(self):
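This hunk moves the caching logic next to the download itself, so any Downloader can be asked for cached content. A usage sketch (the cache file name is arbitrary):

```python
downloader = SimpleDownloader()
# First call downloads the page and writes the cache file;
# subsequent calls read the cached copy instead of hitting the network.
content = downloader.get_content("https://www.lacoope.org/concerts-calendrier/", cache="cache-lacoope.html")
```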

@@ -8,11 +8,15 @@ class Extractor(ABC):
     def __init__(self):
         self.header = {}
         self.events = []
+        self.downloader = None

     @abstractmethod
-    def extract(self, content, url, url_human = None):
+    def extract(self, content, url, url_human = None, default_values = None, published = False):
         pass

+    def set_downloader(self, downloader):
+        self.downloader = downloader
+
     @abstractmethod
     def clean_url(url):
         pass
src/agenda_culturel/import_tasks/generic_extractors.py (new file, 109 lines)
@@ -0,0 +1,109 @@
from abc import abstractmethod
from urllib.parse import urlparse
from urllib.parse import parse_qs

from .extractor import *
from django.utils.translation import gettext_lazy as _
from dateutil import parser
import datetime


class GGCalendar:

    def __init__(self, url):
        self.url = url
        self.extract_info()

    def extract_info(self):
        parsed_url = urlparse(self.url.replace("#", "%23"))
        params = parse_qs(parsed_url.query)

        self.location = params['location'][0] if 'location' in params else None
        self.title = params['text'][0] if 'text' in params else None
        if 'dates' in params:
            dates = [x.replace(" ", "+") for x in params['dates'][0].split("/")]
            if len(dates) > 0:
                date = parser.parse(dates[0])
                self.start_day = date.date()
                self.start_time = date.time()
            if len(dates) == 2:
                date = parser.parse(dates[1])
                self.end_day = date.date()
                self.end_time = date.time()
            else:
                self.end_day = None
                self.end_time = None
        else:
            raise Exception("Unable to find a date in google calendar URL")
            self.start_day = None
            self.start_time = None
            self.end_day = None
            self.end_time = None


# A class to extract events from URL with two steps:
# - first build a list of urls where the events will be found
# - then for each document downloaded from these urls, build the events
# This class is an abstract class
class TwoStepsExtractor(Extractor):

    def __init__(self):
        super().__init__()
        self.event_urls = None
        self.event_properties = {}

    def clean_url(url):
        return url

    def add_event_tag(self, url, tag):
        if not url in self.event_properties:
            self.event_properties[url] = {}
        if not "tags" in self.event_properties[url]:
            self.event_properties[url]["tags"] = []
        self.event_properties[url]["tags"].append(tag)

    def add_event_with_props(self, event_url, title, category, start_day, location, description, tags, uuid, recurrences=None, url_human=None, start_time=None, end_day=None, end_time=None, last_modified=None, published=False, image=None, image_alt=None):

        if event_url in self.event_properties and 'tags' in self.event_properties[event_url]:
            tags = tags + self.event_properties[event_url]['tags']

        self.add_event(title, category, start_day, location, description, tags, uuid, recurrences, url_human, start_time, end_day, end_time, last_modified, published, image, image_alt)

    @abstractmethod
    def build_event_url_list(self, content):
        pass

    @abstractmethod
    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
        pass

    def extract(self, content, url, url_human = None, default_values = None, published = False):
        self.set_header(url)
        self.clear_events()

        self.event_urls = None
        self.event_properties.clear()

        # first build the event list
        self.build_event_url_list(content)

        if self.event_urls is None:
            raise Exception('Unable to find the event list from the main document')

        if self.downloader is None:
            raise Exception('The downloader is not defined')

        # then process each element of the list
        for i, event_url in enumerate(self.event_urls):
            # first download the content associated with this link
            content_event = self.downloader.get_content(event_url)
            if content_event is None:
                raise Exception(_('Cannot extract event from url {}').format(event_url))
            # then extract event information from this html document
            self.add_event_from_content(content_event, event_url, url_human, default_values, published)

        return self.get_structure()
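As a usage sketch, GGCalendar pulls its fields out of the query string of a Google Calendar template link (the URL below is illustrative, not taken from a real event page):

```python
url = ("https://calendar.google.com/calendar/render?action=TEMPLATE"
       "&text=Concert&dates=20240419T190000/20240419T230000"
       "&location=La%20Coop%C3%A9rative%20de%20Mai")
cal = GGCalendar(url)
print(cal.title, "@", cal.location)   # Concert @ La Coopérative de Mai
print(cal.start_day, cal.start_time)  # 2024-04-19 19:00:00
print(cal.end_day, cal.end_time)      # 2024-04-19 23:00:00
```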

@@ -12,40 +12,22 @@ class URL2Events:
         self.extractor = extractor
         self.single_event = single_event

-    def get_content(self, url, cache = None):
-        if cache and os.path.exists(cache):
-            print("Loading cache ({})".format(cache))
-            with open(cache) as f:
-                content = "\n".join(f.readlines())
-        else:
-            content = self.downloader.download(url)
-
-        if cache:
-            print("Saving cache ({})".format(cache))
-            dir = os.path.dirname(cache)
-            if dir != "" and not os.path.exists(dir):
-                os.makedirs(dir)
-            with open(cache, "w") as text_file:
-                text_file.write(content)
-        return content
-
-
     def process(self, url, url_human = None, cache = None, default_values = None, published = False):
-        content = self.get_content(url, cache)
+        content = self.downloader.get_content(url, cache)

         if content is None:
             return None

         if self.extractor is not None:
+            self.extractor.set_downloader(self.downloader)
             return self.extractor.extract(content, url, url_human, default_values, published)
         else:
             # if the extractor is not defined, use a list of default extractors
             for e in Extractor.get_default_extractors(self.single_event):
                 #try:
+                e.set_downloader(self.downloader)
                 events = e.extract(content, url, url_human, default_values, published)
                 if events is not None:
                     return events
                 #except:
                 #    continue
             return None
@@ -0,0 +1,18 @@
# Generated by Django 4.2.7 on 2024-04-19 12:07

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('agenda_culturel', '0048_auto_20240417_1212'),
    ]

    operations = [
        migrations.AlterField(
            model_name='recurrentimport',
            name='processor',
            field=models.CharField(choices=[('ical', 'ical'), ('icalnobusy', 'ical no busy'), ('icalnovc', 'ical no VC'), ('lacoope', 'lacoope.org')], default='ical', max_length=20, verbose_name='Processor'),
        ),
    ]
@@ -754,6 +754,7 @@ class RecurrentImport(models.Model):
         ICAL = "ical", _("ical")
         ICALNOBUSY = "icalnobusy", _("ical no busy")
         ICALNOVC = "icalnovc", _("ical no VC")
+        LACOOPE = "lacoope", _('lacoope.org')

     class DOWNLOADER(models.TextChoices):
         SIMPLE = "simple", _("simple")
@@ -36,4 +36,5 @@ django-recurrence==1.11.1
 icalendar==5.0.11
 lxml==5.1.0
 bbcode==1.1.0
+json5==0.9.25
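json5 is what lets LaCoopeExtractor parse the javascript array scraped from lacoope.org, which is a JS object literal rather than strict JSON. A small illustration (assuming the json5 package pinned just above):

```python
import json5

# Unquoted keys, single quotes and trailing commas are valid JSON5
# (and typical of inline javascript data), but json.loads would reject them.
data = json5.loads("{events: [{url: 'https://example.org/a', tag: 'Gratuit'},]}")
print(data["events"][0]["url"])
```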