Ajout d'un extracteur pour Arachnée Concerts

This commit is contained in:
Jean-Marie Favreau 2024-10-19 15:36:50 +02:00
parent 30aafd4979
commit 9f0a1a33cf
6 changed files with 190 additions and 0 deletions

View File

@ -0,0 +1,40 @@
#!/usr/bin/python3
# coding: utf-8
import os
import json
import sys
# Resolve the directory that contains this script
# (symlinks are resolved by realpath).
current = os.path.dirname(os.path.realpath(__file__))
# Resolve the parent of that directory.
parent = os.path.dirname(current)
# Append the parent directory to sys.path so that it is also searched
# by the `src.agenda_culturel...` imports that follow.
sys.path.append(parent)
from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *
if __name__ == "__main__":
    # Manual test run: download the Arachnée agenda for Clermont-Fd,
    # extract its events and dump them to a local JSON file.
    u2e = URL2Events(ChromiumHeadlessDownloader(), arachnee.CExtractor())

    # Ajax endpoint queried for the listing, and the page a human would visit.
    url = "https://www.arachnee-concerts.com/wp-admin/admin-ajax.php?action=movies-filter&per_page=9999&date=NaN.NaN.NaN&theatres=Clermont-Fd&cat=&sorting=&list_all_events=&current_page="
    url_human = "https://www.arachnee-concerts.com/agenda-des-concerts/Clermont-Fd/"

    try:
        extracted = u2e.process(
            url,
            url_human,
            cache="cache-arachnee.html",
            default_values={},
            published=True,
        )

        exportfile = "events-arachnee.json"
        print("Saving events to file {}".format(exportfile))
        with open(exportfile, "w") as out:
            # default=str stringifies values json cannot serialize natively.
            json.dump(extracted, out, indent=4, default=str)
    except Exception as err:
        # Best-effort script: report the failure instead of a traceback.
        print("Exception: " + str(err))

View File

@ -145,6 +145,8 @@ def run_recurrent_import_internal(rimport, downloader, req_id):
extractor = fbevents.CExtractor()
elif rimport.processor == RecurrentImport.PROCESSOR.C3C:
extractor = c3c.CExtractor()
elif rimport.processor == RecurrentImport.PROCESSOR.ARACHNEE:
extractor = arachnee.CExtractor()
else:
extractor = None

View File

@ -0,0 +1,109 @@
from ..generic_extractors import *
from bs4 import BeautifulSoup
# A class dedicated to extracting events from Arachnée Concerts
# URL: https://www.arachnee-concerts.com/agenda-des-concerts/
class CExtractor(TwoStepsExtractorNoPause):
    """Two-step extractor for the Arachnée Concerts agenda.

    Step one (``build_event_url_list``) scans the listing page and records,
    for every event URL, the ``"<date> <time>"`` strings of the occurrences
    that pass the theater and date-horizon filters.  Step two
    (``add_event_from_content``) parses one event page and emits one event
    per occurrence that was recorded for that URL in step one.
    """

    # Site labels mapped to the "Concert" category.
    # NOTE(review): "Raggae" is kept exactly as originally written; "Reggae"
    # is accepted as well in case the site uses the usual spelling — confirm
    # against the live site.
    _CONCERT_LABELS = (
        "Chanson française",
        "Musique du monde",
        "Pop / Rock",
        "Rap, RnB",
        "Raggae",
        "Reggae",
        "Variété",
    )
    # Site labels mapped to the "Art du spectacle" category.
    _SHOW_LABELS = (
        "Comédie Musicale",
        "Humour / One Man Show",
        "Spectacle équestre",
    )

    def __init__(self):
        super().__init__()
        # event URL -> list of "<date> <time>" strings kept by step one.
        self.possible_dates = {}
        # Value of the `theatres` query parameter of the listing URL, if any.
        self.theater = None

    def extract(
        self,
        content,
        url,
        url_human=None,
        default_values=None,
        published=False,
        only_future=True,
        ignore_404=True
    ):
        """Remember the theater filter found in ``url``, then run the
        inherited extraction with the same arguments.

        The filter is the raw (not URL-decoded) value of the ``theatres``
        query parameter.  It is reset to ``None`` when the parameter is
        absent so a reused instance does not keep a stale filter.
        """
        # Accept the parameter in any position of the query string.
        match = re.search(r"[?&]theatres=([^&]*)", url)
        self.theater = match[1] if match else None
        return super().extract(content, url, url_human, default_values, published, only_future, ignore_404)

    def build_event_url_list(self, content, infuture_days=180):
        """Register the event URLs found in the listing page ``content``.

        An occurrence is kept when its location text starts with the theater
        filter (or no filter is set) and its date is earlier than today plus
        ``infuture_days`` days.  List items missing one of the expected
        elements are skipped instead of aborting the whole import.
        """
        soup = BeautifulSoup(content, "html.parser")
        horizon = datetime.date.today() + timedelta(days=infuture_days)

        for item in soup.select("ul.event_container>li"):
            date_node = item.select_one(".date")
            place_node = item.select_one(".event_auditory")
            time_node = item.select_one(".time")
            link = item.select_one(".info a")
            nodes = (date_node, place_node, time_node, link)
            if any(node is None for node in nodes):
                continue
            e_url = link.get("href")
            if not e_url:
                continue

            day = Extractor.parse_french_date(date_node.text)
            place = place_node.text
            in_theater = self.theater is None or place.startswith(self.theater)
            if not (in_theater and day < horizon):
                continue

            start = Extractor.parse_french_time(time_node.text)
            # Same "<date> <time>" key format as add_event_from_content.
            self.possible_dates.setdefault(e_url, []).append(str(day) + " " + str(start))
            self.add_event_url(e_url)

    @staticmethod
    def _map_category(label):
        """Translate a site category label into ``(category, tags)``."""
        if label == "Grand Spectacle":
            return "Danse", []
        if label == "Théâtre":
            return "Théâtre", []
        if label in CExtractor._CONCERT_LABELS:
            return "Concert", []
        if label in CExtractor._SHOW_LABELS:
            return "Art du spectacle", []
        if label == "Spectacle pour enfant":
            return None, ["jeune public"]
        # Unknown label: empty category, no tags.
        return "", []

    def add_event_from_content(
        self,
        event_content,
        event_url,
        url_human=None,
        default_values=None,
        published=False,
    ):
        """Emit the events described by one event page.

        Only occurrences whose ``"<date> <time>"`` was recorded for
        ``event_url`` by ``build_event_url_list`` are added; a URL that was
        never recorded yields no event.  Occurrences lacking a date or a
        location element are skipped.
        """
        soup = BeautifulSoup(event_content, "html.parser")

        title_nodes = [soup.select_one(sel) for sel in (".page_title", ".artiste-subtitle")]
        title = ", ".join(node.text for node in title_nodes if node is not None)

        image = soup.select_one(".entry-image .image_wrapper img")
        if image is not None:
            # .get() yields None rather than raising when `src` is absent.
            image = image.get("src")

        paragraphs = soup.select(".entry-content p")
        description = "\n".join(p.text for p in paragraphs) if paragraphs else None

        category_node = soup.select_one(".event_category")
        label = category_node.text.strip() if category_node is not None else ""
        category, tags = self._map_category(label)

        wanted = self.possible_dates.get(event_url, ())
        for occurrence in soup.select("#event_ticket_content>ul>li"):
            date_node = occurrence.select_one(".date")
            place_node = occurrence.select_one(".event_auditory")
            if date_node is None or place_node is None:
                continue
            iso_value = date_node.get("content")
            if not iso_value:
                continue

            dt = datetime.datetime.fromisoformat(iso_value)
            day = dt.date()
            start = dt.time()
            if str(day) + " " + str(start) not in wanted:
                continue

            self.add_event_with_props(
                default_values,
                event_url,
                title,
                category,
                day,
                place_node.text,
                description,
                tags,
                recurrences=None,
                # One uuid per occurrence of the same event page.
                uuids=[event_url + "?d=" + str(day) + "&t=" + str(start)],
                url_human=url_human,
                start_time=start,
                end_day=None,
                end_time=None,
                published=published,
                image=image,
            )

View File

@ -250,3 +250,23 @@ class TwoStepsExtractor(Extractor):
)
return self.get_structure()
class TwoStepsExtractorNoPause(TwoStepsExtractor):
    """TwoStepsExtractor variant that runs with ``downloader.pause`` off.

    The downloader's ``pause`` flag is set to ``False`` for the duration of
    the extraction and put back to its previous value afterwards.
    """

    def extract(
        self,
        content,
        url,
        url_human=None,
        default_values=None,
        published=False,
        only_future=True,
        ignore_404=True
    ):
        """Run the inherited ``extract`` with the same arguments and return
        its result, with ``self.downloader.pause`` temporarily ``False``.

        The previous flag value is restored even when the inherited
        extraction raises, so a failed import cannot leave the downloader
        with pausing switched off.
        """
        previous_pause = self.downloader.pause
        self.downloader.pause = False
        try:
            return super().extract(content, url, url_human, default_values, published, only_future, ignore_404)
        finally:
            self.downloader.pause = previous_pause

View File

@ -0,0 +1,18 @@
# Generated by Django 4.2.9 on 2024-10-19 13:24
from django.db import migrations, models
class Migration(migrations.Migration):
    # Alters RecurrentImport.processor so that its choices include the
    # 'arachnee' processor.

    dependencies = [
        ("agenda_culturel", "0089_alter_recurrentimport_defaultcategory"),
    ]

    operations = [
        migrations.AlterField(
            model_name="recurrentimport",
            name="processor",
            field=models.CharField(
                choices=[
                    ("ical", "ical"),
                    ("icalnobusy", "ical no busy"),
                    ("icalnovc", "ical no VC"),
                    ("lacoope", "lacoope.org"),
                    ("lacomedie", "la comédie"),
                    ("lefotomat", "le fotomat"),
                    ("lapucealoreille", "la puce à l'oreille"),
                    ("Plugin wordpress MEC", "Plugin wordpress MEC"),
                    ("Facebook events", "Événements d'une page FB"),
                    ("cour3coquins", "la cour des 3 coquins"),
                    ("arachnee", "Arachnée concert"),
                ],
                default="ical",
                max_length=20,
                verbose_name="Processor",
            ),
        ),
    ]

View File

@ -1307,6 +1307,7 @@ class RecurrentImport(models.Model):
MECWORDPRESS = "Plugin wordpress MEC", _("Plugin wordpress MEC")
FBEVENTS = "Facebook events", _("Événements d'une page FB")
C3C = "cour3coquins", _("la cour des 3 coquins")
ARACHNEE = "arachnee", _("Arachnée concert")
class DOWNLOADER(models.TextChoices):
SIMPLE = "simple", _("simple")