Ajout d'un extracteur pour Arachnée Concerts
This commit is contained in:
parent
30aafd4979
commit
9f0a1a33cf
40
experimentations/get_arachnee_events.py
Executable file
40
experimentations/get_arachnee_events.py
Executable file
@ -0,0 +1,40 @@
|
||||
#!/usr/bin/python3
# coding: utf-8

"""Experimentation script: fetch events from Arachnée Concerts.

Downloads the Clermont-Fd agenda of arachnee-concerts.com with a headless
Chromium browser, extracts the events with the dedicated Arachnée extractor,
and dumps the result to ``events-arachnee.json``.
"""

import os
import json
import sys

# Getting the name of the directory where this file is present.
current = os.path.dirname(os.path.realpath(__file__))

# Getting the parent directory name where the current directory is present.
parent = os.path.dirname(current)

# Adding the parent directory to sys.path so that ``src`` is importable.
sys.path.append(parent)

from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *


if __name__ == "__main__":

    u2e = URL2Events(ChromiumHeadlessDownloader(), arachnee.CExtractor())
    # NOTE(review): the scraped source showed "¤t_page=", which is the
    # HTML-entity mangling of "&current_page=" ("&curren" renders as ¤);
    # the query parameter is restored here.
    url = "https://www.arachnee-concerts.com/wp-admin/admin-ajax.php?action=movies-filter&per_page=9999&date=NaN.NaN.NaN&theatres=Clermont-Fd&cat=&sorting=&list_all_events=&current_page="
    url_human = "https://www.arachnee-concerts.com/agenda-des-concerts/Clermont-Fd/"

    try:
        events = u2e.process(url, url_human, cache="cache-arachnee.html", default_values={}, published=True)

        exportfile = "events-arachnee.json"
        print("Saving events to file {}".format(exportfile))
        with open(exportfile, "w") as f:
            # default=str makes non-JSON-serializable values (dates, times)
            # fall back to their string representation.
            json.dump(events, f, indent=4, default=str)
    except Exception as e:
        # Best-effort experimentation script: report and exit quietly.
        print("Exception: " + str(e))
|
@ -145,6 +145,8 @@ def run_recurrent_import_internal(rimport, downloader, req_id):
|
||||
extractor = fbevents.CExtractor()
|
||||
elif rimport.processor == RecurrentImport.PROCESSOR.C3C:
|
||||
extractor = c3c.CExtractor()
|
||||
elif rimport.processor == RecurrentImport.PROCESSOR.ARACHNEE:
|
||||
extractor = arachnee.CExtractor()
|
||||
else:
|
||||
extractor = None
|
||||
|
||||
|
109
src/agenda_culturel/import_tasks/custom_extractors/arachnee.py
Normal file
109
src/agenda_culturel/import_tasks/custom_extractors/arachnee.py
Normal file
@ -0,0 +1,109 @@
|
||||
from ..generic_extractors import *
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# A class dedicated to get events from Arachnée Concert
|
||||
# URL: https://www.arachnee-concerts.com/agenda-des-concerts/
|
||||
# A class dedicated to get events from Arachnée Concert
# URL: https://www.arachnee-concerts.com/agenda-des-concerts/
class CExtractor(TwoStepsExtractorNoPause):
    """Two-steps extractor for Arachnée Concerts.

    First pass (:meth:`build_event_url_list`) reads the agenda listing and
    collects, for each event page URL, the date/time strings announced in the
    listing. Second pass (:meth:`add_event_from_content`) parses each event
    page and keeps only the occurrences whose date matches one collected in
    the first pass.
    """

    def __init__(self):
        super().__init__()
        # Maps an event page URL to the list of "date time" strings
        # (str(date) + " " + str(time)) seen for it in the agenda listing.
        self.possible_dates = {}
        # Theater name filter, taken from the "theatres=" URL parameter;
        # None means "keep every theater".
        self.theater = None

    def extract(
        self,
        content,
        url,
        url_human=None,
        default_values=None,
        published=False,
        only_future=True,
        ignore_404=True
    ):
        """Extract events, remembering the theater filter embedded in *url*."""
        match = re.match(r".*\&theatres=([^&]*)&.*", url)
        if match:
            self.theater = match[1]

        return super().extract(content, url, url_human, default_values, published, only_future, ignore_404)

    def build_event_url_list(self, content, infuture_days=180):
        """Collect event page URLs (and their announced dates) from the listing.

        Only events in the selected theater (if any) happening strictly within
        the next *infuture_days* days are kept.
        """
        soup = BeautifulSoup(content, "html.parser")

        containers = soup.select("ul.event_container>li")
        if containers:
            for c in containers:
                d = Extractor.parse_french_date(c.select_one(".date").text)
                l = c.select_one(".event_auditory").text
                if (self.theater is None or (l.startswith(self.theater))) and d < datetime.date.today() + timedelta(days=infuture_days):
                    t = Extractor.parse_french_time(c.select_one(".time").text)
                    e_url = c.select_one(".info a")["href"]
                    # Group all announced occurrences by event page URL.
                    self.possible_dates.setdefault(e_url, []).append(str(d) + " " + str(t))
                    self.add_event_url(e_url)

    def add_event_from_content(
        self,
        event_content,
        event_url,
        url_human=None,
        default_values=None,
        published=False,
    ):
        """Parse one event page and register its matching occurrences.

        Registers one event per ticket entry whose date/time was also seen in
        the listing pass (``self.possible_dates``).
        """
        soup = BeautifulSoup(event_content, "html.parser")
        # Title is "main title, subtitle" — parts that are missing are skipped.
        title = ", ".join([x.text for x in [soup.select_one(y) for y in [".page_title", ".artiste-subtitle"]] if x])

        image = soup.select_one(".entry-image .image_wrapper img")
        if image is not None:
            image = image["src"]

        descs = soup.select(".entry-content p")
        if descs:
            description = "\n".join([d.text for d in descs])
        else:
            description = None

        # Map the site's own category labels to the agenda's categories.
        category = soup.select_one(".event_category").text
        tags = []
        if category in ["Grand Spectacle"]:
            category = "Danse"
        elif category in ["Théâtre"]:
            category = "Théâtre"
        elif category in ["Chanson française", "Musique du monde", "Pop / Rock", "Rap, RnB", "Raggae", "Variété"]:
            category = "Concert"
        elif category in ["Comédie Musicale", "Humour / One Man Show", "Spectacle équestre"]:
            category = "Art du spectacle"
        elif category in ["Spectacle pour enfant"]:
            tags = ["jeune public"]
            category = None
        else:
            category = ""

        dates = soup.select("#event_ticket_content>ul>li")
        for d in dates:
            dt = datetime.datetime.fromisoformat(d.select_one(".date")["content"])
            date = dt.date()
            time = dt.time()
            # Fix: .get(...) avoids a KeyError when the event page URL was
            # never collected during the listing pass.
            if str(date) + " " + str(time) in self.possible_dates.get(event_url, []):
                location = d.select_one(".event_auditory").text

                self.add_event_with_props(
                    default_values,
                    event_url,
                    title,
                    category,
                    date,
                    location,
                    description,
                    tags,
                    recurrences=None,
                    # One uuid per occurrence so multi-date events stay distinct.
                    uuids=[event_url + "?d=" + str(date) + "&t=" + str(time)],
                    url_human=url_human,
                    start_time=time,
                    end_day=None,
                    end_time=None,
                    published=published,
                    image=image,
                )
|
@ -250,3 +250,23 @@ class TwoStepsExtractor(Extractor):
|
||||
)
|
||||
|
||||
return self.get_structure()
|
||||
|
||||
|
||||
class TwoStepsExtractorNoPause(TwoStepsExtractor):
    """A TwoStepsExtractor that disables the downloader's pause between requests.

    The ``pause`` flag of the shared downloader is switched off for the
    duration of the extraction, then restored to its previous value.
    """

    def extract(
        self,
        content,
        url,
        url_human=None,
        default_values=None,
        published=False,
        only_future=True,
        ignore_404=True
    ):
        saved_pause = self.downloader.pause
        self.downloader.pause = False
        try:
            return super().extract(content, url, url_human, default_values, published, only_future, ignore_404)
        finally:
            # Fix: restore the flag even when extraction raises, otherwise a
            # failing import would leave the shared downloader unpaused.
            self.downloader.pause = saved_pause
|
@ -0,0 +1,18 @@
|
||||
# Generated by Django 4.2.9 on 2024-10-19 13:24
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    # Auto-generated (Django 4.2.9): registers the new 'arachnee'
    # ("Arachnée concert") choice on RecurrentImport.processor.
    # Do not edit by hand — migrations are frozen once applied.

    dependencies = [
        ('agenda_culturel', '0089_alter_recurrentimport_defaultcategory'),
    ]

    operations = [
        migrations.AlterField(
            model_name='recurrentimport',
            name='processor',
            field=models.CharField(choices=[('ical', 'ical'), ('icalnobusy', 'ical no busy'), ('icalnovc', 'ical no VC'), ('lacoope', 'lacoope.org'), ('lacomedie', 'la comédie'), ('lefotomat', 'le fotomat'), ('lapucealoreille', "la puce à l'oreille"), ('Plugin wordpress MEC', 'Plugin wordpress MEC'), ('Facebook events', "Événements d'une page FB"), ('cour3coquins', 'la cour des 3 coquins'), ('arachnee', 'Arachnée concert')], default='ical', max_length=20, verbose_name='Processor'),
        ),
    ]
|
@ -1307,6 +1307,7 @@ class RecurrentImport(models.Model):
|
||||
MECWORDPRESS = "Plugin wordpress MEC", _("Plugin wordpress MEC")
|
||||
FBEVENTS = "Facebook events", _("Événements d'une page FB")
|
||||
C3C = "cour3coquins", _("la cour des 3 coquins")
|
||||
ARACHNEE = "arachnee", _("Arachnée concert")
|
||||
|
||||
class DOWNLOADER(models.TextChoices):
|
||||
SIMPLE = "simple", _("simple")
|
||||
|
Loading…
x
Reference in New Issue
Block a user