diff --git a/experimentations/get_meditheques_clermont.py b/experimentations/get_meditheques_clermont.py
new file mode 100755
index 0000000..1e43c82
--- /dev/null
+++ b/experimentations/get_meditheques_clermont.py
@@ -0,0 +1,44 @@
+#!/usr/bin/python3
+# coding: utf-8
+
+import os
+import json
+import sys
+
+# getting the name of the directory
+# where this file is present.
+current = os.path.dirname(os.path.realpath(__file__))
+
+# Getting the parent directory name
+# where the current directory is present.
+parent = os.path.dirname(current)
+
+# adding the parent directory to
+# the sys.path.
+sys.path.append(parent)
+sys.path.append(parent + "/src")
+
+from src.agenda_culturel.import_tasks.downloader import *
+from src.agenda_culturel.import_tasks.extractor import *
+from src.agenda_culturel.import_tasks.importer import *
+from src.agenda_culturel.import_tasks.custom_extractors import *
+
+
+
+
+
+if __name__ == "__main__":
+
+    u2e = URL2Events(SimpleDownloader(), iguana_agenda.CExtractor())
+    url = "https://bibliotheques-clermontmetropole.eu/iguana/Service.PubContainer.cls?uuid=a4a1f992-06da-4ff4-9176-4af0a095c7d1"
+    url_human = "https://bibliotheques-clermontmetropole.eu/iguana/www.main.cls?surl=AGENDA_Tout%20lagenda"
+
+    try:
+        events = u2e.process(url, url_human, cache = "cache-mediatheques.html", default_values = {}, published = True)
+
+        exportfile = "events-mediatheques.json"
+        print("Saving events to file {}".format(exportfile))
+        with open(exportfile, "w") as f:
+            json.dump(events, f, indent=4, default=str)
+    except Exception as e:
+        print("Exception: " + str(e))
diff --git a/src/agenda_culturel/celery.py b/src/agenda_culturel/celery.py
index 6af4c26..ff8e749 100644
--- a/src/agenda_culturel/celery.py
+++ b/src/agenda_culturel/celery.py
@@ -154,6 +154,8 @@ def run_recurrent_import_internal(rimport, downloader, req_id):
         extractor = laraymonde.CExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.APIDAE:
         extractor = apidae_tourisme.CExtractor()
+    elif rimport.processor == RecurrentImport.PROCESSOR.IGUANA:
+        extractor = iguana_agenda.CExtractor()
     else:
         extractor = None
 
diff --git a/src/agenda_culturel/import_tasks/custom_extractors/iguana_agenda.py b/src/agenda_culturel/import_tasks/custom_extractors/iguana_agenda.py
new file mode 100644
index 0000000..709985b
--- /dev/null
+++ b/src/agenda_culturel/import_tasks/custom_extractors/iguana_agenda.py
@@ -0,0 +1,132 @@
+from ..generic_extractors import *
+from bs4 import BeautifulSoup
+from datetime import datetime
+
+# A class dedicated to get events from an Iguana agenda (médiathèques)
+# Example: https://bibliotheques-clermontmetropole.eu/iguana/
+class CExtractor(TwoStepsExtractorNoPause):
+
+    def __init__(self):
+        super().__init__()
+
+    def guess_category(self, category):
+        """Return the agenda category matching the label read on the event page.
+
+        Keywords are tested in order and the first match wins;
+        "Sans catégorie" is returned when no keyword is found.
+        """
+        if "Cinéma" in category:
+            return "Cinéma"
+        if "Conférence" in category or "Rencontres" in category:
+            return "Rencontres & débats"
+        if "Lecture" in category or "Conte" in category:
+            return "Spectacles"
+        if "Atelier" in category or "Jeux" in category:
+            return "Animations & Ateliers"
+        if "Numérique" in category:
+            return "Rendez-vous locaux"
+
+        return "Sans catégorie"
+
+
+    def guess_tags_from_category(self, category):
+        """Return the tags implied by the label read on the event page."""
+        tags = []
+        if "Lecture" in category:
+            tags.append("📖 lecture")
+        if "Jeux" in category:
+            tags.append("🎲 jeux")
+
+        return tags
+
+    def build_event_url_list(self, content, infuture_days=180):
+        """Register the URLs of the single-date events of the agenda listing.
+
+        content is the HTML of the listing. Each "li.listItem" yields one URL
+        to download and one URL for humans; items whose ".until.maindate"
+        element is not empty (multi-date events) are skipped.
+        infuture_days is not used here.
+        """
+
+        soup = BeautifulSoup(content, "html.parser")
+
+        root_address_human = self.url_human.split('?')[0]
+        root_address = self.url.split('Service')[0]
+
+        items = soup.select("li.listItem")
+        if items:
+            for item in items:
+                # identifiers are the double-quoted strings of the onclick value
+                elems = item["onclick"].split('"')
+                v = elems[3].split('^')[1]
+                contentItem = elems[1]
+                multidate = item.select_one('.until.maindate').text != ''
+                if not multidate:
+                    url_human = root_address_human + '?p=*&v=' + v + "#contentitem=" + contentItem
+                    url = root_address + 'Service.PubItem.cls?action=get&instance=*&uuid=' + contentItem
+                    self.add_event_url(url)
+                    self.add_event_url_human(url, url_human)
+
+
+    def add_event_from_content(
+        self,
+        event_content,
+        event_url,
+        url_human=None,
+        default_values=None,
+        published=False,
+    ):
+        """Parse the XML description of one event and register the event.
+
+        event_content is an XML document: its "Title" element holds the title
+        and its "Content" element holds the HTML of the event page.
+        The event links to url_human when given, to event_url otherwise.
+        """
+
+        soup = BeautifulSoup(event_content, "xml")
+
+
+        title = soup.select_one("Title").text
+        content = soup.select_one("Content").text
+
+        soup = BeautifulSoup(content, "html.parser")
+
+        image = soup.select_one(".image img")["src"]
+        description = soup.select_one(".rightcolumn .content").text
+        location = soup.select_one(".infos .location").text
+        public = soup.select_one(".infos .public").text
+        start_day = Extractor.parse_french_date(soup.select_one(".infos .date .from").text)
+        start_time = Extractor.parse_french_time(soup.select_one(".infos .date .time").text)
+        acces = soup.select_one(".infos .acces").text
+        category = soup.select_one(".rightcolumn .category").text
+        infos = soup.select_one('.infos').text
+
+        description = description + "\n" + infos
+
+        tags = self.guess_tags_from_category(category)
+        category = self.guess_category(category)
+        if "Tout-petits" in public or "Jeunesse" in public:
+            tags.append("🎈 jeune public")
+        if "Accès libre" in acces:
+            tags.append("💶 gratuit")
+
+        self.add_event_with_props(
+            default_values,
+            event_url,
+            title,
+            category,
+            start_day,
+            location,
+            description,
+            tags,
+            recurrences=None,
+            uuids=[event_url],
+            # event_url is the XML service URL: prefer the page made for humans
+            url_human=url_human or event_url,
+            start_time=start_time,
+            end_day=None,
+            end_time=None,
+            published=published,
+            image=image,
+            image_alt=""
+        )
\ No newline at end of file
diff --git a/src/agenda_culturel/models.py b/src/agenda_culturel/models.py
index 4bc5510..ab03bc8 100644
--- a/src/agenda_culturel/models.py
+++ b/src/agenda_culturel/models.py
@@ -2011,6 +2011,7 @@ class RecurrentImport(models.Model):
         LERIO = "rio", _('Le Rio')
         LARAYMONDE = "raymonde", _('La Raymonde')
         APIDAE = 'apidae', _('Agenda apidae tourisme')
+        IGUANA = 'iguana', _('Agenda iguana (médiathèques)')
 
     class DOWNLOADER(models.TextChoices):
         SIMPLE = "simple", _("simple")