Ajout (pas finalisé) de l'import Cour des 3 Coquins

2024-09-04 11:42:31 +02:00 · 2024-09-04 11:42:31 +02:00 · 9bb3373f99
commit 9bb3373f99
parent 0a5470e73d
3 changed files with 152 additions and 2 deletions
--- a/experimentations/get_c3c_events.py
+++ b/experimentations/get_c3c_events.py
@ -0,0 +1,43 @@
 #!/usr/bin/python3
 # coding: utf-8
 import os
 import json
 import sys
 # Directory that contains this file (realpath resolves symlinks first).
 current = os.path.dirname(os.path.realpath(__file__))
 # Parent of that directory.
 parent = os.path.dirname(current)
 # Append the parent directory to sys.path. This must run before the
 # `src.…` imports that follow; presumably it is what makes them
 # resolvable when the script is run directly (confirm repository layout).
 sys.path.append(parent)
 from src.agenda_culturel.import_tasks.downloader import *
 from src.agenda_culturel.import_tasks.extractor import *
 from src.agenda_culturel.import_tasks.importer import *
 from src.agenda_culturel.import_tasks.custom_extractors import *
 if __name__ == "__main__":
    # Experimental driver: download the Cour des 3 Coquins listing, run the
    # dedicated extractor on it and dump whatever it returns to a JSON file.
    pipeline = URL2Events(SimpleDownloader(), c3c.CExtractor())
    url = "https://billetterie-c3c.clermont-ferrand.fr/"
    url_human = "https://billetterie-c3c.clermont-ferrand.fr/"
    exportfile = "events-c3c.json"
    try:
        events = pipeline.process(
            url,
            url_human,
            cache="cache-c3c.html",
            default_values={"location": "La Cour des 3 Coquins"},
            published=True,
        )
        print("Saving events to file {}".format(exportfile))
        with open(exportfile, "w") as out:
            # default=str stringifies any value json cannot serialise itself.
            json.dump(events, out, indent=4, default=str)
    except Exception as err:
        # Script-level boundary: report the failure instead of a traceback.
        print("Exception: " + str(err))
--- a/src/agenda_culturel/import_tasks/custom_extractors/c3c.py
+++ b/src/agenda_culturel/import_tasks/custom_extractors/c3c.py
@ -0,0 +1,100 @@
 from ..generic_extractors import *
 from bs4 import BeautifulSoup
 # Extractor dedicated to the events of La Cour des 3 Coquins.
 # URL: https://billetterie-c3c.clermont-ferrand.fr/
 class CExtractor(TwoStepsExtractor):
    """Extract events from the La Cour des 3 Coquins ticketing site.

    ``build_event_url_list`` collects the detail-page URL of every event
    card found on the listing page, and ``add_event_from_content`` parses
    one detail page.  Date and time parsing is not implemented yet, so no
    event is registered for now: the parsed fields are only printed.
    """

    # Venue name used when the page provides no ``LIEU-*`` span.
    nom_lieu = "La Cour des 3 Coquins"

    def category_c3c2agenda(self, category):
        """Translate a category label read on the site into an agenda tag.

        Returns ``None`` when ``category`` is empty or has no known mapping.
        """
        if not category:
            return None
        mapping = {"Théâtre": "Théâtre", "Concert": "Concert", "Projection": "Cinéma"}
        return mapping.get(category)

    def build_event_url_list(self, content):
        """Register the detail-page URL of each event card in ``content``.

        ``content`` is the HTML of the listing page.  Cards without a
        "lien_savoir_plus" link, or whose link has no ``href``, are skipped.
        """
        soup = BeautifulSoup(content, "html.parser")
        for card in soup.select("div.fiche-info"):
            link = card.select_one("a.btn.lien_savoir_plus")
            # select_one() returns None when nothing matches and Tag.get()
            # returns None for a missing attribute: skip instead of crashing.
            e_url = link.get("href") if link is not None else None
            if not e_url:
                continue
            # NOTE(review): if self.url already ends with "/", this produces
            # a doubled slash; confirm whether urljoin would be preferable.
            self.add_event_url(self.url + "/" + e_url)

    def add_event_from_content(
        self,
        event_content,
        event_url,
        url_human=None,
        default_values=None,
        published=False,
    ):
        """Parse one event detail page.

        Parameters:
            event_content: HTML of the event detail page.
            event_url: URL of that page; also used as the event's uuid and
                as its human-facing URL (the ``url_human`` argument is
                overwritten with it before registration).
            url_human: ignored, see ``event_url``.
            default_values: not used by this method.
            published: forwarded to ``add_event_with_props``.

        Returns ``None``.  While date parsing is missing, the method prints
        the fields it could extract and returns without registering anything.
        """
        soup = BeautifulSoup(event_content, "html.parser")

        title = soup.select_one("h1")
        if title is not None:
            title = title.text

        image = soup.select_one("#media .swiper-slide img")
        if image is not None:
            # .get() yields None rather than raising when "src" is absent.
            image = image.get("src")

        # NOTE(review): kept as the selected element (None when absent), as
        # in the original draft; confirm whether text is expected downstream.
        description = soup.select_one(".presentation")

        duree = soup.select_one("#criteres .DUREE-V .valeur-critere li")
        if duree is not None:
            duree = self.parse_french_time(duree.text)

        location = self.nom_lieu
        tags = []
        for t in soup.select(".sous-titre span"):
            classes = t.get("class")
            if not classes:
                continue
            # Only the first CSS class is inspected to tell venue spans
            # from category spans.
            if classes[0].startswith("LIEU-"):
                location = t.text
            elif classes[0].startswith("THEMATIQUE-"):
                tag = self.category_c3c2agenda(t.text)
                if tag is not None:
                    tags.append(tag)

        # TODO: parse the dates and retrieve the times.  Until that is done
        # the scheduling fields stay unset and no event is registered.
        dates = None
        start_day = None
        start_time = None
        end_day = None
        end_time = None

        # Temporary diagnostic output while the extractor is unfinished.
        print("EVENT ", event_url)
        print("- ", title)
        print("- ", image)
        print("- ", len(description) if description is not None else None)
        print("- ", duree)
        print("- ", location)
        print("- ", tags)
        print("- ", dates)

        if start_day is None:
            # Nothing to schedule yet: do not register an undated event.
            return

        url_human = event_url
        # NOTE(review): the second and third positional arguments are passed
        # as None and the extracted title is not forwarded; confirm that
        # this is intended.
        self.add_event_with_props(
            event_url,
            None,
            None,
            start_day,
            location,
            description,
            tags,
            recurrences=None,
            uuids=[event_url],
            url_human=url_human,
            start_time=start_time,
            end_day=end_day,
            end_time=end_time,
            published=published,
            image=image,
        )
--- a/src/agenda_culturel/import_tasks/extractor.py
+++ b/src/agenda_culturel/import_tasks/extractor.py
@ -97,13 +97,20 @@ class Extractor(ABC):
                s = "0"
            else:
                # format heures
-                m = re.search("([0-9]+)[ Hh:.]", text)
+                m = re.search("([0-9]+) [Hh:.]", text)
                if m:
                    h = m.group(1)
                    m = "0"
                    s = "0"
                else:
-                    return None
+                    # format minutes
                    m = re.search("([0-9]+)[ ]*(?:mn|min|Min|Mn)", text)
                    if m:
                        h = "0"
                        m = m.group(1)
                        s = "0"
                    else:
                        return None
        try:
            h = int(h)