Refactoring extractors

commit c043ba198c (parent 2862a0c5dd)

Each venue-specific extractor moves out of the monolithic custom_extractors.py into its own module of the custom_extractors package (lacoope, lacomedie, lefotomat, lapucealoreille), each exposing a CExtractor class; call sites are updated accordingly.
@@ -28,7 +28,7 @@ from src.agenda_culturel.import_tasks.custom_extractors import *
 
 if __name__ == "__main__":
 
-    u2e = URL2Events(SimpleDownloader(), LaComedieExtractor())
+    u2e = URL2Events(SimpleDownloader(), lacomedie.CExtractor())
     url = "https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php?action=load_dates_existantes"
     url_human = "https://lacomediedeclermont.com/saison23-24/"
 
@@ -28,7 +28,7 @@ from src.agenda_culturel.import_tasks.custom_extractors import *
 
 if __name__ == "__main__":
 
-    u2e = URL2Events(SimpleDownloader(), LaCoopeExtractor())
+    u2e = URL2Events(SimpleDownloader(), lacoope.CExtractor())
     url = "https://www.lacoope.org/concerts-calendrier/"
     url_human = "https://www.lacoope.org/concerts-calendrier/"
 
@@ -28,7 +28,7 @@ from src.agenda_culturel.import_tasks.custom_extractors import *
 
 if __name__ == "__main__":
 
-    u2e = URL2Events(SimpleDownloader(), LaPuceALOreilleExtractor())
+    u2e = URL2Events(SimpleDownloader(), lapucealoreille.CExtractor())
     url = "https://www.lapucealoreille63.fr/programmation/"
     url_human = "https://www.lapucealoreille63.fr/programmation/"
 
@@ -28,7 +28,7 @@ from src.agenda_culturel.import_tasks.custom_extractors import *
 
 if __name__ == "__main__":
 
-    u2e = URL2Events(SimpleDownloader(), LeFotomatExtractor())
+    u2e = URL2Events(SimpleDownloader(), lefotomat.CExtractor())
     url = "https://www.lefotomat.com/feed"
     url_human = "https://www.lefotomat.com/"
 
@@ -102,13 +102,13 @@ def run_recurrent_import(self, pk):
     elif rimport.processor == RecurrentImport.PROCESSOR.ICALNOVC:
         extractor = ICALNoVCExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.LACOOPE:
-        extractor = LaCoopeExtractor()
+        extractor = lacoope.CExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.LACOMEDIE:
-        extractor = LaComedieExtractor()
+        extractor = lacomedie.CExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.LEFOTOMAT:
-        extractor = LeFotomatExtractor()
+        extractor = lefotomat.CExtractor()
    elif rimport.processor == RecurrentImport.PROCESSOR.LAPUCEALOREILLE:
-        extractor = LaPuceALOreilleExtractor()
+        extractor = lapucealoreille.CExtractor()
     else:
         extractor = None
 
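The dispatch above maps each `RecurrentImport.PROCESSOR` value to the `CExtractor` of the matching venue module. For illustration only, a minimal table-driven sketch of the same logic (`build_extractor` is a hypothetical helper, not part of this commit):

```python
# Hypothetical sketch: behaviorally equivalent to the elif chain above.
def build_extractor(processor):
    mapping = {
        RecurrentImport.PROCESSOR.ICALNOVC: ICALNoVCExtractor,
        RecurrentImport.PROCESSOR.LACOOPE: lacoope.CExtractor,
        RecurrentImport.PROCESSOR.LACOMEDIE: lacomedie.CExtractor,
        RecurrentImport.PROCESSOR.LEFOTOMAT: lefotomat.CExtractor,
        RecurrentImport.PROCESSOR.LAPUCEALOREILLE: lapucealoreille.CExtractor,
    }
    cls = mapping.get(processor)
    return cls() if cls is not None else None
```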
@@ -1,266 +0,0 @@
-from .generic_extractors import *
-import re
-import json5
-from datetime import timedelta
-
-
-# A class dedicated to get events from La Coopérative de Mai:
-# URL: https://www.lacoope.org/concerts-calendrier/
-class LaCoopeExtractor(TwoStepsExtractor):
-
-    nom_lieu = "La Coopérative de Mai"
-
-    def build_event_url_list(self, content):
-        soup = BeautifulSoup(content, "html.parser")
-        script = soup.find('div', class_="js-filter__results").findChildren('script')
-        if len(script) == 0:
-            raise Exception("Cannot find events in the first page")
-        script = script[0]
-        search = re.search(r"window.fullCalendarContent = (.*)</script>", str(script), re.S)
-        if search:
-            data = json5.loads(search.group(1))
-            for e in data['events']:
-                self.add_event_url(e['url'])
-                if e['tag'] == "Gratuit":
-                    self.add_event_tag(e['url'], 'gratuit')
-        else:
-            raise Exception('Cannot extract events from javascript')
-
-    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
-        soup = BeautifulSoup(event_content, "html.parser")
-
-        title = soup.find("h1").contents[0]
-        category = "Concert"
-        image = soup.find("meta", property="og:image")
-        if image:
-            image = image["content"]
-
-        description = soup.find("div", class_="grid-concert-content")
-        if description:
-            description = description.find('div', class_="content-striped")
-            if description:
-                description = description.find('div', class_='wysiwyg')
-                if description:
-                    description = description.get_text()
-        if description is None:
-            description = ""
-
-        tags = []
-
-        link_calendar = soup.select('a[href^="https://calendar.google.com/calendar/"]')
-        if len(link_calendar) == 0:
-            raise Exception('Cannot find the google calendar url')
-
-        gg_cal = GGCalendar(link_calendar[0]["href"])
-        start_day = gg_cal.start_day
-        start_time = gg_cal.start_time
-        end_day = gg_cal.end_day
-        end_time = gg_cal.end_time
-        location = LaCoopeExtractor.nom_lieu
-        url_human = event_url
-
-        self.add_event_with_props(event_url, title, category, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
-
-
-# A class dedicated to get events from La Comédie de Clermont:
-# URL: https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php?action=load_dates_existantes
-# Human-facing URL: https://lacomediedeclermont.com/saison23-24/
-class LaComedieExtractor(TwoStepsExtractor):
-
-    nom_lieu = "La Comédie de Clermont"
-
-    def category_comedie2agenda(self, category):
-        mapping = { "Théâtre": "Théâtre", "Danse": "Danse", "Rencontre": "Autre", "Sortie de résidence": "Autre", "PopCorn Live": "Autre"}
-        if category in mapping:
-            return mapping[category]
-        else:
-            return None
-
-    def build_event_url_list(self, content):
-        dates = json5.loads(content)["data"][0]
-
-        url = self.url.split("?")[0]
-        for d in list(set(dates)):
-            if not self.only_future or self.now <= datetime.date.fromisoformat(d):
-                events = self.downloader.get_content(url, post={'action': "load_evenements_jour", "jour": d})
-                if events:
-                    events = json5.loads(events)
-                    if "data" in events:
-                        events = events["data"][0]
-                        soup = BeautifulSoup(events, "html.parser")
-                        events = soup.select("div.unedatedev")
-                        for e in events:
-                            e_url = e.select('a')[0]["href"] + "#" + d  # a "fake" url specific for each day of this show
-                            self.add_event_url(e_url)
-                            self.add_event_start_day(e_url, d)
-                            t = str(e.select('div#datecal')[0]).split(' ')[-1].split('<')[0]
-                            self.add_event_start_time(e_url, t)
-                            title = e.select('a')[0].contents[0]
-                            self.add_event_title(e_url, title)
-                            category = e.select("div#lieuevtcal span")
-                            if len(category) > 0:
-                                category = self.category_comedie2agenda(category[-1].contents[0])
-                                if category is not None:
-                                    self.add_event_category(e_url, category)
-                            location = e.select("div#lieuevtcal")[0].contents[-1].split("•")[-1]
-                            self.add_event_location(e_url, location)
-
-    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
-        soup = BeautifulSoup(event_content, "html.parser")
-
-        image = soup.select("#imgspec img")
-        if image:
-            image = image[0]["src"]
-        else:
-            image = None
-
-        description = soup.select("#descspec")[0].get_text().replace("Lire plus...", "")
-
-        url_human = event_url
-
-        self.add_event_with_props(event_url, None, None, None, None, description, [], recurrences=None, uuid=event_url, url_human=url_human, published=published, image=image)
-
-
-# A class dedicated to get events from Le Fotomat'
-# URL: https://www.lefotomat.com/
-class LeFotomatExtractor(TwoStepsExtractor):
-
-    nom_lieu = "Le Fotomat'"
-
-    def category_fotomat2agenda(self, category):
-        if not category:
-            return None
-        mapping = { "Concerts": "Concert"}
-        if category in mapping:
-            return mapping[category]
-        else:
-            return None
-
-    def build_event_url_list(self, content):
-        soup = BeautifulSoup(content, "xml")
-
-        events = soup.select("item")
-        for e in events:
-            e_url = e.find("link").contents[0]
-            self.add_event_url(e_url)
-
-            title = e.find("title").contents[0]
-            self.add_event_title(e_url, title)
-
-            category = self.category_fotomat2agenda(e.find("category").contents[0])
-            if category:
-                self.add_event_category(e_url, category)
-
-    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
-        soup = BeautifulSoup(event_content, "html.parser")
-        image = soup.select("div.post-content img.wp-post-image")
-        if image:
-            image = image[0]["src"]
-        else:
-            image = None
-        desc = soup.select("head meta[name=description]")[0]["content"]
-        start_day = self.parse_french_date(desc.split("-")[0])
-        start_time = self.parse_french_time(desc.split("-")[1])
-        end_time = self.parse_french_time(desc.split("-")[2])
-        end_day = self.guess_end_day(start_day, start_time, end_time)
-
-        location = self.nom_lieu
-        descriptions = soup.select("div.vce-col-content")
-        if descriptions:
-            descriptions = [d.get_text() for d in descriptions]
-            description = max(descriptions, key=len)
-        else:
-            description = None
-
-        article = soup.select("article.post")
-        tags = []
-        for c in article[0]["class"]:
-            if c.startswith("category-"):
-                tag = '-'.join(c.split("-")[1:])
-                if tag != "concerts":
-                    tags.append(tag)
-
-        url_human = event_url
-
-        self.add_event_with_props(event_url, None, None, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
-
-
-# A class dedicated to get events from La puce à l'oreille
-# URL: https://www.lapucealoreille63.fr/
-class LaPuceALOreilleExtractor(TwoStepsExtractor):
-
-    nom_lieu = "La Puce à l'Oreille"
-
-    def build_event_url_list(self, content):
-        soup = BeautifulSoup(content, "html.parser")
-
-        events = soup.select("div.SPY_vo div[data-testid=mesh-container-content]")
-        for e in events:
-            e_url = e.find("a")
-            if e_url:
-                if self.add_event_url(e_url["href"]):
-                    title = e.select("div[data-testid=richTextElement] h1.font_0 span")
-                    if title:
-                        title = title[0].contents[0].get_text().replace("\n", " ")
-                        title = re.sub(" +", " ", title)
-                        self.add_event_title(e_url["href"], title)
-
-    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
-        soup = BeautifulSoup(event_content, "html.parser")
-
-        start_day = self.parse_french_date(soup.find("h2").get_text())  # not perfect, but this site really is badly built
-
-        spans = soup.select("div[data-testid=richTextElement] span")
-        start_time = None
-        end_time = None
-        location = None
-
-        for span in spans:
-            txt = span.get_text()
-            if txt.lstrip().startswith("DÉBUT"):
-                start_time = self.parse_french_time(txt.split(":")[-1])
-                end_time = None
-            elif txt.lstrip().startswith("HORAIRES :"):
-                hs = txt.split(":")[-1].split("-")
-                start_time = self.parse_french_time(hs[0])
-                if len(hs) > 1:
-                    end_time = self.parse_french_time(hs[1])
-                else:
-                    end_time = None
-            elif txt.lstrip().startswith("LIEU :") and not location:
-                location = txt.split(":")[-1].lstrip()
-
-        if not location:
-            location = self.nom_lieu
-        end_day = self.guess_end_day(start_day, start_time, end_time)
-
-        url_human = event_url
-        tags = []
-
-        image = soup.select("wow-image img[fetchpriority=high]")
-        if image:
-            image = image[0]["src"]
-        else:
-            image = None
-
-        descriptions = soup.select("div[data-testid=mesh-container-content] div[data-testid=inline-content] div[data-testid=mesh-container-content] div[data-testid=richTextElement]")
-        if descriptions:
-            descriptions = [d.get_text() for d in descriptions]
-            description = max(descriptions, key=len)
-        else:
-            description = None
-
-        self.add_event_with_props(event_url, None, "Concert", start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
@@ -0,0 +1,4 @@
+from os.path import dirname, basename, isfile, join
+import glob
+modules = glob.glob(join(dirname(__file__), "*.py"))
+__all__ = [ basename(f)[:-3] for f in modules if isfile(f) and not f.endswith('__init__.py')]
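This new `__init__.py` builds `__all__` from whatever `*.py` files sit next to it, so the `from src.agenda_culturel.import_tasks.custom_extractors import *` used by the scripts above automatically picks up future venue modules. A short illustration of the effect (module list taken from this commit; ordering depends on the filesystem, and `URL2Events`/`SimpleDownloader` come from the project's import_tasks machinery as in the test scripts above):

```python
# After the star import, each venue module is bound as a name in scope;
# __all__ is e.g. ['lacomedie', 'lacoope', 'lapucealoreille', 'lefotomat'].
from src.agenda_culturel.import_tasks.custom_extractors import *

u2e = URL2Events(SimpleDownloader(), lefotomat.CExtractor())
```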
@@ -0,0 +1,69 @@
+from ..generic_extractors import *
+import re
+import json5
+from datetime import timedelta
+
+
+# A class dedicated to get events from La Comédie de Clermont:
+# URL: https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php?action=load_dates_existantes
+# Human-facing URL: https://lacomediedeclermont.com/saison23-24/
+class CExtractor(TwoStepsExtractor):
+
+    nom_lieu = "La Comédie de Clermont"
+
+    def category_comedie2agenda(self, category):
+        mapping = { "Théâtre": "Théâtre", "Danse": "Danse", "Rencontre": "Autre", "Sortie de résidence": "Autre", "PopCorn Live": "Autre"}
+        if category in mapping:
+            return mapping[category]
+        else:
+            return None
+
+    def build_event_url_list(self, content):
+        dates = json5.loads(content)["data"][0]
+
+        url = self.url.split("?")[0]
+        for d in list(set(dates)):
+            if not self.only_future or self.now <= datetime.date.fromisoformat(d):
+                events = self.downloader.get_content(url, post={'action': "load_evenements_jour", "jour": d})
+                if events:
+                    events = json5.loads(events)
+                    if "data" in events:
+                        events = events["data"][0]
+                        soup = BeautifulSoup(events, "html.parser")
+                        events = soup.select("div.unedatedev")
+                        for e in events:
+                            e_url = e.select('a')[0]["href"] + "#" + d  # a "fake" url specific for each day of this show
+                            self.add_event_url(e_url)
+                            self.add_event_start_day(e_url, d)
+                            t = str(e.select('div#datecal')[0]).split(' ')[-1].split('<')[0]
+                            self.add_event_start_time(e_url, t)
+                            title = e.select('a')[0].contents[0]
+                            self.add_event_title(e_url, title)
+                            category = e.select("div#lieuevtcal span")
+                            if len(category) > 0:
+                                category = self.category_comedie2agenda(category[-1].contents[0])
+                                if category is not None:
+                                    self.add_event_category(e_url, category)
+                            location = e.select("div#lieuevtcal")[0].contents[-1].split("•")[-1]
+                            self.add_event_location(e_url, location)
+
+    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
+        soup = BeautifulSoup(event_content, "html.parser")
+
+        image = soup.select("#imgspec img")
+        if image:
+            image = image[0]["src"]
+        else:
+            image = None
+
+        description = soup.select("#descspec")[0].get_text().replace("Lire plus...", "")
+
+        url_human = event_url
+
+        self.add_event_with_props(event_url, None, None, None, None, description, [], recurrences=None, uuid=event_url, url_human=url_human, published=published, image=image)
@@ -0,0 +1,64 @@
+from ..generic_extractors import *
+import re
+import json5
+from datetime import timedelta
+
+
+# A class dedicated to get events from La Coopérative de Mai:
+# URL: https://www.lacoope.org/concerts-calendrier/
+class CExtractor(TwoStepsExtractor):
+
+    nom_lieu = "La Coopérative de Mai"
+
+    def build_event_url_list(self, content):
+        soup = BeautifulSoup(content, "html.parser")
+        script = soup.find('div', class_="js-filter__results").findChildren('script')
+        if len(script) == 0:
+            raise Exception("Cannot find events in the first page")
+        script = script[0]
+        search = re.search(r"window.fullCalendarContent = (.*)</script>", str(script), re.S)
+        if search:
+            data = json5.loads(search.group(1))
+            for e in data['events']:
+                self.add_event_url(e['url'])
+                if e['tag'] == "Gratuit":
+                    self.add_event_tag(e['url'], 'gratuit')
+        else:
+            raise Exception('Cannot extract events from javascript')
+
+    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
+        soup = BeautifulSoup(event_content, "html.parser")
+
+        title = soup.find("h1").contents[0]
+        category = "Concert"
+        image = soup.find("meta", property="og:image")
+        if image:
+            image = image["content"]
+
+        description = soup.find("div", class_="grid-concert-content")
+        if description:
+            description = description.find('div', class_="content-striped")
+            if description:
+                description = description.find('div', class_='wysiwyg')
+                if description:
+                    description = description.get_text()
+        if description is None:
+            description = ""
+
+        tags = []
+
+        link_calendar = soup.select('a[href^="https://calendar.google.com/calendar/"]')
+        if len(link_calendar) == 0:
+            raise Exception('Cannot find the google calendar url')
+
+        gg_cal = GGCalendar(link_calendar[0]["href"])
+        start_day = gg_cal.start_day
+        start_time = gg_cal.start_time
+        end_day = gg_cal.end_day
+        end_time = gg_cal.end_time
+        location = CExtractor.nom_lieu
+        url_human = event_url
+
+        self.add_event_with_props(event_url, title, category, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
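One note on `build_event_url_list` above: the calendar data is a JavaScript object literal scraped from an inline `<script>`, so it is decoded with `json5`, which unlike the stdlib `json` tolerates single quotes, unquoted keys, and trailing commas. A self-contained illustration (the literal below is invented):

```python
import json5

# A JS-style literal that json.loads would reject:
data = json5.loads("{events: [{url: 'https://example.org/e/1', tag: 'Gratuit'},]}")
for e in data["events"]:
    print(e["url"], e["tag"])  # -> https://example.org/e/1 Gratuit
```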
@@ -0,0 +1,73 @@
+from ..generic_extractors import *
+import re
+import json5
+from datetime import timedelta
+
+
+# A class dedicated to get events from La puce à l'oreille
+# URL: https://www.lapucealoreille63.fr/
+class CExtractor(TwoStepsExtractor):
+
+    nom_lieu = "La Puce à l'Oreille"
+
+    def build_event_url_list(self, content):
+        soup = BeautifulSoup(content, "html.parser")
+
+        events = soup.select("div.SPY_vo div[data-testid=mesh-container-content]")
+        for e in events:
+            e_url = e.find("a")
+            if e_url:
+                if self.add_event_url(e_url["href"]):
+                    title = e.select("div[data-testid=richTextElement] h1.font_0 span")
+                    if title:
+                        title = title[0].contents[0].get_text().replace("\n", " ")
+                        title = re.sub(" +", " ", title)
+                        self.add_event_title(e_url["href"], title)
+
+    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
+        soup = BeautifulSoup(event_content, "html.parser")
+
+        start_day = self.parse_french_date(soup.find("h2").get_text())  # not perfect, but this site really is badly built
+
+        spans = soup.select("div[data-testid=richTextElement] span")
+        start_time = None
+        end_time = None
+        location = None
+
+        for span in spans:
+            txt = span.get_text()
+            if txt.lstrip().startswith("DÉBUT"):
+                start_time = self.parse_french_time(txt.split(":")[-1])
+                end_time = None
+            elif txt.lstrip().startswith("HORAIRES :"):
+                hs = txt.split(":")[-1].split("-")
+                start_time = self.parse_french_time(hs[0])
+                if len(hs) > 1:
+                    end_time = self.parse_french_time(hs[1])
+                else:
+                    end_time = None
+            elif txt.lstrip().startswith("LIEU :") and not location:
+                location = txt.split(":")[-1].lstrip()
+
+        if not location:
+            location = self.nom_lieu
+        end_day = self.guess_end_day(start_day, start_time, end_time)
+
+        url_human = event_url
+        tags = []
+
+        image = soup.select("wow-image img[fetchpriority=high]")
+        if image:
+            image = image[0]["src"]
+        else:
+            image = None
+
+        descriptions = soup.select("div[data-testid=mesh-container-content] div[data-testid=inline-content] div[data-testid=mesh-container-content] div[data-testid=richTextElement]")
+        if descriptions:
+            descriptions = [d.get_text() for d in descriptions]
+            description = max(descriptions, key=len)
+        else:
+            description = None
+
+        self.add_event_with_props(event_url, None, "Concert", start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
@@ -0,0 +1,72 @@
+from ..generic_extractors import *
+import re
+import json5
+from datetime import timedelta
+
+
+# A class dedicated to get events from Le Fotomat'
+# URL: https://www.lefotomat.com/
+class CExtractor(TwoStepsExtractor):
+
+    nom_lieu = "Le Fotomat'"
+
+    def category_fotomat2agenda(self, category):
+        if not category:
+            return None
+        mapping = { "Concerts": "Concert"}
+        if category in mapping:
+            return mapping[category]
+        else:
+            return None
+
+    def build_event_url_list(self, content):
+        soup = BeautifulSoup(content, "xml")
+
+        events = soup.select("item")
+        for e in events:
+            e_url = e.find("link").contents[0]
+            self.add_event_url(e_url)
+
+            title = e.find("title").contents[0]
+            self.add_event_title(e_url, title)
+
+            category = self.category_fotomat2agenda(e.find("category").contents[0])
+            if category:
+                self.add_event_category(e_url, category)
+
+    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
+        soup = BeautifulSoup(event_content, "html.parser")
+        image = soup.select("div.post-content img.wp-post-image")
+        if image:
+            image = image[0]["src"]
+        else:
+            image = None
+        desc = soup.select("head meta[name=description]")[0]["content"]
+        start_day = self.parse_french_date(desc.split("-")[0])
+        start_time = self.parse_french_time(desc.split("-")[1])
+        end_time = self.parse_french_time(desc.split("-")[2])
+        end_day = self.guess_end_day(start_day, start_time, end_time)
+
+        location = self.nom_lieu
+        descriptions = soup.select("div.vce-col-content")
+        if descriptions:
+            descriptions = [d.get_text() for d in descriptions]
+            description = max(descriptions, key=len)
+        else:
+            description = None
+
+        article = soup.select("article.post")
+        tags = []
+        for c in article[0]["class"]:
+            if c.startswith("category-"):
+                tag = '-'.join(c.split("-")[1:])
+                if tag != "concerts":
+                    tags.append(tag)
+
+        url_human = event_url
+
+        self.add_event_with_props(event_url, None, None, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
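In this extractor the URL list is built from the venue's RSS feed, parsed with BeautifulSoup's `xml` parser (which requires `lxml`). A self-contained sketch of the same `item`/`link`/`title`/`category` walk over a synthetic feed:

```python
from bs4 import BeautifulSoup  # the "xml" parser needs lxml installed

feed = """<rss><channel>
  <item>
    <title>Some concert</title>
    <link>https://www.lefotomat.com/events/1</link>
    <category>Concerts</category>
  </item>
</channel></rss>"""
soup = BeautifulSoup(feed, "xml")
for e in soup.select("item"):
    # the xml parser keeps <link> contents, unlike html.parser
    print(e.find("link").contents[0], e.find("title").contents[0], e.find("category").contents[0])
```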
@@ -3,6 +3,7 @@
 <tr>
     <th rowspan="2">Identifiant</th>
     <th rowspan="2">Date</th>
+    <th rowspan="2">Source</th>
     <th rowspan="2">Status</th>
     <th rowspan="2">Action</th>
     <th colspan="4">événements</th>
@@ -19,6 +20,7 @@
 <tr>
     <td>{{ obj.id }}</a></td>
     <td>{{ obj.created_date }}</td>
+    <td>{% if obj.recurrentImport %}<a href="{{ obj.recurrentImport.get_absolute_url }}">{{ obj.recurrentImport.name }}</a>{% else %}-{% endif %} </td>
     <td><span{% if obj.status == "failed" %} data-tooltip="{{ obj.error_message }}"{% endif %}>{{ obj.status }}</span></td>
     <td>{% if obj.status == "running" %}<a href="{% url 'cancel_import' obj.id %}">Annuler</a>{% endif %}</td>
     <td>{% if obj.status == "success" %}{{ obj.nb_initial }}{% endif %}</td>