From 995aa4b2d3108a0b2bb29afbe34fcefa709a9db6 Mon Sep 17 00:00:00 2001
From: Jean-Marie Favreau
Date: Fri, 19 Apr 2024 23:59:59 +0200
Subject: [PATCH] =?UTF-8?q?Ajout=20de=20l'import=20de=20la=20programmation?=
 =?UTF-8?q?=20de=20la=20Com=C3=A9die?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 experimentations/get_lacomedie_events.py     | 43 ++++++++++++
 experimentations/get_lacoope_events.py       |  2 +-
 src/agenda_culturel/celery.py                |  2 +
 .../import_tasks/custom_extractors.py        | 67 ++++++++++++++++++-
 .../import_tasks/downloader.py               | 23 ++++---
 .../import_tasks/generic_extractors.py       | 48 +++++++++++--
 .../0050_alter_recurrentimport_processor.py  | 18 +++++
 src/agenda_culturel/models.py                |  6 +-
 8 files changed, 194 insertions(+), 15 deletions(-)
 create mode 100755 experimentations/get_lacomedie_events.py
 create mode 100644 src/agenda_culturel/migrations/0050_alter_recurrentimport_processor.py

diff --git a/experimentations/get_lacomedie_events.py b/experimentations/get_lacomedie_events.py
new file mode 100755
index 0000000..411938d
--- /dev/null
+++ b/experimentations/get_lacomedie_events.py
@@ -0,0 +1,43 @@
+#!/usr/bin/python3
+# coding: utf-8
+
+import os
+import json
+import sys
+
+# getting the name of the directory
+# where this file is present.
+current = os.path.dirname(os.path.realpath(__file__))
+
+# getting the name of the parent directory
+# of the current directory.
+parent = os.path.dirname(current)
+
+# adding the parent directory to
+# the sys.path.
+sys.path.append(parent)
+
+from src.agenda_culturel.import_tasks.downloader import *
+from src.agenda_culturel.import_tasks.extractor import *
+from src.agenda_culturel.import_tasks.importer import *
+from src.agenda_culturel.import_tasks.custom_extractors import *
+
+
+
+
+
+if __name__ == "__main__":
+
+    u2e = URL2Events(SimpleDownloader(), LaComedieExtractor())
+    url = "https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php?action=load_dates_existantes"
+    url_human = "https://lacomediedeclermont.com/saison23-24/"
+
+    try:
+        events = u2e.process(url, url_human, cache = "cache-lacomedie.html", default_values = {"location": "La Comédie de Clermont"}, published = True)
+
+        exportfile = "events-lacomedie.json"
+        print("Saving events to file {}".format(exportfile))
+        with open(exportfile, "w") as f:
+            json.dump(events, f, indent=4, default=str)
+    except Exception as e:
+        print("Exception: " + str(e))
diff --git a/experimentations/get_lacoope_events.py b/experimentations/get_lacoope_events.py
index b98dddb..c76dedf 100755
--- a/experimentations/get_lacoope_events.py
+++ b/experimentations/get_lacoope_events.py
@@ -33,7 +33,7 @@ if __name__ == "__main__":
     url_human = "https://www.lacoope.org/concerts-calendrier/"
 
     try:
-        events = u2e.process(url, url_human, cache = "cache-lacoope.ical", default_values = {"category": "Concert", "location": "La Coopérative"}, published = True)
+        events = u2e.process(url, url_human, cache = "cache-lacoope.html", default_values = {"category": "Concert", "location": "La Coopérative"}, published = True)
 
         exportfile = "events-lacoope.json"
         print("Saving events to file {}".format(exportfile))
diff --git a/src/agenda_culturel/celery.py b/src/agenda_culturel/celery.py
index 16e72c7..386d7ab 100644
--- a/src/agenda_culturel/celery.py
+++ b/src/agenda_culturel/celery.py
@@ -103,6 +103,8 @@ def run_recurrent_import(self, pk):
         extractor = ICALNoVCExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.LACOOPE:
         extractor = LaCoopeExtractor()
+    elif rimport.processor == RecurrentImport.PROCESSOR.LACOMEDIE:
+        extractor = LaComedieExtractor()
     else:
         extractor = None
 
diff --git a/src/agenda_culturel/import_tasks/custom_extractors.py b/src/agenda_culturel/import_tasks/custom_extractors.py
index 473d83a..6b10dc4 100644
--- a/src/agenda_culturel/import_tasks/custom_extractors.py
+++ b/src/agenda_culturel/import_tasks/custom_extractors.py
@@ -61,4 +61,69 @@ class LaCoopeExtractor(TwoStepsExtractor):
         location = LaCoopeExtractor.nom_lieu
         url_human = event_url
 
-        self.add_event_with_props(event_url, title, category, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
\ No newline at end of file
+        self.add_event_with_props(event_url, title, category, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
+
+
+# A class dedicated to getting events from La Comédie de Clermont:
+# URL: https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php?action=load_dates_existantes
+# Human-readable URL: https://lacomediedeclermont.com/saison23-24/
+class LaComedieExtractor(TwoStepsExtractor):
+
+    nom_lieu = "La Comédie de Clermont"
+
+    def category_comedie2agenda(self, category):
+        mapping = { "Théâtre": "Théâtre", "Danse": "Danse", "Rencontre": "Autre", "Sortie de résidence": "Autre", "PopCorn Live": "Autre"}
+        if category in mapping:
+            return mapping[category]
+        else:
+            return None
+
+
+
+    def build_event_url_list(self, content):
+        self.event_urls = []
+        dates = json5.loads(content)["data"][0]
+
+        url = self.url.split("?")[0]
+        for d in dates:
+            if not self.only_future or self.now <= datetime.date.fromisoformat(d):
+                events = self.downloader.get_content(url, post={'action': "load_evenements_jour", "jour": d})
+                if events:
+                    events = json5.loads(events)
+                    if "data" in events:
+                        events = events["data"][0]
+                        soup = BeautifulSoup(events, "html.parser")
+                        events = soup.select("div.unedatedev")
+                        for e in events:
+                            e_url = e.select('a')[0]["href"] + "#" + d  # a "fake" url specific for each day of this show
+                            self.event_urls.append(e_url)
+                            self.add_event_start_day(e_url, d)
+                            t = str(e.select('div#datecal')[0]).split(' ')[-1].split('<')[0]
+                            self.add_event_start_time(e_url, t)
+                            title = e.select('a')[0].contents[0]
+                            self.add_event_title(e_url, title)
+                            category = e.select("div#lieuevtcal span")
+                            if len(category) > 0:
+                                category = self.category_comedie2agenda(category[-1].contents[0])
+                                if category is not None:
+                                    self.add_event_category(e_url, category)
+                            location = e.select("div#lieuevtcal")[0].contents[-1].split("•")[-1]
+                            self.add_event_location(e_url, location)
+
+
+
+
+    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
+        soup = BeautifulSoup(event_content, "html.parser")
+
+        image = soup.select("#imgspec img")
+        if image:
+            image = image[0]["src"]
+        else:
+            image = None
+
+        description = soup.select("#descspec")[0].get_text().replace("Lire plus...", "")
+
+        url_human = event_url
+
+        self.add_event_with_props(event_url, None, None, None, None, description, [], recurrences=None, uuid=event_url, url_human=url_human, published=published, image=image)
diff --git a/src/agenda_culturel/import_tasks/downloader.py b/src/agenda_culturel/import_tasks/downloader.py
index d5333a9..785e4f3 100644
---
a/src/agenda_culturel/import_tasks/downloader.py +++ b/src/agenda_culturel/import_tasks/downloader.py @@ -1,4 +1,4 @@ -from urllib.parse import urlparse +from urllib.parse import urlparse, urlencode import urllib.request import os from selenium import webdriver @@ -13,16 +13,16 @@ class Downloader(ABC): pass @abstractmethod - def download(self, url): + def download(self, url, post=None): pass - def get_content(self, url, cache = None): + def get_content(self, url, cache = None, post = None): if cache and os.path.exists(cache): print("Loading cache ({})".format(cache)) with open(cache) as f: content = "\n".join(f.readlines()) else: - content = self.download(url) + content = self.download(url, post) if cache: print("Saving cache ({})".format(cache)) @@ -40,14 +40,19 @@ class SimpleDownloader(Downloader): super().__init__() - def download(self, url): + def download(self, url, post=None): print("Downloading {}".format(url)) try: - resource = urllib.request.urlopen(url) + if post: + post_args = urlencode(post).encode() + resource = urllib.request.urlopen(url, post_args) + else: + resource = urllib.request.urlopen(url) data = resource.read().decode(resource.headers.get_content_charset()) return data - except: + except Exception as e: + print(e) return None @@ -63,7 +68,9 @@ class ChromiumHeadlessDownloader(Downloader): self.service = Service("/usr/bin/chromedriver") - def download(self, url): + def download(self, url, post=None): + if post: + raise Exception('POST method with Chromium headless not yet implemented') print("Download {}".format(url)) self.driver = webdriver.Chrome(service=self.service, options=self.options) diff --git a/src/agenda_culturel/import_tasks/generic_extractors.py b/src/agenda_culturel/import_tasks/generic_extractors.py index 0a8182e..d3b5c81 100644 --- a/src/agenda_culturel/import_tasks/generic_extractors.py +++ b/src/agenda_culturel/import_tasks/generic_extractors.py @@ -2,6 +2,7 @@ from abc import abstractmethod from urllib.parse import urlparse from urllib.parse import parse_qs + from .extractor import * from django.utils.translation import gettext_lazy as _ from dateutil import parser @@ -56,6 +57,21 @@ class TwoStepsExtractor(Extractor): def clean_url(url): return url + def add_event_start_day(self, url, start_day): + if not url in self.event_properties: + self.event_properties[url] = {} + self.event_properties[url]["start_day"] = start_day + + def add_event_start_time(self, url, start_time): + if not url in self.event_properties: + self.event_properties[url] = {} + self.event_properties[url]["start_time"] = start_time + + def add_event_title(self, url, title): + if not url in self.event_properties: + self.event_properties[url] = {} + self.event_properties[url]["title"] = title + def add_event_tag(self, url, tag): if not url in self.event_properties: self.event_properties[url] = {} @@ -63,11 +79,32 @@ class TwoStepsExtractor(Extractor): self.event_properties[url]["tags"] = [] self.event_properties[url]["tags"].append(tag) + def add_event_category(self, url, cat): + if not url in self.event_properties: + self.event_properties[url] = {} + self.event_properties[url]["category"] = cat + + def add_event_location(self, url, loc): + if not url in self.event_properties: + self.event_properties[url] = {} + self.event_properties[url]["location"] = loc + def add_event_with_props(self, event_url, title, category, start_day, location, description, tags, uuid, recurrences=None, url_human=None, start_time=None, end_day=None, end_time=None, last_modified=None, published=False, 
image=None, image_alt=None): - if event_url in self.event_properties and 'tags' in self.event_properties[event_url]: - tags = tags + self.event_properties[event_url]['tags'] - + if event_url in self.event_properties: + if 'tags' in self.event_properties[event_url]: + tags = tags + self.event_properties[event_url]['tags'] + if 'start_day' in self.event_properties[event_url]: + start_day = self.event_properties[event_url]['start_day'] + if 'start_time' in self.event_properties[event_url]: + start_time = self.event_properties[event_url]['start_time'] + if 'title' in self.event_properties[event_url]: + title = self.event_properties[event_url]['title'] + if 'category' in self.event_properties[event_url]: + category = self.event_properties[event_url]['category'] + if 'location' in self.event_properties[event_url]: + location = self.event_properties[event_url]['location'] + self.add_event(title, category, start_day, location, description, tags, uuid, recurrences, url_human, start_time, end_day, end_time, last_modified, published, image, image_alt) @@ -80,10 +117,13 @@ class TwoStepsExtractor(Extractor): pass - def extract(self, content, url, url_human = None, default_values = None, published = False): + def extract(self, content, url, url_human = None, default_values = None, published = False, only_future=True): + self.only_future = only_future + self.now = datetime.datetime.now().date() self.set_header(url) self.clear_events() + self.url = url self.event_urls = None self.event_properties.clear() diff --git a/src/agenda_culturel/migrations/0050_alter_recurrentimport_processor.py b/src/agenda_culturel/migrations/0050_alter_recurrentimport_processor.py new file mode 100644 index 0000000..248e4e9 --- /dev/null +++ b/src/agenda_culturel/migrations/0050_alter_recurrentimport_processor.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.7 on 2024-04-19 21:44 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('agenda_culturel', '0049_alter_recurrentimport_processor'), + ] + + operations = [ + migrations.AlterField( + model_name='recurrentimport', + name='processor', + field=models.CharField(choices=[('ical', 'ical'), ('icalnobusy', 'ical no busy'), ('icalnovc', 'ical no VC'), ('lacoope', 'lacoope.org'), ('lacomedie', 'la comédie')], default='ical', max_length=20, verbose_name='Processor'), + ), + ] diff --git a/src/agenda_culturel/models.py b/src/agenda_culturel/models.py index bbd3677..94493b6 100644 --- a/src/agenda_culturel/models.py +++ b/src/agenda_culturel/models.py @@ -604,7 +604,10 @@ class Event(models.Model): # for each event, check if it's a new one, or a one to be updated for event in events: sdate = date.fromisoformat(event.start_day) - edate = date.fromisoformat(event.end_day) + if event.end_day: + edate = date.fromisoformat(event.end_day) + else: + edate = sdate if min_date is None or min_date > sdate: min_date = sdate if max_date is None or max_date < sdate: @@ -755,6 +758,7 @@ class RecurrentImport(models.Model): ICALNOBUSY = "icalnobusy", _("ical no busy") ICALNOVC = "icalnovc", _("ical no VC") LACOOPE = "lacoope", _('lacoope.org') + LACOMEDIE = "lacomedie", _('la comédie') class DOWNLOADER(models.TextChoices): SIMPLE = "simple", _("simple")
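The POST support added to SimpleDownloader above is what lets LaComedieExtractor query the admin-ajax.php endpoint once per date returned by load_dates_existantes. A minimal standalone sketch of the same request pattern follows; the date value is only a placeholder (real values come from the load_dates_existantes response):

    from urllib.parse import urlencode
    import urllib.request

    # Endpoint and form fields mirror LaComedieExtractor.build_event_url_list()
    url = "https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php"
    post = {"action": "load_evenements_jour", "jour": "2024-04-20"}  # placeholder date

    # Passing a request body to urlopen switches it to POST, which is what
    # SimpleDownloader.download(url, post=...) now does internally.
    data = urlencode(post).encode()
    with urllib.request.urlopen(url, data) as resource:
        charset = resource.headers.get_content_charset() or "utf-8"
        print(resource.read().decode(charset)[:200])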