feat: parser for acolab

2024-10-12 19:26:52 +02:00 · 2024-10-12 19:26:52 +02:00 · b56cbf66a0
commit b56cbf66a0
parent 521f904778
2 changed files with 189 additions and 0 deletions
--- a/experimentations/get_acolab_events.py
+++ b/experimentations/get_acolab_events.py
@ -0,0 +1,43 @@
 #!/usr/bin/env python3
 # coding: utf-8
 import os
 import json
 import sys
 # Get the name of the directory
 # where this file is located.
 current = os.path.dirname(os.path.realpath(__file__))
 # Getting the parent directory name
 # where the current directory is present.
 parent = os.path.dirname(current)
 # adding the parent directory to 
 # the sys.path.
 sys.path.append(parent)
 from src.agenda_culturel.import_tasks.downloader import *
 from src.agenda_culturel.import_tasks.extractor import *
 from src.agenda_culturel.import_tasks.importer import *
 from src.agenda_culturel.import_tasks.custom_extractors import *
 if __name__ == "__main__":
    # Manual test driver: download the AcoLab "ouverture" RSS feed, run it
    # through the AcoLab extractor and dump the resulting events as JSON.
    feed_url = "https://forum.acolab.fr/c/genericactivity/ouverture/15.rss"
    human_url = "https://forum.acolab.fr/c/genericactivity/ouverture/15"
    output_path = "events-acolab.json"

    pipeline = URL2Events(SimpleDownloader(), acolab.CExtractor())

    try:
        events = pipeline.process(
            feed_url,
            human_url,
            cache="cache-acolab.html",
            default_values={"location": "AcoLab"},
            published=True,
        )
        print("Saving events to file {}".format(output_path))
        with open(output_path, "w") as f:
            # default=str stringifies any value json cannot serialize natively.
            json.dump(events, f, indent=4, default=str)
    except Exception as e:
        # Best-effort script: report the failure on stdout instead of raising.
        print("Exception: " + str(e))
--- a/src/agenda_culturel/import_tasks/custom_extractors/acolab.py
+++ b/src/agenda_culturel/import_tasks/custom_extractors/acolab.py
@ -0,0 +1,146 @@
 import icalendar
 import warnings
 import bbcode
 from datetime import datetime, date, timedelta
 from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
 from ..extractor import *
 from celery.utils.log import get_task_logger
 logger = get_task_logger(__name__)
 class CExtractor(Extractor):
    """Extractor for the "ouverture" (opening hours) topics of the AcoLab forum.

    The input is an RSS feed. Only items whose title contains ``ouverture``
    and from which a ``(year, month)`` can be parsed are inspected; their
    HTML description is then scanned paragraph by paragraph for time slots.

    NOTE: event creation is not implemented yet. ``extract`` parses the feed
    but never registers an event.
    """

    # Title intended for the events generated from opening slots.
    title_ouverture = 'Permanence AcoLab'

    def __init__(self):
        super().__init__()

    @staticmethod
    def find_date(text):
        """Return ``(year, month)`` parsed from a topic title, or ``None``.

        The title is split on single spaces. The second token is passed to
        ``Extractor.guess_month`` and the third token must be a numeric year.
        ``None`` is returned when the title has fewer than three tokens, when
        the month is not recognised, or when the year is not numeric.
        """
        tokens = text.split(' ')
        if len(tokens) < 3:
            # Too short to hold "<word> <month> <year>"; avoid an IndexError.
            return None
        month = Extractor.guess_month(tokens[1])
        year = tokens[2]
        if month is None or not year.isnumeric():
            return None
        return (int(year), month)

    @staticmethod
    def is_nickname(text):
        """Return True when ``text`` contains an ``@`` (forum mention)."""
        return '@' in text

    @staticmethod
    def is_canceled(text):
        """Return True when ``text`` contains a cancellation keyword.

        The comparison is done on the lowercased text after
        ``Extractor.remove_accents``.
        """
        normalized = Extractor.remove_accents(text.lower())
        words = ['annule']
        return any(word in normalized for word in words)

    @staticmethod
    def find_timeslots(p):
        """Collect the open time slots described by one description paragraph.

        ``p`` must expose ``stripped_strings`` (a BeautifulSoup tag). Sample
        string sequences seen in the feed:

            ['Samedi 12', '@Manon', '14:30-18:00', 'Dimanches 13',
             '@gaeldu63', '14h30 (j utilise la scie a format)']
            ['Mercredi 16 :']
            ['Samedi 19', '@Manon', '14:30-18:00']
            ['Samedi 22', ': ANNULE', '@Manon', '14h30 - 18h00']

        A day is kept only when a nickname follows it and no cancellation
        marker was seen for that day, whatever the order of the two.

        Returns a list of ``(day, start, end)`` tuples, where ``start`` and
        ``end`` are ``None`` when no hours were found for the day.
        NOTE(review): ``find_day_name`` and ``find_hours`` are not defined in
        this class; they are presumably inherited or still to be written.
        The expected result shape looks like
        ``[(10, time(14, 0, 0), time(17, 0, 0))]`` - confirm.
        """
        result = []
        current_day = None
        tstart = None
        tend = None
        has_volunteer = False
        canceled = False

        def flush():
            # Keep the pending day only if somebody opens and nothing cancels.
            if current_day is not None and has_volunteer and not canceled:
                result.append((current_day, tstart, tend))

        for e in p.stripped_strings:
            day = CExtractor.find_day_name(e)
            if day is not None:
                # A new day starts: store the previous one and reset state.
                flush()
                current_day = day
                tstart = None
                tend = None
                has_volunteer = False
                canceled = False
                continue

            if current_day is None:
                # Nothing can be attached before the first day marker.
                continue

            if CExtractor.is_nickname(e):
                has_volunteer = True
                continue

            hours = CExtractor.find_hours(e)
            if hours is not None:
                tstart = hours[0]
                tend = hours[1]
                continue

            if CExtractor.is_canceled(e):
                canceled = True

        # The last day of the paragraph has no following day marker.
        flush()
        return result

    def extract(self, content, url, url_human=None, default_values=None, published=False):
        """Parse the RSS ``content`` and return ``self.get_structure()``.

        ``url``, ``url_human``, ``default_values`` and ``published`` are
        accepted for interface compatibility but are not used yet, because
        no event is created at this stage.
        """
        soup = BeautifulSoup(content, 'xml')
        for item in soup.find_all('item'):
            if item.title is None or item.description is None:
                # Malformed item: skip it instead of aborting the whole feed.
                continue
            title_text = item.title.text.lower()
            if 'ouverture' not in title_text:
                continue

            when = CExtractor.find_date(title_text)
            if when is None:
                continue

            description = BeautifulSoup(item.description.text, 'html.parser')
            for p in description.select('p'):
                # A paragraph without any '@' cannot contain a nickname, so
                # it cannot yield an open slot.
                if '@' not in p.text:
                    continue
                # TODO: the slots are computed but not consumed yet; combine
                # them with `when`, the item link and `title_ouverture` to
                # register events.
                CExtractor.find_timeslots(p)

        # TODO: template of the event registration call, kept for reference:
        # if title is not None:
        # luuids = [event_url]
        # if uuidrel is not None:
        #     luuids += [uuidrel]
        # self.add_event(
        #     title,
        #     category,
        #     start_day,
        #     location,
        #     description,
        #     tags,
        #     recurrences=recurrences,
        #     uuids=luuids,
        #     url_human=url_human,
        #     start_time=start_time,
        #     end_day=end_day,
        #     end_time=end_time,
        #     last_modified=last_modified,
        #     published=published,
        #     image=image
        # )
        return self.get_structure()