diff --git a/experimentations/get_acolab_events.py b/experimentations/get_acolab_events.py
new file mode 100755
index 0000000..428d0df
--- /dev/null
+++ b/experimentations/get_acolab_events.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+# coding: utf-8
+
+import os
+import json
+import sys
+
+# Getting the name of the directory
+# where this file is present.
+current = os.path.dirname(os.path.realpath(__file__))
+
+# Getting the parent directory name
+# where the current directory is present.
+parent = os.path.dirname(current)
+
+# Adding the parent directory to
+# the sys.path.
+sys.path.append(parent)
+
+from src.agenda_culturel.import_tasks.downloader import *
+from src.agenda_culturel.import_tasks.extractor import *
+from src.agenda_culturel.import_tasks.importer import *
+from src.agenda_culturel.import_tasks.custom_extractors import *
+
+
+
+
+
+if __name__ == "__main__":
+
+    u2e = URL2Events(SimpleDownloader(), acolab.CExtractor())
+    url = "https://forum.acolab.fr/c/genericactivity/ouverture/15.rss"
+    url_human = "https://forum.acolab.fr/c/genericactivity/ouverture/15"
+
+    try:
+        events = u2e.process(url, url_human, cache="cache-acolab.html", default_values={"location": "AcoLab"}, published=True)
+
+        exportfile = "events-acolab.json"
+        print("Saving events to file {}".format(exportfile))
+        with open(exportfile, "w") as f:
+            json.dump(events, f, indent=4, default=str)
+    except Exception as e:
+        print("Exception: " + str(e))
diff --git a/src/agenda_culturel/import_tasks/custom_extractors/acolab.py b/src/agenda_culturel/import_tasks/custom_extractors/acolab.py
new file mode 100644
index 0000000..f132cdc
--- /dev/null
+++ b/src/agenda_culturel/import_tasks/custom_extractors/acolab.py
@@ -0,0 +1,146 @@
+import icalendar
+import warnings
+
+import bbcode
+
+from datetime import datetime, date, timedelta
+from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
+
+from ..extractor import *
+
+from celery.utils.log import get_task_logger
+
+logger = get_task_logger(__name__)
+
+
+class CExtractor(Extractor):
+
+    title_ouverture = 'Permanence AcoLab'
+
+    def __init__(self):
+        super().__init__()
+
+    def find_date(text):
+        split_text = text.split(' ')
+        month = Extractor.guess_month(split_text[1])
+        year = split_text[2]
+        if month is None:
+            return None
+        if not year.isnumeric():
+            return None
+        year = int(year)
+
+        return (year, month)
+
+    def is_nickname(text):
+        return '@' in text
+
+    def is_canceled(text):
+        text = Extractor.remove_accents(text.lower())
+        words = ['annule']
+        for word in words:
+            if word in text:
+                return True
+        return False
+
+
+    # ['Samedi 12', '@Manon', '14:30-18:00', 'Dimanches 13', '@gaeldu63', '14h30 (j utilise la scie a format)']
+    # ['Mercredi 16 :']
+    # ['Samedi 19', '@Manon', '14:30-18:00']
+    # ['Samedi 22', ': ANNULE', '@Manon', '14h30 - 18h00']
+    def find_timeslots(p):
+        result = []
+
+        date = None
+        tstart, tend = None, None
+        is_open = False
+        canceled = False
+
+        # for each element in the paragraph
+        for e in p.stripped_strings:
+            day = CExtractor.find_day_name(e)
+            if day is not None:
+                if date is not None and is_open:
+                    # a new day starts: store the previous open slot
+                    result.append((date, tstart, tend))
+                date = day
+                tstart = None
+                tend = None
+                is_open = False
+                canceled = False
+                continue
+
+            if CExtractor.is_nickname(e) and not canceled:
+                # a nickname means a member will open the place
+                is_open = True
+                continue
+
+            hours = CExtractor.find_hours(e)
+            if hours is not None:
+                # we found opening hours
+                tstart = hours[0]
+                tend = hours[1]
+                continue
+
+            if CExtractor.is_canceled(e):
+                # a canceled day stays closed, even if a nickname follows
+                canceled = True
+                is_open = False
+                continue
+
+        if date is not None and is_open:
+            result.append((date, tstart, tend))  # store the last open slot
+
+        return result
+        # e.g. [(10, time(14, 0, 0), time(17, 0, 0))]
+
+    def extract(self, content, url, url_human=None, default_values=None, published=False):
+
+        soup = BeautifulSoup(content, 'xml')
+
+        for item in soup.find_all('item'):
+            item_url = item.link.text
+            title_text = item.title.text.lower()
+            if 'ouverture' not in title_text:
+                continue
+            title = CExtractor.title_ouverture
+
+            when = CExtractor.find_date(title_text)
+            if when is None:
+                continue
+
+            description = BeautifulSoup(item.description.text, 'html.parser')
+            # paragraphs may mention "annule" (canceled)
+            # or "menage" (cleaning session)
+            for p in description.select('p'):
+                timeslots = CExtractor.find_timeslots(p)
+
+                if '@' not in p.text:
+                    continue
+
+
+
+            # if title is not None:
+            #     luuids = [event_url]
+            #     if uuidrel is not None:
+            #         luuids += [uuidrel]
+            #     self.add_event(
+            #         title,
+            #         category,
+            #         start_day,
+            #         location,
+            #         description,
+            #         tags,
+            #         recurrences=recurrences,
+            #         uuids=luuids,
+            #         url_human=url_human,
+            #         start_time=start_time,
+            #         end_day=end_day,
+            #         end_time=end_time,
+            #         last_modified=last_modified,
+            #         published=published,
+            #         image=image
+            #     )
+        return self.get_structure()
+
+
\ No newline at end of file
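
Note: find_timeslots relies on two helpers, CExtractor.find_day_name and CExtractor.find_hours, that this diff does not define; unless the base Extractor provides them, the method will raise AttributeError. Below is a minimal sketch of what they could look like, inferred from the sample strings in the comments ('Samedi 12', '14:30-18:00', '14h30 - 18h00'). The day-name list, number parsing, and hour regex are assumptions, not the author's implementation, and the functions are shown at module level for brevity (they would sit in CExtractor next to find_date):

    import re
    from datetime import time

    # Assumed helper: return the day of month following a French day
    # name, e.g. 'Samedi 12' -> 12, or None when the string is not a
    # day header.
    def find_day_name(text):
        days = ['lundi', 'mardi', 'mercredi', 'jeudi',
                'vendredi', 'samedi', 'dimanche']
        words = text.lower().split()
        for i, w in enumerate(words):
            # 'Dimanches 13' also appears in the feed, hence startswith
            if any(w.startswith(d) for d in days) and i + 1 < len(words):
                number = words[i + 1].rstrip(':')
                if number.isnumeric():
                    return int(number)
        return None

    # Assumed helper: parse '14:30-18:00' or '14h30 - 18h00' into a
    # (start, end) pair of datetime.time objects, or None when no
    # time range is found.
    def find_hours(text):
        m = re.search(r'(\d{1,2})[h:](\d{2})\s*-\s*(\d{1,2})[h:](\d{2})', text)
        if m is None:
            return None
        h1, m1, h2, m2 = (int(g) for g in m.groups())
        return (time(h1, m1), time(h2, m2))

With these in place, find_timeslots('Samedi 19 / @Manon / 14:30-18:00') style paragraphs yield tuples such as (19, time(14, 30), time(18, 0)); a lone '14h30' with no range leaves tstart and tend as None, which matches how the loop resets them per day.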
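Note: as submitted, extract computes when and timeslots but never registers an event; the add_event call is still commented out, and several names in that template (event_url, uuidrel, category, recurrences, ...) are not defined in this method. A minimal sketch of how the loop body could combine the (year, month) pair from find_date with the (day, start, end) tuples from find_timeslots, following the positional order of the commented-out template and assuming its remaining keyword arguments are optional; the category, tags, and uuid scheme below are placeholders, not the author's choices:

    from datetime import date

    # Inside `for p in description.select('p'):`, once `when` holds
    # (year, month) and `timeslots` holds (day, start, end) tuples:
    for day, tstart, tend in timeslots:
        start_day = date(when[0], when[1], day)
        location = (default_values or {}).get("location")
        self.add_event(
            CExtractor.title_ouverture,             # title
            None,                                   # category (placeholder)
            start_day,
            location,
            p.text,                                 # paragraph as description
            [],                                     # tags (placeholder)
            uuids=["{}#{}".format(item_url, day)],  # placeholder uuid scheme
            url_human=url_human,
            start_time=tstart,
            end_time=tend,
            published=published,
        )

The experimentation script would then pick these events up through URL2Events.process and dump them to events-acolab.json.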