[WIP] New source: AcoLab #165

Draft
siriusnottin wants to merge 8 commits from feat-parser-acolab into main
3 changed files with 280 additions and 7 deletions

View File

@ -0,0 +1,43 @@
#!/usr/bin/env python3
# coding: utf-8
import os
import json
import sys
# Directory that holds this script (symbolic links resolved).
current = os.path.split(os.path.realpath(__file__))[0]
# Its parent directory is appended to the module search path so that the
# ``src.…`` imports below can be resolved when the script is run directly
# (assumes the ``src`` package lives in that parent directory).
parent = os.path.split(current)[0]
sys.path.append(parent)
from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *
if __name__ == "__main__":
    # Manual check of the AcoLab extractor: feed the forum's RSS URL through
    # URL2Events and save whatever it returns as JSON in the working directory.
    url = "https://forum.acolab.fr/c/genericactivity/ouverture/15.rss"
    url_human = "https://forum.acolab.fr/c/genericactivity/ouverture/15"
    exportfile = "events-acolab.json"

    u2e = URL2Events(SimpleDownloader(), acolab.CExtractor())

    try:
        events = u2e.process(
            url,
            url_human,
            cache="cache-acolab.html",
            default_values={"location": "AcoLab"},
            published=True,
        )
        print("Saving events to file {}".format(exportfile))
        with open(exportfile, "w") as out:
            # default=str: values json cannot encode natively are written
            # as their str() form.
            json.dump(events, out, indent=4, default=str)
    except Exception as err:
        # Any failure while processing or saving is only reported on stdout.
        print("Exception: " + str(err))

View File

@ -0,0 +1,205 @@
import re
from datetime import datetime, date, timedelta
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
from ..extractor import *
from celery.utils.log import get_task_logger
logger = get_task_logger(__name__)
class Timeslot:
    """Time interval with a start and an optional end.

    ``end_time`` may be ``None`` when only the start of the slot is known.
    """

    def __init__(self, start_time, end_time):
        self.start_time = start_time
        self.end_time = end_time

    def __repr__(self):
        # Readable form for debug output and logs instead of the default
        # "<Timeslot object at 0x...>".
        return f"{type(self).__name__}({self.start_time!r}, {self.end_time!r})"

    def merge(self, hours):
        """Widen this slot in place so that it also covers ``hours``.

        The start becomes the earlier of the two starts. The end becomes the
        later of the two ends; an end that is ``None`` on one side is simply
        replaced by the other side's value.

        Args:
            hours: another ``Timeslot`` whose bounds are folded into this one.
        """
        self.start_time = min(self.start_time, hours.start_time)
        if hours.end_time is None:
            # Nothing known about the other end: keep ours untouched.
            return
        if self.end_time is None:
            self.end_time = hours.end_time
        else:
            self.end_time = max(self.end_time, hours.end_time)
class CExtractor(Extractor):
    """Extractor for the "ouverture" topics of the AcoLab forum RSS feed.

    Work in progress: ``extract`` walks the feed and prints the time slots it
    finds in each topic, but does not register any event yet.
    """

    title_ouverture = 'Permanence AcoLab'

    def __init__(self):
        super().__init__()

    @staticmethod
    def find_date(text):
        """Read a month and a year from the 2nd and 3rd words of ``text``.

        Args:
            text: whitespace-separated words; the first word is ignored.

        Returns:
            ``(year, month)`` where ``month`` is the value returned by
            ``Extractor.guess_month``, or ``None`` when there are fewer than
            three words, the month is not recognised or the year is not
            numeric.
        """
        words = text.split()
        if len(words) < 3:
            return None
        month = Extractor.guess_month(words[1])
        if month is None:
            return None
        year = words[2]
        if not year.isnumeric():
            return None
        return (int(year), month)

    @staticmethod
    def find_hours(text):
        """Build a ``Timeslot`` from the one or two times found in ``text``.

        Returns:
            ``Timeslot(start, None)`` for a single time, ``Timeslot(start,
            end)`` for two, and ``None`` when no time or more than two times
            are found.
        """
        # NOTE: inside the character class, " -/" is a range running from the
        # space character to "/", so punctuation such as "'", "(", ")", ","
        # and "." also acts as a separator, in addition to "=" and ">".
        tokens = re.split(r"[ -/=>]+", text)
        candidates = (Extractor.parse_french_time(token) for token in tokens)
        times = [t for t in candidates if t is not None]
        if len(times) == 1:
            return Timeslot(times[0], None)
        if len(times) == 2:
            return Timeslot(times[0], times[1])
        return None

    @staticmethod
    def is_nickname(text):
        """Tell whether ``text`` contains an ``@`` (forum user mention)."""
        return '@' in text

    @staticmethod
    def is_canceled(text):
        """Tell whether ``text`` mentions a cancellation (accents/case ignored)."""
        text = Extractor.remove_accents(text.lower())
        words = ['annule']
        return any(word in text for word in words)

    @staticmethod
    def find_timeslot(text):
        """Parse a day header such as ``"Samedi 12"`` or ``"Samedi 12 14h30"``.

        The first word must be recognised by ``Extractor.guess_day_name`` and
        the second must contain a number. A third word, when present, is
        handed to ``find_hours``.

        Returns:
            ``(day_number, hours)`` where ``hours`` is a ``Timeslot`` or
            ``None`` when the header carries no usable time, or ``None`` when
            ``text`` is not a day header.
        """
        tokens = text.split()
        # Hours are optional: a header is often just "<day name> <number>",
        # the times being given in a following string.
        if len(tokens) < 2:
            return None
        if not Extractor.guess_day_name(tokens[0]):
            return None
        digits = re.search(r'\d+', tokens[1])
        if digits is None:
            return None
        hours = CExtractor.find_hours(tokens[2]) if len(tokens) > 2 else None
        return (int(digits.group()), hours)

    @staticmethod
    def find_timeslots(p):
        """Collect the opening slots described by one paragraph.

        The strings of ``p`` are read in order. A day header starts a new
        day, a nickname marks the current day as open, times are merged into
        the current day's slot, and a cancellation marker discards the day.

        Examples of string sequences this is meant to handle:
            ['Samedi 12', '@Manon', '14:30-18:00', 'Dimanches 13',
             '@gaeldu63', '14h30 (j utilise la scie a format)']
            ['Mercredi 16 :']
            ['Samedi 19', '@Manon', '14:30-18:00']
            ['Samedi 22', ': ANNULE', '@Manon', '14h30 - 18h00']

        Args:
            p: object exposing ``stripped_strings`` (a BeautifulSoup tag in
                ``extract``).

        Returns:
            List of ``(day_number, slot)`` tuples, one per open and
            non-canceled day; ``slot`` is a ``Timeslot`` or ``None`` when no
            time was found for that day.
        """
        result = []
        day_num = None
        slot = None
        is_open = False
        canceled = False

        # for each element in the paragraph
        for e in p.stripped_strings:
            day = CExtractor.find_timeslot(e)
            if day is not None:
                print('on a une date', day)
                # we reach a new day: store the previous one if it is valid
                if day_num is not None and is_open and not canceled:
                    result.append((day_num, slot))
                day_num, slot = day
                is_open = False
                # the header itself may carry the cancellation marker
                canceled = CExtractor.is_canceled(e)
                continue

            # Nicknames must be tested before anything is skipped: they are
            # what turns a day into an open one.
            if CExtractor.is_nickname(e):
                print('on a un nickname', e)
                is_open = True
                continue

            if day_num is None:
                # nothing to attach hours or a cancellation to yet
                continue

            hours = CExtractor.find_hours(e)
            if hours is not None:
                print('on a une heure', hours)
                if slot is None:
                    slot = hours
                else:
                    slot.merge(hours)
                continue

            if CExtractor.is_canceled(e):
                print('on a un cancel')
                # Kept in a dedicated flag so that a nickname following the
                # marker does not re-open the day.
                canceled = True

        # store the last day of the paragraph
        if day_num is not None and is_open and not canceled:
            result.append((day_num, slot))
        return result

    def extract(self, content, url, url_human=None, default_values=None, published=False):
        """Parse the RSS feed and look for opening slots in each topic.

        Only items whose title contains "ouverture" and from which
        ``find_date`` can read a month and a year are considered. No event is
        added yet (see the TODO below); the slots found are printed.

        Args:
            content: RSS document, parsed with BeautifulSoup's 'xml' parser.
            url: not used by the current implementation.
            url_human: not used by the current implementation.
            default_values: not used by the current implementation.
            published: not used by the current implementation.

        Returns:
            The value of ``self.get_structure()``.
        """
        soup = BeautifulSoup(content, 'xml')
        for item in soup.find_all('item'):
            # not used yet; see the TODO below (event uuids)
            item_url = item.link.text
            title_text = item.title.text.lower()
            if 'ouverture' not in title_text:
                continue
            title = CExtractor.title_ouverture
            when = CExtractor.find_date(title_text)
            if when is None:
                continue
            # the item description holds HTML, parsed separately
            description = BeautifulSoup(item.description.text, 'html.parser')
            for p in description.select('p'):
                ts = CExtractor.find_timeslots(p)
                print(ts, when, title)
                # TODO: build the events from (when, ts), along the lines of:
                # if title is not None:
                #     luuids = [event_url]
                #     if uuidrel is not None:
                #         luuids += [uuidrel]
                #     self.add_event(
                #         title,
                #         category,
                #         start_day,
                #         location,
                #         description,
                #         tags,
                #         recurrences=recurrences,
                #         uuids=luuids,
                #         url_human=url_human,
                #         start_time=start_time,
                #         end_day=end_day,
                #         end_time=end_time,
                #         last_modified=last_modified,
                #         published=published,
                #         image=image
                #     )
        return self.get_structure()

View File

@ -28,8 +28,38 @@ class Extractor(ABC):
else: else:
return start_day return start_day
def guess_startswith(text, strs):
    """Return the 1-based position of the first entry of ``strs`` prefixing ``text``.

    ``text`` goes through ``Extractor.remove_accents`` and is lower-cased
    before the comparison; the entries of ``strs`` are compared as given.
    Returns ``None`` when no entry is a prefix of the normalised text.
    """
    normalized = Extractor.remove_accents(text).lower()
    return next(
        (
            rank
            for rank, prefix in enumerate(strs, start=1)
            if normalized.startswith(prefix)
        ),
        None,
    )
def guess_day_name(text, exact=False):
    """Return the 1-based weekday index (lundi = 1 ... dimanche = 7) of ``text``.

    With ``exact`` set, ``text`` must start with the full French day name;
    otherwise a short prefix ("lun", "mar", ..., "sa", "di") is enough.
    The match is a prefix test in both cases, so trailing characters are
    accepted. Returns ``None`` when no day name matches.
    """
    full_names = [
        "lundi",
        "mardi",
        "mercredi",
        "jeudi",
        "vendredi",
        "samedi",
        "dimanche",
    ]
    short_prefixes = [
        "lun",
        "mar",
        "mer",
        "jeu",
        "ven",
        "sa",
        "di",
    ]
    candidates = full_names if exact else short_prefixes
    return Extractor.guess_startswith(text, candidates)
def guess_month(text): def guess_month(text):
mths = [ return Extractor.guess_startswith(text, [
"jan", "jan",
"fe", "fe",
"mar", "mar",
@ -42,12 +72,7 @@ class Extractor(ABC):
"oct", "oct",
"nov", "nov",
"dec", "dec",
] ])
t = remove_accents(text).lower()
for i, m in enumerate(mths):
if t.startswith(m):
return i + 1
return None
def parse_french_date(text): def parse_french_date(text):
# format NomJour Numero Mois Année # format NomJour Numero Mois Année