From c1073451fd0a3656dde79ad4e735d800ce8c0758 Mon Sep 17 00:00:00 2001 From: Sirius Nottin <30043962+siriusnottin@users.noreply.github.com> Date: Sat, 12 Oct 2024 21:13:48 +0200 Subject: [PATCH] wip --- .../import_tasks/custom_extractors/acolab.py | 81 ++++++++++++++----- 1 file changed, 59 insertions(+), 22 deletions(-) diff --git a/src/agenda_culturel/import_tasks/custom_extractors/acolab.py b/src/agenda_culturel/import_tasks/custom_extractors/acolab.py index 5466b38..579ad4d 100644 --- a/src/agenda_culturel/import_tasks/custom_extractors/acolab.py +++ b/src/agenda_culturel/import_tasks/custom_extractors/acolab.py @@ -1,7 +1,4 @@ -import icalendar -import warnings - -import bbcode +import re from datetime import datetime, date, timedelta from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning @@ -12,6 +9,19 @@ from celery.utils.log import get_task_logger logger = get_task_logger(__name__) +class Timeslot: + def __init__(self, start_time, end_time): + self.start_time = start_time + self.end_time = end_time + + def merge(self, hours): + self.start_time = min(self.start_time, hours.start_time) + + if not hours.end_time is None: + if not self.end_time is None: + self.end_time = max(self.end_time, hours.end_time) + else: + self.end_time = hours.end_time class CExtractor(Extractor): @@ -32,6 +42,21 @@ class CExtractor(Extractor): return (year, month) + def find_hours(text): + text = re.split(r"[ -/=>]+", text) + text = [Extractor.parse_french_time(k) for k in text] + text = [k for k in text if not k is None] + match len(text): + case 0: + return None + case 1: + return Timeslot(text[0], None) + case 2: + return Timeslot(text[0], text[1]) + case _: + return None + + def is_nickname(text): return '@' in text @@ -42,7 +67,27 @@ class CExtractor(Extractor): if word in text: return True return False + + def find_timeslot(text): + text = re.sub(' +', ' ', text).split(' ') + day_name = text[0] + day_num = text[1] + hours = text[2] + if not Extractor.guess_day_name(day_name): + return None + + day_num = [c for c in re.split(r'\D+', day_num) if c != ""] + if len(day_num) == 0: + return None + day_num = int(day_num[0]) + + hours = CExtractor.find_hours(hours) + if hours is None: + return None + + return (day_num, hours) + #['Samedi 12', '@Manon', '14:30-18:00', 'Dimanches 13', '@gaeldu63', '14h30 (j utilise la scie a format)'] #['Mercredi 16 :'] @@ -52,25 +97,22 @@ class CExtractor(Extractor): result = [] date = None - tstart = None - tend = None + slot = None is_open = False # for each element in the paragraph for e in p.stripped_strings: - day = CExtractor.find_day_name(e) + day = CExtractor.find_timeslot(e) if not day is None: if not date is None and is_open: # we reach a new day - result.append((date, tstart, tend)) + result.append((date, slot)) if isinstance(day, tuple): date = day[0] - tstart = day[1] - tend = day[2] + slot = day[1] else: date = day - tstart = None - tend = None + slot = None is_open = False continue elif not is_open: @@ -83,15 +125,10 @@ class CExtractor(Extractor): hours = CExtractor.find_hours(e) if not hours is None: - # we found hours - if tstart is None: - tstart = hours[0] + if slot is None: + slot = hours else: - tstart = min(tstart, hours[0]) - if tend is None: - tend = hours[1] - else: - tend = max(tend, hours[1]) + slot.merge(hours) continue if CExtractor.is_canceled(e): @@ -100,7 +137,7 @@ class CExtractor(Extractor): if not date is None and is_open: # we reach a new day - result.append((date, tstart, tend)) + result.append((date, slot)) return result # [(10, time(14, 0, 0), time(17, 0, 0)), ] @@ -124,7 +161,7 @@ class CExtractor(Extractor): # annule # menage for p in description.select('p'): - CExtractor.find_time_slots(p) + CExtractor.find_timeslots(p) if not '@' in p.text: continue