feat: parser for acolab
This commit is contained in:
parent
521f904778
commit
b56cbf66a0
43
experimentations/get_acolab_events.py
Executable file
43
experimentations/get_acolab_events.py
Executable file
@ -0,0 +1,43 @@
|
||||
#!/usr/bin/env python3
|
||||
# coding: utf-8
|
||||
|
||||
import os
|
||||
import json
|
||||
import sys
|
||||
|
||||
# Resolve the directory that holds this script, then the directory one level
# above it, and append the latter to sys.path so that the ``src`` package
# imported below can be found when the script is launched directly.
script_path = os.path.realpath(__file__)
current = os.path.dirname(script_path)
parent = os.path.dirname(current)
sys.path.append(parent)
|
||||
|
||||
from src.agenda_culturel.import_tasks.downloader import *
|
||||
from src.agenda_culturel.import_tasks.extractor import *
|
||||
from src.agenda_culturel.import_tasks.importer import *
|
||||
from src.agenda_culturel.import_tasks.custom_extractors import *
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Download the AcoLab "ouverture" RSS feed, run it through the AcoLab
    # extractor and dump whatever comes back to a JSON file in the current
    # working directory.
    url = "https://forum.acolab.fr/c/genericactivity/ouverture/15.rss"
    url_human = "https://forum.acolab.fr/c/genericactivity/ouverture/15"
    exportfile = "events-acolab.json"

    downloader = SimpleDownloader()
    extractor = acolab.CExtractor()
    u2e = URL2Events(downloader, extractor)

    try:
        events = u2e.process(
            url,
            url_human,
            cache="cache-acolab.html",
            default_values={"location": "AcoLab"},
            published=True,
        )

        print("Saving events to file {}".format(exportfile))
        with open(exportfile, "w") as f:
            # default=str stringifies any value json cannot serialize itself.
            json.dump(events, f, indent=4, default=str)
    except Exception as e:
        # Top-level boundary of an experimentation script: report and stop.
        print("Exception: " + str(e))
|
146
src/agenda_culturel/import_tasks/custom_extractors/acolab.py
Normal file
146
src/agenda_culturel/import_tasks/custom_extractors/acolab.py
Normal file
@ -0,0 +1,146 @@
|
||||
import icalendar
|
||||
import warnings
|
||||
|
||||
import bbcode
|
||||
|
||||
from datetime import datetime, date, timedelta
|
||||
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
|
||||
|
||||
from ..extractor import *
|
||||
|
||||
from celery.utils.log import get_task_logger
|
||||
|
||||
logger = get_task_logger(__name__)
|
||||
|
||||
|
||||
class CExtractor(Extractor):
    """Extractor for the AcoLab forum RSS feed listing opening slots.

    ``extract`` walks the RSS ``<item>`` elements whose title contains
    "ouverture", reads a month and a year from the title, and scans the
    paragraphs of the HTML description for days, ``@nickname`` mentions and
    opening hours.

    NOTE(review): event creation is not wired in yet. ``extract`` parses the
    feed but does not call ``add_event``; see the TODO at the end of
    ``extract``.
    """

    # Title intended for every generated opening event.
    title_ouverture = 'Permanence AcoLab'

    def __init__(self):
        super().__init__()

    @staticmethod
    def find_date(text):
        """Return ``(year, month)`` read from a topic title, or ``None``.

        The title is split on whitespace. The second word is passed to
        ``Extractor.guess_month`` and the third word must be a decimal year.
        ``None`` is returned when the title has fewer than three words, when
        the month is not recognised, or when the year is not a number.
        """
        words = text.split()
        if len(words) < 3:
            # Too short to hold "<something> <month> <year>".
            return None

        month = Extractor.guess_month(words[1])
        year = words[2]
        # isdecimal() only accepts characters that int() can convert.
        if month is None or not year.isdecimal():
            return None

        return (int(year), month)

    @staticmethod
    def is_nickname(text):
        """Return True when *text* contains an ``@`` (a forum mention)."""
        return '@' in text

    @staticmethod
    def is_canceled(text):
        """Return True when *text* mentions a cancellation.

        The text is lower-cased and passed through
        ``Extractor.remove_accents`` before looking for the keywords.
        """
        normalized = Extractor.remove_accents(text.lower())
        keywords = ['annule']
        return any(keyword in normalized for keyword in keywords)

    @staticmethod
    def find_timeslots(p):
        """Collect the open time slots described in paragraph *p*.

        *p* is a BeautifulSoup tag; its ``stripped_strings`` are read in
        order. Examples of string sequences seen in the feed:

            ['Samedi 12', '@Manon', '14:30-18:00', 'Dimanches 13',
             '@gaeldu63', '14h30 (j utilise la scie a format)']
            ['Mercredi 16 :']
            ['Samedi 19', '@Manon', '14:30-18:00']
            ['Samedi 22', ': ANNULE', '@Manon', '14h30 - 18h00']

        A day becomes "open" once a nickname follows it, and is dropped if
        any string of that day mentions a cancellation, whether it comes
        before or after the nickname.

        Returns a list of ``(day, start, end)`` tuples, one per open day.
        ``start`` and ``end`` are ``None`` when no hours were found.

        NOTE(review): ``find_day_name`` and ``find_hours`` are not defined in
        this class; presumably they are provided by ``Extractor`` -- confirm.
        The result presumably looks like
        ``[(10, time(14, 0, 0), time(17, 0, 0))]`` -- depends on those helpers.
        """
        result = []

        current_day = None
        tstart = None
        tend = None
        is_open = False
        canceled = False

        def flush():
            # Record the day being parsed if somebody opens and nobody canceled.
            if current_day is not None and is_open and not canceled:
                result.append((current_day, tstart, tend))

        # for each element in the paragraph
        for e in p.stripped_strings:
            day = CExtractor.find_day_name(e)
            if day is not None:
                # we reach a new day: store the previous one, then reset state
                flush()
                current_day = day
                tstart = None
                tend = None
                is_open = False
                canceled = False
                continue

            if current_day is None:
                # text located before the first day header is ignored
                continue

            if CExtractor.is_nickname(e):
                # we found a nickname: somebody opens that day
                is_open = True
                continue

            hours = CExtractor.find_hours(e)
            if hours is not None:
                # we found hours
                tstart = hours[0]
                tend = hours[1]
                continue

            if CExtractor.is_canceled(e):
                # sticky for the whole day, so a later nickname cannot undo it
                canceled = True
                continue

        # do not forget the last day of the paragraph
        flush()

        return result

    def extract(self, content, url, url_human=None, default_values=None, published=False):
        """Parse the RSS feed *content* and return ``self.get_structure()``.

        Items whose title does not contain "ouverture", or whose title does
        not yield a ``(year, month)`` through ``find_date``, are skipped. For
        the remaining items the HTML description is parsed and the time
        slots of every paragraph containing an ``@`` are computed.

        No event is added yet (see the TODO below), so ``url``, ``url_human``,
        ``default_values`` and ``published`` are currently unused.
        """
        soup = BeautifulSoup(content, 'xml')

        for item in soup.find_all('item'):
            item_url = item.link.text
            title_text = item.title.text.lower()
            if 'ouverture' not in title_text:
                continue
            title = CExtractor.title_ouverture

            when = CExtractor.find_date(title_text)
            if when is None:
                continue

            description = BeautifulSoup(item.description.text, 'html.parser')
            for p in description.select('p'):
                # A paragraph without any @nickname cannot hold an open slot.
                if '@' not in p.text:
                    continue

                timeslots = CExtractor.find_timeslots(p)

                # TODO: build one event per entry of ``timeslots`` using
                # ``when``, ``title`` and ``item_url``. Template of the call
                # left by the original author:
                #
                # if title is not None:
                #     luuids = [event_url]
                #     if uuidrel is not None:
                #         luuids += [uuidrel]
                #     self.add_event(
                #         title,
                #         category,
                #         start_day,
                #         location,
                #         description,
                #         tags,
                #         recurrences=recurrences,
                #         uuids=luuids,
                #         url_human=url_human,
                #         start_time=start_time,
                #         end_day=end_day,
                #         end_time=end_time,
                #         last_modified=last_modified,
                #         published=published,
                #         image=image
                #     )

        return self.get_structure()
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user