agenda_culturel/src/agenda_culturel/import_tasks/extractor.py

181 lines
5.5 KiB
Python

from abc import ABC, abstractmethod
from bs4 import BeautifulSoup
from datetime import datetime, time, date, timedelta
import re
import unicodedata
def remove_accents(input_str):
    """Return *input_str* with its combining marks (accents, etc.) removed.

    The text is decomposed with Unicode NFKD normalization, which splits an
    accented letter into its base letter followed by combining characters;
    every combining character is then dropped and the rest is joined back.
    """
    decomposed = unicodedata.normalize("NFKD", input_str)
    return "".join(ch for ch in decomposed if unicodedata.combining(ch) == 0)
class Extractor(ABC):
    """Base class for event extractors.

    An extractor accumulates a ``header`` dict (source URL and extraction
    date) and a list of ``events`` dicts, exposed together through
    :meth:`get_structure`.  Concrete subclasses implement :meth:`extract`;
    this base class provides helpers to parse French dates and times and to
    record events.
    """

    # Accent-free, lower-case prefixes of the French month names, January
    # first: the month number is the position in this tuple plus one.
    _MONTH_PREFIXES = (
        "jan", "fe", "mar", "av", "mai", "juin",
        "juill", "ao", "sep", "oct", "nov", "dec",
    )

    # Date patterns, tried in order.  Both capture (day, month name, year).
    _DATE_PATTERNS = (
        # format: DayName Number Month Year
        "[a-zA-ZéÉûÛ:.]+[ ]*([0-9]+)[er]*[ ]*([a-zA-ZéÉûÛ:.]+)[ ]*([0-9]+)",
        # format: Number Month Year
        "([0-9]+)[er]*[ ]*([a-zA-ZéÉûÛ:.]+)[ ]*([0-9]+)",
    )

    # Time patterns, tried in order, from the most to the least precise.
    _TIME_PATTERNS = (
        # format: hours minutes seconds
        "([0-9]+)[ a-zA-Z:.]+([0-9]+)[ a-zA-Z:.]+([0-9]+)",
        # format: hours minutes
        "([0-9]+)[ hH:.]+([0-9]+)",
        # format: hours
        "([0-9]+)[ Hh:.]",
    )

    def __init__(self):
        self.header = {}
        self.events = []
        self.downloader = None

    def guess_end_day(self, start_day, start_time, end_time):
        """Guess the day on which an event ends.

        Returns ``start_day`` when the end time is strictly after the start
        time, and the following day otherwise (the event is assumed to run
        past midnight).  When either time is missing there is nothing to
        compare, so ``start_day`` is returned.
        """
        # Guarding on start_time as well avoids a TypeError when comparing a
        # time with None.
        if not end_time or start_time is None:
            return start_day
        if end_time > start_time:
            return start_day
        return start_day + timedelta(days=1)

    def guess_month(self, text):
        """Return the month number (1-12) whose French name starts *text*.

        Accents and case are ignored.  Returns ``None`` when *text* does not
        start with any known month prefix.
        """
        normalized = remove_accents(text).lower()
        for number, prefix in enumerate(Extractor._MONTH_PREFIXES, start=1):
            if normalized.startswith(prefix):
                return number
        return None

    def parse_french_date(self, text):
        """Extract a ``date`` from a French date such as "Samedi 12 mars 2024".

        Two layouts are recognised: "DayName Number Month Year" and
        "Number Month Year".  A year below 100 is taken to be in the 2000s.
        Returns ``None`` when no layout matches, when the month name is not
        recognised, or when the values do not form a valid calendar date.
        """
        found = None
        for pattern in Extractor._DATE_PATTERNS:
            found = re.search(pattern, text)
            if found:
                break
        if found is None:
            # TODO: handle the date layouts not covered by the patterns above
            return None

        month = self.guess_month(found.group(2))
        if month is None:
            return None

        try:
            day = int(found.group(1))
            year = int(found.group(3))
        except ValueError:
            return None

        if year < 100:
            year = 2000 + year
        if day >= 32:
            return None

        # date() rejects impossible combinations (31 February, day 0, year
        # out of range); report them as "no date" like every other failure.
        try:
            return date(year, month, day)
        except (ValueError, OverflowError):
            return None

    def parse_french_time(self, text):
        """Extract a ``time`` from text such as "20h30", "20:30:15" or "20h".

        Missing minutes and seconds default to zero.  Returns ``None`` when no
        time is found or when a field is out of range (hours >= 24, minutes
        or seconds >= 60).
        """
        found = None
        for pattern in Extractor._TIME_PATTERNS:
            found = re.search(pattern, text)
            if found:
                break
        if found is None:
            return None

        # Pad the captured fields so that hours-only and hours-minutes
        # matches also yield three values.
        fields = (found.groups() + ("0", "0"))[:3]
        try:
            hours, minutes, seconds = (int(field) for field in fields)
        except ValueError:
            return None

        if hours >= 24 or minutes >= 60 or seconds >= 60:
            return None
        return time(hours, minutes, seconds)

    @abstractmethod
    def extract(self, content, url, url_human=None, default_values=None, published=False):
        """Extract events from *content*; must be implemented by subclasses.

        The meaning of the arguments and of the return value is defined by
        the concrete extractors.
        """

    def set_downloader(self, downloader):
        """Store the downloader object in ``self.downloader``."""
        self.downloader = downloader

    def set_header(self, url):
        """Record the source *url* and the current date/time in the header."""
        self.header["url"] = url
        self.header["date"] = datetime.now()

    def clear_events(self):
        """Forget every event recorded so far."""
        self.events = []

    def add_event(
        self,
        title,
        category,
        start_day,
        location,
        description,
        tags,
        uuids,
        recurrences=None,
        url_human=None,
        start_time=None,
        end_day=None,
        end_time=None,
        last_modified=None,
        published=False,
        image=None,
        image_alt=None,
    ):
        """Append an event dict to ``self.events``.

        ``title`` and ``start_day`` are mandatory: when either is ``None`` an
        error is printed and nothing is added.  The optional fields
        (``url_human``, ``start_time``, ``end_day``, ``end_time``,
        ``last_modified``, ``recurrences``) are only stored when they are not
        ``None``.
        """
        if title is None:
            print("ERROR: cannot import an event without name")
            return
        if start_day is None:
            print("ERROR: cannot import an event without start day")
            return

        event = {
            "title": title,
            "category": category,
            "start_day": start_day,
            "uuids": uuids,
            "location": location,
            "description": description,
            "tags": tags,
            "published": published,
            "image": image,
            "image_alt": image_alt,
        }

        # TODO: why url_human rather than reference_url?
        optional_fields = {
            "url_human": url_human,
            "start_time": start_time,
            "end_day": end_day,
            "end_time": end_time,
            "last_modified": last_modified,
            "recurrences": recurrences,
        }
        for key, value in optional_fields.items():
            if value is not None:
                event[key] = value

        self.events.append(event)

    def default_value_if_exists(self, default_values, key):
        """Return ``default_values[key]``, or ``None`` when it is unavailable.

        ``None`` is returned both when *default_values* is ``None`` and when
        *key* is not in it.
        """
        if default_values is not None and key in default_values:
            return default_values[key]
        return None

    def get_structure(self):
        """Return the header and the recorded events as a single dict."""
        return {"header": self.header, "events": self.events}

    # NOTE(review): the reviewed copy had lost its indentation; the two
    # helpers below are assumed to live on the class (they take no ``self``)
    # -- confirm against their callers.

    @staticmethod
    def clean_url(url):
        """Return *url* after passing it through each known extractor's
        ``clean_url``, in turn (iCal first, then Facebook).

        NOTE(review): the class body previously also declared an abstract
        ``clean_url`` stub before this definition; being defined later under
        the same name, this function replaced it when the class was created,
        so the stub was never enforced on subclasses and has been dropped.
        The extractor classes used below are presumably expected to provide
        their own ``clean_url`` -- verify.
        """
        # Imported here rather than at module level, presumably to avoid a
        # circular import with the extractor modules -- TODO confirm.
        from .extractor_ical import ICALExtractor
        from .extractor_facebook import FacebookEventExtractor

        result = url
        for extractor_class in (ICALExtractor, FacebookEventExtractor):
            result = extractor_class.clean_url(result)
        return result

    @staticmethod
    def get_default_extractors(single_event=False):
        """Return the list of extractor instances to try by default.

        With ``single_event=True`` only a Facebook extractor configured for a
        single event is returned; otherwise an iCal extractor followed by a
        Facebook extractor.
        """
        from .extractor_ical import ICALExtractor
        from .extractor_facebook import FacebookEventExtractor

        if single_event:
            return [FacebookEventExtractor(single_event=True)]
        return [ICALExtractor(), FacebookEventExtractor(single_event=False)]