181 lines
5.5 KiB
Python
181 lines
5.5 KiB
Python
from abc import ABC, abstractmethod
|
|
from bs4 import BeautifulSoup
|
|
from datetime import datetime, time, date, timedelta
|
|
import re
|
|
import unicodedata
|
|
|
|
|
|
def remove_accents(input_str):
    """Return *input_str* with all combining accent marks stripped (e.g. 'é' -> 'e')."""
    decomposed = unicodedata.normalize('NFKD', input_str)
    kept = (char for char in decomposed if not unicodedata.combining(char))
    return "".join(kept)
|
|
|
|
class Extractor(ABC):
|
|
|
|
def __init__(self):
|
|
self.header = {}
|
|
self.events = []
|
|
self.downloader = None
|
|
|
|
def guess_end_day(self, start_day, start_time, end_time):
|
|
if end_time:
|
|
if end_time > start_time:
|
|
return start_day
|
|
else:
|
|
return start_day + timedelta(days=1)
|
|
else:
|
|
return start_day
|
|
|
|
def guess_month(self, text):
|
|
mths = ["jan", "fe", "mar", "av", "mai", "juin", "juill", "ao", "sep", "oct", "nov", "dec"]
|
|
t = remove_accents(text).lower()
|
|
for i, m in enumerate(mths):
|
|
if t.startswith(m):
|
|
return i + 1
|
|
return None
|
|
|
|
def parse_french_date(self, text):
|
|
# format NomJour Numero Mois Année
|
|
m = re.search('[a-zA-ZéÉûÛ:.]+[ ]*([0-9]+)[er]*[ ]*([a-zA-ZéÉûÛ:.]+)[ ]*([0-9]+)', text)
|
|
if m:
|
|
day = m.group(1)
|
|
month = self.guess_month(m.group(2))
|
|
year = m.group(3)
|
|
else:
|
|
# format Numero Mois Annee
|
|
m = re.search('([0-9]+)[er]*[ ]*([a-zA-ZéÉûÛ:.]+)[ ]*([0-9]+)', text)
|
|
if m:
|
|
day = m.group(1)
|
|
month = self.guess_month(m.group(2))
|
|
year = m.group(3)
|
|
else:
|
|
# TODO: consolider les cas non satisfaits
|
|
return None
|
|
|
|
if month is None:
|
|
return None
|
|
try:
|
|
day = int(day)
|
|
year = int(year)
|
|
except:
|
|
return None
|
|
if year < 100:
|
|
year = 2000 + year
|
|
if day >= 32:
|
|
return None
|
|
return date(year, month, day)
|
|
|
|
def parse_french_time(self, text):
|
|
# format heures minutes secondes
|
|
m = re.search('([0-9]+)[ a-zA-Z:.]+([0-9]+)[ a-zA-Z:.]+([0-9]+)', text)
|
|
if m:
|
|
h = m.group(1)
|
|
m = m.group(2)
|
|
s = m.group(3)
|
|
else:
|
|
# format heures minutes
|
|
m = re.search('([0-9]+)[ hH:.]+([0-9]+)', text)
|
|
if m:
|
|
h = m.group(1)
|
|
m = m.group(2)
|
|
s = "0"
|
|
else:
|
|
# format heures
|
|
m = re.search('([0-9]+)[ Hh:.]', text)
|
|
if m:
|
|
h = m.group(1)
|
|
m = "0"
|
|
s = "0"
|
|
else:
|
|
return None
|
|
|
|
try:
|
|
h = int(h)
|
|
m = int(m)
|
|
s = int(s)
|
|
except:
|
|
return None
|
|
if h >= 24 or m >= 60 or s >= 60:
|
|
return None
|
|
return time(h, m, s)
|
|
|
|
|
|
|
|
    @abstractmethod
    def extract(self, content, url, url_human = None, default_values = None, published = False):
        """Extract events from *content* retrieved at *url*.

        Abstract hook: each concrete extractor implements the parsing for
        its source. Implementations presumably record results via
        add_event() and default_value_if_exists() — TODO confirm against
        subclasses (not visible here).
        """
        pass
|
|
|
|
def set_downloader(self, downloader):
|
|
self.downloader = downloader
|
|
|
|
    @abstractmethod
    def clean_url(url):
        """Return a canonical form of *url* for this extractor type.

        NOTE(review): there is no *self* parameter — callers invoke this on
        the class itself (see the module-level clean_url below), so it is
        effectively static; consider adding @staticmethod — confirm with
        subclass implementations.
        """
        pass
|
|
|
|
def set_header(self, url):
|
|
self.header["url"] = url
|
|
self.header["date"] = datetime.now()
|
|
|
|
def clear_events(self):
|
|
self.events = []
|
|
|
|
def add_event(self, title, category, start_day, location, description, tags, uuids, recurrences=None, url_human=None, start_time=None, end_day=None, end_time=None, last_modified=None, published=False, image=None, image_alt=None):
|
|
if title is None:
|
|
print("ERROR: cannot import an event without name")
|
|
return
|
|
if start_day is None:
|
|
print("ERROR: cannot import an event without start day")
|
|
return
|
|
|
|
event = {
|
|
"title": title,
|
|
"category": category,
|
|
"start_day": start_day,
|
|
"uuids": uuids,
|
|
"location": location,
|
|
"description": description,
|
|
"tags": tags,
|
|
"published": published,
|
|
"image": image,
|
|
"image_alt": image_alt
|
|
}
|
|
# TODO: pourquoi url_human et non reference_url
|
|
if url_human is not None:
|
|
event["url_human"] = url_human
|
|
if start_time is not None:
|
|
event["start_time"] = start_time
|
|
if end_day is not None:
|
|
event["end_day"] = end_day
|
|
if end_time is not None:
|
|
event["end_time"] = end_time
|
|
|
|
if last_modified is not None:
|
|
event["last_modified"] = last_modified
|
|
|
|
if recurrences is not None:
|
|
event["recurrences"] = recurrences
|
|
|
|
self.events.append(event)
|
|
|
|
def default_value_if_exists(self, default_values, key):
|
|
return default_values[key] if default_values is not None and key in default_values else None
|
|
|
|
def get_structure(self):
|
|
return { "header": self.header, "events": self.events}
|
|
|
|
def clean_url(url):
    """Canonicalize *url* by chaining every known extractor's URL cleaner."""
    # imported lazily to avoid a circular import with the extractor modules
    from .extractor_ical import ICALExtractor
    from .extractor_facebook import FacebookEventExtractor

    cleaned = url
    for extractor_class in [ICALExtractor, FacebookEventExtractor]:
        cleaned = extractor_class.clean_url(cleaned)
    return cleaned
|
|
|
|
def get_default_extractors(single_event=False):
    """Build the default extractor list.

    With single_event=True only the Facebook extractor (in single-event
    mode) is returned; otherwise both the iCal and Facebook extractors.
    """
    # imported lazily to avoid a circular import with the extractor modules
    from .extractor_ical import ICALExtractor
    from .extractor_facebook import FacebookEventExtractor

    if single_event:
        return [FacebookEventExtractor(single_event=True)]
    return [ICALExtractor(), FacebookEventExtractor(single_event=False)]