agenda_culturel/src/agenda_culturel/import_tasks/extractor_ical.py

162 lines
6.1 KiB
Python

import icalendar
import warnings
from icalendar import vDatetime
import bbcode
from datetime import datetime, date
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
from .extractor import *
from celery.utils.log import get_task_logger
# Celery task logger named after this module.
logger = get_task_logger(__name__)
class ICALExtractor(Extractor):
    """Extract events from iCalendar content.

    ``extract`` walks every ``VEVENT`` component of the calendar and hands
    each one that has a ``SUMMARY`` to ``add_event``.
    """

    def __init__(self):
        super().__init__()

    def get_item_from_vevent(self, event, name, raw=False):
        """Return property *name* of *event*, or ``None`` when unavailable.

        Args:
            event: component exposing ``decoded(name)``.
            name: property name, e.g. ``"SUMMARY"``.
            raw: when true, return the decoded value untouched; otherwise
                call ``.decode()`` on it to obtain text.

        Returns:
            The value, or ``None`` if reading or decoding the property
            raised an exception.
        """
        try:
            value = event.decoded(name)
            return value if raw else value.decode()
        except Exception:
            # Deliberate best effort: any failure to read the property means
            # "no value".  Unlike a bare ``except``, this lets
            # KeyboardInterrupt and SystemExit propagate.
            return None

    def get_dt_item_from_vevent(self, event, name):
        """Split a date-like property into a ``(day, time)`` pair.

        Returns:
            ``(date, time)`` for a ``datetime`` value, ``(date, None)`` for a
            plain ``date``, and ``(None, None)`` for anything else
            (including a property that could not be read).
        """
        item = self.get_item_from_vevent(event, name, raw=True)
        # datetime is a subclass of date, so it must be tested first.
        if isinstance(item, datetime):
            return item.date(), item.time()
        if isinstance(item, date):
            return item, None
        return None, None

    @staticmethod
    def clean_url(url):
        """Return *url* unchanged.

        Declared static so that it works both as
        ``ICALExtractor.clean_url(url)`` and on an instance.
        """
        return url

    @staticmethod
    def _html_to_text(description):
        """Return *description* as plain text, turning ``<br>`` into newlines."""
        soup = BeautifulSoup(description, features="lxml")
        for line_break in soup.find_all('br'):
            line_break.replace_with('\n')
        return soup.get_text()

    @staticmethod
    def _format_recurrences(recurrence_entries):
        """Serialize recurrence properties as ``NAME:value`` lines.

        Each entry produces one line per value (a list value yields several
        lines), and every entry is terminated by a newline.
        """
        chunks = []
        for key, value in recurrence_entries.items():
            values = value if isinstance(value, list) else [value]
            lines = [key + ":" + v.to_ical().decode() for v in values]
            chunks.append("\n".join(lines) + "\n")
        return "".join(chunks)

    def extract(self, content, url, url_human=None, default_values=None, published=False):
        """Parse *content* as iCalendar data and register its events.

        Args:
            content: iCalendar data accepted by ``icalendar.Calendar.from_ical``.
            url: source URL; used for the header and as prefix of event ids
                (``url + "#" + UID``).
            url_human: forwarded to ``add_event`` unchanged.
            default_values: looked up through ``default_value_if_exists`` for
                ``category``, ``location`` (only when the event has none)
                and ``tags``.
            published: forwarded to ``add_event`` unchanged.

        Returns:
            Whatever ``get_structure()`` returns after all events were added.
        """
        warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

        print("Extracting ical events from {}".format(url))
        self.set_header(url)
        self.clear_events()
        # Number of times each UID has been seen in this calendar.
        self.uuids = {}

        calendar = icalendar.Calendar.from_ical(content)

        for event in calendar.walk('VEVENT'):
            title = self.get_item_from_vevent(event, "SUMMARY")
            category = self.default_value_if_exists(default_values, "category")

            start_day, start_time = self.get_dt_item_from_vevent(event, "DTSTART")
            end_day, end_time = self.get_dt_item_from_vevent(event, "DTEND")

            location = self.get_item_from_vevent(event, "LOCATION")
            if location is None:
                location = self.default_value_if_exists(default_values, "location")

            description = self.get_item_from_vevent(event, "DESCRIPTION")
            if description is not None:
                description = self._html_to_text(description)

            # Reset on every iteration so that an event without UID neither
            # raises NameError nor inherits the previous event's identifier.
            event_url = None
            uuid = self.get_item_from_vevent(event, "UID")
            if uuid is not None:
                if uuid in self.uuids:
                    # Repeated UID: disambiguate with a 4-digit occurrence
                    # suffix (the second occurrence gets ":0001").
                    self.uuids[uuid] += 1
                    uuid += ":{:04}".format(self.uuids[uuid] - 1)
                else:
                    self.uuids[uuid] = 1
                event_url = url + "#" + uuid

            uuidrel = None
            related_to = self.get_item_from_vevent(event, "RELATED-TO")
            if related_to is not None:
                if related_to in self.uuids:
                    self.uuids[related_to] += 1
                    uuidrel = url + "#" + related_to + ":{:04}".format(self.uuids[related_to] - 1)
                    # Possible limitation: only UIDs already seen earlier in
                    # the feed are linked, so the result depends on the
                    # order of the events in the calendar.

            tags = self.default_value_if_exists(default_values, "tags")
            last_modified = self.get_item_from_vevent(event, "LAST-MODIFIED", raw=True)

            recurrence_entries = {}
            for prop in ["RRULE", "EXRULE", "EXDATE", "RDATE"]:
                value = self.get_item_from_vevent(event, prop, raw=True)
                if value is not None:
                    recurrence_entries[prop] = value

            if start_day is not None and len(recurrence_entries) != 0:
                recurrences = self._format_recurrences(recurrence_entries)
            else:
                recurrences = None

            if title is not None:
                # Own identifier first, then the related one; missing
                # identifiers are simply left out.
                luuids = [u for u in (event_url, uuidrel) if u is not None]
                self.add_event(title, category, start_day, location, description, tags, recurrences=recurrences, uuids=luuids, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, last_modified=last_modified, published=published)

        return self.get_structure()
# A variation on the ICAL extractor that removes any event named "Busy"
class ICALNoBusyExtractor(ICALExtractor):
    """ICAL extractor that drops every event whose title is exactly 'Busy'."""

    def add_event(
        self,
        title,
        category,
        start_day,
        location,
        description,
        tags,
        uuids,
        recurrences=None,
        url_human=None,
        start_time=None,
        end_day=None,
        end_time=None,
        last_modified=None,
        published=False,
        image=None,
        image_alt=None,
    ):
        """Forward the event to the parent class unless it is titled 'Busy'."""
        if title == 'Busy':
            return
        super().add_event(
            title,
            category,
            start_day,
            location,
            description,
            tags,
            uuids,
            recurrences,
            url_human,
            start_time,
            end_day,
            end_time,
            last_modified,
            published,
            image,
            image_alt,
        )
# A variation on the ICAL extractor that removes any visual composer anchors
class ICALNoVCExtractor(ICALExtractor):
    """ICAL extractor that runs event descriptions through a bbcode parser.

    The parser is configured with formatters for the ``vc_row``,
    ``vc_column``, ``vc_column_text`` and ``vc_raw_html`` tags only.
    """

    def __init__(self):
        vc_parser = bbcode.Parser(newline="\n", drop_unrecognized=True, install_defaults=False)
        # These tags are rendered with the "%(value)s" template.
        for tag in ("vc_row", "vc_column", "vc_column_text"):
            vc_parser.add_simple_formatter(tag, "%(value)s")
        # This tag is rendered with an empty template.
        vc_parser.add_simple_formatter("vc_raw_html", "")
        self.parser = vc_parser
        super().__init__()

    def clean_vc(self, text):
        """Return *text* formatted by the parser; ``None`` is passed through."""
        return text if text is None else self.parser.format(text)

    def add_event(
        self,
        title,
        category,
        start_day,
        location,
        description,
        tags,
        uuids,
        recurrences=None,
        url_human=None,
        start_time=None,
        end_day=None,
        end_time=None,
        last_modified=None,
        published=False,
        image=None,
        image_alt=None,
    ):
        """Forward the event to the parent class with a cleaned description."""
        cleaned_description = self.clean_vc(description)
        super().add_event(
            title,
            category,
            start_day,
            location,
            cleaned_description,
            tags,
            uuids,
            recurrences,
            url_human,
            start_time,
            end_day,
            end_time,
            last_modified,
            published,
            image,
            image_alt,
        )