#!/usr/bin/python3
# coding: utf-8
import json
import os
import pickle
import urllib.request
from abc import ABC, abstractmethod
from datetime import datetime, date
from urllib.parse import urlparse

import icalendar
from bs4 import BeautifulSoup
from icalendar import vDatetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
class Downloader(ABC):
    """Abstract base class for page downloaders.

    Concrete subclasses implement :meth:`download` to fetch the content
    of a URL and return it as text.
    """

    def __init__(self):
        pass

    @abstractmethod
    def download(self, url):
        """Fetch *url* and return its content as a string."""
        pass
class SimpleDownloader(Downloader):
    """Downloader that fetches a URL with urllib (no JavaScript rendering)."""

    def __init__(self):
        super().__init__()

    def download(self, url):
        """Fetch *url* and return the decoded body, or None when the request fails.

        Fixes vs. the original:
        - the response is closed via a context manager;
        - falls back to UTF-8 when the response declares no charset (the
          original crashed on ``decode(None)`` and the bare ``except``
          silently turned that into None);
        - the bare ``except`` (which also swallowed KeyboardInterrupt) is
          narrowed to the exceptions urlopen actually raises.
        """
        print("Downloading {}".format(url))
        try:
            with urllib.request.urlopen(url) as resource:
                # get_content_charset() returns None when absent -> default to UTF-8
                charset = resource.headers.get_content_charset() or "utf-8"
                return resource.read().decode(charset)
        except (ValueError, OSError) as e:
            # ValueError: malformed/unsupported URL;
            # OSError covers urllib.error.URLError (network/HTTP failures).
            print("Download failed for {}: {}".format(url, e))
            return None
class ChromiumHeadlessDownloader(Downloader):
    """Downloader that renders pages with headless Chromium via Selenium.

    Useful for pages whose content is built by JavaScript, which
    :class:`SimpleDownloader` cannot see.
    """

    def __init__(self):
        super().__init__()
        options = Options()
        options.add_argument("--headless=new")
        # assumes chromedriver is installed at this fixed path — TODO confirm
        service = Service("/usr/bin/chromedriver")
        self.driver = webdriver.Chrome(service=service, options=options)

    def download(self, url):
        """Load *url* in the headless browser and return the rendered HTML."""
        print("Download {}".format(url))
        self.driver.get(url)
        # Bug fix: the original returned the unbound name `driver`
        # (NameError at runtime); the WebDriver instance lives on self.
        return self.driver.page_source
class Extractor(ABC):
    """Base class turning downloaded content into a header + event-list structure."""

    def __init__(self):
        self.header = {}   # metadata about the source (url, extraction date)
        self.events = []   # accumulated event dicts

    @abstractmethod
    def extract(self, content, url, url_human=None):
        """Parse *content* (fetched from *url*) and populate ``self.events``."""
        pass

    def set_header(self, url):
        """Record the source url and the extraction timestamp in the header."""
        self.header["url"] = url
        self.header["date"] = datetime.now()

    def clear_events(self):
        """Discard events accumulated by any previous extraction."""
        self.events = []

    def add_event(self, title, category, start_day, location, description, tags, uuid, recurrences=None, url_human=None, start_time=None, end_day=None, end_time=None, last_modified=None, published=False):
        """Append one event dict to ``self.events``.

        *title* and *start_day* are mandatory; an event missing either is
        reported and skipped. Optional fields are only stored when a value
        was supplied.
        """
        if title is None:
            print("ERROR: cannot import an event without name")
            return
        if start_day is None:
            print("ERROR: cannot import an event without start day")
            return

        event = {
            "title": title,
            "category": category,
            "start_day": start_day,
            "uuid": uuid,
            "location": location,
            "description": description,
            "tags": tags,
            "published": published,
        }
        # Optional fields, added in a fixed order so the dict layout matches
        # what previous versions produced.
        for key, value in (
            ("url_human", url_human),
            ("start_time", start_time),
            ("end_day", end_day),
            ("end_time", end_time),
            ("last_modified", last_modified),
            ("recurrences", recurrences),
        ):
            if value is not None:
                event[key] = value

        self.events.append(event)

    def default_value_if_exists(self, default_values, key):
        """Return ``default_values[key]`` when available, else None."""
        if default_values is None:
            return None
        return default_values.get(key)

    def get_structure(self):
        """Return the header plus all collected events."""
        return {"header": self.header, "events": self.events}
class ICALExtractor(Extractor):
    """Extractor for iCalendar (.ics) feeds."""

    def __init__(self):
        super().__init__()

    def get_item_from_vevent(self, event, name, raw=False):
        """Return property *name* of a VEVENT, decoded to str unless raw=True.

        Returns None when the property is missing or cannot be decoded.
        """
        try:
            value = event.decoded(name)
            return value if raw else value.decode()
        except Exception:
            # Missing property or undecodable value: treat as absent.
            # (Narrowed from the original bare `except:`, which also
            # swallowed KeyboardInterrupt/SystemExit.)
            return None

    def get_dt_item_from_vevent(self, event, name):
        """Split a date/datetime property into a (day, time) pair.

        Returns (date, time) for a datetime value, (date, None) for a bare
        date, and (None, None) when the property is absent.
        """
        item = self.get_item_from_vevent(event, name, raw=True)
        day = None
        time = None
        if item is not None:
            if isinstance(item, datetime):
                day = item.date()
                time = item.time()
            elif isinstance(item, date):
                day = item
        return day, time

    def extract(self, content, url, url_human=None, default_values=None, published=False):
        """Parse an ics document and return the header/events structure."""
        print("Extracting ical events from {}".format(url))
        self.set_header(url)
        self.clear_events()
        self.uuids = {}  # occurrence counter to disambiguate duplicated UIDs

        calendar = icalendar.Calendar.from_ical(content)
        for event in calendar.walk('VEVENT'):
            title = self.get_item_from_vevent(event, "SUMMARY")
            category = self.default_value_if_exists(default_values, "category")
            start_day, start_time = self.get_dt_item_from_vevent(event, "DTSTART")
            end_day, end_time = self.get_dt_item_from_vevent(event, "DTEND")

            location = self.get_item_from_vevent(event, "LOCATION")
            if location is None:
                location = self.default_value_if_exists(default_values, "location")

            description = self.get_item_from_vevent(event, "DESCRIPTION")
            if description is not None:
                # Turn <br> into newlines, then strip the remaining markup.
                # Explicit parser avoids bs4's "no parser specified" warning
                # and keeps results deterministic across installs.
                soup = BeautifulSoup(description, "html.parser")
                delimiter = '\n'
                for line_break in soup.findAll('br'):
                    line_break.replaceWith(delimiter)
                description = soup.get_text()

            uuid = self.get_item_from_vevent(event, "UID")
            if uuid is not None:
                # Some feeds reuse the same UID: suffix repeats with a counter.
                if uuid in self.uuids:
                    self.uuids[uuid] += 1
                    uuid += ":{:04}".format(self.uuids[uuid] - 1)
                else:
                    self.uuids[uuid] = 1
                event_url = url + "#" + uuid
            else:
                # Robustness fix: the original crashed (TypeError on
                # str + None) for events without a UID.
                event_url = url

            tags = self.default_value_if_exists(default_values, "tags")
            # NOTE: the original also queried "LAST_MODIFIED" (wrong name —
            # the iCalendar property uses a dash — so it was always None and
            # immediately overwritten); only the correct lookup is kept.
            last_modified = self.get_item_from_vevent(event, "LAST-MODIFIED", raw=True)

            recurrence_entries = {}
            for prop in ["RRULE", "EXRULE", "EXDATE", "RDATE"]:
                value = self.get_item_from_vevent(event, prop, raw=True)
                if value is not None:
                    recurrence_entries[prop] = value
            if start_day is not None and len(recurrence_entries) != 0:
                # Serialize recurrence rules back to their ical text form,
                # one "NAME:value" line per entry.
                recurrences = ""
                for key, value in recurrence_entries.items():
                    if isinstance(value, list):
                        recurrences += "\n".join([key + ":" + v.to_ical().decode() for v in value]) + "\n"
                    else:
                        recurrences += key + ":" + value.to_ical().decode() + "\n"
            else:
                recurrences = None

            self.add_event(title, category, start_day, location, description, tags, recurrences=recurrences, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, last_modified=last_modified, published=published)

        return self.get_structure()
class URL2Events:
    """Pipeline tying a Downloader to an Extractor, with optional file caching."""

    def __init__(self, downloader, extractor):
        self.downloader = downloader
        self.extractor = extractor

    def process(self, url, url_human=None, cache=None, default_values=None, published=False):
        """Download *url* (or read it from *cache*) and run the extractor.

        When *cache* names an existing file, its content is used instead of
        downloading; otherwise the downloaded content is written to it.
        Returns whatever the extractor's ``extract`` returns.
        """
        if cache and os.path.exists(cache):
            print("Loading cache ({})".format(cache))
            with open(cache, encoding="utf-8") as f:
                # Bug fix: the original used "\n".join(f.readlines()), which
                # doubled every newline because readlines() keeps the
                # terminators; read() returns the content unchanged.
                content = f.read()
        else:
            content = self.downloader.download(url)
            # Only cache successful downloads (download() returns None on
            # failure; the original crashed writing None to the file).
            if cache and content is not None:
                print("Saving cache ({})".format(cache))
                cache_dir = os.path.dirname(cache)
                if cache_dir != "":
                    os.makedirs(cache_dir, exist_ok=True)
                with open(cache, "w", encoding="utf-8") as text_file:
                    text_file.write(content)

        return self.extractor.extract(content, url, url_human, default_values, published)
if __name__ == "__main__":
    # Fetch the Café les Augustes public Google calendar and export it as JSON.
    u2e = URL2Events(SimpleDownloader(), ICALExtractor())
    source_url = "https://calendar.google.com/calendar/ical/programmation.lesaugustes%40gmail.com/public/basic.ics"
    human_url = "https://www.cafelesaugustes.fr/la-programmation/"

    events = u2e.process(
        source_url,
        human_url,
        cache="cache-augustes.ical",
        default_values={"category": "Autre", "location": "Café lecture les Augustes"},
        published=True,
    )

    export_file = "events-augustes.json"
    print("Saving events to file {}".format(export_file))
    # default=str stringifies the datetime/date objects in the structure
    with open(export_file, "w") as f:
        json.dump(events, f, indent=4, default=str)