202 lines
7.9 KiB
Python
202 lines
7.9 KiB
Python
import icalendar
|
|
import warnings
|
|
|
|
from datetime import datetime, date
|
|
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
|
|
from urllib.parse import urlparse
|
|
|
|
from .extractor import *
|
|
import json
|
|
|
|
import logging
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class FacebookEventExtractor(Extractor):
|
|
|
|
class SimpleFacebookEvent:
    """Minimal view of an event embedded in another event's payload.

    Attributes:
        elements: dict holding ``id``, ``start_timestamp`` and
            ``end_timestamp``; a key missing from the input is stored as
            ``None``.
        parent: the same minimal view built from ``data["parent_event"]``
            when that entry is a dict, ``None`` otherwise.
    """

    def __init__(self, data):
        """Build the view from *data*, a dict decoded from the page JSON."""
        self.elements = {
            key: data.get(key)
            for key in ("id", "start_timestamp", "end_timestamp")
        }

        # Always define ``parent`` so that reading it never raises
        # AttributeError, and only recurse on an actual dict: a JSON
        # ``null`` parent would otherwise crash the constructor.
        parent_data = data.get("parent_event")
        if isinstance(parent_data, dict):
            self.parent = type(self)(parent_data)
        else:
            self.parent = None
|
|
|
|
|
|
class FacebookEvent:
    """Event rebuilt from fragments scattered through nested JSON data.

    A fragment is a JSON object that contains every key of one entry of
    ``keys``. Fragments are merged into ``elements`` (flat name -> value),
    and ``build_event`` turns those elements into the keyword arguments of
    an event.

    Attributes:
        fragments: raw fragment stored per index into ``keys``.
        elements: flattened values extracted from the fragments.
        neighbor_events: list of ``SimpleFacebookEvent`` built from the
            ``comet_neighboring_siblings`` entry, or ``None`` if no such
            entry was seen.
        possible_end_timestamp: list of ``{"start_timestamp", "end_timestamp"}``
            dicts collected as candidates for this event's end.
    """

    name = "event"

    # Each entry lists the keys that must ALL be present in a JSON object
    # for it to be taken as a fragment of that kind. Entries are tried in
    # order and the first match wins.
    keys = [
        [
            "start_time_formatted",
            "start_timestamp",
            "is_past",
            "name",
            "price_info",
            "cover_media_renderer",
            "event_creator",
            "id",
            "day_time_sentence",
            "event_place",
            "comet_neighboring_siblings",
        ],
        ["event_description"],
        ["start_timestamp", "end_timestamp"],
    ]

    # For a fragment key, maps each output element name to the path of
    # nested keys leading to its value inside that fragment entry.
    rules = {
        "event_description": {"description": ["text"]},
        "cover_media_renderer": {
            "image_alt": ["cover_photo", "photo", "accessibility_caption"],
            "image": ["cover_photo", "photo", "full_image", "uri"],
        },
        "event_creator": {
            "event_creator_name": ["name"],
            "event_creator_url": ["url"],
        },
        "event_place": {"event_place_name": ["name"]},
    }

    # Sentinel distinguishing "path not found" from a stored ``None``.
    _MISSING = object()

    def __init__(self, i, event):
        """Start a new event from fragment *event* matching ``keys[i]``."""
        self.fragments = {}
        self.elements = {}
        self.neighbor_events = None
        self.possible_end_timestamp = []
        self.add_fragment(i, event)

    def get_element(self, key):
        """Return the extracted element *key*, or ``None`` when absent."""
        return self.elements.get(key)

    def _get_datetime(self, key):
        """Return element *key* as a datetime, or ``None`` if unset or 0."""
        value = self.get_element(key)
        if value is None or value == 0:
            return None
        # NOTE: no tz is passed, so the result is naive local time.
        return datetime.fromtimestamp(value)

    def get_element_date(self, key):
        """Return the date of timestamp element *key*, or ``None``."""
        moment = self._get_datetime(key)
        return moment.date() if moment is not None else None

    def get_element_time(self, key):
        """Return timestamp element *key* as ``HH:MM``, or ``None``."""
        moment = self._get_datetime(key)
        return moment.strftime('%H:%M') if moment is not None else None

    @classmethod
    def _follow_path(cls, node, path):
        """Walk *path* through nested dicts; return ``_MISSING`` on a gap.

        A gap is any step where the current node is not a dict or lacks
        the key. A final value of ``None`` is returned as-is.
        """
        for step in path:
            if not isinstance(node, dict) or step not in node:
                return cls._MISSING
            node = node[step]
        return node

    def add_fragment(self, i, event):
        """Merge fragment *event*, which matched ``keys[i]``, into this event.

        Raises:
            KeyError: if *event* lacks one of the keys of ``keys[i]``.
        """
        self.fragments[i] = event
        fragment_keys = self.keys[i]

        if fragment_keys == ["start_timestamp", "end_timestamp"]:
            # A bare start/end pair is only a candidate: it is matched
            # against this event's start in consolidate_current_event().
            self.get_possible_end_timestamp(i, event)
            return

        for k in fragment_keys:
            value = event[k]
            if k == "comet_neighboring_siblings":
                self.get_neighbor_events(value)
            elif k in self.rules:
                for element_name, path in self.rules[k].items():
                    found = self._follow_path(value, path)
                    # An incomplete path leaves the element unset
                    # instead of raising KeyError/TypeError.
                    if found is not self._MISSING:
                        self.elements[element_name] = found
            else:
                self.elements[k] = value

    def get_possible_end_timestamp(self, i, data):
        """Record the ``keys[i]`` entries of *data* as an end candidate."""
        self.possible_end_timestamp.append({k: data[k] for k in self.keys[i]})

    def get_neighbor_events(self, data):
        """Store one ``SimpleFacebookEvent`` per entry of *data*.

        A ``None`` *data* yields an empty list.
        """
        self.neighbor_events = [
            FacebookEventExtractor.SimpleFacebookEvent(d) for d in (data or [])
        ]

    def __str__(self):
        # Tolerate a missing neighbour list and non-string ids.
        neighbor_ids = ", ".join(
            str(ne.elements["id"]) for ne in (self.neighbor_events or [])
        )
        return str(self.elements) + "\n Neighbors: " + neighbor_ids

    def consolidate_current_event(self):
        """Fill ``end_timestamp`` from neighbours, then from candidates.

        First looks for a neighbour with this event's id that carries an
        end timestamp; if the end is still unknown, falls back to the
        first collected candidate whose start equals this event's start.
        """
        if self.elements.get("end_timestamp") is None and "id" in self.elements:
            event_id = self.elements["id"]
            for ne in self.neighbor_events or []:
                end = ne.elements["end_timestamp"]
                # A neighbour without an end must not block the fallback
                # below, so a None end is never copied.
                if ne.elements["id"] == event_id and end is not None:
                    self.elements["end_timestamp"] = end
                    break

        if (
            self.elements.get("end_timestamp") is None
            and "start_timestamp" in self.elements
        ):
            start = self.elements["start_timestamp"]
            for candidate in self.possible_end_timestamp:
                if (
                    "start_timestamp" in candidate
                    and candidate["start_timestamp"] == start
                ):
                    self.elements["end_timestamp"] = candidate["end_timestamp"]
                    break

    @classmethod
    def find_event_fragment_in_array(cls, array, event, first=True):
        """Collect event fragments from nested dicts/lists into *event*.

        Args:
            array: decoded JSON value; dicts and lists are explored, any
                other value is ignored.
            event: event built so far, or ``None`` to create one on the
                first fragment found.
            first: ``True`` for the outermost call only; it triggers the
                final consolidation.

        Returns:
            The (possibly newly created) event, or ``None`` if no fragment
            has been found yet.
        """
        if isinstance(array, dict):
            # only consider the first of FacebookEvent.keys that matches
            match = next(
                (
                    i
                    for i, ks in enumerate(cls.keys)
                    if all(k in array for k in ks)
                ),
                None,
            )
            if match is not None:
                # A matching dict is consumed as a fragment and is not
                # explored any deeper.
                if event is None:
                    event = cls(match, array)
                else:
                    event.add_fragment(match, array)
            else:
                for value in array.values():
                    event = cls.find_event_fragment_in_array(value, event, False)
        elif isinstance(array, list):
            for item in array:
                event = cls.find_event_fragment_in_array(item, event, False)

        if event is not None and first:
            event.consolidate_current_event()
        return event

    def build_event(self, url):
        """Return the event as a dict of keyword arguments.

        *url* is used both as the single entry of ``uuids`` and as
        ``url_human``.
        """
        return {
            "title": self.get_element("name"),
            "category": None,
            "start_day": self.get_element_date("start_timestamp"),
            "location": self.get_element("event_place_name"),
            "description": self.get_element("description"),
            "tags": [],
            "uuids": [url],
            "url_human": url,
            "start_time": self.get_element_time("start_timestamp"),
            "end_day": self.get_element_date("end_timestamp"),
            "end_time": self.get_element_time("end_timestamp"),
            "image": self.get_element("image"),
            # The caption extracted by the rules, not the image URL again.
            "image_alt": self.get_element("image_alt"),
        }
|
|
|
|
|
|
def __init__(self, single_event=False):
    """Create the extractor.

    Args:
        single_event: flag stored unchanged as ``self.single_event``; it
            is not interpreted by this method.
    """
    # The attribute is assigned before the base-class initialiser runs;
    # keep this order.
    self.single_event = single_event
    super().__init__()
|
|
|
|
|
|
@staticmethod
def clean_url(url):
    """Return a canonical form of a Facebook URL.

    For a URL accepted by ``is_known_url`` the result is
    ``https://www.facebook.com`` followed by the URL's path; query string
    and fragment are dropped. Any other URL is returned unchanged.

    Declared as a static method so that it works identically whether it
    is called on the class or on an instance.
    """
    if not FacebookEventExtractor.is_known_url(url):
        return url
    return "https://www.facebook.com" + urlparse(url).path
|
|
|
|
@staticmethod
def is_known_url(url):
    """Tell whether *url* points at a Facebook host handled here.

    The host is compared through ``hostname``, which is lower-cased and
    excludes any port, so ``https://WWW.Facebook.com:443/...`` is
    recognised as well. A URL without a network location yields
    ``False``.

    Declared as a static method so that it works identically whether it
    is called on the class or on an instance.
    """
    host = urlparse(url).hostname
    return host in ("facebook.com", "www.facebook.com", "m.facebook.com")
|
|
|
|
|
|
def extract(self, content, url, url_human=None, default_values=None, published=False):
    """Extract one event from the HTML *content* of an event page.

    Every ``<script type="application/json">`` element is decoded and
    searched for event fragments; all fragments are merged into a single
    event.

    Args:
        content: HTML source of the page.
        url: URL of the page; passed to ``set_header`` and
            ``build_event``.
        url_human: unused by this method.
        default_values: unused by this method.
        published: stored in the event under ``"published"``.

    Returns:
        The result of ``self.get_structure()`` when an event was found,
        ``None`` otherwise.
    """
    # NOTE: this method does not use url_human = None and default_values = None

    # get step by step all information from the content
    fevent = None
    soup = BeautifulSoup(content, "html.parser")
    for json_script in soup.find_all('script', type="application/json"):
        try:
            json_struct = json.loads(json_script.get_text())
        except json.JSONDecodeError:
            # One malformed or empty script must not abort the whole
            # extraction: skip it and keep scanning the others.
            logger.debug("skipping a script element with invalid JSON")
            continue
        fevent = FacebookEventExtractor.FacebookEvent.find_event_fragment_in_array(
            json_struct, fevent
        )

    if fevent is None:
        return None

    self.set_header(url)
    event = fevent.build_event(url)
    logger.warning("published: %s", published)
    event["published"] = published
    self.add_event(**event)
    return self.get_structure()