Ajout de l'extraction des événements d'une page facebook
This commit is contained in:
parent
cbb34190cf
commit
81601ec5da
43
experimentations/get_facebook_events.py
Executable file
43
experimentations/get_facebook_events.py
Executable file
@ -0,0 +1,43 @@
|
||||
#!/usr/bin/python3
|
||||
# coding: utf-8
|
||||
|
||||
import os
|
||||
import json
|
||||
import sys
|
||||
|
||||
# getting the name of the directory
|
||||
# where this file is present.
|
||||
current = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
# Getting the parent directory name
|
||||
# where the current directory is present.
|
||||
parent = os.path.dirname(current)
|
||||
|
||||
# adding the parent directory to
|
||||
# the sys.path.
|
||||
sys.path.append(parent)
|
||||
|
||||
from src.agenda_culturel.import_tasks.downloader import *
|
||||
from src.agenda_culturel.import_tasks.extractor import *
|
||||
from src.agenda_culturel.import_tasks.importer import *
|
||||
from src.agenda_culturel.import_tasks.custom_extractors import *
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Experiment: download the events tab of a Facebook page, extract the
    # events it links to and dump them to a local JSON file.
    u2e = URL2Events(ChromiumHeadlessDownloader(), fbevents.CExtractor())

    # The listing page is also used as the human-readable source URL.
    url = url_human = "https://www.facebook.com/laJeteeClermont/events"

    try:
        events = u2e.process(
            url,
            url_human,
            cache="cache-lajetee-fb.html",
            default_values={"location": "La Jetée"},
            published=True,
        )

        exportfile = "events-lajetee-fb.json"
        print(f"Saving events to file {exportfile}")
        with open(exportfile, "w") as out:
            # default=str stringifies values json cannot encode natively.
            json.dump(events, out, indent=4, default=str)
    except Exception as e:
        # Any failure is only reported on stdout.
        print(f"Exception: {e}")
|
@ -124,6 +124,8 @@ def run_recurrent_import(self, pk):
|
||||
extractor = lapucealoreille.CExtractor()
|
||||
elif rimport.processor == RecurrentImport.PROCESSOR.MECWORDPRESS:
|
||||
extractor = wordpress_mec.CExtractor()
|
||||
elif rimport.processor == RecurrentImport.PROCESSOR.FBEVENTS:
|
||||
extractor = fbevents.CExtractor()
|
||||
else:
|
||||
extractor = None
|
||||
|
||||
|
@ -0,0 +1,48 @@
|
||||
from ..generic_extractors import *
|
||||
from ..extractor_facebook import FacebookEvent
|
||||
import json5
|
||||
from bs4 import BeautifulSoup
|
||||
import json
|
||||
|
||||
|
||||
# A class dedicated to getting events from a Facebook events page
|
||||
# such as https://www.facebook.com/laJeteeClermont/events
|
||||
class CExtractor(TwoStepsExtractor):
    """Two-step extractor for the events tab of a Facebook page.

    Example of a listing page:
    https://www.facebook.com/laJeteeClermont/events
    """

    def build_event_url_list(self, content):
        """Register every Facebook event URL linked from the listing page.

        ``content`` is the HTML of the listing page. Each matching link is
        passed to ``self.add_event_url`` with its query string removed.
        """
        soup = BeautifulSoup(content, "html.parser")

        # href=True skips anchors that carry no href attribute; for those
        # link.get("href") is None and calling startswith() on it would raise.
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if href.startswith("https://www.facebook.com/events/"):
                self.add_event_url(href.split("?")[0])

    def add_event_from_content(
        self,
        event_content,
        event_url,
        url_human=None,
        default_values=None,
        published=False,
    ):
        """Build one event from the HTML of a single Facebook event page.

        Every ``<script type="application/json">`` block of ``event_content``
        is parsed and searched for event fragments. If an event is found it
        is handed to ``self.add_event``; otherwise nothing is added.

        ``url_human`` is accepted but not read by this method: the event
        built by ``FacebookEvent.build_event`` uses ``event_url``.
        Only the ``"category"`` entry of ``default_values`` is applied here.
        """
        fevent = None
        soup = BeautifulSoup(event_content, "html.parser")
        for json_script in soup.find_all("script", type="application/json"):
            json_struct = json.loads(json_script.get_text())
            # Fragments found in successive scripts accumulate in fevent.
            fevent = FacebookEvent.find_event_fragment_in_array(
                json_struct, fevent
            )

        if fevent is None:
            return

        event = fevent.build_event(event_url)
        event["published"] = published

        # default_values defaults to None, so guard before the lookup.
        if default_values is not None and "category" in default_values:
            event["category"] = default_values["category"]
        self.add_event(**event)
|
||||
|
@ -5,9 +5,9 @@ import os
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.common.exceptions import *
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
|
||||
class Downloader(ABC):
|
||||
def __init__(self):
|
||||
pass
|
||||
@ -90,19 +90,20 @@ class ChromiumHeadlessDownloader(Downloader):
|
||||
self.driver.get(url)
|
||||
doc = self.driver.page_source
|
||||
|
||||
except exceptions.StaleElementReferenceException as e:
|
||||
|
||||
except StaleElementReferenceException as e:
|
||||
print(f">> {type(e).__name__}: {e.args}")
|
||||
return None
|
||||
except exceptions.NoSuchElementException as e:
|
||||
except NoSuchElementException as e:
|
||||
print(f">> {type(e).__name__}: {e.args}")
|
||||
return None
|
||||
except exceptions.TimeoutException as e:
|
||||
except TimeoutException as e:
|
||||
print(f">> {type(e).__name__}: {e.args}")
|
||||
return None
|
||||
except exceptions.WebDriverException as e:
|
||||
except WebDriverException as e:
|
||||
print(f">> {type(e).__name__}: {e.args}")
|
||||
return None
|
||||
except exceptions.SessionNotCreatedException as e:
|
||||
except SessionNotCreatedException as e:
|
||||
print(f">> {type(e).__name__}: {e.args}")
|
||||
return None
|
||||
except Exception as e:
|
||||
|
@ -9,193 +9,194 @@ import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class SimpleFacebookEvent:
|
||||
def __init__(self, data):
|
||||
self.elements = {}
|
||||
|
||||
class FacebookEventExtractor(Extractor):
|
||||
class SimpleFacebookEvent:
|
||||
def __init__(self, data):
|
||||
self.elements = {}
|
||||
for key in ["id", "start_timestamp", "end_timestamp"]:
|
||||
self.elements[key] = data[key] if key in data else None
|
||||
|
||||
for key in ["id", "start_timestamp", "end_timestamp"]:
|
||||
self.elements[key] = data[key] if key in data else None
|
||||
if "parent_event" in data:
|
||||
self.parent = SimpleFacebookEvent(
|
||||
data["parent_event"]
|
||||
)
|
||||
|
||||
if "parent_event" in data:
|
||||
self.parent = FacebookEventExtractor.SimpleFacebookEvent(
|
||||
data["parent_event"]
|
||||
class FacebookEvent:
|
||||
name = "event"
|
||||
keys = [
|
||||
[
|
||||
"start_time_formatted",
|
||||
"start_timestamp",
|
||||
"is_past",
|
||||
"name",
|
||||
"price_info",
|
||||
"cover_media_renderer",
|
||||
"id",
|
||||
"day_time_sentence",
|
||||
"event_place",
|
||||
"comet_neighboring_siblings",
|
||||
],
|
||||
["event_description"],
|
||||
["start_timestamp", "end_timestamp"],
|
||||
]
|
||||
rules = {
|
||||
"event_description": {"description": ["text"]},
|
||||
"cover_media_renderer": {
|
||||
"image_alt": ["cover_photo", "photo", "accessibility_caption"],
|
||||
"image": ["cover_photo", "photo", "full_image", "uri"],
|
||||
},
|
||||
"event_creator": {
|
||||
"event_creator_name": ["name"],
|
||||
"event_creator_url": ["url"],
|
||||
},
|
||||
"event_place": {"event_place_name": ["name"]},
|
||||
}
|
||||
|
||||
def __init__(self, i, event):
|
||||
self.fragments = {}
|
||||
self.elements = {}
|
||||
self.neighbor_events = None
|
||||
self.possible_end_timestamp = []
|
||||
self.add_fragment(i, event)
|
||||
|
||||
def get_element(self, key):
|
||||
return self.elements[key] if key in self.elements else None
|
||||
|
||||
def get_element_date(self, key):
|
||||
v = self.get_element(key)
|
||||
return (
|
||||
datetime.fromtimestamp(v).date() if v is not None and v != 0 else None
|
||||
)
|
||||
|
||||
def get_element_time(self, key):
|
||||
v = self.get_element(key)
|
||||
return (
|
||||
datetime.fromtimestamp(v).strftime("%H:%M")
|
||||
if v is not None and v != 0
|
||||
else None
|
||||
)
|
||||
|
||||
def add_fragment(self, i, event):
    """Record fragment ``event`` found for key set number ``i``.

    ``i`` indexes the class-level ``keys`` list. The start/end timestamp
    key set is only stored as a candidate end time. For every other key
    set, each key is either turned into neighbour events, resolved through
    the class-level ``rules`` (a target name mapped to a path of nested
    keys), or copied as-is into ``self.elements``.

    A rule whose path cannot be followed (missing key, ``None`` or a
    non-dict value along the way) is skipped instead of raising.
    """
    self.fragments[i] = event

    # keys/rules are class attributes; they are read through self so that
    # the lookup resolves on the instance's class.
    fragment_keys = self.keys[i]

    if fragment_keys == ["start_timestamp", "end_timestamp"]:
        self.get_possible_end_timestamp(i, event)
        return

    for k in fragment_keys:
        if k == "comet_neighboring_siblings":
            self.get_neighbor_events(event[k])
        elif k in self.rules:
            for target, path in self.rules[k].items():
                value = event[k]
                for step in path:
                    if isinstance(value, dict) and step in value:
                        value = value[step]
                    else:
                        # Path broken: leave this target unset.
                        break
                else:
                    # Whole path followed; the final value may be None.
                    self.elements[target] = value
        else:
            self.elements[k] = event[k]
|
||||
|
||||
def get_possible_end_timestamp(self, i, data):
|
||||
self.possible_end_timestamp.append(
|
||||
dict((k, data[k]) for k in FacebookEvent.keys[i])
|
||||
)
|
||||
|
||||
def get_neighbor_events(self, data):
    """Store the sibling events described by ``data``.

    Each item of ``data`` is wrapped in a ``SimpleFacebookEvent``. A
    ``None`` (or empty) ``data`` yields an empty list instead of raising.
    """
    # NOTE(review): assumes the sibling value can be None in some pages -
    # the caller only checks that the key is present, not its value.
    self.neighbor_events = [SimpleFacebookEvent(d) for d in (data or [])]
|
||||
|
||||
def __str__(self):
|
||||
return (
|
||||
str(self.elements)
|
||||
+ "\n Neighbors: "
|
||||
+ ", ".join([ne.elements["id"] for ne in self.neighbor_events])
|
||||
)
|
||||
|
||||
def consolidate_current_event(self):
    """Fill in a missing ``end_timestamp`` from the other collected data.

    Nothing is done when ``self.elements`` already has an
    ``end_timestamp``. Otherwise two sources are tried in order:

    1. the neighbour events whose ``id`` equals this event's ``id``;
    2. the candidate start/end pairs in ``self.possible_end_timestamp``
       whose ``start_timestamp`` equals this event's ``start_timestamp``.

    A neighbour whose end timestamp is ``None`` is ignored, so that it
    does not prevent the second source from being used.
    """
    if "end_timestamp" in self.elements:
        return

    if self.neighbor_events is not None and "id" in self.elements:
        event_id = self.elements["id"]
        for neighbor in self.neighbor_events:
            end = neighbor.elements.get("end_timestamp")
            if neighbor.elements.get("id") == event_id and end is not None:
                self.elements["end_timestamp"] = end

    if "end_timestamp" in self.elements:
        return

    for candidate in self.possible_end_timestamp:
        if (
            "start_timestamp" in candidate
            and "start_timestamp" in self.elements
            and candidate["start_timestamp"] == self.elements["start_timestamp"]
        ):
            self.elements["end_timestamp"] = candidate["end_timestamp"]
            break
|
||||
|
||||
def find_event_fragment_in_array(array, event, first=True):
|
||||
if isinstance(array, dict):
|
||||
seen = False
|
||||
for i, ks in enumerate(FacebookEvent.keys):
|
||||
# DEBUG: print([k for k in ks if k in array], "il manque", [k for k in ks if k not in array])
|
||||
if len(ks) == len([k for k in ks if k in array]):
|
||||
seen = True
|
||||
if event is None:
|
||||
event = FacebookEvent(i, array)
|
||||
else:
|
||||
event.add_fragment(i, array)
|
||||
# only consider the first of FacebookEvent.keys
|
||||
break
|
||||
if not seen:
|
||||
for k in array:
|
||||
event = FacebookEvent.find_event_fragment_in_array(
|
||||
array[k], event, False
|
||||
)
|
||||
elif isinstance(array, list):
|
||||
for e in array:
|
||||
event = FacebookEvent.find_event_fragment_in_array(
|
||||
e, event, False
|
||||
)
|
||||
|
||||
class FacebookEvent:
|
||||
name = "event"
|
||||
keys = [
|
||||
[
|
||||
"start_time_formatted",
|
||||
"start_timestamp",
|
||||
"is_past",
|
||||
"name",
|
||||
"price_info",
|
||||
"cover_media_renderer",
|
||||
"id",
|
||||
"day_time_sentence",
|
||||
"event_place",
|
||||
"comet_neighboring_siblings",
|
||||
],
|
||||
["event_description"],
|
||||
["start_timestamp", "end_timestamp"],
|
||||
]
|
||||
rules = {
|
||||
"event_description": {"description": ["text"]},
|
||||
"cover_media_renderer": {
|
||||
"image_alt": ["cover_photo", "photo", "accessibility_caption"],
|
||||
"image": ["cover_photo", "photo", "full_image", "uri"],
|
||||
},
|
||||
"event_creator": {
|
||||
"event_creator_name": ["name"],
|
||||
"event_creator_url": ["url"],
|
||||
},
|
||||
"event_place": {"event_place_name": ["name"]},
|
||||
if event is not None and first:
|
||||
event.consolidate_current_event()
|
||||
return event
|
||||
|
||||
def build_event(self, url):
    """Return the collected data as a dict of event fields.

    ``url`` is used both as the single entry of ``uuids`` and as
    ``url_human``. ``category`` is always ``None`` and ``tags`` is always
    an empty list. Start and end day/time are derived from the
    ``start_timestamp`` and ``end_timestamp`` elements through
    ``get_element_date`` and ``get_element_time``; every other field is
    read with ``get_element``, which yields ``None`` for a missing element.
    """
    return {
        "title": self.get_element("name"),
        "category": None,
        "start_day": self.get_element_date("start_timestamp"),
        "location": self.get_element("event_place_name"),
        "description": self.get_element("description"),
        "tags": [],
        "uuids": [url],
        "url_human": url,
        "start_time": self.get_element_time("start_timestamp"),
        "end_day": self.get_element_date("end_timestamp"),
        "end_time": self.get_element_time("end_timestamp"),
        "image": self.get_element("image"),
        # The alternative text has its own element ("image_alt", filled
        # from the cover's accessibility caption), distinct from the URL.
        "image_alt": self.get_element("image_alt"),
    }
|
||||
|
||||
def __init__(self, i, event):
|
||||
self.fragments = {}
|
||||
self.elements = {}
|
||||
self.neighbor_events = None
|
||||
self.possible_end_timestamp = []
|
||||
self.add_fragment(i, event)
|
||||
|
||||
def get_element(self, key):
|
||||
return self.elements[key] if key in self.elements else None
|
||||
|
||||
def get_element_date(self, key):
|
||||
v = self.get_element(key)
|
||||
return (
|
||||
datetime.fromtimestamp(v).date() if v is not None and v != 0 else None
|
||||
)
|
||||
|
||||
def get_element_time(self, key):
|
||||
v = self.get_element(key)
|
||||
return (
|
||||
datetime.fromtimestamp(v).strftime("%H:%M")
|
||||
if v is not None and v != 0
|
||||
else None
|
||||
)
|
||||
|
||||
def add_fragment(self, i, event):
|
||||
self.fragments[i] = event
|
||||
|
||||
if FacebookEventExtractor.FacebookEvent.keys[i] == [
|
||||
"start_timestamp",
|
||||
"end_timestamp",
|
||||
]:
|
||||
self.get_possible_end_timestamp(i, event)
|
||||
else:
|
||||
for k in FacebookEventExtractor.FacebookEvent.keys[i]:
|
||||
if k == "comet_neighboring_siblings":
|
||||
self.get_neighbor_events(event[k])
|
||||
elif k in FacebookEventExtractor.FacebookEvent.rules:
|
||||
for nk, rule in FacebookEventExtractor.FacebookEvent.rules[
|
||||
k
|
||||
].items():
|
||||
error = False
|
||||
c = event[k]
|
||||
for ki in rule:
|
||||
if c is not None:
|
||||
c = c[ki]
|
||||
else:
|
||||
error = True
|
||||
if not error:
|
||||
self.elements[nk] = c
|
||||
else:
|
||||
self.elements[k] = event[k]
|
||||
|
||||
def get_possible_end_timestamp(self, i, data):
|
||||
self.possible_end_timestamp.append(
|
||||
dict((k, data[k]) for k in FacebookEventExtractor.FacebookEvent.keys[i])
|
||||
)
|
||||
|
||||
def get_neighbor_events(self, data):
|
||||
self.neighbor_events = [
|
||||
FacebookEventExtractor.SimpleFacebookEvent(d) for d in data
|
||||
]
|
||||
|
||||
def __str__(self):
|
||||
return (
|
||||
str(self.elements)
|
||||
+ "\n Neighbors: "
|
||||
+ ", ".join([ne.elements["id"] for ne in self.neighbor_events])
|
||||
)
|
||||
|
||||
def consolidate_current_event(self):
|
||||
if (
|
||||
self.neighbor_events is not None
|
||||
and "id" in self.elements
|
||||
and "end_timestamp" not in self.elements
|
||||
):
|
||||
if self.neighbor_events is not None and "id" in self.elements:
|
||||
id = self.elements["id"]
|
||||
for ne in self.neighbor_events:
|
||||
if ne.elements["id"] == id:
|
||||
self.elements["end_timestamp"] = ne.elements[
|
||||
"end_timestamp"
|
||||
]
|
||||
|
||||
if (
|
||||
"end_timestamp" not in self.elements
|
||||
and len(self.possible_end_timestamp) != 0
|
||||
):
|
||||
for s in self.possible_end_timestamp:
|
||||
if (
|
||||
"start_timestamp" in s
|
||||
and "start_timestamp" in self.elements
|
||||
and s["start_timestamp"] == self.elements["start_timestamp"]
|
||||
):
|
||||
self.elements["end_timestamp"] = s["end_timestamp"]
|
||||
break
|
||||
|
||||
def find_event_fragment_in_array(array, event, first=True):
|
||||
if isinstance(array, dict):
|
||||
seen = False
|
||||
for i, ks in enumerate(FacebookEventExtractor.FacebookEvent.keys):
|
||||
# DEBUG: print([k for k in ks if k in array], "il manque", [k for k in ks if k not in array])
|
||||
if len(ks) == len([k for k in ks if k in array]):
|
||||
seen = True
|
||||
if event is None:
|
||||
event = FacebookEventExtractor.FacebookEvent(i, array)
|
||||
else:
|
||||
event.add_fragment(i, array)
|
||||
# only consider the first of FacebookEvent.keys
|
||||
break
|
||||
if not seen:
|
||||
for k in array:
|
||||
event = FacebookEventExtractor.FacebookEvent.find_event_fragment_in_array(
|
||||
array[k], event, False
|
||||
)
|
||||
elif isinstance(array, list):
|
||||
for e in array:
|
||||
event = FacebookEventExtractor.FacebookEvent.find_event_fragment_in_array(
|
||||
e, event, False
|
||||
)
|
||||
|
||||
if event is not None and first:
|
||||
event.consolidate_current_event()
|
||||
return event
|
||||
|
||||
def build_event(self, url):
|
||||
self.get_element("image")
|
||||
|
||||
return {
|
||||
"title": self.get_element("name"),
|
||||
"category": None,
|
||||
"start_day": self.get_element_date("start_timestamp"),
|
||||
"location": self.get_element("event_place_name"),
|
||||
"description": self.get_element("description"),
|
||||
"tags": [],
|
||||
"uuids": [url],
|
||||
"url_human": url,
|
||||
"start_time": self.get_element_time("start_timestamp"),
|
||||
"end_day": self.get_element_date("end_timestamp"),
|
||||
"end_time": self.get_element_time("end_timestamp"),
|
||||
"image": self.get_element("image"),
|
||||
"image_alt": self.get_element("image"),
|
||||
}
|
||||
class FacebookEventExtractor(Extractor):
|
||||
|
||||
def __init__(self, single_event=False):
|
||||
self.single_event = single_event
|
||||
@ -223,7 +224,7 @@ class FacebookEventExtractor(Extractor):
|
||||
for json_script in soup.find_all("script", type="application/json"):
|
||||
json_txt = json_script.get_text()
|
||||
json_struct = json.loads(json_txt)
|
||||
fevent = FacebookEventExtractor.FacebookEvent.find_event_fragment_in_array(
|
||||
fevent = FacebookEvent.find_event_fragment_in_array(
|
||||
json_struct, fevent
|
||||
)
|
||||
|
||||
|
@ -0,0 +1,18 @@
|
||||
# Generated by Django 4.2.7 on 2024-08-28 21:42
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('agenda_culturel', '0067_categorisationrule_place'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='recurrentimport',
|
||||
name='processor',
|
||||
field=models.CharField(choices=[('ical', 'ical'), ('icalnobusy', 'ical no busy'), ('icalnovc', 'ical no VC'), ('lacoope', 'lacoope.org'), ('lacomedie', 'la comédie'), ('lefotomat', 'le fotomat'), ('lapucealoreille', "la puce à l'oreille"), ('Plugin wordpress MEC', 'Plugin wordpress MEC'), ('Facebook events', "Événements d'une page")], default='ical', max_length=20, verbose_name='Processor'),
|
||||
),
|
||||
]
|
@ -1237,8 +1237,9 @@ class RecurrentImport(models.Model):
|
||||
LACOOPE = "lacoope", _("lacoope.org")
|
||||
LACOMEDIE = "lacomedie", _("la comédie")
|
||||
LEFOTOMAT = "lefotomat", _("le fotomat")
|
||||
LAPUCEALOREILLE = "lapucealoreille", _("la puce à l" "oreille")
|
||||
LAPUCEALOREILLE = "lapucealoreille", _("la puce à l'oreille")
|
||||
MECWORDPRESS = "Plugin wordpress MEC", _("Plugin wordpress MEC")
|
||||
FBEVENTS = "Facebook events", _("Événements d'une page")
|
||||
|
||||
class DOWNLOADER(models.TextChoices):
|
||||
SIMPLE = "simple", _("simple")
|
||||
|
Loading…
Reference in New Issue
Block a user