The Facebook import now shares its code with the other imports

Fix #80
Jean-Marie Favreau 2024-02-03 18:57:39 +01:00
parent 3ebc53995b
commit c5db83cf87
12 changed files with 209 additions and 301 deletions

experimentations/.gitignore vendored Normal file

@@ -0,0 +1,3 @@
+*.json
+*.html
+*.ical


@@ -1,171 +1,40 @@
 #!/usr/bin/python3
 # coding: utf-8

-import requests
-import hashlib
 import os
-from selenium import webdriver
-from selenium.webdriver.chrome.service import Service
-from selenium.webdriver.chrome.options import Options
-from bs4 import BeautifulSoup
 import json
 import sys

-class SimpleEvent:
+# getting the name of the directory
+# where this file is present.
+current = os.path.dirname(os.path.realpath(__file__))

-    def __init__(self, data):
-        self.elements = {}
+# Getting the parent directory name
+# where the current directory is present.
+parent = os.path.dirname(current)

-        for key in ["id", "start_timestamp", "end_timestamp"]:
-            self.elements[key] = data[key] if key in data else None
+# adding the parent directory to
+# the sys.path.
+sys.path.append(parent)

-        if "parent_event" in data:
-            self.parent = SimpleEvent(data["parent_event"])
+from src.agenda_culturel.import_tasks.downloader import *
+from src.agenda_culturel.import_tasks.extractor import *
+from src.agenda_culturel.import_tasks.importer import *
+from src.agenda_culturel.import_tasks.extractor_facebook import *

-class Event:
-    name = "event"
-    keys = [
-        ["start_time_formatted", 'start_timestamp',
-         'is_past',
-         "name",
-         "price_info",
-         "cover_media_renderer",
-         "event_creator",
-         "id",
-         "day_time_sentence",
-         "event_place",
-         "comet_neighboring_siblings"],
-        ["event_description"],
-        ["start_timestamp", "end_timestamp"]
-    ]
-    rules = {
-        "event_description": { "description": ["text"]},
-        "cover_media_renderer": {"image_alt": ["cover_photo", "photo", "accessibility_caption"], "image": ["cover_photo", "photo", "full_image", "uri"]},
-        "event_creator": { "event_creator_name": ["name"], "event_creator_url": ["url"] },
-        "event_place": {"event_place_name": ["name"] }
-    }
-
-    def __init__(self, i, event):
-        self.fragments = {}
-        self.elements = {}
-        self.neighbor_events = None
-        self.possible_end_timestamp = []
-        self.add_fragment(i, event)
-
-    def add_fragment(self, i, event):
-        self.fragments[i] = event
-        if Event.keys[i] == ["start_timestamp", "end_timestamp"]:
-            self.get_possible_end_timestamp(i, event)
-        else:
-            for k in Event.keys[i]:
-                if k == "comet_neighboring_siblings":
-                    self.get_neighbor_events(event[k])
-                elif k in Event.rules:
-                    for nk, rule in Event.rules[k].items():
-                        c = event[k]
-                        for ki in rule:
-                            c = c[ki]
-                        self.elements[nk] = c
-                else:
-                    self.elements[k] = event[k]
-
-    def get_possible_end_timestamp(self, i, data):
-        self.possible_end_timestamp.append(dict((k, data[k]) for k in Event.keys[i]))
-
-    def get_neighbor_events(self, data):
-        self.neighbor_events = [SimpleEvent(d) for d in data]
-
-    def __str__(self):
-        return str(self.elements) + "\n Neighbors: " + ", ".join([ne.elements["id"] for ne in self.neighbor_events])
-
-    def consolidate_current_event(self):
-        if self.neighbor_events is not None and "id" in self.elements and "end_timestamp" not in self.elements:
-            id = self.elements["id"]
-            for ne in self.neighbor_events:
-                if ne.elements["id"] == id:
-                    self.elements["end_timestamp"] = ne.elements["end_timestamp"]
-        if "end_timestamp" not in self.elements and len(self.possible_end_timestamp) != 0:
-            for s in self.possible_end_timestamp:
-                if s["start_timestamp"] == self.elements["start_timestamp"]:
-                    self.elements["end_timestamp"] = s["end_timestamp"]
-                    break
-
-    def find_event_fragment_in_array(array, event, first = True):
-        if isinstance(array, dict):
-            seen = False
-            for i, ks in enumerate(Event.keys):
-                if len(ks) == len([k for k in ks if k in array]):
-                    seen = True
-                    if event is None:
-                        event = Event(i, array)
-                    else:
-                        event.add_fragment(i, array)
-                    # only consider the first of Event.keys
-                    break
-            if not seen:
-                for k in array:
-                    event = Event.find_event_fragment_in_array(array[k], event, False)
-        elif isinstance(array, list):
-            for e in array:
-                event = Event.find_event_fragment_in_array(e, event, False)
-        if event is not None and first:
-            event.consolidate_current_event()
-        return event
-
-#url="https://www.facebook.com/events/ical/export/?eid=2294200007432315"
-#url="https://www.facebook.com/events/2294199997432316/2294200007432315/"
-#url="https://www.facebook.com/events/635247792092358/"
-url="https://www.facebook.com/events/872781744074648"
-url="https://www.facebook.com/events/1432798543943663?"
-#url_cal = "https://www.facebook.com/events/ical/export/?eid=993406668581410"
-#url="https://jmtrivial.info"
-
-cachedir = "cache"
-result = hashlib.md5(url.encode())
-hash = result.hexdigest()
+if __name__ == "__main__":
-filename = os.path.join(cachedir, hash + ".html")
+    u2e = URL2Events(ChromiumHeadlessDownloader(), FacebookEventExtractor(single_event=True))
+    url="https://www.facebook.com/events/872781744074648"

-if os.path.isfile(filename):
-    # print("Use cache")
-    with open(filename) as f:
-        doc = "\n".join(f.readlines())
-else:
-    print("Download page")
+    events = u2e.process(url, cache = "fb.html", published = True)
-    options = Options()
-    options.add_argument("--headless=new")
-    service = Service("/usr/bin/chromedriver")
-    driver = webdriver.Chrome(service=service, options=options)
-    driver.get(url)
-    doc = driver.page_source
-    driver.quit()
-    dir = os.path.dirname(filename)
-    if not os.path.exists(dir):
-        os.makedirs(dir)
-    with open(filename, "w") as text_file:
-        text_file.write(doc)
-
-soup = BeautifulSoup(doc)
-event = None
-for json_script in soup.find_all('script', type="application/json"):
-    json_txt = json_script.get_text()
-    json_struct = json.loads(json_txt)
-    event = Event.find_event_fragment_in_array(json_struct, event)
-print(event)
+
+    exportfile = "event-facebook.json"
+    print("Saving events to file {}".format(exportfile))
+    with open(exportfile, "w") as f:
+        json.dump(events, f, indent=4, default=str)
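Note: the rewritten script is now a thin client of the shared import code: it downloads with ChromiumHeadlessDownloader, extracts with FacebookEventExtractor, and dumps whatever process() returns. Judging from set_header()/add_event()/get_structure() in the diffs below, the exported event-facebook.json should look roughly like this (a sketch; the values are illustrative and the exact field set depends on the extractor):

structure = {
    "header": {"url": "https://www.facebook.com/events/872781744074648",
               "date": "2024-02-03 18:57:39"},      # set by Extractor.set_header()
    "events": [{"title": "...", "category": None, "start_day": "2024-02-10",
                "location": "...", "description": "...", "tags": [],
                "published": True,                   # forced by process(..., published=True)
                "image": "https://...", "image_alt": "..."}],
}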


@@ -5,8 +5,6 @@ from celery import Celery
 from celery.schedules import crontab
 from celery.utils.log import get_task_logger

-from .extractors import ExtractorAllURLs
 from .import_tasks.downloader import *
 from .import_tasks.extractor import *
 from .import_tasks.importer import *
@@ -53,7 +51,7 @@ def close_import_task(taskid, success, error_message, importer):
 @app.task(bind=True)
 def import_events_from_json(self, json):
     from agenda_culturel.models import Event, BatchImportation
-    from .importation import EventsImporter
+    from .db_importer import DBImporterEvents

     # create a batch importation
     importation = BatchImportation(celery_id=self.request.id)
@@ -63,7 +61,7 @@ def import_events_from_json(self, json):
     logger.info("Import events from json: {}".format(self.request.id))

-    importer = EventsImporter(self.request.id)
+    importer = DBImporterEvents(self.request.id)

     #try:
     success, error_message = importer.import_events(json)
@@ -78,7 +76,7 @@ def import_events_from_json(self, json):
 @app.task(bind=True)
 def run_recurrent_import(self, pk):
     from agenda_culturel.models import RecurrentImport, BatchImportation
-    from .importation import EventsImporter
+    from .db_importer import DBImporterEvents
     from django.shortcuts import get_object_or_404

     logger.info("Run recurrent import: {}".format(self.request.id))
@@ -92,7 +90,7 @@ def run_recurrent_import(self, pk):
     importation.save()

     # create an importer
-    importer = EventsImporter(self.request.id)
+    importer = DBImporterEvents(self.request.id)

     # prepare downloading and extracting processes
     downloader = SimpleDownloader() if rimport.downloader == RecurrentImport.DOWNLOADER.SIMPLE else ChromiumHeadlessDownloader()


@@ -7,7 +7,7 @@ import logging
 logger = logging.getLogger(__name__)

-class EventsImporter:
+class DBImporterEvents:

     def __init__(self, celery_id):
         self.celery_id = celery_id


@@ -37,14 +37,18 @@ class ChromiumHeadlessDownloader(Downloader):
     def __init__(self):
         super().__init__()
-        options = Options()
-        options.add_argument("--headless=new")
-        service = Service("/usr/bin/chromedriver")
-        self.driver = webdriver.Chrome(service=service, options=options)
+        self.options = Options()
+        self.options.add_argument("--headless=new")
+        self.options.add_argument("--disable-dev-shm-usage")
+        self.options.add_argument("--no-sandbox")
+        self.service = Service("/usr/bin/chromedriver")

     def download(self, url):
         print("Download {}".format(url))
+        self.driver = webdriver.Chrome(service=self.service, options=self.options)
         self.driver.get(url)
-        return driver.page_source
+        doc = self.driver.page_source
+        self.driver.quit()
+        return doc
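Two things change here: the Chrome options and service are built once in __init__, and a fresh driver is created (and quit) inside each download() call, so a crashed or hung page no longer poisons subsequent downloads. Incidentally this also fixes the old return driver.page_source, which referenced an undefined name instead of self.driver. A minimal usage sketch, assuming the class as shown above:

dl = ChromiumHeadlessDownloader()
html = dl.download("https://www.facebook.com/events/872781744074648")  # illustrative URL
# each call starts its own headless Chromium and quits it before returning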


@@ -13,6 +13,10 @@ class Extractor(ABC):
     def extract(self, content, url, url_human = None):
         pass

+    @abstractmethod
+    def clean_url(url):
+        pass
+
     def set_header(self, url):
         self.header["url"] = url
         self.header["date"] = datetime.now()
@@ -20,7 +24,7 @@ class Extractor(ABC):
     def clear_events(self):
         self.events = []

-    def add_event(self, title, category, start_day, location, description, tags, uuid, recurrences=None, url_human=None, start_time=None, end_day=None, end_time=None, last_modified=None, published=False):
+    def add_event(self, title, category, start_day, location, description, tags, uuid, recurrences=None, url_human=None, start_time=None, end_day=None, end_time=None, last_modified=None, published=False, image=None, image_alt=None):
         if title is None:
             print("ERROR: cannot import an event without name")
             return
@@ -36,8 +40,11 @@ class Extractor(ABC):
             "location": location,
             "description": description,
             "tags": tags,
-            "published": published
+            "published": published,
+            "image": image,
+            "image_alt": image_alt
         }
+        # TODO: why url_human and not reference_url?
         if url_human is not None:
             event["url_human"] = url_human
         if start_time is not None:
@@ -60,3 +67,21 @@ class Extractor(ABC):
     def get_structure(self):
         return { "header": self.header, "events": self.events}
+
+    def clean_url(url):
+        from .extractor_ical import ICALExtractor
+        from .extractor_facebook import FacebookEventExtractor
+
+        result = url
+        for e in [ICALExtractor, FacebookEventExtractor]:
+            result = e.clean_url(result)
+
+        return result
+
+    def get_default_extractors(single_event=False):
+        from .extractor_ical import ICALExtractor
+        from .extractor_facebook import FacebookEventExtractor
+
+        if single_event:
+            return [FacebookEventExtractor(single_event=True)]
+        else:
+            return [ICALExtractor(), FacebookEventExtractor(single_event=False)]
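These two helpers on the base class replace the old ExtractorAllURLs registry (deleted further down): clean_url() pipes a URL through every known extractor's normalizer, and get_default_extractors() returns the extractors to try in order. A minimal sketch of the intended call pattern (the URL is illustrative):

url = Extractor.clean_url("https://m.facebook.com/events/872781744074648?ref=newsfeed")
# FacebookEventExtractor.clean_url() recognizes the host, keeps only the path
# and forces https://www.facebook.com, so the query string is dropped;
# ICALExtractor.clean_url() returns the URL unchanged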


@@ -1,65 +1,18 @@
-from abc import ABC, abstractmethod
-import icalendar
-import warnings
-from django.db import models
-from selenium import webdriver
-from selenium.webdriver.chrome.service import Service
-from selenium.webdriver.chrome.options import Options
-import urllib.request
-from tempfile import NamedTemporaryFile
-from urllib.parse import urlparse
-import os
-from bs4 import BeautifulSoup
-import json
 from datetime import datetime, date
+from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
+from urllib.parse import urlparse
+from .extractor import *
+import json
 import logging

 logger = logging.getLogger(__name__)

-class Extractor:
-    name = None
-
-    @abstractmethod
-    def is_known_url(url):
-        pass
-
-    @abstractmethod
-    def extract(url):
-        pass
-
-    @abstractmethod
-    def clean_url(url):
-        pass
-
-    def download(url):
-        try:
-            options = Options()
-            options.add_argument("--headless=new")
-            options.add_argument("--disable-dev-shm-usage")
-            options.add_argument("--no-sandbox")
-            service = Service("/usr/bin/chromedriver")
-            driver = webdriver.Chrome(service=service, options=options)
-            driver.get(url)
-            doc = driver.page_source
-            driver.quit()
-            return doc
-        except Exception as e:
-            logger.error(e)
-            return None
-
-class ExtractorFacebook(Extractor):
-    name = "Facebook"
+class FacebookEventExtractor(Extractor):

     class SimpleFacebookEvent:
@@ -70,7 +23,7 @@ class ExtractorFacebook(Extractor):
             self.elements[key] = data[key] if key in data else None

             if "parent_event" in data:
-                self.parent = ExtractorFacebook.SimpleFacebookEvent(data["parent_event"])
+                self.parent = FacebookEventExtractor.SimpleFacebookEvent(data["parent_event"])

     class FacebookEvent:
@@ -119,14 +72,14 @@ class ExtractorFacebook(Extractor):
         def add_fragment(self, i, event):
             self.fragments[i] = event
-            if ExtractorFacebook.FacebookEvent.keys[i] == ["start_timestamp", "end_timestamp"]:
+            if FacebookEventExtractor.FacebookEvent.keys[i] == ["start_timestamp", "end_timestamp"]:
                 self.get_possible_end_timestamp(i, event)
             else:
-                for k in ExtractorFacebook.FacebookEvent.keys[i]:
+                for k in FacebookEventExtractor.FacebookEvent.keys[i]:
                     if k == "comet_neighboring_siblings":
                         self.get_neighbor_events(event[k])
-                    elif k in ExtractorFacebook.FacebookEvent.rules:
-                        for nk, rule in ExtractorFacebook.FacebookEvent.rules[k].items():
+                    elif k in FacebookEventExtractor.FacebookEvent.rules:
+                        for nk, rule in FacebookEventExtractor.FacebookEvent.rules[k].items():
                             error = False
                             c = event[k]
                             for ki in rule:
@@ -141,11 +94,11 @@ class ExtractorFacebook(Extractor):
         def get_possible_end_timestamp(self, i, data):
-            self.possible_end_timestamp.append(dict((k, data[k]) for k in ExtractorFacebook.FacebookEvent.keys[i]))
+            self.possible_end_timestamp.append(dict((k, data[k]) for k in FacebookEventExtractor.FacebookEvent.keys[i]))

         def get_neighbor_events(self, data):
-            self.neighbor_events = [ExtractorFacebook.SimpleFacebookEvent(d) for d in data]
+            self.neighbor_events = [FacebookEventExtractor.SimpleFacebookEvent(d) for d in data]

         def __str__(self):
             return str(self.elements) + "\n Neighbors: " + ", ".join([ne.elements["id"] for ne in self.neighbor_events])
@@ -168,21 +121,21 @@ class ExtractorFacebook(Extractor):
             if isinstance(array, dict):
                 seen = False
-                for i, ks in enumerate(ExtractorFacebook.FacebookEvent.keys):
+                for i, ks in enumerate(FacebookEventExtractor.FacebookEvent.keys):
                     if len(ks) == len([k for k in ks if k in array]):
                         seen = True
                         if event is None:
-                            event = ExtractorFacebook.FacebookEvent(i, array)
+                            event = FacebookEventExtractor.FacebookEvent(i, array)
                         else:
                             event.add_fragment(i, array)
                         # only consider the first of FacebookEvent.keys
                         break
                 if not seen:
                     for k in array:
-                        event = ExtractorFacebook.FacebookEvent.find_event_fragment_in_array(array[k], event, False)
+                        event = FacebookEventExtractor.FacebookEvent.find_event_fragment_in_array(array[k], event, False)
             elif isinstance(array, list):
                 for e in array:
-                    event = ExtractorFacebook.FacebookEvent.find_event_fragment_in_array(e, event, False)
+                    event = FacebookEventExtractor.FacebookEvent.find_event_fragment_in_array(e, event, False)

             if event is not None and first:
                 event.consolidate_current_event()
@@ -190,28 +143,33 @@ class ExtractorFacebook(Extractor):
         def build_event(self, url):
-            from .models import Event
-
-            image = self.get_element("image")
+            return {
+                "title": self.get_element("name"),
+                "category": None,
+                "start_day": self.get_element_date("start_timestamp"),
+                "location": self.get_element("event_place_name"),
+                "description": self.get_element("description"),
+                "tags": [],
+                "uuid": url,
+                "url_human": url,
+                "start_time": self.get_element_time("start_timestamp"),
+                "end_day": self.get_element_date("end_timestamp"),
+                "end_time": self.get_element_time("end_timestamp"),
+                "image": self.get_element("image"),
+                "image_alt": self.get_element("image"),
+            }
-            return Event(title=self.get_element("name"),
-                         status=Event.STATUS.DRAFT,
-                         start_day=self.get_element_date("start_timestamp"),
-                         start_time=self.get_element_time("start_timestamp"),
-                         end_day=self.get_element_date("end_timestamp"),
-                         end_time=self.get_element_time("end_timestamp"),
-                         location=self.get_element("event_place_name"),
-                         description=self.get_element("description"),
-                         image=self.get_element("image"),
-                         image_alt=self.get_element("image_alt"),
-                         uuids=[url],
-                         reference_urls=[url])

+    def __init__(self, single_event=False):
+        self.single_event = single_event
+        super().__init__()

     def clean_url(url):
-        if ExtractorFacebook.is_known_url(url):
+        if FacebookEventExtractor.is_known_url(url):
             u = urlparse(url)
             return "https://www.facebook.com" + u.path
         else:
@@ -222,46 +180,23 @@ class ExtractorFacebook(Extractor):
         return u.netloc in ["facebook.com", "www.facebook.com", "m.facebook.com"]

-    def process_page(txt, url):
+    def extract(self, content, url, url_human = None, default_values = None, published = False):
+        # NOTE: this method does not use url_human = None and default_values = None
+
+        # get step by step all information from the content
         fevent = None
-        soup = BeautifulSoup(txt, "html.parser")
+        soup = BeautifulSoup(content, "html.parser")
         for json_script in soup.find_all('script', type="application/json"):
             json_txt = json_script.get_text()
             json_struct = json.loads(json_txt)
-            fevent = ExtractorFacebook.FacebookEvent.find_event_fragment_in_array(json_struct, fevent)
+            fevent = FacebookEventExtractor.FacebookEvent.find_event_fragment_in_array(json_struct, fevent)

         if fevent is not None:
-            logger.info("Facebook event: " + str(fevent))
-            result = fevent.build_event(url)
-            return result
-        return None
-
-class ExtractorAllURLs:
-    extractors = [ExtractorFacebook]
-
-    def clean_url(url):
-        result = url
-        for e in ExtractorAllURLs.extractors:
-            result = e.clean_url(result)
-        return result
-
-    def extract(url):
-        logger.info("Run extraction")
-        txt = Extractor.download(url)
-
-        if txt is None:
-            logger.info("Cannot download url")
-            return None
-
-        for e in ExtractorAllURLs.extractors:
-            result = e.process_page(txt, url)
-            if result is not None:
-                return result
-            else:
-                logger.info("Not a " + e.name + " link")
+            self.set_header(url)
+            event = fevent.build_event(url)
+            logger.warning("published: " + str(published))
+            event["published"] = published
+            self.add_event(**event)
+            return self.get_structure()
         return None
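For readers new to this extractor: Facebook embeds event data as fragments scattered across many <script type="application/json"> blobs, and find_event_fragment_in_array() walks each decoded blob recursively, matching dicts against the key signatures in FacebookEvent.keys and merging the fragments into a single event. A standalone sketch of the same technique (illustrative names, not the classes above):

import json

def find_fragments(node, signatures, found):
    # depth-first walk over decoded JSON: keep any dict carrying all keys
    # of one of the known signatures, otherwise recurse into its values
    if isinstance(node, dict):
        if any(all(k in node for k in sig) for sig in signatures):
            found.append(node)
        else:
            for v in node.values():
                find_fragments(v, signatures, found)
    elif isinstance(node, list):
        for item in node:
            find_fragments(item, signatures, found)
    return found

doc = '{"a": [{"start_timestamp": 1, "end_timestamp": 2}], "b": {"x": 0}}'
find_fragments(json.loads(doc), [["start_timestamp", "end_timestamp"]], [])
# -> [{'start_timestamp': 1, 'end_timestamp': 2}]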


@@ -39,6 +39,9 @@ class ICALExtractor(Extractor):
         return day, time

+    def clean_url(url):
+        return url
+
     def extract(self, content, url, url_human = None, default_values = None, published = False):
         warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)


@@ -6,13 +6,13 @@ from .extractor import *

 class URL2Events:

-    def __init__(self, downloader, extractor):
+    def __init__(self, downloader = SimpleDownloader(), extractor = None, single_event=False):
         self.downloader = downloader
         self.extractor = extractor
+        self.single_event = single_event

-    def process(self, url, url_human = None, cache = None, default_values = None, published = False):
+    def get_content(self, url, cache = None):
         if cache and os.path.exists(cache):
             print("Loading cache ({})".format(cache))
             with open(cache) as f:
@@ -27,5 +27,25 @@ class URL2Events:
                 os.makedirs(dir)
             with open(cache, "w") as text_file:
                 text_file.write(content)
+        return content
+
+    def process(self, url, url_human = None, cache = None, default_values = None, published = False):
+        content = self.get_content(url, cache)
+        if content is None:
+            return None
+
+        if self.extractor is not None:
+            return self.extractor.extract(content, url, url_human, default_values, published)
+        else:
+            # if the extractor is not defined, use a list of default extractors
+            for e in Extractor.get_default_extractors(self.single_event):
+                #try:
+                events = e.extract(content, url, url_human, default_values, published)
+                if events is not None:
+                    return events
+                #except:
+                #    continue
+            return None
-        return self.extractor.extract(content, url, url_human, default_values, published)
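With these defaults a caller no longer has to pick an extractor up front: get_content() handles the download-and-cache step, and process() either delegates to the explicit extractor or tries each default one until something matches. A minimal usage sketch (illustrative URL):

u2e = URL2Events(ChromiumHeadlessDownloader(), single_event=True)
structure = u2e.process("https://www.facebook.com/events/872781744074648",
                        cache="fb.html", published=False)
# returns the matching extractor's get_structure() result (a dict with
# "header" and "events" keys), or None if no extractor produced events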


@@ -345,6 +345,9 @@ article#filters {
 .helptext, .subentry-search, .remarque {
     font-size: 80%;
     margin-top: -0.7em;
+    ul {
+        font-size: 100%;
+    }
 }

 .django-ckeditor-widget {
@@ -648,6 +651,9 @@ aside nav a.badge {
 /* styling for recurrences */
 .container-fluid article form p .recurrence-widget {
     @extend article;
+    width: 100%;
+    border: 0;

     .header a, .add-button {
         @extend [role="button"];


@@ -27,11 +27,14 @@
 {% load static_content_extra %}

-{% if object %}
-<h1>Édition de l'événement {{ object.title }} ({{ object.start_day }})</h1>
-{% else %}
-<h1>Édition de l'événement importé</h1>
-{% endif %}
 <article>
+    <header>
+        {% if object %}
+        <h1>Édition de l'événement {{ object.title }} ({{ object.start_day }})</h1>
+        {% else %}
+        <h1>Édition de l'événement importé</h1>
+        {% endif %}
+    </header>
     <div id="container"></div>

     <form method="post">{% csrf_token %}
@@ -42,5 +45,37 @@
             <input type="submit" value="Enregistrer">
         </div>
     </form>
+    {% if object %}
+    <footer class="remarque">
+        Informations complémentaires non éditables&nbsp;:
+        <ul>
+            {% if object.created_date %}<li>Création&nbsp;: {{ object.created_date }}</li>{% endif %}
+            {% if object.modified_date %}<li>Dernière modification&nbsp;: {{ object.modified_date }}</li>{% endif %}
+            {% if object.imported_date %}<li>Dernière importation&nbsp;: {{ object.imported_date }}</li>{% endif %}
+            {% if object.uuids %}
+                {% if object.uuids|length > 0 %}
+                <li>UUIDs (identifiants uniques d'événements dans les sources)&nbsp;:
+                    <ul>
+                    {% for u in object.uuids %}
+                        <li>{{ u }}</li>
+                    {% endfor %}
+                    </ul></li>
+                {% endif %}
+            {% endif %}
+            {% if object.import_sources %}
+                {% if object.import_sources|length > 0 %}
+                <li>Sources d'import&nbsp;:
+                    <ul>
+                    {% for u in object.import_sources %}
+                        <li><a href="{{ u }}">{{ u }}</a></li>
+                    {% endfor %}
+                    </ul>
+                </li>
+                {% endif %}
+            {% endif %}
+        </ul>
+    </footer>
+    {% endif %}
 </article>

 {% endblock %}


@@ -30,7 +30,10 @@ from django.contrib import messages
 from django.contrib.messages.views import SuccessMessageMixin

 from .calendar import CalendarMonth, CalendarWeek, CalendarDay
-from .extractors import ExtractorAllURLs
+from .import_tasks.importer import URL2Events
+from .import_tasks.extractor import Extractor
+from .import_tasks.downloader import ChromiumHeadlessDownloader

 from .celery import app as celery_app, import_events_from_json, run_recurrent_import
@@ -262,7 +265,7 @@ def import_from_url(request):
     logger = logging.getLogger(__name__)

     if request.method == 'POST' and "title" in request.POST:
-        form = EventForm(request.POST)
+        form = EventForm(request.POST, is_authenticated=request.user.is_authenticated)
         if form.is_valid():
             new_event = form.save()
             if request.user.is_authenticated:
@@ -284,25 +287,32 @@ def import_from_url(request):
     form_event = EventForm(initial=initial)

     if request.method == 'POST':
-        form = EventSubmissionForm(request.POST)
+        form = EventSubmissionForm(request.POST)
         if form.is_valid():
             cd = form.cleaned_data
             url = cd.get('url')
-            url = ExtractorAllURLs.clean_url(url)
+            url = Extractor.clean_url(url)

             existing = Event.objects.filter(uuids__contains=[url])
             if len(existing) == 0:
-                event = ExtractorAllURLs.extract(url)
+                event = None
+                u2e = URL2Events(ChromiumHeadlessDownloader(), single_event=True)
+                events_structure = u2e.process(url, published=request.user.is_authenticated)
+
+                if events_structure is not None and "events" in events_structure and len(events_structure["events"]) > 0:
+                    event = Event.from_structure(events_structure["events"][0], events_structure["header"]["url"])
+                    # TODO: use celery to import the other events

                 if event != None:
-                    form = EventForm(instance=event)
+                    form = EventForm(instance=event, is_authenticated=request.user.is_authenticated)
                     messages.success(request, _("The event has been successfully extracted, and you can now submit it after modifying it if necessary."))
                     return render(request, 'agenda_culturel/event_form.html', context={'form': form })
                 else:
-                    form = EventForm(initial={'reference_urls': [url]})
+                    form = EventForm(initial={'reference_urls': [url]}, is_authenticated=request.user.is_authenticated)
                     messages.error(request, _("Unable to extract an event from the proposed URL. Please use the form below to submit the event."))
                     return render(request, 'agenda_culturel/import.html', context={'form': form, 'form_event': form_event})
     else:
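Taken together, the view now drives the same download/extract pipeline as the recurrent imports and the experimentation script. A condensed sketch of the shared flow, assuming the classes introduced in this commit (submitted_url stands in for the form input):

url = Extractor.clean_url(submitted_url)
u2e = URL2Events(ChromiumHeadlessDownloader(), single_event=True)
structure = u2e.process(url, published=request.user.is_authenticated)
if structure is not None and structure.get("events"):
    event = Event.from_structure(structure["events"][0], structure["header"]["url"])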