The Facebook import now shares its code with the other imports
Fix #80
This commit is contained in: parent 3ebc53995b · commit c5db83cf87
experimentations/.gitignore (vendored, new file, +3)
@@ -0,0 +1,3 @@
+*.json
+*.html
+*.ical
@@ -1,171 +1,40 @@
 #!/usr/bin/python3
 # coding: utf-8
 
-import requests
-import hashlib
 import os
-from selenium import webdriver
-from selenium.webdriver.chrome.service import Service
-from selenium.webdriver.chrome.options import Options
-
-
-from bs4 import BeautifulSoup
-
 import json
+import sys
 
+# getting the name of the directory
+# where the this file is present.
+current = os.path.dirname(os.path.realpath(__file__))
+
+# Getting the parent directory name
+# where the current directory is present.
+parent = os.path.dirname(current)
+
+# adding the parent directory to
+# the sys.path.
+sys.path.append(parent)
+
+from src.agenda_culturel.import_tasks.downloader import *
+from src.agenda_culturel.import_tasks.extractor import *
+from src.agenda_culturel.import_tasks.importer import *
+from src.agenda_culturel.import_tasks.extractor_facebook import *
 
-class SimpleEvent:
-
-    def __init__(self, data):
-        self.elements = {}
-
-        for key in ["id", "start_timestamp", "end_timestamp"]:
-            self.elements[key] = data[key] if key in data else None
-
-        if "parent_event" in data:
-            self.parent = SimpleEvent(data["parent_event"])
-
-
-class Event:
-
-    name = "event"
-
-    keys = [
-        ["start_time_formatted", 'start_timestamp',
-        'is_past',
-        "name",
-        "price_info",
-        "cover_media_renderer",
-        "event_creator",
-        "id",
-        "day_time_sentence",
-        "event_place",
-        "comet_neighboring_siblings"],
-        ["event_description"],
-        ["start_timestamp", "end_timestamp"]
-    ]
-
-    rules = {
-        "event_description": { "description": ["text"]},
-        "cover_media_renderer": {"image_alt": ["cover_photo", "photo", "accessibility_caption"], "image": ["cover_photo", "photo", "full_image", "uri"]},
-        "event_creator": { "event_creator_name": ["name"], "event_creator_url": ["url"] },
-        "event_place": {"event_place_name": ["name"] }
-    }
-
-    def __init__(self, i, event):
-        self.fragments = {}
-        self.elements = {}
-        self.neighbor_events = None
-        self.possible_end_timestamp = []
-        self.add_fragment(i, event)
-
-    def add_fragment(self, i, event):
-        self.fragments[i] = event
-
-        if Event.keys[i] == ["start_timestamp", "end_timestamp"]:
-            self.get_possible_end_timestamp(i, event)
-        else:
-            for k in Event.keys[i]:
-                if k == "comet_neighboring_siblings":
-                    self.get_neighbor_events(event[k])
-                elif k in Event.rules:
-                    for nk, rule in Event.rules[k].items():
-                        c = event[k]
-                        for ki in rule:
-                            c = c[ki]
-                        self.elements[nk] = c
-                else:
-                    self.elements[k] = event[k]
-
-    def get_possible_end_timestamp(self, i, data):
-        self.possible_end_timestamp.append(dict((k, data[k]) for k in Event.keys[i]))
-
-    def get_neighbor_events(self, data):
-        self.neighbor_events = [SimpleEvent(d) for d in data]
-
-    def __str__(self):
-        return str(self.elements) + "\n Neighbors: " + ", ".join([ne.elements["id"] for ne in self.neighbor_events])
-
-    def consolidate_current_event(self):
-        if self.neighbor_events is not None and "id" in self.elements and "end_timestamp" not in self.elements:
-            id = self.elements["id"]
-            for ne in self.neighbor_events:
-                if ne.elements["id"] == id:
-                    self.elements["end_timestamp"] = ne.elements["end_timestamp"]
-
-        if "end_timestamp" not in self.elements and len(self.possible_end_timestamp) != 0:
-            for s in self.possible_end_timestamp:
-                if s["start_timestamp"] == self.elements["start_timestamp"]:
-                    self.elements["end_timestamp"] = s["end_timestamp"]
-                    break
-
-    def find_event_fragment_in_array(array, event, first = True):
-        if isinstance(array, dict):
-            seen = False
-            for i, ks in enumerate(Event.keys):
-                if len(ks) == len([k for k in ks if k in array]):
-                    seen = True
-                    if event is None:
-                        event = Event(i, array)
-                    else:
-                        event.add_fragment(i, array)
-                    # only consider the first of Event.keys
-                    break
-            if not seen:
-                for k in array:
-                    event = Event.find_event_fragment_in_array(array[k], event, False)
-        elif isinstance(array, list):
-            for e in array:
-                event = Event.find_event_fragment_in_array(e, event, False)
-
-        if event is not None and first:
-            event.consolidate_current_event()
-        return event
-
-
-#url="https://www.facebook.com/events/ical/export/?eid=2294200007432315"
-#url="https://www.facebook.com/events/2294199997432316/2294200007432315/"
-#url="https://www.facebook.com/events/635247792092358/"
-url="https://www.facebook.com/events/872781744074648"
-url="https://www.facebook.com/events/1432798543943663?"
-#url_cal = "https://www.facebook.com/events/ical/export/?eid=993406668581410"
-#url="https://jmtrivial.info"
-
-cachedir = "cache"
-
-result = hashlib.md5(url.encode())
-hash = result.hexdigest()
-
-filename = os.path.join(cachedir, hash + ".html")
-
-if os.path.isfile(filename):
-    # print("Use cache")
-    with open(filename) as f:
-        doc = "\n".join(f.readlines())
-else:
-    print("Download page")
-
-    options = Options()
-    options.add_argument("--headless=new")
-    service = Service("/usr/bin/chromedriver")
-
-    driver = webdriver.Chrome(service=service, options=options)
-    driver.get(url)
-    doc = driver.page_source
-    driver.quit()
-
-    dir = os.path.dirname(filename)
-    if not os.path.exists(dir):
-        os.makedirs(dir)
-    with open(filename, "w") as text_file:
-        text_file.write(doc)
-
-
-soup = BeautifulSoup(doc)
-
-event = None
-for json_script in soup.find_all('script', type="application/json"):
-    json_txt = json_script.get_text()
-    json_struct = json.loads(json_txt)
-
-    event = Event.find_event_fragment_in_array(json_struct, event)
-
-print(event)
+
+if __name__ == "__main__":
+
+    u2e = URL2Events(ChromiumHeadlessDownloader(), FacebookEventExtractor(single_event=True))
+    url="https://www.facebook.com/events/872781744074648"
+
+    events = u2e.process(url, cache = "fb.html", published = True)
+
+    exportfile = "event-facebook.json"
+    print("Saving events to file {}".format(exportfile))
+    with open(exportfile, "w") as f:
+        json.dump(events, f, indent=4, default=str)
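With this rewrite the experimentation script no longer embeds its own scraper; it simply saves whatever URL2Events.process() returns. As a hedged illustration (field names taken from Extractor.set_header() and add_event() as modified later in this commit), the exported event-facebook.json can be read back like this:

    import json

    # The structure mirrors Extractor.get_structure():
    # {"header": {"url": ..., "date": ...}, "events": [...]}
    with open("event-facebook.json") as f:
        structure = json.load(f)

    print(structure["header"]["url"])  # source URL recorded by set_header()
    for event in structure["events"]:
        # each event carries title, start_day, image, image_alt, published, ...
        print(event["title"], event["start_day"], event["published"])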
@@ -5,8 +5,6 @@ from celery import Celery
 from celery.schedules import crontab
 from celery.utils.log import get_task_logger
 
-from .extractors import ExtractorAllURLs
-
 from .import_tasks.downloader import *
 from .import_tasks.extractor import *
 from .import_tasks.importer import *

@@ -53,7 +51,7 @@ def close_import_task(taskid, success, error_message, importer):
 @app.task(bind=True)
 def import_events_from_json(self, json):
     from agenda_culturel.models import Event, BatchImportation
-    from .importation import EventsImporter
+    from .db_importer import DBImporterEvents
 
     # create a batch importation
     importation = BatchImportation(celery_id=self.request.id)

@@ -63,7 +61,7 @@ def import_events_from_json(self, json):
 
     logger.info("Import events from json: {}".format(self.request.id))
 
-    importer = EventsImporter(self.request.id)
+    importer = DBImporterEvents(self.request.id)
 
     #try:
     success, error_message = importer.import_events(json)

@@ -78,7 +76,7 @@ def import_events_from_json(self, json):
 @app.task(bind=True)
 def run_recurrent_import(self, pk):
     from agenda_culturel.models import RecurrentImport, BatchImportation
-    from .importation import EventsImporter
+    from .db_importer import DBImporterEvents
     from django.shortcuts import get_object_or_404
 
     logger.info("Run recurrent import: {}".format(self.request.id))

@@ -92,7 +90,7 @@ def run_recurrent_import(self, pk):
     importation.save()
 
     # create an importer
-    importer = EventsImporter(self.request.id)
+    importer = DBImporterEvents(self.request.id)
 
     # prepare downloading and extracting processes
     downloader = SimpleDownloader() if rimport.downloader == RecurrentImport.DOWNLOADER.SIMPLE else ChromiumHeadlessDownloader()
@@ -7,7 +7,7 @@ import logging
 logger = logging.getLogger(__name__)
 
 
-class EventsImporter:
+class DBImporterEvents:
 
     def __init__(self, celery_id):
         self.celery_id = celery_id
@@ -37,14 +37,18 @@ class ChromiumHeadlessDownloader(Downloader):
 
     def __init__(self):
         super().__init__()
-        options = Options()
-        options.add_argument("--headless=new")
-        service = Service("/usr/bin/chromedriver")
-        self.driver = webdriver.Chrome(service=service, options=options)
+        self.options = Options()
+        self.options.add_argument("--headless=new")
+        self.options.add_argument("--disable-dev-shm-usage")
+        self.options.add_argument("--no-sandbox")
+        self.service = Service("/usr/bin/chromedriver")
 
 
     def download(self, url):
         print("Download {}".format(url))
+        self.driver = webdriver.Chrome(service=self.service, options=self.options)
+
         self.driver.get(url)
-        return driver.page_source
+        doc = self.driver.page_source
+        self.driver.quit()
+
+        return doc
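The downloader now builds its Chrome options and service once, but starts a fresh headless browser per download() call and quits it before returning, so a long-running worker no longer holds a browser open between imports (the old code also returned the undefined name driver instead of self.driver). A minimal usage sketch, assuming chromedriver is installed at /usr/bin/chromedriver as hard-coded above:

    # Each call starts and stops its own headless Chrome instance.
    dl = ChromiumHeadlessDownloader()
    html = dl.download("https://www.facebook.com/events/872781744074648")
    # html is the rendered page source, ready to hand to an extractor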
@@ -13,6 +13,10 @@ class Extractor(ABC):
     def extract(self, content, url, url_human = None):
         pass
 
+    @abstractmethod
+    def clean_url(url):
+        pass
+
     def set_header(self, url):
         self.header["url"] = url
         self.header["date"] = datetime.now()

@@ -20,7 +24,7 @@ class Extractor(ABC):
     def clear_events(self):
         self.events = []
 
-    def add_event(self, title, category, start_day, location, description, tags, uuid, recurrences=None, url_human=None, start_time=None, end_day=None, end_time=None, last_modified=None, published=False):
+    def add_event(self, title, category, start_day, location, description, tags, uuid, recurrences=None, url_human=None, start_time=None, end_day=None, end_time=None, last_modified=None, published=False, image=None, image_alt=None):
         if title is None:
             print("ERROR: cannot import an event without name")
             return

@@ -36,8 +40,11 @@ class Extractor(ABC):
             "location": location,
             "description": description,
             "tags": tags,
-            "published": published
+            "published": published,
+            "image": image,
+            "image_alt": image_alt
         }
+        # TODO: pourquoi url_human et non reference_url
         if url_human is not None:
             event["url_human"] = url_human
         if start_time is not None:

@@ -60,3 +67,21 @@ class Extractor(ABC):
 
     def get_structure(self):
         return { "header": self.header, "events": self.events}
+
+    def clean_url(url):
+        from .extractor_ical import ICALExtractor
+        from .extractor_facebook import FacebookEventExtractor
+
+        result = url
+        for e in [ICALExtractor, FacebookEventExtractor]:
+            result = e.clean_url(result)
+        return result
+
+    def get_default_extractors(single_event=False):
+        from .extractor_ical import ICALExtractor
+        from .extractor_facebook import FacebookEventExtractor
+
+        if single_event:
+            return [FacebookEventExtractor(single_event=True)]
+        else:
+            return [ICALExtractor(), FacebookEventExtractor(single_event=False)]
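These two class-level helpers replace the old ExtractorAllURLs; the imports are deferred inside the function bodies so that extractor.py does not circularly import the modules that subclass it. A hedged usage sketch:

    # FacebookEventExtractor.clean_url drops the query string from known
    # facebook.com URLs; ICALExtractor.clean_url returns the URL unchanged.
    url = Extractor.clean_url("https://www.facebook.com/events/1432798543943663?ref=feed")
    # -> "https://www.facebook.com/events/1432798543943663"

    # single_event=True restricts the defaults to the Facebook extractor.
    extractors = Extractor.get_default_extractors(single_event=True)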
@@ -1,65 +1,18 @@
-from abc import ABC, abstractmethod
-
-from django.db import models
-
-from selenium import webdriver
-from selenium.webdriver.chrome.service import Service
-from selenium.webdriver.chrome.options import Options
-
-import urllib.request
-from tempfile import NamedTemporaryFile
-from urllib.parse import urlparse
-import os
-
-from bs4 import BeautifulSoup
-
-import json
+import icalendar
+import warnings
+
 from datetime import datetime, date
+from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
+from urllib.parse import urlparse
+
+from .extractor import *
+import json
+
 import logging
 logger = logging.getLogger(__name__)
 
 
-class Extractor:
-
-    name = None
-
-    @abstractmethod
-    def is_known_url(url):
-        pass
-
-    @abstractmethod
-    def extract(url):
-        pass
-
-    @abstractmethod
-    def clean_url(url):
-        pass
-
-    def download(url):
-        try:
-            options = Options()
-            options.add_argument("--headless=new")
-            options.add_argument("--disable-dev-shm-usage")
-            options.add_argument("--no-sandbox")
-            service = Service("/usr/bin/chromedriver")
-
-            driver = webdriver.Chrome(service=service, options=options)
-            driver.get(url)
-            doc = driver.page_source
-            driver.quit()
-            return doc
-        except Exception as e:
-            logger.error(e)
-            return None
-
-
-class ExtractorFacebook(Extractor):
-
-    name = "Facebook"
+class FacebookEventExtractor(Extractor):
 
     class SimpleFacebookEvent:

@@ -70,7 +23,7 @@ class ExtractorFacebook(Extractor):
             self.elements[key] = data[key] if key in data else None
 
             if "parent_event" in data:
-                self.parent = ExtractorFacebook.SimpleFacebookEvent(data["parent_event"])
+                self.parent = FacebookEventExtractor.SimpleFacebookEvent(data["parent_event"])
 
 
     class FacebookEvent:

@@ -119,14 +72,14 @@ class ExtractorFacebook(Extractor):
         def add_fragment(self, i, event):
             self.fragments[i] = event
 
-            if ExtractorFacebook.FacebookEvent.keys[i] == ["start_timestamp", "end_timestamp"]:
+            if FacebookEventExtractor.FacebookEvent.keys[i] == ["start_timestamp", "end_timestamp"]:
                 self.get_possible_end_timestamp(i, event)
             else:
-                for k in ExtractorFacebook.FacebookEvent.keys[i]:
+                for k in FacebookEventExtractor.FacebookEvent.keys[i]:
                     if k == "comet_neighboring_siblings":
                         self.get_neighbor_events(event[k])
-                    elif k in ExtractorFacebook.FacebookEvent.rules:
-                        for nk, rule in ExtractorFacebook.FacebookEvent.rules[k].items():
+                    elif k in FacebookEventExtractor.FacebookEvent.rules:
+                        for nk, rule in FacebookEventExtractor.FacebookEvent.rules[k].items():
                             error = False
                             c = event[k]
                             for ki in rule:

@@ -141,11 +94,11 @@ class ExtractorFacebook(Extractor):
 
         def get_possible_end_timestamp(self, i, data):
-            self.possible_end_timestamp.append(dict((k, data[k]) for k in ExtractorFacebook.FacebookEvent.keys[i]))
+            self.possible_end_timestamp.append(dict((k, data[k]) for k in FacebookEventExtractor.FacebookEvent.keys[i]))
 
         def get_neighbor_events(self, data):
-            self.neighbor_events = [ExtractorFacebook.SimpleFacebookEvent(d) for d in data]
+            self.neighbor_events = [FacebookEventExtractor.SimpleFacebookEvent(d) for d in data]
 
         def __str__(self):
             return str(self.elements) + "\n Neighbors: " + ", ".join([ne.elements["id"] for ne in self.neighbor_events])

@@ -168,21 +121,21 @@ class ExtractorFacebook(Extractor):
             if isinstance(array, dict):
 
                 seen = False
-                for i, ks in enumerate(ExtractorFacebook.FacebookEvent.keys):
+                for i, ks in enumerate(FacebookEventExtractor.FacebookEvent.keys):
                     if len(ks) == len([k for k in ks if k in array]):
                         seen = True
                         if event is None:
-                            event = ExtractorFacebook.FacebookEvent(i, array)
+                            event = FacebookEventExtractor.FacebookEvent(i, array)
                         else:
                             event.add_fragment(i, array)
                         # only consider the first of FacebookEvent.keys
                         break
                 if not seen:
                     for k in array:
-                        event = ExtractorFacebook.FacebookEvent.find_event_fragment_in_array(array[k], event, False)
+                        event = FacebookEventExtractor.FacebookEvent.find_event_fragment_in_array(array[k], event, False)
             elif isinstance(array, list):
                 for e in array:
-                    event = ExtractorFacebook.FacebookEvent.find_event_fragment_in_array(e, event, False)
+                    event = FacebookEventExtractor.FacebookEvent.find_event_fragment_in_array(e, event, False)
 
             if event is not None and first:
                 event.consolidate_current_event()

@@ -190,28 +143,33 @@ class ExtractorFacebook(Extractor):
 
         def build_event(self, url):
-            from .models import Event
-
             image = self.get_element("image")
 
-            return Event(title=self.get_element("name"),
-                status=Event.STATUS.DRAFT,
-                start_day=self.get_element_date("start_timestamp"),
-                start_time=self.get_element_time("start_timestamp"),
-                end_day=self.get_element_date("end_timestamp"),
-                end_time=self.get_element_time("end_timestamp"),
-                location=self.get_element("event_place_name"),
-                description=self.get_element("description"),
-                image=self.get_element("image"),
-                image_alt=self.get_element("image_alt"),
-                uuids=[url],
-                reference_urls=[url])
+            return {
+                "title": self.get_element("name"),
+                "category": None,
+                "start_day": self.get_element_date("start_timestamp"),
+                "location": self.get_element("event_place_name"),
+                "description": self.get_element("description"),
+                "tags": [],
+                "uuid": url,
+                "url_human": url,
+                "start_time": self.get_element_time("start_timestamp"),
+                "end_day": self.get_element_date("end_timestamp"),
+                "end_time": self.get_element_time("end_timestamp"),
+                "image": self.get_element("image"),
+                "image_alt": self.get_element("image"),
+            }
+
+    def __init__(self, single_event=False):
+        self.single_event = single_event
+        super().__init__()
 
     def clean_url(url):
 
-        if ExtractorFacebook.is_known_url(url):
+        if FacebookEventExtractor.is_known_url(url):
             u = urlparse(url)
             return "https://www.facebook.com" + u.path
         else:

@@ -222,46 +180,23 @@ class ExtractorFacebook(Extractor):
         return u.netloc in ["facebook.com", "www.facebook.com", "m.facebook.com"]
 
 
-    def process_page(txt, url):
+    def extract(self, content, url, url_human = None, default_values = None, published = False):
+        # NOTE: this method does not use url_human = None and default_values = None
+
+        # get step by step all information from the content
         fevent = None
-        soup = BeautifulSoup(txt, "html.parser")
+        soup = BeautifulSoup(content, "html.parser")
         for json_script in soup.find_all('script', type="application/json"):
             json_txt = json_script.get_text()
             json_struct = json.loads(json_txt)
-            fevent = ExtractorFacebook.FacebookEvent.find_event_fragment_in_array(json_struct, fevent)
+            fevent = FacebookEventExtractor.FacebookEvent.find_event_fragment_in_array(json_struct, fevent)
 
         if fevent is not None:
-            logger.info("Facebook event: " + str(fevent))
-            result = fevent.build_event(url)
-            return result
+            self.set_header(url)
+            event = fevent.build_event(url)
+            logger.warning("published: " + str(published))
+            event["published"] = published
+            self.add_event(**event)
+            return self.get_structure()
 
-        return None
-
-
-class ExtractorAllURLs:
-
-    extractors = [ExtractorFacebook]
-
-    def clean_url(url):
-        result = url
-        for e in ExtractorAllURLs.extractors:
-            result = e.clean_url(result)
-        return result
-
-    def extract(url):
-        logger.info("Run extraction")
-
-        txt = Extractor.download(url)
-        if txt is None:
-            logger.info("Cannot download url")
-            return None
-
-        for e in ExtractorAllURLs.extractors:
-            result = e.process_page(txt, url)
-            if result is not None:
-                return result
-            else:
-                logger.info("Not a " + e.name + " link")
-
         return None
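FacebookEventExtractor.extract() now fills the shared header/events structure instead of instantiating a Django Event model, which is what lets the experimentations script run without Django. A hedged sketch of calling it directly, where html stands for page source obtained from a downloader:

    extractor = FacebookEventExtractor(single_event=True)
    structure = extractor.extract(html, url, published=False)
    if structure is not None:
        print(structure["events"][0]["title"])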
@@ -39,6 +39,9 @@ class ICALExtractor(Extractor):
 
         return day, time
 
+    def clean_url(url):
+        return url
+
 
     def extract(self, content, url, url_human = None, default_values = None, published = False):
         warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
@@ -6,13 +6,13 @@ from .extractor import *
 
 class URL2Events:
 
-    def __init__(self, downloader, extractor):
+    def __init__(self, downloader = SimpleDownloader(), extractor = None, single_event=False):
         self.downloader = downloader
         self.extractor = extractor
+        self.single_event = single_event
 
-    def process(self, url, url_human = None, cache = None, default_values = None, published = False):
+    def get_content(self, url, cache = None):
 
         if cache and os.path.exists(cache):
             print("Loading cache ({})".format(cache))
             with open(cache) as f:

@@ -27,5 +27,25 @@ class URL2Events:
                 os.makedirs(dir)
             with open(cache, "w") as text_file:
                 text_file.write(content)
+        return content
+
+
+    def process(self, url, url_human = None, cache = None, default_values = None, published = False):
+        content = self.get_content(url, cache)
+
+        if content is None:
+            return None
+
+        if self.extractor is not None:
+            return self.extractor.extract(content, url, url_human, default_values, published)
+        else:
+            # if the extractor is not defined, use a list of default extractors
+            for e in Extractor.get_default_extractors(self.single_event):
+                #try:
+                events = e.extract(content, url, url_human, default_values, published)
+                if events is not None:
+                    return events
+                #except:
+                #    continue
+            return None
 
-        return self.extractor.extract(content, url, url_human, default_values, published)
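Splitting process() into get_content() plus extraction makes the extractor argument optional: when none is supplied, the default extractors are tried in turn until one returns events. A hedged sketch of both calling modes:

    # Explicit extractor, as in the experimentation script above:
    u2e = URL2Events(ChromiumHeadlessDownloader(), FacebookEventExtractor(single_event=True))
    events = u2e.process(url, cache="fb.html", published=True)

    # No extractor: fall back on Extractor.get_default_extractors():
    u2e = URL2Events(ChromiumHeadlessDownloader(), single_event=True)
    events = u2e.process(url, published=False)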
@@ -345,6 +345,9 @@ article#filters {
 .helptext, .subentry-search, .remarque {
     font-size: 80%;
     margin-top: -0.7em;
+    ul {
+        font-size: 100%;
+    }
 }
 
 .django-ckeditor-widget {

@@ -648,6 +651,9 @@ aside nav a.badge {
 
 /* mise en forme pour les récurrences */
 .container-fluid article form p .recurrence-widget {
+    @extend article;
+    width: 100%;
+    border: 0;
 
     .header a, .add-button {
         @extend [role="button"];
@@ -27,11 +27,14 @@
 
 {% load static_content_extra %}
 
-{% if object %}
-<h1>Édition de l'événement {{ object.title }} ({{ object.start_day }})</h1>
-{% else %}
-<h1>Édition de l'événement importé</h1>
-{% endif %}
+<article>
+    <header>
+        {% if object %}
+            <h1>Édition de l'événement {{ object.title }} ({{ object.start_day }})</h1>
+        {% else %}
+            <h1>Édition de l'événement importé</h1>
+        {% endif %}
+    </header>
 
 <div id="container"></div>
 <form method="post">{% csrf_token %}

@@ -42,5 +45,37 @@
         <input type="submit" value="Enregistrer">
     </div>
 </form>
+{% if object %}
+<footer class="remarque">
+    Informations complémentaires non éditables :
+    <ul>
+        {% if object.created_date %}<li>Création : {{ object.created_date }}</li>{% endif %}
+        {% if object.modified_date %}<li>Dernière modification : {{ object.modified_date }}</li>{% endif %}
+        {% if object.imported_date %}<li>Dernière importation : {{ object.imported_date }}</li>{% endif %}
+        {% if object.uuids %}
+            {% if object.uuids|length > 0 %}
+                <li>UUIDs (identifiants uniques d'événements dans les sources) :
+                <ul>
+                    {% for u in object.uuids %}
+                        <li>{{ u }}</li>
+                    {% endfor %}
+                </ul></li>
+            {% endif %}
+        {% endif %}
+        {% if object.import_sources %}
+            {% if object.import_sources|length > 0 %}
+                <li>Sources d'import :
+                <ul>
+                    {% for u in object.import_sources %}
+                        <li><a href="{{ u }}">{{ u }}</a></li>
+                    {% endfor %}
+                </ul>
+                </li>
+            {% endif %}
+        {% endif %}
+    </ul>
+</footer>
+{% endif %}
+</article>
 
 {% endblock %}
@@ -30,7 +30,10 @@ from django.contrib import messages
 from django.contrib.messages.views import SuccessMessageMixin
 
 from .calendar import CalendarMonth, CalendarWeek, CalendarDay
-from .extractors import ExtractorAllURLs
+from .import_tasks.importer import URL2Events
+from .import_tasks.extractor import Extractor
+from .import_tasks.downloader import ChromiumHeadlessDownloader
 
 from .celery import app as celery_app, import_events_from_json, run_recurrent_import
 

@@ -262,7 +265,7 @@ def import_from_url(request):
     logger = logging.getLogger(__name__)
 
     if request.method == 'POST' and "title" in request.POST:
-        form = EventForm(request.POST)
+        form = EventForm(request.POST, is_authenticated=request.user.is_authenticated)
         if form.is_valid():
             new_event = form.save()
             if request.user.is_authenticated:

@@ -284,25 +287,32 @@ def import_from_url(request):
     form_event = EventForm(initial=initial)
 
     if request.method == 'POST':
-        form = EventSubmissionForm(request.POST)
+
+        form = EventSubmissionForm(request.POST)
+
         if form.is_valid():
             cd = form.cleaned_data
             url = cd.get('url')
 
-            url = ExtractorAllURLs.clean_url(url)
+            url = Extractor.clean_url(url)
 
             existing = Event.objects.filter(uuids__contains=[url])
 
             if len(existing) == 0:
-                event = ExtractorAllURLs.extract(url)
+                event = None
+
+                u2e = URL2Events(ChromiumHeadlessDownloader(), single_event=True)
+                events_structure = u2e.process(url, published=request.user.is_authenticated)
+                if events_structure is not None and "events" in events_structure and len(events_structure["events"]) > 0:
+                    event = Event.from_structure(events_structure["events"][0], events_structure["header"]["url"])
+                    # TODO: use celery to import the other events
 
                 if event != None:
-                    form = EventForm(instance=event)
+                    form = EventForm(instance=event, is_authenticated=request.user.is_authenticated)
                     messages.success(request, _("The event has been successfully extracted, and you can now submit it after modifying it if necessary."))
                     return render(request, 'agenda_culturel/event_form.html', context={'form': form })
                 else:
-                    form = EventForm(initial={'reference_urls': [url]})
+                    form = EventForm(initial={'reference_urls': [url]}, is_authenticated=request.user.is_authenticated)
                     messages.error(request, _("Unable to extract an event from the proposed URL. Please use the form below to submit the event."))
                     return render(request, 'agenda_culturel/import.html', context={'form': form, 'form_event': form_event})
             else: