L'import facebook partage maintenant son code avec les autres imports
Fix #80
This commit is contained in:
parent
3ebc53995b
commit
c5db83cf87
3
experimentations/.gitignore
vendored
Normal file
3
experimentations/.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
*.json
|
||||
*.html
|
||||
*.ical
|
@ -1,171 +1,40 @@
|
||||
#!/usr/bin/python3
|
||||
# coding: utf-8
|
||||
|
||||
import requests
|
||||
import hashlib
|
||||
import os
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
import json
|
||||
import sys
|
||||
|
||||
class SimpleEvent:
|
||||
# getting the name of the directory
|
||||
# where the this file is present.
|
||||
current = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
# Getting the parent directory name
|
||||
# where the current directory is present.
|
||||
parent = os.path.dirname(current)
|
||||
|
||||
# adding the parent directory to
|
||||
# the sys.path.
|
||||
sys.path.append(parent)
|
||||
|
||||
def __init__(self, data):
|
||||
self.elements = {}
|
||||
|
||||
for key in ["id", "start_timestamp", "end_timestamp"]:
|
||||
self.elements[key] = data[key] if key in data else None
|
||||
|
||||
if "parent_event" in data:
|
||||
self.parent = SimpleEvent(data["parent_event"])
|
||||
from src.agenda_culturel.import_tasks.downloader import *
|
||||
from src.agenda_culturel.import_tasks.extractor import *
|
||||
from src.agenda_culturel.import_tasks.importer import *
|
||||
from src.agenda_culturel.import_tasks.extractor_facebook import *
|
||||
|
||||
|
||||
class Event:
|
||||
|
||||
name = "event"
|
||||
keys = [
|
||||
["start_time_formatted", 'start_timestamp',
|
||||
'is_past',
|
||||
"name",
|
||||
"price_info",
|
||||
"cover_media_renderer",
|
||||
"event_creator",
|
||||
"id",
|
||||
"day_time_sentence",
|
||||
"event_place",
|
||||
"comet_neighboring_siblings"],
|
||||
["event_description"],
|
||||
["start_timestamp", "end_timestamp"]
|
||||
]
|
||||
rules = {
|
||||
"event_description": { "description": ["text"]},
|
||||
"cover_media_renderer": {"image_alt": ["cover_photo", "photo", "accessibility_caption"], "image": ["cover_photo", "photo", "full_image", "uri"]},
|
||||
"event_creator": { "event_creator_name": ["name"], "event_creator_url": ["url"] },
|
||||
"event_place": {"event_place_name": ["name"] }
|
||||
}
|
||||
|
||||
def __init__(self, i, event):
|
||||
self.fragments = {}
|
||||
self.elements = {}
|
||||
self.neighbor_events = None
|
||||
self.possible_end_timestamp = []
|
||||
self.add_fragment(i, event)
|
||||
|
||||
def add_fragment(self, i, event):
|
||||
self.fragments[i] = event
|
||||
|
||||
if Event.keys[i] == ["start_timestamp", "end_timestamp"]:
|
||||
self.get_possible_end_timestamp(i, event)
|
||||
else:
|
||||
for k in Event.keys[i]:
|
||||
if k == "comet_neighboring_siblings":
|
||||
self.get_neighbor_events(event[k])
|
||||
elif k in Event.rules:
|
||||
for nk, rule in Event.rules[k].items():
|
||||
c = event[k]
|
||||
for ki in rule:
|
||||
c = c[ki]
|
||||
self.elements[nk] = c
|
||||
else:
|
||||
self.elements[k] = event[k]
|
||||
|
||||
def get_possible_end_timestamp(self, i, data):
|
||||
self.possible_end_timestamp.append(dict((k, data[k]) for k in Event.keys[i]))
|
||||
|
||||
def get_neighbor_events(self, data):
|
||||
self.neighbor_events = [SimpleEvent(d) for d in data]
|
||||
|
||||
def __str__(self):
|
||||
return str(self.elements) + "\n Neighbors: " + ", ".join([ne.elements["id"] for ne in self.neighbor_events])
|
||||
|
||||
def consolidate_current_event(self):
|
||||
if self.neighbor_events is not None and "id" in self.elements and "end_timestamp" not in self.elements:
|
||||
id = self.elements["id"]
|
||||
for ne in self.neighbor_events:
|
||||
if ne.elements["id"] == id:
|
||||
self.elements["end_timestamp"] = ne.elements["end_timestamp"]
|
||||
|
||||
if "end_timestamp" not in self.elements and len(self.possible_end_timestamp) != 0:
|
||||
for s in self.possible_end_timestamp:
|
||||
if s["start_timestamp"] == self.elements["start_timestamp"]:
|
||||
self.elements["end_timestamp"] = s["end_timestamp"]
|
||||
break
|
||||
|
||||
def find_event_fragment_in_array(array, event, first = True):
|
||||
if isinstance(array, dict):
|
||||
|
||||
seen = False
|
||||
for i, ks in enumerate(Event.keys):
|
||||
if len(ks) == len([k for k in ks if k in array]):
|
||||
seen = True
|
||||
if event is None:
|
||||
event = Event(i, array)
|
||||
else:
|
||||
event.add_fragment(i, array)
|
||||
# only consider the first of Event.keys
|
||||
break
|
||||
if not seen:
|
||||
for k in array:
|
||||
event = Event.find_event_fragment_in_array(array[k], event, False)
|
||||
elif isinstance(array, list):
|
||||
for e in array:
|
||||
event = Event.find_event_fragment_in_array(e, event, False)
|
||||
|
||||
if event is not None and first:
|
||||
event.consolidate_current_event()
|
||||
return event
|
||||
|
||||
|
||||
#url="https://www.facebook.com/events/ical/export/?eid=2294200007432315"
|
||||
#url="https://www.facebook.com/events/2294199997432316/2294200007432315/"
|
||||
#url="https://www.facebook.com/events/635247792092358/"
|
||||
url="https://www.facebook.com/events/872781744074648"
|
||||
url="https://www.facebook.com/events/1432798543943663?"
|
||||
#url_cal = "https://www.facebook.com/events/ical/export/?eid=993406668581410"
|
||||
#url="https://jmtrivial.info"
|
||||
|
||||
cachedir = "cache"
|
||||
result = hashlib.md5(url.encode())
|
||||
hash = result.hexdigest()
|
||||
if __name__ == "__main__":
|
||||
|
||||
filename = os.path.join(cachedir, hash + ".html")
|
||||
u2e = URL2Events(ChromiumHeadlessDownloader(), FacebookEventExtractor(single_event=True))
|
||||
url="https://www.facebook.com/events/872781744074648"
|
||||
|
||||
if os.path.isfile(filename):
|
||||
# print("Use cache")
|
||||
with open(filename) as f:
|
||||
doc = "\n".join(f.readlines())
|
||||
else:
|
||||
print("Download page")
|
||||
events = u2e.process(url, cache = "fb.html", published = True)
|
||||
|
||||
options = Options()
|
||||
options.add_argument("--headless=new")
|
||||
service = Service("/usr/bin/chromedriver")
|
||||
|
||||
driver = webdriver.Chrome(service=service, options=options)
|
||||
driver.get(url)
|
||||
doc = driver.page_source
|
||||
driver.quit()
|
||||
|
||||
dir = os.path.dirname(filename)
|
||||
if not os.path.exists(dir):
|
||||
os.makedirs(dir)
|
||||
with open(filename, "w") as text_file:
|
||||
text_file.write(doc)
|
||||
|
||||
|
||||
soup = BeautifulSoup(doc)
|
||||
|
||||
event = None
|
||||
for json_script in soup.find_all('script', type="application/json"):
|
||||
json_txt = json_script.get_text()
|
||||
json_struct = json.loads(json_txt)
|
||||
|
||||
event = Event.find_event_fragment_in_array(json_struct, event)
|
||||
|
||||
print(event)
|
||||
exportfile = "event-facebook.json"
|
||||
print("Saving events to file {}".format(exportfile))
|
||||
with open(exportfile, "w") as f:
|
||||
json.dump(events, f, indent=4, default=str)
|
||||
|
||||
|
@ -5,8 +5,6 @@ from celery import Celery
|
||||
from celery.schedules import crontab
|
||||
from celery.utils.log import get_task_logger
|
||||
|
||||
from .extractors import ExtractorAllURLs
|
||||
|
||||
from .import_tasks.downloader import *
|
||||
from .import_tasks.extractor import *
|
||||
from .import_tasks.importer import *
|
||||
@ -53,7 +51,7 @@ def close_import_task(taskid, success, error_message, importer):
|
||||
@app.task(bind=True)
|
||||
def import_events_from_json(self, json):
|
||||
from agenda_culturel.models import Event, BatchImportation
|
||||
from .importation import EventsImporter
|
||||
from .db_importer import DBImporterEvents
|
||||
|
||||
# create a batch importation
|
||||
importation = BatchImportation(celery_id=self.request.id)
|
||||
@ -63,7 +61,7 @@ def import_events_from_json(self, json):
|
||||
|
||||
logger.info("Import events from json: {}".format(self.request.id))
|
||||
|
||||
importer = EventsImporter(self.request.id)
|
||||
importer = DBImporterEvents(self.request.id)
|
||||
|
||||
#try:
|
||||
success, error_message = importer.import_events(json)
|
||||
@ -78,7 +76,7 @@ def import_events_from_json(self, json):
|
||||
@app.task(bind=True)
|
||||
def run_recurrent_import(self, pk):
|
||||
from agenda_culturel.models import RecurrentImport, BatchImportation
|
||||
from .importation import EventsImporter
|
||||
from .db_importer import DBImporterEvents
|
||||
from django.shortcuts import get_object_or_404
|
||||
|
||||
logger.info("Run recurrent import: {}".format(self.request.id))
|
||||
@ -92,7 +90,7 @@ def run_recurrent_import(self, pk):
|
||||
importation.save()
|
||||
|
||||
# create an importer
|
||||
importer = EventsImporter(self.request.id)
|
||||
importer = DBImporterEvents(self.request.id)
|
||||
|
||||
# prepare downloading and extracting processes
|
||||
downloader = SimpleDownloader() if rimport.downloader == RecurrentImport.DOWNLOADER.SIMPLE else ChromiumHeadlessDownloader()
|
||||
|
@ -7,7 +7,7 @@ import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class EventsImporter:
|
||||
class DBImporterEvents:
|
||||
|
||||
def __init__(self, celery_id):
|
||||
self.celery_id = celery_id
|
@ -37,14 +37,18 @@ class ChromiumHeadlessDownloader(Downloader):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
options = Options()
|
||||
options.add_argument("--headless=new")
|
||||
service = Service("/usr/bin/chromedriver")
|
||||
self.driver = webdriver.Chrome(service=service, options=options)
|
||||
self.options = Options()
|
||||
self.options.add_argument("--headless=new")
|
||||
self.options.add_argument("--disable-dev-shm-usage")
|
||||
self.options.add_argument("--no-sandbox")
|
||||
self.service = Service("/usr/bin/chromedriver")
|
||||
|
||||
|
||||
def download(self, url):
|
||||
print("Download {}".format(url))
|
||||
self.driver = webdriver.Chrome(service=self.service, options=self.options)
|
||||
|
||||
self.driver.get(url)
|
||||
return driver.page_source
|
||||
doc = self.driver.page_source
|
||||
self.driver.quit()
|
||||
return doc
|
||||
|
@ -13,6 +13,10 @@ class Extractor(ABC):
|
||||
def extract(self, content, url, url_human = None):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def clean_url(url):
|
||||
pass
|
||||
|
||||
def set_header(self, url):
|
||||
self.header["url"] = url
|
||||
self.header["date"] = datetime.now()
|
||||
@ -20,7 +24,7 @@ class Extractor(ABC):
|
||||
def clear_events(self):
|
||||
self.events = []
|
||||
|
||||
def add_event(self, title, category, start_day, location, description, tags, uuid, recurrences=None, url_human=None, start_time=None, end_day=None, end_time=None, last_modified=None, published=False):
|
||||
def add_event(self, title, category, start_day, location, description, tags, uuid, recurrences=None, url_human=None, start_time=None, end_day=None, end_time=None, last_modified=None, published=False, image=None, image_alt=None):
|
||||
if title is None:
|
||||
print("ERROR: cannot import an event without name")
|
||||
return
|
||||
@ -36,8 +40,11 @@ class Extractor(ABC):
|
||||
"location": location,
|
||||
"description": description,
|
||||
"tags": tags,
|
||||
"published": published
|
||||
"published": published,
|
||||
"image": image,
|
||||
"image_alt": image_alt
|
||||
}
|
||||
# TODO: pourquoi url_human et non reference_url
|
||||
if url_human is not None:
|
||||
event["url_human"] = url_human
|
||||
if start_time is not None:
|
||||
@ -60,3 +67,21 @@ class Extractor(ABC):
|
||||
|
||||
def get_structure(self):
|
||||
return { "header": self.header, "events": self.events}
|
||||
|
||||
def clean_url(url):
|
||||
from .extractor_ical import ICALExtractor
|
||||
from .extractor_facebook import FacebookEventExtractor
|
||||
|
||||
result = url
|
||||
for e in [ICALExtractor, FacebookEventExtractor]:
|
||||
result = e.clean_url(result)
|
||||
return result
|
||||
|
||||
def get_default_extractors(single_event=False):
|
||||
from .extractor_ical import ICALExtractor
|
||||
from .extractor_facebook import FacebookEventExtractor
|
||||
|
||||
if single_event:
|
||||
return [FacebookEventExtractor(single_event=True)]
|
||||
else:
|
||||
return [ICALExtractor(), FacebookEventExtractor(single_event=False)]
|
@ -1,65 +1,18 @@
|
||||
from abc import ABC, abstractmethod
|
||||
import icalendar
|
||||
import warnings
|
||||
|
||||
from django.db import models
|
||||
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
|
||||
import urllib.request
|
||||
from tempfile import NamedTemporaryFile
|
||||
from urllib.parse import urlparse
|
||||
import os
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
import json
|
||||
from datetime import datetime, date
|
||||
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from .extractor import *
|
||||
import json
|
||||
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Extractor:
|
||||
|
||||
name = None
|
||||
|
||||
@abstractmethod
|
||||
def is_known_url(url):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def extract(url):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def clean_url(url):
|
||||
pass
|
||||
|
||||
def download(url):
|
||||
try:
|
||||
options = Options()
|
||||
options.add_argument("--headless=new")
|
||||
options.add_argument("--disable-dev-shm-usage")
|
||||
options.add_argument("--no-sandbox")
|
||||
service = Service("/usr/bin/chromedriver")
|
||||
|
||||
driver = webdriver.Chrome(service=service, options=options)
|
||||
driver.get(url)
|
||||
doc = driver.page_source
|
||||
driver.quit()
|
||||
return doc
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
return None
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
class ExtractorFacebook(Extractor):
|
||||
|
||||
name = "Facebook"
|
||||
class FacebookEventExtractor(Extractor):
|
||||
|
||||
class SimpleFacebookEvent:
|
||||
|
||||
@ -70,7 +23,7 @@ class ExtractorFacebook(Extractor):
|
||||
self.elements[key] = data[key] if key in data else None
|
||||
|
||||
if "parent_event" in data:
|
||||
self.parent = ExtractorFacebook.SimpleFacebookEvent(data["parent_event"])
|
||||
self.parent = FacebookEventExtractor.SimpleFacebookEvent(data["parent_event"])
|
||||
|
||||
|
||||
class FacebookEvent:
|
||||
@ -119,14 +72,14 @@ class ExtractorFacebook(Extractor):
|
||||
def add_fragment(self, i, event):
|
||||
self.fragments[i] = event
|
||||
|
||||
if ExtractorFacebook.FacebookEvent.keys[i] == ["start_timestamp", "end_timestamp"]:
|
||||
if FacebookEventExtractor.FacebookEvent.keys[i] == ["start_timestamp", "end_timestamp"]:
|
||||
self.get_possible_end_timestamp(i, event)
|
||||
else:
|
||||
for k in ExtractorFacebook.FacebookEvent.keys[i]:
|
||||
for k in FacebookEventExtractor.FacebookEvent.keys[i]:
|
||||
if k == "comet_neighboring_siblings":
|
||||
self.get_neighbor_events(event[k])
|
||||
elif k in ExtractorFacebook.FacebookEvent.rules:
|
||||
for nk, rule in ExtractorFacebook.FacebookEvent.rules[k].items():
|
||||
elif k in FacebookEventExtractor.FacebookEvent.rules:
|
||||
for nk, rule in FacebookEventExtractor.FacebookEvent.rules[k].items():
|
||||
error = False
|
||||
c = event[k]
|
||||
for ki in rule:
|
||||
@ -141,11 +94,11 @@ class ExtractorFacebook(Extractor):
|
||||
|
||||
|
||||
def get_possible_end_timestamp(self, i, data):
|
||||
self.possible_end_timestamp.append(dict((k, data[k]) for k in ExtractorFacebook.FacebookEvent.keys[i]))
|
||||
self.possible_end_timestamp.append(dict((k, data[k]) for k in FacebookEventExtractor.FacebookEvent.keys[i]))
|
||||
|
||||
|
||||
def get_neighbor_events(self, data):
|
||||
self.neighbor_events = [ExtractorFacebook.SimpleFacebookEvent(d) for d in data]
|
||||
self.neighbor_events = [FacebookEventExtractor.SimpleFacebookEvent(d) for d in data]
|
||||
|
||||
def __str__(self):
|
||||
return str(self.elements) + "\n Neighbors: " + ", ".join([ne.elements["id"] for ne in self.neighbor_events])
|
||||
@ -168,21 +121,21 @@ class ExtractorFacebook(Extractor):
|
||||
if isinstance(array, dict):
|
||||
|
||||
seen = False
|
||||
for i, ks in enumerate(ExtractorFacebook.FacebookEvent.keys):
|
||||
for i, ks in enumerate(FacebookEventExtractor.FacebookEvent.keys):
|
||||
if len(ks) == len([k for k in ks if k in array]):
|
||||
seen = True
|
||||
if event is None:
|
||||
event = ExtractorFacebook.FacebookEvent(i, array)
|
||||
event = FacebookEventExtractor.FacebookEvent(i, array)
|
||||
else:
|
||||
event.add_fragment(i, array)
|
||||
# only consider the first of FacebookEvent.keys
|
||||
break
|
||||
if not seen:
|
||||
for k in array:
|
||||
event = ExtractorFacebook.FacebookEvent.find_event_fragment_in_array(array[k], event, False)
|
||||
event = FacebookEventExtractor.FacebookEvent.find_event_fragment_in_array(array[k], event, False)
|
||||
elif isinstance(array, list):
|
||||
for e in array:
|
||||
event = ExtractorFacebook.FacebookEvent.find_event_fragment_in_array(e, event, False)
|
||||
event = FacebookEventExtractor.FacebookEvent.find_event_fragment_in_array(e, event, False)
|
||||
|
||||
if event is not None and first:
|
||||
event.consolidate_current_event()
|
||||
@ -190,28 +143,33 @@ class ExtractorFacebook(Extractor):
|
||||
|
||||
|
||||
def build_event(self, url):
|
||||
from .models import Event
|
||||
|
||||
image = self.get_element("image")
|
||||
|
||||
return {
|
||||
"title": self.get_element("name"),
|
||||
"category": None,
|
||||
"start_day": self.get_element_date("start_timestamp"),
|
||||
"location": self.get_element("event_place_name"),
|
||||
"description": self.get_element("description"),
|
||||
"tags": [],
|
||||
"uuid": url,
|
||||
"url_human": url,
|
||||
"start_time": self.get_element_time("start_timestamp"),
|
||||
"end_day": self.get_element_date("end_timestamp"),
|
||||
"end_time": self.get_element_time("end_timestamp"),
|
||||
"image": self.get_element("image"),
|
||||
"image_alt": self.get_element("image"),
|
||||
}
|
||||
|
||||
return Event(title=self.get_element("name"),
|
||||
status=Event.STATUS.DRAFT,
|
||||
start_day=self.get_element_date("start_timestamp"),
|
||||
start_time=self.get_element_time("start_timestamp"),
|
||||
end_day=self.get_element_date("end_timestamp"),
|
||||
end_time=self.get_element_time("end_timestamp"),
|
||||
location=self.get_element("event_place_name"),
|
||||
description=self.get_element("description"),
|
||||
image=self.get_element("image"),
|
||||
image_alt=self.get_element("image_alt"),
|
||||
uuids=[url],
|
||||
reference_urls=[url])
|
||||
|
||||
def __init__(self, single_event=False):
|
||||
self.single_event = single_event
|
||||
super().__init__()
|
||||
|
||||
|
||||
def clean_url(url):
|
||||
|
||||
if ExtractorFacebook.is_known_url(url):
|
||||
if FacebookEventExtractor.is_known_url(url):
|
||||
u = urlparse(url)
|
||||
return "https://www.facebook.com" + u.path
|
||||
else:
|
||||
@ -222,46 +180,23 @@ class ExtractorFacebook(Extractor):
|
||||
return u.netloc in ["facebook.com", "www.facebook.com", "m.facebook.com"]
|
||||
|
||||
|
||||
def process_page(txt, url):
|
||||
def extract(self, content, url, url_human = None, default_values = None, published = False):
|
||||
# NOTE: this method does not use url_human = None and default_values = None
|
||||
|
||||
# get step by step all information from the content
|
||||
fevent = None
|
||||
soup = BeautifulSoup(txt, "html.parser")
|
||||
soup = BeautifulSoup(content, "html.parser")
|
||||
for json_script in soup.find_all('script', type="application/json"):
|
||||
json_txt = json_script.get_text()
|
||||
json_struct = json.loads(json_txt)
|
||||
fevent = ExtractorFacebook.FacebookEvent.find_event_fragment_in_array(json_struct, fevent)
|
||||
fevent = FacebookEventExtractor.FacebookEvent.find_event_fragment_in_array(json_struct, fevent)
|
||||
|
||||
if fevent is not None:
|
||||
logger.info("Facebook event: " + str(fevent))
|
||||
result = fevent.build_event(url)
|
||||
return result
|
||||
self.set_header(url)
|
||||
event = fevent.build_event(url)
|
||||
logger.warning("published: " + str(published))
|
||||
event["published"] = published
|
||||
self.add_event(**event)
|
||||
return self.get_structure()
|
||||
|
||||
return None
|
||||
|
||||
|
||||
class ExtractorAllURLs:
|
||||
|
||||
extractors = [ExtractorFacebook]
|
||||
|
||||
def clean_url(url):
|
||||
result = url
|
||||
for e in ExtractorAllURLs.extractors:
|
||||
result = e.clean_url(result)
|
||||
return result
|
||||
|
||||
def extract(url):
|
||||
logger.info("Run extraction")
|
||||
|
||||
txt = Extractor.download(url)
|
||||
if txt is None:
|
||||
logger.info("Cannot download url")
|
||||
return None
|
||||
|
||||
for e in ExtractorAllURLs.extractors:
|
||||
result = e.process_page(txt, url)
|
||||
if result is not None:
|
||||
return result
|
||||
else:
|
||||
logger.info("Not a " + e.name + " link")
|
||||
|
||||
return None
|
||||
return None
|
@ -39,6 +39,9 @@ class ICALExtractor(Extractor):
|
||||
|
||||
return day, time
|
||||
|
||||
def clean_url(url):
|
||||
return url
|
||||
|
||||
|
||||
def extract(self, content, url, url_human = None, default_values = None, published = False):
|
||||
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
|
||||
|
@ -6,13 +6,13 @@ from .extractor import *
|
||||
|
||||
class URL2Events:
|
||||
|
||||
def __init__(self, downloader, extractor):
|
||||
def __init__(self, downloader = SimpleDownloader(), extractor = None, single_event=False):
|
||||
|
||||
self.downloader = downloader
|
||||
self.extractor = extractor
|
||||
self.single_event = single_event
|
||||
|
||||
def process(self, url, url_human = None, cache = None, default_values = None, published = False):
|
||||
|
||||
def get_content(self, url, cache = None):
|
||||
if cache and os.path.exists(cache):
|
||||
print("Loading cache ({})".format(cache))
|
||||
with open(cache) as f:
|
||||
@ -27,5 +27,25 @@ class URL2Events:
|
||||
os.makedirs(dir)
|
||||
with open(cache, "w") as text_file:
|
||||
text_file.write(content)
|
||||
return content
|
||||
|
||||
|
||||
def process(self, url, url_human = None, cache = None, default_values = None, published = False):
|
||||
content = self.get_content(url, cache)
|
||||
|
||||
if content is None:
|
||||
return None
|
||||
|
||||
if self.extractor is not None:
|
||||
return self.extractor.extract(content, url, url_human, default_values, published)
|
||||
else:
|
||||
# if the extractor is not defined, use a list of default extractors
|
||||
for e in Extractor.get_default_extractors(self.single_event):
|
||||
#try:
|
||||
events = e.extract(content, url, url_human, default_values, published)
|
||||
if events is not None:
|
||||
return events
|
||||
#except:
|
||||
# continue
|
||||
return None
|
||||
|
||||
return self.extractor.extract(content, url, url_human, default_values, published)
|
||||
|
@ -345,6 +345,9 @@ article#filters {
|
||||
.helptext, .subentry-search, .remarque {
|
||||
font-size: 80%;
|
||||
margin-top: -0.7em;
|
||||
ul {
|
||||
font-size: 100%;
|
||||
}
|
||||
}
|
||||
|
||||
.django-ckeditor-widget {
|
||||
@ -648,6 +651,9 @@ aside nav a.badge {
|
||||
|
||||
/* mise en forme pour les récurrences */
|
||||
.container-fluid article form p .recurrence-widget {
|
||||
@extend article;
|
||||
width: 100%;
|
||||
border: 0;
|
||||
|
||||
.header a, .add-button {
|
||||
@extend [role="button"];
|
||||
|
@ -27,11 +27,14 @@
|
||||
|
||||
{% load static_content_extra %}
|
||||
|
||||
{% if object %}
|
||||
<h1>Édition de l'événement {{ object.title }} ({{ object.start_day }})</h1>
|
||||
{% else %}
|
||||
<h1>Édition de l'événement importé</h1>
|
||||
{% endif %}
|
||||
<article>
|
||||
<header>
|
||||
{% if object %}
|
||||
<h1>Édition de l'événement {{ object.title }} ({{ object.start_day }})</h1>
|
||||
{% else %}
|
||||
<h1>Édition de l'événement importé</h1>
|
||||
{% endif %}
|
||||
</header>
|
||||
|
||||
<div id="container"></div>
|
||||
<form method="post">{% csrf_token %}
|
||||
@ -42,5 +45,37 @@
|
||||
<input type="submit" value="Enregistrer">
|
||||
</div>
|
||||
</form>
|
||||
{% if object %}
|
||||
<footer class="remarque">
|
||||
Informations complémentaires non éditables :
|
||||
<ul>
|
||||
{% if object.created_date %}<li>Création : {{ object.created_date }}</li>{% endif %}
|
||||
{% if object.modified_date %}<li>Dernière modification : {{ object.modified_date }}</li>{% endif %}
|
||||
{% if object.imported_date %}<li>Dernière importation : {{ object.imported_date }}</li>{% endif %}
|
||||
{% if object.uuids %}
|
||||
{% if object.uuids|length > 0 %}
|
||||
<li>UUIDs (identifiants uniques d'événements dans les sources) :
|
||||
<ul>
|
||||
{% for u in object.uuids %}
|
||||
<li>{{ u }}</li>
|
||||
{% endfor %}
|
||||
</ul></li>
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
{% if object.import_sources %}
|
||||
{% if object.import_sources|length > 0 %}
|
||||
<li>Sources d'import :
|
||||
<ul>
|
||||
{% for u in object.import_sources %}
|
||||
<li><a href="{{ u }}">{{ u }}</a></li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
</li>
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
</ul>
|
||||
</footer>
|
||||
{% endif %}
|
||||
</article>
|
||||
|
||||
{% endblock %}
|
@ -30,7 +30,10 @@ from django.contrib import messages
|
||||
from django.contrib.messages.views import SuccessMessageMixin
|
||||
|
||||
from .calendar import CalendarMonth, CalendarWeek, CalendarDay
|
||||
from .extractors import ExtractorAllURLs
|
||||
|
||||
from .import_tasks.importer import URL2Events
|
||||
from .import_tasks.extractor import Extractor
|
||||
from .import_tasks.downloader import ChromiumHeadlessDownloader
|
||||
|
||||
from .celery import app as celery_app, import_events_from_json, run_recurrent_import
|
||||
|
||||
@ -262,7 +265,7 @@ def import_from_url(request):
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
if request.method == 'POST' and "title" in request.POST:
|
||||
form = EventForm(request.POST)
|
||||
form = EventForm(request.POST, is_authenticated=request.user.is_authenticated)
|
||||
if form.is_valid():
|
||||
new_event = form.save()
|
||||
if request.user.is_authenticated:
|
||||
@ -284,25 +287,32 @@ def import_from_url(request):
|
||||
form_event = EventForm(initial=initial)
|
||||
|
||||
if request.method == 'POST':
|
||||
form = EventSubmissionForm(request.POST)
|
||||
|
||||
form = EventSubmissionForm(request.POST)
|
||||
|
||||
if form.is_valid():
|
||||
cd = form.cleaned_data
|
||||
url = cd.get('url')
|
||||
|
||||
url = ExtractorAllURLs.clean_url(url)
|
||||
url = Extractor.clean_url(url)
|
||||
|
||||
existing = Event.objects.filter(uuids__contains=[url])
|
||||
|
||||
if len(existing) == 0:
|
||||
event = ExtractorAllURLs.extract(url)
|
||||
event = None
|
||||
|
||||
u2e = URL2Events(ChromiumHeadlessDownloader(), single_event=True)
|
||||
events_structure = u2e.process(url, published=request.user.is_authenticated)
|
||||
if events_structure is not None and "events" in events_structure and len(events_structure["events"]) > 0:
|
||||
event = Event.from_structure(events_structure["events"][0], events_structure["header"]["url"])
|
||||
# TODO: use celery to import the other events
|
||||
|
||||
if event != None:
|
||||
form = EventForm(instance=event)
|
||||
form = EventForm(instance=event, is_authenticated=request.user.is_authenticated)
|
||||
messages.success(request, _("The event has been successfully extracted, and you can now submit it after modifying it if necessary."))
|
||||
return render(request, 'agenda_culturel/event_form.html', context={'form': form })
|
||||
else:
|
||||
form = EventForm(initial={'reference_urls': [url]})
|
||||
form = EventForm(initial={'reference_urls': [url]}, is_authenticated=request.user.is_authenticated)
|
||||
messages.error(request, _("Unable to extract an event from the proposed URL. Please use the form below to submit the event."))
|
||||
return render(request, 'agenda_culturel/import.html', context={'form': form, 'form_event': form_event})
|
||||
else:
|
||||
|
Loading…
x
Reference in New Issue
Block a user