Première intégration d'un événement FB

This commit is contained in:
Jean-Marie Favreau 2023-10-18 19:53:26 +02:00
parent d69ed7f3d8
commit 0af45f6d23
8 changed files with 61 additions and 32 deletions

View File

@@ -5,7 +5,7 @@ WORKDIR /usr/src/app
RUN --mount=type=cache,target=/var/cache/apt \
apt-get update && \
apt-get install --no-install-recommends -y build-essential libpq-dev gettext chromium-driver \
apt-get install --no-install-recommends -y build-essential libpq-dev gettext chromium-driver \
&& rm -rf /var/lib/apt/lists/*

View File

@@ -3,4 +3,4 @@
set -o errexit
set -o nounset
celery -A "$APP_NAME" worker -l info
python3 /usr/local/lib/python3.11/site-packages/watchdog/watchmedo.py auto-restart -d agenda_culturel -p '*.py' --recursive -- celery -A "$APP_NAME" worker -l info

View File

@@ -45,6 +45,7 @@ services:
volumes:
- redis_data:/data
celery-worker: &celery-worker
container_name: "${APP_NAME}-celery-worker"
build:

View File

@@ -6,6 +6,7 @@ from celery.utils.log import get_task_logger
from .extractors import ExtractorAllURLs
# Set the default Django settings module for the 'celery' program.
APP_ENV = os.getenv("APP_ENV", "dev")
os.environ.setdefault("DJANGO_SETTINGS_MODULE", f"agenda_culturel.settings.{APP_ENV}")
@@ -27,15 +28,25 @@ app.autodiscover_tasks()
@app.task(bind=True)
def create_event_from_submission(self, url):
from agenda_culturel.models import Event
logger.info(f"{url=}")
try:
logger.info("About to create event from submission")
events = ExtractorAllURLs.extract(url)
# TODO
except BadHeaderError:
logger.info("BadHeaderError")
except Exception as e:
logger.error(e)
if len(Event.objects.filter(reference_urls__contains=[url])) != 0:
logger.info("Already known url: ", url)
else:
try:
logger.info("About to create event from submission")
events = ExtractorAllURLs.extract(url)
if events != None:
for e in events:
e.save()
except BadHeaderError:
logger.info("BadHeaderError")
except Exception as e:
logger.error(e)
app.conf.timezone = "Europe/Paris"

View File

@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
#from .models import Event
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
@@ -8,6 +8,8 @@ from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import json
from datetime import datetime
from celery.utils.log import get_task_logger
@@ -67,21 +69,27 @@ class ExtractorFacebook(Extractor):
return v
return None
def extract(url):
txt = Extractor.download(url)
if txt is None:
logger.error("Cannot download " + url)
return None
else:
soup = BeautifulSoup(txt, "html.parser")
for json_script in soup.find_all('script', type="application/json"):
json_txt = json_script.get_text()
json_struct = json.loads(json_txt)
fevent = ExtractorFacebook.FacebookEvent.find_event_in_array(json_struct)
if fevent != None:
logger.info(str(fevent))
result = "TODO"
return result
def build_event(self, url):
from .models import Event
# TODO
return Event(title=self.data["name"],
status=Event.STATUS.DRAFT,
start_day=datetime.fromtimestamp(self.data["start_timestamp"]),
reference_urls=[url])
def process_page(txt, url):
soup = BeautifulSoup(txt, "html.parser")
for json_script in soup.find_all('script', type="application/json"):
json_txt = json_script.get_text()
json_struct = json.loads(json_txt)
fevent = ExtractorFacebook.FacebookEvent.find_event_in_array(json_struct)
if fevent != None:
logger.info(str(fevent.data))
result = fevent.build_event(url)
return [result]
return None
@@ -92,11 +100,18 @@ class ExtractorAllURLs:
def extract(url):
logger.info("Run extraction")
result = ExtractorFacebook.extract(url)
txt = Extractor.download(url)
if txt is None:
logger.info("Cannot download url")
return None
if result is None:
result = ExtractorFacebook.process_page(txt, url)
if result is not None:
return result
else:
logger.info("Not a Facebook link")
# add here other extrators
pass
return result
# TODO: add here other extrators
return None

View File

@@ -1 +1,2 @@
<h1>{{ object.title }}</h1>
<p>Date&nbsp;: {{ object.start_day }}</p>

View File

@@ -10,7 +10,7 @@ from .views import *
urlpatterns = [
path("", EventListView.as_view(), name="home"),
re_path(r'^(?P<mode>' + '|'.join([dm.value for dm in DisplayModes]) + ')/$', view_interval, name='view_interval'),
path("event/<pk>-<extra>", EventDetailView.as_view(), name="view_event"),
path("event/<int:pk>-<extra>", EventDetailView.as_view(), name="view_event"),
path("proposer", EventSubmissionFormView.as_view(), name="event_submission_form"),
path("admin/", admin.site.urls),
path("test_app/", include("test_app.urls")),

View File

@@ -22,3 +22,4 @@ redis==4.5.5
whitenoise==6.4.0
selenium==4.14.0
BeautifulSoup4==4.12.2
watchdog==3.0.0