diff --git a/deployment/Dockerfile b/deployment/Dockerfile index 08c6567..ab12052 100644 --- a/deployment/Dockerfile +++ b/deployment/Dockerfile @@ -5,7 +5,7 @@ WORKDIR /usr/src/app RUN --mount=type=cache,target=/var/cache/apt \ apt-get update && \ - apt-get install --no-install-recommends -y build-essential libpq-dev gettext chromium-driver \ + apt-get install --no-install-recommends -y build-essential libpq-dev gettext chromium-driver \ && rm -rf /var/lib/apt/lists/* diff --git a/deployment/scripts/celery/start-worker.sh b/deployment/scripts/celery/start-worker.sh index 1b4eb4c..9927408 100644 --- a/deployment/scripts/celery/start-worker.sh +++ b/deployment/scripts/celery/start-worker.sh @@ -3,4 +3,4 @@ set -o errexit set -o nounset -celery -A "$APP_NAME" worker -l info +python3 /usr/local/lib/python3.11/site-packages/watchdog/watchmedo.py auto-restart -d agenda_culturel -p '*.py' --recursive -- celery -A "$APP_NAME" worker -l info diff --git a/docker-compose.yml b/docker-compose.yml index f792ccf..2270e5c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -45,6 +45,7 @@ services: volumes: - redis_data:/data + celery-worker: &celery-worker container_name: "${APP_NAME}-celery-worker" build: diff --git a/src/agenda_culturel/celery.py b/src/agenda_culturel/celery.py index c80dfb7..9b4d3e6 100644 --- a/src/agenda_culturel/celery.py +++ b/src/agenda_culturel/celery.py @@ -6,6 +6,7 @@ from celery.utils.log import get_task_logger from .extractors import ExtractorAllURLs + # Set the default Django settings module for the 'celery' program. 
APP_ENV = os.getenv("APP_ENV", "dev") os.environ.setdefault("DJANGO_SETTINGS_MODULE", f"agenda_culturel.settings.{APP_ENV}") @@ -27,15 +28,25 @@ app.autodiscover_tasks() @app.task(bind=True) def create_event_from_submission(self, url): + from agenda_culturel.models import Event + logger.info(f"{url=}") - try: - logger.info("About to create event from submission") - events = ExtractorAllURLs.extract(url) - # TODO - except BadHeaderError: - logger.info("BadHeaderError") - except Exception as e: - logger.error(e) + + if len(Event.objects.filter(reference_urls__contains=[url])) != 0: + logger.info("Already known url: ", url) + else: + try: + logger.info("About to create event from submission") + events = ExtractorAllURLs.extract(url) + + if events != None: + for e in events: + e.save() + + except BadHeaderError: + logger.info("BadHeaderError") + except Exception as e: + logger.error(e) app.conf.timezone = "Europe/Paris" diff --git a/src/agenda_culturel/extractors.py b/src/agenda_culturel/extractors.py index 5d3c01e..d20927f 100644 --- a/src/agenda_culturel/extractors.py +++ b/src/agenda_culturel/extractors.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -#from .models import Event + from selenium import webdriver from selenium.webdriver.chrome.service import Service @@ -8,6 +8,8 @@ from selenium.webdriver.chrome.options import Options from bs4 import BeautifulSoup import json +from datetime import datetime + from celery.utils.log import get_task_logger @@ -67,21 +69,27 @@ class ExtractorFacebook(Extractor): return v return None - def extract(url): - txt = Extractor.download(url) - if txt is None: - logger.error("Cannot download " + url) - return None - else: - soup = BeautifulSoup(txt, "html.parser") - for json_script in soup.find_all('script', type="application/json"): - json_txt = json_script.get_text() - json_struct = json.loads(json_txt) - fevent = ExtractorFacebook.FacebookEvent.find_event_in_array(json_struct) - if fevent != None: - 
logger.info(str(fevent)) - result = "TODO" - return result + + def build_event(self, url): + from .models import Event + # TODO + return Event(title=self.data["name"], + status=Event.STATUS.DRAFT, + start_day=datetime.fromtimestamp(self.data["start_timestamp"]), + reference_urls=[url]) + + def process_page(txt, url): + + soup = BeautifulSoup(txt, "html.parser") + for json_script in soup.find_all('script', type="application/json"): + json_txt = json_script.get_text() + json_struct = json.loads(json_txt) + fevent = ExtractorFacebook.FacebookEvent.find_event_in_array(json_struct) + if fevent != None: + logger.info(str(fevent.data)) + + result = fevent.build_event(url) + return [result] return None @@ -92,11 +100,18 @@ class ExtractorAllURLs: def extract(url): logger.info("Run extraction") - result = ExtractorFacebook.extract(url) + txt = Extractor.download(url) + if txt is None: + logger.info("Cannot download url") + return None - if result is None: + result = ExtractorFacebook.process_page(txt, url) + + if result is not None: + return result + else: logger.info("Not a Facebook link") - # add here other extrators - pass - return result + # TODO: add here other extractors + + return None diff --git a/src/agenda_culturel/templates/agenda_culturel/event.html b/src/agenda_culturel/templates/agenda_culturel/event.html index 28b1dd4..a7ccdee 100644 --- a/src/agenda_culturel/templates/agenda_culturel/event.html +++ b/src/agenda_culturel/templates/agenda_culturel/event.html @@ -1 +1,2 @@

{{ object.title }}

+

Date : {{ object.start_day }}

\ No newline at end of file diff --git a/src/agenda_culturel/urls.py b/src/agenda_culturel/urls.py index 0dbd4ab..383eb6a 100644 --- a/src/agenda_culturel/urls.py +++ b/src/agenda_culturel/urls.py @@ -10,7 +10,7 @@ from .views import * urlpatterns = [ path("", EventListView.as_view(), name="home"), re_path(r'^(?P' + '|'.join([dm.value for dm in DisplayModes]) + ')/$', view_interval, name='view_interval'), - path("event/-", EventDetailView.as_view(), name="view_event"), + path("event/-", EventDetailView.as_view(), name="view_event"), path("proposer", EventSubmissionFormView.as_view(), name="event_submission_form"), path("admin/", admin.site.urls), path("test_app/", include("test_app.urls")), diff --git a/src/requirements.txt b/src/requirements.txt index a327a55..1657d88 100644 --- a/src/requirements.txt +++ b/src/requirements.txt @@ -22,3 +22,4 @@ redis==4.5.5 whitenoise==6.4.0 selenium==4.14.0 BeautifulSoup4==4.12.2 +watchdog==3.0.0