diff --git a/deployment/Dockerfile b/deployment/Dockerfile index 533a9e8..7f038bf 100644 --- a/deployment/Dockerfile +++ b/deployment/Dockerfile @@ -5,10 +5,11 @@ WORKDIR /usr/src/app RUN --mount=type=cache,target=/var/cache/apt \ apt-get update && \ - apt-get install --no-install-recommends -y build-essential libpq-dev gettext chromium-driver gdal-bin \ + apt-get install --no-install-recommends -y build-essential libpq-dev gettext chromium-driver gdal-bin fonts-symbola \ && rm -rf /var/lib/apt/lists/* + COPY src/requirements.txt ./requirements.txt RUN --mount=type=cache,target=/root/.cache/pip \ diff --git a/src/agenda_culturel/celery.py b/src/agenda_culturel/celery.py index 67672bb..1932cb9 100644 --- a/src/agenda_culturel/celery.py +++ b/src/agenda_culturel/celery.py @@ -6,7 +6,8 @@ from celery.schedules import crontab from celery.utils.log import get_task_logger from celery.exceptions import MaxRetriesExceededError import time as time_ - +from django.conf import settings +from celery.signals import worker_ready from contextlib import contextmanager @@ -250,6 +251,23 @@ def daily_imports(self): run_recurrent_imports_from_list([imp.pk for imp in imports]) +SCREENSHOT_FILE = settings.MEDIA_ROOT + '/screenshot.png' + +@app.task(bind=True) +def screenshot(self): + downloader = ChromiumHeadlessDownloader(noimage=False) + downloader.screenshot("https://pommesdelune.fr", SCREENSHOT_FILE) + +@worker_ready.connect +def at_start(sender, **k): + if not os.path.isfile(SCREENSHOT_FILE): + logger.info("Init screenshot file") + with sender.app.connection() as conn: + sender.app.send_task('agenda_culturel.celery.screenshot', None, connection=conn) + else: + logger.info("Screenshot file already exists") + + @app.task(bind=True) def run_all_recurrent_imports(self): from agenda_culturel.models import RecurrentImport @@ -370,6 +388,10 @@ app.conf.beat_schedule = { # Daily imports at 3:14 a.m. 
"schedule": crontab(hour=3, minute=14), }, + "daily_screenshot": { + "task": "agenda_culturel.celery.screenshot", + "schedule": crontab(hour=3, minute=3), + }, "weekly_imports": { "task": "agenda_culturel.celery.weekly_imports", # Daily imports on Mondays at 2:22 a.m. diff --git a/src/agenda_culturel/import_tasks/downloader.py b/src/agenda_culturel/import_tasks/downloader.py index 7fd45ee..905c130 100644 --- a/src/agenda_culturel/import_tasks/downloader.py +++ b/src/agenda_culturel/import_tasks/downloader.py @@ -66,7 +66,7 @@ class SimpleDownloader(Downloader): class ChromiumHeadlessDownloader(Downloader): - def __init__(self, pause=True): + def __init__(self, pause=True, noimage=True): super().__init__() self.pause = pause self.options = Options() @@ -78,17 +78,31 @@ class ChromiumHeadlessDownloader(Downloader): self.options.add_argument("--disable-dev-shm-usage") self.options.add_argument("--disable-browser-side-navigation") self.options.add_argument("--disable-gpu") - self.options.add_experimental_option( - "prefs", { - # block image loading - "profile.managed_default_content_settings.images": 2, - } - ) + if noimage: + self.options.add_experimental_option( + "prefs", { + # block image loading + "profile.managed_default_content_settings.images": 2, + } + ) self.service = Service("/usr/bin/chromedriver") self.driver = webdriver.Chrome(service=self.service, options=self.options) + def screenshot(self, url, path_image): + print("Screenshot {}".format(url)) + try: + self.driver.get(url) + if self.pause: + time.sleep(2) + self.driver.save_screenshot(path_image) + except Exception: + print(f">> Exception: {url}") + return False + + return True + def download(self, url, referer=None, post=None): if post: raise Exception("POST method with Chromium headless not yet implemented") diff --git a/src/agenda_culturel/import_tasks/extractor_facebook.py b/src/agenda_culturel/import_tasks/extractor_facebook.py index e0c50b6..b7970ab 100644 --- 
a/src/agenda_culturel/import_tasks/extractor_facebook.py +++ b/src/agenda_culturel/import_tasks/extractor_facebook.py @@ -239,7 +239,7 @@ class FacebookEventExtractor(Extractor): result = "https://www.facebook.com" + u.path # remove name in the url - match = re.match(r"(.*/events)/s/([a-zA-Z-][a-zA-Z-0-9]+)/([0-9/]*)", result) + match = re.match(r"(.*/events)/s/([a-zA-Z-][a-zA-Z-0-9-]+)/([0-9/]*)", result) if match: result = match[1] + "/" + match[3] diff --git a/src/agenda_culturel/static/images/capture.png b/src/agenda_culturel/static/images/capture.png deleted file mode 100644 index 203a3de..0000000 Binary files a/src/agenda_culturel/static/images/capture.png and /dev/null differ diff --git a/src/agenda_culturel/templates/agenda_culturel/page.html b/src/agenda_culturel/templates/agenda_culturel/page.html index 7483394..465cb99 100644 --- a/src/agenda_culturel/templates/agenda_culturel/page.html +++ b/src/agenda_culturel/templates/agenda_culturel/page.html @@ -9,7 +9,7 @@ {% load static %} - + {% if debug %}