On fait une capture par jour, pour l'aperçu dans les moteurs de recherche

Fix #225
This commit is contained in:
Jean-Marie Favreau 2024-11-29 21:13:21 +01:00
parent 21b42e4fee
commit ec707bf272
6 changed files with 48 additions and 11 deletions

View File

@ -5,10 +5,11 @@ WORKDIR /usr/src/app
RUN --mount=type=cache,target=/var/cache/apt \
apt-get update && \
apt-get install --no-install-recommends -y build-essential libpq-dev gettext chromium-driver gdal-bin \
apt-get install --no-install-recommends -y build-essential libpq-dev gettext chromium-driver gdal-bin fonts-symbola \
&& rm -rf /var/lib/apt/lists/*
COPY src/requirements.txt ./requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \

View File

@ -6,7 +6,8 @@ from celery.schedules import crontab
from celery.utils.log import get_task_logger
from celery.exceptions import MaxRetriesExceededError
import time as time_
from django.conf import settings
from celery.signals import worker_ready
from contextlib import contextmanager
@ -250,6 +251,23 @@ def daily_imports(self):
run_recurrent_imports_from_list([imp.pk for imp in imports])
# Path of the site preview image written by the screenshot task.
# Built with an f-string so it works whether MEDIA_ROOT is a str or a
# pathlib.Path (``Path + str`` would raise TypeError).
SCREENSHOT_FILE = f"{settings.MEDIA_ROOT}/screenshot.png"
@app.task(bind=True)
def screenshot(self):
    """Capture the public home page and store it at SCREENSHOT_FILE.

    The downloader is created with ``noimage=False`` so that images are
    loaded and appear in the capture.
    """
    downloader = ChromiumHeadlessDownloader(noimage=False)
    try:
        downloader.screenshot("https://pommesdelune.fr", SCREENSHOT_FILE)
    finally:
        # Always shut the headless browser down; otherwise every run of this
        # task would leave a Chromium/chromedriver process behind.
        downloader.driver.quit()
@worker_ready.connect
def at_start(sender, **k):
    """Queue the screenshot task at worker start-up if no capture exists yet.

    Connected to the ``worker_ready`` signal. When SCREENSHOT_FILE is already
    on disk nothing is queued; otherwise the screenshot task is sent through
    a connection opened on the sender's app.
    """
    if os.path.isfile(SCREENSHOT_FILE):
        logger.info("Screenshot file already exists")
        return

    logger.info("Init screenshot file")
    celery_app = sender.app
    with celery_app.connection() as broker:
        celery_app.send_task('agenda_culturel.celery.screenshot', None, connection=broker)
@app.task(bind=True)
def run_all_recurrent_imports(self):
from agenda_culturel.models import RecurrentImport
@ -370,6 +388,10 @@ app.conf.beat_schedule = {
# Daily imports at 3:14 a.m.
"schedule": crontab(hour=3, minute=14),
},
"daily_screenshot": {
"task": "agenda_culturel.celery.screenshot",
"schedule": crontab(hour=3, minute=3),
},
"weekly_imports": {
"task": "agenda_culturel.celery.weekly_imports",
# Weekly imports on Mondays at 2:22 a.m.

View File

@ -66,7 +66,7 @@ class SimpleDownloader(Downloader):
class ChromiumHeadlessDownloader(Downloader):
def __init__(self, pause=True):
def __init__(self, pause=True, noimage=True):
super().__init__()
self.pause = pause
self.options = Options()
@ -78,17 +78,31 @@ class ChromiumHeadlessDownloader(Downloader):
self.options.add_argument("--disable-dev-shm-usage")
self.options.add_argument("--disable-browser-side-navigation")
self.options.add_argument("--disable-gpu")
self.options.add_experimental_option(
"prefs", {
# block image loading
"profile.managed_default_content_settings.images": 2,
}
)
if noimage:
self.options.add_experimental_option(
"prefs", {
# block image loading
"profile.managed_default_content_settings.images": 2,
}
)
self.service = Service("/usr/bin/chromedriver")
self.driver = webdriver.Chrome(service=self.service, options=self.options)
def screenshot(self, url, path_image):
    """Load *url* in the headless browser and save a screenshot of it.

    Args:
        url: Address of the page to capture.
        path_image: Destination path passed to the driver's
            ``save_screenshot``.

    Returns:
        True if the page was loaded and ``save_screenshot`` was called
        without raising; False if any of these steps raised an exception.
    """
    print("Screenshot {}".format(url))
    try:
        self.driver.get(url)
        if self.pause:
            # Presumably gives the page time to finish rendering before
            # the capture is taken.
            time.sleep(2)
        self.driver.save_screenshot(path_image)
    except Exception:
        # Report the failing address. The message must use the ``url``
        # parameter: an undefined name here would raise NameError from
        # inside the handler instead of returning False.
        print(f">> Exception: {url}")
        return False
    return True
def download(self, url, referer=None, post=None):
if post:
raise Exception("POST method with Chromium headless not yet implemented")

View File

@ -239,7 +239,7 @@ class FacebookEventExtractor(Extractor):
result = "https://www.facebook.com" + u.path
# remove name in the url
match = re.match(r"(.*/events)/s/([a-zA-Z-][a-zA-Z-0-9]+)/([0-9/]*)", result)
match = re.match(r"(.*/events)/s/([a-zA-Z-][a-zA-Z-0-9-]+)/([0-9/]*)", result)
if match:
result = match[1] + "/" + match[3]

Binary file not shown.

Before

Width:  |  Height:  |  Size: 237 KiB

View File

@ -9,7 +9,7 @@
{% load static %}
<meta property="og:title" content="Pommes de lune — {% block og_title %}{% endblock %}" />
<meta property="og:description" content="{% block og_description %}Événements culturels à Clermont-Ferrand et aux environs{% endblock %}" />
<meta property="og:image" content="{% block og_image %}{% static 'images/capture.png' %}{% endblock %}" />
<meta property="og:image" content="{% block og_image %}{% get_media_prefix %}/screenshot.png{% endblock %}" />
<meta property="og:url" content="{{ request.build_absolute_uri }}" />
{% if debug %}