Intégration de beautifulsoup et selenium pour récupérer le contenu d'un événement Facebook

This commit is contained in:
Jean-Marie Favreau 2023-10-18 11:53:07 +02:00
parent a21b9d030e
commit d69ed7f3d8
5 changed files with 110 additions and 1 deletions

View File

@ -53,6 +53,9 @@ migrate:
build-dev:
DOCKER_BUILDKIT=1 COMPOSE_DOCKER_CLI_BUILD=1 docker-compose -f docker-compose.yml up --build -d
build-dev-log:
DOCKER_BUILDKIT=1 COMPOSE_DOCKER_CLI_BUILD=1 docker-compose -f docker-compose.yml up --build
build-prod:
DOCKER_BUILDKIT=1 COMPOSE_DOCKER_CLI_BUILD=1 docker-compose -f docker-compose.prod.yml up --build -d

View File

@ -5,7 +5,7 @@ WORKDIR /usr/src/app
RUN --mount=type=cache,target=/var/cache/apt \
apt-get update && \
apt-get install --no-install-recommends -y build-essential libpq-dev gettext \
apt-get install --no-install-recommends -y build-essential libpq-dev gettext chromium-driver \
&& rm -rf /var/lib/apt/lists/*

View File

@ -4,6 +4,7 @@ from celery import Celery
from celery.schedules import crontab
from celery.utils.log import get_task_logger
from .extractors import ExtractorAllURLs
# Set the default Django settings module for the 'celery' program.
APP_ENV = os.getenv("APP_ENV", "dev")
@ -29,6 +30,7 @@ def create_event_from_submission(self, url):
logger.info(f"{url=}")
try:
logger.info("About to create event from submission")
events = ExtractorAllURLs.extract(url)
# TODO
except BadHeaderError:
logger.info("BadHeaderError")

View File

@ -0,0 +1,102 @@
from abc import ABC, abstractmethod
#from .models import Event
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import json
from celery.utils.log import get_task_logger
logger = get_task_logger(__name__)
class Extractor(ABC):
    """Base class for event extractors.

    Subclasses implement ``extract(url)``; ``download`` is a shared helper
    that fetches a fully rendered page using headless Chromium, so that
    JavaScript-generated content (e.g. Facebook event pages) is present.

    NOTE: inheriting ``ABC`` makes ``@abstractmethod`` actually enforced;
    the original class used the decorator without the ABC metaclass, which
    has no effect.
    """

    @abstractmethod
    def extract(url):
        """Extract event data from *url*; return None on failure."""
        pass

    def download(url):
        """Return the rendered HTML source of *url*, or None on any error.

        Best-effort: every failure is logged and reported as None so the
        caller can fall through to other extractors.
        """
        driver = None
        try:
            options = Options()
            options.add_argument("--headless=new")
            # Required when running inside a Docker container (small /dev/shm).
            options.add_argument("--disable-dev-shm-usage")
            options.add_argument("--no-sandbox")
            # Path installed by the chromium-driver package (see Dockerfile).
            service = Service("/usr/bin/chromedriver")
            driver = webdriver.Chrome(service=service, options=options)
            driver.get(url)
            return driver.page_source
        except Exception as e:
            logger.error(e)
            return None
        finally:
            # Always release the browser process, even if driver.get() raised;
            # the original quit() was skipped on error and leaked Chromium.
            if driver is not None:
                driver.quit()
class ExtractorFacebook(Extractor):
    """Extractor for Facebook event pages.

    Facebook embeds the event data as JSON inside
    ``<script type="application/json">`` tags; we search those structures
    for the first dict that carries all the expected event keys.
    """

    class FacebookEvent:
        """Thin wrapper around the raw event dict found in the page JSON."""

        name = "event"
        # A dict qualifies as an event only if it contains every one of
        # these keys (observed structure of Facebook's embedded JSON).
        keys = [
            "start_time_formatted",
            "start_timestamp",
            "is_past",
            "name",
            "price_info",
            "cover_media_renderer",
            "event_creator",
            "id",
            "day_time_sentence",
            "event_place",
            "comet_neighboring_siblings",
        ]

        def __init__(self, event):
            self.data = event

        def __str__(self):
            return self.data["name"]

        def find_event_in_array(array):
            """Depth-first search of a nested JSON structure.

            Returns a FacebookEvent built from the first dict containing
            every key in ``keys``, or None when no such dict exists.
            """
            if isinstance(array, dict):
                # all() replaces the original length-comparison trick:
                # same result, clearer intent, and short-circuits early.
                if all(k in array for k in ExtractorFacebook.FacebookEvent.keys):
                    return ExtractorFacebook.FacebookEvent(array)
                for child in array.values():
                    found = ExtractorFacebook.FacebookEvent.find_event_in_array(child)
                    if found is not None:
                        return found
            elif isinstance(array, list):
                for child in array:
                    found = ExtractorFacebook.FacebookEvent.find_event_in_array(child)
                    if found is not None:
                        return found
            return None

    def extract(url):
        """Download *url* and return the embedded event data, or None.

        Returns None when the page cannot be downloaded or when no
        qualifying event dict is found in any JSON script tag.
        """
        txt = Extractor.download(url)
        if txt is None:
            logger.error("Cannot download " + url)
            return None
        soup = BeautifulSoup(txt, "html.parser")
        for json_script in soup.find_all('script', type="application/json"):
            json_txt = json_script.get_text()
            json_struct = json.loads(json_txt)
            fevent = ExtractorFacebook.FacebookEvent.find_event_in_array(json_struct)
            if fevent is not None:
                logger.info(str(fevent))
                # TODO: map the raw Facebook dict onto the project's Event model.
                result = "TODO"
                return result
        return None
class ExtractorAllURLs:
    """Dispatcher that tries each known site-specific extractor in turn."""

    def extract(url):
        """Run the available extractors on *url*; return the first result.

        Currently only the Facebook extractor exists; returns None when
        no extractor recognizes the URL.
        """
        logger.info("Run extraction")

        result = ExtractorFacebook.extract(url)
        if result is not None:
            return result

        logger.info("Not a Facebook link")
        # TODO: chain additional site-specific extractors here.
        return result

View File

@ -20,3 +20,5 @@ vine==5.0.0
wcwidth==0.2.6
redis==4.5.5
whitenoise==6.4.0
selenium==4.14.0
beautifulsoup4==4.12.2