Integration of beautifulsoup and selenium to fetch the content of a Facebook event

parent: a21b9d030e
commit: d69ed7f3d8

Makefile
@@ -53,6 +53,9 @@ migrate:
 
 build-dev:
 	DOCKER_BUILDKIT=1 COMPOSE_DOCKER_CLI_BUILD=1 docker-compose -f docker-compose.yml up --build -d
 
+build-dev-log:
+	DOCKER_BUILDKIT=1 COMPOSE_DOCKER_CLI_BUILD=1 docker-compose -f docker-compose.yml up --build
+
 build-prod:
 	DOCKER_BUILDKIT=1 COMPOSE_DOCKER_CLI_BUILD=1 docker-compose -f docker-compose.prod.yml up --build -d
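The new build-dev-log target mirrors build-dev but drops docker-compose's -d (detached) flag, so the stack runs in the foreground and streams container logs to the terminal.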
Dockerfile
@@ -5,7 +5,7 @@ WORKDIR /usr/src/app
 
 RUN --mount=type=cache,target=/var/cache/apt \
     apt-get update && \
-    apt-get install --no-install-recommends -y build-essential libpq-dev gettext \
+    apt-get install --no-install-recommends -y build-essential libpq-dev gettext chromium-driver \
    && rm -rf /var/lib/apt/lists/*
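The chromium-driver package provides the /usr/bin/chromedriver binary that the new extractors.py points Selenium at. A throwaway sketch for sanity-checking that wiring from inside the container (not part of the commit; it assumes the package really installs the driver at that path):

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service

    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--no-sandbox")
    driver = webdriver.Chrome(service=Service("/usr/bin/chromedriver"), options=options)
    driver.get("https://example.org")
    print(driver.title)  # "Example Domain" when the headless browser works
    driver.quit()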
@@ -4,6 +4,7 @@ from celery import Celery
 from celery.schedules import crontab
 from celery.utils.log import get_task_logger
 
+from .extractors import ExtractorAllURLs
 
 # Set the default Django settings module for the 'celery' program.
 APP_ENV = os.getenv("APP_ENV", "dev")
@@ -29,6 +30,7 @@ def create_event_from_submission(self, url):
     logger.info(f"{url=}")
     try:
         logger.info("About to create event from submission")
+        events = ExtractorAllURLs.extract(url)
         # TODO
     except BadHeaderError:
         logger.info("BadHeaderError")
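The extraction now runs inside the existing Celery task, i.e. on a worker rather than in the request/response cycle. A minimal sketch of enqueueing it (it assumes the usual bind=True task decorator and the agenda_culturel.celery module path, neither of which is visible in this hunk):

    from agenda_culturel.celery import create_event_from_submission

    # Hand the URL to a worker; it will call ExtractorAllURLs.extract(url).
    create_event_from_submission.delay("https://www.facebook.com/events/1234567890/")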
src/agenda_culturel/extractors.py (new file, 102 lines)
@@ -0,0 +1,102 @@
+from abc import ABC, abstractmethod
+
+# from .models import Event
+
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.chrome.options import Options
+
+from bs4 import BeautifulSoup
+
+import json
+
+from celery.utils.log import get_task_logger
+
+logger = get_task_logger(__name__)
+
+
+class Extractor(ABC):
+
+    @staticmethod
+    @abstractmethod
+    def extract(url):
+        pass
+
+    @staticmethod
+    def download(url):
+        # Render the page in headless Chromium so that the JavaScript
+        # Facebook relies on runs before the HTML is scraped.
+        try:
+            options = Options()
+            options.add_argument("--headless=new")
+            options.add_argument("--disable-dev-shm-usage")
+            options.add_argument("--no-sandbox")
+            service = Service("/usr/bin/chromedriver")
+
+            driver = webdriver.Chrome(service=service, options=options)
+            driver.get(url)
+            doc = driver.page_source
+            driver.quit()
+            return doc
+        except Exception as e:
+            logger.error(e)
+            return None
+
+
+class ExtractorFacebook(Extractor):
+
+    class FacebookEvent:
+
+        name = "event"
+        # A dict is recognised as a Facebook event only if it contains
+        # every one of these keys.
+        keys = [
+            "start_time_formatted",
+            "start_timestamp",
+            "is_past",
+            "name",
+            "price_info",
+            "cover_media_renderer",
+            "event_creator",
+            "id",
+            "day_time_sentence",
+            "event_place",
+            "comet_neighboring_siblings",
+        ]
+
+        def __init__(self, event):
+            self.data = event
+
+        def __str__(self):
+            return self.data["name"]
+
+        @staticmethod
+        def find_event_in_array(array):
+            # Recursively walk the decoded JSON and return the first dict
+            # that carries all the expected event keys.
+            if isinstance(array, dict):
+                if len(ExtractorFacebook.FacebookEvent.keys) == len(
+                    [k for k in ExtractorFacebook.FacebookEvent.keys if k in array]
+                ):
+                    return ExtractorFacebook.FacebookEvent(array)
+                else:
+                    for k in array:
+                        v = ExtractorFacebook.FacebookEvent.find_event_in_array(array[k])
+                        if v is not None:
+                            return v
+            elif isinstance(array, list):
+                for e in array:
+                    v = ExtractorFacebook.FacebookEvent.find_event_in_array(e)
+                    if v is not None:
+                        return v
+            return None
+
+    @staticmethod
+    def extract(url):
+        txt = Extractor.download(url)
+        if txt is None:
+            logger.error("Cannot download " + url)
+            return None
+        else:
+            # Facebook embeds the event data as JSON in <script> tags.
+            soup = BeautifulSoup(txt, "html.parser")
+            for json_script in soup.find_all("script", type="application/json"):
+                json_txt = json_script.get_text()
+                json_struct = json.loads(json_txt)
+                fevent = ExtractorFacebook.FacebookEvent.find_event_in_array(json_struct)
+                if fevent is not None:
+                    logger.info(str(fevent))
+                    result = "TODO"
+                    return result
+            return None
+
+
+class ExtractorAllURLs:
+
+    @staticmethod
+    def extract(url):
+        logger.info("Run extraction")
+
+        result = ExtractorFacebook.extract(url)
+
+        if result is None:
+            logger.info("Not a Facebook link")
+            # add here other extractors
+            pass
+
+        return result
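ExtractorAllURLs is the single entry point the Celery task calls; it tries the Facebook extractor and returns None for anything else. A minimal sketch of exercising it directly, e.g. from a Django shell inside the container (the event URL is a placeholder):

    from agenda_culturel.extractors import ExtractorAllURLs

    result = ExtractorAllURLs.extract("https://www.facebook.com/events/1234567890/")
    if result is None:
        print("No event found: not a Facebook event page, or the download failed")
    else:
        print(result)  # currently the "TODO" placeholder; mapping to an Event is still open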
@@ -20,3 +20,5 @@ vine==5.0.0
 wcwidth==0.2.6
 redis==4.5.5
 whitenoise==6.4.0
+selenium==4.14.0
+BeautifulSoup4==4.12.2