diff --git a/Makefile b/Makefile index a686897..a8e0359 100644 --- a/Makefile +++ b/Makefile @@ -53,6 +53,9 @@ migrate: build-dev: DOCKER_BUILDKIT=1 COMPOSE_DOCKER_CLI_BUILD=1 docker-compose -f docker-compose.yml up --build -d +build-dev-log: + DOCKER_BUILDKIT=1 COMPOSE_DOCKER_CLI_BUILD=1 docker-compose -f docker-compose.yml up --build + build-prod: DOCKER_BUILDKIT=1 COMPOSE_DOCKER_CLI_BUILD=1 docker-compose -f docker-compose.prod.yml up --build -d diff --git a/deployment/Dockerfile b/deployment/Dockerfile index 5cab593..08c6567 100644 --- a/deployment/Dockerfile +++ b/deployment/Dockerfile @@ -5,7 +5,7 @@ WORKDIR /usr/src/app RUN --mount=type=cache,target=/var/cache/apt \ apt-get update && \ - apt-get install --no-install-recommends -y build-essential libpq-dev gettext \ + apt-get install --no-install-recommends -y build-essential libpq-dev gettext chromium-driver \ && rm -rf /var/lib/apt/lists/* diff --git a/src/agenda_culturel/celery.py b/src/agenda_culturel/celery.py index e467f4e..c80dfb7 100644 --- a/src/agenda_culturel/celery.py +++ b/src/agenda_culturel/celery.py @@ -4,6 +4,7 @@ from celery import Celery from celery.schedules import crontab from celery.utils.log import get_task_logger +from .extractors import ExtractorAllURLs # Set the default Django settings module for the 'celery' program. 
APP_ENV = os.getenv("APP_ENV", "dev") @@ -29,6 +30,7 @@ def create_event_from_submission(self, url): logger.info(f"{url=}") try: logger.info("About to create event from submission") + events = ExtractorAllURLs.extract(url) # TODO except BadHeaderError: logger.info("BadHeaderError") diff --git a/src/agenda_culturel/extractors.py b/src/agenda_culturel/extractors.py new file mode 100644 index 0000000..5d3c01e --- /dev/null +++ b/src/agenda_culturel/extractors.py @@ -0,0 +1,102 @@ +from abc import ABC, abstractmethod +#from .models import Event + +from selenium import webdriver +from selenium.webdriver.chrome.service import Service +from selenium.webdriver.chrome.options import Options + +from bs4 import BeautifulSoup + +import json + + +from celery.utils.log import get_task_logger + +logger = get_task_logger(__name__) + + +class Extractor(ABC): + + @abstractmethod + def extract(url): + pass + + def download(url): + try: + options = Options() + options.add_argument("--headless=new") + options.add_argument("--disable-dev-shm-usage") + options.add_argument("--no-sandbox") + service = Service("/usr/bin/chromedriver") + + driver = webdriver.Chrome(service=service, options=options) + driver.get(url) + doc = driver.page_source + driver.quit() + return doc + except Exception as e: + logger.error(e) + return None + +class ExtractorFacebook(Extractor): + + class FacebookEvent: + + name = "event" + keys = ["start_time_formatted", 'start_timestamp', 'is_past', "name", "price_info", "cover_media_renderer", "event_creator", "id", "day_time_sentence", "event_place", "comet_neighboring_siblings"] + + def __init__(self, event): + self.data = event + + def __str__(self): + return self.data["name"] + + def find_event_in_array(array): + if isinstance(array, dict): + if len(ExtractorFacebook.FacebookEvent.keys) == len([k for k in ExtractorFacebook.FacebookEvent.keys if k in array]): + return ExtractorFacebook.FacebookEvent(array) + else: + for k in array: + v = 
ExtractorFacebook.FacebookEvent.find_event_in_array(array[k]) + if v is not None: + return v + elif isinstance(array, list): + for e in array: + v = ExtractorFacebook.FacebookEvent.find_event_in_array(e) + if v is not None: + return v + return None + + def extract(url): + txt = Extractor.download(url) + if txt is None: + logger.error("Cannot download " + url) + return None + else: + soup = BeautifulSoup(txt, "html.parser") + for json_script in soup.find_all('script', type="application/json"): + json_txt = json_script.get_text() + json_struct = json.loads(json_txt) + fevent = ExtractorFacebook.FacebookEvent.find_event_in_array(json_struct) + if fevent is not None: + logger.info(str(fevent)) + result = "TODO" + return result + + return None + + +class ExtractorAllURLs: + + + def extract(url): + logger.info("Run extraction") + + result = ExtractorFacebook.extract(url) + + if result is None: + logger.info("Not a Facebook link") + # add here other extractors + pass + + return result diff --git a/src/requirements.txt b/src/requirements.txt index d626fb3..a327a55 100644 --- a/src/requirements.txt +++ b/src/requirements.txt @@ -20,3 +20,5 @@ vine==5.0.0 wcwidth==0.2.6 redis==4.5.5 whitenoise==6.4.0 +selenium==4.14.0 +beautifulsoup4==4.12.2