diff --git a/src/agenda_culturel/celery.py b/src/agenda_culturel/celery.py index 09c0b8e..efb5bd6 100644 --- a/src/agenda_culturel/celery.py +++ b/src/agenda_culturel/celery.py @@ -20,6 +20,8 @@ app = Celery("agenda_culturel") logger = get_task_logger(__name__) +chromiumDownloader = ChromiumHeadlessDownloader() + # Using a string here means the worker doesn't have to serialize # the configuration object to child processes. @@ -96,7 +98,7 @@ def run_recurrent_import(self, pk): downloader = ( SimpleDownloader() if rimport.downloader == RecurrentImport.DOWNLOADER.SIMPLE - else ChromiumHeadlessDownloader() + else chromiumDownloader ) if rimport.processor == RecurrentImport.PROCESSOR.ICAL: extractor = ICALExtractor() diff --git a/src/agenda_culturel/import_tasks/downloader.py b/src/agenda_culturel/import_tasks/downloader.py index c6a9cdf..3a4776d 100644 --- a/src/agenda_culturel/import_tasks/downloader.py +++ b/src/agenda_culturel/import_tasks/downloader.py @@ -67,14 +67,14 @@ class ChromiumHeadlessDownloader(Downloader): self.options.add_argument("--disable-dev-shm-usage") self.options.add_argument("--no-sandbox") self.service = Service("/usr/bin/chromedriver") + self.driver = webdriver.Chrome(service=self.service, options=self.options) - def download(self, url, post=None): + def download(self, url, referer=None, post=None): if post: raise Exception("POST method with Chromium headless not yet implemented") print("Download {}".format(url)) - self.driver = webdriver.Chrome(service=self.service, options=self.options) self.driver.get(url) doc = self.driver.page_source - self.driver.quit() + self.driver.close() return doc diff --git a/src/agenda_culturel/import_tasks/extractor.py b/src/agenda_culturel/import_tasks/extractor.py index 7870a7d..ccb4380 100644 --- a/src/agenda_culturel/import_tasks/extractor.py +++ b/src/agenda_culturel/import_tasks/extractor.py @@ -16,6 +16,7 @@ class Extractor(ABC): self.header = {} self.events = [] self.downloader = None + self.referer = "" def guess_end_day(self, start_day, start_time, end_time): if end_time: diff --git a/src/agenda_culturel/import_tasks/importer.py b/src/agenda_culturel/import_tasks/importer.py index 960f93c..753381f 100644 --- a/src/agenda_culturel/import_tasks/importer.py +++ b/src/agenda_culturel/import_tasks/importer.py @@ -13,7 +13,10 @@ class URL2Events: def process( self, url, url_human=None, cache=None, default_values=None, published=False ): - content = self.downloader.get_content(url, cache, referer=self.extractor.url_referer) + referer = "" + if self.extractor: + referer = self.extractor.url_referer + content = self.downloader.get_content(url, cache, referer=referer) if content is None: return None