diff --git a/src/agenda_culturel/import_tasks/custom_extractors/fbevents.py b/src/agenda_culturel/import_tasks/custom_extractors/fbevents.py
index d6be501..c47c8ba 100644
--- a/src/agenda_culturel/import_tasks/custom_extractors/fbevents.py
+++ b/src/agenda_culturel/import_tasks/custom_extractors/fbevents.py
@@ -15,6 +15,54 @@ logger = logging.getLogger(__name__)
 # such as https://www.facebook.com/laJeteeClermont/events
 class CExtractor(TwoStepsExtractor):
 
+    def find_event_id_fragment_in_array(self, array, first=True):
+        """Register every Facebook event found in decoded JSON data.
+
+        Dicts and lists are walked recursively. A dict whose "__typename"
+        is "Event" and that has an "id" key is registered with
+        add_event_url() and is not explored any further.
+
+        The "first" argument is not used by the traversal; it is kept so
+        that existing calls stay valid.
+
+        Return True when at least one event URL has been registered.
+        """
+        if isinstance(array, dict):
+            if array.get("__typename") == "Event" and "id" in array:
+                self.add_event_url("https://www.facebook.com/events/" + array["id"] + "/")
+                return True
+            children = array.values()
+        elif isinstance(array, list):
+            children = array
+        else:
+            # Scalars (str, int, None...) cannot hold an event.
+            return False
+
+        found = False
+        for child in children:
+            # Recursive call first: "or" must not short-circuit it.
+            found = self.find_event_id_fragment_in_array(child, False) or found
+        return found
+
+    def find_in_js(self, soup):
+        """Register the events described in the page's JSON <script> tags.
+
+        Return True when at least one event URL has been registered.
+        """
+        found = False
+
+        for json_script in soup.find_all("script", type="application/json"):
+            try:
+                json_struct = json.loads(json_script.get_text())
+            except json.JSONDecodeError:
+                # An empty or malformed script must not abort the whole
+                # import: skip it and keep scanning the other scripts.
+                logger.warning("fbevents: skipping undecodable JSON script")
+                continue
+            found = self.find_event_id_fragment_in_array(json_struct) or found
+
+        return found
+
     def build_event_url_list(self, content):
         soup = BeautifulSoup(content, "html.parser")
 
@@ -28,6 +76,8 @@ class CExtractor(TwoStepsExtractor):
                 self.add_event_url(link.get('href').split('?')[0])
                 found = True
 
+        found = self.find_in_js(soup) or found
+
         if not found and debug:
             directory = "errors/"
             if not os.path.exists(directory):