diff --git a/experimentations/get_facebook_event.py b/experimentations/get_facebook_event.py index 41ca318..19defd6 100755 --- a/experimentations/get_facebook_event.py +++ b/experimentations/get_facebook_event.py @@ -39,7 +39,8 @@ class Event: "day_time_sentence", "event_place", "comet_neighboring_siblings"], - ["event_description"] + ["event_description"], + ["start_timestamp", "end_timestamp"] ] rules = { "event_description": { "description": ["text"]}, @@ -52,23 +53,29 @@ class Event: self.fragments = {} self.elements = {} self.neighbor_events = None + self.possible_end_timestamp = [] self.add_fragment(i, event) def add_fragment(self, i, event): self.fragments[i] = event - for k in Event.keys[i]: - if k == "comet_neighboring_siblings": - self.get_neighbor_events(event[k]) - elif k in Event.rules: - for nk, rule in Event.rules[k].items(): - c = event[k] - for ki in rule: - c = c[ki] - self.elements[nk] = c - else: - self.elements[k] = event[k] + if Event.keys[i] == ["start_timestamp", "end_timestamp"]: + self.get_possible_end_timestamp(i, event) + else: + for k in Event.keys[i]: + if k == "comet_neighboring_siblings": + self.get_neighbor_events(event[k]) + elif k in Event.rules: + for nk, rule in Event.rules[k].items(): + c = event[k] + for ki in rule: + c = c[ki] + self.elements[nk] = c + else: + self.elements[k] = event[k] + def get_possible_end_timestamp(self, i, data): + self.possible_end_timestamp.append(dict((k, data[k]) for k in Event.keys[i])) def get_neighbor_events(self, data): self.neighbor_events = [SimpleEvent(d) for d in data] @@ -77,35 +84,48 @@ class Event: return str(self.elements) + "\n Neighbors: " + ", ".join([ne.elements["id"] for ne in self.neighbor_events]) def consolidate_current_event(self): - if self.neighbor_events is not None and "id" in self.elements: + if self.neighbor_events is not None and "id" in self.elements and "end_timestamp" not in self.elements: id = self.elements["id"] for ne in self.neighbor_events: if ne.elements["id"] == id: self.elements["end_timestamp"] = ne.elements["end_timestamp"] + + if "end_timestamp" not in self.elements and len(self.possible_end_timestamp) != 0: + for s in self.possible_end_timestamp: + if s["start_timestamp"] == self.elements["start_timestamp"]: + self.elements["end_timestamp"] = s["end_timestamp"] + break - def find_event_fragment_in_array(array, event): + def find_event_fragment_in_array(array, event, first = True): if isinstance(array, dict): + seen = False for i, ks in enumerate(Event.keys): if len(ks) == len([k for k in ks if k in array]): + seen = True if event is None: - event = Event(i, array,) + event = Event(i, array) else: event.add_fragment(i, array) - else: - for k in array: - event = Event.find_event_fragment_in_array(array[k], event) + # only consider the first of Event.keys + break + if not seen: + for k in array: + event = Event.find_event_fragment_in_array(array[k], event, False) elif isinstance(array, list): for e in array: - event = Event.find_event_fragment_in_array(e, event) + event = Event.find_event_fragment_in_array(e, event, False) - if event is not None: + if event is not None and first: event.consolidate_current_event() return event #url="https://www.facebook.com/events/ical/export/?eid=2294200007432315" -url="https://www.facebook.com/events/2294199997432316/2294200007432315/" +#url="https://www.facebook.com/events/2294199997432316/2294200007432315/" +#url="https://www.facebook.com/events/635247792092358/" +url="https://www.facebook.com/events/872781744074648" +url="https://www.facebook.com/events/1432798543943663?" #url_cal = "https://www.facebook.com/events/ical/export/?eid=993406668581410" #url="https://jmtrivial.info" diff --git a/src/agenda_culturel/celery.py b/src/agenda_culturel/celery.py index 9b4d3e6..8a4d90a 100644 --- a/src/agenda_culturel/celery.py +++ b/src/agenda_culturel/celery.py @@ -33,7 +33,7 @@ def create_event_from_submission(self, url): logger.info(f"{url=}") if len(Event.objects.filter(reference_urls__contains=[url])) != 0: - logger.info("Already known url: ", url) + logger.info("Already known url: %s", url) else: try: logger.info("About to create event from submission") @@ -43,8 +43,6 @@ def create_event_from_submission(self, url): for e in events: e.save() - except BadHeaderError: - logger.info("BadHeaderError") except Exception as e: logger.error(e) diff --git a/src/agenda_culturel/extractors.py b/src/agenda_culturel/extractors.py index d20927f..2a92722 100644 --- a/src/agenda_culturel/extractors.py +++ b/src/agenda_culturel/extractors.py @@ -42,55 +42,155 @@ class Extractor: class ExtractorFacebook(Extractor): + class SimpleFacebookEvent: + + def __init__(self, data): + self.elements = {} + + for key in ["id", "start_timestamp", "end_timestamp"]: + self.elements[key] = data[key] if key in data else None + + if "parent_event" in data: + self.parent = ExtractorFacebook.SimpleFacebookEvent(data["parent_event"]) + + class FacebookEvent: name = "event" - keys = ["start_time_formatted", 'start_timestamp', 'is_past', "name", "price_info", "cover_media_renderer", "event_creator", "id", "day_time_sentence", "event_place", "comet_neighboring_siblings"] + keys = [ + ["start_time_formatted", 'start_timestamp', + 'is_past', + "name", + "price_info", + "cover_media_renderer", + "event_creator", + "id", + "day_time_sentence", + "event_place", + "comet_neighboring_siblings"], + ["event_description"], + ["start_timestamp", "end_timestamp"] + ] + rules = { + "event_description": { "description": ["text"]}, + "cover_media_renderer": {"image_alt": ["cover_photo", "photo", "accessibility_caption"], "image": ["cover_photo", "photo", "full_image", "uri"]}, + "event_creator": { "event_creator_name": ["name"], "event_creator_url": ["url"] }, + "event_place": {"event_place_name": ["name"] } + } - def __init__(self, event): - self.data = event + def __init__(self, i, event): + self.fragments = {} + self.elements = {} + self.neighbor_events = None + self.possible_end_timestamp = [] + self.add_fragment(i, event) + + def get_element(self, key): + return self.elements[key] if key in self.elements else None + + + def get_element_datetime(self, key): + v = self.get_element(key) + return datetime.fromtimestamp(v) if v is not None else None + + def add_fragment(self, i, event): + self.fragments[i] = event + + if ExtractorFacebook.FacebookEvent.keys[i] == ["start_timestamp", "end_timestamp"]: + self.get_possible_end_timestamp(i, event) + else: + for k in ExtractorFacebook.FacebookEvent.keys[i]: + if k == "comet_neighboring_siblings": + self.get_neighbor_events(event[k]) + elif k in ExtractorFacebook.FacebookEvent.rules: + for nk, rule in ExtractorFacebook.FacebookEvent.rules[k].items(): + c = event[k] + for ki in rule: + c = c[ki] + self.elements[nk] = c + else: + self.elements[k] = event[k] + + + def get_possible_end_timestamp(self, i, data): + self.possible_end_timestamp.append(dict((k, data[k]) for k in ExtractorFacebook.FacebookEvent.keys[i])) + + + def get_neighbor_events(self, data): + self.neighbor_events = [ExtractorFacebook.SimpleFacebookEvent(d) for d in data] def __str__(self): - return self.data["name"] + return str(self.elements) + "\n Neighbors: " + ", ".join([ne.elements["id"] for ne in self.neighbor_events]) - def find_event_in_array(array): + def consolidate_current_event(self): + if self.neighbor_events is not None and "id" in self.elements and "end_timestamp" not in self.elements: + if self.neighbor_events is not None and "id" in self.elements: + id = self.elements["id"] + for ne in self.neighbor_events: + if ne.elements["id"] == id: + self.elements["end_timestamp"] = ne.elements["end_timestamp"] + + if "end_timestamp" not in self.elements and len(self.possible_end_timestamp) != 0: + for s in self.possible_end_timestamp: + if s["start_timestamp"] == self.elements["start_timestamp"]: + self.elements["end_timestamp"] = s["end_timestamp"] + break + + def find_event_fragment_in_array(array, event, first = True): if isinstance(array, dict): - if len(ExtractorFacebook.FacebookEvent.keys) == len([k for k in ExtractorFacebook.FacebookEvent.keys if k in array]): - return ExtractorFacebook.FacebookEvent(array) - else: + + seen = False + for i, ks in enumerate(ExtractorFacebook.FacebookEvent.keys): + if len(ks) == len([k for k in ks if k in array]): + seen = True + if event is None: + event = ExtractorFacebook.FacebookEvent(i, array) + else: + event.add_fragment(i, array) + # only consider the first of FacebookEvent.keys + break + if not seen: for k in array: - v = ExtractorFacebook.FacebookEvent.find_event_in_array(array[k]) - if v != None: - return v + event = ExtractorFacebook.FacebookEvent.find_event_fragment_in_array(array[k], event, False) elif isinstance(array, list): for e in array: - v = ExtractorFacebook.FacebookEvent.find_event_in_array(e) - if v != None: - return v - return None + event = ExtractorFacebook.FacebookEvent.find_event_fragment_in_array(e, event, False) + + if event is not None and first: + event.consolidate_current_event() + return event def build_event(self, url): from .models import Event - # TODO - return Event(title=self.data["name"], - status=Event.STATUS.DRAFT, - start_day=datetime.fromtimestamp(self.data["start_timestamp"]), - reference_urls=[url]) + + return Event(title=self.get_element("name"), + status=Event.STATUS.DRAFT, + start_day=self.get_element_datetime("start_timestamp"), + start_time=self.get_element_datetime("start_timestamp"), + end_day=self.get_element_datetime("end_timestamp"), + end_time=self.get_element_datetime("end_timestamp"), + location=self.get_element("event_place_name"), + description=self.get_element("description"), + image=self.get_element("image"), + image_alt=self.get_element("image_alt"), + reference_urls=[url]) + def process_page(txt, url): + fevent = None soup = BeautifulSoup(txt, "html.parser") for json_script in soup.find_all('script', type="application/json"): json_txt = json_script.get_text() json_struct = json.loads(json_txt) - fevent = ExtractorFacebook.FacebookEvent.find_event_in_array(json_struct) - if fevent != None: - logger.info(str(fevent.data)) + fevent = ExtractorFacebook.FacebookEvent.find_event_fragment_in_array(json_struct, fevent) - result = fevent.build_event(url) - return [result] - + if fevent is not None: + logger.info("Facebook event: " + str(fevent)) + result = fevent.build_event(url) + return [result] + return None diff --git a/src/agenda_culturel/migrations/0014_alter_event_image_alter_event_image_alt.py b/src/agenda_culturel/migrations/0014_alter_event_image_alter_event_image_alt.py new file mode 100644 index 0000000..327c1cf --- /dev/null +++ b/src/agenda_culturel/migrations/0014_alter_event_image_alter_event_image_alt.py @@ -0,0 +1,23 @@ +# Generated by Django 4.2.1 on 2023-11-11 17:42 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('agenda_culturel', '0013_staticcontent_url_path'), + ] + + operations = [ + migrations.AlterField( + model_name='event', + name='image', + field=models.URLField(blank=True, help_text='URL of the illustration image', max_length=1024, null=True, verbose_name='Illustration'), + ), + migrations.AlterField( + model_name='event', + name='image_alt', + field=models.CharField(blank=True, help_text='Alternative text used by screen readers for the image', max_length=1024, null=True, verbose_name='Illustration description'), + ), + ] diff --git a/src/agenda_culturel/models.py b/src/agenda_culturel/models.py index be8b546..3759510 100644 --- a/src/agenda_culturel/models.py +++ b/src/agenda_culturel/models.py @@ -107,8 +107,8 @@ class Event(models.Model): description = models.TextField(verbose_name=_('Description'), help_text=_('General description of the event'), blank=True, null=True) - image = models.URLField(verbose_name=_('Illustration'), help_text=_("URL of the illustration image"), max_length=200, blank=True, null=True) - image_alt = models.CharField(verbose_name=_('Illustration description'), help_text=_('Alternative text used by screen readers for the image'), blank=True, null=True, max_length=512) + image = models.URLField(verbose_name=_('Illustration'), help_text=_("URL of the illustration image"), max_length=1024, blank=True, null=True) + image_alt = models.CharField(verbose_name=_('Illustration description'), help_text=_('Alternative text used by screen readers for the image'), blank=True, null=True, max_length=1024) reference_urls = ArrayField(models.URLField(max_length=512), verbose_name=_('URLs'), help_text=_("List of all the urls where this event can be found."), blank=True, null=True)