Fix the extractor for the new La Puce à l'Oreille website

This commit is contained in:
Jean-Marie Favreau 2024-09-14 15:43:16 +02:00
parent 62060925cd
commit 6c86a8fc18
2 changed files with 7 additions and 9 deletions

View File

@@ -29,8 +29,8 @@ from src.agenda_culturel.import_tasks.custom_extractors import *
if __name__ == "__main__":
u2e = URL2Events(SimpleDownloader(), lapucealoreille.CExtractor())
url = "https://www.lapucealoreille63.fr/programmation/"
url_human = "https://www.lapucealoreille63.fr/programmation/"
url = "https://www.lapucealoreille63.fr/agenda"
url_human = "https://www.lapucealoreille63.fr/agenda"
try:
events = u2e.process(url, url_human, cache = "cache-lapucealoreille.xml", default_values = {}, published = True)

View File

@@ -15,11 +15,6 @@ class CExtractor(TwoStepsExtractor):
e_url = e.find("a")
if e_url:
if self.add_event_url(e_url["href"]):
title = e.select("div[data-testid=richTextElement] h1.font_0 span")
if title:
title = title[0].contents[0].get_text().replace("\n", " ")
title = re.sub(" +", " ", title)
self.add_event_title(e_url["href"], title)
def add_event_from_content(
self,
@@ -31,9 +26,12 @@ class CExtractor(TwoStepsExtractor):
):
soup = BeautifulSoup(event_content, "html.parser")
title = soup.select("h2")[0].get_text()
start_day = self.parse_french_date(
soup.find("h2").get_text()
soup.select("h2")[1].get_text()
) # pas parfait, mais bordel que ce site est mal construit
print(soup.select("h2")[1].get_text())
spans = soup.select("div[data-testid=richTextElement] span")
start_time = None
@@ -79,7 +77,7 @@ class CExtractor(TwoStepsExtractor):
self.add_event_with_props(
event_url,
None,
title,
"Concert",
start_day,
location,