import requests
from bs4 import BeautifulSoup
import json
import time


def scrape_page(url):
    # Fetch the HTML content of the page
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract the relevant information from the page
    title = soup.select_one("#main figure h1").text.strip()
    tags = ", ".join([a.text.strip() for a in soup.select_one("#main figure ul").find_all("a")])
    image_url = soup.select_one("#main figure img")["src"]
    date = soup.select_one("#main #article-date").text.strip()
    content = soup.select_one("#main #content").text.strip()
    html_content = str(soup.select_one("#main #content"))

    return {
        "title": title,
        "tags": tags,
        "image": image_url,
        "date": date,
        "content": content,
        "html_content": html_content,
        "url": url,
    }


# Load the URLs from the JSON file
with open("partipirate_links.json", "r", encoding="utf-8") as f:
    urls = json.load(f)

# Loop over the URLs (limited to the first batch of pages)
all_pages = []
for counter, url in enumerate(urls, start=1):
    if counter >= 1000:
        break
    print(f"{counter} / {len(urls)}, fetching page {url}")
    all_pages.append(scrape_page(url))
    time.sleep(0.2)  # short pause between requests to avoid hammering the server

# Save the collected pages to a JSON file
with open("all_pages.json", "w", encoding="utf-8") as f:
    json.dump(all_pages, f, ensure_ascii=False, indent=4)
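
# Note: the selectors above assume every article page exposes the same markup.
# If an element is missing, select_one() returns None and the subsequent .text
# access raises AttributeError, aborting the whole run. A possible defensive
# variant of the extraction (a sketch, not part of the original script; the
# helper name safe_text is an assumption):
#
#     def safe_text(soup, selector):
#         node = soup.select_one(selector)
#         return node.text.strip() if node else None
#
#     title = safe_text(soup, "#main figure h1")
#     date = safe_text(soup, "#main #article-date")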