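"""Scrape the article pages listed in partipirate_links.json and save the
extracted fields (title, tags, image, date, text and HTML content) to
all_pages.json."""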
import requests
from bs4 import BeautifulSoup
import json
import time

def scrape_page(url):
    # Fetch the HTML content of the page
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract the information from the page
    title = soup.select_one("#main figure h1").text.strip()
    tags = ", ".join([a.text.strip() for a in soup.select_one("#main figure ul").find_all("a")])
    image_url = soup.select_one("#main figure img")["src"]
    date = soup.select_one("#main #article-date").text.strip()
    # Article body, both as plain text and as raw HTML
    content = soup.select_one("#main #content").text.strip()
    html_content = str(soup.select_one("#main #content"))

    return {
        "title": title,
        "tags": tags,
        "image": image_url,
        "date": date,
        "content": content,
        "html_content": html_content,
        "url": url,
    }

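# Note: the CSS selectors above ("#main figure h1", "#main #article-date",
# "#main #content", ...) assume the article template of the scraped site.
# On a page that does not match this layout, select_one() returns None and
# scrape_page() raises an exception, which stops the loop below; wrapping the
# call in try/except is one way to skip such pages instead.
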
# Load the URLs from the JSON file
with open("partipirate_links.json", "r", encoding="utf-8") as f:
    urls = json.load(f)
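# Assumption: partipirate_links.json contains a flat JSON array of URL strings,
# e.g. ["https://example.org/article-1", "https://example.org/article-2"].
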
# Loop over the URLs
all_pages = []
counter = 1
for url in urls:
    # Only process pages while the counter is under 1000
    if counter < 1000:
        print(f"{counter} / {len(urls)}, fetching page ", url)
        page = scrape_page(url)
        all_pages.append(page)
        # Short pause between requests to avoid hammering the server
        time.sleep(0.2)
        counter += 1

# Save the scraped pages to the JSON file
with open("all_pages.json", "w", encoding="utf-8") as f:
    json.dump(all_pages, f, ensure_ascii=False, indent=4)