# (file metadata: 71 lines, 2.7 KiB, Python)
# encoding: utf-8
|
|
|
|
from datetime import datetime
|
|
from string import ascii_uppercase
|
|
import os
|
|
import sys
|
|
from time import sleep
|
|
|
|
from bs4 import BeautifulSoup
|
|
import csv
|
|
import requests
|
|
from slugify import slugify
|
|
|
|
# Scrape the La Quadrature du Net wiki page of each representative listed in
# liste_deputes.txt and dump every recorded stance (dated quote) into
# lqdn_stances.csv with columns: name, slug, matter, subject, date, extract,
# source_url.
data_root = os.environ.get("POLITIKORAMA_DATA_ROOT", "../tmp")

# Input: one wiki page name per line.
data_source = os.path.join(data_root, "liste_deputes.txt")
with open(data_source, encoding="utf-8") as file_handler:
    deputes = file_handler.read().splitlines()

data_target = os.path.join(data_root, "lqdn_stances.csv")
with open(data_target, "w", encoding="utf-8", newline="") as csvfile:
    writer = csv.writer(csvfile, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(["name", "slug", "matter", "subject", "date", "extract", "source_url"])

    for depute in deputes:
        # Progress indicator: one dot per representative.
        print(".", end="")
        sys.stdout.flush()
        try:
            # Do not DDOS lqdn wiki ;o)
            sleep(.2)
            # timeout keeps one stuck request from hanging the whole run.
            content = requests.get(f"https://wiki.laquadrature.net/{depute}", timeout=30).text
            soup = BeautifulSoup(content, features="lxml")

            # The page headline looks like "...: Name, ..." — extract the name;
            # fall back to the raw page name when the headline is missing.
            deputy = soup.find("span", attrs={"class": "mw-headline"})
            if deputy is not None:
                stance_author = deputy.text.split(",")[0].split(":")[1].strip()
            else:
                stance_author = depute

            # Each stance is introduced by an <h5> of the form "dd/mm/YYYY Subject".
            for quote in soup.find_all("h5"):
                try:
                    stance_date = datetime.strptime(quote.text.split()[0], "%d/%m/%Y")
                    stance_subject = " ".join(quote.text.split()[1:])
                except (ValueError, IndexError):
                    # Heading does not start with a date: keep the full text
                    # as the subject and leave the date empty.
                    stance_date = None
                    stance_subject = quote.text

                # TODO: Set the matter accordingly to the subject
                stance_matter = stance_subject.split(":")[0]

                anchor = quote.find("a")
                stance_link = anchor.get("href") if anchor is not None else None

                # The quote body is the run of <blockquote> siblings
                # immediately following the heading.
                extracts = []
                block = quote.find_next_sibling()
                while block is not None and block.name == "blockquote":
                    extracts.append(block.text)
                    block = block.find_next_sibling()
                stance_quote = "\n".join(extracts)

                writer.writerow([
                    stance_author,
                    slugify(stance_author),
                    stance_matter,
                    stance_subject,
                    datetime.strftime(stance_date, "%Y-%m-%d") if stance_date is not None else None,
                    stance_quote,
                    stance_link,
                ])
        except AttributeError:
            # Best effort: skip representatives whose page does not match the
            # expected structure and keep scraping the rest.
            print(f"\nError while scraping stances for representative '{depute}'")
            continue