tools/lqdn/scrap_votes.py

# encoding: utf-8

from datetime import datetime
from string import ascii_uppercase
import os
import sys
from time import sleep

from bs4 import BeautifulSoup
import csv
import requests
from slugify import slugify

data_root = os.environ.get("POLITIKORAMA_DATA_ROOT", "../tmp")

data_source = os.path.join(data_root, "liste_deputes.txt")
with open(data_source) as file_handler:
    deputes = file_handler.read().splitlines()

data_target = os.path.join(data_root, "lqdn_votes.csv")
with open(data_target, "w", encoding="utf-8", newline="") as csvfile:
    writer = csv.writer(csvfile, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(["name", "slug", "matter", "date", "source_url"])

    for depute in deputes:
        print(".", end="")
        sys.stdout.flush()
        try:
            # Do not DDOS lqdn wiki ;o)
            sleep(.2)
            content = requests.get(f"https://wiki.laquadrature.net/{depute}").text
            soup = BeautifulSoup(content, features="lxml")
            deputy = soup.find("span", attrs={"class": "mw-headline"})
            if deputy is not None:
                vote_author = deputy.text.split(",")[0].split(":")[1].strip()
            else:
                vote_author = depute
            votes = soup.find("span", attrs={"id": "Votes"}).parent.find_next_sibling("ul")
            if votes is not None:
                for vote in votes.find_all("li"):
                    print(".", end="")
                    sys.stdout.flush()
                    vote_date = datetime.strptime(vote.text.split()[0], "%d/%m/%Y")
                    vote_matter = vote.find("a").text
                    vote_url = vote.find("a").get("href")
                    writer.writerow([
                        vote_author,
                        slugify(vote_author),
                        vote_matter,
                        datetime.strftime(vote_date, "%Y-%m-%d"),
                        vote_url,
                    ])
            print()
        except (AttributeError, ValueError):
            print(f"\nError while scraping stances for representative '{depute}'")
            continue