# encoding: utf-8
# Scrape each representative's page on wiki.laquadrature.net and collect
# their recorded votes into a CSV file.
import csv
import os
import sys
from datetime import datetime
from time import sleep

import requests
from bs4 import BeautifulSoup
from slugify import slugify

data_root = os.environ.get("POLITIKORAMA_DATA_ROOT", "../tmp")

# Input: one representative wiki page name per line.
data_source = os.path.join(data_root, "liste_deputes.txt")
with open(data_source) as file_handler:
    deputes = file_handler.read().splitlines()

data_target = os.path.join(data_root, "lqdn_votes.csv")
with open(data_target, "w", encoding="utf-8", newline="") as csvfile:
    writer = csv.writer(csvfile, delimiter=",", quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow(["name", "slug", "matter", "date", "source_url"])

    for depute in deputes:
        # Progress indicator: one dot per representative.
        print(".", end="")
        sys.stdout.flush()
        try:
            # Do not DDOS lqdn wiki ;o)
            sleep(.2)
            content = requests.get(f"https://wiki.laquadrature.net/{depute}").text
            soup = BeautifulSoup(content, features="lxml")

            # The page headline looks like "Title: Name, ..."; extract the
            # name, falling back to the page slug if the headline is missing.
            deputy = soup.find("span", attrs={"class": "mw-headline"})
            if deputy is not None:
                vote_author = deputy.text.split(",")[0].split(":")[1].strip()
            else:
                vote_author = depute

            # The votes live in the <ul> that follows the "Votes" heading.
            # If the heading is absent, the AttributeError is caught below.
            votes = soup.find("span", attrs={"id": "Votes"}).parent.find_next_sibling("ul")
            if votes is not None:
                for vote in votes.find_all("li"):
                    # Progress indicator: one dot per vote.
                    print(".", end="")
                    sys.stdout.flush()
                    # Each entry starts with a DD/MM/YYYY date followed by a
                    # link to the matter voted on.
                    vote_date = datetime.strptime(vote.text.split()[0], "%d/%m/%Y")
                    vote_matter = vote.find("a").text
                    vote_url = vote.find("a").get("href")
                    writer.writerow([
                        vote_author,
                        slugify(vote_author),
                        vote_matter,
                        datetime.strftime(vote_date, "%Y-%m-%d"),
                        vote_url,
                    ])
            print()
        except (AttributeError, ValueError):
            # Missing "Votes" section or a malformed entry: skip this page.
            print(f"\nError while scraping stances for representative '{depute}'")
            continue
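
# A minimal usage sketch. The script filename "scrape_lqdn_votes.py" is an
# assumption; only the POLITIKORAMA_DATA_ROOT variable and the input/output
# filenames come from the script itself:
#
#   POLITIKORAMA_DATA_ROOT=../tmp python scrape_lqdn_votes.py
#
# liste_deputes.txt must already exist under the data root, one wiki page
# name per line; lqdn_votes.csv is written next to it.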