# tools/lqdn/scrap_representatives.py
#
# 62 lines
# 2.1 KiB
# Python
# Raw Normal View History
#
# 2021-07-23 17:21:38 +02:00
# encoding: utf-8
import csv
from datetime import datetime
import locale
import re
from string import ascii_uppercase
from time import sleep
import sys
from bs4 import BeautifulSoup
import requests
from slugify import slugify
# Scrape the La Quadrature du Net wiki page of every deputy listed in
# ../tmp/liste_deputes.txt and write one CSV row per deputy (name, picture,
# nationality, sex, birth date, birth place, job) to
# ../tmp/lqdn_representatives.csv.

# URL of the wiki category page; not referenced by the code below (the page
# names are read from a local file instead) but kept as a module-level name.
url = "https://wiki.laquadrature.net/index.php?title=Cat%C3%A9gorie:D%C3%A9put%C3%A9s&pagefrom="

# The birth date is parsed with "%B", which depends on the active locale.
# NOTE(review): "FR" is a platform-dependent locale name - confirm it resolves
# on the host that runs this script.
locale.setlocale(locale.LC_ALL, "FR")

# Compiled once instead of on every loop iteration.
BIRTH_PATTERN = re.compile("Née? le")
JOB_PATTERN = re.compile("Profession")

# Seconds to wait for the wiki before giving up, so that a stalled connection
# cannot hang the whole run.
REQUEST_TIMEOUT = 30

# One wiki page name per line; blank lines are skipped because they would
# otherwise trigger a request to the wiki root.
with open("../tmp/liste_deputes.txt", encoding="utf-8") as file_handler:
    deputes = [line for line in file_handler.read().splitlines() if line.strip()]

# Extract representatives
with open("../tmp/lqdn_representatives.csv", "w", encoding="utf-8", newline="") as csvfile:
    writer = csv.writer(csvfile, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(["name", "picture", "nationality", "sex", "birth_date", "birth_place", "job"])
    for depute in deputes:
        # Progress indicator: one dot per page.
        print(".", end="")
        sys.stdout.flush()

        # Loading information
        content = requests.get(
            f"https://wiki.laquadrature.net/{depute}",
            timeout=REQUEST_TIMEOUT,
        ).text
        soup = BeautifulSoup(content, features="lxml")
        deputy = soup.find("span", attrs={"class": "mw-headline"})

        # Identity: the name is the text between the first ":" and the first
        # "," of the headline.
        fullname = deputy.text.split(":")[1].split(",")[0].strip()
        # The masculine form "Né le" marks a man; anything else is recorded
        # as "F".
        sex = "M" if "Né le" in content else "F"

        # The birth date and birth place are siblings of the "Né(e) le" label
        # inside its grandparent element.
        birth = soup.find(string=BIRTH_PATTERN).parent.parent
        birth_date = birth.contents[1].strip()
        birth_date = datetime.strptime(birth_date, "%d %B %Y").strftime("%Y-%m-%d")
        # Only the first whitespace-separated word of the place is kept.
        birth_city = birth.contents[3].strip("(").split()[0].strip()

        # The job is optional: fall back to an empty string when the
        # "Profession" entry is missing or not shaped as expected, without
        # swallowing unrelated errors such as KeyboardInterrupt.
        try:
            job = soup.find(string=JOB_PATTERN).parent.parent
            job_name = job.contents[1].split(":")[1].strip()
        except (AttributeError, IndexError, TypeError):
            job_name = ""

        # Picture: the <img> whose alt text is the deputy's full name.
        picture = soup.find("img", attrs={"alt": fullname})["src"]
        picture = f"https://wiki.laquadrature.net{picture}"

        # CSV line
        writer.writerow([
            fullname,
            picture,
            "FR",
            sex,
            birth_date,
            birth_city,
            job_name,
        ])