71 lines
2.5 KiB
Python
71 lines
2.5 KiB
Python
# encoding: utf-8
|
|
|
|
import csv
|
|
from datetime import datetime
|
|
import locale
|
|
import os
|
|
import re
|
|
from string import ascii_uppercase
|
|
import sys
|
|
from time import sleep
|
|
|
|
from bs4 import BeautifulSoup
|
|
import requests
|
|
from slugify import slugify
|
|
|
|
# Wiki category page listing the deputies (kept for reference; the scraping
# loop below builds per-deputy URLs directly from the page names).
url = "https://wiki.laquadrature.net/index.php?title=Cat%C3%A9gorie:D%C3%A9put%C3%A9s&pagefrom="

# Root directory for input/output data files, overridable via environment.
data_root = os.environ.get("POLITIKORAMA_DATA_ROOT", "../tmp")

# A French locale is required so strptime("%d %B %Y") below can parse French
# month names ("janvier", "février", ...). "FR" is the Windows spelling and
# raises locale.Error on Linux/macOS, so try the POSIX name first.
try:
    locale.setlocale(locale.LC_ALL, "fr_FR.UTF-8")
except locale.Error:
    locale.setlocale(locale.LC_ALL, "FR")

# Input: one wiki page name per line (e.g. "Jean_Dupont").
deputes = []
data_source = os.path.join(data_root, "liste_deputes.txt")
with open(data_source, encoding="utf-8") as file_handler:
    deputes = file_handler.read().splitlines()
|
|
|
|
# Extract representatives: scrape each deputy's wiki page and write one CSV
# row (name, picture URL, nationality, sex, birth date/place, job) per deputy.
data_target = os.path.join(data_root, "lqdn_representatives.csv")
with open(data_target, "w", encoding="utf-8", newline="") as csvfile:
    writer = csv.writer(csvfile, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(["name", "picture", "nationality", "sex", "birth_date", "birth_place", "job"])

    for depute in deputes:
        # Progress indicator: one dot per page, flushed so it shows live.
        print(".", end="")
        sys.stdout.flush()
        try:
            # Do not DDOS lqdn wiki ;o)
            sleep(.2)
            # Loading informations. requests has no default timeout, so set
            # one explicitly — otherwise a dead connection hangs the run.
            content = requests.get(
                f"https://wiki.laquadrature.net/{depute}", timeout=30
            ).text
            soup = BeautifulSoup(content, features="lxml")
            deputy = soup.find("span", attrs={"class": "mw-headline"})
            # Identity: headline looks like "Something: Full Name, ...".
            fullname = deputy.text.split(":")[1].split(",")[0].strip()
            # The wiki uses "Né le" (masculine) vs "Née le" (feminine).
            if "Né le" in content:
                sex = "M"
            else:
                sex = "F"
            birth = soup.find(text=re.compile("Née? le")).parent.parent
            birth_date = birth.contents[1].strip()
            # French month name, e.g. "3 février 1970" -> "1970-02-03"
            # (relies on the French locale configured at startup).
            birth_date = datetime.strptime(birth_date, "%d %B %Y").strftime("%Y-%m-%d")
            birth_city = birth.contents[3].strip("(").split()[0].strip()
            try:
                job = soup.find(text=re.compile("Profession")).parent.parent
                job_name = job.contents[1].split(":")[1].strip()
            except (AttributeError, IndexError):
                # Some pages have no "Profession" line; leave the job blank.
                job_name = ""
            # Picture: the portrait <img> is the one alt-tagged with the name.
            picture = soup.find("img", attrs={"alt": fullname})["src"]
            picture = f"https://wiki.laquadrature.net{picture}"
            # CSV line
            writer.writerow([
                fullname,
                picture,
                "FR",
                sex,
                birth_date,
                birth_city,
                job_name,
            ])
        except (AttributeError, ValueError, requests.RequestException):
            # AttributeError: expected markup missing on the page;
            # ValueError: unparseable birth date;
            # RequestException: network failure. Skip this deputy, keep going.
            print(f"\nError while scraping representative '{depute}'")
            continue