# encoding: utf-8
"""Scrape representative profiles from the La Quadrature du Net wiki.

Reads one wiki page path per line from ``../tmp/liste_deputes.txt``,
downloads each page, extracts identity data (name, sex, birth date and
place, job, picture URL) and writes one CSV row per representative to
``../tmp/lqdn_representatives.csv``.
"""

import csv
from datetime import datetime
import locale
import re
from string import ascii_uppercase
from time import sleep
import sys

from bs4 import BeautifulSoup
import requests
from slugify import slugify

# Category listing URL; not referenced in the visible part of the script.
url = "https://wiki.laquadrature.net/index.php?title=Cat%C3%A9gorie:D%C3%A9put%C3%A9s&pagefrom="

# A French locale is needed so that ``%B`` in ``strptime`` recognises French
# month names. "FR" is tried first (the name the script originally used);
# the POSIX-style names are only fallbacks for platforms that reject "FR".
for _locale_name in ("FR", "fr_FR.UTF-8", "fr_FR"):
    try:
        locale.setlocale(locale.LC_ALL, _locale_name)
        break
    except locale.Error:
        continue
else:
    raise locale.Error("no French locale available (tried FR, fr_FR.UTF-8, fr_FR)")

# Patterns are loop-invariant, so compile them once.
BIRTH_PATTERN = re.compile("Née? le")
JOB_PATTERN = re.compile("Profession")

# Seconds to wait for the wiki before giving up on a request.
REQUEST_TIMEOUT = 30

deputes = []
with open("../tmp/liste_deputes.txt", encoding="utf-8") as file_handler:
    deputes = file_handler.read().splitlines()

# Extract representatives
with open("../tmp/lqdn_representatives.csv", "w", encoding="utf-8", newline="") as csvfile:
    writer = csv.writer(csvfile, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(["name", "picture", "nationality", "sex", "birth_date", "birth_place", "job"])

    for depute in deputes:
        # Progress indicator: one dot per processed page.
        print(".", end="")
        sys.stdout.flush()

        # Loading information
        content = requests.get(
            f"https://wiki.laquadrature.net/{depute}",
            timeout=REQUEST_TIMEOUT,
        ).text
        soup = BeautifulSoup(content, features="lxml")
        deputy = soup.find("span", attrs={"class": "mw-headline"})

        # Identity: the headline is split on ":" and then "," and the piece
        # between them is taken as the full name.
        fullname = deputy.text.split(":")[1].split(",")[0].strip()

        # "Né le" is not a substring of "Née le", so this matches only the
        # masculine form; anything else is recorded as "F".
        sex = "M" if "Né le" in content else "F"

        # The birth line is located through its "Né(e) le" label; the date
        # and place are read from fixed positions among the siblings of the
        # label's parent (assumes the wiki's page layout -- TODO confirm).
        birth = soup.find(text=BIRTH_PATTERN).parent.parent
        birth_date = birth.contents[1].strip()
        birth_date = datetime.strptime(birth_date, "%d %B %Y").strftime("%Y-%m-%d")
        birth_city = birth.contents[3].strip("(").split()[0].strip()

        # The job is optional: any failure to locate or parse it yields an
        # empty value. ``Exception`` (rather than a bare ``except``) keeps
        # Ctrl-C and ``SystemExit`` working.
        try:
            job = soup.find(text=JOB_PATTERN).parent.parent
            job_name = job.contents[1].split(":")[1].strip()
        except Exception:
            job_name = ""

        # Picture: the portrait is the <img> whose alt text is the full name;
        # its "src" is site-relative, so prefix the wiki host.
        picture = soup.find("img", attrs={"alt": fullname})["src"]
        picture = f"https://wiki.laquadrature.net{picture}"

        # CSV line
        writer.writerow([
            fullname,
            picture,
            "FR",
            sex,
            birth_date,
            birth_city,
            job_name,
        ])