# encoding: utf-8
"""Scrape deputy names from the La Quadrature du Net wiki category listing.

The category page is requested once per uppercase ASCII letter (passed as
the ``pagefrom`` query parameter).  Every anchor whose visible text equals
its ``title`` attribute, and whose text does not start with ``"Deputes"``,
is collected.  The unique names are then written, sorted, one per line, to
``liste_deputes.txt`` under ``POLITIKORAMA_DATA_ROOT`` (default ``../tmp``).
"""
import os
from string import ascii_uppercase
import sys
from time import sleep

from bs4 import BeautifulSoup
import requests

url = "https://wiki.laquadrature.net/index.php?title=Cat%C3%A9gorie:D%C3%A9put%C3%A9s&pagefrom="
data_root = os.environ.get("POLITIKORAMA_DATA_ROOT", "../tmp")

# Seconds to wait for the wiki before giving up; requests has no default
# timeout, so without this a stalled connection would hang the script forever.
REQUEST_TIMEOUT = 30

deputes = []
for letter in ascii_uppercase:
    # Do not DDOS lqdn wiki ;o)
    sleep(.2)
    response = requests.get(f"{url}{letter}", timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error instead of parsing an error page and
    # silently producing an incomplete list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, features="lxml")

    # Progress output: the letter, then one dot per name collected for it.
    print(letter, end="")
    for anchor in soup.find_all("a"):
        name = anchor.text
        # Keep anchors whose text matches their title attribute.
        # NOTE(review): presumably these are the links to individual deputy
        # pages; the "Deputes" prefix is unaccented while the category in the
        # URL is "Députés" -- confirm the exclusion matches what it should.
        if name == anchor.get("title") and not name.startswith("Deputes"):
            deputes.append(name)
            print(".", end="")
    sys.stdout.flush()
print()

# The default data root ("../tmp") may not exist yet on a fresh checkout.
os.makedirs(data_root, exist_ok=True)
output_path = os.path.join(data_root, "liste_deputes.txt")
with open(output_path, "w", encoding="utf-8") as file_handler:
    # The same name can be collected from several pages; write each once.
    for depute in sorted(set(deputes)):
        file_handler.write(f"{depute}\n")