From 5300a08f395f17992658603dc12bf01dbb59fbea Mon Sep 17 00:00:00 2001
From: Mindiell
Date: Fri, 8 Jul 2022 13:51:54 +0200
Subject: [PATCH] wip

---
 assemblee_nationale/README.md                |  27 ++
 assemblee_nationale/load_datas.py            |  14 +-
 assemblee_nationale/schemas.txt              |   1 -
 assemblee_nationale/scrap_entities.py        |  15 +-
 assemblee_nationale/scrap_memberships.py     |  17 +-
 assemblee_nationale/scrap_representatives.py |  11 +-
 assemblee_nationale/scrap_roles.py           |  12 +-
 assemblee_nationale/scrap_types.py           |  12 +-
 lqdn/README.md                               |  18 ++
 lqdn/list_deputes.py                         |   7 +-
 lqdn/scrap_representatives.py                |  78 ++---
 lqdn/scrap_stances.py                        |  97 ++++---
 lqdn/scrap_votes.py                          |  68 +++--
 parltrack/README.md                          |  12 +
 parltrack/parltrack.py                       | 286 ------------------
 parltrack/scrap_representatives.py           |  31 ++
 16 files changed, 286 insertions(+), 420 deletions(-)
 create mode 100644 assemblee_nationale/README.md
 delete mode 100644 assemblee_nationale/schemas.txt
 create mode 100644 lqdn/README.md
 create mode 100644 parltrack/README.md
 delete mode 100644 parltrack/parltrack.py
 create mode 100644 parltrack/scrap_representatives.py

diff --git a/assemblee_nationale/README.md b/assemblee_nationale/README.md
new file mode 100644
index 0000000..f76e304
--- /dev/null
+++ b/assemblee_nationale/README.md
@@ -0,0 +1,27 @@
+# Assemblée Nationale
+
+This institution is the lower house of the French Parliament.
+
+First of all, you need to load the historical data.
+
+These tools are used to scrape:
+
+* Representatives' basic information
+  * AN code, name, nationality, sex, birth date and place, job
+* Types of entities
+  * Code and name of the different types of entities linked to the house
+* Entities
+  * Type, country, name, code, picture, start and end of each entity
+* Roles of representatives
+  * Code and name
+* Memberships
+  * Links between representatives and entities: start and end of their memberships
+
+
+## Schemas
+
+https://www.assemblee-nationale.fr/opendata/Schemas_Entites/AMO/Schemas_Organes.html
+
+
+Keep in mind that all this data can be used to do a first push into politikorama and
+that it should be subject to human proof-reading.
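+
+## Usage
+
+A typical run might look like this (the scripts read and write under `../tmp`
+by default, which can be overridden through the `POLITIKORAMA_DATA_ROOT`
+environment variable; the ordering below is only a suggestion, since every
+scraper works from the unpacked JSON dump):
+
+```
+python load_datas.py
+python scrap_representatives.py
+python scrap_types.py
+python scrap_entities.py
+python scrap_roles.py
+python scrap_memberships.py
+```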
diff --git a/assemblee_nationale/load_datas.py b/assemblee_nationale/load_datas.py
index 0e96abc..ac3e6ad 100644
--- a/assemblee_nationale/load_datas.py
+++ b/assemblee_nationale/load_datas.py
@@ -10,14 +10,18 @@ import requests
 
 url = "https://data.assemblee-nationale.fr/static/openData/repository/15/amo/tous_acteurs_mandats_organes_xi_legislature/AMO30_tous_acteurs_tous_mandats_tous_organes_historique.json.zip"
 
+data_root = os.environ.get("POLITIKORAMA_DATA_ROOT", "../tmp")
+data_source = os.path.join(data_root, "assemblee_nationale.zip")
+data_target = os.path.join(data_root, "json")
+
 # Cleaning old data
 try:
-    os.remove("../tmp/assemblee_nationale.zip")
+    os.remove(data_source)
 except FileNotFoundError:
     # No file to remove
     pass
 try:
-    shutil.rmtree("../tmp/json")
+    shutil.rmtree(data_target)
 except FileNotFoundError:
     # No folder to remove
     pass
@@ -26,10 +30,10 @@ except FileNotFoundError:
 print("Downloading archive")
 with requests.get(url, stream=True) as result:
     result.raise_for_status()
-    with open("../tmp/assemblee_nationale.zip", "wb") as f:
+    with open(data_source, "wb") as f:
         for chunk in result.iter_content(chunk_size=8192):
             f.write(chunk)
 
 print("Unpacking archive")
-shutil.unpack_archive("../tmp/assemblee_nationale.zip", "../tmp/")
+shutil.unpack_archive(data_source, data_root)
 
-os.remove("../tmp/assemblee_nationale.zip")
+os.remove(data_source)
diff --git a/assemblee_nationale/schemas.txt b/assemblee_nationale/schemas.txt
deleted file mode 100644
index ffc1775..0000000
--- a/assemblee_nationale/schemas.txt
+++ /dev/null
@@ -1 +0,0 @@
-https://www.assemblee-nationale.fr/opendata/Schemas_Entites/AMO/Schemas_Organes.html
diff --git a/assemblee_nationale/scrap_entities.py b/assemblee_nationale/scrap_entities.py
index 954f0fb..979aa94 100644
--- a/assemblee_nationale/scrap_entities.py
+++ b/assemblee_nationale/scrap_entities.py
@@ -7,16 +7,23 @@ import csv
 from datetime import datetime
 import json
 import os
+import sys
+
+data_root = os.environ.get("POLITIKORAMA_DATA_ROOT", "../tmp")
+target_root = os.path.join(data_root, "assemblee_nationale")
 
 # Extract representatives
-print("Scraping entities")
-with open("../tmp/assemblee_nationale_entities.csv", "w", encoding="utf-8", newline="") as csvfile:
+data_source = os.path.join(data_root, "json/organe")
+data_target = os.path.join(target_root, "assemblee_nationale_entities.csv")
+with open(data_target, "w", encoding="utf-8", newline="") as csvfile:
     writer = csv.writer(csvfile, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
     writer.writerow(["type_code", "country_iso2", "name", "code", "picture", "start", "end"])
-    for filename in os.listdir("../tmp/json/organe"):
+    for filename in os.listdir(data_source):
+        print(".", end="")
+        sys.stdout.flush()
         # Loading informations
-        with open(os.path.join("../tmp/json/organe", filename)) as file_handler:
+        with open(os.path.join(data_source, filename)) as file_handler:
             organe = json.load(file_handler)["organe"]
             type_raw = organe["codeType"]
             name = organe["libelle"]
diff --git a/assemblee_nationale/scrap_memberships.py b/assemblee_nationale/scrap_memberships.py
index 343e4fe..c8dc6cc 100644
--- a/assemblee_nationale/scrap_memberships.py
+++ b/assemblee_nationale/scrap_memberships.py
@@ -7,18 +7,25 @@ import csv
 from datetime import datetime
 import json
 import os
+import sys
 
 from slugify import slugify
 
+data_root = os.environ.get("POLITIKORAMA_DATA_ROOT", "../tmp")
+target_root = os.path.join(data_root, "assemblee_nationale")
+
 # Extract representatives
-print("Scraping memberships")
-with open("../tmp/assemblee_nationale_memberships.csv", "w", encoding="utf-8", newline="") as csvfile:
+data_source = os.path.join(data_root, "json/acteur")
+data_target = os.path.join(target_root, "assemblee_nationale_memberships.csv")
+with open(data_target, "w", encoding="utf-8", newline="") as csvfile:
     writer = csv.writer(csvfile, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
     writer.writerow(["representative_slug", "role_code", "entity_code", "start", "end"])
-    for filename in os.listdir("../tmp/json/acteur"):
+    for filename in os.listdir(data_source):
+        print(".", end="")
+        sys.stdout.flush()
         # Loading informations
-        with open(os.path.join("../tmp/json/acteur", filename)) as file_handler:
+        with open(os.path.join(data_source, filename)) as file_handler:
             acteur = json.load(file_handler)["acteur"]
             identity = acteur["etatCivil"]["ident"]
             representative_slug = slugify(f"{identity['prenom']} {identity['nom']}")
@@ -26,6 +33,8 @@ with open("../tmp/assemblee_nationale_memberships.csv", "w", encoding="utf-8", n
         if isinstance(mandats, dict):
             mandats = [mandats]
         for mandat in mandats:
+            print(".", end="")
+            sys.stdout.flush()
             role_code = mandat["infosQualite"].get("codeQualite", "")
             start = mandat.get("dateDebut", None)
             end = mandat.get("dateFin", None)
diff --git a/assemblee_nationale/scrap_representatives.py b/assemblee_nationale/scrap_representatives.py
index 06e401f..b55def7 100644
--- a/assemblee_nationale/scrap_representatives.py
+++ b/assemblee_nationale/scrap_representatives.py
@@ -8,16 +8,21 @@ import json
 import os
 import sys
 
+data_root = os.environ.get("POLITIKORAMA_DATA_ROOT", "../tmp")
+target_root = os.path.join(data_root, "assemblee_nationale")
+
 # Extract representatives
-with open("../tmp/assemblee_nationale_representatives.csv", "w", encoding="utf-8", newline="") as csvfile:
+data_source = os.path.join(data_root, "json/acteur")
+data_target = os.path.join(target_root, "assemblee_nationale_representatives.csv")
+with open(data_target, "w", encoding="utf-8", newline="") as csvfile:
     writer = csv.writer(csvfile, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
     writer.writerow(["code", "name", "picture", "nationality", "sex", "birth_date", "birth_place", "job"])
-    for filename in os.listdir("../tmp/json/acteur"):
+    for filename in os.listdir(data_source):
         print(".", end="")
         sys.stdout.flush()
         # Loading informations
-        with open(os.path.join("../tmp/json/acteur", filename)) as file_handler:
+        with open(os.path.join(data_source, filename)) as file_handler:
             acteur = json.load(file_handler)["acteur"]
             uid = f"AN_{acteur['uid']['#text'][2:]}"
             # Identity
diff --git a/assemblee_nationale/scrap_roles.py b/assemblee_nationale/scrap_roles.py
index 782ccce..19f2198 100644
--- a/assemblee_nationale/scrap_roles.py
+++ b/assemblee_nationale/scrap_roles.py
@@ -8,18 +8,22 @@ import json
 import os
 import sys
 
+data_root = os.environ.get("POLITIKORAMA_DATA_ROOT", "../tmp")
+target_root = os.path.join(data_root, "assemblee_nationale")
+
 # Extract roles
-print("Scraping roles")
-with open("../tmp/assemblee_nationale_roles.csv", "w", encoding="utf-8", newline="") as csvfile:
+data_source = os.path.join(data_root, "json/acteur")
+data_target = os.path.join(target_root, "assemblee_nationale_roles.csv")
+with open(data_target, "w", encoding="utf-8", newline="") as csvfile:
     writer = csv.writer(csvfile, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
     writer.writerow(["code", "name"])
     roles = []
-    for filename in os.listdir("../tmp/json/acteur"):
+    for filename in os.listdir(data_source):
print(".", end="") sys.stdout.flush() # Loading informations - with open(os.path.join("../tmp/json/acteur", filename)) as file_handler: + with open(os.path.join(data_source, filename)) as file_handler: acteur = json.load(file_handler)["acteur"] mandats = acteur["mandats"]["mandat"] if isinstance(mandats, dict): diff --git a/assemblee_nationale/scrap_types.py b/assemblee_nationale/scrap_types.py index 5f98744..c8c8516 100644 --- a/assemblee_nationale/scrap_types.py +++ b/assemblee_nationale/scrap_types.py @@ -43,18 +43,22 @@ TYPES = { } +data_root = os.environ.get("POLITIKORAMA_DATA_ROOT", "../tmp") +target_root = os.path.join(data_root, "assemblee_nationale") + # Extract types -print("Scraping types") -with open("../tmp/assemblee_nationale_types.csv", "w", encoding="utf-8", newline="") as csvfile: +data_source = os.path.join(data_root, "json/organe") +data_target = os.path.join(target_root, "assemblee_nationale_types.csv") +with open(data_target, "w", encoding="utf-8", newline="") as csvfile: writer = csv.writer(csvfile, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL) writer.writerow(["code", "name"]) types = {} - for filename in os.listdir("../tmp/json/organe"): + for filename in os.listdir(data_source): print(".", end="") sys.stdout.flush() # Loading informations - with open(os.path.join("../tmp/json/organe", filename)) as file_handler: + with open(os.path.join(data_source, filename)) as file_handler: organe = json.load(file_handler)["organe"] if organe["codeType"].upper() not in types: types[organe["codeType"].upper()] = TYPES.get(organe["codeType"].upper(), organe["codeType"].upper()) diff --git a/lqdn/README.md b/lqdn/README.md new file mode 100644 index 0000000..6a3d24e --- /dev/null +++ b/lqdn/README.md @@ -0,0 +1,18 @@ +# La Quadrature du Net + +This organisation has collected a lot of important data about french representatives for +years in their wiki (mediawiki engine). + +Those tools are used to scrap : + +* Representatives basic informations + * Name, Nationality, Sex, Birth date and place, job +* Stances by Representatives + * Trying to find a matter and a subject for each stance, with a date and a source url +* Votes by Representatives + * Trying to find a matter and a subject for each vote, with a date and a source url + No result is specified + + +Keep in mind that all those datas can be used to do a first push into politikorama and +that they should be subject to human proof-reading. 
diff --git a/lqdn/list_deputes.py b/lqdn/list_deputes.py
index 8950a3a..ef4563e 100644
--- a/lqdn/list_deputes.py
+++ b/lqdn/list_deputes.py
@@ -1,13 +1,15 @@
 # encoding: utf-8
 
+import os
 from string import ascii_uppercase
-from time import sleep
 import sys
+from time import sleep
 
 from bs4 import BeautifulSoup
 import requests
 
 url = "https://wiki.laquadrature.net/index.php?title=Cat%C3%A9gorie:D%C3%A9put%C3%A9s&pagefrom="
+data_root = os.environ.get("POLITIKORAMA_DATA_ROOT", "../tmp")
 
 deputes = []
 for letter in ascii_uppercase:
@@ -25,6 +27,7 @@ for letter in ascii_uppercase:
         sys.stdout.flush()
 print()
 
-with open("../tmp/liste_deputes.txt", "w", encoding="utf-8") as file_handler:
+data_target = os.path.join(data_root, "liste_deputes.txt")
+with open(data_target, "w", encoding="utf-8") as file_handler:
     for depute in sorted(list(set(deputes))):
         file_handler.write(f"{depute}\n")
diff --git a/lqdn/scrap_representatives.py b/lqdn/scrap_representatives.py
index 6265c28..b442872 100644
--- a/lqdn/scrap_representatives.py
+++ b/lqdn/scrap_representatives.py
@@ -3,59 +3,69 @@ import csv
 from datetime import datetime
 import locale
+import os
 import re
 from string import ascii_uppercase
-from time import sleep
 import sys
+from time import sleep
 
 from bs4 import BeautifulSoup
 import requests
 from slugify import slugify
 
 url = "https://wiki.laquadrature.net/index.php?title=Cat%C3%A9gorie:D%C3%A9put%C3%A9s&pagefrom="
+data_root = os.environ.get("POLITIKORAMA_DATA_ROOT", "../tmp")
 
 locale.setlocale(locale.LC_ALL, "FR")
 
 deputes = []
-with open("../tmp/liste_deputes.txt", encoding="utf-8") as file_handler:
+data_source = os.path.join(data_root, "liste_deputes.txt")
+with open(data_source, encoding="utf-8") as file_handler:
     deputes = file_handler.read().splitlines()
 
 # Extract representatives
-with open("../tmp/lqdn_representatives.csv", "w", encoding="utf-8", newline="") as csvfile:
+data_target = os.path.join(data_root, "lqdn_representatives.csv")
+with open(data_target, "w", encoding="utf-8", newline="") as csvfile:
     writer = csv.writer(csvfile, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
     writer.writerow(["name", "picture", "nationality", "sex", "birth_date", "birth_place", "job"])
     for depute in deputes:
         print(".", end="")
         sys.stdout.flush()
-        # Loading informations
-        content = requests.get(f"https://wiki.laquadrature.net/{depute}").text
-        soup = BeautifulSoup(content, features="lxml")
-        deputy = soup.find("span", attrs={"class": "mw-headline"})
-        # Identity
-        fullname = deputy.text.split(":")[1].split(",")[0].strip()
-        if "Né le" in content:
-            sex = "M"
-        else:
-            sex = "F"
-        birth = soup.find(text=re.compile("Née? le")).parent.parent
-        birth_date = birth.contents[1].strip()
-        birth_date = datetime.strptime(birth_date, "%d %B %Y").strftime("%Y-%m-%d")
-        birth_city = birth.contents[3].strip("(").split()[0].strip()
-        try:
-            job = soup.find(text=re.compile("Profession")).parent.parent
-            job_name = job.contents[1].split(":")[1].strip()
-        except:
-            job_name = ""
-        # Picture
-        picture = soup.find("img", attrs={"alt": fullname})["src"]
-        picture = f"https://wiki.laquadrature.net{picture}"
-        # CSV line
-        writer.writerow([
-            fullname,
-            picture,
-            "FR",
-            sex,
-            birth_date,
-            birth_city,
-            job_name,
-        ])
+        try:
+            # Do not DDOS lqdn wiki ;o)
+            sleep(.2)
+            # Loading information
+            content = requests.get(f"https://wiki.laquadrature.net/{depute}").text
+            soup = BeautifulSoup(content, features="lxml")
+            deputy = soup.find("span", attrs={"class": "mw-headline"})
+            # Identity
+            fullname = deputy.text.split(":")[1].split(",")[0].strip()
+            if "Né le" in content:
+                sex = "M"
+            else:
+                sex = "F"
+            birth = soup.find(text=re.compile("Née? le")).parent.parent
+            birth_date = birth.contents[1].strip()
+            birth_date = datetime.strptime(birth_date, "%d %B %Y").strftime("%Y-%m-%d")
+            birth_city = birth.contents[3].strip("(").split()[0].strip()
+            try:
+                job = soup.find(text=re.compile("Profession")).parent.parent
+                job_name = job.contents[1].split(":")[1].strip()
+            except (AttributeError, IndexError):
+                job_name = ""
+            # Picture
+            picture = soup.find("img", attrs={"alt": fullname})["src"]
+            picture = f"https://wiki.laquadrature.net{picture}"
+            # CSV line
+            writer.writerow([
+                fullname,
+                picture,
+                "FR",
+                sex,
+                birth_date,
+                birth_city,
+                job_name,
+            ])
+        except AttributeError:
+            print(f"\nError while scraping representative '{depute}'")
+            continue
\ No newline at end of file
diff --git a/lqdn/scrap_stances.py b/lqdn/scrap_stances.py
index 2c71ffc..45f152a 100644
--- a/lqdn/scrap_stances.py
+++ b/lqdn/scrap_stances.py
@@ -2,61 +2,70 @@
 
 from datetime import datetime
 from string import ascii_uppercase
-from time import sleep
+import os
 import sys
+from time import sleep
 
 from bs4 import BeautifulSoup
 import csv
 import requests
 from slugify import slugify
 
-with open("../tmp/liste_deputes.txt") as file_handler:
+data_root = os.environ.get("POLITIKORAMA_DATA_ROOT", "../tmp")
+
+data_source = os.path.join(data_root, "liste_deputes.txt")
+with open(data_source) as file_handler:
     deputes = file_handler.read().splitlines()
 
-with open("../tmp/lqdn_stances.csv", "w", encoding="utf-8", newline="") as csvfile:
+data_target = os.path.join(data_root, "lqdn_stances.csv")
+with open(data_target, "w", encoding="utf-8", newline="") as csvfile:
     writer = csv.writer(csvfile, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
     writer.writerow(["name", "slug", "matter", "subject", "date", "extract", "source_url"])
     for depute in deputes:
-        # Do not DDOS lqdn wiki ;o)
-        sleep(.2)
-        content = requests.get(f"https://wiki.laquadrature.net/{depute}").text
-        soup = BeautifulSoup(content, features="lxml")
-        deputy = soup.find("span", attrs={"class": "mw-headline"})
-        if deputy is not None:
-            stance_author = deputy.text.split(",")[0].split(":")[1].strip()
-        else:
-            stance_author = depute
-        quotes = soup.find_all("h5")
-        for quote in quotes:
-            try:
-                stance_date = datetime.strptime(quote.text.split()[0], "%d/%m/%Y")
-                stance_subject = " ".join(quote.text.split()[1:])
-            except:
-                stance_date = None
-                stance_subject = quote.text
-            # TODO: Set the matter accordingly to the subject
-            stance_matter = stance_subject.split(":")[0]
-            if quote.find("a"):
-                stance_link = quote.find("a").get("href")
-            else:
-                stance_link = None
-            # quote
-            quotes = []
-            block = quote.find_next_sibling()
-            if block is not None:
-                while block is not None and block.name == "blockquote":
-                    quotes.append(block.text)
-                    block = block.find_next_sibling()
-            stance_quote = "\n".join(quotes)
-            writer.writerow([
-                stance_author,
-                slugify(stance_author),
-                stance_matter,
-                stance_subject,
-                datetime.strftime(stance_date, "%Y-%m-%d") if stance_date is not None else None,
-                stance_quote,
-                stance_link,
-            ])
-        print(".", end="")
-        sys.stdout.flush()
+        print(".", end="")
+        sys.stdout.flush()
+        try:
+            # Do not DDOS lqdn wiki ;o)
+            sleep(.2)
+            content = requests.get(f"https://wiki.laquadrature.net/{depute}").text
+            soup = BeautifulSoup(content, features="lxml")
+            deputy = soup.find("span", attrs={"class": "mw-headline"})
+            if deputy is not None:
+                stance_author = deputy.text.split(",")[0].split(":")[1].strip()
+            else:
+                stance_author = depute
+            quotes = soup.find_all("h5")
+            for quote in quotes:
+                try:
+                    stance_date = datetime.strptime(quote.text.split()[0], "%d/%m/%Y")
+                    stance_subject = " ".join(quote.text.split()[1:])
+                except (IndexError, ValueError):
+                    stance_date = None
+                    stance_subject = quote.text
+                # TODO: Set the matter according to the subject
+                stance_matter = stance_subject.split(":")[0]
+                if quote.find("a"):
+                    stance_link = quote.find("a").get("href")
+                else:
+                    stance_link = None
+                # Gather the blockquote(s) that follow the heading
+                blockquotes = []
+                block = quote.find_next_sibling()
+                while block is not None and block.name == "blockquote":
+                    blockquotes.append(block.text)
+                    block = block.find_next_sibling()
+                stance_quote = "\n".join(blockquotes)
+                writer.writerow([
+                    stance_author,
+                    slugify(stance_author),
+                    stance_matter,
+                    stance_subject,
+                    datetime.strftime(stance_date, "%Y-%m-%d") if stance_date is not None else None,
+                    stance_quote,
+                    stance_link,
+                ])
+        except AttributeError:
+            print(f"\nError while scraping stances for representative '{depute}'")
+            continue
\ No newline at end of file
diff --git a/lqdn/scrap_votes.py b/lqdn/scrap_votes.py
index 40e0ecd..ea6ba7b 100644
--- a/lqdn/scrap_votes.py
+++ b/lqdn/scrap_votes.py
@@ -2,45 +2,55 @@
 
 from datetime import datetime
 from string import ascii_uppercase
-from time import sleep
+import os
 import sys
+from time import sleep
 
 from bs4 import BeautifulSoup
 import csv
 import requests
 from slugify import slugify
 
-with open("lqdn_representatives.txt") as file_handler:
+data_root = os.environ.get("POLITIKORAMA_DATA_ROOT", "../tmp")
+
+data_source = os.path.join(data_root, "liste_deputes.txt")
+with open(data_source) as file_handler:
     deputes = file_handler.read().splitlines()
 
-with open("lqdn_votes.csv", "w", encoding="utf-8", newline="") as csvfile:
+data_target = os.path.join(data_root, "lqdn_votes.csv")
+with open(data_target, "w", encoding="utf-8", newline="") as csvfile:
     writer = csv.writer(csvfile, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
-    writer.writerow(["name", "slug", "matter", "subject", "date", "extract", "source_url"])
+    writer.writerow(["name", "slug", "matter", "date", "source_url"])
     for depute in deputes:
-        # Do not DDOS lqdn wiki ;o)
-        sleep(.2)
-        content = requests.get(f"https://wiki.laquadrature.net/{depute}").text
-        soup = BeautifulSoup(content, features="lxml")
-        deputy = soup.find("span", attrs={"class": "mw-headline"})
-        if deputy is not None:
-            stance_author = deputy.text.split(",")[0].split(":")[1].strip()
-        else:
-            stance_author = depute
-        print(stance_author)
-        votes = soup.find("span", attrs={"id": "Votes"}).parent.find_next_sibling("ul")
-        if votes is not None:
-            for vote in votes.find_all("li"):
-                pass##print(f"    {vote}")
-        writer.writerow([
-            stance_author,
-            slugify(stance_author),
-            stance_matter,
-            stance_subject,
-            stance_date,
-            stance_quote,
-            stance_link,
-        ])
-        print(".", end="")
-        sys.stdout.flush()
+        print(".", end="")
+        sys.stdout.flush()
+        try:
+            # Do not DDOS lqdn wiki ;o)
+            sleep(.2)
+            content = requests.get(f"https://wiki.laquadrature.net/{depute}").text
+            soup = BeautifulSoup(content, features="lxml")
+            deputy = soup.find("span", attrs={"class": "mw-headline"})
+            if deputy is not None:
+                vote_author = deputy.text.split(",")[0].split(":")[1].strip()
+            else:
+                vote_author = depute
+            votes = soup.find("span", attrs={"id": "Votes"}).parent.find_next_sibling("ul")
+            if votes is not None:
+                for vote in votes.find_all("li"):
+                    print(".", end="")
+                    sys.stdout.flush()
+                    vote_date = datetime.strptime(vote.text.split()[0], "%d/%m/%Y")
+                    vote_matter = vote.find("a").text
+                    vote_url = vote.find("a").get("href")
+                    writer.writerow([
+                        vote_author,
+                        slugify(vote_author),
+                        vote_matter,
+                        datetime.strftime(vote_date, "%Y-%m-%d"),
+                        vote_url,
+                    ])
+        except (AttributeError, ValueError):
+            print(f"\nError while scraping votes for representative '{depute}'")
+            continue
 print()
diff --git a/parltrack/README.md b/parltrack/README.md
new file mode 100644
index 0000000..22246f2
--- /dev/null
+++ b/parltrack/README.md
@@ -0,0 +1,12 @@
+# Parltrack
+
+This organisation collects all the data from the European Parliament.
+
+These tools are used to scrape:
+
+* Representatives' basic information
+  * EP code, name, nationality, sex, birth date and place, job
+
+
+Keep in mind that all this data can be used to do a first push into politikorama and
+that it should be subject to human proof-reading.
diff --git a/parltrack/parltrack.py b/parltrack/parltrack.py
deleted file mode 100644
index bf9c01f..0000000
--- a/parltrack/parltrack.py
+++ /dev/null
@@ -1,286 +0,0 @@
-# encoding: utf-8
-
-from datetime import datetime
-from io import StringIO, BytesIO
-import json
-import os
-import tempfile
-import zipfile
-
-import click
-import requests
-
-from app.model.address import AddressModel
-from app.model.contact import ContactModel
-from app.model.entity import EntityModel
-from app.model.membership import MembershipModel
-from app.model.representative import RepresentativeModel
-from app.model.type import TypeModel
-from command.json_reader import json_reader
-
-import json
-
-
-def import_representatives(filepath):
-    click.echo("Importing representatives from parltrack")
-    click.echo("  Reading file")
-    with open(filepath) as f:
-        meps = json.load(f)
-    for representative in meps:
-        click.echo(".", nl=False)
-
-    #for representative in json_reader(filepath):
-    #    click.echo(".", nl=False)
-
-
-
-
-def toto():
-    # Delete only things related to "Assemblée Nationale" !
-    MembershipModel.query.delete() #filter_by(source="Assemblée Nationale").delete()
-    RepresentativeModel.query.filter_by(source="Assemblée Nationale").delete()
-    AddressModel.query.filter_by(source="Assemblée Nationale").delete()
-    ContactModel.query.filter_by(source="Assemblée Nationale").delete()
-    EntityModel.query.filter_by(source="Assemblée Nationale").delete()
-    TypeModel.query.filter_by(source="Assemblée Nationale").delete()
-
-    url = "https://data.assemblee-nationale.fr/static/openData/repository/15/amo/tous_acteurs_mandats_organes_xi_legislature/AMO30_tous_acteurs_tous_mandats_tous_organes_historique.json.zip"
-    if False:
-        datas = BytesIO()
-        result = requests.get(url, stream=True)
-        datas.write(result.content)
-        datas.seek(0)
-        with tempfile.TemporaryDirectory() as tmpdir:
-            with zipfile.ZipFile(datas, "r") as zip_ref:
-                zip_ref.extractall(tmpdir)
-            print(tmpdir)
-            for root, dirs, files in os.walk(tmpdir):
-                if root.endswith("acteur"):
-                    for filename in files:
-                        print(os.path.join(root, filename))
-                    for filename in files[:1]:
-                        with open(os.path.join(root, filename)) as filehandler:
-                            data = json.load(filehandler)
-                            print(json.dumps(data, indent=2))
-    # Testing
-    tmpdir = "C:\\Users\\tbouchet\\Downloads\\json"
-    click.echo("  ", nl=False)
-    for root, dirs, files in os.walk(tmpdir):
-        if root.endswith("organe"):
-            with click.progressbar(files, label="Entities") as progress_files:
-                entities = []
-                for filename in progress_files:
-                    #print(filename)
-                    with open(os.path.join(root, filename)) as filehandler:
-                        data = json.load(filehandler)["organe"]
-
-                        # Type
-                        # file:///C:/Users/tbouchet/Downloads/html/Schemas_Entites/AMO/Schemas_Organes.html
-                        type_types = {
-                            "API": "Assemblée parlementaire internationale",
-                            "ASSEMBLEE": "Assemblée nationale",
-                            "ASSEXT": "Autres conseils",
-                            "ASSTOM": "Assemblée territoriale d’Outre-Mer",
-                            "CES": "Conseil économique, social et environnemental",
-                            "CJR": "Cour de justice de la République",
-                            "CMP": "Commissions mixtes paritaires",
-                            "CNPE": "Commissions d’enquêtes",
-                            "CNPS": "Commissions spéciales",
-                            "COMMUNE": "Conseil Municipal",
-                            "COMNL": "Autres commissions permanentes",
-                            "COMPER": "Commissions permanentes législatives",
-                            "COMSENAT": "Commissions du Sénat",
-                            "COMSPSENAT": "Commissions spéciales du Sénat",
-                            "CONFPT": "CONFPT",
-                            "CONSTITU": "Conseil constitutionnel",
-                            "DELEG": "Délégation parlementaire",
-                            "DELEGBUREAU": "Délégation du Bureau (de l’AN)",
-                            "DELEGSENAT": "Délégation du Sénat",
-                            "DEPARTEMENT": "Conseil général ou départemental",
-                            "EUROPE": "Mandat européen",
-                            "GA": "Groupe d’amitié",
-                            "GE": "Groupe d’études",
-                            "GEVI": "Groupe d’études à vocation internationale",
-                            "GOUVERNEMENT": "Gouvernement",
-                            "GP": "Groupe politique",
-                            "GROUPESENAT": "Groupe Sénat",
-                            "HCJ": "Haute Cour de justice",
-                            "INTCO": "Intercommunalité",
-                            "MINISTERE": "Ministère",
-                            "MISINFO": "Missions d’informations",
-                            "MISINFOCOM": "Missions d’information communes",
-                            "MISINFOPRE": "Missions d’information de la conférence des Présidents",
-                            "OFFPAR": "Office parlementaire ou délégation mixte",
-                            "ORGAINT": "Organisme international",
-                            "ORGEXTPARL": "Organisme extra parlementaire",
-                            "PARPOL": "Parti Politique",
-                            "PRESREP": "Présidence de la République",
-                            "REGION": "Conseil régional",
-                            "SENAT": "Mandat de sénateur",
-                        }
-                        type = TypeModel.query.filter_by(name = type_types[data["codeType"]]).first()
-                        if type is None:
-                            type = TypeModel()
-                            type.source = "Assemblée Nationale"
-                            type.source_uid = data["codeType"]
-                            type.name = type_types[data["codeType"]]
-                            type.save()
-
-                        # Entity
-                        entity = EntityModel(
-                            source = "Assemblée Nationale",
-                            source_uid = data["uid"],
-                            type_id = type.id,
-                            name = data["libelle"],
-                            code = data["libelleAbrev"],
-                            country_id = country.id,
-                        )
-                        if data["organeParent"] is not None:
-                            parent = EntityModel.query.filter_by(source_uid=data["organeParent"]).first()
-                            if parent is not None:
-                                entity.parent_id = parent.id
-                            else:
-                                print(data["uid"], data["organeParent"])
-                        entity.save()
-
-    for root, dirs, files in os.walk(tmpdir):
-        if root.endswith("acteur"):
-            with click.progressbar(files, label="Representatives") as progress_files:
-                for filename in progress_files:
-                    with open(os.path.join(root, filename)) as filehandler:
-                        data = json.load(filehandler)["acteur"]
-
-                        # Representative
-                        representative = RepresentativeModel()
-                        representative.source = "Assemblée Nationale"
-                        representative.source_uid = data["uid"]["#text"]
-                        nom = data["etatCivil"]["ident"]["nom"]
-                        prenom = data["etatCivil"]["ident"]["prenom"]
-                        representative.name = f"{prenom} {nom}"
-                        representative.nationality_id = country.id
-                        representative.birth_date = datetime.strptime(
-                            data["etatCivil"]["infoNaissance"]["dateNais"], "%Y-%m-%d"
-                        )
-                        if isinstance(data["etatCivil"]["infoNaissance"]["villeNais"], str):
-                            representative.birth_place = data["etatCivil"]["infoNaissance"][
-                                "villeNais"
-                            ]
-                        if isinstance(data["profession"]["libelleCourant"], str):
-                            representative.profession = data["profession"]["libelleCourant"]
-                        representative.save()
-
-                        # Address
-                        if data["adresses"].get("adresse", "") != "":
-                            address_types = {
-                                "0": "Parliament address",
-                                "1": "Address",
-                                "2": "Constituency address",
-                            }
-
-                            def manage_address(data_address):
-                                if data_address["type"] in address_types:
-                                    address = AddressModel()
-                                    address.representative_id = representative.id
-                                    address.source = "Assemblée Nationale"
-                                    address.source_uid = data_address["uid"]
-                                    address.name = address_types[data_address["type"]]
-                                    address.country_id = country.id
-                                    address.number = data_address["numeroRue"]
-                                    address.street = data_address["nomRue"]
-                                    address.miscellaneous = data_address[
-                                        "complementAdresse"
-                                    ]
-                                    address.city = data_address["ville"]
-                                    address.zipcode = data_address["codePostal"]
-                                    address.save()
-
-                            if isinstance(data["adresses"]["adresse"], list):
-                                for data_address in data["adresses"]["adresse"]:
-                                    manage_address(data_address)
-                            elif isinstance(data["adresses"]["adresse"], dict):
-                                manage_address(data["adresses"]["adresse"])
-
-                        # Contact
-                        contact_types = {
-                            "3": "Phone (Press contact)",
-                            "11": "Phone",
-                            "12": "Fax",
-                            "15": "Email",
-                            "22": "Website",
-                            "23": "Senate URL",
-                            "24": "Twitter",
-                            "25": "Facebook",
-                        }
-
-                        def manage_contact(data_contact):
-                            if data_contact["type"] in contact_types:
-                                contact = ContactModel()
-                                contact.representative_id = representative.id
-                                contact.source = "Assemblée Nationale"
-                                contact.source_uid = data_contact["uid"]
-                                if data_contact["adresseDeRattachement"] is not None:
-                                    address = AddressModel.query.filter_by(
-                                        source_uid=data_contact["adresseDeRattachement"]
-                                    ).first()
-                                    if address is not None:
-                                        contact.address_id = address.id
-                                contact.name = contact_types[data_contact["type"]]
-                                contact.value = data_contact["valElec"]
-                                contact.save()
-
-                        if isinstance(data["adresses"]["adresse"], list):
-                            for data_contact in data["adresses"]["adresse"]:
-                                manage_contact(data_contact)
-                        elif isinstance(data["adresses"]["adresse"], dict):
-                            manage_contact(data["adresses"]["adresse"])
-
-                        # Unknown addresses ?
-                        if isinstance(data["adresses"]["adresse"], list):
-                            for data_address in data["adresses"]["adresse"]:
-                                if data_address["type"] not in dict(
-                                    address_types, **contact_types
-                                ):
-                                    print(
-                                        f" => Unkown address type : {data_address['type']} in file {filename} : {data_address['typeLibelle']}"
-                                    )
-                        elif isinstance(data["adresses"]["adresse"], dict):
-                            data_address = data["adresses"]["adresse"]
-                            if data_address["type"] not in dict(
-                                address_types, **contact_types
-                            ):
-                                print(
-                                    f" => Unkown address type : {data_address['type']} in file {filename} : {data_address['typeLibelle']}"
-                                )
-
-                        if data["mandats"].get("mandat", "") != "":
-                            # Membership
-                            membership_types = {
-                                "Membre": "Member",
-                            }
-
-                            def manage_membership(data_membership):
-                                if data_membership["infosQualite"]["codeQualite"] in membership_types:
-                                    entity = EntityModel.query.filter_by(source_uid=data_membership["organes"]["organeRef"]).first()
-                                    if entity is None:
-                                        print("Organe inconnu", data_membership["organes"]["organeRef"])
-                                        return
-                                    membership = MembershipModel()
-                                    membership.representative_id = representative.id
-                                    membership.role = membership_types[data_membership["infosQualite"]["codeQualite"]]
-                                    membership.country_id = country.id
-                                    if data_membership["dateDebut"] is not None:
-                                        membership.start = datetime.strptime(
-                                            data_membership["dateDebut"], "%Y-%m-%d"
-                                        )
-                                    if data_membership["dateFin"] is not None:
-                                        membership.end = datetime.strptime(
-                                            data_membership["dateFin"], "%Y-%m-%d"
-                                        )
-                                    membership.entity_id = entity.id
-                                    membership.save()
-
-                            if isinstance(data["mandats"]["mandat"], list):
-                                for data_membership in data["mandats"]["mandat"]:
-                                    manage_membership(data_membership)
-                            elif isinstance(data["mandats"]["mandat"], dict):
-                                manage_membership(data["mandats"]["mandat"])
diff --git a/parltrack/scrap_representatives.py b/parltrack/scrap_representatives.py
new file mode 100644
index 0000000..e8f16cc
--- /dev/null
+++ b/parltrack/scrap_representatives.py
@@ -0,0 +1,31 @@
+# encoding: utf-8
+
+import os
+import shutil
+
+import lzip
+
+url = "https://parltrack.org/dumps/ep_meps.json.lz"
+data_root = os.environ.get("POLITIKORAMA_DATA_ROOT", "../tmp")
+# lzip.decompress_url_iter() yields decompressed chunks, so the file written
+# below is the plain JSON dump, not the .lz archive
+data_source = os.path.join(data_root, "ep_meps.json")
+data_target = os.path.join(data_root, "json")
+
+# Cleaning old data
+try:
+    os.remove(data_source)
+except FileNotFoundError:
+    # No file to remove
+    pass
+try:
+    shutil.rmtree(data_target)
+except FileNotFoundError:
+    # No folder to remove
+    pass
+
+# Download and decompress data
+print("Downloading archive")
+with open(data_source, "wb") as f:
+    for chunk in lzip.decompress_url_iter(url):
+        f.write(chunk)
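+
+# A sketch of the extraction step that could follow, mirroring the CSV layout
+# of the other scrapers. The dump is assumed to be a JSON list of MEP records
+# with "UserID" and "Name"/"full" fields; these names are assumptions to be
+# checked against the actual Parltrack dump. (Imports would normally sit at
+# the top of the file.)
+import csv
+import json
+
+with open(data_source, encoding="utf-8") as f:
+    meps = json.load(f)
+
+data_csv = os.path.join(data_root, "parltrack_representatives.csv")
+with open(data_csv, "w", encoding="utf-8", newline="") as csvfile:
+    writer = csv.writer(csvfile, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
+    writer.writerow(["code", "name"])
+    for mep in meps:
+        writer.writerow([f"EP_{mep['UserID']}", mep["Name"]["full"]])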