#!/usr/bin/python3.10
# -*- coding: utf-8 -*-
"""************************
AnkIdentification - Supplementary code 3-2 : Attributing its vernacular name(s) (VN) to each species.
Aurélien VALENTIN - from 06/11/2022 to 29/12/2022

For every numbered XML fragment ("1.xml", "2.xml", ...), each <page> whose
<title> is a key of "_dict_species_raw.json" is searched for a "{{VN ...}}"
template. The French ("fr=") and English ("en=") values found there are
written to "_dict_species_<fragment>.json".
************************"""

# Importation and initialisation
import json
import sys
import xml.etree.ElementTree as ET

# NOTE: lxml is no longer used by this script (the builder objects it created
# were never read); the imports are kept so that the file-level dependency
# list is unchanged.
import lxml.builder
import lxml.etree


# Definition of the progressbar function, adapted from
# https://gist.github.com/ChesterChowWOV/2b35c551b339adbf459363322aac5b4b
def progressbar(it, prefix="", size=60, file=sys.stdout):
    """Yield the items of *it* while drawing a textual progress bar.

    Args:
        it: Iterable to walk through. If it has no length (e.g. a generator)
            it is materialised into a list first so that the total is known.
        prefix: Text written in front of the bar.
        size: Width of the bar, in characters.
        file: Writable text stream receiving the bar (default: stdout).

    Yields:
        Every item of *it*, in order.
    """
    # The total must come from the iterable itself, not from a module global,
    # otherwise the bar shows a wrong denominator (or fails outside the script).
    try:
        count = len(it)
    except TypeError:
        it = list(it)
        count = len(it)

    def show(j):
        # An empty iterable is reported as complete instead of dividing by zero.
        ratio = j / count if count else 1.0
        x = int(size * ratio)
        file.write("{}[{}{}] {}/{} {}%\r".format(
            prefix, "█" * x, "." * (size - x), j, count, round(ratio * 100, 2)))
        file.flush()

    show(0)
    for i, item in enumerate(it):
        yield item
        show(i + 1)
    file.write("\n")
    file.flush()


def _template_param(template, key):
    """Return the text following "<key>=" in *template*, or None if absent.

    The value stops at the first "|"; surrounding "}" characters and then
    surrounding spaces are stripped. An empty value is returned as "".
    """
    pieces = template.split(key + "=")
    if len(pieces) < 2:
        return None
    return pieces[1].split("|")[0].strip("}").strip(" ")


def _extract_vernacular_names(wikitext):
    """Build the vernacular-name entry for one page, or None if there is none.

    Args:
        wikitext: Raw text of the page revision (may be None or empty).

    Returns:
        A dict with a "fr" key and, when a non-empty English name exists, an
        "en" key. When the template has no "fr=" parameter, the English name
        prefixed with "en " is stored under "fr" instead. None is returned
        when the text has no "{{VN" template or no usable name.
    """
    if not wikitext:
        return None
    chunks = wikitext.split("{{VN")
    if len(chunks) < 2:
        return None
    # Only the first VN template is considered, flattened onto one line.
    template = chunks[1].split("}}")[0].replace("\n", " ")

    french = _template_param(template, "fr")
    english = _template_param(template, "en")

    if french is not None:
        # Checking of French names (an empty "fr=" is kept as an empty string)
        names = {"fr": french}
    elif english:
        # Otherwise, checking for an English one
        names = {"fr": "en " + english}
    else:
        return None

    # Also storing of English vernacular names
    if english:
        names["en"] = english
    return names


def _page_wikitext(page):
    """Return the text of the first <text> of the first <revision>, or None."""
    revision = page.find("revision")
    if revision is None:
        return None
    text_node = revision.find("text")
    if text_node is None:
        return None
    return text_node.text


def main():
    """Extract the vernacular names of the known species from each fragment."""
    with open("_dict_species_raw.json", "r", encoding="utf-8") as fpi:
        common_names = json.load(fpi)

    # Computing of the fragments of the original file (because it is a very huge one)
    # NOTE(review): the original comment announced 7 fragments, but
    # range(1, 7) only covers fragments 1 to 6 - confirm whether "7.xml" exists.
    for file_number in range(1, 7):
        file_number = str(file_number)
        print("File number ", file_number)

        # Parsing of the xml file
        myroot = ET.parse(f"{file_number}.xml").getroot()
        fragment_names = {}

        for page in progressbar(myroot.iter("page"), "Searching plant species", 40):
            title = page.find("title")
            specie = title.text if title is not None else None
            if specie not in common_names:
                continue
            names = _extract_vernacular_names(_page_wikitext(page))
            if names is not None:
                fragment_names[specie] = names

        # Creation of a json file for each fragment
        print("Storing")
        with open(f"_dict_species_{file_number}.json", "w", encoding="utf-8") as fpo:
            json.dump(fragment_names, fpo)
        print(fragment_names)


if __name__ == "__main__":
    main()