mapping-geojson-osm/etalab_data/gynadco/scrap.py

232 lines
7.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import geopandas as gpd
from bs4 import BeautifulSoup
from shapely.geometry import Point
import re
import pandas as pd
def extraire_numero_telephone(line):
    """Return the 10-digit phone number found in *line*, or None.

    Every run of digits in the line is concatenated; the result is treated
    as a phone number only when it is exactly 10 digits long, so separators
    such as spaces or dots between digit groups are ignored.
    """
    digits = ''.join(re.findall(r'\d+', line))
    # Any other digit count (including no digits at all) is not a phone number.
    return digits if len(digits) == 10 else None
def extraire_code_postal(line):
    """Return the first standalone 5-digit number in *line*, or None.

    The digits must be delimited by word boundaries, so a 5-digit slice of a
    longer digit run (or digits glued to letters) is not reported.
    """
    found = re.search(r'\b(\d{5})\b', line)
    return found.group(1) if found else None
# Decide whether a line is an address: it must hold exactly two separate
# numbers and a comma, the second number being 5 digits long.
def extraire_addr_line(line):
    """Return *line* unchanged when it looks like an address, else None."""
    digit_runs = re.findall(r'\d+', line)
    if len(digit_runs) != 2:
        return None
    if ',' not in line:
        return None
    # The second number must be 5 digits long.
    if len(digit_runs[1]) != 5:
        return None
    return line
# Load the saved HTML listing. The encoding is explicit so the accented
# markers matched below decode identically on every platform (the JSON
# output is written as UTF-8 as well).
with open("list.html", "r", encoding="utf-8") as file:
    html = file.read()

# Parse the HTML with BeautifulSoup.
soup = BeautifulSoup(html, "html.parser")

# Marker text -> output key. When a marker appears in any text line of an
# article, the corresponding key of that doctor's record is set to 'yes'.
# NOTE(review): the original script also detected 'PUMA' and 'AME' but never
# exported them; add entries here once their output keys are decided.
YES_MARKERS = {
    'Rdv en ligne possible': "visioconference_meeting",
    'Trans friendly': "handles:gender:trans",
    'Tattoo': "handles:tatoo",
    'Sensibilité violence': "handles:violence",
    'naturelles/alternatives': "pseudo_science",
    'SPM': "handles:premenstrual_syndrome",
    'Poilfriendly': "handles:hairy",
    'Bifriendly': "handles:gender:bi",
    'Stérilisation': "handles:sterilisation",
    'DIU': "handles:diu",
    'Accompagnement grossesse': "handles:pregnancy",
    'suivi des IST': "handles:IST",
    'IVG': "handles:abortion",
    'Médecin généraliste': "healcare:generaliste",
    'Sage-femme': "healcare:sage_femme",
    'auto prélèvement': "handles:auto_prelevement",
    'Conseils mycoses': "handles:mycoses",
    'Lesbiennes friendly': "handles:gender:lesbian",
    'PMA': "handles:pma",
    'Dyspareunie': "handles:dyspareunie",
    'Gynécologue': "healcare:gynecologist",
}

# One record (dict) per doctor.
doctors = []

# Each <article> of the page describes one doctor.
for article in soup.find_all("article"):
    # The doctor's name and page URL come from the link inside the <h1>.
    heading = article.find("h1")
    link = heading.find("a") if heading is not None else None
    if link is None:
        # No titled link: nothing to identify the doctor by, skip the
        # article instead of crashing on None.
        continue

    # First guess for the address: the text of the first <em>, if any.
    em = article.find("em")

    # Fee sector, detected from the whole text of the article.
    article_text = article.text
    if "Secteur 1" in article_text:
        sector = "1"
    elif "Secteur 2" in article_text:
        sector = "2"
    else:
        sector = None

    # Record with its default values. Key order is the column order of the
    # CSV output. Each key appears once; "handles:abortion" is driven by the
    # 'IVG' marker. "accessible_cabinet", "handles:toxico",
    # "handles:endometriosis" and "handles:bigbody" have no detection rule
    # and therefore always stay empty.
    # NOTE(review): the "healcare:" prefix looks like a typo for
    # "healthcare:"; kept as-is so the output schema does not change.
    doctor = {
        "name": link.text.strip(),
        "address": em.text if em is not None else '',
        "address:code_postal": '',
        "ref:FR:convention_secteur": sector,
        "contact:phone": None,
        "contact:website": link.get("href"),
        "gender": 'unknown',
        "visioconference_meeting": 'no',
        "handles:gender:trans": '',
        "handles:violence": '',
        "handles:premenstrual_syndrome": '',
        "handles:IST": '',
        "accessible_cabinet": '',
        "pseudo_science": '',
        "speaks": '',  # spoken languages
        "handles:tatoo": '',
        "handles:toxico": '',
        "handles:sterilisation": '',
        "handles:abortion": '',
        "handles:endometriosis": '',
        "handles:hairy": '',
        "handles:bigbody": '',
        "handles:gender:bi": '',
        "handles:diu": '',  # intrauterine device
        "handles:mycoses": '',
        "handles:pregnancy": '',
        "handles:auto_prelevement": '',
        "handles:pma": '',
        "handles:dyspareunie": '',
        "handles:gender:lesbian": '',
        "healcare:generaliste": '',
        "healcare:sage_femme": '',
        "healcare:gynecologist": '',
    }

    for line in article.stripped_strings:
        # A line holding a postal code is taken as the address line.
        code_postal = extraire_code_postal(line)
        if code_postal:
            doctor["address:code_postal"] = code_postal
            doctor["address"] = line

        phone_number = extraire_numero_telephone(line)
        if phone_number:
            doctor["contact:phone"] = phone_number

        if 'femme soignante' in line:
            doctor["gender"] = 'women'
        # Fixed copy-paste bug: this branch used to set 'women' as well.
        # 'homme soignant' also matches the original 'homme soignante'.
        if 'homme soignant' in line:
            doctor["gender"] = 'men'

        for marker, key in YES_MARKERS.items():
            if marker in line:
                doctor[key] = 'yes'

        if 'français, anglais' in line:
            doctor["speaks"] = 'french;english'
        if 'espagnol' in line:
            doctor["speaks"] = 'french;english;spanish'

        # Fall back to an address-looking line when none was found so far.
        if not doctor["address"]:
            found = extraire_addr_line(line)
            if found:
                doctor["address"] = found

    doctors.append(doctor)

# Save the records as JSON.
with open("gynandco.json", "w", encoding="utf-8") as f:
    json.dump(doctors, f, ensure_ascii=False, indent=2)

# Save the same records as CSV. They are taken straight from memory rather
# than re-read from the JSON file, which was previously re-opened without
# an explicit encoding.
df = pd.DataFrame(doctors)
df.to_csv('gynandco.csv', index=False)