2024-10-21 23:58:55 +02:00
|
|
|
|
import json
|
|
|
|
|
import geopandas as gpd
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
from shapely.geometry import Point
|
|
|
|
|
import re
|
2024-10-22 00:26:28 +02:00
|
|
|
|
import pandas as pd
|
2024-10-21 23:58:55 +02:00
|
|
|
|
|
|
|
|
|
def extraire_numero_telephone(line):
    """Return the phone number contained in ``line``, or ``None``.

    All runs of digits found in the line are concatenated, ignoring any
    separators (spaces, dots, dashes, ...). The result is considered a phone
    number only when it is exactly ten digits long.
    """
    digits = ''.join(re.findall(r'\d+', line))
    return digits if len(digits) == 10 else None
|
|
|
|
|
|
2024-10-22 00:19:28 +02:00
|
|
|
|
def extraire_code_postal(line):
    """Return the first standalone five-digit number in ``line``, or ``None``.

    "Standalone" means delimited by word boundaries, so five digits embedded
    in a longer digit run (such as a phone number) are not matched.
    """
    found = re.search(r'\b(\d{5})\b', line)
    if found is None:
        return None
    return found.group(1)
|
2024-10-21 23:58:55 +02:00
|
|
|
|
def extraire_addr_line(line):
    """Return ``line`` when it looks like a postal address, else ``None``.

    A line is treated as an address when it contains a comma and exactly two
    runs of digits, the second of which is five digits long (street number
    followed by a postal code).
    """
    digit_runs = re.findall(r'\d+', line)
    if len(digit_runs) != 2:
        return None
    if ',' not in line or len(digit_runs[1]) != 5:
        return None
    return line
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Load the saved HTML listing page.
# The encoding is given explicitly so that accented French text is decoded
# the same way on every platform instead of depending on the locale default.
with open("list.html", "r", encoding="utf-8") as file:
    html = file.read()

# Parse the HTML with BeautifulSoup.
soup = BeautifulSoup(html, "html.parser")

# Accumulates one dict per practitioner found on the page.
doctors = []

# Each practitioner is described by one <article> element.
articles = soup.find_all("article")
|
|
|
|
|
# Text markers looked up in every text fragment of an article, paired with the
# output field that is set to 'yes' when the marker is present.
# NOTE(review): PUMA / AME coverage is not exported because the output has no
# field for it; add a (marker, field) pair here and a default in the dict
# below if it should be.
YES_MARKERS = (
    ('Trans friendly', "handles:gender:trans"),
    ('Tattoo', "handles:tatoo"),
    ('Sensibilité violence', "handles:violence"),
    ('naturelles/alternatives', "pseudo_science"),
    ('SPM', "handles:premenstrual_syndrome"),
    ('Poilfriendly', "handles:hairy"),
    ('Bifriendly', "handles:gender:bi"),
    ('Stérilisation', "handles:sterilisation"),
    ('DIU', "handles:diu"),
    ('Accompagnement grossesse', "handles:pregnancy"),
    ('suivi des IST', "handles:IST"),
    ('IVG', "handles:abortion"),
    ('Médecin généraliste', "healcare:generaliste"),
    ('Sage-femme', "healcare:sage_femme"),
    ('auto prélèvement', "handles:auto_prelevement"),
    ('Conseils mycoses', "handles:mycoses"),
    ('Lesbiennes friendly', "handles:gender:lesbian"),
    ('PMA', "handles:pma"),
    ('Dyspareunie', "handles:dyspareunie"),
    ('Gynécologue', "healcare:gynecologist"),
)

# Walk every <article> and extract one practitioner record from it.
for article in articles:
    # The practitioner's name and page URL come from the link in the <h1>.
    title_link = article.find("h1").find('a')
    name = title_link.text.strip()
    url = title_link.get("href")

    # Initial address guess: the text of the first <em> in the article, if
    # any. It is replaced below by any line that contains a postal code.
    em = article.find("em")

    # Fee sector ("Secteur 1" / "Secteur 2") as mentioned anywhere in the text.
    article_text = article.text
    if "Secteur 1" in article_text:
        sector = "1"
    elif "Secteur 2" in article_text:
        sector = "2"
    else:
        sector = None

    # Record with its default values. Key order is the column order of the
    # exported CSV, so keep it stable. Fields with no marker (accessible
    # cabinet, toxico, endometriosis, bigbody) always stay empty.
    doctor = {
        "name": name,
        "address": em.text if em else '',
        "address:code_postal": '',
        "ref:FR:convention_secteur": sector,
        "contact:phone": None,
        "contact:website": url,
        "gender": 'unknown',
        "visioconference_meeting": 'no',
        "handles:gender:trans": '',
        "handles:violence": '',
        "handles:premenstrual_syndrome": '',
        "handles:IST": '',
        "accessible_cabinet": '',
        "pseudo_science": '',
        "speaks": '',  # spoken languages, ';'-separated
        "handles:tatoo": '',
        "handles:toxico": '',
        "handles:sterilisation": '',
        "handles:abortion": '',
        "handles:endometriosis": '',
        "handles:hairy": '',
        "handles:bigbody": '',
        "handles:gender:bi": '',
        "handles:diu": '',  # intrauterine device
        "handles:mycoses": '',
        "handles:pregnancy": '',
        "handles:auto_prelevement": '',
        "handles:pma": '',
        "handles:dyspareunie": '',
        "handles:gender:lesbian": '',
        "healcare:generaliste": '',
        "healcare:sage_femme": '',
        "healcare:gynecologist": '',
    }

    for line in article.stripped_strings:
        # A line holding a postal code is taken as the address line; the last
        # such line wins.
        postal_code = extraire_code_postal(line)
        if postal_code:
            doctor["address:code_postal"] = postal_code
            doctor["address"] = line

        phone = extraire_numero_telephone(line)
        if phone:
            doctor["contact:phone"] = phone

        if 'Rdv en ligne possible' in line:
            doctor["visioconference_meeting"] = 'yes'

        if 'femme soignante' in line:
            doctor["gender"] = 'women'
        # 'homme soignant' also matches the feminine-suffixed spelling
        # 'homme soignante' that was searched for previously.
        if 'homme soignant' in line:
            doctor["gender"] = 'men'

        for marker, field in YES_MARKERS:
            if marker in line:
                doctor[field] = 'yes'

        if 'français, anglais' in line:
            doctor["speaks"] = 'french;english'
        if 'espagnol' in line:
            doctor["speaks"] = 'french;english;spanish'

        # Fall back to the address heuristic while no address is known yet.
        if not doctor["address"]:
            address_line = extraire_addr_line(line)
            if address_line:
                doctor["address"] = address_line

    # Add the practitioner's record to the result list.
    doctors.append(doctor)
|
|
|
|
|
|
|
|
|
|
# Save the practitioner records as JSON. Non-ASCII characters are written
# as-is (ensure_ascii=False), so the file is UTF-8 text.
with open("gynandco.json", "w", encoding="utf-8") as f:
    json.dump(doctors, f, ensure_ascii=False, indent=2)

# Read the JSON back and export the same records as CSV. The file must be
# read with the encoding it was written in; relying on the locale default
# fails or garbles accents on platforms where that default is not UTF-8.
with open('gynandco.json', 'r', encoding="utf-8") as f:
    data = json.load(f)

df = pd.DataFrame(data)
df.to_csv('gynandco.csv', index=False)
|