47 lines
1.4 KiB
Python
47 lines
1.4 KiB
Python
import pandas as pd
|
|
from sklearn.feature_extraction.text import CountVectorizer
|
|
from sklearn.naive_bayes import MultinomialNB
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.metrics import accuracy_score
|
|
from nltk.corpus import wordnet
|
|
import nltk
|
|
# Download the WordNet corpus that correct_spelling() queries through
# nltk.corpus.wordnet.
nltk.download('wordnet')
|
|
|
|
# Load the bank-statement CSV file into a DataFrame.
csv_path = 'bank_statements.csv'
df = pd.read_csv(csv_path)
|
|
|
|
def correct_spelling(label):
    """Return an approximate WordNet-normalised spelling of *label*.

    The label is looked up with ``wordnet.synsets``. When WordNet knows the
    word, the name of the first lemma of the first synset is returned;
    otherwise the label is returned unchanged.

    Args:
        label: The transaction label to normalise. Values that are not
            non-blank strings (for example ``None`` or the NaN that pandas
            uses for missing cells) are returned as-is.

    Returns:
        The first lemma name of the first matching synset, or *label* itself
        when there is no match or the input is not a usable string.
    """
    # Missing or non-text cells cannot be looked up; pass them through
    # instead of failing inside the WordNet lookup.
    if not isinstance(label, str) or not label.strip():
        return label

    suggestions = wordnet.synsets(label)
    if not suggestions:
        return label

    # Use the first synset (WordNet is expected to list the most common sense
    # first). Ranking synsets by their lemma-name lists is a purely
    # alphabetical comparison and can swap a correctly spelled word for an
    # unrelated synonym.
    return suggestions[0].lemmas()[0].name()
|
|
|
|
# Apply the spelling correction to every transaction label.
df['Label'] = df['Label'].apply(correct_spelling)

# NOTE(review): the 'Label' column is used both as the input text and as the
# prediction target, so the model learns to map a label onto itself. Confirm
# whether a separate category column was meant to be the target.
labels = df['Label']

# Split the raw labels into training and test sets BEFORE vectorizing, so the
# vocabulary is learned from the training rows only and nothing from the test
# rows leaks into the fitted vectorizer.
text_train, text_test, y_train, y_test = train_test_split(
    labels, labels, random_state=42, test_size=0.2
)

# Vectorize the transaction labels as bag-of-words counts.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)
# Full count matrix, kept under its original name for any code that reads X.
X = vectorizer.transform(labels)

# Train the multinomial naive Bayes classifier.
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Evaluate the classifier on the held-out rows.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Classification accuracy: {accuracy:.2f}')
|
|
|
|
# Categorize the expenses: give every distinct label an integer id.
categories = set(df['Label'])

# Enumerate the labels in sorted order so the ids are reproducible. Iterating
# a set of strings directly yields a different order from one interpreter run
# to the next (string hashing is randomized per process). key=str keeps the
# sort well-defined even if a non-string value such as NaN is present.
category_mapping = {
    cat: i for i, cat in enumerate(sorted(categories, key=str))
}
df['Category'] = df['Label'].map(category_mapping)

# Print the preview explicitly: a bare df.head() expression is evaluated and
# then discarded when this file runs as a script.
print(df.head())
|