scripts/compta/sort_categories.py

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.corpus import wordnet
import nltk
nltk.download('wordnet')
# Load the CSV file into a DataFrame
df = pd.read_csv('bank_statements.csv')
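# Assumption: bank_statements.csv has a 'Label' column with free-text transaction descriptions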
# Approximate spelling correction using WordNet lookups
def correct_spelling(label):
    suggestions = wordnet.synsets(label)
    if suggestions:
        return max(suggestions, key=lambda s: s.lemma_names('eng')).lemmas()[0].name()
    else:
        return label
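# Labels with no WordNet entry (e.g. merchant names) are returned unchanged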
# Apply the spelling correction to every label
df['Label'] = df['Label'].apply(correct_spelling)
# Vectorise the transaction labels as bag-of-words counts
vectorizer = CountVectorizer()
labels = df['Label']
X = vectorizer.fit_transform(labels)
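# X is a sparse document-term matrix: one row per transaction label, one column per token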
# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, labels, random_state=42, test_size=0.2)
# Train the multinomial naive Bayes classifier
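# MultinomialNB works directly on the integer token counts produced by CountVectorizer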
clf = MultinomialNB()
clf.fit(X_train, y_train)
# Evaluate the classifier on the held-out test set
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Classification accuracy: {accuracy:.2f}')
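# Note: the target is the label itself and the features are built from those same labels,
# so the reported accuracy is optimistic by construction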
# Categorise the expenses: assign a numeric code to each distinct label
categories = sorted(set(df['Label']))  # sort so each label gets a stable code across runs
category_mapping = {cat: i for i, cat in enumerate(categories)}
df['Category'] = df['Label'].map(category_mapping)
print(df.head())