import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.corpus import wordnet
import nltk

nltk.download('wordnet')

# Load the CSV file into a DataFrame
df = pd.read_csv('bank_statements.csv')

# Approximate spelling correction: if the label matches a WordNet entry,
# replace it with the name of the first lemma of its most frequent synset;
# otherwise keep the label unchanged
def correct_spelling(label):
    suggestions = wordnet.synsets(label)
    if suggestions:
        return suggestions[0].lemmas()[0].name()
    return label

# Apply the spelling correction to the transaction labels
df['Label'] = df['Label'].apply(correct_spelling)

# Vectorize the transaction labels as bag-of-words counts
vectorizer = CountVectorizer()
labels = df['Label']
X = vectorizer.fit_transform(labels)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, random_state=42, test_size=0.2)

# Train the multinomial naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Evaluate the classifier on the held-out test set
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Classification accuracy: {accuracy:.2f}')

# Categorize expenses: map each distinct label to a numeric category
# (sorted so the mapping is deterministic across runs)
categories = sorted(set(df['Label']))
category_mapping = {cat: i for i, cat in enumerate(categories)}
df['Category'] = df['Label'].map(category_mapping)
df.head()
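
# A minimal usage sketch (not part of the original script): classify a new
# transaction label with the fitted vectorizer and classifier, then look up
# its numeric category. The string 'AMAZON MARKETPLACE' is illustrative only.
new_labels = ['AMAZON MARKETPLACE']
new_X = vectorizer.transform(new_labels)   # reuse the vocabulary fitted above
predicted = clf.predict(new_X)             # predicted label class
print(f'Predicted label for {new_labels[0]!r}: {predicted[0]}')
print(f'Numeric category: {category_mapping.get(predicted[0], "unseen")}')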