mes-scripts-de-ml/01-fondamentaux/04-descente_gradient_mini-l...

import time
import numpy as np
import matplotlib.pyplot as plt

###############################################################################
# 04-descente_gradient_mini-lots.py
# @title: Fondamentaux - Apprentissage par descente de gradient par mini-lots
# @project: Mes scripts de ML
# @lang: fr
# @authors: Philippe Roy <philippe.roy@ac-grenoble.fr>
# @copyright: Copyright (C) 2023 Philippe Roy
# @license: GNU GPL
###############################################################################

###
# Commandes NumPy :
# - np.array : créer un tableau à partir d'une liste de listes
# - np.c_ : concatène les colonnes des tableaux
# - np.ones : créer un tableau de 1
# - np.linalg.inv : inversion de matrice
# - .T : transposé de matrice
# - .dot : produit de matrice
###

###############################################################################
# Initialisation
###############################################################################

# Init du temps
t_debut = time.time()

# Init des plots
fig = plt.figure(figsize=(15, 5))
fig.suptitle("Descente de gradient par mini-lots")
donnees_ax = fig.add_subplot(141) # Observations : x1 et cibles : y
model_ax = fig.add_subplot(142) # Modèle : theta0, theta1
couts_ax = fig.add_subplot(143) # Coûts : RMSE, MSE, ...
app_ax = fig.add_subplot(144) # Taux d'apprentissage : eta

i_list=[] # Itération
couts_2d=[]
couts_delta=[]
couts_mse=[] # MSE
couts_rmse=[] # RMSE
eta_list=[] # Taux d'apprentissage

###############################################################################
# Observations
###############################################################################

# Observations d'apprentisage
m = 1000 # Nombre d'observations
bg = 1 # Quantité du bruit gaussien
x1 = 2*np.random.rand(m, 1) # Liste des observations x1
y = 4 + 3*x1 + bg * np.random.rand(m, 1) # Liste des cibles y
X = np.c_[np.ones((m, 1)), x1] # Matrice des observations, avec x0=1
donnees_ax.plot(x1, y, 'b.', label="Observations")
exact_solution = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y) # Equation normale

# Nouvelles observations
x1_new=np.array([[0], [2]])
X_new = np.c_[np.ones((2, 1)), x1_new] # Matrice des observations, avec x0=1

###############################################################################
# Phase d'apprentissage et d'inférence
###############################################################################

# Phase d'apprentissage par descente de gradient stochastique
# - theta : vecteur paramètres du modèle
# - gradient : gradient du coût en fonction de theta
# - eta : taux d'appentissage ici dégressif par échéancier d'apprentissage (ech_app)

# n_epoq = 50  # Nombre d'époques
n_epoq = 20  # Nombre d'époques (hyperparamètre)
lot_taille = 20 # Taille d'un mini-lot (hyperparamètre)


# def mini_batch_gradient_descent():
#     n_iterations = 50
#     minibatch_size = 20
#     t0, t1 = 200, 1000
#     thetas = np.random.randn(2, 1)
#     thetas_path = [thetas]
#     t = 0
#     for epoch in range(n_iterations):
#         shuffled_indices = np.random.permutation(m)
#         X_b_shuffled = X_b[shuffled_indices]
#         y_shuffled = y[shuffled_indices]
#         for i in range(0, m, minibatch_size):
#             t += 1
#             xi = X_b_shuffled[i:i+minibatch_size]
#             yi = y_shuffled[i:i+minibatch_size]
#             gradients = 2*xi.T.dot(xi.dot(thetas) - yi)/minibatch_size
#             eta = learning_schedule(t, t0, t1)
#             thetas = thetas - eta*gradients
#             thetas_path.append(thetas)

# Rédéfinition du taux d'apprentissage à partir de l'échéancier d'apprentissage
# t0, t1 = 200, 1000
t0, t1 = 5, 50 # Facteurs de l'échéancier d'apprentissage (hyperparamètres)
def ech_app (t):
    return t0 / (t + t1)

# Calcul du coût (Mean Square Error (MSE), Root Mean Square Error (RMSE))
def mse(theta):
    return np.sum((np.dot(X, theta) - y)**2)/m
def rmse(theta):
    return np.sqrt(np.sum((np.dot(X, theta) - y)**2)/m)

# Initialisation aléatoire
theta= np.random.randn(2,1)
theta0=[theta[0]]
theta1=[theta[1]]

# Descente du gradient
for epoq in range (n_epoq):

    # Mélange des observations
    indices_melange = np.random.permutation(m)
    X_melange = X[indices_melange]
    y_melange = y[indices_melange]

    for i in range(0, m, lot_taille):
        i_list.append(epoq * (m/lot_taille)  + i/lot_taille)

        # Calcul du gradient du pas
        xi = X_melange[i:i+lot_taille]
        yi = y_melange[i:i+lot_taille]
        gradients = 2*xi.T.dot(xi.dot(theta) - yi)/lot_taille
        eta = ech_app (epoq * (m/lot_taille)  + i/lot_taille)
        eta_list.append(eta)
        theta = theta - eta*gradients
        theta0.append(theta[0])
        theta1.append(theta[1])

        # Calcul de l'erreur avec la norme du vecteur 2D (Objectif -> Theta) dans le plan (theta0, theta1)
        couts_2d.append(np.sqrt((theta[0]-exact_solution[0])**2+(theta[1]-exact_solution[1])**2))

        # Calcul du RMSE à la main :
        # FIXME : étrange, je n'ai pas le même résultat qu'avec  'couts_rmse.append(rmse(theta))'
        delta = 0
        for j in range (m):
            delta= delta +((theta[0] + theta[1]*x1[j])-(exact_solution[0] + exact_solution[1]*x1[j]))**2
        delta = np.sqrt(delta/m)
        couts_delta.append(delta)

        # Calcul du RMSE et du MSE par les matrices
        couts_mse.append(mse(theta))
        couts_rmse.append(rmse(theta))

        # Prédiction du pas
        # Phase d'inférence (dernier pas)
        y_predict=X_new.dot(theta)
        donnees_ax.plot(x1_new, y_predict, 'c-', linewidth=0.5)

###############################################################################
# Résultats
###############################################################################

# Plot des données
donnees_ax.set_title("Données")
donnees_ax.plot(x1_new, y_predict, 'r-', label="Prédictions")
donnees_ax.set_xlabel(r'$x_1$')
donnees_ax.set_ylabel(r'$y$', rotation=0)
donnees_ax.legend()

# Plot des paramètres du modèle
model_ax.set_title("Paramètres du modèle")
model_ax.plot(theta0, theta1, '.', ls=':', color='r', fillstyle='none', label="Chemin", markevery=10)
model_ax.plot(exact_solution[0], exact_solution[1], "o", color='k', fillstyle='full',  label="Equation normale")
model_ax.set_xlabel(r'$\theta_0$')
model_ax.set_ylabel(r'$\theta_1  $', rotation=0)
model_ax.legend()

# Plot du coût
couts_ax.set_title("Coûts")
couts_ax.plot(i_list, couts_2d, '.', ls=':', color='c', fillstyle='none', label="Coûts vecteur 2D", markevery=10)
couts_ax.plot(i_list, couts_delta, '.', ls=':', color='r', fillstyle='none', label="Coûts RMSE à la main", markevery=10)
couts_ax.plot(i_list, couts_mse, '.', ls=':', color='b', fillstyle='none', label="Coûts MSE", markevery=10)
couts_ax.plot(i_list, couts_rmse, '.', ls=':', color='g', fillstyle='none', label="Coûts RMSE", markevery=10)
couts_ax.set_xlabel(r'$i$')
couts_ax.set_ylabel("Coûts")
couts_ax.legend()

# Plot du taux d'apprentissage
app_ax.set_title("Taux d'apprentissage")
app_ax.plot(i_list, eta_list, '.', ls=':', color='b', fillstyle='none', label="Taux d'apprentissage", markevery=10)
app_ax.set_xlabel(r'$i$')
app_ax.set_ylabel(r'$\eta$', rotation=0)
# app_ax.legend()

plt.show()

# Performances
print ("Theta th : theta0 : "+str(4)+"     ; theta1 : "+str(3))
print ("Theta    : theta0 : "+str(round(float(theta[0]),3))+" ; theta1 : "+str(round(float(theta[1]),3)))
print ("Erreurs  : theta0 : "+str(round(float(theta[0]-4),3))+" ; theta1 : "+str(round(float(theta[1]-3),3)))
print ("Temps    : "+str(time.time()-t_debut))