feat: Add a pgf-plot scikit-learn KMeans figure

2023-09-27 18:56:25 +02:00 · 2023-09-27 18:02:07 +02:00 · 2023-09-27 18:02:07 +02:00 · 7f89b6c842
commit 7f89b6c842
parent 6f90bdfa0d
20 changed files with 508 additions and 32 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -0,0 +1 @@
+main.pdf filter=lfs diff=lfs merge=lfs -text
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@ -1,3 +0,0 @@
-{
-    "ansible.python.interpreterPath": "/bin/python"
-}
--- a/6
+++ b/6
@ -0,0 +1,6 @@
+options=-shell-escape -file-line-error
+
+all: main.pdf
+
+%.pdf: %.tex
+	lualatex $(options) $<
--- a/content/chapters/1.tex
+++ b/content/chapters/1.tex
@ -0,0 +1,168 @@
+\chapter{Unsupervised Learning}
+
+\begin{definition}[Precision Medicine]
+  Design of treatment for a given patient, based on genomic data.
+\end{definition}
+
+\begin{definition}[Hierarchical clustering]
+\end{definition}
+
+Gene expression time series: look for genes with similar expression footprint.
+
+\paragraph{Representation of data}
+
+\begin{itemize}
+  \item Tables;
+  \item Trees / Graphs;
+  \item Time series\dots
+\end{itemize}
+
+\begin{figure}
+  \includestandalone{figures/plots/genes_expression_timeseries}
+  \caption{Example of gene expression time series}
+\end{figure}
+
+\section{Distances and Similarities}
+
+\begin{property}[Distance]
+  \begin{description}
+    \item[non-negativity] $d(i, j) \geq 0$
+    \item[isolation] $d(i, i) = 0$
+    \item[symmetry] $d(i, j) = d(j, i)$
+    \item[triangle inequality] $d(i, j) \leq d(i, h) + d(h, j)$
+  \end{description}
+\end{property}
+
+\begin{definition}[Dissimilarity]
+  Distance without the triangle inequality.
+\end{definition}
+
+\begin{definition}[Similarity]
+  Function $s$ from $X \times X$ to $\RR_+$ such that:
+  \begin{enumerate}
+    \item $s$ is symmetric: $\forall (x, y) \in X \times X,\ s(x, y) = s(y, x)$;
+    \item $s$ is maximal on identical elements: $\forall (x, y) \in X \times X$ with $x \neq y$, $s(x, x) = s(y, y) > s(x, y)$.
+  \end{enumerate}
+\end{definition}
+
+\begin{exercise}
+
+  Let $d(x, y)$ be the distance, $d(x, y) \in [0, +\infty[$.
+
+  What should be the similarity measure $S(x, y) = f(d(x, y))$ that satisfies the following property:
+  \[
+    \forall (x, y) \in X \times X,\ x \neq y \: | \: S(x, x) > S(x, y)
+  \]
+  having $S(x, y) \leq M$, $S(x, y) \in ]0, M]$.
+\end{exercise}
+$d(x, y) \geq 0 \: \forall (x, y)$
+\begin{equation}
+  S(x, y) = \frac{M}{d(x, y) + 1}
+  \label{eq:similarity-first}
+\end{equation}
+In \cref{eq:similarity-first}, $S(x, y)$ takes values in $]0, M]$, since
+\begin{align}
+  \lim_{n \to +\infty} \frac{M}{n + 1} = 0 && \lim_{n \to 0} \frac{M}{n + 1} = M
+\end{align}
+
+
+\section{Data Representation}
+
+\paragraph{Data matrix}
+
+
+\paragraph{Distance matrix}
+
+\[
+  \begin{bmatrix}
+    0 \\
+    d(2, 1) & 0 \\
+    d(3, 1) & d(3, 2) & 0 \\
+    \vdots & \vdots & \ddots \\
+    d(n, 1) & d(n,2) & \dots & \dots & 0
+  \end{bmatrix}
+\]
+
+
+\begin{table}
+  \centering
+  \begin{tabular}{c|cc}
+    &$s_{1}$ & $s_{2}$ \\
+    \hline
+    $p_{1}$ & 0 & 1 \\
+    $p_{2}$ & 1 & 0 \\
+    $p_{3}$ & 3 & 2 \\
+    \end{tabular}
+  \caption{Example data matrix: 2 symptoms for 3 patients.}
+\end{table}
+
+
+
+\begin{definition}[Minkowski distance]
+  \[
+    L_p (x, y) = \left(\abs{x_1 - y_1}^p + \abs{x_2 - y_2}^p + \ldots + \abs{x_d - y_d}^p\right)^{\sfrac{1}{p}} = \left(\sum_{i=1}^d \abs{x_i - y_i}^p\right)^{\sfrac{1}{p}}
+  \]
+  where $p$ is a positive integer.
+\end{definition}
+
+\begin{definition}[Manhattan distance]
+  \[
+  L_1(x, y) = \sum_{i=1}^d \abs{x_i - y_i}
+  \]
+\end{definition}
+
+\begin{definition}[Euclidean distance]
+  Let $A$ and $B$ be two points, with $(x_{A}, y_{A})$ and $(x_{B}, y_{B})$ their respective coordinates; the Euclidean distance between them is $d(A, B) = \sqrt{(x_{B} - x_{A})^2 + (y_{B} - y_{A})^2}$.
+\end{definition}
+
+If $p=2$, $L_2$ is the Euclidean distance:
+\begin{definition}[Euclidean distance]
+  \[
+    d(x, y) = \sqrt{\abs{x_1 - y_1}^2 + \abs{x_2 - y_2}^2 + \ldots + \abs{x_d - y_d}^2}
+  \]
+\end{definition}
+
+We can add weights
+
+\subsection{K-means}
+
+The cost function is minimized:
+\[
+  \mathrm{Cost}(C) = \sum_{i=1}^{k} \sum_{x \in C_i} d(x, m_i)^2
+\]
+where $C_i$ is the $i$-th cluster and $m_i$ its mean.
+
+\begin{algorithm}[H]
+  Choose the number of clusters $k$.
+
+  Choose randomly $k$ means.
+
+  For each point, compute the distance between the point and each mean.
+  We allocate the point to the cluster represented by the closest center.
+
+  We set each mean to the center of its cluster, and reiterate.
+  \caption{$K$-means algorithm}
+\end{algorithm}
+
+
+\begin{exercise}
+  We have six genes:
+  \begin{table}[H]
+    \centering
+    \begin{tabular}{ccccccc}
+      \toprule
+      & $g_{1}$ & $g_{2}$ & $g_{3}$ & $g_{4}$ & $g_{5}$  & $g_{6}$ \\
+      \midrule
+      $\times 10^{-2}$ & 10 & 12 & 9 & 15 & 17 & 18 \\
+      \bottomrule
+    \end{tabular}
+    \caption{Sample values for six gene expressions.}
+  \end{table}
+
+  With $k=2$ and $m_{1} = 10 \cdot 10^{-2}$ and $m_{2} = 9 \cdot 10^{-2}$ the two initial randomly chosen means, run the $k$-means algorithm.
+\end{exercise}
+
+\begin{figure}
+  \centering
+  \includegraphics[scale=1]{figures/plots/kmeans.pdf}
+  \caption{$k$-means states at each of the 3 steps}
+\end{figure}
--- a/content/chapters/include.tex
+++ b/content/chapters/include.tex
@ -11,9 +11,7 @@
 		}
 }

-
-\includechapters{part1}{2}
-
+\includechapters{}{2}

 % \includechapters{part2}{2}

--- a/content/chapters/part1/1.tex
+++ b/content/chapters/part1/1.tex
--- a/content/genes_expression_timeseries.tex
+++ b/content/genes_expression_timeseries.tex
@ -0,0 +1,19 @@
+\documentclass[tikz,a4paper]{standalone}
+
+\usepackage{tikz}
+
+\begin{document}
+
+\usetikzlibrary{datavisualization}
+
+\begin{tikzpicture}
+  \datavisualization[visualize as smooth line]
+  data {
+    x, y
+    2, 1,
+    3, 2,
+    4, 1.5
+  };
+\end{tikzpicture}
+
+\end{document}
--- a/figures/euclidian_distance.tex
+++ b/figures/euclidian_distance.tex
@ -0,0 +1,19 @@
+
+
+
+\documentclass[tikz]{standalone}
+
+\usepackage{tikz}
+\usepackage{tkz-euclide}
+
+\begin{document}
+
+\begin{tikzpicture}[scale=1]
+  \tkzInit[xmax=5,ymax=5]
+  \tkzDrawX[>=latex]
+  \tkzDrawY[>=latex]
+  \tkzDefPoints()
+
+\end{tikzpicture}
+
+\end{document}
--- a/figures/plots/.gitattributes
+++ b/figures/plots/.gitattributes
@ -0,0 +1,2 @@
+genes_expression_timeseries.pdf filter=lfs diff=lfs merge=lfs -text
+kmeans.pdf filter=lfs diff=lfs merge=lfs -text
--- a/figures/plots/genes_expression_timeseries.pdf
+++ b/figures/plots/genes_expression_timeseries.pdf
--- a/figures/plots/genes_expression_timeseries.tex
+++ b/figures/plots/genes_expression_timeseries.tex
@ -0,0 +1,45 @@
+\documentclass[tikz]{standalone}
+\usepackage{tikz}
+
+\begin{document}
+
+\usetikzlibrary{datavisualization}
+\begin{tikzpicture}
+
+  \datavisualization data group {genes} = {
+    data[set=gene1] {
+      x, y
+      0, 1,
+      1, 2,
+      2, 1.5
+    }
+    data[set=gene2] {
+      x, y
+      0, 1.5,
+      1, 2.25,
+      2, 1.75
+    }
+    data[set=gene3] {
+      x, y
+      0, 0.25,
+      1, 0.26,
+      2, 0.7
+    }
+    data[set=gene4] {
+      x, y
+      0, 0.5,
+      1, 0.25,
+      2, 1
+    }
+  };
+  \datavisualization [
+  school book axes, all axes={unit length=7.5mm},
+  visualize as smooth line/.list={gene1, gene2, gene3, gene4},
+  style sheet=strong colors,
+  x axis={label=$t$},
+  y axis={label={expression}}]
+data group {genes};
+
+\end{tikzpicture}
+
+\end{document}
--- a/figures/plots/kmeans.pdf
+++ b/figures/plots/kmeans.pdf
--- a/figures/plots/kmeans.tex
+++ b/figures/plots/kmeans.tex
@ -0,0 +1,54 @@
+\documentclass[margin=0.5cm]{standalone}
+\usepackage{tikz}
+\usepackage{pyluatex}
+\usepackage{pgf}
+
+\begin{document}
+\begin{python}
+# Fit 1-D k-means on six sample values with max_iter set to 1, 2 and 3,
+# plot the resulting centers, and print the figure as PGF on stdout.
+# %%
+import io
+
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.cluster import KMeans
+
+# %%


+# Inline sample data: one "name,value" row per gene (g1..g6).
+data = """g1,10
+g2,12
+g3,9
+g4,15
+g5,17
+g6,18"""
+
+# Keep only the integer that follows the comma on each row.
+points =[int(row.split(",")[1]) for row in data.split("\n")]
+# Wrap each value in its own single-element row before handing it to KMeans.
+X = np.array([[point] for point in points])
+# The two starting centers, passed to KMeans through `init` below.
+initial_means = [[10], [9]]
+# Bare expression: its value is discarded here.
+# NOTE(review): looks like a leftover from a notebook cell, where it would display the list.
+points
+
+# %%
+# Refit from the same starting centers with max_iter = 1, 2, 3 and record
+# the cluster centers obtained after each fit.
+kmeans_values = []
+for i in range(1,4): 
+    kmeans = KMeans(n_clusters=2, random_state=42, max_iter=i, init=initial_means, n_init=1)
+    kmeans.fit(X)
+    kmeans_values.append(kmeans.cluster_centers_)
+

+# %%
+# One subplot row per recorded fit, all sharing the same x axis.
+fig, axs = plt.subplots(len(kmeans_values), 1, sharex=True)
+for i, centroids in enumerate(kmeans_values):
+    ax = axs[i]
+    # Centers drawn as 'x' markers, data points as small black dots;
+    # both are placed at the constant height i within their subplot.
+    ax.scatter(centroids, [i]*len(centroids), marker='x')
+    ax.scatter(points, [i]*len(points), s=2, color="black")
+    ax.axis('off')
+

+# Save the figure in PGF format into an in-memory text buffer, then print it.
+# NOTE(review): presumably pyluatex inserts this printed output into the document — confirm.
+with io.StringIO() as file:
+    fig.savefig(file, format="pgf", bbox_inches="tight", pad_inches=0.1)
+    print(file.getvalue())
+\end{python}
+\begin{tikzpicture}
+    
+\end{tikzpicture}
+\end{document}
--- a/content/chapters/part1/0.tex
+++ b/content/chapters/part1/0.tex
--- a/main.pdf
+++ b/main.pdf
--- a/main.tex
+++ b/main.tex
@ -1,46 +1,69 @@
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-% Course of None
-% 
-% Author: Samuel ORTION <samuel@ortion.fr> 
-% Version: 0.0.1
+% Course on "Data-mining and Machine Learning" - GENIOMHE - M1-S1
+%
+% Author: Samuel Ortion <samuel@ortion.fr>
+% Version: 0.1.0
 % Date: 2023
 % Licence: CC-By-SA 4.0+ International 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

-\documentclass[
-	a4paper,
-	10pt,
-	fleqn,
-	oneside
-]{talpa}
+\documentclass[twoside=false,fontsize=10pt,fleqn]{scrbook}
+\usepackage{mus}
+\usepackage{standalone}
+\titlehead{GENIOMHE}
+\title{Data-mining and\newline{}Machine Learning}
+\subtitle{}
+\author{Samuel Ortion}
+\date{Fall 2023}
+\teacher{Farida Zerhaoui}
+\cursus{GENIOMHE}
+\university{Université d'Évry Val d'Essonne -- Université Paris-Saclay}
+\semester{M1 - S1}

-\input{colors.tex}
-\input{meta.tex}
-\input{definitions.tex}
+\input{definitions}
+\input{preamble}

 \hypersetup{
-	pdftitle={
-		Course - None
-	},
-	pdfauthor={
-		Samuel Ortion
-	},
+	pdftitle={Course - Data-mining and Machine Learning},
+	pdfauthor={Samuel Ortion},
 	pdfsubject={},
-	pdfkeywords={},
+	pdfkeywords={GENIOMHE, Master, bioinformatics, machine learning, statistics, data},
 	pdfcreator={LaTeX}
 }

-% \addbibressource{bibliography.bib}
+\usepackage{ccicons}
+\usepackage[
+    type={CC},
+    modifier={by-sa},
+    version={4.0},
+]{doclicense}

+\addbibresource{references.bib}
 \makeindex
+
 \begin{document}

-\tableofcontents
+\setkomafont{fullpagetitle}{\fontsize{1.5cm}{3em}\fontseries{b}\selectfont}
+\maketitlefullpage

-% \input{content/introduction.tex}
+{
+	\hypersetup{
+		linkcolor=black
+	}
+	\tableofcontents
+}

-\input{content/chapters/include.tex}
+\doclicenseThis%

-% \input{content/conclusion.tex}
+% \input{content/introduction}

-\end{document}
+\input{content/chapters/include}
+
+% \input{content/conclusion}
+
+\nocite{*}
+
+\printbibliography%
+% \printglossary%
+
+\end{document}
--- a/notebooks/kmeans1d.ipynb
+++ b/notebooks/kmeans1d.ipynb
@ -0,0 +1,109 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import io\n",
+    "\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "from sklearn.cluster import KMeans"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[10, 12, 9, 15, 17, 18]"
+      ]
+     },
+     "execution_count": 64,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "\n",
+    "\n",
+    "data = \"\"\"g1,10\n",
+    "g2,12\n",
+    "g3,9\n",
+    "g4,15\n",
+    "g5,17\n",
+    "g6,18\"\"\"\n",
+    "\n",
+    "points =[int(row.split(\",\")[1]) for row in data.split(\"\\n\")]\n",
+    "X = np.array([[point] for point in points])\n",
+    "initial_means = [[10], [9]]\n",
+    "points"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "kmeans_values = []\n",
+    "for i in range(1,4): \n",
+    "    kmeans = KMeans(n_clusters=2, random_state=42, max_iter=i, init=initial_means, n_init=1)\n",
+    "    kmeans.fit(X)\n",
+    "    kmeans_values.append(kmeans.cluster_centers_)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAgMAAAGFCAYAAABg2vAPAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAMpklEQVR4nO3dMU9Va77H8f+54cgRjBhsRpPDJDsZKS0GM+z2vgUL7Ow8mel4JSbTGC2spZj7Fm41kRAtKGUSEr0JTCMBIm6QnXALMxxRi3N0b9Zi/T6fbkPifh6f9Tz5Zu219YeTk5OTAgBi/VfTAwAAmiUGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgXCMxsH94XNt7g6/+bntvUPuHx+c8IqDtnBswPuceA/uHx3X/6VotPV6trd2zG3trd1BLj1fr/tM1Gxs45dyA8Tr3GDg4Gtbbdx/qzc77uvfk1429tTuoe09W683O+3r77kMdHA3Pe2hASzk3YLzOPQZuzFyuZw8Wa252qt7svK+lR/+s//nftVp69M96s/O+5man6tmDxboxc/m8h/ZdhsNhbWxs1HDoMGoD69Etn58b956s1svXO6chcF7nhuuqXbqyHm2YRyPPDNy89nFj/zwzWWt//1vd/e+/1Nrf/1Y/z0zWsweLdfPaxQuBfr9f8/Pz1e/3L/yFedFZj276z7nxnyC4++j5mRAY97nhumqXrqxHW+bR2LcJbl67XMuLV+vDv/9VVVUf/v2vWl68euFCoKpqc3OzXrx4UVVVL168qM3NzYZHlM16dNfNa5fr4dLtMz97uHT7XM4N11W7dGU92jKPxmJga3dQD1f369If/lRVVZf+8Kd6uLr/xcNBF0Gv16uFhYWqqrpz5071er2GR5TNenTX1u6gllfWz/xseWX9XM4N11W7dGU92jKPH05OTk7O+00/fejn55nJWl68Wg9X9+v/9o7O7ZbfqA2Hw9rc3Kxer1cTExNNDyee9eieT8+Nudmperh0u5ZX1s/9owLXVXt0ZT3aMI9zj4HtvY9fA/p8A3++0Vd+uXgPEQLj4dyA8Tr3jwmmJyfq+pVLX5T8pw8HXb9yqaYnL27lAaPl3IDxauRjgv3D4zo4Gn614Lf3BjU9OVFXf/rxvIcFtJhzA8ankRgAANrDf1QEAOHEAACEEwMAEE4MAEA4MQAA4cQAAIQTAwAQTgwAQDgxAADhxAAAhBMDABBODABAODEAAOHEAACEEwMAEE4MAEA4MQAA4cQAAIQTAwAQTgwAQDgxAADhxAAAhBMDABBODABAODEAAOHEAACEEwMAEE4MAEA4MQAA4cQAAIQTAwAQTgwAQDgxAADhxAAAhBMDABBODABAODEAAOHEAACEEwMAEE4MAEA4MQAA4cQAAIQTAwAQTgwAQDgxAADhxAAAhBMDABBODABAODEAAOHEAACEEwMAEE4MAEA4MQAA4cQAAIQTAwAQTgwAQDgxAADhxAAAhBMDABBODABAODEAAOHEAACEEwMAEE4MAEA4MQAA4cQAAIQTAwAQTgwAQDgxAADhxAAAhBMDABBODABAODEAAOHEAACEEwMAEE4MAEA4MQAA4cQAAIQTAwAQTgwAQDgxAADhxAAAhBMDABBODABAODEAAOHEAACEEwMAEE4MAEA4MQAA4cQAAIQTAwAQTgwAQDgxAAD
hxAAAhBMDABBODABAODEAAOHEAACEEwMAEE4MAEA4MQAA4cQAAIQTAwAQTgwAQDgxAADhxAAAhBMDABBODABAODEAAOHEAACEEwMAEE4MAEA4MQAA4cQAAIQTAwAQTgwAQDgxAADhxAAAhOtsDOwfHtf23uCrv9veG9T+4fE5jwjg65xXNK2TMbB/eFz3n67V0uPV2to9u8G2dge19Hi17j9ds8GAxjmvaINOxsDB0bDevvtQb3be170nv26wrd1B3XuyWm923tfbdx/q4GjY8EiBdM4r2qDxGBgOh7WxsVHD4egu9Bszl+vZg8Wam5063WAvX++cbqy52al69mCxbsxcHtl7jmMefDvrwTh05bzqiq7s8zbMo9EYGA6H1e/3a35+vvr9/kj/Im5eO7vB7j56fmZj3bw22hAY1zz4/awH49CV86orurLP2zKPH05OTk4aeeeq2tjYqPn5+dPXr169qlu3bo30PV6+3qm7j56fvv7HX/v15z/OjvQ9zmMe/HbWg3HoynnVFV3Z522ZR6N3Bnq9Xi0sLFRV1Z07d6rX6430z9/aHdTyyvqZny2vrH/xkM73Gvc8+H2sB+PQlfOqK7qyz9syj0bvDFR9vEWyublZvV6vJiYmRvbnfvrwzdzsVD1cul3LK+tj/ahgHPPg21gPxqEr51VXdGWft2EejcfAOGzvffw6zucb6fMNt/KLh3KAZjmvaIPGv00wDtOTE3X9yqUvivrTh3SuX7lU05MXtySBbnBe0QadvDNQ9fEf8jg4Gn61pLf3BjU9OVFXf/qxgZEBnOW8ommdjQEA4Lfp5McEAMBvJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGKgQfuHx7W9N/jq77b3BrV/eHzOIwIYL+deO4mBhuwfHtf9p2u19Hi1tnbPboyt3UEtPV6t+0/XbAygM5x77SUGGnJwNKy37z7Um533de/Jrxtja3dQ956s1pud9/X23Yc6OBo2PFKA0XDutVfjMTAcDmtjY6OGw4u9+L93HjdmLtezB4s1Nzt1ujFevt453RBzs1P17MFi3Zi5POaRd1NXrivaxXX1fUZ97nVlPdowj0ZjYDgcVr/fr/n5+er3+xd2Qb91Hjevnd0Ydx89P7Mhbl4TAt+iK9cV7eK6Go1RnXtdWY+2zOOHk5OTk0beuao2NjZqfn7+9PWrV6/q1q1bTQ3nm33vPF6+3qm7j56fvv7HX/v15z/OjnSMSbpyXdEurqvR+t5zryvr0ZZ5NHpnoNfr1cLCQlVV3blzp3q9XpPD+WbfM4+t3UEtr6yf+dnyyvoXD9fw23XluqJdXFejM4pzryvr0ZZ5NHpnoOrjLZLNzc3q9Xo1MTH
R5FC+y7fM49OHZuZmp+rh0u1aXln3UcEIdOW6ol1cV99vlOdeV9ajDfNoPAZSbe99/BrN5xvg842y8ouHCIFucO61V+PfJkg1PTlR169c+qKEP3245vqVSzU9eXFrF+BTzr32cmegQfuHx3VwNPxqAW/vDWp6cqKu/vRjAyMDGA/nXjuJAQAI52MCAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAI9/8lWKkDA/pOhQAAAABJRU5ErkJggg==",
+      "text/plain": [
+       "<Figure size 640x480 with 3 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "fig, axs = plt.subplots(len(kmeans_values), 1, sharex=True)\n",
+    "for i, centroids in enumerate(kmeans_values):\n",
+    "    ax = axs[i]\n",
+    "    ax.scatter(centroids, [i]*len(centroids), marker='x')\n",
+    "    ax.scatter(points, [i]*len(points), s=2, color=\"black\")\n",
+    "    ax.axis('off')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "geniomhe-ml",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/preamble.tex
+++ b/preamble.tex
@ -0,0 +1,2 @@
+\usepackage{mus-learn}
+\usepackage{xfrac}
--- a/references.bib
+++ b/references.bib
@ -0,0 +1,25 @@
+
+@book{geron_hands-machine_2019,
+  edition   = {2},
+  title     = {Hands-On Machine Learning with Scikit-Learn, Keras, and {TensorFlow}},
+  abstract  = {Through a recent series of breakthroughs, deep learning has boosted the entire field of machine learning. Now, even programmers who know close to nothing about this technology can use simple, … - Selection from Hands-On Machine Learning with Scikit-Learn, Keras, and {TensorFlow}, 2nd Edition [Book]},
+  publisher = {O'{REILLY}},
+  author    = {Géron, Aurélien},
+  date      = {2019},
+  langid    = {english},
+  note      = {{ISBN}: 9781098125974}
+}
+
+@collection{witten_data_2011,
+  location  = {Boston},
+  edition   = {4},
+  title     = {Data Mining - Practical Machine Learning Tools and Techniques},
+  isbn      = {978-0-12-374856-0},
+  series    = {The Morgan Kaufmann Series in Data Management Systems},
+  publisher = {Morgan Kaufmann},
+  editor    = {Witten, Ian H. and Frank, Eibe and Hall, Mark A.},
+  urldate   = {2023-06-16},
+  date      = {2011-01-01},
+  langid    = {english},
+  doi       = {10.1016/B978-0-12-374856-0.00018-3}
+}
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,2 @@
+scikit-learn
+numpy
				`@ -0,0 +1 @@`
				`main.pdf filter=lfs diff=lfs merge=lfs -text`