diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..defcde4 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +main.pdf filter=lfs diff=lfs merge=lfs -text diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..a84950a --- /dev/null +++ b/Makefile @@ -0,0 +1,6 @@ +options=-shell-escape -file-line-error + +all: main.pdf + +%.pdf: %.tex + lualatex $(options) $< diff --git a/content/chapters/1.tex b/content/chapters/1.tex new file mode 100644 index 0000000..cf88ac7 --- /dev/null +++ b/content/chapters/1.tex @@ -0,0 +1,168 @@ +\chapter{Unsupervised Learning} + +\begin{definition}[Precision Medicine] + Design of a treatment for a given patient, based on genomic data. +\end{definition} + +\begin{definition}[Hierarchical clustering] +\end{definition} + +Gene expression time series: look for genes with similar expression footprints. + +\paragraph{Representation of data} + +\begin{itemize} + \item Tables; + \item Trees / Graphs; + \item Time series\dots +\end{itemize} + +\begin{figure} + \includestandalone{figures/plots/genes_expression_timeseries} + \caption{Example of gene expression time series.} +\end{figure} + +\section{Distances and Similarities} + +\begin{property}[Distance] + \begin{description} + \item[non-negativity] $d(i, j) \geq 0$ + \item[isolation] $d(i, i) = 0$ + \item[symmetry] $d(i, j) = d(j, i)$ + \item[triangle inequality] $d(i, j) \leq d(i, h) + d(h, j)$ + \end{description} +\end{property} + +\begin{definition}[Dissimilarity] + Distance without the triangle inequality. +\end{definition} + +\begin{definition}[Similarity] + Function $s$ from $X \times X$ to $\RR_+$ such that: + \begin{enumerate} + \item $s$ is symmetric: $\forall (x, y) \in X \times X, \: s(x, y) = s(y, x)$; + \item $\forall (x, y) \in X \times X$ with $x \neq y$, $s(x, x) = s(y, y) > s(x, y)$. + \end{enumerate} +\end{definition} + +\begin{exercise} + + Let $d(x, y)$ be the distance, $d(x, y) \in [0, +\infty[$. 
+ + Which similarity measure $S(x, y) = f(d(x, y))$ satisfies the following property: + \[ + \forall (x, y) \in X \times X, \: x \neq y \: | \: S(x, x) > S(x, y) + \] + while having $S(x, y) \leq M$, i.e.\ $S(x, y) \in ]0, M]$? +\end{exercise} +Since $d(x, y) \geq 0 \: \forall (x, y)$, one possible answer is +\begin{equation} + S(x, y) = \frac{M}{d(x, y) + 1} + \label{eq:similarity-first} +\end{equation} +In \cref{eq:similarity-first}, $S(x, y)$ ranges from $0$ (excluded) to $M$: +\begin{align} + \lim_{n \to \infty} \frac{M}{n + 1} = 0 && \lim_{n \to 0} \frac{M}{n + 1} = M +\end{align} + + +\section{Data Representation} + +\paragraph{Data matrix} + + +\paragraph{Distance matrix} + +\[ + \begin{bmatrix} + 0 \\ + d(2, 1) & 0 \\ + d(3, 1) & d(3, 2) & 0 \\ + \vdots & \vdots & \ddots \\ + d(n, 1) & d(n,2) & \dots & \dots & 0 + \end{bmatrix} +\] + + +\begin{table} + \centering + \begin{tabular}{c|cc} + &$s_{1}$ & $s_{2}$ \\ + \hline + $p_{1}$ & 0 & 1 \\ + $p_{2}$ & 1 & 0 \\ + $p_{3}$ & 3 & 2 \\ + \end{tabular} + \caption{Example data matrix: 2 symptoms for 3 patients.} +\end{table} + + + +\begin{definition}[Minkowski distance] + \[ + L_p (x, y) = \left(\abs{x_1 - y_1}^p + \abs{x_2 - y_2}^p + \ldots + \abs{x_d - y_d}^p\right)^{\sfrac{1}{p}} = \left(\sum_{i=1}^d \abs{x_i - y_i}^p\right)^{\sfrac{1}{p}} + \] + where $p$ is a positive integer. +\end{definition} + +\begin{definition}[Manhattan distance] + \[ + L_1(x, y) = \sum_{i=1}^d \abs{x_i - y_i} + \] +\end{definition} + +\begin{definition}[Euclidean distance] + Let $A$ and $B$ be two points, with $(x_{A}, y_{A})$ and $(x_{B}, y_{B})$ their respective coordinates; then $d(A, B) = \sqrt{(x_{B} - x_{A})^2 + (y_{B} - y_{A})^2}$. +\end{definition} + +If $p=2$, $L_2$ is the Euclidean distance: +\begin{definition}[Euclidean distance] + \[ + d(x, y) = \sqrt{\abs{x_1 - y_1}^2 + \abs{x_2 - y_2}^2 + \ldots + \abs{x_d - y_d}^2} + \] +\end{definition} + +We can add weights. 
+ +\subsection{K-means} + +The following cost function is minimized, where $C_i$ is the $i$-th cluster and $m_i$ its mean: +\[ + \mathrm{Cost}(C) = \sum_{i=1}^{k} \sum_{x \in C_i} d(x, m_i)^2 +\] + +\begin{algorithm}[H] + Choose the number of clusters $k$. 
+ + Randomly choose $k$ initial means. + + For each point, compute the distance between the point and each mean. + We allocate the point to the cluster represented by the closest mean. + + We set each mean to the center of its cluster, and reiterate. + \caption{$k$-means algorithm} +\end{algorithm} + + +\begin{exercise} + We have six genes: + \begin{table}[H] + \centering + \begin{tabular}{ccccccc} + \toprule + & $g_{1}$ & $g_{2}$ & $g_{3}$ & $g_{4}$ & $g_{5}$ & $g_{6}$ \\ + \midrule + $\times 10^{-2}$ & 10 & 12 & 9 & 15 & 17 & 18 \\ + \bottomrule + \end{tabular} + \caption{Sample values for six gene expressions.} + \end{table} + + With $k=2$, and with $m_{1} = 10 \cdot 10^{-2}$ and $m_{2} = 9 \cdot 10^{-2}$ as the two randomly chosen initial means, run the $k$-means algorithm. +\end{exercise} + +\begin{figure} + \centering + \includegraphics[scale=1]{figures/plots/kmeans.pdf} + \caption{$k$-means states at each of the 3 steps.} +\end{figure} \ No newline at end of file diff --git a/content/chapters/include.tex b/content/chapters/include.tex index dd55602..bc8b4bb 100755 --- a/content/chapters/include.tex +++ b/content/chapters/include.tex @@ -11,9 +11,7 @@ } } - -\includechapters{part1}{2} - +\includechapters{}{2} % \includechapters{part2}{2} diff --git a/content/chapters/part1/1.tex b/content/chapters/part1/1.tex deleted file mode 100644 index e69de29..0000000 diff --git a/content/genes_expression_timeseries.tex b/content/genes_expression_timeseries.tex new file mode 100644 index 0000000..a3ba271 --- /dev/null +++ b/content/genes_expression_timeseries.tex @@ -0,0 +1,19 @@ +\documentclass[tikz,a4paper]{standalone} + +\usepackage{tikz} + +\begin{document} + +\usetikzlibrary{datavisualization} + +\begin{tikzpicture} + \datavisualization[visualize as smooth line] + data { + x, y + 2, 1, + 3, 2, + 4, 1.5 + }; +\end{tikzpicture} + +\end{document} diff --git a/figures/euclidian_distance.tex b/figures/euclidian_distance.tex new file mode 100644 index 0000000..2d36285 --- /dev/null 
+++ b/figures/euclidian_distance.tex @@ -0,0 +1,19 @@ + + + +\documentclass[tikz]{standalone} + +\usepackage{tikz} +\usepackage{tkz-euclide} + +\begin{document} + +\begin{tikzpicture}[scale=1] + \tkzInit[xmax=5,ymax=5] + \tkzDrawX[>=latex] + \tkzDraw[>=latex] + \tkzDefPoints() + +\end{tikzpicture} + +\end{document} diff --git a/figures/plots/.gitattributes b/figures/plots/.gitattributes new file mode 100644 index 0000000..2d5cb21 --- /dev/null +++ b/figures/plots/.gitattributes @@ -0,0 +1,2 @@ +genes_expression_timeseries.pdf filter=lfs diff=lfs merge=lfs -text +kmeans.pdf filter=lfs diff=lfs merge=lfs -text diff --git a/figures/plots/genes_expression_timeseries.pdf b/figures/plots/genes_expression_timeseries.pdf new file mode 100644 index 0000000..142c73c --- /dev/null +++ b/figures/plots/genes_expression_timeseries.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b59f5adc3524efcc205865151a59c13980b04d5756e83b062e6c7e20bc644018 +size 1128 diff --git a/figures/plots/genes_expression_timeseries.tex b/figures/plots/genes_expression_timeseries.tex new file mode 100644 index 0000000..f04e3db --- /dev/null +++ b/figures/plots/genes_expression_timeseries.tex @@ -0,0 +1,45 @@ +\documentclass[tikz]{standalone} +\usepackage{tikz} + +\begin{document} + +\usetikzlibrary{datavisualization} +\begin{tikzpicture} + + \datavisualization data group {genes} = { + data[set=gene1] { + x, y + 0, 1, + 1, 2, + 2, 1.5 + } + data[set=gene2] { + x, y + 0, 1.5, + 1, 2.25, + 2, 1.75 + } + data[set=gene3] { + x, y + 0, 0.25, + 1, 0.26, + 2, 0.7 + } + data[set=gene4] { + x, y + 0, 0.5, + 1, 0.25, + 2, 1 + } + }; + \datavisualization [ + school book axes, all axes={unit length=7.5mm}, + visualize as smooth line/.list={gene1, gene2, gene3, gene4}, + style sheet=strong colors, + x axis={label=$t$}, + y axis={label={expression}}] +data group {genes}; + +\end{tikzpicture} + +\end{document} diff --git a/figures/plots/kmeans.pdf b/figures/plots/kmeans.pdf new file mode 
100644 index 0000000..56a73fa --- /dev/null +++ b/figures/plots/kmeans.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd507e1c6741f48c32c531b1159fe1d777453b2e3638331427ba045e399f426d +size 1562 diff --git a/figures/plots/kmeans.tex b/figures/plots/kmeans.tex new file mode 100644 index 0000000..09e2f36 --- /dev/null +++ b/figures/plots/kmeans.tex @@ -0,0 +1,54 @@ +\documentclass[margin=0.5cm]{standalone} +\usepackage{tikz} +\usepackage{pyluatex} +\usepackage{pgf} + +\begin{document} +\begin{python} +# %% Imports +import io + +import numpy as np +import matplotlib.pyplot as plt +from sklearn.cluster import KMeans + +# %% Input data: one "name,value" row per gene; the integer after the comma is parsed into points + + +data = """g1,10 +g2,12 +g3,9 +g4,15 +g5,17 +g6,18""" + +points =[int(row.split(",")[1]) for row in data.split("\n")] +X = np.array([[point] for point in points]) +initial_means = [[10], [9]] +points  # NOTE(review): bare expression statement, has no effect in a script + +# %% Fit KMeans three times with max_iter = 1, 2, 3 and keep the cluster centers reached each time +kmeans_values = [] +for i in range(1,4): + kmeans = KMeans(n_clusters=2, random_state=42, max_iter=i, init=initial_means, n_init=1) + kmeans.fit(X) + kmeans_values.append(kmeans.cluster_centers_) + + +# %% One subplot row per stored set of centers: centers as 'x' markers, data points as small black dots +fig, axs = plt.subplots(len(kmeans_values), 1, sharex=True) +for i, centroids in enumerate(kmeans_values): + ax = axs[i] + ax.scatter(centroids, [i]*len(centroids), marker='x') + ax.scatter(points, [i]*len(points), s=2, color="black") + ax.axis('off') + + +with io.StringIO() as file: + fig.savefig(file, format="pgf", bbox_inches="tight", pad_inches=0.1) + print(file.getvalue())  # print the PGF source of the figure; NOTE(review): presumably pyluatex typesets this output, confirm +\end{python} +\begin{tikzpicture} + +\end{tikzpicture} +\end{document} \ No newline at end of file diff --git a/content/chapters/part1/0.tex b/glossary.tex similarity index 100% rename from content/chapters/part1/0.tex rename to glossary.tex diff --git a/main.pdf b/main.pdf index edded29..d3d41e2 100644 Binary files a/main.pdf and b/main.pdf differ diff --git a/main.tex b/main.tex index 736a1b2..cf15132 100644 --- a/main.tex +++ b/main.tex @@ -1,46 +1,69 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% Course 
of None -% -% Author: Samuel ORTION -% Version: 0.0.1 +% Course on "Data-mining and Machine Learning" - GENIOMHE - M1-S1 +% +% Author: Samuel Ortion +% Version: 0.1.0 % Date: 2023 % Licence: CC-By-SA 4.0+ International %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\documentclass[ - a4paper, - 10pt, - fleqn, - oneside -]{talpa} +\documentclass[twoside=false,fontsize=10pt,fleqn]{scrbook} +\usepackage{mus} +\usepackage{standalone} +\titlehead{GENIOMHE} +\title{Data-mining and\newline{}Machine Learning} +\subtitle{} +\author{Samuel Ortion} +\date{Fall 2023} +\teacher{Farida Zerhaoui} +\cursus{GENIOMHE} +\university{Université d'Évry val d'Essonne -- Université Paris-Saclay} +\semester{M1 - S1} -\input{colors.tex} -\input{meta.tex} -\input{definitions.tex} +\input{definitions} +\input{preamble} \hypersetup{ - pdftitle={ - Course - None - }, - pdfauthor={ - Samuel Ortion - }, + pdftitle={Course - Data-mining and Machine Learning}, + pdfauthor={Samuel Ortion}, pdfsubject={}, - pdfkeywords={}, + pdfkeywords={GENIOMHE, Master, bioinformatics, machine learning, statistics, data}, pdfcreator={LaTeX} } -% \addbibressource{bibliography.bib} +\usepackage{ccicons} +\usepackage[ + type={CC}, + modifier={by-sa}, + version={4.0}, +]{doclicense} +\addbibresource{references.bib} \makeindex + \begin{document} -\tableofcontents +\setkomafont{fullpagetitle}{\fontsize{1.5cm}{3em}\fontseries{b}\selectfont} +\maketitlefullpage -% \input{content/introduction.tex} +{ + \hypersetup{ + linkcolor=black + } + \tableofcontents +} -\input{content/chapters/include.tex} +\doclicenseThis% -% \input{content/conclusion.tex} +% \input{content/introduction} -\end{document} \ No newline at end of file +\input{content/chapters/include} + +% \input{content/conclusion} + +\nocite{*} + +\printbibliography% +% \printglossary% + +\end{document} diff --git a/notebooks/kmeans1d.ipynb b/notebooks/kmeans1d.ipynb new file mode 100644 index 0000000..874fb3c --- /dev/null +++ 
b/notebooks/kmeans1d.ipynb @@ -0,0 +1,109 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import io\n", + "\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.cluster import KMeans" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[10, 12, 9, 15, 17, 18]" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "\n", + "data = \"\"\"g1,10\n", + "g2,12\n", + "g3,9\n", + "g4,15\n", + "g5,17\n", + "g6,18\"\"\"\n", + "\n", + "points =[int(row.split(\",\")[1]) for row in data.split(\"\\n\")]\n", + "X = np.array([[point] for point in points])\n", + "initial_means = [[10], [9]]\n", + "points" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [], + "source": [ + "kmeans_values = []\n", + "for i in range(1,4): \n", + " kmeans = KMeans(n_clusters=2, random_state=42, max_iter=i, init=initial_means, n_init=1)\n", + " kmeans.fit(X)\n", + " kmeans_values.append(kmeans.cluster_centers_)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAgMAAAGFCAYAAABg2vAPAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAMpklEQVR4nO3dMU9Va77H8f+54cgRjBhsRpPDJDsZKS0GM+z2vgUL7Ow8mel4JSbTGC2spZj7Fm41kRAtKGUSEr0JTCMBIm6QnXALMxxRi3N0b9Zi/T6fbkPifh6f9Tz5Zu219YeTk5OTAgBi/VfTAwAAmiUGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgXCMxsH94XNt7g6/+bntvUPuHx+c8IqDtnBswPuceA/uHx3X/6VotPV6trd2zG3trd1BLj1fr/tM1Gxs45dyA8Tr3GDg4Gtbbdx/qzc77uvfk1429tTuoe09W683O+3r77kMdHA3Pe2hASzk3YLzOPQZuzFyuZw8Wa252qt7svK+lR/+s//nftVp69M96s/O+5man6tmDxboxc/m8h/ZdhsNhbWxs1HDoMGoD69Etn58b956s1svXO6chcF7nhuuqXbqyHm2YRyPPDNy89nFj/zwzWWt//1vd/e+/1Nrf/1Y/z0zWsweLdfPaxQuBfr9f8/Pz1e/3L/yFedFZj276z7nxnyC4++j5mRAY97nhumqXrqxHW+bR2LcJbl67XMuLV+vDv/9VVVUf/v2vWl68euFCoKpqc3OzXrx4UVVVL168qM3NzYZHlM16dNfNa5fr4dLtMz97uHT7XM4N11W7dGU92jKPxmJga3dQD1f369If/lRVVZf+8Kd6uLr/xcNBF0Gv16uFhYWqqrpz5071er2GR5TNenTX1u6gllfWz/xseWX9XM4N11W7dGU92jKPH05OTk7O+00/fejn55nJWl68Wg9X9+v/9o7O7ZbfqA2Hw9rc3Kxer1cTExNNDyee9eieT8+Nudmperh0u5ZX1s/9owLXVXt0ZT3aMI9zj4HtvY9fA/p8A3++0Vd+uXgPEQLj4dyA8Tr3jwmmJyfq+pVLX5T8pw8HXb9yqaYnL27lAaPl3IDxauRjgv3D4zo4Gn614Lf3BjU9OVFXf/rxvIcFtJhzA8ankRgAANrDf1QEAOHEAACEEwMAEE4MAEA4MQAA4cQAAIQTAwAQTgwAQDgxAADhxAAAhBMDABBODABAODEAAOHEAACEEwMAEE4MAEA4MQAA4cQAAIQTAwAQTgwAQDgxAADhxAAAhBMDABBODABAODEAAOHEAACEEwMAEE4MAEA4MQAA4cQAAIQTAwAQTgwAQDgxAADhxAAAhBMDABBODABAODEAAOHEAACEEwMAEE4MAEA4MQAA4cQAAIQTAwAQTgwAQDgxAADhxAAAhBMDABBODABAODEAAOHEAACEEwMAEE4MAEA4MQAA4cQAAIQTAwAQTgwAQDgxAADhxAAAhBMDABBODABAODEAAOHEAACEEwMAEE4MAEA4MQAA4cQAAIQTAwAQTgwAQDgxAADhxAAAhBMDABBODABAODEAAOHEAACEEwMAEE4MAEA4MQAA4cQAAIQTAwAQTgwAQDgxAADhxAAAhBMDABBODABAODEAAOHEAACEEwMAEE4MAEA4MQAA4cQAAIQTAwAQTgwAQDgxAADhxAAAhBMDABBODABAODE
AAOHEAACEEwMAEE4MAEA4MQAA4cQAAIQTAwAQTgwAQDgxAADhxAAAhBMDABBODABAODEAAOHEAACEEwMAEE4MAEA4MQAA4cQAAIQTAwAQTgwAQDgxAADhxAAAhOtsDOwfHtf23uCrv9veG9T+4fE5jwjg65xXNK2TMbB/eFz3n67V0uPV2to9u8G2dge19Hi17j9ds8GAxjmvaINOxsDB0bDevvtQb3be170nv26wrd1B3XuyWm923tfbdx/q4GjY8EiBdM4r2qDxGBgOh7WxsVHD4egu9Bszl+vZg8Wam5063WAvX++cbqy52al69mCxbsxcHtl7jmMefDvrwTh05bzqiq7s8zbMo9EYGA6H1e/3a35+vvr9/kj/Im5eO7vB7j56fmZj3bw22hAY1zz4/awH49CV86orurLP2zKPH05OTk4aeeeq2tjYqPn5+dPXr169qlu3bo30PV6+3qm7j56fvv7HX/v15z/OjvQ9zmMe/HbWg3HoynnVFV3Z522ZR6N3Bnq9Xi0sLFRV1Z07d6rX6430z9/aHdTyyvqZny2vrH/xkM73Gvc8+H2sB+PQlfOqK7qyz9syj0bvDFR9vEWyublZvV6vJiYmRvbnfvrwzdzsVD1cul3LK+tj/ahgHPPg21gPxqEr51VXdGWft2EejcfAOGzvffw6zucb6fMNt/KLh3KAZjmvaIPGv00wDtOTE3X9yqUvivrTh3SuX7lU05MXtySBbnBe0QadvDNQ9fEf8jg4Gn61pLf3BjU9OVFXf/qxgZEBnOW8ommdjQEA4Lfp5McEAMBvJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGKgQfuHx7W9N/jq77b3BrV/eHzOIwIYL+deO4mBhuwfHtf9p2u19Hi1tnbPboyt3UEtPV6t+0/XbAygM5x77SUGGnJwNKy37z7Um533de/Jrxtja3dQ956s1pud9/X23Yc6OBo2PFKA0XDutVfjMTAcDmtjY6OGw4u9+L93HjdmLtezB4s1Nzt1ujFevt453RBzs1P17MFi3Zi5POaRd1NXrivaxXX1fUZ97nVlPdowj0ZjYDgcVr/fr/n5+er3+xd2Qb91Hjevnd0Ydx89P7Mhbl4TAt+iK9cV7eK6Go1RnXtdWY+2zOOHk5OTk0beuao2NjZqfn7+9PWrV6/q1q1bTQ3nm33vPF6+3qm7j56fvv7HX/v15z/OjnSMSbpyXdEurqvR+t5zryvr0ZZ5NHpnoNfr1cLCQlVV3blzp3q9XpPD+WbfM4+t3UEtr6yf+dnyyvoXD9fw23XluqJdXFejM4pzryvr0ZZ5NHpnoOrjLZLNzc3q9Xo1MTHR5FC+y7fM49OHZuZmp+r
h0u1aXln3UcEIdOW6ol1cV99vlOdeV9ajDfNoPAZSbe99/BrN5xvg842y8ouHCIFucO61V+PfJkg1PTlR169c+qKEP3245vqVSzU9eXFrF+BTzr32cmegQfuHx3VwNPxqAW/vDWp6cqKu/vRjAyMDGA/nXjuJAQAI52MCAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAI9/8lWKkDA/pOhQAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, axs = plt.subplots(len(kmeans_values), 1, sharex=True)\n", + "for i, centroids in enumerate(kmeans_values):\n", + " ax = axs[i]\n", + " ax.scatter(centroids, [i]*len(centroids), marker='x')\n", + " ax.scatter(points, [i]*len(points), s=2, color=\"black\")\n", + " ax.axis('off')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "geniomhe-ml", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/preamble.tex b/preamble.tex index e69de29..d3a4468 100644 --- a/preamble.tex +++ b/preamble.tex @@ -0,0 +1,2 @@ +\usepackage{mus-learn} +\usepackage{xfrac} \ No newline at end of file diff --git a/references.bib b/references.bib new file mode 100644 index 0000000..0f46426 --- /dev/null +++ b/references.bib @@ -0,0 +1,25 @@ + +@book{geron_hands-machine_2019, + edition = {2}, + title = {Hands-On Machine Learning with Scikit-Learn, Keras, and {TensorFlow}}, + abstract = {Through a recent series of breakthroughs, deep learning has boosted the entire field of machine learning. 
Now, even programmers who know close to nothing about this technology can use simple, … - Selection from Hands-On Machine Learning with Scikit-Learn, Keras, and {TensorFlow}, 2nd Edition [Book]}, + publisher = {O'{REILLY}}, + author = {Géron, Aurélien}, + date = {2019}, + langid = {english}, + note = {{ISBN}: 9781098125974} +} + +@collection{witten_data_2011, + location = {Boston}, + edition = {4}, + title = {Data Mining: Practical Machine Learning Tools and Techniques}, + isbn = {978-0-12-374856-0}, + series = {The Morgan Kaufmann Series in Data Management Systems}, + publisher = {Morgan Kaufmann}, + editor = {Witten, Ian H. and Frank, Eibe and Hall, Mark A.}, + urldate = {2023-06-16}, + date = {2011-01-01}, + langid = {english}, + doi = {10.1016/B978-0-12-374856-0.00018-3} +} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..9bad371 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +scikit-learn +numpy \ No newline at end of file