From 7f89b6c8429769317c06b0b1fbeead615bb98a74 Mon Sep 17 00:00:00 2001 From: Samuel Ortion Date: Wed, 27 Sep 2023 18:02:07 +0200 Subject: [PATCH] feat: Add a pgf-plot scikit-learn KMeans figure --- .gitattributes | 1 + .vscode/settings.json | 3 - Makefile | 6 + content/chapters/1.tex | 168 ++++++++++++++++++ content/chapters/include.tex | 4 +- content/chapters/part1/1.tex | 0 content/genes_expression_timeseries.tex | 19 ++ figures/euclidian_distance.tex | 19 ++ figures/plots/.gitattributes | 2 + figures/plots/genes_expression_timeseries.pdf | 3 + figures/plots/genes_expression_timeseries.tex | 45 +++++ figures/plots/kmeans.pdf | 3 + figures/plots/kmeans.tex | 54 ++++++ content/chapters/part1/0.tex => glossary.tex | 0 main.pdf | Bin 7603 -> 131 bytes main.tex | 75 +++++--- notebooks/kmeans1d.ipynb | 109 ++++++++++++ preamble.tex | 2 + references.bib | 25 +++ requirements.txt | 2 + 20 files changed, 508 insertions(+), 32 deletions(-) create mode 100644 .gitattributes delete mode 100644 .vscode/settings.json create mode 100644 Makefile create mode 100644 content/chapters/1.tex delete mode 100644 content/chapters/part1/1.tex create mode 100644 content/genes_expression_timeseries.tex create mode 100644 figures/euclidian_distance.tex create mode 100644 figures/plots/.gitattributes create mode 100644 figures/plots/genes_expression_timeseries.pdf create mode 100644 figures/plots/genes_expression_timeseries.tex create mode 100644 figures/plots/kmeans.pdf create mode 100644 figures/plots/kmeans.tex rename content/chapters/part1/0.tex => glossary.tex (100%) create mode 100644 notebooks/kmeans1d.ipynb create mode 100644 references.bib create mode 100644 requirements.txt diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..defcde4 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +main.pdf filter=lfs diff=lfs merge=lfs -text diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 9d14cfb..0000000 --- 
a/.vscode/settings.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "ansible.python.interpreterPath": "/bin/python" -} \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..a84950a --- /dev/null +++ b/Makefile @@ -0,0 +1,6 @@ +options=-shell-escape -file-line-error + +all: main.pdf + +%.pdf: %.tex + lualatex $(options) $< diff --git a/content/chapters/1.tex b/content/chapters/1.tex new file mode 100644 index 0000000..cf88ac7 --- /dev/null +++ b/content/chapters/1.tex @@ -0,0 +1,168 @@ +\chapter{Unsupervised Learning} + +\begin{definition}[Precision Medicine] + Design of treatment for a given patient, based on genomic data. +\end{definition} + +\begin{definition}[Hierarchical clustering] +\end{definition} + +Gene expression time series: look for genes with similar expression footprint. + +\paragraph{Representation of data} + +\begin{itemize} + \item Tables; + \item Trees / Graphs; + \item Time series... +\end{itemize} + +\begin{figure} + \includestandalone{figures/plots/genes_expression_timeseries} + \caption{Example of gene expression time series} +\end{figure} + +\section{Distances and Similarities} + +\begin{property}[Distance] + \begin{description} + \item[non-negativity] $d(i, j) \geq 0$ + \item[isolation] $d(i, i) = 0$ + \item[symmetry] $d(i, j) = d(j, i)$ + \item[triangular inequality] $d(i, j) \leq d(i, h) + d(h, j)$ + \end{description} +\end{property} + +\begin{definition}[Dissimilarity] + Distance without triangular inequality. +\end{definition} + +\begin{definition}[Similarity] + Function $s$ from $X \times X$ to $\RR_+$ such that: + \begin{enumerate} + \item $s$ is symmetric: $(x, y) \in X \times X; s(x, y) = s(y, x)$ + \item $(x, y) \in X \times X; s(x, x) = s(y, y) > s(x, y)$. + \end{enumerate} +\end{definition} + +\begin{exercise} + + Let $d(x, y)$ be the distance, $d(x, y) \in [0, +\infty[$. 
+ + What should be the similarity measure $S(x, y) = f(d(x, y))$ that satisfies the following property: + \[ + (x, y) \in X \times X \: | \: S(x, x) > S(x, y) + \] + having $S(x, y) \leq M$, $S(x, y) \in ]0, M]$. +\end{exercise} +$d(x, y) \geq 0 \: \forall (x, y)$ +\begin{equation} + S(x, y) = \frac{M}{d(x, y) + 1} + \label{eq:similarity-first} +\end{equation} +In \cref{eq:similarity-first}, $S(x, y)$ ranges from 0 to M. +\begin{eqnarray} + \lim_{n \to \infty} \frac{M}{n + 1} = 0 && \lim_{n \to 0} \frac{M}{n + 1} = M +\end{eqnarray} + + +\section{Data Representation} + +\paragraph{Data matrix} + + +\paragraph{Distance matrix} + +\[ + \begin{bmatrix} + 0 \\ + d(2, 1) & 0 \\ + d(3, 1) & d(3, 2) & 0 \\ + \vdots & \vdots & \ddots \\ + d(n, 1) & d(n,2) & \dots & \dots & 0 + \end{bmatrix} +\] + + +\begin{table} + \centering + \begin{tabular}{c|cc} + &$s_{1}$ & $s_{2}$ \\ + \hline + $p_{1}$ & 0 & 1 \\ + $p_{2}$ & 1 & 0 \\ + $p_{3}$ & 3 & 2 \\ + \end{tabular} + \caption{Example data matrix: 2 symptoms for 3 patients.} +\end{table} + + + +\begin{definition}[Minkowski distance] + \[ + L_p (x, y) = \left(\abs{x_1 - y_1}^p + \abs{x_2 - y_2}^p + \ldots + \abs{x_d - y_d}^p\right)^{\sfrac{1}{p}} = \left(\sum_{i=1}^d \abs{x_i - y_i}^p\right)^{\sfrac{1}{p}} + \] + where $p$ is a positive integer. +\end{definition} + +\begin{definition}[Manhattan distance] + \[ + L_1(x, y) = \sum_{i=1}^d \abs{x_i - y_i} + \] +\end{definition} + +\begin{definition}[Euclidian distance] + Let $A$ and $B$ be two points, with $(x_{A}, y_{A})$ and $(x_{B}, y_{B})$ their respective coordinates, +\end{definition} + +If $p=2$, $L_2$ is the Euclidian distance: +\begin{definition}[Euclidian distance] + \[ + d(x, y) = \sqrt{\abs{x_1 - y_1}^2 + \abs{x_2 - y_2}^2 + \ldots + \abs{x_d - y_d}^2} + \] +\end{definition} + +We can add weights + +\subsection{K-means} + +The cost function is minimized: +\[ + Cost(C) = \sum_{i=1}^{k}... +\] + +\begin{algorithm}[H] + Choose the number of clusters $k$. 
+ + Choose randomly $k$ means. + + For each point, compute the distance between the point and each mean. + We allocate the point to the cluster represented by the closest center. + + We set each mean to the center of its cluster, and reiterate. + \caption{$K$-means algorithm} +\end{algorithm} + + +\begin{exercise} + We have six genes: + \begin{table}[H] + \centering + \begin{tabular}{ccccccc} + \toprule + & $g_{1}$ & $g_{2}$ & $g_{3}$ & $g_{4}$ & $g_{5}$ & $g_{6}$ \\ + \midrule + $\times 10^{-2}$ & 10 & 12 & 9 & 15 & 17 & 18 \\ + \bottomrule + \end{tabular} + \caption{Sample values for six gene expressions.} + \end{table} + + With $k=2$ and $m_{1} = 10 \cdot 10^{-2}$ and $m_{2} = 9 \cdot 10^{-2}$ the two initial randomly chosen means, run the $k$-means algorithm. +\end{exercise} + +\begin{figure} + \centering + \includegraphics[scale=1]{figures/plots/kmeans.pdf} + \caption{$k$-means states at each of the 3 steps} +\end{figure} \ No newline at end of file diff --git a/content/chapters/include.tex b/content/chapters/include.tex index dd55602..bc8b4bb 100755 --- a/content/chapters/include.tex +++ b/content/chapters/include.tex @@ -11,9 +11,7 @@ } } - -\includechapters{part1}{2} - +\includechapters{}{2} % \includechapters{part2}{2} diff --git a/content/chapters/part1/1.tex b/content/chapters/part1/1.tex deleted file mode 100644 index e69de29..0000000 diff --git a/content/genes_expression_timeseries.tex b/content/genes_expression_timeseries.tex new file mode 100644 index 0000000..a3ba271 --- /dev/null +++ b/content/genes_expression_timeseries.tex @@ -0,0 +1,19 @@ +\documentclass[tikz,a4paper]{standalone} + +\usepackage{tikz} + +\begin{document} + +\usetikzlibrary{datavisualization} + +\begin{tikzpicture} + \datavisualization[visualize as smooth line] + data { + x, y + 2, 1, + 3, 2, + 4, 1.5 + }; +\end{tikzpicture} + +\end{document} diff --git a/figures/euclidian_distance.tex b/figures/euclidian_distance.tex new file mode 100644 index 0000000..2d36285 --- /dev/null 
+++ b/figures/euclidian_distance.tex @@ -0,0 +1,19 @@ + + + +\documentclass[tikz]{standalone} + +\usepackage{tikz} +\usepackage{tkz-euclide} + +\begin{document} + +\begin{tikzpicture}[scale=1] + \tkzInit[xmax=5,ymax=5] + \tkzDrawX[>=latex] + \tkzDraw[>=latex] + \tkzDefPoints() + +\end{tikzpicture} + +\end{document} diff --git a/figures/plots/.gitattributes b/figures/plots/.gitattributes new file mode 100644 index 0000000..2d5cb21 --- /dev/null +++ b/figures/plots/.gitattributes @@ -0,0 +1,2 @@ +genes_expression_timeseries.pdf filter=lfs diff=lfs merge=lfs -text +kmeans.pdf filter=lfs diff=lfs merge=lfs -text diff --git a/figures/plots/genes_expression_timeseries.pdf b/figures/plots/genes_expression_timeseries.pdf new file mode 100644 index 0000000..142c73c --- /dev/null +++ b/figures/plots/genes_expression_timeseries.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b59f5adc3524efcc205865151a59c13980b04d5756e83b062e6c7e20bc644018 +size 1128 diff --git a/figures/plots/genes_expression_timeseries.tex b/figures/plots/genes_expression_timeseries.tex new file mode 100644 index 0000000..f04e3db --- /dev/null +++ b/figures/plots/genes_expression_timeseries.tex @@ -0,0 +1,45 @@ +\documentclass[tikz]{standalone} +\usepackage{tikz} + +\begin{document} + +\usetikzlibrary{datavisualization} +\begin{tikzpicture} + + \datavisualization data group {genes} = { + data[set=gene1] { + x, y + 0, 1, + 1, 2, + 2, 1.5 + } + data[set=gene2] { + x, y + 0, 1.5, + 1, 2.25, + 2, 1.75 + } + data[set=gene3] { + x, y + 0, 0.25, + 1, 0.26, + 2, 0.7 + } + data[set=gene4] { + x, y + 0, 0.5, + 1, 0.25, + 2, 1 + } + }; + \datavisualization [ + school book axes, all axes={unit length=7.5mm}, + visualize as smooth line/.list={gene1, gene2, gene3, gene4}, + style sheet=strong colors, + x axis={label=$t$}, + y axis={label={expression}}] +data group {genes}; + +\end{tikzpicture} + +\end{document} diff --git a/figures/plots/kmeans.pdf b/figures/plots/kmeans.pdf new file mode 
100644 index 0000000..56a73fa --- /dev/null +++ b/figures/plots/kmeans.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd507e1c6741f48c32c531b1159fe1d777453b2e3638331427ba045e399f426d +size 1562 diff --git a/figures/plots/kmeans.tex b/figures/plots/kmeans.tex new file mode 100644 index 0000000..09e2f36 --- /dev/null +++ b/figures/plots/kmeans.tex @@ -0,0 +1,54 @@ +\documentclass[margin=0.5cm]{standalone} +\usepackage{tikz} +\usepackage{pyluatex} +\usepackage{pgf} + +\begin{document} +\begin{python} +# %% +import io + +import numpy as np +import matplotlib.pyplot as plt +from sklearn.cluster import KMeans + +# %% + + +data = """g1,10 +g2,12 +g3,9 +g4,15 +g5,17 +g6,18""" + +points =[int(row.split(",")[1]) for row in data.split("\n")] +X = np.array([[point] for point in points]) +initial_means = [[10], [9]] +points + +# %% +kmeans_values = [] +for i in range(1,4): + kmeans = KMeans(n_clusters=2, random_state=42, max_iter=i, init=initial_means, n_init=1) + kmeans.fit(X) + kmeans_values.append(kmeans.cluster_centers_) + + +# %% +fig, axs = plt.subplots(len(kmeans_values), 1, sharex=True) +for i, centroids in enumerate(kmeans_values): + ax = axs[i] + ax.scatter(centroids, [i]*len(centroids), marker='x') + ax.scatter(points, [i]*len(points), s=2, color="black") + ax.axis('off') + + +with io.StringIO() as file: + fig.savefig(file, format="pgf", bbox_inches="tight", pad_inches=0.1) + print(file.getvalue()) +\end{python} +\begin{tikzpicture} + +\end{tikzpicture} +\end{document} \ No newline at end of file diff --git a/content/chapters/part1/0.tex b/glossary.tex similarity index 100% rename from content/chapters/part1/0.tex rename to glossary.tex diff --git a/main.pdf b/main.pdf index edded294b4d7dd3df961f4358cdef81793fb083b..d3d41e2c14831f8ad61effce1b80883f5c11cbec 100644 GIT binary patch literal 131 zcmWN_K@Ni;5CFhCuiyg}T$V-r4XjHcX^Sc9p|7u}N#>I8X#Jz>aqMF?=Gl(NtBmQk zo_XT_He(ky*Q|Qm^r#6ut-<0#P;#Pz?9ryhK@i78Fo!7FL-JO#+x&gbCgc(waEOs| 
O03_AEez@muWe0XP-&>S0|W}-RD#-D!mI#XAOV1=DBzz3!v*FHHMPU=mu*>?jm zW!!Uuh(9Q(8z&)?p#FTxJfdeX+K}b-$XgF>l1RZsMDW`=i7>mRF1`xPnJB{gsPm0Z z@B>}I+!Y`jh;A(%z|?7`S~=mj>jIfd6GX48^qYz0dX@E>ro2Qqw%&e`J*#c*s~A2} z=Bt1ZCTFfnrIzRiR3gTXH>N2~iS-(W4RLZGm|M=il&!0Z*K7r3{;D3s)nyno*I9Z+ zFs*e4UR%c%MV_0i5oNj~6VRx~OBfcIN#%Fjwh;2%I&HpAn7RX3eVJ=bq!L>l3nM>v z)p>&fwTJxkCE=~#9s=?Gx31#(!&892TxD)zvSMPQW}-Ud7O*Ogo{01W=NFRT-zeh$ zLy?B78O+lWJ`Ymz(y|WrFim&|DD{8L6%U`_UncxzK&7^|CV>u7Rq-x|Yz<@HPvp-^ zTJiYYXox`(2EuCM^b#GoAkYVsXM+ryQVszMka+t~RD&!mXjoX1#8DM}I67$5YLd}T z-Bu8;2y;i-L?s69kNxH;HiPm*X=(TAo+U%&FOL@QCf(b;mqk=UT0WW4#9J+WO6|W; z%WK1pAKk;GCnwn;u2h&RVdWhfyl5qUWOFRJz9D&xep_Pbq5p%sifO{=zVY+^4}}R{ zM=%}&KHr(7F4|=O_rl42bna_?sads>i0&)4U3#!9s#H5?8-t|irUKcH+ z-f+IYm;CUOTPHC+Gv7Rji1T%Azu-p`$@?@%XqUZfqnkTR0R%tI(Vt-~CqmuwS$S!6 zQ##bGR6S`|KKf-ugqj`nTt8WMfbPx2tA7YNEPcbVru8g@iTUHuDonJI-Nc8<^o;vG7V3MT&3#m4BpV(X}ZDQz5$pn*!SFE zu!J17-+f63WeiI=}nWuMzX%bGfjlApPha_>7nxQxiPsQal#}L>s?(fj`T}+?~S+`x%WYj^7p6~ zNRj<|-0Xt`u4$fhB6HlxQ!^0-<7a2wIBXv1XK34vSX#_pR30>@5bfcf*GmUn2IPbC ziBmAD$uEN?yvVvE2Hv`L`5F2GQnIaYNiQ+280Uu(>w;4{%U=DWk!w26sCxgJSVcvo z7rLHD&-;bvsKa}}rb<1A0wZpZJtdQdtcIJ4I;EYihO+tzWd!1k?CxZ35S>>s!7%bX zvEZsfct9`=6TyXba5bua6xTlqbuE{(+$^&#o-28>|^eB0n ze5?(8%S2t9_DC*u9jqjd)luZ$1f+b=@iT`FNkM|XhHsCtu&kO!fQUJW&y5%-hwVQq z(Z6TqXE-c!*s`m66(?G@IV)4YDa3qoVQx)j9iZ=fg0O}`}O;(PqK&ca=Pv6yCaUx@pz=`DC6uvo}U6m=@I2j9vH11T+q@*g6}Uoh@O4Ya40MD*eF^Bkg zR6f>5g_T@yFrS?7AF~Q7iAR~>U`XtbN}eOC=X>l#A(}E_1}{0HFG$=>e;dW+5$mu- zIK9UgMjMRFZhhpXGnSTB&^Fngwu}l1o)e|sS~pFn_)hE>z7Tgu!?_W-<+uK<$Z;w- zd?Z-kS2Y`vrmj(AenQAu6QaH_%FUJ~Xyy6HE<@Ek0+P#uwffd){Pbt#uZcOFL7+(> z;L>C)JQx!X=C<=Ee5}|mtb&|( zkvGC5)a|+CKYr#Xpsqisp8^nvZ%@%WS&C`;aB*ES8`{58bk|nDtL)H}dVxrY#S`64 zE-oVxf!^vFpb(W{xNy0*>VX%bgUUXmO|_gM?#}2&{QBDghZq0H#+iepD)Rx0-t_4y zj@T$+7+1<1Yza0wwrNqy@kBfxJGzA@vfCxFrnq*(t2uJ-Ry<*ZJLT3J?~~uyMhaMiip}F%f0mW zvjR>BE)&!;Yx|$_YO11inDbMEW)%F3dlOI58b-!w)s zzNjo|6X0_`(CDosED&u+e}|EW=BzCFO3g?^(a5``QQO!SH3e^Pz3nbH=qK576R-rr 
z{W)>ytqS|M`exks8C(i<-fRqtdHK1BJgP@>Vn+tsQQ6NGBNiL;Z)j5(NGitrdC!^N z=J^bbe3-J>EDbsg69>rvF*6@Fzc^TvHY55sW7B7TO}pb~?0g83UwcA@p!4lwEI~vD zgyOx3fh`@3i6^5lpU&wsGq)yt|Abh=V&i@q<1J?!GrwLhD(;1G~9;K;_mZeqydjJ0R+OzKW zeU(OaT^=@XgWB($ED@8CNBf_~B7Zxu*3=$|c-Zd01$R)Ev}_glisQZ=2&t>>9O3VD zNH-D0p2u=T(q04zd(b|%W_AzSzL-(2s>e|MvK9QvP$qBIiY1pQ7X$r-0yLyV1hwT+2Dt=Cv_e`&@7OF)lv?@f|{81`~h{*~R!O~!i z7NJ&zNavkYhkx!^AT|e2*;%WBF!lWKDG+L zdN5)b1G~KL<(^!kCXS+PBlgWzGLPG{fiT{g>yx{oaM{zH)peXUG~X_uuUYk1S@$sW zREL3=Br)3Q?u2D-;Y_NSHfNYrvS&z*z@aFuJjk8ZVCdR3cxC0bX_t^w{Xy4|l7#9+ zrCHRT|Ff{q{GqE>nE?8g$8J>-dfGFg@0T_Qk4{0!tJ#>MqYgYV6MLjWZZK}9^`Y*0isD;a=xqJg)}0?_ zzB}!RoNM687lz8+k;w}}BdLxcHl#qyjQJsJ9c{Gu3~}lTmihTbwR>br2VTiF!Qzw_ zqL~T$R~lu{P!)wxQ#zf^KeCC_{~9Be$r=io0=D)=Nv!1*IYq23FqqjAS-wk1E}^?h zVklG=QTEcud7z=$-sl~N$eb(at0cKJGU$6NFb^cDErc;KFpfsodrC^r@zCqX@logR zS$|Z!sE;k!bJ6eik(Hj@PNIjB4)zcC#EfUf938A@f!vPn5l-*c-DFwrMa>(kjFQ?P zRw;fFzBq>Pn_xZe{StSJ2GMB#o$h z=8(&KR^-|^x&Eyy;GQ@&lRO`_xaM1HWH2%9TgQ4rTf&0IxpAlZf~Do1#f!PuLKil! zTYF9CE{7Z^@SgkhkZ;}D!O-28JCWhL)*~U#$hW;{k@?u~b&@I#jMI$N^*i}?*`rQf zSW;2n>p7@UR^h!Me6hY!7#D~Zn~Rvnph^=+)NS9g(`x7=h7Y!o!)QxQ77hYfo##!6 zo=L?q__N)FE+$zPV}9L;rLE;bmU$s7x1q6d#1u9pllKOh4zn><_&751=G-;Lwyt1QdY>KDD<+I%as+DYGG>#X17U};n@?oyTQ_D;fy4@o zp7$N*lK-q2*z-`3+K*;|9%|2q2RfUT6#$jZN(U(I1tUZ%rZo*%%{QCe*kX|N~b$~_3Cet7b zCi12PD_X;WONi$YcVjw?Yig;GpOe43K`B14MnD~=e&6;kdIS_-Hv2~&u>PP zw}&*T=Z>OnaZx|vTv99-7h$yaZjbRB?FFy5QcrEh?qPj*VVYV?@UoO)x3`l=`E z=vNvzqzd>oi=A9D-E1AFjaaD`rw%R-4O^V;oGhcgLl|Yl(AV^CR{b1))_5l`=Utm3 z9^yf|odRsnEC1XK0%6pfg8WS^OO}`CB(%2`{TlLo?)4Bw zpEdx=rzDX^x)!&o0y+u+9=V(S)lkgD z18(&0f|k-;^-3V~A)r&msSPQ~tTAUj6y~R+>B)$IS!(Hr>SyNmRcAUt`zPd%%JgyVMAsVqQ6coBS}4zk9fwL zBSDa#&L%*`A~uA|Brev5k_2(mg&rpx@F!%nY_6PT`<_=enjlA7=V&ba>S)d+PMOlS7c6o(o0uL;w@ z`z8MI`Ew(uWb@nW{VC*=*{mxGRIQ&n64o^qdcmv7{OSEE)K!dSsN~gX`~rd!Yc7(D z=HV)M(?i>m*0rsNPQn3d9P}6cPc?oyJVIo5jr41fMVF!rKoIF1)?vhbB!mgf5#7_R04|q*;4q`r@OOj@Sfy=>UOTd)bh&>8`XVE ztvTVdqN_!^Y@!oq1nj^l5kG%!6mF1v$MQHOTF7Vl1#TZDGfDv)A;kr7<}ah#KXW^v}ud60Y+r%PxJ&gUr 
zAa?3;gvffL=)-2i9#`))y6E_NE_x;Dz)!oJIAjAdeTj5Oa=y#dp31#Q;$sz4ZZ;2y zUiC-XPV8#ktm@I$e-MCNZ7_H<1xf2OJ$ zE;2lM%r-mbKjM~rJsw{kY|4@w&!S%Eoh{y8Ru;?;E3(L{S+j!*lEHv!Jc!c!q6NE{F|W3^GDG9PqCSY z>)&vy2`ax4Tkp{=L0!1Gmm|B-V*t&li-|34}cf@dr|E4+`ncp|z%1 ze7|l6t+KOuZ!fiFRmLcQ=?gA1rfZ9~n-}JTrd1hO95h%kFGT`Vrc@_|Xn&Fd67FkL7t-Kph)W#H*&p z0c(v(v>KJ%IIz~@rUgNc@||LE94ffK7Kk@rjAzB_E7=GG9b=}vaff0y+~>ROy`(@l z>O4+YLY2QStuSW#wkVl7ocKfImcZ&II*+`321>dgk)ACG8uD@mB1U`tD0|+1a z$0MvA?4{vsJ%CC2B^Qv38_3Hg2!a=05G#<25eQ_2%PBiR{x1$U{ZiAd8+zK{;Hq_Y#t{m{c=D;Q3uc2h^1_i*) z;9_QmgTCL+`AwYj{^60|^fiD}#mq(nW(VK|!-aoS+0HI70Ek=QKeO5b0-(Q;W*bo{{ybzIPY`2g?Af{Xofw@%I zWDKnbO+rvWJBH!f2hnrHLsL5WF4}~CoGyG>JcAlWx1~0EOYl%c3(tm<6@o6uV1L?H zO4aU4E^l+|M;<|{M9k$H=1LAbDK$%h@ff}7Qu`G6Y9_JRNK5|85sQ|KU z%m}QznOJAPhfB)d>pL|Z4V8Es=S}=4y(Vw0g;^%2!4^#6w7ZRl6T5UI)P@ldwq~YCs{_UWY^;?tISE!KOkk zubnR&vbO?lxH8;YX79~@Q+w0C0%DWv<+x9dEy8TkXdBQU{>p`h$}ZZiZc5rqEkk1h z3fW8N0^UL#4`*!Xj*SlIEgA3wx86na@TFuGF%3xWH$UXL>ub zFGaJC(PWNOBe5H($8P%?X;^V@nw18*IJpZAX?*REULExdDv&MCiW$2<^+(b@H*kP3ABo69rf77|9vFv4pmu*-9dT4`+6&;^2qwi z(c1p}g&^^T`n2KAVezR5s+Lyh>l1*XC_!xh&#y`rTP*K3#VzNHUnt#J4~> zpOdGuhM~oi)l&xgGFVbH3^Z?6?76@UM8XYX-#cBHx&7Lm_pvg(=B)a}QTLpf*?2IB z^N_vj)_>jR7*l8VCux;}LsYqX8<9@V$2(@7qQ|6i^#^{P$Fb7`kbKQ*8O%7Rd_Gox zyp8ZpHjUuX8I-$~tGl~9#Ps5@r;t{dv|({wC&SK-HDrd^ncB6809^1BepTJU0S16aLLfl> z_bnC<0IuJA=e&5Fu7k`4Zaw+i9WvL5WgQSCuD2}94jsX~&xASm4R?HkQ+@UW|N9fp= zTiDZs^C@1Ag}E?Z$=MmnGx!i)jIqe$mbgOOQMmiHB5T%p=5xo{6QsENr+|NTqYKQ` W8Rh}s|1tQvc;WwO>19-8G5!ayT)+MR diff --git a/main.tex b/main.tex index 736a1b2..cf15132 100644 --- a/main.tex +++ b/main.tex @@ -1,46 +1,69 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% Course of None -% -% Author: Samuel ORTION -% Version: 0.0.1 +% Course on "Data-mining and Machine Learning" - GENIOMHE - M1-S1 +% +% Author: Samuel Ortion +% Version: 0.1.0 % Date: 2023 % Licence: CC-By-SA 4.0+ International %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\documentclass[ - 
a4paper, - 10pt, - fleqn, - oneside -]{talpa} +\documentclass[twoside=false,fontsize=10pt,fleqn]{scrbook} +\usepackage{mus} +\usepackage{standalone} +\titlehead{GENIOMHE} +\title{Data-mining and\newline{}Machine Learning} +\subtitle{} +\author{Samuel Ortion} +\date{Fall 2023} +\teacher{Farida Zerhaoui} +\cursus{GENIOMHE} +\university{Université d'Évry val d'Essonne -- Université Paris-Saclay} +\semester{M1 - S1} -\input{colors.tex} -\input{meta.tex} -\input{definitions.tex} +\input{definitions} +\input{preamble} \hypersetup{ - pdftitle={ - Course - None - }, - pdfauthor={ - Samuel Ortion - }, + pdftitle={Course - Data-mining and Machine Learning}, + pdfauthor={Samuel Ortion}, pdfsubject={}, - pdfkeywords={}, + pdfkeywords={GENIOMHE, Master, bioinformatics, machine learning, statistics, data}, pdfcreator={LaTeX} } -% \addbibressource{bibliography.bib} +\usepackage{ccicons} +\usepackage[ + type={CC}, + modifier={by-sa}, + version={4.0}, +]{doclicense} +\addbibresource{references.bib} \makeindex + \begin{document} -\tableofcontents +\setkomafont{fullpagetitle}{\fontsize{1.5cm}{3em}\fontseries{b}\selectfont} +\maketitlefullpage -% \input{content/introduction.tex} +{ + \hypersetup{ + linkcolor=black + } + \tableofcontents +} -\input{content/chapters/include.tex} +\doclicenseThis% -% \input{content/conclusion.tex} +% \input{content/introduction} -\end{document} \ No newline at end of file +\input{content/chapters/include} + +% \input{content/conclusion} + +\nocite{*} + +\printbibliography% +% \printglossary% + +\end{document} diff --git a/notebooks/kmeans1d.ipynb b/notebooks/kmeans1d.ipynb new file mode 100644 index 0000000..874fb3c --- /dev/null +++ b/notebooks/kmeans1d.ipynb @@ -0,0 +1,109 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import io\n", + "\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.cluster import KMeans" + ] + }, + { + "cell_type": "code", 
+ "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[10, 12, 9, 15, 17, 18]" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "\n", + "data = \"\"\"g1,10\n", + "g2,12\n", + "g3,9\n", + "g4,15\n", + "g5,17\n", + "g6,18\"\"\"\n", + "\n", + "points =[int(row.split(\",\")[1]) for row in data.split(\"\\n\")]\n", + "X = np.array([[point] for point in points])\n", + "initial_means = [[10], [9]]\n", + "points" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [], + "source": [ + "kmeans_values = []\n", + "for i in range(1,4): \n", + " kmeans = KMeans(n_clusters=2, random_state=42, max_iter=i, init=initial_means, n_init=1)\n", + " kmeans.fit(X)\n", + " kmeans_values.append(kmeans.cluster_centers_)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAgMAAAGFCAYAAABg2vAPAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAMpklEQVR4nO3dMU9Va77H8f+54cgRjBhsRpPDJDsZKS0GM+z2vgUL7Ow8mel4JSbTGC2spZj7Fm41kRAtKGUSEr0JTCMBIm6QnXALMxxRi3N0b9Zi/T6fbkPifh6f9Tz5Zu219YeTk5OTAgBi/VfTAwAAmiUGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgXCMxsH94XNt7g6/+bntvUPuHx+c8IqDtnBswPuceA/uHx3X/6VotPV6trd2zG3trd1BLj1fr/tM1Gxs45dyA8Tr3GDg4Gtbbdx/qzc77uvfk1429tTuoe09W683O+3r77kMdHA3Pe2hASzk3YLzOPQZuzFyuZw8Wa252qt7svK+lR/+s//nftVp69M96s/O+5man6tmDxboxc/m8h/ZdhsNhbWxs1HDoMGoD69Etn58b956s1svXO6chcF7nhuuqXbqyHm2YRyPPDNy89nFj/zwzWWt//1vd/e+/1Nrf/1Y/z0zWsweLdfPaxQuBfr9f8/Pz1e/3L/yFedFZj276z7nxnyC4++j5mRAY97nhumqXrqxHW+bR2L
cJbl67XMuLV+vDv/9VVVUf/v2vWl68euFCoKpqc3OzXrx4UVVVL168qM3NzYZHlM16dNfNa5fr4dLtMz97uHT7XM4N11W7dGU92jKPxmJga3dQD1f369If/lRVVZf+8Kd6uLr/xcNBF0Gv16uFhYWqqrpz5071er2GR5TNenTX1u6gllfWz/xseWX9XM4N11W7dGU92jKPH05OTk7O+00/fejn55nJWl68Wg9X9+v/9o7O7ZbfqA2Hw9rc3Kxer1cTExNNDyee9eieT8+Nudmperh0u5ZX1s/9owLXVXt0ZT3aMI9zj4HtvY9fA/p8A3++0Vd+uXgPEQLj4dyA8Tr3jwmmJyfq+pVLX5T8pw8HXb9yqaYnL27lAaPl3IDxauRjgv3D4zo4Gn614Lf3BjU9OVFXf/rxvIcFtJhzA8ankRgAANrDf1QEAOHEAACEEwMAEE4MAEA4MQAA4cQAAIQTAwAQTgwAQDgxAADhxAAAhBMDABBODABAODEAAOHEAACEEwMAEE4MAEA4MQAA4cQAAIQTAwAQTgwAQDgxAADhxAAAhBMDABBODABAODEAAOHEAACEEwMAEE4MAEA4MQAA4cQAAIQTAwAQTgwAQDgxAADhxAAAhBMDABBODABAODEAAOHEAACEEwMAEE4MAEA4MQAA4cQAAIQTAwAQTgwAQDgxAADhxAAAhBMDABBODABAODEAAOHEAACEEwMAEE4MAEA4MQAA4cQAAIQTAwAQTgwAQDgxAADhxAAAhBMDABBODABAODEAAOHEAACEEwMAEE4MAEA4MQAA4cQAAIQTAwAQTgwAQDgxAADhxAAAhBMDABBODABAODEAAOHEAACEEwMAEE4MAEA4MQAA4cQAAIQTAwAQTgwAQDgxAADhxAAAhBMDABBODABAODEAAOHEAACEEwMAEE4MAEA4MQAA4cQAAIQTAwAQTgwAQDgxAADhxAAAhBMDABBODABAODEAAOHEAACEEwMAEE4MAEA4MQAA4cQAAIQTAwAQTgwAQDgxAADhxAAAhBMDABBODABAODEAAOHEAACEEwMAEE4MAEA4MQAA4cQAAIQTAwAQTgwAQDgxAADhxAAAhOtsDOwfHtf23uCrv9veG9T+4fE5jwjg65xXNK2TMbB/eFz3n67V0uPV2to9u8G2dge19Hi17j9ds8GAxjmvaINOxsDB0bDevvtQb3be170nv26wrd1B3XuyWm923tfbdx/q4GjY8EiBdM4r2qDxGBgOh7WxsVHD4egu9Bszl+vZg8Wam5063WAvX++cbqy52al69mCxbsxcHtl7jmMefDvrwTh05bzqiq7s8zbMo9EYGA6H1e/3a35+vvr9/kj/Im5eO7vB7j56fmZj3bw22hAY1zz4/awH49CV86orurLP2zKPH05OTk4aeeeq2tjYqPn5+dPXr169qlu3bo30PV6+3qm7j56fvv7HX/v15z/OjvQ9zmMe/HbWg3HoynnVFV3Z522ZR6N3Bnq9Xi0sLFRV1Z07d6rX6430z9/aHdTyyvqZny2vrH/xkM73Gvc8+H2sB+PQlfOqK7qyz9syj0bvDFR9vEWyublZvV6vJiYmRvbnfvrwzdzsVD1cul3LK+tj/ahgHPPg21gPxqEr51VXdGWft2EejcfAOGzvffw6zucb6fMNt/KLh3KAZjmvaIPGv00wDtOTE3X9yqUvivrTh3SuX7lU05MXtySBbnBe0QadvDNQ9fEf8jg4Gn61pLf3BjU9OVFXf/qxgZEBnOW8ommdjQEA4Lfp5McEAMBvJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAI
BwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGKgQfuHx7W9N/jq77b3BrV/eHzOIwIYL+deO4mBhuwfHtf9p2u19Hi1tnbPboyt3UEtPV6t+0/XbAygM5x77SUGGnJwNKy37z7Um533de/Jrxtja3dQ956s1pud9/X23Yc6OBo2PFKA0XDutVfjMTAcDmtjY6OGw4u9+L93HjdmLtezB4s1Nzt1ujFevt453RBzs1P17MFi3Zi5POaRd1NXrivaxXX1fUZ97nVlPdowj0ZjYDgcVr/fr/n5+er3+xd2Qb91Hjevnd0Ydx89P7Mhbl4TAt+iK9cV7eK6Go1RnXtdWY+2zOOHk5OTk0beuao2NjZqfn7+9PWrV6/q1q1bTQ3nm33vPF6+3qm7j56fvv7HX/v15z/OjnSMSbpyXdEurqvR+t5zryvr0ZZ5NHpnoNfr1cLCQlVV3blzp3q9XpPD+WbfM4+t3UEtr6yf+dnyyvoXD9fw23XluqJdXFejM4pzryvr0ZZ5NHpnoOrjLZLNzc3q9Xo1MTHR5FC+y7fM49OHZuZmp+rh0u1aXln3UcEIdOW6ol1cV99vlOdeV9ajDfNoPAZSbe99/BrN5xvg842y8ouHCIFucO61V+PfJkg1PTlR169c+qKEP3245vqVSzU9eXFrF+BTzr32cmegQfuHx3VwNPxqAW/vDWp6cqKu/vRjAyMDGA/nXjuJAQAI52MCAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAIJwYAIJwYAIBwYgAAwokBAAgnBgAgnBgAgHBiAADCiQEACCcGACCcGACAcGIAAMKJAQAI9/8lWKkDA/pOhQAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, axs = plt.subplots(len(kmeans_values), 1, sharex=True)\n", + "for i, centroids in enumerate(kmeans_values):\n", + " ax = axs[i]\n", + " ax.scatter(centroids, [i]*len(centroids), marker='x')\n", + " ax.scatter(points, [i]*len(points), s=2, color=\"black\")\n", + " ax.axis('off')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "geniomhe-ml", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/preamble.tex b/preamble.tex index e69de29..d3a4468 100644 --- a/preamble.tex +++ b/preamble.tex @@ -0,0 +1,2 @@ +\usepackage{mus-learn} +\usepackage{xfrac} \ No newline at end of file diff --git a/references.bib b/references.bib new file mode 100644 index 0000000..0f46426 --- /dev/null +++ b/references.bib @@ -0,0 +1,25 @@ + +@book{geron_hands-machine_2019, + edition = {2}, + title = {Hands-On Machine Learning with Scikit-Learn, Keras, and {TensorFlow}}, + abstract = {Through a recent series of breakthroughs, deep learning has boosted the entire field of machine learning. 
Now, even programmers who know close to nothing about this technology can use simple, … - Selection from Hands-On Machine Learning with Scikit-Learn, Keras, and {TensorFlow}, 2nd Edition [Book]}, + publisher = {O'{REILLY}}, + author = {Géron, Aurélien}, + date = {2019}, + langid = {english}, + note = {{ISBN}: 9781098125974} +} + +@collection{witten_data_2011, + location = {Boston}, + edition = {4}, + title = {Data Mining - Practical Machine Learning Tools and Techniques}, + isbn = {978-0-12-374856-0}, + series = {The Morgan Kaufmann Series in Data Management Systems}, + publisher = {Morgan Kaufmann}, + editor = {Witten, Ian H. and Frank, Eibe and Hall, Mark A.}, + urldate = {2023-06-16}, + date = {2011-01-01}, + langid = {english}, + doi = {10.1016/B978-0-12-374856-0.00018-3} +} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..9bad371 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +scikit-learn +numpy \ No newline at end of file