machine-learning/content/chapters/1.tex

168 lines
4.2 KiB
TeX

\chapter{Unsupervised Learning}
\begin{definition}[Precision Medicine]
Design of treatment for a given patient, based on genomic data.
\end{definition}
\begin{definition}[Hierarchical clustering]
\end{definition}
Gene expression time series: look for genes with similar expression footprint.
\paragraph{Representation of data}
\begin{itemize}
\item Tables;
\item Trees / Graphs;
\item Time series...
\end{itemize}
\begin{figure}
\includestandalone{figures/plots/genes_expression_timeseries}
\caption{Example of gene expression time series}
\end{figure}
\section{Distances and Similarities}
\begin{property}[Distance]
\begin{description}
\item[non-negativity] $d(i, j) \geq 0$
\item[isolation] $d(i, i) = 0$
\item[symmetry] $d(i, j) = d(j, i)$
\item[triangular inequality] $d(i, j) \leq d(i, h) + d(h, j)$
\end{description}
\end{property}
\begin{definition}[Dissimilarity]
Distance without triangular inequality.
\end{definition}
\begin{definition}[Similarity]
Function $s$ from $X \times X$ to $\RR_+$ such that:
\begin{enumerate}
\item $s$ is symmetric: $(x, y) \in X \times X; s(x, y) = s(y, x)$
\item $(x, y) \in X \times X; s(x, x) = s(y, y) > s(x, y)$.
\end{enumerate}
\end{definition}
\begin{exercise}
Let $d(x, y)$ be the distance, $d(x, y) \in [0, +\infty[$.
What should be the similarity measure $S(x, y) = f(d(x, y))$ that satisfies the following property:
\[
(x, y) \in X \times X \: | \: S(x, y) > S(x, y)
\]
having $S(x, y) \leq M$, $S(x, y) \in ]0, M]$.
\end{exercise}
$d(x, y) \geq 0 \: \forall (x, y)$
\begin{equation}
S(x, y) = \frac{M}{d(x, y) + 1}
\label{eq:similarity-first}
\end{equation}
In \cref{eq:similarity-first}, $S(x, y)$ ranges from 0 to M.
\begin{eqnarray}
\lim_{n \to \infty} \frac{M}{n + 1} = 0 && \lim_{n \to 0} \frac{M}{n + 1} = M
\end{eqnarray}
\section{Data Representation}
\paragraph{Data matrix}
\paragraph{Distance matrix}
\[
\begin{bmatrix}
0 \\
d(2, 1) & 0 \\
d(3, 1) & d(3, 2) & 0 \\
\vdots & \vdots & \ddots \\
d(n, 1) & d(n,2) & \dots & \dots & 0
\end{bmatrix}
\]
\begin{table}
\centering
\begin{tabular}{c|cc}
&$s_{1}$ & $s_{2}$ \\
\hline
$p_{1}$ & 0 & 1 \\
$p_{2}$ & 1 & 0 \\
$p_{3}$ & 3 & 2 \\
\end{tabular}
\caption{Example data matrix: 2 symptoms for 3 patients.}
\end{table}
\begin{definition}[Minkowski distance]
\[
L_p (x, y) = \left(\abs{x_1 - y_1}^p + \abs{x_2 - y_2}^p + \ldots + \abs{x_d - y_d}^p\right)^{\sfrac{1}{p}} = \left(\sum_{i=1}^d \left(x_i - y_i\right)^p\right)^{\sfrac{1}{p}}
\]
where $p$ is a positive integer.
\end{definition}
\begin{definition}[Manhattan distance]
\[
L_1(x, y) = \sum_{i=1}^d \abs{x_i - y_i}
\]
\end{definition}
\begin{definition}[Euclidian distance]
Let $A$ and $B$ be two points, with $(x_{A}, y_{A})$ and $(x_{B}, y_{B})$ their respective coordinates,
\end{definition}
If $p=2$, $L_2$ is the Euclidian distance:
\begin{definition}[Euclidian distance]
\[
d(x, y) = \sqrt{\abs{x_1 - y_1}^2 + \abs{x_2 - y_2} + \ldots + \abs{x_d - y_d}^2}
\]
\end{definition}
We can add weights
\subsection{K-means}
The cost function is minimized:
\[
Cost(C) \sum_{i=1}^{k}...
\]
\begin{algorithm}[H]
Choose the number of clusters $k$.
Choose randomly $k$ means.
For each point, compute the distance between the point and each means.
We allocate the point to the cluster represented by the clostest center.
We set each means to the center of the cluster, and reiterate.
\caption{$K$-means algorithm}
\end{algorithm}
\begin{exercise}
We have six genes:
\begin{table}[H]
\centering
\begin{tabular}{ccccccc}
\toprule
& $g_{1}$ & $g_{2}$ & $g_{3}$ & $g_{4}$ & $g_{5}$ & $g_{6}$ \\
\midrule
$\times 10^{-2}$ & 10 & 12 & 9 & 15 & 17 & 18 \\
\bottomrule
\end{tabular}
\caption{Sample values for six gene expressions.}
\end{table}
With $k=2$ and $m_{1} = 10 \cdot 10^{-2}$ and $m_{2} = 9 \cdot 10^{-2}$ the two initial randomly chosen means, run the $k$-means algorithm.
\end{exercise}
\begin{figure}
\centering
\includegraphics[scale=1]{figures/plots/kmeans.pdf}
\caption{$k$-means states at each of the 3 steps}
\end{figure}