\chapter{Linear Model}

\section{Simple Linear Regression}
\[ Y_i = \beta_0 + \beta_1 X_i + \varepsilon_i \]
\[ \Y = \X \beta + \varepsilon. \]
\[
\begin{pmatrix} Y_1 \\ Y_2 \\ \vdots \\ Y_n \end{pmatrix}
=
\begin{pmatrix} 1 & X_1 \\ 1 & X_2 \\ \vdots & \vdots \\ 1 & X_n \end{pmatrix}
\begin{pmatrix} \beta_0 \\ \beta_1 \end{pmatrix}
+
\begin{pmatrix} \varepsilon_1 \\ \varepsilon_2 \\ \vdots \\ \varepsilon_n \end{pmatrix}
\]

\paragraph*{Assumptions}
\begin{enumerate}[label={\color{primary}{($A_\arabic*$)}}]
  \item the $\varepsilon_i$ are independent;
  \item the $\varepsilon_i$ are identically distributed;
  \item the $\varepsilon_i$ are i.i.d.\ $\sim \Norm(0, \sigma^2)$ (normality, with the same variance $\sigma^2$ for all observations: homoscedasticity).
\end{enumerate}

\section{Generalized Linear Model}
\[ g(\EE(Y)) = X \beta \]
where the link function $g$ can be, for instance,
\begin{itemize}
  \item logistic regression: $g(v) = \log \left(\frac{v}{1-v}\right)$, used for binary (Boolean) responses;
  \item Poisson regression: $g(v) = \log(v)$, used for count (discrete) responses.
\end{itemize}

\subsection{Penalized Regression}
When the number of explanatory variables $p$ is large compared to the number of observations $n$, in particular when $p \gg n$, the parameters cannot be estimated by ordinary least squares. To estimate them, we add penalty terms to the criterion being minimized: Lasso regression, Elastic Net, etc.\ (a short numerical sketch is given at the end of this chapter).

\subsection{Statistical Analysis Workflow}
\begin{enumerate}[label={\bfseries\color{primary}Step \arabic*.}]
  \item Graphical representation;
  \item ...
\end{enumerate}
The model
\[ Y = X \beta + \varepsilon, \]
is written equivalently, for example with four observations and two explanatory variables, as
\[
\begin{pmatrix} y_1 \\ y_2 \\ y_3 \\ y_4 \end{pmatrix}
=
\begin{pmatrix}
  1 & x_{11} & x_{12} \\
  1 & x_{21} & x_{22} \\
  1 & x_{31} & x_{32} \\
  1 & x_{41} & x_{42}
\end{pmatrix}
\begin{pmatrix} \beta_0 \\ \beta_1 \\ \beta_2 \end{pmatrix}
+
\begin{pmatrix} \varepsilon_1 \\ \varepsilon_2 \\ \varepsilon_3 \\ \varepsilon_4 \end{pmatrix}.
\]

\section{Parameter Estimation}
\subsection{Simple Linear Regression}
\subsection{General Case}
If $\X^T\X$ is invertible, the ordinary least squares (OLS) estimator is:
\begin{equation}
  \hat{\beta} = (\X^T\X)^{-1} \X^T \Y.
\end{equation}

\subsection{Ordinary Least Squares Algorithm}
We want to minimize the distance between $\X\beta$ and $\Y$:
\[ \min_\beta \norm{\Y - \X\beta}^2 \]
(see \autoref{ch:elements-of-linear-algebra}). Let $W$ denote the subspace spanned by the columns of $\X$ (the constant column $\One$ and the explanatory variables).
\begin{align*}
  \Rightarrow& \X \hat{\beta} = \mathrm{proj}^{W} (\Y)\\
  \Rightarrow& \forall v \in W,\, v^T \Y = v^T\, \mathrm{proj}^{W}(\Y)\\
  \Rightarrow& \text{for every column } \X_i: \\
  & \X_i^T \Y = \X_i^T \X\hat{\beta} \qquad \text{where $\hat{\beta}$ is the estimator of $\beta$} \\
  \Rightarrow& \X^T \Y = \X^T \X \hat{\beta} \\
  \Rightarrow& {\color{gray}(\X^T \X)^{-1}} \X^T \Y = {\color{gray}(\X^T \X)^{-1}} (\X^T\X) \hat{\beta} \\
  \Rightarrow& \hat{\beta} = (\X^T\X)^{-1} \X^T \Y
\end{align*}
This formula comes from the orthogonal projection of $\Y$ onto the vector subspace spanned by the explanatory variables $\X$: $\X \hat{\beta}$ is the closest point to $\Y$ in the subspace generated by $\X$. If $H$ is the projection matrix onto the subspace generated by $\X$, then $H\Y$ is the projection of $\Y$ onto this subspace, which corresponds to $\X\hat{\beta}$.

\section{Coefficient of Determination: \texorpdfstring{$R^2$}{R\textsuperscript{2}}}
\begin{definition}[$R^2$]
  \[
    0 \leq R^2 = \frac{\norm{\X\hat{\beta} - \bar{\Y}\One}^2}{\norm{\Y - \bar{\Y}\One}^2} = 1 - \frac{\norm{\Y - \X\hat{\beta}}^2}{\norm{\Y - \bar{\Y}\One}^2} \leq 1
  \]
  It is the proportion of the variation of $\Y$ explained by the model.
\end{definition}
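As a minimal numerical sketch of the two formulas above (the simulated data and variable names below are chosen purely for illustration, and \texttt{numpy} is assumed available), the OLS estimator and the corresponding $R^2$ can be computed directly in Python:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)

# Simulated data: n observations, one explanatory variable plus an intercept.
n = 50
x = rng.uniform(0, 10, size=n)
X = np.column_stack([np.ones(n), x])   # design matrix with a column of ones
beta_true = np.array([1.0, 2.0])
y = X @ beta_true + rng.normal(0, 1.0, size=n)

# OLS estimator: solve the normal equations X^T X beta = X^T y,
# i.e. beta_hat = (X^T X)^{-1} X^T y (solve() avoids an explicit inverse).
beta_hat = np.linalg.solve(X.T @ X, X.T @ y)

# Coefficient of determination R^2 = 1 - ||y - X beta_hat||^2 / ||y - y_bar||^2.
y_hat = X @ beta_hat
r2 = 1 - np.sum((y - y_hat) ** 2) / np.sum((y - y.mean()) ** 2)

print(beta_hat, r2)
\end{verbatim}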
\begin{figure}
  \centering
  \includestandalone{figures/schemes/orthogonal_projection}
  \caption{Orthogonal projection of $\Y$ onto the plane generated by the basis described by $\X$. $\color{blue}a$ corresponds to $\norm{\X\hat{\beta} - \bar{\Y}\One}^2$ and $\color{blue}b$ corresponds to $\norm{\Y - \X\hat{\beta}}^2$.}
  \label{fig:scheme-orthogonal-projection}
\end{figure}
\begin{figure}
  \centering
  \includestandalone{figures/schemes/ordinary_least_squares}
  \caption{Ordinary least squares and regression line with simulated data.}
  \label{fig:ordinary-least-squares}
\end{figure}
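Finally, here is the sketch announced in the penalized regression subsection: a Lasso fit in a setting where $p > n$, so that $\X^T\X$ is singular and the OLS formula cannot be applied. The example uses \texttt{scikit-learn}; the simulated data, the penalty level \texttt{alpha} and the variable names are assumptions made only for this illustration.
\begin{verbatim}
import numpy as np
from sklearn.linear_model import Lasso

rng = np.random.default_rng(0)

# Simulated data with more explanatory variables than observations (p > n).
n, p = 40, 100
X = rng.normal(size=(n, p))
beta_true = np.zeros(p)
beta_true[:3] = [2.0, -1.5, 1.0]   # only the first 3 variables matter
y = X @ beta_true + rng.normal(0, 0.5, size=n)

# The L1 penalty makes the problem well posed and shrinks most
# coefficients exactly to zero.
model = Lasso(alpha=0.1)
model.fit(X, y)

print(np.sum(model.coef_ != 0), "non-zero coefficients out of", p)
\end{verbatim}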