diff --git a/.gitignore b/.gitignore
index d72457013cc68dd9db4f173766019fd6597ee846..94233198db01cb6a1bdbb1a80b5991bd9998d8f5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@
 *.bbl
 *.blg
 *.out
+*.toc
diff --git a/recommender.tex b/recommender.tex
index 8266b371206b73766850f0df9edfd9c1ebbe484b..5e9fc81f4289570b53d63ae0290cc3e73e94ac19 100644
--- a/recommender.tex
+++ b/recommender.tex
@@ -74,4 +74,10 @@ The best known and most common method when it comes to \textit{machine learning}
 \end{algorithm}
 
 At the beginning, the matrices $\mathcal{P}, \mathcal{Q}$ are filled with \textit{random numbers}. According to \citet{Funk06} this can be done using a \textit{Gaussian distribution}. Then, for each element in the \textit{training set}, the entries of the corresponding vectors $p_u \in \mathcal{P}, q_i \in \mathcal{Q}$ are recalculated on the basis of the \textit{error} that occurred in an \textit{epoch}. The parameters $\lambda, \gamma$ are introduced to avoid \textit{over}- and \textit{underfitting}. They are determined using the so-called \textit{grid-search} procedure in combination with \textit{k-fold cross-validation}. A \textit{grid} of candidate parameters is defined before the analysis; it consists of the sets $\Lambda$ and $\Gamma$. The \textit{grid-search} method then trains the algorithm under consideration with each possible pair $(\lambda \in \Lambda, \gamma \in \Gamma)$. The models trained in this way are evaluated using \textit{k-fold cross-validation}: the data set is divided into $k$ equally large fragments, and each of the $k$ parts is used once as a test set while the remaining $(k-1)$ parts serve as training data. The average error over the $k$ \textit{folds} is then entered into the \textit{grid}. Thus the pair $(\lambda \in \Lambda, \gamma \in \Gamma)$ with the lowest \textit{error} can be determined.
-This approach is also called \textit{Funk-SVD} or \textit{SVD} in combination with section \ref{subsec:rmf} and \ref{subsec:bmf} \citep{Rendle19}.  The algorithm shown above can also be extended. Thus procedures like in section \ref{subsec:amf} can be solved. The second method from section \ref{subsec:amf} is then also called \textit{SVD++}. A coherent \textit{SGD} approach was given by \citet{Kor11}.
\ No newline at end of file
+This approach is also called \textit{Funk-SVD} or \textit{SVD} in combination with sections \ref{subsec:rmf} and \ref{subsec:bmf} \citep{Rendle19}. The algorithm shown above can also be extended, so that procedures like the one in section \ref{subsec:amf} can be solved. The second method from section \ref{subsec:amf} is then also called \textit{SVD++}. A corresponding \textit{SGD} approach was given by \citet{Kor11}.
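+
+As a compact sketch of this selection step, the chosen pair can be written as the one with the lowest average error over the $k$ \textit{folds}; the notation $\mathrm{err}_j(\lambda, \gamma)$ for the test error on fold $j$ is introduced here for illustration only and does not appear in the algorithm above:
+\[
+(\lambda^{*}, \gamma^{*}) = \arg\min_{(\lambda, \gamma) \in \Lambda \times \Gamma} \frac{1}{k} \sum_{j=1}^{k} \mathrm{err}_{j}(\lambda, \gamma)
+\]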
+
+\subsubsection{Alternating Least Squares}
+The second frequently used method is \textit{alternating least squares (ALS)}. In contrast to \textit{SGD}, the vectors $q_i, p_u$ are adjusted in \textit{two steps}. Since $q_i$ and $p_u$ are both unknown, the problem is \textit{non-convex}. The idea of \textit{ALS} is to fix one of the two vectors and to work with only one unknown at a time. The problem thus becomes \textit{quadratic} and can be solved optimally. For this purpose the matrix $\mathcal{P}$ is filled with \textit{random numbers} at the beginning. These should be as small as possible and can be generated by a \textit{Gaussian distribution}. Then $\mathcal{P}$ is held fixed and all $q_i \in \mathcal{Q}$ are recalculated by solving the corresponding \textit{least-squares problem}. This step is then repeated with the roles reversed. \textit{ALS} terminates once a \textit{termination condition}, such as the \textit{convergence} of the error, is satisfied for both steps \citep{Zh08}.
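+
+With $\mathcal{P}$ held fixed, the recalculation of a single $q_i$ is a regularized \textit{least-squares problem} with a closed-form solution. As a sketch under assumed notation, where $\mathcal{P}_{i}$ stacks the vectors $p_u$ of all users who rated item $i$, $r_{i}$ collects the corresponding ratings and $\lambda$ denotes the regularization parameter,
+\[
+q_i = \left( \mathcal{P}_{i}^{T} \mathcal{P}_{i} + \lambda I \right)^{-1} \mathcal{P}_{i}^{T} r_{i},
+\]
+and the analogous formula is used for each $p_u$ when $\mathcal{Q}$ is held fixed.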
+
+\subsubsection{Bayesian Learning}
+The third approach is known as \textit{Bayesian learning}. With this approach the so-called \textit{Gibbs sampler} is often used. The aim is to determine the \textit{joint distribution} of the vectors in $\mathcal{P}, \mathcal{Q}$. For this purpose the \textit{Gibbs sampler} is given an initialization of the \textit{hyperparameters} to generate the \textit{initial distribution}. The \textit{joint distribution} of the vectors $q_i \in \mathcal{Q}, p_u \in \mathcal{P}$ is then approximated via the \textit{conditional probabilities}. The basic principle is to select each variable in turn and to draw a new value for it from its \textit{conditional distribution} given the current values of the other variables, which remain unchanged during that draw in each \textit{epoch}. A detailed description of the \textit{Gibbs sampler} is given by \citet{Rus08}.
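+
+A single \textit{epoch} of this scheme can thus be sketched as alternately drawing new user and item vectors from their \textit{conditional distributions}; the symbols $\Theta_{P}, \Theta_{Q}$ for the \textit{hyperparameters} and $\mathcal{R}$ for the observed ratings are assumed here for illustration:
+\[
+p_u \sim p\left(p_u \mid \mathcal{R}, \mathcal{Q}, \Theta_{P}\right), \qquad q_i \sim p\left(q_i \mid \mathcal{R}, \mathcal{P}, \Theta_{Q}\right)
+\]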
\ No newline at end of file
diff --git a/references.bib b/references.bib
index 88581aa95b68e0ed1a3a227e8ec49850d1c915fe..3536c270f651436d8c45bd6fb2db413cbc79beb2 100644
--- a/references.bib
+++ b/references.bib
@@ -98,4 +98,22 @@ doi = {10.1007/978-0-387-85820-3_4}
   note = {Accessed: 2019-12-12},
   year = {2006},
   month = {12}
+}
+@inproceedings{Zh08,
+author = {Yunhong Zhou and Dennis Wilkinson and Robert Schreiber and Rong Pan},
+year = {2008},
+month = {06},
+pages = {337-348},
+title = {Large-Scale Parallel Collaborative Filtering for the Netflix Prize},
+booktitle = {Algorithmic Aspects in Information and Management},
+doi = {10.1007/978-3-540-68880-8_32}
+}
+@inproceedings{Rus08,
+author = {Ruslan Salakhutdinov and Andriy Mnih},
+year = {2008},
+month = {07},
+pages = {880-887},
+title = {Bayesian probabilistic matrix factorization using Markov chain Monte Carlo},
+volume = {25},
+booktitle = {Proceedings of the 25th International Conference on Machine Learning},
+doi = {10.1145/1390156.1390267}
 }
\ No newline at end of file
diff --git a/submission.pdf b/submission.pdf
index 63d1731ea1395a28fd0ca046a3b0fb6aca4ffee8..b2003f7f3eca57a7d0edb3dd90eeb2ed5fe9c69e 100644
Binary files a/submission.pdf and b/submission.pdf differ