diff --git a/.gitignore b/.gitignore
index d72457013cc68dd9db4f173766019fd6597ee846..94233198db01cb6a1bdbb1a80b5991bd9998d8f5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@
 *.bbl
 *.blg
 *.out
+*.toc
diff --git a/recommender.tex b/recommender.tex
index 8266b371206b73766850f0df9edfd9c1ebbe484b..5e9fc81f4289570b53d63ae0290cc3e73e94ac19 100644
--- a/recommender.tex
+++ b/recommender.tex
@@ -74,4 +74,10 @@ The best known and most common method when it comes to \textit{machine learning}
 \end{algorithm}
 At the beginning, the matrices $\mathcal{P}, \mathcal{Q}$ are filled with \textit{random numbers}. According to \citet{Funk06} this can be done by a \textit{gaussian-distribution}. Then, for each element in the \textit{training set}, the entries of the corresponding vectors $p_u \in \mathcal{P}, q_i \in \mathcal{Q}$ are recalculated on the basis of the \textit{error} that occurred in an \textit{epoch}. The parameters $\mu, \gamma$ are introduced to avoid \textit{over}- and \textit{underfitting}. These can be determined using \textit{grid-search} and \textit{k-fold cross-validation}.
 For the \textit{optimization} of the parameters $\mu$ and $\gamma$ the so-called \textit{grid-search} procedure is used. A \textit{grid} of possible parameters is defined before the analysis. This \textit{grid} consists of the sets $\Lambda$ and $\Gamma$. The \textit{grid-search} method then trains the algorithm to be considered with each possible pair of $(\lambda \in \Lambda, \gamma \in \Gamma)$. The models trained in this way are then tested using a \textit{k-fold cross-validation}. The data set is divided into $k$-equally large fragments. Each of the $k$ parts is used once as a test set while the remaining ($k-1)$ parts are used as training data. The average error is then determined via the $k$-\textit{folds} and entered into the \textit{grid}. Thus the pair $(\lambda \in \Lambda, \gamma \in \Gamma)$ can be determined for which the \textit{error} is lowest.
-This approach is also called \textit{Funk-SVD} or \textit{SVD} in combination with section \ref{subsec:rmf} and \ref{subsec:bmf} \citep{Rendle19}. The algorithm shown above can also be extended. Thus procedures like in section \ref{subsec:amf} can be solved. The second method from section \ref{subsec:amf} is then also called \textit{SVD++}. A coherent \textit{SGD} approach was given by \citet{Kor11}.
\ No newline at end of file
+In combination with sections \ref{subsec:rmf} and \ref{subsec:bmf}, this approach is also called \textit{Funk-SVD} or simply \textit{SVD} \citep{Rendle19}. The algorithm shown above can also be extended so that models such as the ones in section \ref{subsec:amf} can be solved. The second method from section \ref{subsec:amf} is then also called \textit{SVD++}; a corresponding \textit{SGD} approach was given by \citet{Kor11}.
+
+\subsubsection{Alternating Least Squares}
+The second frequently used method is \textit{alternating least squares (ALS)}. In contrast to \textit{SGD}, the vectors $q_i, p_u$ are adjusted in \textit{two steps}. As with \textit{SGD}, $q_i$ and $p_u$ are both unknown, so the problem is \textit{non-convex}. The idea of \textit{ALS} is to fix one of the two vectors so that only one unknown remains at a time. The problem thus becomes \textit{quadratic} and can be solved optimally. For this purpose the matrix $\mathcal{P}$ is initially filled with small \textit{random numbers}, which can be generated by a \textit{gaussian-distribution}. Then $\mathcal{P}$ is held fixed and all $q_i \in \mathcal{Q}$ are recomputed by solving the resulting \textit{least-squares problem}. This step is then repeated with the roles of $\mathcal{P}$ and $\mathcal{Q}$ reversed. \textit{ALS} terminates once a \textit{termination condition}, such as the \textit{convergence} of the error, is satisfied for both steps \citep{Zh08}.
+
+\subsubsection{Bayesian Learning}
+The third approach is known as \textit{bayesian learning}. With this approach the so-called \textit{gibbs-sampler} is often used. The aim is to determine the \textit{joint distribution} of the vectors in $\mathcal{P}, \mathcal{Q}$. For this purpose the \textit{gibbs-sampler} is given an initialization of the \textit{hyperparameters}, from which the \textit{initial distribution} is generated. The \textit{joint distribution} of the vectors $q_i \in \mathcal{Q}, p_u \in \mathcal{P}$ is approximated via their \textit{conditional distributions}. The basic principle is to select one variable at a time and to draw a new value for it from its \textit{conditional distribution} given the current values of the other variables, which remain unchanged during that step of an \textit{epoch}. A detailed description of the \textit{gibbs-sampler} for this model is given by \citet{Rus08}.
\ No newline at end of file
diff --git a/references.bib b/references.bib
index 88581aa95b68e0ed1a3a227e8ec49850d1c915fe..3536c270f651436d8c45bd6fb2db413cbc79beb2 100644
--- a/references.bib
+++ b/references.bib
@@ -98,4 +98,22 @@ doi = {10.1007/978-0-387-85820-3_4}
 note = {Accessed: 2019-12-12},
 year = {2006},
 month = {12}
+}
+@inproceedings{Zh08,
+author = {Yunhong Zhou and Dennis Wilkinson and Robert Schreiber and Rong Pan},
+year = {2008},
+month = {06},
+pages = {337-348},
+title = {Large-Scale Parallel Collaborative Filtering for the Netflix Prize},
+doi = {10.1007/978-3-540-68880-8_32}
+}
+@inproceedings{Rus08,
+author = {Ruslan Salakhutdinov and Andriy Mnih},
+year = {2008},
+month = {01},
+pages = {880-887},
+title = {Bayesian probabilistic matrix factorization using Markov chain Monte Carlo},
+volume = {25},
+booktitle = {Proceedings of the 25th International Conference on Machine Learning},
+doi = {10.1145/1390156.1390267}
 }
\ No newline at end of file
diff --git a/submission.pdf b/submission.pdf
index 63d1731ea1395a28fd0ca046a3b0fb6aca4ffee8..b2003f7f3eca57a7d0edb3dd90eeb2ed5fe9c69e 100644
Binary files a/submission.pdf and b/submission.pdf differ
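
For reference, a minimal Python sketch of the per-rating SGD update that the recommender.tex section describes: each observed rating nudges the corresponding factor vectors against the prediction error, with a regularization term. The triple-based data layout, the function name, and the values of the learning rate and regularization weight (standing in for the paper's gamma and lambda) are assumptions for illustration, not taken from the paper.

import numpy as np

def sgd_epoch(ratings, P, Q, lr=0.005, reg=0.02):
    # ratings: iterable of (user index u, item index i, rating r) triples
    # P, Q:    factor matrices (n_users x f, n_items x f), initialised with
    #          small Gaussian random numbers as described in the paper
    # lr, reg: learning rate and regularization weight (placeholder values)
    for u, i, r in ratings:
        p_u, q_i = P[u].copy(), Q[i].copy()
        err = r - p_u @ q_i                     # prediction error for this entry
        P[u] += lr * (err * q_i - reg * p_u)    # move p_u against the error
        Q[i] += lr * (err * p_u - reg * q_i)    # move q_i against the error
    return P, Q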
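
The grid-search with k-fold cross-validation described in the section could be sketched roughly as follows. The callables train_fn and rmse_fn are hypothetical placeholders for whatever trainer (e.g. the SGD loop) and error measure are actually used, and ratings is assumed to be a NumPy array of (u, i, r) triples.

import itertools
import numpy as np

def grid_search_cv(ratings, lambdas, gammas, train_fn, rmse_fn, k=5, seed=0):
    # ratings:         NumPy array of (u, i, r) triples
    # lambdas, gammas: candidate sets Lambda and Gamma spanning the grid
    # train_fn(train, lam, gam) -> model, rmse_fn(model, test) -> float
    rng = np.random.default_rng(seed)
    folds = np.array_split(rng.permutation(len(ratings)), k)  # k equal fragments
    grid = {}
    for lam, gam in itertools.product(lambdas, gammas):
        errors = []
        for f in range(k):
            test = ratings[folds[f]]
            train = ratings[np.concatenate([folds[j] for j in range(k) if j != f])]
            model = train_fn(train, lam, gam)    # train on the k-1 remaining parts
            errors.append(rmse_fn(model, test))  # evaluate on the held-out part
        grid[(lam, gam)] = np.mean(errors)       # average error entered into the grid
    best = min(grid, key=grid.get)               # (lambda, gamma) with lowest error
    return best, grid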
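
One half-step of the ALS scheme added in the diff (hold P fixed and re-solve every q_i exactly) might look like the sketch below; the dense rating matrix, the mask convention, and the regularization weight are assumptions for illustration. The second half-step repeats the same computation with the roles of P and Q swapped, and the two steps alternate until the error converges.

import numpy as np

def als_item_step(R, mask, P, Q, reg=0.1):
    # R:    dense n_users x n_items rating matrix (zeros where unrated)
    # mask: boolean matrix, True where a rating is observed
    # P, Q: current factor matrices; P is held fixed in this half-step
    f = P.shape[1]
    for i in range(R.shape[1]):
        users = mask[:, i]                            # users who rated item i
        if not users.any():
            continue
        A = P[users].T @ P[users] + reg * np.eye(f)   # normal equations for q_i
        b = P[users].T @ R[users, i]
        Q[i] = np.linalg.solve(A, b)                  # exact least-squares solution
    return Q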
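
A simplified sketch of one Gibbs sweep for the Bayesian variant: each factor vector is resampled from its conditional Gaussian given the current values of all other variables. Unlike the BPMF model of Salakhutdinov and Mnih (Rus08 in the bibliography), the observation precision and the prior precision are kept fixed here rather than resampled, so this only illustrates the sampling principle, not the full model.

import numpy as np

def gibbs_sweep(R, mask, P, Q, alpha=2.0, tau=2.0, rng=None):
    # alpha: observation precision, tau: prior precision of the factor vectors
    # (kept fixed here; BPMF additionally resamples such hyperparameters)
    rng = rng or np.random.default_rng()
    f = P.shape[1]
    for u in range(R.shape[0]):                            # resample every p_u ...
        obs = mask[u]
        prec = tau * np.eye(f) + alpha * Q[obs].T @ Q[obs] # conditional precision
        mean = np.linalg.solve(prec, alpha * Q[obs].T @ R[u, obs])
        P[u] = rng.multivariate_normal(mean, np.linalg.inv(prec))
    for i in range(R.shape[1]):                            # ... then every q_i
        obs = mask[:, i]
        prec = tau * np.eye(f) + alpha * P[obs].T @ P[obs]
        mean = np.linalg.solve(prec, alpha * P[obs].T @ R[obs, i])
        Q[i] = rng.multivariate_normal(mean, np.linalg.inv(prec))
    return P, Q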