diff --git a/content-based-collaborative-filtering-comparison.tex b/content-based-collaborative-filtering-comparison.tex new file mode 100644 index 0000000000000000000000000000000000000000..b2d0157ffefc9bf2e7f31d0d1fa56c65643b7a61 --- /dev/null +++ b/content-based-collaborative-filtering-comparison.tex @@ -0,0 +1,15 @@ +\begin{figure}[!ht] + \centering + \begin{subfigure}[b]{0.25\linewidth} + \includegraphics[width=\linewidth]{Bilder/ContendBasedFlow.jpg} + \caption{\textit{Content-Based}.} + \label{fig:cb} + \end{subfigure} + \begin{subfigure}[b]{0.25\linewidth} + \includegraphics[width=\linewidth]{Bilder/CollaborativeFlow.jpg} + \caption{\textit{Collaborative-Filtering}.} + \label{fig:cf} + \end{subfigure} + \caption{Overview of \textit{content-based} (left) and \textit{collaborative-filtering} (right) \textit{recommender systems}. \textit{Content-based recommender systems} work via \textit{feature vectors}, whereas \textit{collaborative-filtering recommender systems} operate on \textit{neighborhoods}.} + \label{fig:cbcf} +\end{figure} diff --git a/content-based.tex b/content-based.tex deleted file mode 100644 index db246f6e7e0dad8657c2bfcfb175ddb88e95325b..0000000000000000000000000000000000000000 --- a/content-based.tex +++ /dev/null @@ -1,6 -0,0 @@ -\begin{figure}[htbp!] - \centering - \includegraphics[scale=0.5]{Bilder/ContendBasedFlow.jpg} - \caption{\textit{Content-Based recommender systems} work via \textit{feature vectors}. These \textit{vectors} are learned or created using a variety of methods to model the \textit{user's preferences}. A suggestion is determined by the similarity between the \textit{feature vector} of the \textit{user} and the \textit{items}.} - \label{img:content-based} -\end{figure} \ No newline at end of file diff --git a/recommender.tex b/recommender.tex index 11efb3f477e05a67ebe862720b67ef3bd84273c7..d07e556a8f717da7cb9ca8c9422e51f7e08ae93b 100644 --- a/recommender.tex +++ b/recommender.tex @@ -8,10 +8,23 @@ Each of the \textit{users} in $\mathcal{U}$ gives \textit{ratings} from a set $\ In the following, the two main approaches of \textit{collaborative-filtering} and \textit{content-based} \textit{recommender systems} will be discussed. In addition, it is explained how \textit{matrix factorization} can be integrated into the two ways of thinking. \subsection{Content-Based} -\textit{Content-based} \textit{recommender systems} work directly with \textit{feature vectors}. Such a \textit{feature vector} can, for example, represent a \textit{user profile}. In this case, this \textit{profile} contains information about the \textit{user's preferences}, such as \textit{genres}, \textit{authors}, \textit{etc}. This is done by trying to create a \textit{model} of the \textit{user}, which best represents his preferences. The different \textit{learning algorithms} from the field of \textit{machine learning} are used to learn or create the \textit{models}. The most prominent \textit{algorithms} are: \textit{tf-idf}, \textit{bayesian learning}, \textit{Rocchio's algorithm} and \textit{neural networks} \citep{Lops11, Ferrari19}. Altogether the built and learned \textit{feature vectors} are compared with each other. Based on their closeness, similar \textit{features} can be used to generate \textit{missing ratings}. - -\input{content-based} +\textit{Content-based} \textit{recommender systems} work directly with \textit{feature vectors}. Such a \textit{feature vector} can, for example, represent a \textit{user profile} that contains information about the \textit{user's preferences}, such as \textit{genres} or \textit{authors}. The goal is to create a \textit{model} of the \textit{user} that represents these preferences as well as possible. \textit{Learning algorithms} from the field of \textit{machine learning} are used to build such \textit{models}; the most prominent are \textit{tf-idf}, \textit{Bayesian learning}, \textit{Rocchio's algorithm} and \textit{neural networks} \citep{Lops11, Ferrari19, DeKa11}. The built and learned \textit{feature vectors} are then compared with each other, and their closeness is used to generate the \textit{missing ratings}. Figure \ref{fig:cb} shows a sketch of the general operation of \textit{content-based recommenders}.
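+The comparison step can be illustrated with the \textit{cosine-similarity}, one of the \textit{distance functions} named in the next subsection. The following is only a sketch: the symbols $w_u, w_i \in \mathbb{R}^{n}$ for the \textit{feature vectors} of the \textit{user profile} and of an \textit{item} are chosen purely for illustration and are not part of the notation introduced above.
+% illustration only: w_u and w_i are hypothetical feature vectors
+\begin{equation*}
+    \mathrm{sim}(u, i) = \frac{w_u^T w_i}{\lVert w_u \rVert \, \lVert w_i \rVert}
+\end{equation*}
+The larger $\mathrm{sim}(u, i)$ is, the better the \textit{item} $i$ matches the learned \textit{user profile} and the more suitable it is as a suggestion.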
\subsection{Collaborative-Filtering} +Unlike the \textit{content-based recommender}, the \textit{collaborative-filtering recommender} does not only consider individual \textit{users} and their \textit{feature vectors}, but rather a \textit{like-minded neighborhood} of each \textit{user}. +Missing \textit{user ratings} can be derived from this \textit{neighborhood}, which links the individual \textit{users} into a whole. It is assumed that a \textit{missing rating} of a considered \textit{user} $u$ for an unknown \textit{item} $i$ will be similar to the \textit{rating} of a \textit{user} $v$ as soon as $u$ and $v$ have rated some \textit{items} similarly. The similarity of the \textit{users} is determined by the \textit{community ratings}. This type of \textit{recommender system} is also known as a \textit{neighborhood-based recommender} \citep{DeKa11}. The main focus of \textit{neighborhood-based methods} is on the application of iterative methods such as \textit{k-nearest-neighbors} or \textit{k-means}. +A \textit{neighborhood-based recommender} can be viewed from two angles: The first and best-known problem is the so-called \textit{user-based prediction}. Here, the \textit{missing ratings} of a considered \textit{user} $u$ are determined from the \textit{neighborhood} $\mathcal{N}_i(u)$. +$\mathcal{N}_i(u)$ denotes the subset of all \textit{users} who rate similarly to $u$ and have rated the \textit{item} $i$. The second problem is that of \textit{item-based prediction}, where the similarity of the \textit{items} is determined analogously by the \textit{ratings} they have received. +This problem considers the \textit{neighborhood} $\mathcal{N}_u(i)$ of all \textit{items} that are rated similarly to the \textit{item} $i$ and have been rated by the \textit{user} $u$. The similarity between the objects of a \textit{neighborhood} is determined by \textit{distance functions} such as the \textit{mean-squared-difference}, the \textit{pearson-correlation} or the \textit{cosine-similarity}. +Figure \ref{fig:cf} shows a sketch of the general operation of the \textit{collaborative-filtering recommender}.
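+Under these definitions, a common form of the \textit{user-based prediction} is a similarity-weighted average over the \textit{neighborhood} \citep{DeKa11}. This is only a sketch: the weight $\mathrm{sim}(u,v)$ is a placeholder for one of the \textit{distance functions} named above and is not fixed here.
+% sketch only: sim(u,v) stands for a freely chosen similarity measure
+\begin{equation*}
+    \hat{r}_{ui} = \frac{\sum_{v \in \mathcal{N}_i(u)} \mathrm{sim}(u,v) \, r_{vi}}{\sum_{v \in \mathcal{N}_i(u)} \lvert \mathrm{sim}(u,v) \rvert}
+\end{equation*}
+Here $\hat{r}_{ui}$ denotes the predicted \textit{rating} of the \textit{user} $u$ for the \textit{item} $i$; the \textit{item-based prediction} is obtained analogously by averaging over $\mathcal{N}_u(i)$.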
+ +\input{content-based-collaborative-filtering-comparison} + +\subsection{Matrix-Factorization} +The core idea of \textit{matrix factorization} is to complete the only partially filled \textit{rating-matrix} $\mathcal{R}$. For this purpose, the \textit{users} and \textit{items} are mapped into a joint \textit{latent feature space} of \textit{dimensionality} $f$. The \textit{user} is represented by the vector $p_u \in \mathbb{R}^{f}$ and the \textit{item} by the vector $q_i \in \mathbb{R}^{f}$. As a result, the \textit{missing ratings} and thus the \textit{user-item interaction} can be determined via the \textit{inner product} $\hat{r}_{ui}=q_i^Tp_u$ of the corresponding vectors \citep{Kor09}. In the following, the four classic \textit{matrix-factorization} approaches are described in detail. Afterwards, the concrete learning methods with which the vectors are learned are presented. In addition, the \textit{training data} for which a \textit{concrete rating} is available is referred to as $\mathcal{B} = \lbrace(u,i) \mid r_{ui} \in \mathcal{R}\rbrace$. + +\subsubsection{Basic Matrix-Factorization} +The first and simplest way to solve \textit{matrix-factorization} is to connect the \textit{feature vectors} of the \textit{users} and the \textit{items} using the \textit{inner product}, which yields the \textit{user-item interaction}. In addition, the \textit{error} on the known \textit{ratings} should be as small as possible. Therefore, $\min_{p_u, q_i}{\sum_{(u,i) \in \mathcal{B}} (r_{ui} - \hat{r}_{ui})^{2}}$ is defined as the associated \textit{minimization problem}, which minimizes the squared error and thus the \textit{RMSE} on $\mathcal{B}$. -\subsection{Matrix-Factorization} \ No newline at end of file +\subsubsection{Regularized Matrix-Factorization} +This approach extends the \textit{basic matrix-factorization} by a \textit{regularization parameter} $\lambda$ in the corresponding \textit{minimization problem}. Since $\mathcal{R}$ is sparsely populated, \textit{overfitting} may occur when learning from the few known values: the learned vectors then fit the observed \textit{ratings} too closely and generalize poorly to the \textit{missing} ones. To counteract this, the magnitudes of the learned vectors are taken into account; large magnitudes are penalized by the term $\lambda(\lVert q_i \rVert^2 + \lVert p_u \rVert^2)$ in the \textit{minimization problem}. Overall, the \textit{minimization problem} $\min_{p_u, q_i}{\sum_{(u,i) \in \mathcal{B}} \left[(r_{ui} - \hat{r}_{ui})^{2} + \lambda(\lVert q_i \rVert^2 + \lVert p_u \rVert^2)\right]}$ is to be solved.
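+As a brief preview of the learning methods presented later, this \textit{minimization problem} can be solved with \textit{stochastic gradient descent}: for each pair $(u,i) \in \mathcal{B}$, the \textit{prediction error} $e_{ui} = r_{ui} - q_i^Tp_u$ is computed and both vectors are updated against it \citep{Kor09}. This is only a sketch; the \textit{learning rate} $\gamma$ is an additional parameter that has not been introduced above.
+% sketch only: the step size gamma must be chosen by hand
+\begin{align*}
+    q_i &\leftarrow q_i + \gamma \, (e_{ui} \, p_u - \lambda \, q_i)\\
+    p_u &\leftarrow p_u + \gamma \, (e_{ui} \, q_i - \lambda \, p_u)
+\end{align*}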
diff --git a/references.bib b/references.bib index bd65f8b76cdce2862f559abaaefa4fececd655f3..df05aebe6ccf0e4450d68c164cd9d47b28fb820e 100644 --- a/references.bib +++ b/references.bib @@ -10,7 +10,6 @@ abstract = {A measure for proximity between documents is defined, based on data from readers. This proximity measure can be further investigated as a tool document retrieval, and as to provide data for concept formation experiments. }, year = {1990} } - @article{Rendle19, author = {Steffen Rendle and Li Zhang and @@ -52,7 +51,6 @@ editor = {P.B. Kantor and F. Ricci and L. Rokach and B. Shapira}, publisher={Springer}, doi = {10.1007/978-0-387-85820-3_4} } - @inproceedings{Ferrari19, author = {Maurizio Ferrari Dacrema and Paolo Cremonesi and Dietmar Jannach}, year = {2019}, pages = {}, title = {Are We Really Making Much Progress? A Worrying Analysis of Recent Neural Recommendation Approaches}, isbn = {978-1-4503-6243-6}, doi = {10.1145/3298689.3347058} +} +@article{Kor09, +author = {Yehuda Koren and + Robert Bell and + Chris Volinsky}, +year = {2009}, +month = {08}, +pages = {30-37}, +title = {Matrix Factorization Techniques for Recommender Systems}, +volume = {42}, +number = {8}, +journal = {Computer}, +doi = {10.1109/MC.2009.263} } \ No newline at end of file diff --git a/submission.pdf b/submission.pdf index e4502e520dec64e33fe32ba969074f0ecb08835c..ac8586d4e91fc3f5ba09587a889413239cff6601 100644 Binary files a/submission.pdf and b/submission.pdf differ diff --git a/submission.tex b/submission.tex index b0c8b343ac024d67a46e1065424c6b1b258669d9..6ec8f08af01ae4bee1d7b50762c8e6f9c4c44fb3 100644 --- a/submission.tex +++ b/submission.tex @@ -32,9 +32,10 @@ \usepackage[]{titlesec} \titlespacing*{\section} {0pt}{6pt}{6pt} -\usepackage[]{titlesec} \titlespacing*{\subsection} {0pt}{6pt}{6pt} +\titlespacing*{\subsubsection} +{0pt}{6pt}{6pt} \usepackage{footmisc} \setlength{\abovedisplayskip}{0pt} \renewcommand{\footrulewidth}{0.5pt} @@ -47,7 +48,7 @@ \hypersetup{ colorlinks, citecolor=hhuUniBlau, - linkcolor=black, + linkcolor=hhuUniBlau, urlcolor=hhuUniBlau} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%