diff --git a/baselines.tex b/baselines.tex index 20a22216d8486a54a36b9e1358359895bcbba815..4bf1526a643f238ac76990f9638abedf891fda2a 100644 --- a/baselines.tex +++ b/baselines.tex @@ -1,13 +1,13 @@ \section{On the Difficulty of Evaluating Baselines} -This section reviews the \textit{main part} of the work represented by \citet{Rendle19}. In addition to a \textit{detailed description} and \textit{explanation} of the \textit{experiments} carried out and the \textit{observations} gained from them, a short introduction is given regarding the driving \textit{motivation} +This section reviews the main part of the work represented by \citet{Rendle19}. In addition to a detailed description and explanation of the experiments carried out and the observations gained from them, a short introduction is given regarding the driving motivation. \subsection{Motivation and Background} -As in many other fields of \textit{data-science}, a valid \textit{benchmark-dataset} is required for a proper execution of experiments. In the field of \textit{recommender systems}, the best known \textit{datasets} are the \textit{Netflix-} and \textit{MovieLens-dataset}. This section introduces both \textit{datasets} and shows the relationship of \citet{Koren}, one of the authors of this paper, to the \textit{Netflix-Prize}, in addition to the existing \textit{baselines}. +As in many other fields of \textit{data-science}, a valid \textit{benchmark-dataset} is required for a proper execution of experiments. In the field of \textit{recommender systems}, the best known datasets are the \textit{Netflix-} and \textit{MovieLens-datasets}. This section introduces both datasets and shows the relationship of \citet{Koren}, one of the authors of this paper, to the \textit{Netflix-Prize}, in addition to the existing \textit{baselines}. \subsubsection{Netflix-Prize} \label{sec:netflix} The topic of \textit{recommender systems} was first properly promoted and made known by the \textit{Netflix-Prize}. On \textit{October 2nd 2006}, the competition announced by \textit{Netflix} began with the \textit{goal} of beating the self-developed \textit{recommender system Cinematch} with an \textit{RMSE} of \textit{0.9514} by at least \textit{10\%}. -In total, the \textit{Netflix-dataset} was divided into three parts that can be grouped into two categories: \textit{training} and \textit{qualification}. In addition to a \textit{probe-dataset} for \textit{training} the algorithms, two further datasets were retained to qualify the winners. The \textit{quiz-dataset} was then used to calculate the \textit{score} of the \textit{submitted solutions} on the \textit{public leaderboard}. In contrast, the \textit{test-dataset} was used to determine the \textit{actual winners}. Each of the pieces had around \textit{1.408.000 data} and \textit{similar statistical values}. By splitting the data in this way, it was possible to ensure that an improvement could not be achieved by \textit{simple hill-climbing-algorithms}. -It took a total of \textit{three years} and \textit{several hundred models} until the team \textit{"BellKor`s Pragmatic Chaos"} was chosen as the \textit{winner} on \textit{21st September 2009}. They had managed to achieve an \textit{RMSE} of \textit{0.8554} and thus an \textit{improvement} of \textit{0.096}. Such a result is extraordinary excellent, because it took \textit{one year} of work and intensive research to reduce the \textit{RMSE} from \textit{0.8712 (progress award 2007)} to \textit{0.8616 (progress award 2008)}.
-The \textit{co-author} of the present paper, \citet{Koren}, was significantly involved in the work of this team. Since the beginning of the event, \textit{matrix-factorization methods} have been regarded as promising approaches. Even with the simplest \textit{SVD} methods, \textit{RMSE values} of \textit{0.94} could be achieved by \citet{Kurucz07}. +In total, the \textit{Netflix-dataset} was divided into three parts that can be grouped into two categories: \textit{training} and \textit{qualification}. In addition to a \textit{probe-dataset} for \textit{training} the algorithms, two further datasets were retained to qualify the winners. The \textit{quiz-dataset} was then used to calculate the \textit{score} of the \textit{submitted solutions} on the \textit{public leaderboard}. In contrast, the \textit{test-dataset} was used to determine the actual winners. Each of the pieces had around \textit{1,408,000 ratings} and similar statistical values. By splitting the data in this way, it was possible to ensure that an improvement could not be achieved by \textit{simple hill-climbing-algorithms}. +It took a total of \textit{three years} and \textit{several hundred models} until the team \textit{BellKor's Pragmatic Chaos} was chosen as the \textit{winner} on \textit{21st September 2009}. They had managed to achieve an \textit{RMSE} of \textit{0.8554} and thus an improvement of \textit{0.096}. Such a result is extraordinary, considering that it took \textit{one year} of work and intensive research to reduce the \textit{RMSE} from \textit{0.8712 (Progress Prize 2007)} to \textit{0.8616 (Progress Prize 2008)}. +The \textit{co-author} of the present paper, \citet{Koren}, was significantly involved in the work of this team. Since the beginning of the event, \textit{matrix-factorization methods} have been regarded as promising approaches. Even with the simplest \textit{SVD} methods, \textit{RMSE} values of \textit{0.94} could be achieved by \citet{Kurucz07}. The \textit{breakthrough} came through \citet{Funk06} who achieved an \textit{RMSE} of \textit{0.93} with his \textit{FunkSVD}. Based on this, more and more work has been invested in the research of simple \textit{matrix-factorization methods}. Thus, \citet{Zh08} presented an \textit{ALS variant} with an \textit{RMSE} of \textit{0.8985} and \citet{Koren09} presented an \textit{SGD variant} with \textit{RMSE 0.8995}. @@ -15,30 +15,30 @@ Thus, \citet{Zh08} presented an \textit{ALS variant} with an \textit{RMSE} of \t The \textit{Netflix-Prize} made it clear that even the \textit{simplest methods} are \textit{not trivial} and that a \textit{reasonable investigation} and \textit{evaluation requires} an \textit{immense effort} from within the \textit{community}. \subsubsection{MovieLens} -In the \textit{non-commercial sector} of \textit{recommender systems} the \textit{MovieLens10M-dataset} is mostly used. It consists of \textit{10.000.054 data} and was published by the research group \textit{GroupLens} in \textit{2009} \citep{Harper15}. In most cases a \textit{global} and \textit{random} \textit{90:10 split} of the data is used to evaluate the \textit{RMSE}. This means that through a \textit{random selection 90\%} of the data is used for \textit{training} and \textit{10\%} of the remaining data is used for \textit{testing}.
Over the last \textit{5 years} a large number of \textit{algorithms} on this data set have been evaluated and the results have been published on \textit{well-known convergences} such as \textit{ICML}, \textit{NeurIPS}, \textit{WWW}, \textit{SIGIR} and \textit{AAAI}. \textit{Figure} \ref{fig:reported_results} shows the \textit{results obtained} over the last \textit{5 years} on the \textit{MovieLens10M-dataset}. -It can be clearly stated that the \textit{texisting baselines} have been \textit{beaten} and \textit{newer methods} have made \textit{steady progress}. +In the \textit{non-commercial sector} of \textit{recommender systems} the \textit{MovieLens10M-dataset} is mostly used. It consists of \textit{10,000,054 ratings} and was published by the research group \textit{GroupLens} in \textit{2009} \citep{Harper15}. In most cases a \textit{global} and \textit{random} \textit{90:10 split} of the data is used to evaluate the \textit{RMSE}. This means that a \textit{random selection} of \textit{90\%} of the data is used for \textit{training} and the remaining \textit{10\%} is used for \textit{testing}. Over the last \textit{five years} a large number of algorithms on this dataset have been evaluated and the results have been published at \textit{well-known conferences} such as \textit{ICML}, \textit{NeurIPS}, \textit{WWW}, \textit{SIGIR} and \textit{AAAI}. \textit{Figure} \ref{fig:reported_results} shows the \textit{results obtained} over the last \textit{five years} on the \textit{MovieLens10M-dataset}. +It can be clearly stated that the \textit{existing baselines} have been \textit{beaten} and \textit{newer methods} have made \textit{steady progress}. \input{reported_results} \subsection{Experiment Realization} -As the \textit{Netflix-Prize} has shown, \textit{research} and \textit{validation} is \textit{complex} even for very \textit{simple methods}. Not only during the \textit{Netflix-Prize} was intensive work done on researching \textit{existing} and \textit{new reliable methods}. The \textit{MovieLens10M-dataset} was used just as often. With their \textit{experiment} the authors \textit{doubt} that the \textit{baselines} of \textit{MovieLens10M} are \textit{inadequate} for the evaluation of new methods. To test their hypothesis, the authors transferred all the findings from the \textit{Netflix-Prize} to the existing baselines of \textit{MovieLens10M}. +As the \textit{Netflix-Prize} has shown, \textit{research} and \textit{validation} are \textit{complex} even for very \textit{simple methods}. Not only during the \textit{Netflix-Prize} was intensive work done on researching \textit{existing} and \textit{new reliable methods}. The \textit{MovieLens10M-dataset} was used just as often. With their experiment the authors address the \textit{suspicion} that the \textit{baselines} of \textit{MovieLens10M} are \textit{inadequate} for the evaluation of new methods. To test their hypothesis, the authors transferred all the findings from the \textit{Netflix-Prize} to the existing baselines of \textit{MovieLens10M}. \subsubsection{Experiment Preparation}\label{sec:experiment_preparation} -Before actually conducting the experiment, the authors took a closer look at the given baselines. In the process, they noticed some \textit{systematic overlaps}. These can be taken from \textit{table} below. +Before actually conducting the experiment, the authors took a closer look at the given \textit{baselines}. In the process, they noticed some \textit{systematic overlaps}. These can be taken from the \textit{table} below.
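Before turning to these overlaps, the \textit{global random 90:10 split} and the \textit{RMSE} evaluation described above can be made concrete with a short sketch. This is only a minimal illustration (plain \textit{numpy}, hypothetical variable and function names), not the evaluation code used by the authors.
\begin{verbatim}
import numpy as np

def split_90_10(ratings, seed=0):
    # ratings: numpy array of shape (n, 3) with columns (user, item, rating)
    rng = np.random.default_rng(seed)
    idx = rng.permutation(len(ratings))
    cut = int(0.9 * len(ratings))          # global random 90:10 split
    return ratings[idx[:cut]], ratings[idx[cut:]]

def rmse(predicted, actual):
    # root mean squared error between predicted and observed ratings
    return np.sqrt(np.mean((predicted - actual) ** 2))

# train, test = split_90_10(ratings)
# model = ...  # e.g. a (biased) matrix-factorization model fitted on train
# print(rmse(model.predict(test[:, 0], test[:, 1]), test[:, 2]))
\end{verbatim}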
\input{overlaps} From the three aspects it can be seen that the models are fundamentally similar and that the main differences arise from different setups and learning procedures. Thus, the authors examined the two learning methods \textit{stochastic gradient descent} and \textit{bayesian learning} in combination with \textit{biased matrix-factorization} before conducting the actual experiment. For $b_u = b_i = 0$ this is equivalent to \textit{regulated matrix-factorization (RSVD)}. In addition, for $\alpha = \beta = 1$ the \textit{weighted regulated matrix-factorization (WR)} is equivalent to \textit{RSVD}. Thus, the only differences are explained by the different adjustments of the methods. -To prepare the two learning procedures they were initialized with a \textit{gaussian normal distribution} $\mathcal{N}(\mu, 0.1^2)$. The value for the \textit{standard deviation} of 0.1 is the value suggested by the \textit{factorization machine libFM} as the default. In addition, \citet{Rendle13} achieved good results on the \textit{Netflix-Prize-dataset} with this value. Nothing is said about the parameter $\mu$. However, it can be assumed that this parameter is around the \textit{global average} of the \textit{ratings}. This can be assumed because \textit{ratings} are to be \textit{generated} with the \textit{initialization}. +To prepare the two learning procedures they were initialized with a \textit{gaussian normal distribution} $\mathcal{N}(\mu, 0.1^2)$. The value for the \textit{standard deviation} of \textit{0.1} is the value suggested by the \textit{factorization machine libFM} as the default. In addition, \citet{Rendle13} achieved good results on the \textit{Netflix-Prize-dataset} with this value. Nothing is said about the parameter $\mu$. However, it can be assumed that this parameter is around the \textit{global average} of the \textit{ratings}. This can be assumed because \textit{ratings} are to be \textit{generated} with the \textit{initialization}. For both approaches the number of \textit{sampling steps} was then set to \textit{128}. Since \textit{SGD} has two additional \textit{hyperparameters} $\lambda, \gamma$ these were also determined. Overall, the \textit{MovieLens10M-dataset} was evaluated by a \textit{10-fold cross-validation} over a \textit{random global} and \textit{non-overlapping 90:10 split}. In each split, \textit{90\%} of the data was used for \textit{training} and \textit{10\%} of the data was used for \textit{evaluation} without overlapping. In each split, \textit{95\%} of the \textit{training data} was used for \textit{training} and the remaining \textit{5\%} for \textit{evaluation} to determine the \textit{hyperparameters}. The \textit{hyperparameter search} was performed as mentioned in \textit{section} \ref{sec:sgd} using the \textit{grid} $(\lambda \in \{0.02, 0.03, 0.04, 0.05\}, \gamma \in \{0.001, 0.003\})$. This grid was inspired by findings during the \textit{Netflix-Prize} \citep{Kor08, Paterek07}. In total the parameters $\lambda=0.04$ and $\gamma=0.003$ could be determined. Afterwards both \textit{learning methods} and their settings were compared. The \textit{RMSE} was plotted against the used \textit{dimension} $f$ of $p_u, q_i \in \mathbb{R}^f$. \textit{Figure} \ref{fig:battle} shows the corresponding results. \input{battle} \newpage -As a \textit{first intermediate result} of the preparation it can be stated that both \textit{SGD} and \textit{gibbs-samper} achieve better \textit{RMSE values} for increasing \textit{dimensional embedding}. 
+As a \textit{first intermediate result} of the preparation it can be stated that both \textit{SGD} and \textit{gibbs-sampler} achieve better \textit{RMSE} values for increasing \textit{dimensional embedding}. -In addition, it can be stated that learning using the \textit{bayesian approach} is better than learning using \textit{SGD}. Even if the results could be different due to more efficient setups, it is still surprising that \textit{SGD} is worse than the \textit{bayesian approach}, although the \textit{exact opposite} was reported for \textit{MovieLens10M}. For example, \textit{figure} \ref{fig:reported_results} shows that the \textit{bayesian approach BPMF} achieved an \textit{RMSE} of \textit{0.8187} while the \textit{SGD approach Biased MF} performed better with \textit{0.803}. The fact that the \textit{bayesian approach} outperforms \textit{SGD} has already been reported and validated by \citet{Rendle13}, \citet{Rus08} for the \textit{Netflix-Prize-dataset}. Looking more closely at \textit{figures} \ref{fig:reported_results} and \ref{fig:battle}, the \textit{bayesian approach} scores better than the reported \textit{BPMF} and \textit{Biased MF} for each \textit{dimensional embedding}. Moreover, it even beats all reported baselines and new methods. Building on this, the authors have gone into the detailed examination of the methods and baselines. +In addition, it can be stated that learning using the \textit{bayesian approach} is better than learning using \textit{SGD}. Even if more carefully tuned setups could lead to different results, it is still surprising that \textit{SGD} is worse than the \textit{bayesian approach}, although the \textit{exact opposite} was reported for \textit{MovieLens10M}. For example, \textit{figure} \ref{fig:reported_results} shows that the \textit{bayesian approach BPMF} achieved an \textit{RMSE} of \textit{0.8187} while the \textit{SGD approach Biased MF} performed better with \textit{0.803}. The fact that the \textit{bayesian approach} outperforms \textit{SGD} has already been reported and validated by \citet{Rendle13} and \citet{Rus08} for the \textit{Netflix-Prize-dataset}. Looking more closely at \textit{figures} \ref{fig:reported_results} and \ref{fig:battle}, the \textit{bayesian approach} scores better than the reported \textit{BPMF} and \textit{Biased MF} for each \textit{dimensional embedding}. Moreover, it even beats all reported \textit{baselines} and new methods. Building on this, the authors have gone into the detailed examination of the methods and \textit{baselines}. \subsubsection{Experiment Implementation} -For the actual execution of the experiment, the \textit{authors} used the knowledge they had gained from the \textit{preparations}. They noticed already for the two \textit{simple matrix-factorization models SGD-MF} and \textit{Bayesian MF}, which were trained with an \textit{embedding} of \textit{512 dimensions} and over \textit{128 epochs}, that they performed extremely well. Thus \textit{SGD-MF} achieved an \textit{RMSE} of \textit{0.7720}. This result alone was better than: \textit{RSVD (0.8256)}, \textit{Biased MF (0.803)}, \textit{LLORMA (0.7815)}, \textit{Autorec (0.782)}, \textit{WEMAREC (0.7769)} and \textit{I-CFN++ (0.7754)}. In addition, \textit{Bayesian MF} with an \textit{RMSE} of \textit{0.7653} not only beat the \textit{reported baseline BPMF (0.8197)}. It also beat the \textit{best algorithm MRMA (0.7634)}.
-As the \textit{Netflix-Prize} showed, the use of \textit{implicit data} such as \textit{time} or \textit{dependencies} between \textit{users} or \textit{items} could \textit{immensely improve existing models}. In addition to the two \textit{simple matrix factorizations}, \textit{table} \ref{table:models} shows the \textit{extensions} of the \textit{authors} regarding the \textit{bayesian approach}. +For the actual execution of the experiment, the authors used the knowledge they had gained from the preparations. They noticed already for the two \textit{simple matrix-factorization models SGD-MF} and \textit{Bayesian MF}, which were trained with an \textit{embedding} of \textit{512 dimensions} and over \textit{128 epochs}, that they performed extremely well. Thus \textit{SGD-MF} achieved an \textit{RMSE} of \textit{0.7720}. This result alone was better than: \textit{RSVD (0.8256)}, \textit{Biased MF (0.803)}, \textit{LLORMA (0.7815)}, \textit{Autorec (0.782)}, \textit{WEMAREC (0.7769)} and \textit{I-CFN++ (0.7754)}. In addition, \textit{Bayesian MF} with an \textit{RMSE} of \textit{0.7653} not only beat the \textit{reported baseline BPMF (0.8197)}. It also beat the \textit{best algorithm MRMA (0.7634)}. +As the \textit{Netflix-Prize} showed, the use of \textit{implicit data} such as \textit{time} or \textit{dependencies} between \textit{users} or \textit{items} could immensely improve existing models. In addition to the two \textit{simple matrix factorizations}, \textit{table} \ref{table:models} shows the extensions of the authors regarding the \textit{bayesian approach}. \input{model_table} As it turned out that the \textit{bayesian approach} gave more promising results, the given models were trained with it. For this purpose, the \textit{dimensional embedding} as well as the \textit{number of sampling steps} for the models were examined again. Again the \textit{gaussian normal distribution} was used for \textit{initialization} as indicated in \textit{section} \ref{sec:experiment_preparation}. \textit{Figure} \ref{fig:bayes_evaluation} shows the corresponding results. @@ -48,8 +48,8 @@ As it turned out that the \textit{bayesian approach} gave more promising results The first observation that emerges from \textit{figure} \ref{fig:bayes_sampling_steps} is that the \textit{increase} in \textit{sampling steps} with a \textit{fixed dimensional embedding} also results in an \textit{improvement} in \textit{RMSE} for all models. Based on this, \textit{figure} \ref{fig:bayes_dimensional_embeddings} also shows that an \textit{increase} in the \textit{dimensional embedding} for \textit{512 sampling steps} also leads to an \textit{improvement} in the \textit{RMSE} for all models. Thus, both the \textit{number of sampling steps} and the size of the \textit{dimensional embedding} are involved in the \textit{RMSE} of \textit{matrix-factorization models} when they are trained using the \textit{bayesian approach}. \subsubsection{Stronger Baselines} -As a second finding, the \textit{RMSE values} of the created models can be taken from \textit{figure} \ref{fig:bayes_dimensional_embeddings}. Several points can be addressed. Firstly, it can be seen that the \textit{individual inclusion} of \textit{implicit knowledge} such as \textit{time} or \textit{user behaviour} leads to a significant \textit{improvement} in the \textit{RMSE}. 
For example, models like \textit{bayesian timeSVD (0.7587)} and \textit{bayesian SVD++ (0.7563)}, which already use single implicit knowledge, beat the \textit{simple bayesian MF} with an \textit{RMSE} of \textit{0.7633}. In addition, it also shows that the \textit{combination} of \textit{implicit data} further improves the \textit{RMSE}. \textit{Bayesian timeSVD++} achieves an \textit{RMSE} of \textit{0.7523}. Finally, \textit{bayesian timeSVD++ flipped} can achieve an \textit{RMSE} of \textit{0.7485} by adding \textit{more implicit data}. -This results in the third and most significant observation of the experiment. Firstly, the \textit{simple bayesian MF} with an \textit{RMSE} of \textit{0.7633} already beat the best method \textit{MRMA} with an \textit{RMSE} of \textit{0.7634}. Furthermore, the best method \textit{MRMA} could be surpassed with \textit{bayesian timeSVD++} by 0.0149 with respect to the \textit{RMSE}. Such a result is astonishing, as it took \textit{one year} during the \textit{Netflix-Prize} to reduce the leading \textit{RMSE} from \textit{0.8712 (progress award 2007)} to \textit{0.8616 (progress award 2008)}. Additionally, this result is remarkable as it \textit{challenges} the \textit{last 5 years} of research on the \textit{MovieLens10M-dataset}. Based on the results obtained, the \textit{authors} see the first problem with the \textit{results} achieved on the \textit{MovieLens10M-dataset} as being that they were \textit{compared against} too \textit{weak baselines}. +As a second finding, the \textit{RMSE} values of the created models can be taken from \textit{figure} \ref{fig:bayes_dimensional_embeddings}. Several points can be addressed. Firstly, it can be seen that the \textit{individual inclusion} of \textit{implicit knowledge} such as \textit{time} or \textit{user behaviour} leads to a significant \textit{improvement} in the \textit{RMSE}. For example, models like \textit{bayesian timeSVD (0.7587)} and \textit{bayesian SVD++ (0.7563)}, which already use \textit{single implicit knowledge}, beat the \textit{simple bayesian MF} with an \textit{RMSE} of \textit{0.7633}. In addition, the figure also shows that the \textit{combination} of \textit{implicit data} further improves the \textit{RMSE}. \textit{Bayesian timeSVD++} achieves an \textit{RMSE} of \textit{0.7523}. Finally, \textit{bayesian timeSVD++ flipped} can achieve an \textit{RMSE} of \textit{0.7485} by adding \textit{more implicit data}. +This results in the third and most significant observation of the experiment. Firstly, the \textit{simple bayesian MF} with an \textit{RMSE} of \textit{0.7633} already beat the best method \textit{MRMA} with an \textit{RMSE} of \textit{0.7634}. Furthermore, the best method \textit{MRMA} could be surpassed with \textit{bayesian timeSVD++ flipped} by \textit{0.0149} with respect to the \textit{RMSE}. Such a result is astonishing, as it took \textit{one year} during the \textit{Netflix-Prize} to reduce the leading \textit{RMSE} from \textit{0.8712 (Progress Prize 2007)} to \textit{0.8616 (Progress Prize 2008)}. Additionally, this result is remarkable as it \textit{challenges} the \textit{last five years} of research on the \textit{MovieLens10M-dataset}. Based on the results obtained, the \textit{authors} see the first problem with the \textit{results} achieved on the \textit{MovieLens10M-dataset} as being that they were \textit{compared against} too \textit{weak baselines}.
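The size of this step can be made explicit from the numbers quoted in this section and compared with the progress between the two Netflix \textit{Progress Prizes}:
\[
\underbrace{0.7634}_{\text{MRMA}} - \underbrace{0.7485}_{\text{bayesian timeSVD++ flipped}} = 0.0149
\qquad \text{vs.} \qquad
\underbrace{0.8712}_{\text{Progress Prize 2007}} - \underbrace{0.8616}_{\text{Progress Prize 2008}} = 0.0096.
\]
A single, well-tuned family of \textit{baselines} thus moves the state of the art on \textit{MovieLens10M} by more than the hard-won yearly progress of the \textit{Netflix-Prize}.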
From \textit{figure} \ref{fig:corrected_results} the \textit{improved baselines} and the \textit{results} of the \textit{new methods} can be examined. \input{corrected_results} \subsubsection{Reproducibility} @@ -58,4 +58,4 @@ In response, the authors see two main points. The first is \textit{reproducibili \subsubsection{Inadequate Validations} The authors do not doubt the relevance of such methods. They even consider them \textit{necessary} but \textit{not meaningful enough} for the \textit{general goodness} of an \textit{experiment}. Thus, their preparation, which takes up the above mentioned methods, shows that they can achieve meaningful results. -Therefore the authors see the second point of criticism of the results obtained on the \textit{MovieLens10M-dataset} as the \textit{wrong understanding} of \textit{reliable experiments}. The \textit{main reason} given is the \textit{difference} between \textit{scientific} and \textit{industrial work}. For example, during the\textit{ Netflix-Prize}, which represents \textit{industrial work}, \textit{audible sums} were \textit{awarded} for the best results. This had several consequences. Firstly, a \textit{larger community} was addressed to work on the solution of the \textit{recommender problem}. On the other hand, the high number of \textit{competitors} and the \textit{simplicity} in the formulation of the \textit{task} encouraged each participant to investigate the \textit{simplest methods} in \textit{small steps}. The \textit{small-step approach} was also driven by the \textit{standardized guidelines} for the \textit{evaluation} of the methods given in \textit{section} \ref{sec:netflix} and by the \textit{public competition}. Thus, a better understanding of the \textit{basic relationships} could be achieved through the \textit{miniscule evaluation} of hundreds of models. All in all, these insights led to \textit{well-understood} and \textit{sharp baselines} within a \textit{community} that \textit{continuously} worked towards a \textit{common goal} over a total of three years. Such a \textit{motivation} and such a \textit{target-oriented competitive idea} is mostly not available in the \textit{scientific field}. Thus, publications that achieve \textit{better results} with \textit{old methods} are considered \textit{unpublishable}. Instead, experiments are \textit{not questioned} and their \textit{results} are \textit{simply transferred}. In some cases experiments are \textit{repeated exactly as specified} in the specifications. Achieving the \textit{same result} is considered a \textit{valid baseline}. According to the authors, such an approach is \textit{not meaningful} and, by not questioning the \textit{one-off evaluations}, leads to \textit{one-hit-wonders} that \textit{distort} the \textit{sharpness} of the \textit{baselines}. As a result, the \textit{MovieLens10M-dataset} shows that the main results of the last \textit{five years} were \textit{measured} against too \textit{weak baselines}. +Therefore the authors see the second point of criticism of the results obtained on the \textit{MovieLens10M-dataset} as the \textit{wrong understanding} of \textit{reliable experiments}. The \textit{main reason} given is the \textit{difference} between \textit{scientific} and \textit{industrial work}. For example, during the \textit{Netflix-Prize}, which represents \textit{industrial work}, \textit{considerable sums} were \textit{awarded} for the best results. This had several consequences.
Firstly, a \textit{larger community} was attracted to work on the solution of the \textit{recommender problem}. On the other hand, the high number of \textit{competitors} and the \textit{simplicity} in the formulation of the task encouraged each participant to investigate the \textit{simplest methods} in \textit{small steps}. The \textit{small-step approach} was also driven by the \textit{standardized guidelines} for the \textit{evaluation} of the methods given in \textit{section} \ref{sec:netflix} and by the \textit{public competition}. Thus, a better understanding of the \textit{basic relationships} could be achieved through the \textit{meticulous evaluation} of hundreds of models. All in all, these insights led to \textit{well-understood} and \textit{sharp baselines} within a \textit{community} that \textit{continuously} worked towards a \textit{common goal} over a total of \textit{three years}. Such a \textit{motivation} and such a \textit{target-oriented competitive idea} are mostly not available in the \textit{scientific field}. Thus, publications that achieve \textit{better results} with \textit{old methods} are considered \textit{unpublishable}. Instead, experiments are \textit{not questioned} and their \textit{results} are \textit{simply transferred}. In some cases experiments are \textit{repeated exactly as specified} in the instructions. Achieving the \textit{same result} is considered a \textit{valid baseline}. According to the authors, such an approach is \textit{not meaningful} and, by not questioning the \textit{one-off evaluations}, leads to \textit{one-hit-wonders} that \textit{distort} the \textit{sharpness} of the \textit{baselines}. Therefore, the \textit{MovieLens10M-dataset} shows that the main results of the last \textit{five years} were \textit{measured} against too \textit{weak baselines}. diff --git a/battle.tex b/battle.tex index fabf4e946d4fcc59be1c9c3f68741f92015fc190..e2ee3d8498e07b74787dbea7e916ac8218784a9b 100644 --- a/battle.tex +++ b/battle.tex @@ -1,7 +1,7 @@ \begin{figure}[!ht] \centering \includegraphics[scale=0.60]{Bilder/battle.png} - \caption{Comparison of \textit{matrix-factorization} learned by \textit{gibbs-sampling (bayesian learning)} and \textit{stochastic gradient descent (SGD)} for an \textit{embedding dimension} from \textit{16} to \textit{512}. + \caption{Comparison of \textit{matrix-factorization} learned by \textit{gibbs-sampling (bayesian learning)} and \textit{stochastic gradient descent (SGD)} for an \textit{embedding dimension} from \textit{16} to \textit{512} with \textit{128} \textit{sampling-steps}. } \label{fig:battle} \end{figure} diff --git a/conclusion.tex b/conclusion.tex index eb7f631ee93653c3ce623294570727d7c3c781bf..fc9da33b4cda80de8f9d1f61ad320843ef59a80e 100644 --- a/conclusion.tex +++ b/conclusion.tex @@ -1,8 +1,9 @@ \newpage \section{Conclusion} -Overall, Rendle et. al. 2019 concludes that the last five years of research for the MovieLens10M dataset have not really produced any new findings. Although in the presented experiment the best practice of the community was applied, the simplest matrix factorization methods could clearly beat the reported results. Thus, the authors support the thesis that finding and evaluating valid and sharp baselines is not trivial. Empirical data are collected, since there is no formal evidence in the field of recommender systems to make the methods comparable. From the numerical evaluation the authors identify the rating of a work in a scientific context as a major problem.
Here, a publication is classified as not worth publishing if it achieves better results with old methods. Rather, most papers aim to distinguish themselves from the others by using new methods that beat the old ones. In this way, baselines are not questioned and the community is steered in the wrong direction, as their work competes against insufficient baselines. This problem was not only solved during the Netflix award by the horrendous prize money. However, it turns out that the insights gained there were more profound and can be transferred to the MovieLens10M dataset. Thus new techniques but no new elementary knowledge could be achieved on the MovieLens10M data set. -With this paper Rendle et. al. addresses the highly experienced reader. The simple structure of the paper convinces by the clear and direct way in which the problem is identified. Additionally, the paper can be seen as an addendum to the Netflix price. As the authors Rendle and Koren were significantly involved in this competition, the points mentioned above are convincing by the experience they have gained. With their results they support the very simple but not trivial statement that finding good baselines requires an immense effort and this has to be promoted much more in a scientific context. This implies a change in the long-established thinking about the evaluation of scientific work. At this point it is questionable whether it is possible to change existing thinking. This should be considered especially because the scientific sector, unlike the industrial sector, cannot provide financial motivation due to limited resources. On the other hand, it must be considered that the individual focus of a work must also be taken into account. Thus, it is questionable whether the scientific sector is able to create such a large unit with regard to a common goal as Netflix did during the competition. -It should be clearly emphasized that it is immensely important to use sharp baselines as guidelines. However, in a scientific context the goal is not as precisely defined as it was in the Netflix Prize. Rather, a large part of the work is aimed at investigating whether new methods such as neural networks etc. are applicable to the recommender problem. -Regarding the results, however, it has to be said that they clearly support a rethinking even if this should only concern a small part of the work. On the website "Papers with Code" the public Leaderboard regarding the results obtained on the MovieLens10M dataset can be viewed. The source analysis of "Papers with Code" also identifies the results given by Rendle as leading. -Due to the recent publication in spring 2019, this paper has not yet been cited frequently. So time will tell what impact it will have on the community. Nevertheless, XY has already observed similar problems for Top-N-Recommenders based on this paper. According to this, Rendle seems to have recognized an elementary and unseen problem and made it public. Overall the paper has the potential to counteract the general hype whose only purpose is to develop the best and only true model and thus prevent a winter for recommender systems. +Overall, \citet{Rendle19} concludes that the last \textit{five years} of \textit{research} for the \textit{MovieLens10M-dataset} have not really produced any new findings. Although in the presented experiment the \textit{best practice} of the \textit{community} was applied, the \textit{simplest matrix-factorization} methods could clearly beat the reported results. 
Thus, the authors support the thesis that \textit{finding} and \textit{evaluating valid} and \textit{sharp baselines} is \textit{not trivial}. \textit{Empirical results} have to be collected, since there are \textit{no formal proofs} in the field of \textit{recommender systems} that would make the methods comparable. From the \textit{numerical evaluation} the authors identify the \textit{rating of a work} in a \textit{scientific context} as a \textit{major problem}. Here, a \textit{publication} is classified as \textit{not worth publishing} if it achieves \textit{better results with old methods}. Rather, most papers aim to \textit{distinguish themselves} from the others by using new methods that beat the old ones. In this way, \textit{baselines} are \textit{not questioned} and the \textit{community} is steered in the wrong direction, as their work competes against \textit{insufficient} \textit{baselines}. During the \textit{Netflix-Prize}, this problem was solved not only by the \textit{substantial prize money}. It turns out that the \textit{insights} gained there were more \textit{profound} and can be transferred to the \textit{MovieLens10M-dataset}. Thus, \textit{new techniques} but \textit{no new elementary knowledge} were produced on the \textit{MovieLens10M-dataset}. +With this paper \citet{Rendle19} addresses the highly experienced reader. The simple structure of the paper is convincing in the clear and direct way in which the problem is identified. Additionally, the paper can be seen as an \textit{addendum} to the \textit{Netflix-Prize}. As the authors \citet{Rendle} and \citet{Koren} were significantly \textit{involved} in this competition, the points mentioned above are backed by the experience they have gained. With their results they support the very simple but not trivial statement that finding good \textit{baselines} requires an \textit{immense effort} and that this has to be \textit{promoted} much more in a \textit{scientific context}. This implies a change in the \textit{long-established thinking} about the evaluation of scientific work. At this point it is questionable whether it is possible to change existing thinking. This should be considered especially because the scientific sector, unlike the industrial sector, cannot provide financial motivation due to limited resources. On the other hand, the individual focus of each work must also be taken into account. Thus, it is \textit{questionable} whether the \textit{scientific sector} is able to unite so many researchers behind a \textit{common goal} as \textit{Netflix} did during the competition. +It should be clearly emphasized that it is immensely important to use sharp \textit{baselines} as guidelines. However, in a \textit{scientific context} the \textit{goal} is not as \textit{precisely defined} as it was in the \textit{Netflix-Prize}. Rather, a large part of the work is aimed at investigating whether new methods such as \textit{neural networks} etc. are applicable to the \textit{recommender problem}. +Regarding the results, however, it has to be said that they clearly support a \textit{rethinking} even if this should only concern a \textit{small part} of the work. On the website \textit{Papers with Code}\footnote{\url{https://paperswithcode.com/sota/collaborative-filtering-on-movielens-10m}} the \textit{public leaderboard} regarding the results obtained on the \textit{MovieLens10M-dataset} can be viewed.
The source analysis of \textit{Papers with Code} also identifies the results given by \citet{Rendle19} as leading. +In addition, \textit{future work} should focus on a more \textit{in-depth source analysis} which, besides the importance of the \textit{MovieLens10M-dataset} for the \textit{scientific community}, also examines whether and to what extent \textit{other datasets} are affected by this phenomenon. +Due to the recent publication in spring \textit{2019}, this paper has not yet been cited frequently. So time will tell what impact it will have on the \textit{community}. Nevertheless, based on this paper, \citet{Dacrema2019} have already observed similar problems for \textit{top-n-recommenders}. According to this, \citet{Rendle} seems to have recognized an elementary and unseen problem and made it public. This is strongly reminiscent of the so-called \textit{Artificial-Intelligence-Winter (AI-Winter)} in which \textit{stagnation} in the \textit{development} of \textit{artificial intelligence} occurred due to excessive expectations and other unfavourable factors. Overall the paper has the potential to \textit{counteract} the \textit{general hype} whose only purpose is to develop the best and only true model and thus \textit{prevent} a \textit{winter for recommender systems}. diff --git a/introduction.tex b/introduction.tex index 890e6eccdca280bafb0726c798e8f1f20750e398..8a317effd7aa05aae913cc873679f062273d2df5 100644 --- a/introduction.tex +++ b/introduction.tex @@ -8,15 +8,15 @@ Since \citet{JuKa90} first presented \textit{recommender systems} as a kind of i The most diverse subject areas were not only illuminated by the industry. A whole new branch of research also opened up for science. - In their work ``\textit{On the Diffculty of Evaluating Baselines A Study on Recommender Systems}`` \citet{Rendle19} show that current research on the \textit{MovieLens10M} dataset leads in a wrong direction. + In their work ``\textit{On the Difficulty of Evaluating Baselines: A Study on Recommender Systems}'' \citet{Rendle19} show that current research on the \textit{MovieLens10M-dataset} leads in a wrong direction. In addition to general problems, they particularly point out flawed working methods and misunderstood \textit{baselines} by beating them with a number of simple methods such as \textit{matrix-factorization}. - They were able to beat the existing baselines by not taking them for granted. - On the contrary, they questioned them and transferred well evaluated and understood properties of the baselines from the \textit{Netflix-Challenge} to them. + They were able to beat the existing \textit{baselines} by not taking them for granted. + On the contrary, they questioned them and transferred well evaluated and understood properties of the \textit{baselines} from the \textit{Netflix-Prize} to them. -As a result, they were not only able to beat the \textit{baselines} reported for the \textit{MovieLens10M}, but also the newer methods from the last 5 years of research. Therefore, it can be assumed that the current and former results obtained on the \textit{MovieLens10M} dataset were not sufficient to be considered as a true baseline. Thus they show the community a critical error on which can be found not only in the evaluation of \textit{recommender systems} but also in other scientific areas. +As a result, they were not only able to beat the \textit{baselines} reported for the \textit{MovieLens10M-dataset}, but also the newer methods from the last five years of research.
Therefore, it can be assumed that the current and former results obtained on the \textit{MovieLens10M-dataset} were not sufficient to be considered as true \textit{baselines}. Thus they show the \textit{community} a critical error which can be found not only in the evaluation of \textit{recommender systems} but also in other scientific areas. -As a first problem, the authors point out that scientific papers whose focus is on better understanding and improving existing \textit{baselines} do not receive recognition because they do not seem innovative enough. In contrast to industry, which tenders horrendous prizes for researching and improving such \textit{baselines}, there is a lack of such motivation in the scientific field. From the authors point of view, the scientific work on the \textit{MovieLens10M} dataset is misdirected, because one-off evaluations leading to one-hit-wonders, which are then used as a starting point for further work. Thus \citet{Rendle19} points out as a second point of criticism that the need for further basic research for the \textit{MovieLens10M} dataset is not yet exhausted. +As a first problem, the authors point out that scientific papers whose focus is on better understanding and improving existing \textit{baselines} do not receive recognition because they do not seem innovative enough. In contrast to industry, which offers substantial prize money for researching and improving such \textit{baselines}, there is a lack of such motivation in the scientific field. From the authors' point of view, the scientific work on the \textit{MovieLens10M-dataset} is misdirected, because \textit{one-off evaluations} lead to \textit{one-hit-wonders}, which are then used as a starting point for further work. Thus \citet{Rendle19} point out as a second point of criticism that the potential for further basic research on the \textit{MovieLens10M-dataset} is not yet exhausted. This submission takes a critical look at the topic presented by \citet{Rendle19}. In addition, basic terms and the results obtained are presented in a way that is comprehensible to the non-experienced reader. -For this purpose, the submission is divided into three subject areas. First of all, the non-experienced reader is introduced to the topic of recommender systems in the section ``\textit{A Study on Recommender Systems}``. Subsequently, building on the first section, the work in the section ``\textit{On the Diffculty of Evaluating Baselines}`` is presented in detail. The results are then evaluated in a critical discourse. \ No newline at end of file +For this purpose, the submission is divided into three subject areas. First of all, the non-experienced reader is introduced to the topic of \textit{recommender systems} in the section ``\textit{A Study on Recommender Systems}''. Subsequently, building on the first section, the work in the section ``\textit{On the Difficulty of Evaluating Baselines}'' is presented in detail. The results are then evaluated in a critical discourse. \ No newline at end of file diff --git a/recommender.tex b/recommender.tex index cbce45888ae5d57c8885e99c37aa02b64eba3131..d5e57100ce481ca6e4caed91f9bb19a856a5ce4e 100644 --- a/recommender.tex +++ b/recommender.tex @@ -1,27 +1,27 @@ \section{A Study on Recommender Systems} -This section explains the basics of \textit{recommender systems} necessary for the essential understanding of the paper presented. Besides the general definition of the \textit{recommender problem}, the corresponding solution approaches are presented.
Furthermore, the main focus will be on the solution approach of \textit{matrix factorization}. +This section explains the basics of \textit{recommender systems} necessary for the essential understanding of the paper presented. Besides the general definition of the \textit{recommender problem}, the corresponding solution approaches are presented. Furthermore, the main focus will be on the solution approach of \textit{matrix-factorization}. \subsection{Recommender Problem} The \textit{recommender problem} consists of the entries of the sets $\mathcal{U}$ and $\mathcal{I}$, where $\mathcal{U}$ represents the set of all \textit{users} and $\mathcal{I}$ the set of all \textit{items}. Each of the \textit{users} in $\mathcal{U}$ gives \textit{ratings} from a set $\mathcal{S}$ of possible scores for the available \textit{items} in $\mathcal{I}$. The resulting \textit{rating-matrix} $\mathcal{R}$ is composed of $\mathcal{R} = \mathcal{U} \times \mathcal{I}$. The entries in $\mathcal{R}$ indicate the \textit{rating} from \textit{user} $u \in \mathcal{U}$ to \textit{item} $i \in \mathcal{I}$. This entry is then referred to as $r_{ui}$. Due to incomplete \textit{item-ratings}, $\mathcal{R}$ may also be incomplete. In the following, the subset of all \textit{users} who have rated a particular \textit{item} $i$ is referred to as $\mathcal{U}_i$. Similarly, $\mathcal{I}_u$ refers to the subset of \textit{items} that were rated by \textit{user} $u$. Since $\mathcal{R}$ is not completely filled, there are missing values for some \textit{user-item relations}. The aim of the \textit{recommender system} is to estimate the missing \textit{ratings} $\hat{r}_{ui}$ using a \textit{prediction-function} $p(u,i)$. The \textit{prediction-function} consists of $p: \mathcal{U} \times \mathcal{I} \rightarrow \mathcal{S}$ \citep{DeKa11}. In the further course of the work different methods are presented to determine $p(u,i)$. -In the following, the two main approaches of \textit{collaborative-filtering} and \textit{content-based} \textit{recommender systems} will be discussed. In addition, it is explained how \textit{matrix factorization} can be integrated into the two ways of thinking. +In the following, the two main approaches of \textit{collaborative-filtering} and \textit{content-based} \textit{recommender systems} will be discussed. In addition, it is explained how \textit{matrix-factorization} can be integrated into the two ways of thinking. \subsection{Content-Based} -\textit{Content-based} \textit{recommender systems (CB)} work directly with \textit{feature vectors}. Such a \textit{feature vector} can, for example, represent a \textit{user profile}. In this case, this \textit{profile} contains information about the \textit{user's preferences}, such as \textit{genres}, \textit{authors}, \textit{etc}. This is done by trying to create a \textit{model} of the \textit{user}, which best represents his preferences. The different \textit{learning algorithms} from the field of \textit{machine learning} are used to learn or create the \textit{models}. The most prominent \textit{algorithms} are: \textit{tf-idf}, \textit{bayesian learning}, \textit{Rocchio's algorithm} and \textit{neural networks} \citep{Lops11, Ferrari19, DeKa11}. Altogether the built and learned \textit{feature vectors} are compared with each other. Based on their closeness, similar \textit{features} can be used to generate \textit{missing ratings}. Figure \ref{fig:cb} shows a sketch of the general operation of \textit{content-based recommenders}. 
+\textit{Content-based} \textit{recommender systems (CB)} work directly with \textit{feature vectors}. Such a \textit{feature vector} can, for example, represent a \textit{user profile}. In this case, this \textit{profile} contains information about the \textit{user's preferences}, such as \textit{genres}, \textit{authors}, \textit{etc}. This is done by trying to create a \textit{model} of the \textit{user}, which best represents his preferences. The different \textit{learning algorithms} from the field of \textit{machine learning} are used to learn or create the \textit{models}. The most prominent \textit{algorithms} are: \textit{tf-idf}, \textit{bayesian learning}, \textit{Rocchio's algorithm} and \textit{neural networks} \citep{Lops11, Ferrari19, DeKa11}. Altogether the built and learned \textit{feature vectors} are compared with each other. Based on their closeness, similar \textit{features} can be used to generate \textit{missing ratings}. Figure \ref{fig:cb} shows a sketch of the general operation of \textit{content-based recommenders}. \subsection{Collaborative-Filtering} Unlike the \textit{content-based recommender}, the \textit{collaborative-filtering recommender (CF)} not only considers individual \textit{users} and \textit{feature vectors}, but rather a \textit{like-minded neighborhood} of each \textit{user}. Missing \textit{user ratings} can be extracted by this \textit{neighbourhood} and \textit{networked} to form a whole. It is assumed that a \textit{missing rating} of the considered \textit{user} $u$ for an unknown \textit{item} $i$ will be similar to the \textit{rating} of a \textit{user} $v$ as soon as $u$ and $v$ have rated some \textit{items} similarly. The similarity of the \textit{users} is determined by the \textit{community ratings}. This type of \textit{recommender system} is also known by the term \textit{neighborhood-based recommender} \citep{DeKa11}. The main focus of \textit{neighbourhood-based methods} is on the application of iterative methods such as \textit{k-nearest-neighbours} or \textit{k-means}. A \textit{neighborhood-based recommender} can be viewed from two angles: The first and best known problem is the so-called \textit{user-based prediction}. Here, the \textit{missing ratings} of a considered \textit{user} $u$ are to be determined from his \textit{neighborhood} $\mathcal{N}_i(u)$. -$\mathcal{N}_i(u)$ denotes the subset of the \textit{neighborhood} of all \textit{users} who have a similar manner of evaluation to $u$ via the \textit{item} $i$. The second problem is that of \textit{item-based prediction}. Analogously, the similarity of the items is determined by their received ratings. +$\mathcal{N}_i(u)$ denotes the subset of the \textit{neighborhood} of all \textit{users} who rate in a similar manner to $u$ and have rated the \textit{item} $i$. The second problem is that of \textit{item-based prediction}. Analogously, the similarity of the \textit{items} is determined by their received \textit{ratings}. This kind of problem considers the \textit{neighborhood} $\mathcal{N}_u(i)$ of all \textit{items} $i$ which were rated similarly by the \textit{user} $u$. The similarity between the objects of a \textit{neighborhood} is determined by \textit{distance functions} such as \textit{mean-squared-difference}, \textit{pearson-correlation} or \textit{cosine-similarity}. -Figure \ref{fig:cf} shows a sketch of the general operation of the \textit{collaborative-filtering recommender}.
+Figure \ref{fig:cf} shows a sketch of the general operation of \textit{content-based} and \textit{collaborative-filtering} \textit{recommenders}. \input{content-based-collaborative-filtering-comparison} \subsection{Matrix-Factorization}\label{sec:mf} -The core idea of \textit{matrix factorization} is to supplement the not completely filled out \textit{rating-matrix} $\mathcal{R}$. For this purpose the \textit{users} and \textit{items} are to be mapped to a joined \textit{latent feature space} with \textit{dimensionality} $f$. The \textit{user} is represented by the vector $p_u \in \mathbb{R}^{f}$ and the item by the vector $q_i \in \mathbb{R}^{f}$. As a result, the \textit{missing ratings} and thus the \textit{user-item interaction} are to be determined via the \textit{inner product} $\hat{r}_{ui}=q_i^Tp_u$ of the corresponding vectors \citep{Kor09}. In the following, the four most classical matrix factorization approaches are described in detail. Afterwards, the concrete learning methods with which the vectors are learned are presented. In addition, the \textit{training data} for which a \textit{concrete rating} is available should be referred to as $\mathcal{B} = \lbrace(u,i) | r_{ui} \in \mathcal{R}\rbrace$. +The core idea of \textit{matrix-factorization} is to fill in the missing entries of the only partially filled \textit{rating-matrix} $\mathcal{R}$. For this purpose the \textit{users} and \textit{items} are to be mapped to a joint \textit{latent feature space} with \textit{dimensionality} $f$. The \textit{user} is represented by the vector $p_u \in \mathbb{R}^{f}$ and the \textit{item} by the vector $q_i \in \mathbb{R}^{f}$. As a result, the \textit{missing ratings} and thus the \textit{user-item interaction} are to be determined via the \textit{inner product} $\hat{r}_{ui}=q_i^Tp_u$ of the corresponding vectors \citep{Kor09}. In the following, the four most classical \textit{matrix-factorization} approaches are described in detail. Afterwards, the concrete learning methods with which the vectors are learned are presented. In addition, the \textit{training data} for which a \textit{concrete rating} is available should be referred to as $\mathcal{B} = \lbrace(u,i) | r_{ui} \in \mathcal{R}\rbrace$. \subsubsection{Basic Matrix-Factorization} The first and easiest way to solve \textit{matrix-factorization} is to connect the \textit{feature vectors} of the \textit{users} and the \textit{items} using the \textit{inner product}. The result is the \textit{user-item interaction}. In addition, the \textit{error} should be as small as possible. Therefore, $\min_{p_u, q_i}{\sum_{(u,i) \in \mathcal{B}} (r_{ui} - \hat{r}_{ui})^{2}}$ is defined as an associated \textit{minimization problem}. @@ -39,14 +39,14 @@ The most popular model that takes \textit{bias} into account is called \textit{b In addition, the \textit{missing rating} is no longer determined only by the \textit{inner product} of the two vectors $q_i$ and $p_u$. Rather, the \textit{bias} is also considered. Accordingly, a \textit{missing rating} is calculated by $\hat{r}_{ui} = b_{ui} + q_i^Tp_u$, where $b_{ui}$ is the \textit{bias} of a \textit{user} $u$ and an \textit{item} $i$. The \textit{bias} is determined by $b_{ui}=\mu + b_u + b_i$. The parameter $\mu$ is the \textit{global average} of all \textit{ratings} $r_{ui} \in \mathcal{R}$. Furthermore, $b_u = \mu_u - \mu$ and $b_i = \mu_i - \mu$. Here $\mu_u$ denotes the \textit{average} of all \textit{assigned ratings} of the \textit{user} $u$.
Similarly, $\mu_i$ denotes the \textit{average} of all \textit{received ratings} of an \textit{item} $i$. -Thus $b_u$ indicates the \textit{deviation} of the \textit{average assigned rating} of a \textit{user} from the \textit{global average}. Similarly, $b_i$ indicates the \textit{deviation} of the \textit{average rating} of an item from the \textit{global average}. +Thus $b_u$ indicates the \textit{deviation} of the \textit{average assigned rating} of a \textit{user} from the \textit{global average}. Similarly, $b_i$ indicates the \textit{deviation} of the \textit{average rating} of an \textit{item} from the \textit{global average}. In addition, the \textit{minimization problem} can be extended by the \textit{bias}. Accordingly, the \textit{minimization problem} is then $\min_{p_u, q_i}{\sum_{(u,i) \in \mathcal{B}} (r_{ui} - \hat{r}_{ui})^{2}} + \lambda(\lVert q_i \rVert^2 + \lVert p_u \rVert^2 + b_u^2 + b_i^2)$. Analogous to the \textit{regulated matrix-factorization}, the values $b_u$ and $b_i$ are penalized in addition to $\lVert q_i \rVert, \lVert p_u \rVert$. In this case $b_u, b_i$ are penalized more if they assume a large value and thus deviate strongly from the \textit{global average}. \subsubsection{Advanced Matrix-Factorization}\label{subsec:amf} This section is intended to show that there are \textit{other approaches} to \textit{matrix-factorization}. Thus, \textit{implicit data} can also be included. First of all, it should be mentioned that \textit{temporal dynamics} can also be included. -On the one hand, it is not realistic that a \textit{user} cannot change his taste. On the other hand, the properties of an \textit{item} remain constant. Therefore, \textit{missing ratings} can also be determined \textit{time-based}. A \textit{missing rating} is then determined by $\hat{r}_{ui}=\mu + b_i(t) + b_u(t) + q_i^{T}p_u(t)$ \citep{Kor09}. +On the one hand, it is not realistic to assume that a \textit{user} never changes his taste. On the other hand, the properties of an \textit{item} may also not remain constant. Therefore, \textit{missing ratings} can also be determined \textit{time-based}. A \textit{missing rating} is then determined by $\hat{r}_{ui}=\mu + b_i(t) + b_u(t) + q_i^{T}p_u(t)$ \citep{Kor09}. As a second possibility, \textit{implicit influence} can be included. This can involve the \textit{properties} of the \textit{items} a \textit{user} is dealing with. A \textit{missing rating} can be determined by $\hat{r}_{ui}=\mu + b_i + b_u + q_i^{T}(p_u + |\mathcal{I}_u|^{-\frac{1}{2}}\sum_{j \in \mathcal{I}_u}{y_j})$. $y_j \in \mathbb{R}^{f}$ describes the \textit{feature vectors} of the \textit{items} $j \in \mathcal{I}_u$ which have been evaluated by \textit{user} $u$. The corresponding \textit{minimization problems} can be adjusted as mentioned in the sections above \citep{Kor08}. \subsection{Optimization and Learning} @@ -54,7 +54,7 @@ An important point that does not emerge from the above points is the question of \subsubsection{Stochastic Gradient Descent} \label{sec:sgd} -The best known and most common method when it comes to \textit{machine learning} is \textit{stochastic gradient descent (SGD)}. The goal of \textit{SGD} is to \textit{minimize} the \textit{error} of a given \textit{objective function}. Thus the estimators mentioned in section \ref{sec:mf} can be used as \textit{objective functions}. In the field of \textit{recommender systems}, \citet{Funk06} presented a \textit{modified} variant of \textit{SGD} in the context of the \textit{Netflix Challenge}.
\textit{SGD} can be applied to \textit{regulated matrix-factorization} with \textit{bias} as well as without \textit{bias}. This method can be described by the following pseudo code: +The best known and most common method when it comes to \textit{machine learning} is \textit{stochastic gradient descent (SGD)}. The goal of \textit{SGD} is to \textit{minimize} the \textit{error} of a given \textit{objective function}. Thus the estimators mentioned in section \ref{sec:mf} can be used as \textit{objective functions}. In the field of \textit{recommender systems}, \citet{Funk06} presented a \textit{modified} variant of \textit{SGD} in the context of the \textit{Netflix-Prize}. \textit{SGD} can be applied to \textit{regulated matrix-factorization} with \textit{bias} as well as without \textit{bias}. This method can be described by the following pseudo code: \begin{algorithm}\label{alg:sgd} \caption{SGD of Funk} \begin{algorithmic}[1] diff --git a/references.bib b/references.bib index c66f3e4bc777019c86313766a815656940b5372c..fd227a7f3791cc73ec87a11433133de9a561543b 100644 --- a/references.bib +++ b/references.bib @@ -123,6 +123,12 @@ doi = {10.1145/1390156.1390267} howpublished = {\url{https://ieeexplore.ieee.org/author/37414256700}}, note = {Accessed: 2019-12-21}, } +@misc{Rendle, + author = {Steffen Rendle}, + title = {Papers of Steffen Rendle}, + howpublished = {\url{https://dblp.org/pers/hd/r/Rendle:Steffen}}, + note = {Accessed: 2020-01-20}, +} @article{Kurucz07, author = {Miklós Kurucz and András Benczúr and Károly Csalogány}, year = {2007}, @@ -175,4 +181,11 @@ month = {01}, pages = {}, title = {Improving regularized singular value decomposition for collaborative filtering}, journal = {Proceedings of KDD Cup and Workshop} +} +@article{Dacrema2019, + title={A Troubling Analysis of Reproducibility and Progress in Recommender Systems Research}, + author={Maurizio Ferrari Dacrema and Simone Boglio and Paolo Cremonesi and Dietmar Jannach}, + journal={ArXiv}, + year={2019}, + volume={abs/1911.07698} } \ No newline at end of file diff --git a/reported_results.tex b/reported_results.tex index aed2f9c0c39928ee521bd3ea186a170fafd004b7..544ace19258812d28acfc4c235299386e314ba50 100644 --- a/reported_results.tex +++ b/reported_results.tex @@ -1,6 +1,6 @@ \begin{figure}[!ht] \centering \includegraphics[scale=0.60]{Bilder/reported_results.png} - \caption{\textit{Results obtained} on the \textit{MovieLens10M-dataset} over the last \textit{5 years}. The \textit{y-axis} shows the corresponding \textit{RMSE} values and the \textit{x-axis} shows the \textit{year} in which the corresponding method was developed. \textit{Blue} marked points show \textit{newer methods} that have \textit{competed} against the points shown in \textit{black}. \citep{Rendle19}} + \caption{\textit{Results obtained} on the \textit{MovieLens10M-dataset} over the last \textit{five years}. The \textit{y-axis} shows the corresponding \textit{RMSE} values and the \textit{x-axis} shows the \textit{year} in which the corresponding method was developed. \textit{Blue} marked points show \textit{newer methods} that have \textit{competed} against the points shown in \textit{black}. \citep{Rendle19}} \label{fig:reported_results} \end{figure} \ No newline at end of file diff --git a/submission.pdf b/submission.pdf index ac4351a8af21bcc219a1ad4dacdbcc0de0772dc0..8c4d9126edde0be0b2ace2f2a631aa7b768dc130 100644 Binary files a/submission.pdf and b/submission.pdf differ
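As a companion to the \textit{SGD} pseudo code of \citet{Funk06} referenced in \textit{recommender.tex} above, the following is a minimal Python sketch of such a Funk-style update for \textit{biased matrix-factorization}. It is not the implementation used by \citet{Rendle19}; the function name and the defaults for $f$ and the number of epochs are chosen freely for illustration, while $\lambda=0.04$ and $\gamma=0.003$ correspond to the hyperparameters determined in the experiment preparation.
\begin{verbatim}
import numpy as np

def sgd_biased_mf(ratings, n_users, n_items, f=16, epochs=128,
                  lam=0.04, gamma=0.003, seed=0):
    # ratings: list of (u, i, r) triples with integer ids
    rng = np.random.default_rng(seed)
    mu = np.mean([r for _, _, r in ratings])  # global average rating
    p = rng.normal(0.0, 0.1, (n_users, f))    # user factors p_u, std 0.1
    q = rng.normal(0.0, 0.1, (n_items, f))    # item factors q_i, std 0.1
    b_u = np.zeros(n_users)                   # user biases
    b_i = np.zeros(n_items)                   # item biases
    for _ in range(epochs):
        for u, i, r in ratings:
            err = r - (mu + b_u[u] + b_i[i] + q[i] @ p[u])
            b_u[u] += gamma * (err - lam * b_u[u])
            b_i[i] += gamma * (err - lam * b_i[i])
            p_u_old = p[u].copy()
            p[u] += gamma * (err * q[i] - lam * p[u])
            q[i] += gamma * (err * p_u_old - lam * q[i])
    return mu, b_u, b_i, p, q
\end{verbatim}
A prediction for a pair $(u,i)$ is then $\hat{r}_{ui} = \mu + b_u + b_i + q_i^T p_u$, exactly as defined in the \textit{biased matrix-factorization} section.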