diff --git a/recommenders/evaluation/python_evaluation.py b/recommenders/evaluation/python_evaluation.py
index e9adf621a..dff164ab4 100644
--- a/recommenders/evaluation/python_evaluation.py
+++ b/recommenders/evaluation/python_evaluation.py
@@ -541,6 +541,63 @@ def recall_at_k(
     return (df_hit_count["hit"] / df_hit_count["actual"]).sum() / n_users
 
 
+def r_precision_at_k(
+    rating_true,
+    rating_pred,
+    col_user=DEFAULT_USER_COL,
+    col_item=DEFAULT_ITEM_COL,
+    col_prediction=DEFAULT_PREDICTION_COL,
+    relevancy_method="top_k",
+    k=DEFAULT_K,
+    threshold=DEFAULT_THRESHOLD,
+    **_,
+):
+    """R-precision at K.
+
+    R-precision can be defined as the precision@R for each user, where R is the
+    number of relevant items for the query. It is also equivalent to the recall at
+    the R-th position.
+
+    Note:
+        Since R can be large, k acts as a cap on the maximum possible R.
+        If every user has more than k true items, then r-precision@k is equal to
+        precision@k. You might need to raise k to get meaningful results.
+
+    Args:
+        rating_true (pandas.DataFrame): True DataFrame
+        rating_pred (pandas.DataFrame): Predicted DataFrame
+        col_user (str): column name for user
+        col_item (str): column name for item
+        col_prediction (str): column name for prediction
+        relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
+            top k items are directly provided, so there is no need to compute the relevancy operation.
+        k (int): number of top k items per user
+        threshold (float): threshold of top items per user (optional)
+
+    Returns:
+        float: r-precision at k (min=0, max=1). The maximum value is 1 even when fewer than
+        k items exist for a user in rating_true.
+    """
+    df_hit, df_hit_count, n_users = merge_ranking_true_pred(
+        rating_true=rating_true,
+        rating_pred=rating_pred,
+        col_user=col_user,
+        col_item=col_item,
+        col_prediction=col_prediction,
+        relevancy_method=relevancy_method,
+        k=k,
+        threshold=threshold,
+    )
+
+    if df_hit.shape[0] == 0:
+        return 0.0
+
+    df_merged = df_hit.merge(df_hit_count[[col_user, "actual"]])
+    df_merged = df_merged[df_merged["rank"] <= df_merged["actual"]]
+
+    return (df_merged.groupby(col_user).size() / df_hit_count.set_index(col_user)["actual"]).mean()
+
+
 def ndcg_at_k(
     rating_true,
     rating_pred,
@@ -824,6 +881,7 @@ def get_top_k_items(
     exp_var.__name__: exp_var,
     precision_at_k.__name__: precision_at_k,
     recall_at_k.__name__: recall_at_k,
+    r_precision_at_k.__name__: r_precision_at_k,
     ndcg_at_k.__name__: ndcg_at_k,
     map_at_k.__name__: map_at_k,
     map.__name__: map,
diff --git a/tests/unit/recommenders/evaluation/test_python_evaluation.py b/tests/unit/recommenders/evaluation/test_python_evaluation.py
index 4f0d4730b..e2f6dc149 100644
--- a/tests/unit/recommenders/evaluation/test_python_evaluation.py
+++ b/tests/unit/recommenders/evaluation/test_python_evaluation.py
@@ -25,6 +25,7 @@
     exp_var,
     get_top_k_items,
     precision_at_k,
+    r_precision_at_k,
     recall_at_k,
     ndcg_at_k,
     map_at_k,
@@ -366,6 +367,20 @@ def test_python_recall_at_k(rating_true, rating_pred, rating_nohit):
     assert recall_at_k(rating_true, rating_pred, k=10) == pytest.approx(0.37777, TOL)
 
 
+def test_python_r_precision(rating_true, rating_pred, rating_nohit):
+    assert r_precision_at_k(
+        rating_true=rating_true,
+        rating_pred=rating_true,
+        col_prediction=DEFAULT_RATING_COL,
+        k=10,
+    ) == pytest.approx(1, TOL)
+    assert r_precision_at_k(rating_true, rating_nohit, k=5) == 0.0
+    assert r_precision_at_k(rating_true, rating_pred, k=3) == pytest.approx(0.21111, TOL)
+    assert r_precision_at_k(rating_true, rating_pred, k=5) == pytest.approx(0.24444, TOL)
+    # Equivalent to precision
+    assert r_precision_at_k(rating_true, rating_pred, k=10) == pytest.approx(0.37777, TOL)
+
+
 def test_python_auc(rating_true_binary, rating_pred_binary):
     assert auc(
         rating_true=rating_true_binary,
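For review context only (not part of the patch): a minimal sketch of the computation `r_precision_at_k` performs, done by hand with pandas on toy data. The DataFrames, column names, and the simplified hit matching below are illustrative assumptions; the actual function delegates ranking and relevancy handling to `merge_ranking_true_pred` and then applies the same "keep only hits ranked within the first R positions, average the per-user ratio" step.

```python
# Toy walk-through of R-precision@k with plain pandas.
# All data and column names here are illustrative, not the library's fixtures.
import pandas as pd

k = 3
# Ground truth: user 1 has R=2 relevant items, user 2 has R=3.
rating_true = pd.DataFrame(
    {"userID": [1, 1, 2, 2, 2], "itemID": [10, 11, 20, 21, 22]}
)
# Top-k recommendations per user, already ranked (1 = best).
rating_pred = pd.DataFrame(
    {
        "userID": [1, 1, 1, 2, 2, 2],
        "itemID": [10, 12, 11, 20, 21, 23],
        "rank": [1, 2, 3, 1, 2, 3],
    }
)

# Hits: recommended items that appear in the ground truth.
df_hit = rating_pred.merge(rating_true, on=["userID", "itemID"])
# R per user (implicitly capped by k, as the docstring's note explains).
actual = rating_true.groupby("userID").size().rename("actual")
df_hit = df_hit.join(actual, on="userID")
# R-precision counts only hits ranked within the first R positions.
hits_within_r = df_hit[df_hit["rank"] <= df_hit["actual"]].groupby("userID").size()
r_precision = (hits_within_r / actual).fillna(0).mean()

# User 1: hits at ranks 1 and 3, R=2 -> only rank 1 counts -> 1/2.
# User 2: hits at ranks 1 and 2, R=3 -> 2/3. Mean = (0.5 + 0.6667) / 2 ~= 0.5833.
print(round(r_precision, 4))  # 0.5833
```

The key difference from plain precision@k is the denominator: each user is scored against their own R (the "actual" count) rather than a fixed k.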