Fix eval metrics (#5)
* update ndcg to use log base 2 and relevance scores > 1

* replace binary search with brute force in np/cp kernels

* add ndcg test on nfcorpus
edknv committed Nov 4, 2023
1 parent d6563d2 commit df5e74b
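
For context, "log base 2" here means each relevance score is discounted by log2(rank + 1), so graded labels above 1 contribute proportionally instead of being treated as binary. A minimal NumPy sketch of the metric, with illustrative helper names that are not crossfit's API:

import numpy as np

def dcg(relevance, k=None):
    """DCG with log-base-2 discounts: sum of rel_i / log2(i + 1) over ranks i = 1..k."""
    rel = np.asarray(relevance, dtype=float)[:k]
    discounts = np.log2(np.arange(2, rel.size + 2))  # rank 1 -> log2(2) = 1
    return float(np.sum(rel / discounts))

def ndcg(relevance, k=None):
    """Normalize by the DCG of the ideal (descending) ordering; NaN if undefined."""
    ideal = np.sort(np.asarray(relevance, dtype=float))[::-1]
    idcg = dcg(ideal, k)
    return dcg(relevance, k) / idcg if idcg > 0 else float("nan")

# Graded relevance (scores > 1) near the top of the ranking is rewarded:
print(ndcg([3, 2, 0, 1], k=4))  # ~0.985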
Showing 9 changed files with 120 additions and 100 deletions.
67 changes: 21 additions & 46 deletions crossfit/backend/cupy/kernels.py
@@ -1,47 +1,25 @@
 from numba import cuda
 
 
-@cuda.jit(device=True)
-def cuda_searchsorted(arr, val, side):
-    """
-    Binary search on a sorted array.
-
-    ====== ============================
-    `side` returned index `i` satisfies
-    ====== ============================
-    0      ``arr[i-1] < val <= arr[i]``
-    1      ``arr[i-1] <= val < arr[i]``
-    ====== ============================
-    """
-    left = 0
-    right = len(arr)
-    while left < right:
-        mid = (left + right) // 2
-        if arr[mid] < val or (side == 1 and arr[mid] <= val):
-            left = mid + 1
-        else:
-            right = mid
-    return left
-
-
 @cuda.jit
 def _numba_lookup(A_indptr, A_cols, A_data, B, vals):
     i = cuda.grid(1)
 
-    n_rows_a = len(A_indptr) - 1
-    if n_rows_a == len(B):
-        ind_start, ind_end = A_indptr[i], A_indptr[i + 1]
-        for j in range(B.shape[1]):
-            left_idx = cuda_searchsorted(A_cols[ind_start:ind_end], B[i][j], 0)
-            right_idx = cuda_searchsorted(A_cols[ind_start:ind_end], B[i][j], 1)
-            if left_idx != right_idx:
-                vals[i][j] = A_data[ind_start:ind_end][left_idx]
-    else:
-        for j in range(B.shape[1]):
-            left_idx = cuda_searchsorted(A_cols, B[i][j], 0)
-            right_idx = cuda_searchsorted(A_cols, B[i][j], 1)
-            if left_idx != right_idx:
-                vals[i][j] = A_data[left_idx]
+    if i < B.shape[0]:
+        n_rows_a = len(A_indptr) - 1
+        if n_rows_a == len(B):
+            ind_start, ind_end = A_indptr[i], A_indptr[i + 1]
+            for j in range(B.shape[1]):
+                for k in range(ind_start, ind_end):
+                    if A_cols[k] == B[i][j]:
+                        vals[i][j] = A_data[k]
+                        break
+        else:
+            for j in range(B.shape[1]):
+                for k in range(len(A_cols)):
+                    if A_cols[k] == B[i][j]:
+                        vals[i][j] = A_data[k]
+                        break
 
 
 @cuda.jit
@@ -74,15 +52,12 @@ def _numba_setop(self_idx_ptr, self_col_idx, self_data, other_idx_ptr, other_col
         os, oe = other_idx_ptr[i], other_idx_ptr[i + 1]
 
         for j in range(ss, se):
-            left_idx = cuda_searchsorted(other_col_idx[os:oe], self_col_idx[j], 0)
-            right_idx = cuda_searchsorted(other_col_idx[os:oe], self_col_idx[j], 1)
-
-            if intersect:
-                found = left_idx == right_idx
-            else:
-                found = left_idx != right_idx
-
-            if found:
+            found = False
+            for k in range(os, oe):
+                if self_col_idx[j] == other_col_idx[k]:
+                    found = True
+                    break
+            if (intersect and not found) or (not intersect and found):
                 self_data[j] = 0

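The brute-force rewrite drops the kernel's implicit requirement that column indices within a CSR row be sorted, which binary search needs to return correct hits. A CPU-side sketch of what the new `_numba_lookup` computes, written in plain NumPy/SciPy for intuition (this is not the CUDA launch itself, and it assumes `vals` starts zero-initialized):

import numpy as np
from scipy.sparse import csr_matrix

def lookup_reference(A: csr_matrix, B: np.ndarray) -> np.ndarray:
    """For each row i and query column B[i, j], scan row i's entries for a match."""
    vals = np.zeros(B.shape, dtype=A.data.dtype)
    for i in range(B.shape[0]):
        start, end = A.indptr[i], A.indptr[i + 1]
        for j in range(B.shape[1]):
            for k in range(start, end):  # linear scan: no sorted-column assumption
                if A.indices[k] == B[i, j]:
                    vals[i, j] = A.data[k]
                    break
    return vals

A = csr_matrix(np.array([[0.0, 5.0, 0.0], [7.0, 0.0, 9.0]]))
print(lookup_reference(A, np.array([[1, 2], [2, 0]])))  # [[5. 0.] [9. 7.]]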
2 changes: 1 addition & 1 deletion crossfit/backend/cupy/sparse.py
@@ -122,7 +122,7 @@ def _setop(self, other, mode):
     def sort(self):
         from crossfit.backend.cupy.kernels import _numba_sort, determine_blocks_threads
 
-        blocks, threads = determine_blocks_threads(len(self.idx_ptr))
+        blocks, threads = determine_blocks_threads(len(self.idx_ptr) - 1)
        _numba_sort[blocks, threads](self.idx_ptr, self.col_idx, self.data)
 
     def intersection(self, other):
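The `- 1` matters because a CSR `indptr` array carries one sentinel entry past the last row, so `len(idx_ptr)` over-counts the rows by one and would launch a thread that indexes past the matrix. A quick illustration of the invariant with SciPy:

import numpy as np
from scipy.sparse import csr_matrix

m = csr_matrix(np.eye(3))
assert len(m.indptr) == m.shape[0] + 1  # indptr has n_rows + 1 entries
n_rows = len(m.indptr) - 1              # one thread per row, not per indptr entry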
33 changes: 18 additions & 15 deletions crossfit/backend/numpy/sparse.py
@@ -168,16 +168,18 @@ def _numba_lookup(A_indptr, A_cols, A_data, B):
     if n_rows_a == len(B):
         for i in numba.prange(B.shape[0]):
             ind_start, ind_end = A_indptr[i], A_indptr[i + 1]
-            left_idx = np.searchsorted(A_cols[ind_start:ind_end], B[i])
-            right_idx = np.searchsorted(A_cols[ind_start:ind_end], B[i], side="right")
-            found = left_idx != right_idx
-            vals[i][found] = A_data[ind_start:ind_end][left_idx[found]]
+            for j in range(len(B[i])):
+                for k in range(ind_start, ind_end):
+                    if A_cols[k] == B[i][j]:
+                        vals[i][j] = A_data[k]
+                        break
     else:
         for i in numba.prange(B.shape[0]):
-            left_idx = np.searchsorted(A_cols, B[i])
-            right_idx = np.searchsorted(A_cols, B[i], side="right")
-            found = left_idx != right_idx
-            vals[i][found] = A_data[left_idx[found]]
+            for j in range(len(B[i])):
+                for k in range(len(A_cols)):
+                    if A_cols[k] == B[i][j]:
+                        vals[i][j] = A_data[k]
+                        break
 
     return vals

@@ -197,13 +199,14 @@ def _numba_setop(self_idx_ptr, self_col_idx, self_data, other_idx_ptr, other_col
         ss, se = self_idx_ptr[i], self_idx_ptr[i + 1]
         os, oe = other_idx_ptr[i], other_idx_ptr[i + 1]
 
-        left_idx = np.searchsorted(other_col_idx[os:oe], self_col_idx[ss:se])
-        right_idx = np.searchsorted(other_col_idx[os:oe], self_col_idx[ss:se], side="right")
-        if intersect:
-            found = left_idx == right_idx
-        else:
-            found = left_idx != right_idx
-        self_data[ss:se][found] = 0
+        for j in range(ss, se):
+            found = False
+            for k in range(os, oe):
+                if self_col_idx[j] == other_col_idx[k]:
+                    found = True
+                    break
+            if (intersect and not found) or (not intersect and found):
+                self_data[j] = 0
 
 
 @numba.njit
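For intuition, the row-wise set operation that the rewritten `_numba_setop` performs: intersection zeroes entries whose column is missing from the other row, difference zeroes entries that are present. A sketch in plain Python (the kernels use the linear scan above instead of a hash set, since numba device code cannot build one):

import numpy as np

def setop_row(self_cols, self_data, other_cols, intersect):
    """Zero out self_data entries per the intersection/difference rule."""
    out = self_data.copy()
    other = set(int(c) for c in other_cols)
    for j, c in enumerate(self_cols):
        found = int(c) in other
        if (intersect and not found) or (not intersect and found):
            out[j] = 0
    return out

cols, data = np.array([0, 2, 5]), np.array([1.0, 2.0, 3.0])
print(setop_row(cols, data, np.array([2, 5, 7]), intersect=True))   # [0. 2. 3.]
print(setop_row(cols, data, np.array([2, 5, 7]), intersect=False))  # [1. 0. 0.]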
9 changes: 8 additions & 1 deletion crossfit/metric/ranking/ndcg.py
@@ -9,7 +9,7 @@ class DCG(RankingMetric):
     SCALERS = {"identity": lambda x: x, "power": lambda x: np.power(x, 2) - 1}
     LOGS = {"2": lambda x: np.log2(x), "e": lambda x: np.log(x)}
 
-    def __init__(self, k=None, relevance_scaling="identity", log_base="e"):
+    def __init__(self, k=None, relevance_scaling="identity", log_base="2"):
         self._k = k
         if relevance_scaling not in self.SCALERS:
             raise ValueError("Relevance scaling must be 'identity' or 'power'.")
@@ -38,4 +38,11 @@ def _score(self, y_true: SparseLabels, y_pred_labels: MaskedArray):
         ideal_labels = y_true.get_labels_for(y_true.as_rankings(), self._k)
         idcg = self._dcg(y_true, ideal_labels)
 
+        ndcg = dcg / idcg
+
+        if idcg.shape[0] == 1 and ndcg.shape[0] > 1:
+            idcg = np.ones_like(ndcg) * idcg
+
+        ndcg[idcg == 0] = np.NaN
+
         return dcg / idcg
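The broadcast appears to handle the case where a single ideal ranking is shared across queries, and the NaN guard covers queries with no relevant documents, where IDCG is 0 and a bare division would produce inf or a 0/0 warning rather than an explicitly undefined score. A sketch of the guard's intent:

import numpy as np

dcg = np.array([2.13, 0.0, 1.0])
idcg = np.array([4.26, 0.0, 1.0])  # second query has no positive labels

ndcg = dcg / np.where(idcg == 0, np.nan, idcg)  # undefined, not inf
print(ndcg)  # [0.5 nan 1. ]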
2 changes: 1 addition & 1 deletion crossfit/metric/ranking/precision.py
@@ -14,7 +14,7 @@ def __init__(self, k, truncated=False):
         super().__init__(k)
         self._truncated = truncated
 
-    def _precision(self, y_true: SparseBinaryLabels, y_pred_labels: MaskedArray):
+    def _precision(self, y_true: SparseLabels, y_pred_labels: MaskedArray):
         n_pos = y_true.get_n_positives(y_pred_labels.shape[0])
         n_relevant = np.sum(
             (y_pred_labels.data[:, : self._k] == 1)
6 changes: 3 additions & 3 deletions crossfit/metric/ranking/recall.py
@@ -1,6 +1,6 @@
 import numpy as np
 
-from crossfit.metric.ranking.base import BinaryRankingMetric, SparseBinaryLabels
+from crossfit.metric.ranking.base import BinaryRankingMetric, SparseLabels
 from crossfit.data.array.masked import MaskedArray
 
 
@@ -9,10 +9,10 @@ def __init__(self, k, truncated=False):
         super().__init__(k)
         self._truncated = truncated
 
-    def _recall(self, y_true: SparseBinaryLabels, y_pred_labels: MaskedArray):
+    def _recall(self, y_true: SparseLabels, y_pred_labels: MaskedArray):
         n_pos = y_true.get_n_positives(y_pred_labels.shape[0])
         n_relevant = np.sum(
-            (y_pred_labels.data[:, : self._k] == 1) & (~y_pred_labels.mask[:, : self._k]), axis=-1
+            (y_pred_labels.data[:, : self._k] >= 1) & (~y_pred_labels.mask[:, : self._k]), axis=-1
         )
 
         scores = np.NaN * np.zeros_like(n_relevant, dtype=float)
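The `>= 1` comparison is what lets recall count graded labels: once observations carry relevance scores above 1, a retrieved document labeled 2 or 3 would silently fail an `== 1` check. For instance:

import numpy as np

retrieved = np.array([[3, 0, 1, 2]])    # relevance grades of the top-4 results
print(np.sum(retrieved == 1, axis=-1))  # [1]  misses the grade-2 and grade-3 hits
print(np.sum(retrieved >= 1, axis=-1))  # [3]  every positively graded hit counts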
60 changes: 29 additions & 31 deletions crossfit/report/beir/report.py
@@ -13,7 +13,7 @@
 from crossfit.report.beir.embed import embed
 from crossfit.calculate.aggregate import Aggregator
 from crossfit.metric.continuous.mean import Mean
-from crossfit.metric.ranking import AP, NDCG, Precision, Recall, SparseBinaryLabels, SparseRankings
+from crossfit.metric.ranking import AP, NDCG, Precision, Recall, SparseBinaryLabels, SparseNumericLabels, SparseRankings
 from crossfit.report.base import Report
 from crossfit.op.vector_search import VectorSearchOp
 from crossfit.backend.torch.model import Model
@@ -34,13 +34,13 @@ def __init__(
         self.metrics = metrics
 
     def prepare(self, df):
-        encoder = self.create_label_encoder(df, ["corpus-index-pred", "corpus-index-obs"])
-        obs_csr = self.create_csr_matrix(df["corpus-index-obs"], df["score-obs"], encoder)
-        pred_csr = self.create_csr_matrix(df["corpus-index-pred"], df["score-pred"], encoder)
+        encoder = create_label_encoder(df, ["corpus-index-pred", "corpus-index-obs"])
+        obs_csr = create_csr_matrix(df["corpus-index-obs"], df["score-obs"], encoder)
+        pred_csr = create_csr_matrix(df["corpus-index-pred"], df["score-pred"], encoder)
 
         # TODO: Fix dispatch
-        labels = SparseBinaryLabels(CrossSparse.from_matrix(obs_csr))
-        rankings = SparseRankings(CrossSparse.from_matrix(pred_csr))
+        labels = SparseNumericLabels.from_matrix(obs_csr)
+        rankings = SparseRankings.from_scores(pred_csr)
 
         outputs = {}
         with crossarray:
@@ -49,42 +49,40 @@ def prepare(self, df):
                 metric_at_k = metric(k=k)
                 result = metric_at_k.score(labels, rankings)
 
-                # TODO: Does this make sense?
-                result = np.nan_to_num(result)
-                result = np.where(result > 1, 1, result)
-
                 outputs[metric_at_k.name()] = Mean.from_array(result, axis=0)
 
         return outputs
 
-    def create_label_encoder(self, df, cols) -> LabelEncoder:
-        # Extract leaves (flattened arrays)
-        _leaves = []
-
-        for col in cols:
-            _leaves.append(df[col].list.leaves)
-
-        # Concatenate and get unique values for fit_transform
-        all_ids = cudf.concat(_leaves).unique()
-
-        # Label Encoding
-        le = LabelEncoder()
-        le.fit(all_ids)
-
-        return le
-
-    def create_csr_matrix(self, ids, scores, label_encoder: LabelEncoder):
-        num_rows = scores.size
-        num_columns = label_encoder.classes_.shape[0]
-
-        values = scores.list.leaves.values.astype(cp.float32)
-        indices = label_encoder.transform(ids.list.leaves).values
-        indptr = scores.list._column.offsets.values
-        sparse_matrix = cp.sparse.csr_matrix(
-            (values, indices, indptr), shape=(num_rows, num_columns)
-        )
-
-        return sparse_matrix
+
+def create_label_encoder(df, cols) -> LabelEncoder:
+    # Extract leaves (flattened arrays)
+    _leaves = []
+
+    for col in cols:
+        _leaves.append(df[col].list.leaves)
+
+    # Concatenate and get unique values for fit_transform
+    all_ids = cudf.concat(_leaves).unique()
+
+    # Label Encoding
+    le = LabelEncoder()
+    le.fit(all_ids)
+
+    return le
+
+
+def create_csr_matrix(ids, scores, label_encoder: LabelEncoder):
+    num_rows = scores.size
+    num_columns = label_encoder.classes_.shape[0]
+
+    values = scores.list.leaves.values.astype(cp.float32)
+    indices = label_encoder.transform(ids.list.leaves).values
+    indptr = scores.list._column.offsets.values
+    sparse_matrix = cp.sparse.csr_matrix(
+        (values, indices, indptr), shape=(num_rows, num_columns)
+    )
+
+    return sparse_matrix
 
 
 def join_predictions(data, predictions):
1 change: 0 additions & 1 deletion tests/metrics/ranking/test_ndcg.py
@@ -105,7 +105,6 @@ def test_numeric_score(self, y_gold, y_pred, expect, params):
         ],
     )
     def test_binary_score(self, y_gold, y_pred, expect, params):
-        print(y_gold, y_pred)
         y_gold = SparseBinaryLabels.from_positive_indices(y_gold)
         if len(y_pred) == 0 or [] in y_pred:
             with pytest.warns(UserWarning):
40 changes: 39 additions & 1 deletion tests/report/beir/test_report.py
@@ -1,12 +1,20 @@
 import pytest
 
 pytest.importorskip("cupy")
+beir = pytest.importorskip("beir")
 
+import numpy as np
+
 import crossfit as cf
+from crossfit.data.sparse.ranking import SparseNumericLabels, SparseRankings
+from crossfit.metric.ranking import NDCG
+from crossfit.report.beir.report import (create_csr_matrix,
+                                         create_label_encoder,
+                                         join_predictions)
 
 
 @pytest.mark.singlegpu
-@pytest.mark.parametrize("dataset", ["hotpotqa", "nq"])
+@pytest.mark.parametrize("dataset", ["nq"])
 def test_beir_report(
     dataset, model_name="sentence-transformers/all-MiniLM-L6-v2", k=10
 ):
@@ -34,3 +42,33 @@ def test_beir_report(
     assert ("split", "test") in report.result_df.index.values.tolist()
     for col in expected_columns:
         assert report.result_df.loc[("split", "test"), col].item() > 0.0
+
+
+@pytest.mark.singlegpu
+@pytest.mark.parametrize("dataset", ["hotpotqa"])
+def test_no_invalid_scores(dataset, model_name="sentence-transformers/all-MiniLM-L6-v2", k=10):
+    model = cf.SentenceTransformerModel(model_name)
+    vector_search = cf.TorchExactSearch(k=k)
+    embeds = cf.embed(
+        dataset,
+        model,
+        vector_search=vector_search,
+        overwrite=True,
+        tiny_sample=True,
+    )
+    test = embeds.data.test.ddf()
+    test["split"] = "test"
+    df = join_predictions(test, embeds.predictions).compute()
+
+    encoder = create_label_encoder(df, ["corpus-index-pred", "corpus-index-obs"])
+    obs_csr = create_csr_matrix(df["corpus-index-obs"], df["score-obs"], encoder)
+    pred_csr = create_csr_matrix(df["corpus-index-pred"], df["score-pred"], encoder)
+
+    labels = SparseNumericLabels.from_matrix(obs_csr)
+    rankings = SparseRankings.from_scores(pred_csr)
+
+    ndcg = NDCG(5).score(labels, rankings)
+
+    assert ndcg.min() >= 0
+    assert ndcg.max() <= 1
+    assert not np.isinf(ndcg).any()
