diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst
index 1e9a9542..ea6a8823 100644
--- a/docs/source/api/index.rst
+++ b/docs/source/api/index.rst
@@ -49,6 +49,7 @@ DISTANCES
     ~dot_product.DotProduct
     ~euclidean.Euclidean
     ~manhattan.Manhattan
+    ~jaccard.Jaccard
 
 EVAL
 ----
diff --git a/quaterion/distances/__init__.py b/quaterion/distances/__init__.py
index 69b5fe39..8799060b 100644
--- a/quaterion/distances/__init__.py
+++ b/quaterion/distances/__init__.py
@@ -4,6 +4,7 @@
 from quaterion.distances.cosine import Cosine
 from quaterion.distances.dot_product import DotProduct
 from quaterion.distances.euclidean import Euclidean
+from quaterion.distances.jaccard import Jaccard
 from quaterion.distances.manhattan import Manhattan
 
 
@@ -14,6 +15,7 @@ class Distance(str, Enum):
     COSINE = "cosine"
     DOT_PRODUCT = "dot_product"
     MANHATTAN = "manhattan"
+    JACCARD = "jaccard"
 
     @staticmethod
     def get_by_name(name: str) -> BaseDistance:
@@ -26,6 +28,7 @@ def get_by_name(name: str) -> BaseDistance:
             "euclidean": Euclidean,
             "manhattan": Manhattan,
             "dot_product": DotProduct,
+            "jaccard": Jaccard,
         }
 
         try:
diff --git a/quaterion/distances/jaccard.py b/quaterion/distances/jaccard.py
new file mode 100644
index 00000000..22807934
--- /dev/null
+++ b/quaterion/distances/jaccard.py
@@ -0,0 +1,53 @@
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+
+from quaterion.distances.base_distance import BaseDistance
+
+
+class Jaccard(BaseDistance):
+    """Compute Weighted Jaccard distances (and their interpretation as similarities).
+
+    Note:
+        The implementation of Weighted Jaccard
+        (https://en.wikipedia.org/wiki/Jaccard_index#Weighted_Jaccard_similarity_and_distance)
+        supports Tensors with non-negative float values. A pair of all-zero
+        vectors has no defined similarity: the ratio is 0 / 0, which yields NaN.
+    """
+
+    @staticmethod
+    def distance(x: Tensor, y: Tensor) -> Tensor:
+        """Return `1 - similarity(x, y)`, reduced over the last dimension."""
+        return 1 - Jaccard.similarity(x, y)
+
+    @staticmethod
+    def similarity(x: Tensor, y: Tensor) -> Tensor:
+        """Return `sum(min(x, y)) / sum(max(x, y))` over the last dimension.
+
+        `x` and `y` are broadcast against each other before the reduction.
+        """
+        min_sum = torch.minimum(x, y).sum(dim=-1)
+        max_sum = torch.maximum(x, y).sum(dim=-1)
+        return min_sum / max_sum
+
+    @staticmethod
+    def distance_matrix(x: Tensor, y: Optional[Tensor] = None) -> Tensor:
+        """Return pairwise distances between rows of `x` and rows of `y`.
+
+        `y` defaults to `x`. The inputs are passed through unchanged because
+        `similarity_matrix` already inserts the broadcast dimensions.
+        """
+        return 1 - Jaccard.similarity_matrix(x, y)
+
+    @staticmethod
+    def similarity_matrix(x: Tensor, y: Optional[Tensor] = None) -> Tensor:
+        """Return pairwise similarities between rows of `x` and rows of `y`.
+
+        `y` defaults to `x`. For `x` of shape (n, d) and `y` of shape (m, d)
+        the result has shape (n, m).
+        """
+        if y is None:
+            y = x
+        return Jaccard.similarity(x.unsqueeze(1), y.unsqueeze(0))
diff --git a/tests/test_distances.py b/tests/test_distances.py
index c48524be..f4d2c294 100644
--- a/tests/test_distances.py
+++ b/tests/test_distances.py
@@ -12,6 +12,13 @@ class TestDistances:
         ]
     )
 
+    x_2 = torch.tensor(
+        [
+            [1.0, 1.5, 2.0, 3.0],
+            [0.5, 2.5, 2.5, 1.0],
+        ]
+    )
+
     x_dim = x.size()[0]
     expected = {
         "cosine": {
@@ -30,6 +37,10 @@ class TestDistances:
             "similarity_matrix": torch.tensor([[16.25, -16.25], [-16.25, 16.25]]),
             "distance_matrix": torch.tensor([[-16.25, 16.25], [16.25, -16.25]]),
         },
+        "jaccard": {
+            "similarity_matrix": torch.tensor([[1.0000, 0.5556], [0.5556, 1.0000]]),
+            "distance_matrix": torch.tensor([[0.0000, 0.4444], [0.4444, 0.0000]]),
+        },
     }
 
     @pytest.mark.parametrize(