Skip to content
This repository was archived by the owner on Dec 1, 2024. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -151,3 +151,6 @@ dmypy.json
# Pyre type checker
.pyre/
.Rproj.user

# Pycharm IDE folder
.idea
2 changes: 2 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,5 @@ dependencies:
- sphinx-design
- parse
- git+https://github.com/OlivierBinette/StringCompare.git@release
- editdistance
- dill
838,026 changes: 838,026 additions & 0 deletions examples/summary/assignee.ipynb

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions pv_evaluation/benchmark/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@
"pairwise precision": pairwise_precision,
"pairwise recall": pairwise_recall,
# "pairwise f1": pairwise_fscore,
"cluster precision": cluster_precision,
"cluster recall": cluster_recall,
#"cluster precision": cluster_precision,
#"cluster recall": cluster_recall,
# "cluster f1": cluster_fscore
# "rand index": rand_score,
}
Expand Down
2,414 changes: 2,414 additions & 0 deletions pv_evaluation/data/assignee/rawassignee_baseline.csv

Large diffs are not rendered by default.

127 changes: 127 additions & 0 deletions pv_evaluation/summary/assignee_disambiguation_summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
from itertools import combinations
import pandas as pd

from pv_evaluation.summary.utils import EuclideanDistance, DistanceMetric, calculate_silhouette_a_i, \
calculate_silhouette_b_i
from pv_evaluation.summary.disambiguation_summary import DisambiguationSummary


class AssigneeDisambiguationSummary(DisambiguationSummary):
    """Disambiguation summary specialized for assignee clusters.

    Expects the data to contain "patent_id", "assignee_id", "data_label" and
    "cluster_label" columns, and adds intra/inter-cluster distance and
    silhouette-score computations on top of the base class summaries.
    """

    def __init__(self, data, name):
        """Initialize an assignee disambiguation summary.

        Args:
            data: Path to the disambiguation data (csv, tsv or parquet format)
                or an already-loaded DataFrame (see base class).
            name (str): Name of the disambiguation algorithm to show in plots.
        """
        super().__init__(data, name, id_field="assignee_id")

        # Caches populated by the collect_* methods below.
        self.intra_cluster_distances = {}
        self.inter_cluster_distances = {}
        self.intra_point_distances = {}
        self.dataset_inter_cluster_distance = None
        self.cluster_diameters = {}

    def _validate_data(self):
        """Assert that every column required for assignee summaries is present."""
        for col in ["patent_id", "assignee_id", "data_label", "cluster_label"]:
            assert (col in self._data.columns) or (col in [self._data.index.name]), f"{col} is not in the data columns."

    def get_intra_cluster_distance(self, *args, **kwargs):
        """Return cached intra-cluster distance(s).

        Args:
            cluster_name (optional, positional or keyword): When given, return
                the cached entry for that cluster (or None if absent);
                otherwise return the whole cache dict.
        """
        cluster_name = None
        # BUG FIX: the original tested ``len(args) > 1`` so a single
        # positional argument was silently ignored.
        if len(args) >= 1:
            cluster_name = args[0]
        cluster_name = kwargs.get("cluster_name", cluster_name)
        if cluster_name is not None:
            return self.intra_cluster_distances.get(cluster_name, None)
        else:
            return self.intra_cluster_distances

    def get_inter_cluster_distance(self, cluster_name1, cluster_name2):
        """Return the cached distance between two clusters, order-insensitively."""
        distance = self.inter_cluster_distances.get((cluster_name1, cluster_name2), None)
        if distance is None:
            # BUG FIX: the original repeated the same key. Pairs produced by
            # itertools.combinations are cached in only one order, so fall
            # back to the reversed key.
            distance = self.inter_cluster_distances.get((cluster_name2, cluster_name1), None)
        return distance

    def get_dataset_inter_cluster_distance(self):
        """Return the dataset-level inter-cluster distance (None until collected)."""
        return self.dataset_inter_cluster_distance

    def collect_intra_cluster_distance(self, distance_metric=EuclideanDistance()):
        """
        Collects the within-cluster distance for all clusters in the dataset.

        Args:
            distance_metric (DistanceMetric): An object representing the type of distance measure to use to calculate distance. (Euclidean or Correlation). Default: Euclidean

        NOTE(review): the default metric instance is created once at function
        definition time and shared across calls -- presumably stateless; confirm.
        """
        for cluster_name in self._data.cluster_label.unique():
            mention_names = self._data[self._data.cluster_label == cluster_name].data_label
            points_generator = combinations(mention_names, r=2)
            try:
                mean_distance, pointwise_distances = distance_metric.multi_point_distance(points_generator)
                self.intra_cluster_distances[cluster_name] = (mean_distance, pointwise_distances)
                # The cluster diameter is the largest pairwise distance.
                self.cluster_diameters[cluster_name] = max(pointwise_distances.values())
            except ZeroDivisionError:
                # Singleton cluster: no pairs, so no defined intra-cluster distance.
                self.intra_cluster_distances[cluster_name] = None

    def collect_inter_cluster_distance(
        self, distance_metric: DistanceMetric = EuclideanDistance(), measure_using: str = "centroid"
    ):
        """
        Collects the between-cluster distances for all cluster pairs in the dataset.

        Args:
            distance_metric: An object representing the type of distance measure to use to calculate distance. (Euclidean or Correlation)
            measure_using: Cluster comparison mechanism. 'centroid' or 'members'.
                Only 'centroid' is implemented; the metric is applied to the
                cluster labels themselves -- TODO confirm intended semantics.

        Raises:
            NotImplementedError: If measure_using is not 'centroid'.
            Exception: If the dataset contains a single cluster only.
        """
        cluster_names = self._data.cluster_label.unique()
        if measure_using != "centroid":
            raise NotImplementedError
        points_generator = combinations(cluster_names, 2)
        try:
            self.dataset_inter_cluster_distance, self.inter_cluster_distances = distance_metric.multi_point_distance(
                points_generator
            )
        except ZeroDivisionError:
            raise Exception("There is only one cluster in the dataset")

    def calculate_silhouette_score(self):
        """
        Calculate silhouette score for all records in the dataset. https://en.wikipedia.org/wiki/Silhouette_(clustering)
        Calculate the silhouette score formula: s(i) = (b(i) - a(i)) / max{a(i), b(i)} where i = current record.

        Requires collect_intra_cluster_distance() to have been called first.
        Stores the result as a new "silhouette_score" column on the data.
        """
        silhouette_scores = {}
        for record_idx in self._data.index:
            record_cluster_label = self._data.loc[record_idx, "cluster_label"]
            record_label = self._data.loc[record_idx, "data_label"]
            current_cluster_distances = self.intra_cluster_distances[record_cluster_label]
            # s(i) = 0 if |C| = 1 (singleton cluster).
            if current_cluster_distances is None:
                silhouette_scores[record_idx] = 0
            else:
                a = calculate_silhouette_a_i(record_label, current_cluster_distances[1])
                # NOTE(review): b(i) is computed from cluster labels only, not
                # from inter-cluster distances -- verify helper semantics.
                b = calculate_silhouette_b_i(record_cluster_label, self._data.cluster_label.tolist())
                silhouette_scores[record_idx] = (b - a) / max(a, b)

        # Update _data with the calculated scores.
        # BUG FIX: the column name was misspelled "sihouette_score".
        silhouette_series = pd.Series(silhouette_scores, name="silhouette_score")
        self._data = self._data.join(silhouette_series)

    @classmethod
    def get_example_summary(cls):
        """Build a summary from the bundled rawassignee_baseline.csv example file."""
        try:
            import importlib.resources as pkg_resources
        except ImportError:
            # Try backported to PY<37 `importlib_resources`.
            import importlib_resources as pkg_resources
        from pv_evaluation.data import assignee
        # BUG FIX: construct the instance while the context manager is still
        # open (the resource path may be a temporary extraction deleted on
        # exit), and pass the path as the positional `data` argument --
        # __init__ has no `datapath` parameter.
        with pkg_resources.path(package=assignee, resource='rawassignee_baseline.csv') as p:
            return cls(str(p), name='test')


def _demo():
    # Smoke-test run on the bundled example data: collect distances, score
    # silhouettes, and print the dataset-level inter-cluster distance.
    summary = AssigneeDisambiguationSummary.get_example_summary()
    summary.collect_inter_cluster_distance()
    summary.collect_intra_cluster_distance()
    summary.calculate_silhouette_score()
    print(summary.get_dataset_inter_cluster_distance())


if __name__ == '__main__':
    _demo()
210 changes: 210 additions & 0 deletions pv_evaluation/summary/disambiguation_summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
import numpy as np
import pandas as pd
import plotly.express as px
from .utils import read_auto


class DisambiguationSummary:
    """Report summaries of a disambiguation (clustering) result."""

    def __init__(self, data, name=None, id_field="inventor_id"):
        """Report disambiguation summaries.

        Args:
            data (str or DataFrame): Path to the disambiguation data (csv, tsv
                or parquet format), or an already-loaded DataFrame.
                The data should have the column specified in the id_field parameter as well
                as three columns "patent_id", "name_first", and "name_last".
            name (str): Name of the disambiguation algorithm to show in plots.
                Defaults to the data path when a path is given.
            id_field (str): Column name of unique entity identifiers, such as "inventor_id" or "assignee_id". Defaults to "inventor_id".
        """
        self.id_field = id_field

        self.name = name
        if isinstance(data, str):
            self._data = read_auto(data)
            if self.name is None:
                self.name = data
        else:
            self._data = data

        self._validate_data()

        # Lazy initialization: each summary is computed on first access.
        # (The original duplicated this block and the validation call.)
        self._cluster_size_distribution = None
        self._cluster_unique_name_distribution = None
        self._homonymy_rate_distribution = None

    def _validate_data(self):
        """Assert that all required columns (or the index) are present."""
        for col in ["patent_id", self.id_field, "name_first", "name_last"]:
            assert (col in self._data.columns) or (col in [self._data.index.name]), f"{col} is not in the data columns."

    def get_cluster_size_distribution(self):
        """Get the cluster size distribution summary.

        Returns:
            DataFrame with columns "Number of patents" and "Number of inventors"
            (value counts of cluster sizes).
        """
        if self._cluster_size_distribution is None:
            counts = self.get_cluster_sizes_dd()["Number of patents"].value_counts()
            # BUG FIX: name the index and values explicitly instead of relying
            # on the column names produced by reset_index(), which changed in
            # pandas 2.0 (the old code renamed a no-longer-existing "index"
            # column and mislabeled the result).
            self._cluster_size_distribution = (
                counts.rename_axis("Number of patents").rename("Number of inventors").reset_index()
            )

        return self._cluster_size_distribution.copy()

    def get_cluster_sizes_dd(self):
        """Return the number of patents per disambiguated inventor.

        The first occurrence of the inventor's first and last name is kept in
        the resulting dataframe. NOTE(review): the original docstring claimed
        "mode" and a Dask return type; the code uses "first" aggregation and
        returns whatever frame type self._data is -- confirm Dask support.
        """
        return (
            self._data.groupby(self.id_field)
            .agg({"patent_id": "count", "name_first": "first", "name_last": "first"})
            .rename(columns={"patent_id": "Number of patents"})
        )

    def plot_cluster_size_distribution(self, range=(0, 20)):
        """Plot the distribution of the number of patents per inventor.

        Args:
            range (tuple, optional): x-axis limits (inclusive range for the number of patents by inventor). Defaults to (0, 20).

        Returns:
            Plotly graph object.
        """
        data = self.get_cluster_size_distribution().reset_index()
        fig = px.bar(
            data,
            x="Number of patents",
            y="Number of inventors",
            title="Distribution of the number of patents per inventor",
        )
        fig.update_xaxes(range=range)

        # Rescale the y axis to the tallest bar inside the requested x range.
        ylim = max(data["Number of inventors"][data["Number of patents"].between(range[0], range[1])])
        fig.update_yaxes(range=(0, ylim), autorange=False)

        fig.update_traces(name=self.name)
        return fig

    def get_top_inventors(self, n=10):
        """Get DataFrame of n most prolific inventors.

        Args:
            n (int, optional): Number of inventors to return. Defaults to 10.

        Returns:
            DataFrame containing the sorted top n most prolific inventors.
        """
        return self.get_cluster_sizes_dd().sort_values(by="Number of patents", ascending=False).head(n)

    def entropy_curve(self, q_range=np.linspace(0, 2)):
        """Compute Hill-number entropies of the cluster size distribution.

        Args:
            q_range: Iterable of Hill-number orders q.

        Returns:
            tuple: (list of Hill numbers, the q_range used).
        """
        data = self.get_cluster_size_distribution()
        # Normalize counts to proportions.
        data["Number of inventors"] = data["Number of inventors"] / sum(data["Number of inventors"])

        def hill_number(arr, q):
            # q = 1 is the exponential of Shannon entropy (limit case);
            # q = 0 is species richness; otherwise the general Hill formula.
            if q == 1:
                I = arr > 0
                return np.exp(-np.sum(arr[I] * np.log(arr[I])))
            elif q == 0:
                return np.sum(arr > 0)
            else:
                return np.sum(arr ** (q)) ** (1 / (1 - q))

        return [hill_number(data["Number of inventors"], q) for q in q_range], q_range

    def plot_entropy_curve(self, q_range=np.linspace(0, 2)):
        """Plot the Hill-numbers entropy curve over q_range."""
        ent, q = self.entropy_curve(q_range)
        fig = px.line(x=q, y=ent, title="Hill Numbers entropy curve", labels={"x": "q", "y": "Entropy"})
        fig.update_traces(name=self.name)
        return fig

    def get_cluster_unique_name_distribution(self):
        """Get the proportion of homogeneous clusters (no name variation) by cluster size."""
        if self._cluster_unique_name_distribution is None:
            self._cluster_unique_name_distribution = (
                self.get_cluster_sizes_dd()
                .join(
                    # True when every mention in the cluster has the same first name.
                    self._data.groupby(self.id_field)["name_first"]
                    .apply(lambda x: len(set(x)) == 1)
                    .rename("Proportion of unique name")
                )
                .groupby("Number of patents")
                .agg({"Proportion of unique name": "mean"})
                .reset_index()
            )

        return self._cluster_unique_name_distribution.copy()

    def plot_cluster_unique_name_distribution(self, range=(0, 100)):
        """Plot the proportion of homogeneous clusters (no name variation) by cluster size."""
        data = self.get_cluster_unique_name_distribution()
        fig = px.bar(
            data,
            x="Number of patents",
            y="Proportion of unique name",
            title="Proportion of homogeneous clusters (no name variation) by cluster size",
        )
        fig.update_xaxes(range=range)

        ylim = max(data["Proportion of unique name"][data["Number of patents"].between(range[0], range[1])])
        fig.update_yaxes(range=(0, ylim), autorange=False)

        fig.update_traces(name=self.name)
        return fig

    def get_homonymy_rate_distribution(self):
        """Get homonymy rates by cluster size.

        The homonymy rate is the proportion of clusters which share at least one name mention with another cluster.
        """
        if self._homonymy_rate_distribution is None:
            # A "homophone" key is the full name mention "first:last".
            data = self._data.assign(
                inventor_id2=self._data[self.id_field], homophones=self._data.name_first + ":" + self._data.name_last
            )

            # Per cluster: number of patents, and how many of its mentions
            # have a name shared with at least one other cluster.
            dat = (
                data.join(
                    data.groupby("homophones")["inventor_id2"].apply(lambda x: len(set(x)) > 1).rename("Shared name"),
                    on="homophones",
                )[["Shared name", "inventor_id2", "patent_id"]]
                .groupby("inventor_id2")
                .agg({"patent_id": "count", "Shared name": "sum"})
            )

            # BUG FIX: a cluster counts as homonymous as soon as it has at
            # least ONE shared mention (docstring above); the original used
            # "> 1", which missed clusters with exactly one shared mention.
            dat = dat.assign(shared_name_prop=np.where(dat["Shared name"].values >= 1, 1, 0))
            # "mean" (string) instead of np.mean: the latter is deprecated in
            # pandas >= 2.0 groupby aggregation.
            result = dat.groupby("patent_id")["shared_name_prop"].agg("mean")

            self._homonymy_rate_distribution = result.reset_index().rename(
                columns={"shared_name_prop": "Homonymy rate", "patent_id": "Number of patents"}
            )

        return self._homonymy_rate_distribution.copy()

    def plot_homonymy_rate_distribution(self, range=(0, 100)):
        """Plot homonymy rate by cluster size.

        The homonymy rate is the proportion of clusters which share at least one name mention with another cluster.
        """
        data = self.get_homonymy_rate_distribution().reset_index()
        fig = px.bar(
            data,
            x="Number of patents",
            y="Homonymy rate",
            title="Homonymy rate by cluster size",
        )
        fig.update_xaxes(range=range)

        ylim = max(data["Homonymy rate"][data["Number of patents"].between(range[0], range[1])])
        fig.update_yaxes(range=(0, ylim), autorange=False)

        fig.update_traces(name=self.name)
        return fig
Loading