Skip to content
This repository was archived by the owner on Dec 1, 2024. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -151,3 +151,6 @@ dmypy.json
# Pyre type checker
.pyre/
.Rproj.user

# Pycharm IDE folder
.idea
2 changes: 2 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,5 @@ dependencies:
- sphinx-design
- parse
- git+https://github.com/OlivierBinette/StringCompare.git@release
- editdistance
- dill
838,026 changes: 838,026 additions & 0 deletions examples/summary/assignee.ipynb

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions pv_evaluation/benchmark/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@
"pairwise precision": pairwise_precision,
"pairwise recall": pairwise_recall,
# "pairwise f1": pairwise_fscore,
"cluster precision": cluster_precision,
"cluster recall": cluster_recall,
#"cluster precision": cluster_precision,
#"cluster recall": cluster_recall,
# "cluster f1": cluster_fscore
# "rand index": rand_score,
}
Expand Down
2,414 changes: 2,414 additions & 0 deletions pv_evaluation/data/assignee/rawassignee_baseline.csv

Large diffs are not rendered by default.

127 changes: 127 additions & 0 deletions pv_evaluation/summary/assignee_disambiguation_summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
from itertools import combinations
import pandas as pd

from pv_evaluation.summary.utils import EuclideanDistance, DistanceMetric, calculate_silhouette_a_i, \
calculate_silhouette_b_i
from pv_evaluation.summary.disambiguation_summary import DisambiguationSummary


class AssigneeDisambiguationSummary(DisambiguationSummary):
    """Disambiguation summary specialized for assignee clusters.

    Expects the data to contain "patent_id", "assignee_id", "data_label" and
    "cluster_label" columns, and adds intra/inter-cluster distance and
    silhouette-score computations on top of the base class summaries.
    """

    def __init__(self, data, name):
        """Initialize an assignee disambiguation summary.

        Args:
            data: Path to the disambiguation data (csv, tsv or parquet format)
                or an already-loaded DataFrame (see base class).
            name (str): Name of the disambiguation algorithm to show in plots.
        """
        super().__init__(data, name, id_field="assignee_id")

        # Caches populated by the collect_* methods below.
        self.intra_cluster_distances = {}
        self.inter_cluster_distances = {}
        self.intra_point_distances = {}
        self.dataset_inter_cluster_distance = None
        self.cluster_diameters = {}

    def _validate_data(self):
        """Assert that every column required for assignee summaries is present."""
        for col in ["patent_id", "assignee_id", "data_label", "cluster_label"]:
            assert (col in self._data.columns) or (col in [self._data.index.name]), f"{col} is not in the data columns."

    def get_intra_cluster_distance(self, *args, **kwargs):
        """Return cached intra-cluster distance(s).

        Args:
            cluster_name (optional, positional or keyword): When given, return
                the cached entry for that cluster (or None if absent);
                otherwise return the whole cache dict.
        """
        cluster_name = None
        # BUG FIX: the original tested ``len(args) > 1`` so a single
        # positional argument was silently ignored.
        if len(args) >= 1:
            cluster_name = args[0]
        cluster_name = kwargs.get("cluster_name", cluster_name)
        if cluster_name is not None:
            return self.intra_cluster_distances.get(cluster_name, None)
        else:
            return self.intra_cluster_distances

    def get_inter_cluster_distance(self, cluster_name1, cluster_name2):
        """Return the cached distance between two clusters, order-insensitively."""
        distance = self.inter_cluster_distances.get((cluster_name1, cluster_name2), None)
        if distance is None:
            # BUG FIX: the original repeated the same key. Pairs produced by
            # itertools.combinations are cached in only one order, so fall
            # back to the reversed key.
            distance = self.inter_cluster_distances.get((cluster_name2, cluster_name1), None)
        return distance

    def get_dataset_inter_cluster_distance(self):
        """Return the dataset-level inter-cluster distance (None until collected)."""
        return self.dataset_inter_cluster_distance

    def collect_intra_cluster_distance(self, distance_metric=EuclideanDistance()):
        """
        Collects the within-cluster distance for all clusters in the dataset.

        Args:
            distance_metric (DistanceMetric): An object representing the type of distance measure to use to calculate distance. (Euclidean or Correlation). Default: Euclidean

        NOTE(review): the default metric instance is created once at function
        definition time and shared across calls -- presumably stateless; confirm.
        """
        for cluster_name in self._data.cluster_label.unique():
            mention_names = self._data[self._data.cluster_label == cluster_name].data_label
            points_generator = combinations(mention_names, r=2)
            try:
                mean_distance, pointwise_distances = distance_metric.multi_point_distance(points_generator)
                self.intra_cluster_distances[cluster_name] = (mean_distance, pointwise_distances)
                # The cluster diameter is the largest pairwise distance.
                self.cluster_diameters[cluster_name] = max(pointwise_distances.values())
            except ZeroDivisionError:
                # Singleton cluster: no pairs, so no defined intra-cluster distance.
                self.intra_cluster_distances[cluster_name] = None

    def collect_inter_cluster_distance(
        self, distance_metric: DistanceMetric = EuclideanDistance(), measure_using: str = "centroid"
    ):
        """
        Collects the between-cluster distances for all cluster pairs in the dataset.

        Args:
            distance_metric: An object representing the type of distance measure to use to calculate distance. (Euclidean or Correlation)
            measure_using: Cluster comparison mechanism. 'centroid' or 'members'.
                Only 'centroid' is implemented; the metric is applied to the
                cluster labels themselves -- TODO confirm intended semantics.

        Raises:
            NotImplementedError: If measure_using is not 'centroid'.
            Exception: If the dataset contains a single cluster only.
        """
        cluster_names = self._data.cluster_label.unique()
        if measure_using != "centroid":
            raise NotImplementedError
        points_generator = combinations(cluster_names, 2)
        try:
            self.dataset_inter_cluster_distance, self.inter_cluster_distances = distance_metric.multi_point_distance(
                points_generator
            )
        except ZeroDivisionError:
            raise Exception("There is only one cluster in the dataset")

    def calculate_silhouette_score(self):
        """
        Calculate silhouette score for all records in the dataset. https://en.wikipedia.org/wiki/Silhouette_(clustering)
        Calculate the silhouette score formula: s(i) = (b(i) - a(i)) / max{a(i), b(i)} where i = current record.

        Requires collect_intra_cluster_distance() to have been called first.
        Stores the result as a new "silhouette_score" column on the data.
        """
        silhouette_scores = {}
        for record_idx in self._data.index:
            record_cluster_label = self._data.loc[record_idx, "cluster_label"]
            record_label = self._data.loc[record_idx, "data_label"]
            current_cluster_distances = self.intra_cluster_distances[record_cluster_label]
            # s(i) = 0 if |C| = 1 (singleton cluster).
            if current_cluster_distances is None:
                silhouette_scores[record_idx] = 0
            else:
                a = calculate_silhouette_a_i(record_label, current_cluster_distances[1])
                # NOTE(review): b(i) is computed from cluster labels only, not
                # from inter-cluster distances -- verify helper semantics.
                b = calculate_silhouette_b_i(record_cluster_label, self._data.cluster_label.tolist())
                silhouette_scores[record_idx] = (b - a) / max(a, b)

        # Update _data with the calculated scores.
        # BUG FIX: the column name was misspelled "sihouette_score".
        silhouette_series = pd.Series(silhouette_scores, name="silhouette_score")
        self._data = self._data.join(silhouette_series)

    @classmethod
    def get_example_summary(cls):
        """Build a summary from the bundled rawassignee_baseline.csv example file."""
        try:
            import importlib.resources as pkg_resources
        except ImportError:
            # Try backported to PY<37 `importlib_resources`.
            import importlib_resources as pkg_resources
        from pv_evaluation.data import assignee
        # BUG FIX: construct the instance while the context manager is still
        # open (the resource path may be a temporary extraction deleted on
        # exit), and pass the path as the positional `data` argument --
        # __init__ has no `datapath` parameter.
        with pkg_resources.path(package=assignee, resource='rawassignee_baseline.csv') as p:
            return cls(str(p), name='test')


def _demo():
    # Smoke-test run on the bundled example data: collect distances, score
    # silhouettes, and print the dataset-level inter-cluster distance.
    summary = AssigneeDisambiguationSummary.get_example_summary()
    summary.collect_inter_cluster_distance()
    summary.collect_intra_cluster_distance()
    summary.calculate_silhouette_score()
    print(summary.get_dataset_inter_cluster_distance())


if __name__ == '__main__':
    _demo()
210 changes: 210 additions & 0 deletions pv_evaluation/summary/disambiguation_summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
import numpy as np
import pandas as pd
import plotly.express as px
from .utils import read_auto


class DisambiguationSummary:
    """Report summaries of a disambiguation (clustering) result."""

    def __init__(self, data, name=None, id_field="inventor_id"):
        """Report disambiguation summaries.

        Args:
            data (str or DataFrame): Path to the disambiguation data (csv, tsv
                or parquet format), or an already-loaded DataFrame.
                The data should have the column specified in the id_field parameter as well
                as three columns "patent_id", "name_first", and "name_last".
            name (str): Name of the disambiguation algorithm to show in plots.
                Defaults to the data path when a path is given.
            id_field (str): Column name of unique entity identifiers, such as "inventor_id" or "assignee_id". Defaults to "inventor_id".
        """
        self.id_field = id_field

        self.name = name
        if isinstance(data, str):
            self._data = read_auto(data)
            if self.name is None:
                self.name = data
        else:
            self._data = data

        self._validate_data()

        # Lazy initialization: each summary is computed on first access.
        # (The original duplicated this block and the validation call.)
        self._cluster_size_distribution = None
        self._cluster_unique_name_distribution = None
        self._homonymy_rate_distribution = None

    def _validate_data(self):
        """Assert that all required columns (or the index) are present."""
        for col in ["patent_id", self.id_field, "name_first", "name_last"]:
            assert (col in self._data.columns) or (col in [self._data.index.name]), f"{col} is not in the data columns."

    def get_cluster_size_distribution(self):
        """Get the cluster size distribution summary.

        Returns:
            DataFrame with columns "Number of patents" and "Number of inventors"
            (value counts of cluster sizes).
        """
        if self._cluster_size_distribution is None:
            counts = self.get_cluster_sizes_dd()["Number of patents"].value_counts()
            # BUG FIX: name the index and values explicitly instead of relying
            # on the column names produced by reset_index(), which changed in
            # pandas 2.0 (the old code renamed a no-longer-existing "index"
            # column and mislabeled the result).
            self._cluster_size_distribution = (
                counts.rename_axis("Number of patents").rename("Number of inventors").reset_index()
            )

        return self._cluster_size_distribution.copy()

    def get_cluster_sizes_dd(self):
        """Return the number of patents per disambiguated inventor.

        The first occurrence of the inventor's first and last name is kept in
        the resulting dataframe. NOTE(review): the original docstring claimed
        "mode" and a Dask return type; the code uses "first" aggregation and
        returns whatever frame type self._data is -- confirm Dask support.
        """
        return (
            self._data.groupby(self.id_field)
            .agg({"patent_id": "count", "name_first": "first", "name_last": "first"})
            .rename(columns={"patent_id": "Number of patents"})
        )

    def plot_cluster_size_distribution(self, range=(0, 20)):
        """Plot the distribution of the number of patents per inventor.

        Args:
            range (tuple, optional): x-axis limits (inclusive range for the number of patents by inventor). Defaults to (0, 20).

        Returns:
            Plotly graph object.
        """
        data = self.get_cluster_size_distribution().reset_index()
        fig = px.bar(
            data,
            x="Number of patents",
            y="Number of inventors",
            title="Distribution of the number of patents per inventor",
        )
        fig.update_xaxes(range=range)

        # Rescale the y axis to the tallest bar inside the requested x range.
        ylim = max(data["Number of inventors"][data["Number of patents"].between(range[0], range[1])])
        fig.update_yaxes(range=(0, ylim), autorange=False)

        fig.update_traces(name=self.name)
        return fig

    def get_top_inventors(self, n=10):
        """Get DataFrame of n most prolific inventors.

        Args:
            n (int, optional): Number of inventors to return. Defaults to 10.

        Returns:
            DataFrame containing the sorted top n most prolific inventors.
        """
        return self.get_cluster_sizes_dd().sort_values(by="Number of patents", ascending=False).head(n)

    def entropy_curve(self, q_range=np.linspace(0, 2)):
        """Compute Hill-number entropies of the cluster size distribution.

        Args:
            q_range: Iterable of Hill-number orders q.

        Returns:
            tuple: (list of Hill numbers, the q_range used).
        """
        data = self.get_cluster_size_distribution()
        # Normalize counts to proportions.
        data["Number of inventors"] = data["Number of inventors"] / sum(data["Number of inventors"])

        def hill_number(arr, q):
            # q = 1 is the exponential of Shannon entropy (limit case);
            # q = 0 is species richness; otherwise the general Hill formula.
            if q == 1:
                I = arr > 0
                return np.exp(-np.sum(arr[I] * np.log(arr[I])))
            elif q == 0:
                return np.sum(arr > 0)
            else:
                return np.sum(arr ** (q)) ** (1 / (1 - q))

        return [hill_number(data["Number of inventors"], q) for q in q_range], q_range

    def plot_entropy_curve(self, q_range=np.linspace(0, 2)):
        """Plot the Hill-numbers entropy curve over q_range."""
        ent, q = self.entropy_curve(q_range)
        fig = px.line(x=q, y=ent, title="Hill Numbers entropy curve", labels={"x": "q", "y": "Entropy"})
        fig.update_traces(name=self.name)
        return fig

    def get_cluster_unique_name_distribution(self):
        """Get the proportion of homogeneous clusters (no name variation) by cluster size."""
        if self._cluster_unique_name_distribution is None:
            self._cluster_unique_name_distribution = (
                self.get_cluster_sizes_dd()
                .join(
                    # True when every mention in the cluster has the same first name.
                    self._data.groupby(self.id_field)["name_first"]
                    .apply(lambda x: len(set(x)) == 1)
                    .rename("Proportion of unique name")
                )
                .groupby("Number of patents")
                .agg({"Proportion of unique name": "mean"})
                .reset_index()
            )

        return self._cluster_unique_name_distribution.copy()

    def plot_cluster_unique_name_distribution(self, range=(0, 100)):
        """Plot the proportion of homogeneous clusters (no name variation) by cluster size."""
        data = self.get_cluster_unique_name_distribution()
        fig = px.bar(
            data,
            x="Number of patents",
            y="Proportion of unique name",
            title="Proportion of homogeneous clusters (no name variation) by cluster size",
        )
        fig.update_xaxes(range=range)

        ylim = max(data["Proportion of unique name"][data["Number of patents"].between(range[0], range[1])])
        fig.update_yaxes(range=(0, ylim), autorange=False)

        fig.update_traces(name=self.name)
        return fig

    def get_homonymy_rate_distribution(self):
        """Get homonymy rates by cluster size.

        The homonymy rate is the proportion of clusters which share at least one name mention with another cluster.
        """
        if self._homonymy_rate_distribution is None:
            # A "homophone" key is the full name mention "first:last".
            data = self._data.assign(
                inventor_id2=self._data[self.id_field], homophones=self._data.name_first + ":" + self._data.name_last
            )

            # Per cluster: number of patents, and how many of its mentions
            # have a name shared with at least one other cluster.
            dat = (
                data.join(
                    data.groupby("homophones")["inventor_id2"].apply(lambda x: len(set(x)) > 1).rename("Shared name"),
                    on="homophones",
                )[["Shared name", "inventor_id2", "patent_id"]]
                .groupby("inventor_id2")
                .agg({"patent_id": "count", "Shared name": "sum"})
            )

            # BUG FIX: a cluster counts as homonymous as soon as it has at
            # least ONE shared mention (docstring above); the original used
            # "> 1", which missed clusters with exactly one shared mention.
            dat = dat.assign(shared_name_prop=np.where(dat["Shared name"].values >= 1, 1, 0))
            # "mean" (string) instead of np.mean: the latter is deprecated in
            # pandas >= 2.0 groupby aggregation.
            result = dat.groupby("patent_id")["shared_name_prop"].agg("mean")

            self._homonymy_rate_distribution = result.reset_index().rename(
                columns={"shared_name_prop": "Homonymy rate", "patent_id": "Number of patents"}
            )

        return self._homonymy_rate_distribution.copy()

    def plot_homonymy_rate_distribution(self, range=(0, 100)):
        """Plot homonymy rate by cluster size.

        The homonymy rate is the proportion of clusters which share at least one name mention with another cluster.
        """
        data = self.get_homonymy_rate_distribution().reset_index()
        fig = px.bar(
            data,
            x="Number of patents",
            y="Homonymy rate",
            title="Homonymy rate by cluster size",
        )
        fig.update_xaxes(range=range)

        ylim = max(data["Homonymy rate"][data["Number of patents"].between(range[0], range[1])])
        fig.update_yaxes(range=(0, ylim), autorange=False)

        fig.update_traces(name=self.name)
        return fig
Loading