diff --git a/site/images/mercury_graph.png b/site/images/mercury_graph.png new file mode 100644 index 0000000..a3dc2d9 Binary files /dev/null and b/site/images/mercury_graph.png differ diff --git a/site/reference/embeddings/index.html b/site/reference/embeddings/index.html index 2bfc30c..3a9a088 100644 --- a/site/reference/embeddings/index.html +++ b/site/reference/embeddings/index.html @@ -1451,6 +1451,7 @@

Create an embedding mapping the nodes of a graph.

+

Includes contributions by David Muelas Recuenco.

Parameters:

@@ -1576,9 +1577,7 @@

Source code in mercury/graph/embeddings/graphembeddings.py -
 88
- 89
- 90
+                    
 90
  91
  92
  93
@@ -1600,7 +1599,9 @@ 

109 110 111 -112

def __init__(
+112
+113
+114
def __init__(
     self,
     dimension=None,
     n_jumps=None,
@@ -1709,9 +1710,7 @@ 

Source code in mercury/graph/embeddings/graphembeddings.py -
114
-115
-116
+              
116
 117
 118
 119
@@ -1720,7 +1719,9 @@ 

122 123 124 -125

def __getitem__(self, arg):
+125
+126
+127
def __getitem__(self, arg):
     """
     Method to access rows in the embedding by ID.
 
@@ -1776,9 +1777,7 @@ 

Source code in mercury/graph/embeddings/graphembeddings.py -
203
-204
-205
+              
205
 206
 207
 208
@@ -1786,7 +1785,9 @@ 

210 211 212 -213

def embedding(self):
+213
+214
+215
def embedding(self):
     """
     Return the internal Embeddings object.
 
@@ -1879,9 +1880,7 @@ 

Source code in mercury/graph/embeddings/graphembeddings.py -
127
-128
-129
+              
129
 130
 131
 132
@@ -1953,7 +1952,9 @@ 

198 199 200 -201

def fit(self, g: Graph):
+201
+202
+203
def fit(self, g: Graph):
     """
     Train the embedding by doing random walks.
 
@@ -2161,9 +2162,7 @@ 

Source code in mercury/graph/embeddings/graphembeddings.py -
215
-216
-217
+              
217
 218
 219
 220
@@ -2188,7 +2187,9 @@ 

239 240 241 -242

def get_most_similar_nodes(
+242
+243
+244
def get_most_similar_nodes(
     self, node_id, k=5, metric="cosine", return_as_indices=False
 ):
     """
@@ -2287,9 +2288,7 @@ 

Source code in mercury/graph/embeddings/graphembeddings.py -
244
-245
-246
+              
246
 247
 248
 249
@@ -2316,7 +2315,9 @@ 

270 271 272 -273

def save(self, file_name, save_embedding=False):
+273
+274
+275
def save(self, file_name, save_embedding=False):
     """
     Saves a GraphEmbedding to a compressed binary file with or without the embedding itself. It saves the graph's node names
     and the adjacency matrix as a sparse matrix.
diff --git a/site/reference/ml/index.html b/site/reference/ml/index.html
index c1e9932..f6ba05b 100644
--- a/site/reference/ml/index.html
+++ b/site/reference/ml/index.html
@@ -774,6 +774,8 @@ 

all nodes are reassigned at the same time and conflicts (i.e., 1 -> C2 and 2 -> C1) are resolved with a simple tie-breaking rule. This version also introduces the resolution parameter gamma, as in 2.

+

Contributed by Arturo Soberon Cedillo, Jose Antonio Guzman Vazquez and +Isaac Dodanim Hernandez Garcia.


    @@ -916,10 +918,7 @@

    Source code in mercury/graph/ml/louvain.py -
    67
    -68
    -69
    -70
    +                    
    70
     71
     72
     73
    @@ -935,7 +934,10 @@ 

    83 84 85 -86

    def __init__(
    +86
    +87
    +88
    +89
    def __init__(
         self,
         min_modularity_gain=1e-03,
         max_pass=2,
    @@ -1038,10 +1040,7 @@ 

    Source code in mercury/graph/ml/louvain.py -
    102
    -103
    -104
    -105
    +              
    105
     106
     107
     108
    @@ -1157,7 +1156,10 @@ 

    218 219 220 -221

    def fit(self, g: Graph):
    +221
    +222
    +223
    +224
    def fit(self, g: Graph):
         """
         Args:
             g (Graph): A mercury graph structure.
    @@ -1940,6 +1942,7 @@ 

    Implementation of the spectral clustering algorithm which detect communities inside a graph.

    +

    Contributed by Gibran Gabriel Otazo Sanchez.

    Parameters:

    @@ -2028,16 +2031,16 @@

    Source code in mercury/graph/ml/spectral.py -
    24
    -25
    -26
    +                    
    26
     27
     28
     29
     30
     31
     32
    -33
    def __init__(
    +33
    +34
    +35
    def __init__(
         self, n_clusters=2, mode="networkx", max_iterations=10, random_state=0
     ):
         self.n_clusters = n_clusters
    @@ -2132,9 +2135,7 @@ 

    Source code in mercury/graph/ml/spectral.py -
    49
    -50
    -51
    +              
    51
     52
     53
     54
    @@ -2149,7 +2150,9 @@ 

    63 64 65 -66

    def fit(self, graph: Graph):
    +66
    +67
    +68
    def fit(self, graph: Graph):
         """
         Find the optimal clusters of a given graph. The function returns nothing, but saves the clusters and
         the modularity in the object self.
    diff --git a/site/reference/viz/index.html b/site/reference/viz/index.html
    index 8fbf8fa..58b271d 100644
    --- a/site/reference/viz/index.html
    +++ b/site/reference/viz/index.html
    @@ -681,6 +681,11 @@ 

    Moebius class for visualizing graphs using JavaScript and HTML.

    +
    + Note +

    Moebius is currently only compatible with Google Colab and Jupyter Notebooks Classic (prior to v7).

    +
    +
    Usage
    from mercury.graph.viz import Moebius
    @@ -766,15 +771,27 @@ 

    Source code in mercury/graph/viz/moebius.py -
    34
    -35
    -36
    -37
    +                    
    37
     38
     39
     40
     41
    -42
    def __init__(self, G):
    +42
    +43
    +44
    +45
    +46
    +47
    +48
    +49
    +50
    +51
    +52
    +53
    +54
    +55
    +56
    +57
    def __init__(self, G):
     
         if HTML is None:
             raise ImportError('IPython is not installed')
    @@ -783,6 +800,18 @@ 

    self.use_spark = self.G._as_networkx is None self.front_pat = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + '/frontend' self._int_id_map = {node['id'] : i for i, node in enumerate(self.G.nodes)} + + # Define callback for JS interactions within Google Colab + if importlib.util.find_spec('google') is not None and importlib.util.find_spec('google.colab') is not None: + from google.colab import output + from IPython import get_ipython + + def colab_execute_python(code): + # Use get_ipython() to access the Moebius object defined by the user in a Colab cell + get_ipython().run_cell(f"_temp_colab_execute_python_result = {code}") + return get_ipython().user_ns["_temp_colab_execute_python_result"] + + output.register_callback("notebook.colab_execute_python", colab_execute_python)

    @@ -834,12 +863,12 @@

    Source code in mercury/graph/viz/moebius.py -
    86
    -87
    -88
    -89
    -90
    -91
    def FHT(self, fn):
    +              
    100
    +101
    +102
    +103
    +104
    +105
    def FHT(self, fn):
         """
         Syntactic sugar for display(HTML(filename = fn))
         """
    @@ -866,12 +895,12 @@ 

    Source code in mercury/graph/viz/moebius.py -
    78
    -79
    -80
    -81
    -82
    -83
    def FJS(self, fn):
    +              
    92
    +93
    +94
    +95
    +96
    +97
    def FJS(self, fn):
         """
         Syntactic sugar for display(Javascript(filename = fn))
         """
    @@ -898,12 +927,12 @@ 

    Source code in mercury/graph/viz/moebius.py -
    70
    -71
    -72
    -73
    -74
    -75
    def JS(self, s):
    +              
    84
    +85
    +86
    +87
    +88
    +89
    def JS(self, s):
         """
         Syntactic sugar for display(Javascript())
         """
    @@ -930,12 +959,12 @@ 

    Source code in mercury/graph/viz/moebius.py -
    53
    -54
    -55
    -56
    -57
    -58
    def __getitem__(self, item):
    +              
    67
    +68
    +69
    +70
    +71
    +72
    def __getitem__(self, item):
         """
         Add support for the [] operator.
         """
    @@ -962,12 +991,12 @@ 

    Source code in mercury/graph/viz/moebius.py -
    45
    -46
    -47
    -48
    -49
    -50
    def __str__(self):
    +              
    59
    +60
    +61
    +62
    +63
    +64
    def __str__(self):
         """
         Convert the object via str()
         """
    @@ -1138,21 +1167,7 @@ 

    Source code in mercury/graph/viz/moebius.py -
     94
    - 95
    - 96
    - 97
    - 98
    - 99
    -100
    -101
    -102
    -103
    -104
    -105
    -106
    -107
    -108
    +              
    108
     109
     110
     111
    @@ -1193,7 +1208,21 @@ 

    146 147 148 -149

    def node_or_edge_config(self, text_is = None, color_is = None, colors = None, size_is = None, size_range = None, size_scale = 'linear'):
    +149
    +150
    +151
    +152
    +153
    +154
    +155
    +156
    +157
    +158
    +159
    +160
    +161
    +162
    +163
    def node_or_edge_config(self, text_is = None, color_is = None, colors = None, size_is = None, size_range = None, size_scale = 'linear'):
         """
         Create a `node_config` or `edge_config` configuration dictionary for `show()` in an understandable way.
     
    @@ -1349,21 +1378,7 @@ 

    Source code in mercury/graph/viz/moebius.py -
    152
    -153
    -154
    -155
    -156
    -157
    -158
    -159
    -160
    -161
    -162
    -163
    -164
    -165
    -166
    +              
    166
     167
     168
     169
    @@ -1371,7 +1386,21 @@ 

    171 172 173 -174

    def show(self, initial_id = None, initial_depth = 1, node_config = None, edge_config = None):
    +174
    +175
    +176
    +177
    +178
    +179
    +180
    +181
    +182
    +183
    +184
    +185
    +186
    +187
    +188
    def show(self, initial_id = None, initial_depth = 1, node_config = None, edge_config = None):
         """
         Start the interactive graph visualization in a Jupyter notebook.
     
    diff --git a/site/search/search_index.json b/site/search/search_index.json
    index 4959885..e5619b2 100644
    --- a/site/search/search_index.json
    +++ b/site/search/search_index.json
    @@ -1 +1 @@
    -{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"mercury-graph","text":"

    mercury-graph is a Python library that offers graph analytics capabilities with a technology-agnostic API, enabling users to apply a curated range of performant and scalable algorithms and utilities regardless of the underlying data framework. The consistent, scikit-like interface abstracts away the complexities of internal transformations, allowing users to effortlessly switch between different graph representations to leverage optimized algorithms implemented using pure Python, numba, networkx and PySpark GraphFrames.

    Currently implemented submodules in mercury.graph include:

    • mercury.graph.core, with the main classes of the library that create and store the graphs' data and properties.

    • mercury.graph.ml, with graph theory and machine learning algorithms such as Louvain community detection, spectral clustering, Markov chains, spreading activation-based diffusion models and graph random walkers.

    • mercury.graph.embeddings, with classes that calculate graph embeddings in different ways, such as following the Node2Vec algorithm.

    • mercury.graph.viz, with capabilities for graph visualization.

    "},{"location":"#repository","title":"Repository","text":"

    The website for the GitHub repository can be found here.

    "},{"location":"reference/core/","title":"mercury.graph.core","text":""},{"location":"reference/core/#mercury.graph.core.Graph","title":"mercury.graph.core.Graph(data=None, keys=None, nodes=None)","text":"

    This is the main class in mercury.graph.

    This class seamlessly abstracts the underlying technology used to represent the graph. You can create a graph passing the following objects to the constructor:

    • A pandas DataFrame containing edges (with a keys dictionary to specify the columns and possibly a nodes DataFrame)
    • A pyspark DataFrame containing edges (with a keys dictionary to specify the columns and possibly a nodes DataFrame)
    • A networkx graph
    • A graphframes graph

    Bear in mind that the graph object is immutable. This means that you can't modify the graph object once it has been created. If you want to modify it, you have to create a new graph object.

    The graph object provides:

    • Properties to access the graph in different formats (networkx, graphframes, dgl)
    • Properties with metrics and summary information that are calculated on demand and technology independent.
    • It is inherited by other graph classes in mercury-graph providing ML algorithms such as graph embedding, visualization, etc.

    Using this class from the other classes in mercury-graph:

    The other classes in mercury-graph define models or functionalities that are based on graphs. They use a Scikit-learn-like API to interact with the graph object. This means that the graph object is passed to the class constructor and the class follow the Scikit-learn conventions. It is recommended to follow the same conventions when creating your own classes to work with mercury-graph.

    The conventions can be found here:

    • Scikit API
    • On scikit conventions

    Parameters:

    Name Type Description Default data (DataFrame, Graph or DataFrame)

    The data to create the graph from. It can be a pandas DataFrame, a networkx Graph, a pyspark DataFrame, or a Graphframe. In case it already contains a graph (networkx or graphframes), the keys and nodes arguments are ignored.

    None keys dict

    A dictionary with keys to specify the columns in the data DataFrame. The keys are:

    • 'src': The name of the column with the source node.
    • 'dst': The name of the column with the destination node.
    • 'id': The name of the column with the node id.
    • 'weight': The name of the column with the edge weight.
    • 'directed': A boolean to specify if the graph is directed. (Only for pyspark DataFrames)

    When the keys argument is not provided or the key is missing, the default values are:

    • 'src': 'src'
    • 'dst': 'dst'
    • 'id': 'id'
    • 'weight': 'weight'
    • 'directed': True
    None nodes DataFrame

    A pandas DataFrame or a pyspark DataFrame with the nodes data. (Only when data is pandas or pyspark DataFrame and with the same type as data) If not given, the nodes are inferred from the edges DataFrame.

    None Source code in mercury/graph/core/graph.py
    def __init__(self, data = None, keys = None, nodes = None):\n    self._as_networkx = None\n    self._as_graphframe = None\n    self._as_dgl = None\n    self._degree = None\n    self._in_degree = None\n    self._out_degree = None\n    self._closeness_centrality = None\n    self._betweenness_centrality = None\n    self._pagerank = None\n    self._connected_components = None\n    self._nodes_colnames = None\n    self._edges_colnames = None\n\n    self._number_of_nodes = 0\n    self._number_of_edges = 0\n    self._node_ix = 0\n    self._is_directed = False\n    self._is_weighted = False\n\n    self._init_values = {k: v for k, v in locals().items() if k in inspect.signature(self.__init__).parameters}\n\n    if type(data) == pd.core.frame.DataFrame:\n        self._from_pandas(data, nodes, keys)\n        return\n\n    if isinstance(data, nx.Graph):      # This is the most general case, including: ...Graph, ...DiGraph and ...MultiGraph\n        self._from_networkx(data)\n        return\n\n    spark_int = SparkInterface()\n\n    if pyspark_installed and graphframes_installed:\n        if type(data) == spark_int.type_spark_dataframe:\n            self._from_dataframe(data, nodes, keys)\n            return\n\n        if type(data) == spark_int.type_graphframe:\n            self._from_graphframes(data)\n            return\n\n    raise ValueError('Invalid input data. (Expected: pandas DataFrame, a networkx Graph, a pyspark DataFrame, a graphframes Graph.)')\n
    "},{"location":"reference/core/#mercury.graph.core.Graph.betweenness_centrality","title":"betweenness_centrality property","text":"

    Returns the betweenness centrality of each node in the graph as a Python dictionary.

    "},{"location":"reference/core/#mercury.graph.core.Graph.closeness_centrality","title":"closeness_centrality property","text":"

    Returns the closeness centrality of each node in the graph as a Python dictionary.

    "},{"location":"reference/core/#mercury.graph.core.Graph.connected_components","title":"connected_components property","text":"

    Returns the connected components of each node in the graph as a Python dictionary.

    "},{"location":"reference/core/#mercury.graph.core.Graph.degree","title":"degree property","text":"

    Returns the degree of each node in the graph as a Python dictionary.

    "},{"location":"reference/core/#mercury.graph.core.Graph.dgl","title":"dgl property","text":"

    Returns the graph as a DGL graph.

    If the graph has not been converted to a DGL graph yet, it will be converted and cached for future use.

    Returns:

    Type Description DGLGraph

    The graph represented as a DGL graph.

    "},{"location":"reference/core/#mercury.graph.core.Graph.edges","title":"edges property","text":"

    Returns an iterator over the edges in the graph.

    Returns:

    Type Description EdgeIterator

    An iterator object that allows iterating over the edges in the graph.

    "},{"location":"reference/core/#mercury.graph.core.Graph.edges_colnames","title":"edges_colnames property","text":"

    Returns the column names of the edges DataFrame.

    "},{"location":"reference/core/#mercury.graph.core.Graph.graphframe","title":"graphframe property","text":"

    Returns the graph as a GraphFrame.

    If the graph has not been converted to a GraphFrame yet, it will be converted and cached for future use.

    Returns:

    Type Description GraphFrame

    The graph represented as a GraphFrame.

    "},{"location":"reference/core/#mercury.graph.core.Graph.in_degree","title":"in_degree property","text":"

    Returns the in-degree of each node in the graph as a Python dictionary.

    "},{"location":"reference/core/#mercury.graph.core.Graph.is_directed","title":"is_directed property","text":"

    Returns True if the graph is directed, False otherwise.

    Note

    Graphs created using graphframes are always directed. The way around it is to add the reverse edges to the graph. This can be done by creating the Graph with pyspark DataFrame() and defining a key 'directed' set as False in the dict argument. Otherwise, the graph will be considered directed even if these reversed edges have been created by other means this class cannot be aware of.

    "},{"location":"reference/core/#mercury.graph.core.Graph.is_weighted","title":"is_weighted property","text":"

    Returns True if the graph is weighted, False otherwise.

    A graph is considered weight if it has a column named 'weight' in the edges DataFrame or the column has a different name and that name is passed in the dict argument as the 'weight' key.

    "},{"location":"reference/core/#mercury.graph.core.Graph.networkx","title":"networkx property","text":"

    Returns the graph representation as a NetworkX graph.

    If the graph has not been converted to NetworkX format yet, it will be converted and cached for future use.

    Returns:

    Type Description Graph

    The graph representation as a NetworkX graph.

    "},{"location":"reference/core/#mercury.graph.core.Graph.nodes","title":"nodes property","text":"

    Returns an iterator over all the nodes in the graph.

    Returns:

    Type Description NodeIterator

    An iterator that yields each node in the graph.

    "},{"location":"reference/core/#mercury.graph.core.Graph.nodes_colnames","title":"nodes_colnames property","text":"

    Returns the column names of the nodes DataFrame.

    "},{"location":"reference/core/#mercury.graph.core.Graph.number_of_edges","title":"number_of_edges property","text":"

    Returns the number of edges in the graph.

    Returns:

    Type Description int

    The number of edges in the graph.

    "},{"location":"reference/core/#mercury.graph.core.Graph.number_of_nodes","title":"number_of_nodes property","text":"

    Returns the number of nodes in the graph.

    Returns:

    Type Description int

    The number of nodes in the graph.

    "},{"location":"reference/core/#mercury.graph.core.Graph.out_degree","title":"out_degree property","text":"

    Returns the out-degree of each node in the graph as a Python dictionary.

    "},{"location":"reference/core/#mercury.graph.core.Graph.pagerank","title":"pagerank property","text":"

    Returns the PageRank of each node in the graph as a Python dictionary.

    "},{"location":"reference/core/#mercury.graph.core.Graph.edges_as_dataframe","title":"edges_as_dataframe()","text":"

    Returns the edges as a pyspark DataFrame.

    If the graph is represented as a graphframes graph, the edges are extracted from it. Otherwise, the edges are converted from the pandas DataFrame representation. The columns used as the source and destination nodes are always named 'src' and 'dst', respectively, regardless of the original column names passed to the constructor.

    Source code in mercury/graph/core/graph.py
    def edges_as_dataframe(self):\n    \"\"\"\n    Returns the edges as a pyspark DataFrame.\n\n    If the graph is represented as a graphframes graph, the edges are extracted from it. Otherwise, the edges are converted from the\n    pandas DataFrame representation. The columns used as the source and destination nodes are always named 'src' and 'dst',\n    respectively, regardless of the original column names passed to the constructor.\n    \"\"\"\n    if self._as_graphframe is not None:\n        return self._as_graphframe.edges\n\n    return SparkInterface().spark.createDataFrame(self.edges_as_pandas())\n
    "},{"location":"reference/core/#mercury.graph.core.Graph.edges_as_pandas","title":"edges_as_pandas()","text":"

    Returns the edges as a pandas DataFrame.

    If the graph is represented as a networkx graph, the edges are extracted from it. Otherwise, the graphframes graph will be used. This dataset may differ from possible pandas DataFrame passed to the constructor in the column names and order. The columns used as the source and destination nodes are always named 'src' and 'dst', respectively.

    Source code in mercury/graph/core/graph.py
    def edges_as_pandas(self):\n    \"\"\"\n    Returns the edges as a pandas DataFrame.\n\n    If the graph is represented as a networkx graph, the edges are extracted from it. Otherwise, the graphframes graph will be used.\n    This dataset may differ from possible pandas DataFrame passed to the constructor in the column names and order. The columns used\n    as the source and destination nodes are always named 'src' and 'dst', respectively.\n    \"\"\"\n    if self._as_networkx is not None:\n        edges_data = self._as_networkx.edges(data = True)\n        edges_df   = pd.DataFrame([(src, dst, attr) for src, dst, attr in edges_data], columns = ['src', 'dst', 'attributes'])\n\n        attrs_df   = pd.json_normalize(edges_df['attributes'])\n\n        return pd.concat([edges_df.drop('attributes', axis = 1), attrs_df], axis = 1)\n\n    return self.graphframe.edges.toPandas()\n
    "},{"location":"reference/core/#mercury.graph.core.Graph.nodes_as_dataframe","title":"nodes_as_dataframe()","text":"

    Returns the nodes as a pyspark DataFrame.

    If the graph is represented as a graphframes graph, the nodes are extracted from it. Otherwise, the nodes are converted from the pandas DataFrame representation. The column used as the node id is always named 'id', regardless of the original column name passed to the constructor.

    Source code in mercury/graph/core/graph.py
    def nodes_as_dataframe(self):\n    \"\"\"\n    Returns the nodes as a pyspark DataFrame.\n\n    If the graph is represented as a graphframes graph, the nodes are extracted from it. Otherwise, the nodes are converted from the\n    pandas DataFrame representation. The column used as the node id is always named 'id', regardless of the original column name passed\n    to the constructor.\n    \"\"\"\n    if self._as_graphframe is not None:\n        return self._as_graphframe.vertices\n\n    return SparkInterface().spark.createDataFrame(self.nodes_as_pandas())\n
    "},{"location":"reference/core/#mercury.graph.core.Graph.nodes_as_pandas","title":"nodes_as_pandas()","text":"

    Returns the nodes as a pandas DataFrame.

    If the graph is represented as a networkx graph, the nodes are extracted from it. Otherwise, the graphframes graph will be used. This dataset may differ from possible pandas DataFrame passed to the constructor in the column names and order. The column used as the node id is always named 'id'.

    Source code in mercury/graph/core/graph.py
    def nodes_as_pandas(self):\n    \"\"\"\n    Returns the nodes as a pandas DataFrame.\n\n    If the graph is represented as a networkx graph, the nodes are extracted from it. Otherwise, the graphframes graph will be used.\n    This dataset may differ from possible pandas DataFrame passed to the constructor in the column names and order. The column used\n    as the node id is always named 'id'.\n    \"\"\"\n    if self._as_networkx is not None:\n        nodes_data = self._as_networkx.nodes(data = True)\n        nodes_df   = pd.DataFrame([(node, attr) for node, attr in nodes_data], columns = ['id', 'attributes'])\n\n        attrs_df = pd.json_normalize(nodes_df['attributes'])\n\n        return pd.concat([nodes_df.drop('attributes', axis = 1), attrs_df], axis = 1)\n\n    return self.graphframe.vertices.toPandas()\n
    "},{"location":"reference/core/#mercury.graph.core.SparkInterface","title":"mercury.graph.core.SparkInterface(config=None, session=None)","text":"

    A class that provides an interface for interacting with Apache Spark, graphframes and dgl.

    Attributes:

    Name Type Description _spark_session SparkSession

    The shared Spark session.

    _graphframes module

    The shared graphframes namespace.

    Methods:

    Name Description _create_spark_session

    Creates a Spark session.

    spark

    Property that returns the shared Spark session.

    pyspark

    Property that returns the pyspark namespace.

    graphframes

    Property that returns the shared graphframes namespace.

    dgl

    Property that returns the shared dgl namespace.

    read_csv

    Reads a CSV file into a DataFrame.

    read_parquet

    Reads a Parquet file into a DataFrame.

    read_json

    Reads a JSON file into a DataFrame.

    read_text

    Reads a text file into a DataFrame.

    read

    Reads a file into a DataFrame.

    sql

    Executes a SQL query.

    udf

    Registers a user-defined function (UDF).

    stop

    Stops the Spark session.

    Parameters:

    Name Type Description Default config dict

    A dictionary of Spark configuration options. If not provided, the configuration in the global variable default_spark_config will be used.

    None Source code in mercury/graph/core/spark_interface.py
    def __init__(self, config=None, session=None):\n    if SparkInterface._spark_session is None:\n        if session is not None:\n            SparkInterface._spark_session = session\n        else:\n            SparkInterface._spark_session = self._create_spark_session(config)\n            # Set checkpoint directory\n            SparkInterface._spark_session.sparkContext.setCheckpointDir(\".checkpoint\")\n\n    if SparkInterface._graphframes is None and graphframes_installed:\n        SparkInterface._graphframes = gf\n\n    if SparkInterface._dgl is None and dgl_installed:\n        SparkInterface._dgl = dgl\n
    "},{"location":"reference/embeddings/","title":"mercury.graph.embeddings","text":""},{"location":"reference/embeddings/#mercury.graph.embeddings.Embeddings","title":"mercury.graph.embeddings.Embeddings(dimension, num_elements=0, mean=0, sd=1, learn_step=3, bidirectional=False)","text":"

    Bases: BaseClass

    This class holds a matrix object that is interpreted as the embeddings for any list of objects, not only the nodes of a graph. You can see this class as the internal object holding the embedding for other classes such as class GraphEmbedding.

    Parameters:

    Name Type Description Default dimension int

    The number of columns in the embedding. See note below.

    required num_elements int

    The number of rows in the embedding. You can leave this empty on creation and then use initialize_as() to automatically match the nodes in a graph.

    0 mean float

    The (expected) mean of the initial values.

    0 sd float

    The (expected) standard deviation of the initial values.

    1 learn_step float

    The size of the learning step elements get approached or moved away. Units are hexadecimal degrees in along an ellipse.

    3 bidirectional bool

    Should the changes apply only to the elements of first column (False) or to both.

    False Note

    On dimension: Embeddings cannot be zero (that is against the whole concept). Smaller dimension embeddings can only hold few elements without introducing spurious correlations by some form of 'birthday attack' phenomenon as elements increase. Later it is very hard to get rid of that spurious 'knowledge'.

    Solution: With may elements, you have to go to high enough dimension even if the structure is simple. Pretending to fit many embeddings in low dimension without them being correlated is like pretending to plot a trillion random points in a square centimeter while keeping them 1 mm apart from each other: It's simply impossible!

    Source code in mercury/graph/embeddings/embeddings.py
    def __init__(\n    self, dimension, num_elements=0, mean=0, sd=1, learn_step=3, bidirectional=False\n):\n    self.dimension = dimension\n    self.num_elements = num_elements\n    self.mean = mean\n    self.sd = sd\n    self.learn_step = learn_step\n    self.bidirectional = bidirectional\n\n    if self.num_elements > 0:\n        self.embeddings_matrix_ = np.random.normal(\n            self.mean, self.sd, (self.num_elements, self.dimension)\n        )\n
    "},{"location":"reference/embeddings/#mercury.graph.embeddings.Embeddings.as_numpy","title":"as_numpy()","text":"

    Return the embedding as a numpy matrix where each row is an embedding.

    Source code in mercury/graph/embeddings/embeddings.py
    def as_numpy(self):\n    \"\"\"\n    Return the embedding as a numpy matrix where each row is an embedding.\n    \"\"\"\n    if not hasattr(self, \"embeddings_matrix_\"):\n        return\n\n    return self.embeddings_matrix_\n
    "},{"location":"reference/embeddings/#mercury.graph.embeddings.Embeddings.fit","title":"fit(converge=None, diverge=None)","text":"

    Apply a learning step to the embedding.

    Parameters:

    Name Type Description Default converge numpy matrix of two columns

    A matrix of indices to elements meaning (first column) should be approached to (second column).

    None diverge numpy matrix of two columns

    A matrix of indices to elements meaning (first column) should be moved away from (second column).

    None

    Returns:

    Type Description self

    Fitted self (or raises an error)

    Note

    Embeddings start being randomly distributed and hold no structure other than spurious correlations. Each time you apply a learning step by calling this method, you are tweaking the embedding to approach some rows and/or move others away. You can use both converge and diverge or just one of them and call this as many times you want with varying learning step. A proxy of how much an embedding can learn can be estimated by measuring how row correlations are converging towards some asymptotic values.

    Source code in mercury/graph/embeddings/embeddings.py
    def fit(self, converge=None, diverge=None):\n    \"\"\"\n    Apply a learning step to the embedding.\n\n    Args:\n        converge (numpy matrix of two columns): A matrix of indices to elements meaning (first column) should be approached to\n            (second column).\n        diverge (numpy matrix of two columns): A matrix of indices to elements meaning (first column) should be moved away from\n            (second column).\n\n    Returns:\n        (self): Fitted self (or raises an error)\n\n    Note:\n        Embeddings start being randomly distributed and hold no structure other than spurious correlations. Each time you apply a\n        learning step by calling this method, you are tweaking the embedding to approach some rows and/or move others away. You can use\n        both converge and diverge or just one of them and call this as many times you want with varying learning step. A proxy of how\n        much an embedding can learn can be estimated by measuring how row correlations are converging towards some asymptotic values.\n    \"\"\"\n\n    w = self.learn_step * np.pi / 180\n\n    cos_w = np.cos(w)\n    sin_w = np.sin(w)\n\n    if converge is not None:\n        self.embeddings_matrix_ = _elliptic_rotate(\n            self.embeddings_matrix_, converge[:, 0], converge[:, 1], cos_w, sin_w\n        )\n\n        if self.bidirectional:\n            self.embeddings_matrix_ = _elliptic_rotate(\n                self.embeddings_matrix_,\n                converge[:, 1],\n                converge[:, 0],\n                cos_w,\n                sin_w,\n            )\n\n    if diverge is not None:\n        self.embeddings_matrix_ = _elliptic_rotate(\n            self.embeddings_matrix_, diverge[:, 0], diverge[:, 1], cos_w, -sin_w\n        )\n\n        if self.bidirectional:\n            self.embeddings_matrix_ = _elliptic_rotate(\n                self.embeddings_matrix_, diverge[:, 1], diverge[:, 0], cos_w, -sin_w\n            )\n\n    return self\n
    "},{"location":"reference/embeddings/#mercury.graph.embeddings.Embeddings.get_most_similar_embeddings","title":"get_most_similar_embeddings(index, k=5, metric='cosine')","text":"

    Given an index of a vector in the embedding matrix, returns the k most similar embeddings in the matrix

    Parameters:

    Name Type Description Default index int

    index of the vector in the matrix that we want to compute the similar embeddings

    required k int

    Number of most similar embeddings to return

    5 metric str

    metric to use as a similarity.

    'cosine'

    Returns:

    Type Description list

    list of k most similar nodes as indices and list of similarities of the most similar nodes

    Source code in mercury/graph/embeddings/embeddings.py
    def get_most_similar_embeddings(self, index, k=5, metric=\"cosine\"):\n    \"\"\"\n    Given an index of a vector in the embedding matrix, returns the k most similar embeddings in the matrix\n\n    Args:\n        index (int): index of the vector in the matrix that we want to compute the similar embeddings\n        k (int): Number of most similar embeddings to return\n        metric (str): metric to use as a similarity.\n\n    Returns:\n        (list): list of k most similar nodes as indices and list of similarities of the most similar nodes\n    \"\"\"\n    if metric == \"cosine\":\n        similarities = (\n            1\n            - cdist(\n                np.expand_dims(self.as_numpy()[index], axis=0),\n                self.as_numpy(),\n                \"cosine\",\n            )[0]\n        )\n\n    elif metric == \"euclidean\":\n        similarities = 1 / (\n            1\n            + cdist(\n                np.expand_dims(self.as_numpy()[index], axis=0),\n                self.as_numpy(),\n                \"euclidean\",\n            )[0]\n        )\n\n    else:\n        raise ValueError(\"Unknown Distance Metric: %s\" % metric)\n\n    ordered_indices = np.argsort(similarities)[::-1][1 : (k + 1)]\n    ordered_similarities = similarities[ordered_indices]\n\n    return ordered_indices, ordered_similarities\n
    "},{"location":"reference/embeddings/#mercury.graph.embeddings.GraphEmbedding","title":"mercury.graph.embeddings.GraphEmbedding(dimension=None, n_jumps=None, max_per_epoch=None, learn_step=3, bidirectional=False, load_file=None)","text":"

    Bases: BaseClass

    Create an embedding mapping the nodes of a graph.

    Parameters:

    Name Type Description Default dimension int

    The number of columns in the embedding. See the notes in Embeddings for details. (This parameter will be ignored when load_file is used.)

    None n_jumps int

    Number of random jumps from node to node.

    None max_per_epoch int

    Maximum number of consecutive random jumps without randomly jumping outside the edges. Note that normal random jumps are not going to explore outside a connected component.

    None learn_step float

    The size of the learning step elements get approached or moved away. Units are hexadecimal degrees along an ellipse.

    3 bidirectional bool

    Should the changes apply only to the elements of first column (False) or to both.

    False load_file str

    (optional) The full path to a binary file containing a serialized GraphEmbedding object. This file must be created using GraphEmbedding.save().

    None

    GraphEmbedding class constructor

    Source code in mercury/graph/embeddings/graphembeddings.py
    def __init__(\n    self,\n    dimension=None,\n    n_jumps=None,\n    max_per_epoch=None,\n    learn_step=3,\n    bidirectional=False,\n    load_file=None,\n):\n    \"\"\"GraphEmbedding class constructor\"\"\"\n    if load_file is None and (dimension is None or n_jumps is None):\n        raise ValueError(\n            \"Parameters dimension and n_jumps are required when load_file is None\"\n        )\n\n    self.dimension = dimension\n    self.n_jumps = n_jumps\n    self.max_per_epoch = max_per_epoch\n    self.learn_step = learn_step\n    self.bidirectional = bidirectional\n    self.load_file = load_file\n\n    if self.load_file is not None:\n        self._load(self.load_file)\n        return\n
    "},{"location":"reference/embeddings/#mercury.graph.embeddings.GraphEmbedding.__getitem__","title":"__getitem__(arg)","text":"

    Method to access rows in the embedding by ID.

    Parameters:

    Name Type Description Default arg same as node ids in the graph

    A node ID in the graph

    required

    Returns:

    Type Description matrix

    A numpy matrix of one row

    Source code in mercury/graph/embeddings/graphembeddings.py
    def __getitem__(self, arg):\n    \"\"\"\n    Method to access rows in the embedding by ID.\n\n    Args:\n        arg (same as node ids in the graph): A node ID in the graph\n\n    Returns:\n        (numpy.matrix): A numpy matrix of one row\n\n    \"\"\"\n    return self.embeddings_.embeddings_matrix_[self.node_ids.index(arg)]\n
    "},{"location":"reference/embeddings/#mercury.graph.embeddings.GraphEmbedding.embedding","title":"embedding()","text":"

    Return the internal Embeddings object.

    Returns:

    Type Description Embeddings

    The embedding which is a dense matrix of float that can be used with numpy functions.

    Source code in mercury/graph/embeddings/graphembeddings.py
    def embedding(self):\n    \"\"\"\n    Return the internal Embeddings object.\n\n    Returns:\n        (mercury.graph.embeddings.Embeddings): The embedding which is a dense matrix of `float` that can be used with `numpy` functions.\n    \"\"\"\n    if not hasattr(self, \"embeddings_\"):\n        return\n\n    return self.embeddings_\n
    "},{"location":"reference/embeddings/#mercury.graph.embeddings.GraphEmbedding.fit","title":"fit(g)","text":"

    Train the embedding by doing random walks.

    Parameters:

    Name Type Description Default g mercury.graph Graph asset

    A mercury.graph Graph object. The embedding will be created so that each row in the embedding maps a node ID in g.

    required

    Returns:

    Type Description self

    Fitted self (or raises an error)

    This does a number of random walks starting from a random node and selecting the edges with a probability that is proportional to the weight of the edge. If the destination node also has outgoing edges, the next step will start from it, otherwise, a new random node will be selected. The edges visited (concordant pairs) will get some reinforcement in the embedding while randomly selected non-existent edges will get divergence instead (discordant pairs).

    Internally, this stores the node IDs of the nodes visited and calls Embeddings.fit() to transfer the structure to the embedding. Of course, it can be called many times on the same GraphEmbedding.

    Source code in mercury/graph/embeddings/graphembeddings.py
    def fit(self, g: Graph):\n    \"\"\"\n    Train the embedding by doing random walks.\n\n    Args:\n        g (mercury.graph Graph asset): A `mercury.graph` Graph object. The embedding will be created so that each row in the embedding maps\n            a node ID in g.\n\n    Returns:\n        (self): Fitted self (or raises an error)\n\n    This does a number of random walks starting from a random node and selecting the edges with a probability that is proportional to\n    the weight of the edge. If the destination node also has outgoing edges, the next step will start from it, otherwise, a new random\n    node will be selected. The edges visited (concordant pairs) will get some reinforcement in the embedding while a randomly selected\n    non-existent edges will get divergence instead (discordant pairs).\n\n    Internally, this stores the node IDS of the node visited and calls Embeddings.fit() to transfer the structure to the embedding.\n    Of course, it can be called many times on the same GraphEmbedding.\n\n    \"\"\"\n\n    self.node_ids = list(g.networkx.nodes)\n\n    j_matrix = nx.adjacency_matrix(g.networkx)\n\n    N = j_matrix.shape[1]\n    M = j_matrix.nnz\n\n    self.r_ini = np.zeros(N, dtype=int)\n    self.r_len = np.zeros(N, dtype=int)\n    self.r_sum = np.zeros(N, dtype=float)\n    self.r_col = np.zeros(M, dtype=int)\n    self.r_wgt = np.zeros(M, dtype=float)\n\n    i = 0\n    for r in range(N):\n        self.r_ini[r] = i\n\n        i_col = j_matrix[[r], :].nonzero()[1]\n        L = len(i_col)\n\n        self.r_len[r] = L\n\n        for k in range(L):\n            c = i_col[k]\n            w = j_matrix[r, c]\n\n            self.r_sum[r] += w\n            self.r_col[i] = c\n            self.r_wgt[i] = w\n\n            i += 1\n\n    self.TotW = sum(self.r_sum)\n\n    converge, diverge = _random_walks(\n        self.r_ini,\n        self.r_len,\n        self.r_sum,\n        self.r_col,\n        self.r_wgt,\n        self.TotW,\n        self.n_jumps,\n    
    self.max_per_epoch if self.max_per_epoch is not None else self.n_jumps,\n    )\n\n    self.embeddings_ = Embeddings(\n        dimension=self.dimension,\n        num_elements=len(self.node_ids),\n        learn_step=self.learn_step,\n        bidirectional=self.bidirectional,\n    )\n    self.embeddings_.fit(converge, diverge)\n\n    return self\n
    "},{"location":"reference/embeddings/#mercury.graph.embeddings.GraphEmbedding.get_most_similar_nodes","title":"get_most_similar_nodes(node_id, k=5, metric='cosine', return_as_indices=False)","text":"

    Returns the k most similar nodes and the similarities

    Parameters:

    Name Type Description Default node_id object

    Id of the node that we want to search the similar nodes.

    required k int

    Number of most similar nodes to return

    5 metric str

    metric to use as a similarity.

    'cosine' return_as_indices bool

    if return the nodes as indices (False), or as node ids (True)

    False

    Returns:

    Type Description list

    list of k most similar nodes and list of similarities of the most similar nodes

    DataFrame

    A list of k most similar nodes as a pd.DataFrame[word: string, similarity: double]

    Source code in mercury/graph/embeddings/graphembeddings.py
    def get_most_similar_nodes(\n    self, node_id, k=5, metric=\"cosine\", return_as_indices=False\n):\n    \"\"\"\n    Returns the k most similar nodes and the similarities\n\n    Args:\n        node_id (object): Id of the node that we want to search the similar nodes.\n        k (int): Number of most similar nodes to return\n        metric (str): metric to use as a similarity.\n        return_as_indices (bool): if return the nodes as indices (False), or as node ids (True)\n\n    Returns:\n        (list): list of k most similar nodes and list of similarities of the most similar nodes\n        (DataFrame): A list of k most similar nodes as a `pd.DataFrame[word: string, similarity: double]`\n    \"\"\"\n    node_index = self.node_ids.index(node_id)\n\n    ordered_indices, ordered_similarities = (\n        self.embeddings_.get_most_similar_embeddings(node_index, k, metric)\n    )\n\n    if not return_as_indices:\n        nodes = list(np.array(self.node_ids)[ordered_indices])\n    else:\n        nodes = list(ordered_indices)\n\n    return pd.DataFrame({\"word\": nodes, \"similarity\": ordered_similarities})\n
    "},{"location":"reference/embeddings/#mercury.graph.embeddings.GraphEmbedding.save","title":"save(file_name, save_embedding=False)","text":"

    Saves a GraphEmbedding to a compressed binary file with or without the embedding itself. It saves the graph's node names and the adjacency matrix as a sparse matrix.

    Parameters:

    Name Type Description Default file_name str

    The name of the file to which the GraphEmbedding will be saved.

    required save_embedding bool

    Since the embedding can be big and, if not trained, it is just a matrix of uniform random numbers it is possible avoiding saving it. In case it is not saved, loading the file will create a new random embedding. This parameter controls if the embedding is saved or not (the default value).

    False Source code in mercury/graph/embeddings/graphembeddings.py
    def save(self, file_name, save_embedding=False):\n    \"\"\"\n    Saves a GraphEmbedding to a compressed binary file with or without the embedding itself. It saves the graph's node names\n    and the adjacency matrix as a sparse matrix.\n\n    Args:\n        file_name (str): The name of the file to which the GraphEmbedding will be saved.\n        save_embedding (bool): Since the embedding can be big and, if not trained, it is just a matrix of uniform random numbers it is\n            possible avoiding saving it. In case it is not saved, loading the file will create a new random embedding. This parameter\n            controls if the embedding is saved or not (the default value).\n    \"\"\"\n    with bz2.BZ2File(file_name, \"w\") as f:\n        pickle.dump(GraphEmbedding.FILE_HEAD, f)\n        pickle.dump(save_embedding, f)\n        pickle.dump(self.embeddings_.dimension, f)\n\n        pickle.dump(self.node_ids, f)\n\n        np.save(f, self.r_ini)\n        np.save(f, self.r_len)\n        np.save(f, self.r_sum)\n        np.save(f, self.r_col)\n        np.save(f, self.r_wgt)\n\n        pickle.dump(self.TotW, f)\n\n        if save_embedding:\n            np.save(f, self.embeddings_.embeddings_matrix_)\n\n        pickle.dump(GraphEmbedding.FILE_END, f)\n
    "},{"location":"reference/embeddings/#mercury.graph.embeddings.SparkNode2Vec","title":"mercury.graph.embeddings.SparkNode2Vec(dimension=None, sampling_ratio=1.0, num_epochs=10, num_paths_per_node=1, batch_size=1000000, w2v_max_iter=1, w2v_num_partitions=1, w2v_step_size=0.025, w2v_min_count=5, path_cache=None, use_cached_rw=False, n_partitions_cache=10, load_file=None)","text":"

    Bases: BaseClass

    Create or reload a SparkNode2Vec embedding mapping the nodes of a graph.

    Parameters:

    Name Type Description Default dimension int

    The number of columns in the embedding. See the notes in Embeddings for details. (This parameter will be ignored when load_file is used.)

    None sampling_ratio float

    The proportion from the total number of nodes to be used in parallel at each step (whenever possible).

    1.0 num_epochs int

    Number of epochs. This is the total number of steps the iteration goes through. At each step, sampling_ratio times the total number of nodes paths will be computed in parallel.

    10 num_paths_per_node int

    The amount of random walks to source from each node.

    1 batch_size int

    This forces caching the random walks computed so far and breaks planning each time this number of epochs is reached. The default value is a high number to avoid this entering at all. In really large jobs, you may want to set this parameter to avoid possible overflows even if it can add some extra time to the process. Note that with a high number of epochs and nodes resource requirements for the active part of your random walks can be high. This allows to \"cache a continue\" so to say.

    1000000 w2v_max_iter int

    This is the Spark Word2Vec parameter maxIter, the default value is the original default value.

    1 w2v_num_partitions int

    This is the Spark Word2Vec parameter numPartitions, the default value is the original default value.

    1 w2v_step_size float

    This is the Spark Word2Vec parameter stepSize, the default value is the original default value.

    0.025 w2v_min_count int

    This is the Spark Word2Vec parameter minCount, the default value is the original default value (5). Is the minimum number of times that a node has to appear to generate an embedding.

    5 path_cache str

    Folder where random walks will be stored, the default value is None which entails that random walks will not be stored.

    None use_cached_rw bool

    Flag that indicates if random walks should be read from disk (hence, they will not be computed again). Setting this parameter to True requires a valid path_cache.

    False n_partitions_cache int

    Number of partitions that will be used when storing the random walks, to optimize read access. The default value is 10.

    10 load_file str

    (optional) The full path to a parquet file containing a serialized SparkNode2Vec object. This file must be created using SparkNode2Vec.save().

    None Source code in mercury/graph/embeddings/spark_node2vec.py
    def __init__(\n    self,\n    dimension=None,\n    sampling_ratio=1.0,\n    num_epochs=10,\n    num_paths_per_node=1,\n    batch_size=1000000,\n    w2v_max_iter=1,\n    w2v_num_partitions=1,\n    w2v_step_size=0.025,\n    w2v_min_count=5,\n    path_cache=None,\n    use_cached_rw=False,\n    n_partitions_cache=10,\n    load_file=None,\n):\n    \"\"\"\n    Create or reload a SparkNode2Vec embedding mapping the nodes of a graph.\n\n    Args:\n        dimension (int): The number of columns in the embedding. See note the notes in `Embeddings` for details. (This parameter will be\n            ignored when `load_file` is used.)\n        sampling_ratio (float): The proportion from the total number of nodes to be used in parallel at each step (whenever possible).\n        num_epochs (int): Number of epochs. This is the total number of steps the iteration goes through. At each step, sampling_ratio\n            times the total number of nodes paths will be computed in parallel.\n        num_paths_per_node (int): The amount of random walks to source from each node.\n        batch_size (int): This forces caching the random walks computed so far and breaks planning each time this number of epochs\n            is reached. The default value is a high number to avoid this entering at all. In really large jobs, you may want to\n            set this parameter to avoid possible overflows even if it can add some extra time to the process. Note that with a high\n            number of epochs and nodes resource requirements for the active part of your random walks can be high. 
This allows to\n            \"cache a continue\" so to say.\n        w2v_max_iter (int): This is the Spark Word2Vec parameter maxIter, the default value is the original default value.\n        w2v_num_partitions (int): This is the Spark Word2Vec parameter numPartitions, the default value is the original default value.\n        w2v_step_size (float): This is the Spark Word2Vec parameter stepSize, the default value is the original default value.\n        w2v_min_count (int): This is the Spark Word2Vec parameter minCount, the default value is the original default value (5). Is the\n            minimum number of times that a node has to appear to generate an embedding.\n        path_cache (str): Folder where random walks will be stored, the default value is None which entails that random walks will not\n            be stored.\n        use_cached_rw (bool): Flag that indicates if random walks should be read from disk (hence, they will not be computed again).\n            Setting this parameter to True requires a valid path_cache.\n        n_partitions_cache (int): Number of partitions that will be used when storing the random walks, to optimize read access.\n            The default value is 10.\n        load_file (str): (optional) The full path to a parquet file containing a serialized SparkNode2Vec object. 
This file must be created\n            using SparkNode2Vec.save().\n    \"\"\"\n    self.dimension = dimension\n    self.sampling_ratio = sampling_ratio\n    self.num_epochs = num_epochs\n    self.num_paths_per_node = num_paths_per_node\n    self.batch_size = batch_size\n    self.w2v_max_iter = w2v_max_iter\n    self.w2v_num_partitions = w2v_num_partitions\n    self.w2v_step_size = w2v_step_size\n    self.w2v_min_count = w2v_min_count\n    self.path_cache = path_cache\n    self.use_cached_rw = use_cached_rw\n    self.n_partitions_cache = n_partitions_cache\n    self.load_file = load_file\n\n    if self.load_file is not None:\n        self._load(self.load_file)\n        return\n
    "},{"location":"reference/embeddings/#mercury.graph.embeddings.SparkNode2Vec.embedding","title":"embedding()","text":"

    Return all embeddings.

    Returns:

    Type Description DataFrame

    All embeddings as a DataFrame[word: string, vector: vector].

    Source code in mercury/graph/embeddings/spark_node2vec.py
    def embedding(self):\n    \"\"\"\n    Return all embeddings.\n\n    Returns:\n        (DataFrame): All embeddings as a `DataFrame[word: string, vector: vector]`.\n    \"\"\"\n    if not hasattr(self, \"node2vec_\"):\n        return\n\n    return self.node2vec_.getVectors()\n
    "},{"location":"reference/embeddings/#mercury.graph.embeddings.SparkNode2Vec.fit","title":"fit(G)","text":"

    Train the embedding by doing random walks.

    Random walk paths are available in attribute paths_.

    Parameters:

    Name Type Description Default G Graph

    A mercury.graph Graph object. The embedding will be created so that each row in the embedding maps a node ID in G. (This parameter will be ignored when load_file is used.)

    required

    Returns:

    Type Description self

    Fitted self (or raises an error)

    Source code in mercury/graph/embeddings/spark_node2vec.py
    def fit(self, G: Graph):\n    \"\"\"\n    Train the embedding by doing random walks.\n\n    Random walk paths are available in attribute `paths_`.\n\n    Args:\n        G (mercury.graph.core.Graph): A `mercury.graph` Graph object. The embedding will be created so that each row in the embedding maps\n            a node ID in G. (This parameter will be ignored when `load_file` is used.)\n\n    Returns:\n        (self): Fitted self (or raises an error)\n    \"\"\"\n\n    if self.path_cache is None:\n        if self.use_cached_rw:\n            logging.warning(\n                \"Wrong options (use_cached_rw and no path_cache). \"\n                \"Paths will be recomputed.\"\n            )\n        self.use_cached_rw = False\n\n    if not self.use_cached_rw:\n        paths = (\n            self._run_rw(G)\n            .withColumn(\"size\", f.size(\"random_walks\"))\n            .where(f.col(\"size\") > 1)\n            .drop(\"size\")\n        )\n\n        if self.path_cache is not None:\n            (\n                paths.repartition(self.n_partitions_cache)\n                .write.mode(\"overwrite\")\n                .parquet(\"%s/block=0\" % self.path_cache)\n            )\n\n        if self.num_paths_per_node > 1:\n            for block_id in range(1, self.num_paths_per_node):\n                new_paths = (\n                    self._run_rw(G)\n                    .withColumn(\"size\", f.size(\"random_walks\"))\n                    .where(f.col(\"size\") > 1)\n                    .drop(\"size\")\n                )\n                if self.path_cache is None:\n                    paths = paths.unionByName(new_paths)\n                else:\n                    (\n                        new_paths.repartition(self.n_partitions_cache)\n                        .write.mode(\"overwrite\")\n                        .parquet(\"%s/block=%d\" % (self.path_cache, block_id))\n                    )\n                    # With this, we clear the persisted dataframe\n         
           new_paths.unpersist()\n\n    if self.path_cache is None:\n        self.paths_ = paths.persist()\n    else:\n        self.paths_ = (\n            SparkInterface()\n            .read_parquet(self.path_cache)\n            .drop(\"block\")\n            .repartition(self.n_partitions_cache)\n            .persist()\n        )\n\n    w2v = Word2Vec(\n        vectorSize=self.dimension,\n        maxIter=self.w2v_max_iter,\n        numPartitions=self.w2v_num_partitions,\n        stepSize=self.w2v_step_size,\n        inputCol=\"random_walks\",\n        outputCol=\"model\",\n        minCount=self.w2v_min_count,\n    )\n\n    self.node2vec_ = w2v.fit(self.paths_)\n\n    return self\n
    "},{"location":"reference/embeddings/#mercury.graph.embeddings.SparkNode2Vec.get_most_similar_nodes","title":"get_most_similar_nodes(node_id, k=5)","text":"

    Returns the k most similar nodes and a similarity measure.

    Parameters:

    Name Type Description Default node_id str

    Id of the node we want to search.

    required k int

    Number of most similar nodes to return

    5

    Returns:

    Type Description DataFrame

    A list of k most similar nodes (using cosine similarity) as a DataFrame[word: string, similarity: double]

    Source code in mercury/graph/embeddings/spark_node2vec.py
    def get_most_similar_nodes(self, node_id, k=5):\n    \"\"\"\n    Returns the k most similar nodes and a similarity measure.\n\n    Args:\n        node_id (str): Id of the node we want to search.\n        k (int): Number of most similar nodes to return\n\n    Returns:\n        (DataFrame): A list of k most similar nodes (using cosine similarity) as a `DataFrame[word: string, similarity: double]`\n    \"\"\"\n    if not hasattr(self, \"node2vec_\"):\n        return\n\n    return self.node2vec_.findSynonyms(node_id, k)\n
    "},{"location":"reference/embeddings/#mercury.graph.embeddings.SparkNode2Vec.model","title":"model()","text":"

    Returns the Spark Word2VecModel object.

    Returns:

    Type Description Word2VecModel

    The Spark Word2VecModel of the embedding to use its API directly.

    Source code in mercury/graph/embeddings/spark_node2vec.py
    def model(self):\n    \"\"\"\n    Returns the Spark Word2VecModel object.\n\n    Returns:\n        (pyspark.ml.feature.Word2VecModel): The Spark Word2VecModel of the embedding to use its API directly.\n    \"\"\"\n    if not hasattr(self, \"node2vec_\"):\n        return\n\n    return self.node2vec_\n
    "},{"location":"reference/embeddings/#mercury.graph.embeddings.SparkNode2Vec.save","title":"save(file_name)","text":"

    Saves the internal Word2VecModel to a human-readable (JSON) model metadata as a Parquet formatted data file.

    The model may be loaded using SparkNode2Vec(load_file='path/file')

    Parameters:

    Name Type Description Default file_name str

    The name of the file to which the Word2VecModel will be saved.

    required Source code in mercury/graph/embeddings/spark_node2vec.py
    def save(self, file_name):\n    \"\"\"\n    Saves the internal Word2VecModel to a human-readable (JSON) model metadata as a Parquet formatted data file.\n\n    The model may be loaded using SparkNode2Vec(load_file='path/file')\n\n    Args:\n        file_name (str): The name of the file to which the Word2VecModel will be saved.\n    \"\"\"\n    if not hasattr(self, \"node2vec_\"):\n        return\n\n    return self.node2vec_.save(file_name)\n
    "},{"location":"reference/ml/","title":"mercury.graph.ml","text":""},{"location":"reference/ml/#mercury.graph.ml.LouvainCommunities","title":"mercury.graph.ml.LouvainCommunities(min_modularity_gain=0.001, max_pass=2, max_iter=10, resolution=1, all_partitions=True, verbose=True)","text":"

    Bases: BaseClass

    Class that defines the functions that run a PySpark implementation of the Louvain algorithm to find the partition that maximizes the modularity of an undirected graph (as in 1).

    This version of the algorithm differs from 1 in that the reassignment of nodes to new communities is calculated in parallel, not sequentially. That is, all nodes are reassigned at the same time and conflicts (i.e., 1 -> C2 and 2 -> C1) are resolved with a simple tie-breaking rule. This version also introduces the resolution parameter gamma, as in 2.

    1. Blondel V D, Guillaume J-L, Lambiotte R and Lefebvre E (2008). Fast unfolding of communities in large networks. Journal of Statistical Mechanics: Theory and Experiment, 2008. https://doi.org/10.1088/1742-5468/2008/10/p10008 \u21a9\u21a9

    2. Aynaud T, Blondel V D, Guillaume J-L and Lambiotte R (2013). Multilevel local optimization of modularity. Graph Partitioning (315--345), 2013.\u00a0\u21a9

    Parameters:

    Name Type Description Default min_modularity_gain float

    Modularity gain threshold between each pass. The algorithm stops if the gain in modularity between the current pass and the previous one is less than the given threshold.

    0.001 max_pass int

    Maximum number of passes.

    2 max_iter int

    Maximum number of iterations within each pass.

    10 resolution float

    The resolution parameter gamma. Its value must be greater or equal to zero. If resolution is less than 1, modularity favors larger communities, while values greater than 1 favor smaller communities.

    1 all_partitions bool

    If True, the function will return all the partitions found at each step of the algorithm (i.e., pass0, pass1, pass2, ..., pass20). If False, only the last (and best) partition will be returned.

    True verbose bool

    If True, print progress information during the Louvain algorithm execution. Defaults to True.

    True Source code in mercury/graph/ml/louvain.py
    def __init__(\n    self,\n    min_modularity_gain=1e-03,\n    max_pass=2,\n    max_iter=10,\n    resolution: Union[float, int] = 1,\n    all_partitions=True,\n    verbose=True,\n):\n    self.min_modularity_gain = min_modularity_gain\n    self.max_pass = max_pass\n    self.max_iter = max_iter\n    self.resolution = resolution\n    self.all_partitions = all_partitions\n    self.verbose = verbose\n\n    # Check resolution\n    if resolution < 0:\n        exceptionMsg = f\"Resolution value is {resolution} and cannot be < 0.\"\n        raise ValueError(exceptionMsg)\n
    "},{"location":"reference/ml/#mercury.graph.ml.LouvainCommunities.fit","title":"fit(g)","text":"

    Parameters:

    Name Type Description Default g Graph

    A mercury graph structure.

    required

    Returns:

    Type Description self

    Fitted self (or raises an error).

    Source code in mercury/graph/ml/louvain.py
    def fit(self, g: Graph):\n    \"\"\"\n    Args:\n        g (Graph): A mercury graph structure.\n\n    Returns:\n        (self): Fitted self (or raises an error).\n    \"\"\"\n    edges = g.graphframe.edges\n\n    # Verify edges input\n    self._verify_data(\n        df=edges,\n        expected_cols_grouping=[\"src\", \"dst\"],\n        expected_cols_others=[\"weight\"],\n    )\n\n    # Init dataframe to be returned\n    ret = (\n        edges.selectExpr(\"src as id\")\n        .unionByName(edges.selectExpr(\"dst as id\"))\n        .distinct()\n        .withColumn(\"pass0\", F.row_number().over(Window.orderBy(\"id\")))\n    ).checkpoint()\n\n    # Convert edges to anonymized src's and dst's\n    edges = (\n        edges.selectExpr(\"src as src0\", \"dst as dst0\", \"weight\")\n        .join(other=ret.selectExpr(\"id as src0\", \"pass0 as src\"), on=\"src0\")\n        .join(other=ret.selectExpr(\"id as dst0\", \"pass0 as dst\"), on=\"dst0\")\n        .select(\"src\", \"dst\", \"weight\")\n    ).checkpoint()\n\n    # Calculate m and initialize modularity\n    m = self._calculate_m(edges)\n    modularity0 = -1.0\n\n    # Begin pass\n    canPass, _pass = True, 0\n    while canPass:\n\n        # Declare naive partition\n        p1 = (\n            edges.selectExpr(\"src as id\")\n            .unionByName(edges.selectExpr(\"dst as id\"))\n            .distinct()\n            .withColumn(\"c\", F.col(\"id\"))\n        )\n\n        # Begin iterations within pass\n        canIter, _iter = True, 0\n        # Carry reference to previously cached p2 to call unpersist()\n        prev_p2 = None\n        while canIter:\n\n            if _iter >= self.max_iter:\n                break\n\n            # Print progress\n            if self.verbose:\n                print(f\"Starting Pass {_pass} Iteration {_iter}.\")\n\n            # Create new partition and check if movements were made\n            p2 = self._reassign_all(edges, p1)\n            # Break complex lineage caused by 
loops first\n            p2 = p2.checkpoint()\n            p2.cache()\n\n            canIter = len(p2.where(\"cx != cj\").take(1)) > 0\n            if canIter:\n                p1 = p2.selectExpr(\"id\", \"cj as c\")\n            if prev_p2 is not None:\n                prev_p2.unpersist()\n            prev_p2 = p2\n            _iter += 1\n\n        # Calculate new modularity and update pass counter\n        modularity1 = self._calculate_modularity(edges=edges, partition=p1, m=m)\n\n        # Declare stopping criterion and update old modularity\n        canPass = (modularity1 - modularity0 > self.min_modularity_gain) and (\n            _pass < self.max_pass\n        )\n        modularity0 = modularity1\n\n        self.modularity_ = modularity0\n\n        # Update ret and compress graph\n        if canPass:\n            ret = ret.join(\n                other=p1.selectExpr(f\"id as pass{_pass}\", f\"c as pass{_pass + 1}\"),\n                on=f\"pass{_pass}\",\n            ).checkpoint()\n\n            edges = (\n                self._label_edges(edges, p1)\n                .select(\"cSrc\", \"cDst\", \"weight\")\n                .groupBy(\"cSrc\", \"cDst\")\n                .agg(F.sum(\"weight\").alias(\"weight\"))\n                .selectExpr(\"cSrc as src\", \"cDst as dst\", \"weight\")\n            ).checkpoint()\n\n        prev_p2.unpersist()\n        _pass += 1\n\n    # Return final dataframe with sorted columns\n    if self.all_partitions:\n\n        # Return sorted columns\n        cols = self._sort_passes(ret)\n        ret = ret.select(cols)\n\n    # Return final dataframe with id & community\n    else:\n        _last = self._last_pass(ret)\n        ret = ret.selectExpr(\"id as node_id\", f\"{_last} as cluster\")\n\n    self.labels_ = ret\n\n    return self\n
    "},{"location":"reference/ml/#mercury.graph.ml.SparkRandomWalker","title":"mercury.graph.ml.SparkRandomWalker(num_epochs=10, batch_size=1, n_sampling_edges=None)","text":"

    Bases: BaseClass

    Class to perform random walks from a specific source_id node within a given Graph

    Parameters:

    Name Type Description Default num_epochs int

    Number of epochs. This is the total number of steps the iteration goes through.

    10 batch_size int

    This forces caching the random walks computed so far and breaks planning each time this number of epochs is reached. The default value is a high number to avoid this entering at all. In really large jobs, you may want to set this parameter to avoid possible overflows even if it can add some extra time to the process. Note that with a high number of epochs and nodes resource requirements for the active part of your random walks can be high. This allows to \"cache a continue\" so to say.

    1 n_sampling_edges int

    by setting this parameter you can limit at each timestep the number of new paths opened from each node. This is useful when the graph contains nodes with very high out-degree, where running the algorithm several epochs is not feasible. When using this parameter, the graph will consider only at most edge_sampling outgoing edges at each epoch for each path. If the last node of the path contains more than edge_sampling edges, the selected edges are sampled using their weight.

    None Source code in mercury/graph/ml/spark_randomwalker.py
    def __init__(self, num_epochs=10, batch_size=1, n_sampling_edges=None):\n    \"\"\"\n    Class to perform random walks from a specific source_id node within a given Graph\n\n    Args:\n        num_epochs (int): Number of epochs. This is the total number of steps the iteration goes through.\n        batch_size (int): This forces caching the random walks computed so far and breaks planning each time this number of epochs\n            is reached. The default value is a high number to avoid this entering at all. In really large jobs, you may want to\n            set this parameter to avoid possible overflows even if it can add some extra time to the process. Note that with a high\n            number of epochs and nodes resource requirements for the active part of your random walks can be high. This allows to\n            \"cache a continue\" so to say.\n        n_sampling_edges (int): by setting this parameter you can limit at each timestep the number of new paths opened from each node.\n            This is useful when the graph contains nodes with very high out-degree, where running the algorithm several epochs is\n            not feasible. When using this parameter, the graph will consider only at most `edge_sampling` outgoing edges at each\n            epoch for each path. If the last node of the path contains more than `edge_sampling` the selected edges are sampled\n            using its weight.\n    \"\"\"\n    self.num_epochs = num_epochs\n    self.batch_size = batch_size\n    self.n_sampling_edges = n_sampling_edges\n
    "},{"location":"reference/ml/#mercury.graph.ml.SparkRandomWalker.fit","title":"fit(G, source_id)","text":"

    Perform random walks from a specific source_id node within a given Graph

    Parameters:

    Name Type Description Default G mercury.graph Graph asset

    A mercury.graph Graph

    required source_id int / str / list

    the source vertex or list for vertices to start the random walks.

    required

    Returns:

    Type Description self

    Fitted self (or raises an error)

    Attribute paths_ contains a Spark Dataframe with a column random_walks containing an array of the elements of the path walked and another column with the corresponding weights. The weights represent the probability of following that specific path starting from source_id.

    Source code in mercury/graph/ml/spark_randomwalker.py
    def fit(self, G: Graph, source_id):\n    \"\"\"\n    Perform random walks from a specific source_id node within a given Graph\n\n    Args:\n        G (mercury.graph Graph asset): A `mercury.graph` Graph\n        source_id (int/str/list): the source vertex or list for vertices to start the random walks.\n\n    Returns:\n        (self): Fitted self (or raises an error)\n\n    Attribute `paths_` contains a Spark Dataframe with a columns `random_walks` containing an array of the elements\n    of the path walked and another column with the corresponding weights. The weights represent the probability of\n    following that specific path starting from source_id.\n    \"\"\"\n    self.paths_ = self._run_rw(G, source_id)\n\n    return self\n
    "},{"location":"reference/ml/#mercury.graph.ml.SparkSpreadingActivation","title":"mercury.graph.ml.SparkSpreadingActivation(attribute='influence', spreading_factor=0.2, transfer_function='weighted', steps=1, influenced_by=False)","text":"

    Bases: BaseClass

    This class is a model that represents a \u201cword-of-mouth\u201d scenario where a node influences his neighbors, from where the influence spreads to other neighbors, and so on.

    At the end of the diffusion process, we inspect the amount of influence received by each node. Using a threshold-based technique, a node that is currently not influenced can be declared to be a potential future one, based on the influence that has been accumulated.

    The diffusion model is based on Spreading Activation (SPA) techniques proposed in cognitive psychology and later used for trust metric computations. For more details, please see paper entitled \"Social Ties and their Relevance to Churn in Mobile Telecom Networks\"

    Parameters:

    Name Type Description Default attribute str

    Column name which will store the amount of influence spread

    'influence' spreading_factor float

    Percentage of influence to distribute. Low values favor influence proximity to the source of injection, while high values allow the influence to also reach nodes which are further away. It must be a value in the range (0,1). Default value is 0.2

    0.2 transfer_function str

    Allowed values: \"weighted\" or \"unweighted\". Once a node decides what fraction of energy to distribute, the next step is to decide what fraction of the energy is transferred to each neighbor. This is controlled by the Transfer Function. If \"weighted\" then the energy distributed along the directed edge depends on its relative weight compared to the sum of weights of all outgoing edges of X. If \"unweighted\", then the energy distributed along the edge is independent of its relative weight. 'weighted' steps int

    Number of steps to perform

    1 influenced_by bool

    if True, an extra column \"influenced_by\" is calculated which contains the seed nodes that have spread some influence to a given node. When True, the ids of the nodes cannot contain commas \",\". Note that seed_nodes will have at least their own (remaining) influence

    False Source code in mercury/graph/ml/spark_spreadactivation.py
    def __init__(\n    self,\n    attribute: str = \"influence\",\n    spreading_factor: float = 0.2,\n    transfer_function: str = \"weighted\",\n    steps: int = 1,\n    influenced_by: bool = False,\n):\n    self.attribute = attribute\n    self.spreading_factor = spreading_factor\n    self.transfer_function = transfer_function\n    self.steps = steps\n    self.influenced_by = influenced_by\n
    "},{"location":"reference/ml/#mercury.graph.ml.SparkSpreadingActivation.fit","title":"fit(g, seed_nodes)","text":"

    Perform all iterations of spread_activation

    Parameters:

    Name Type Description Default g Graph

    A mercury.graph Graph object.

    required seed_nodes Union[List, DataFrame]

    Collection of nodes that are the \"seed\" or are the source to spread the influence. It must be a pyspark dataframe with column 'id' or a python list

    required

    Returns:

    Type Description self

    Fitted self

    Source code in mercury/graph/ml/spark_spreadactivation.py
    def fit(\n    self,\n    g: Graph,\n    seed_nodes: Union[List, \"pyspark.sql.DataFrame\"],\n):\n    \"\"\"\n    Perform all iterations of spread_activation\n\n    Args:\n        g (mercury.graph.core.Graph): A `mercury.graph` Graph object.\n        seed_nodes (Union[List, pyspark.sql.DataFrame]): Collection of nodes that are the \"seed\" or are the source to spread\n            the influence. It must be pyspark dataframe with column 'id' or python list\n\n    Returns:\n        (self): Fitted self\n    \"\"\"\n\n    # Set seed nodes which are the source of influence\n    g = self._set_seed_nodes(g, seed_nodes)\n\n    # Compute degrees\n    g = self._compute_degrees(g)\n\n    # Number of iterations specified for spread activation\n    for _ in range(0, self.steps, 1):\n        g = self._spread_activation_step(\n            g,\n        )\n\n    # Graph with updated attributes\n    self.fitted_graph_ = g\n    # Influences as DataFrame\n    self.influences_ = self.fitted_graph_.nodes_as_dataframe().select(\n        \"id\", \"influence\"\n    )\n\n    return self\n
    "},{"location":"reference/ml/#mercury.graph.ml.SpectralClustering","title":"mercury.graph.ml.SpectralClustering(n_clusters=2, mode='networkx', max_iterations=10, random_state=0)","text":"

    Bases: BaseClass

    Implementation of the spectral clustering algorithm which detects communities inside a graph.

    Parameters:

    Name Type Description Default n_clusters int

    The number of clusters that you want to detect.

    2 random_state int

    Seed for reproducibility

    0 mode str

    Calculation mode. Pass 'networkx' for using pandas + networkx or 'spark' for spark + graphframes

    'networkx' max_iterations int

    Max iterations parameter (only used if mode==spark)

    10 Source code in mercury/graph/ml/spectral.py
    def __init__(\n    self, n_clusters=2, mode=\"networkx\", max_iterations=10, random_state=0\n):\n    self.n_clusters = n_clusters\n    self.mode = mode\n    self.max_iterations = max_iterations\n    self.random_state = random_state\n\n    if self.mode not in (\"networkx\", \"spark\"):\n        raise ValueError(\"Error: Mode must be either 'networkx' or 'spark'\")\n
    "},{"location":"reference/ml/#mercury.graph.ml.SpectralClustering.fit","title":"fit(graph)","text":"

    Find the optimal clusters of a given graph. The function returns nothing, but saves the clusters and the modularity in the object self.

    Parameters:

    Name Type Description Default graph Graph

    A mercury graph structure.

    required

    Returns:

    Type Description self

    Fitted self (or raises an error)

    Source code in mercury/graph/ml/spectral.py
    def fit(self, graph: Graph):\n    \"\"\"\n    Find the optimal clusters of a given graph. The function returns nothing, but saves the clusters and\n    the modularity in the object self.\n\n    Args:\n        graph (Graph): A mercury graph structure.\n\n    Returns:\n        (self): Fitted self (or raises an error)\n\n    \"\"\"\n    if self.mode == \"networkx\":\n        self._fit_networkx(graph)\n    else:\n        self._fit_spark(graph)\n\n    return self\n
    "},{"location":"reference/ml/#mercury.graph.ml.Transition","title":"mercury.graph.ml.Transition()","text":"

    Bases: BaseClass

    Create an interface class to manage the adjacency matrix of a directed graph as a transition matrix. This enables computing distributions of probabilities over the nodes after a given number of iterations.

    Source code in mercury/graph/ml/transition.py
    def __init__(self):\n    self.fitted_graph_ = None\n
    "},{"location":"reference/ml/#mercury.graph.ml.Transition.fit","title":"fit(G)","text":"

    Converts the adjacency matrix into a transition matrix. Transition matrices are used to compute the distribution of probability of being in each of the nodes (or states) of a directed graph (or Markov process). The distribution for state s is:

    • \\(s_t = T*s_{t-1}\\)

    Where:

    T is the transition matrix. After calling .fit(), the adjacency matrix is the transition matrix. You can use .to_pandas() to see it. \(s_{t-1}\) is the previous state.

    What .fit() does is scaling the non-zero rows to make them sum 1 as they are probability distributions and make the zero rows recurrent states. A recurrent state is a final state, a state whose next state is itself.

    Parameters:

    Name Type Description Default G Graph

    A mercury.graph Graph.

    required

    Returns:

    Type Description self

    Fitted self (or raises an error).

    Note

    If created using NetworkX directly, the name of the weight must be 'weight' and must be positive. The recommended way to create the graph is using .set_row() which will always name the weight as 'weight' but does not check the value.

    Source code in mercury/graph/ml/transition.py
    def fit(self, G: Graph):\n    \"\"\"\n    Converts the adjacency matrix into a transition matrix. Transition matrices are used to compute the distribution of probability\n    of being in each of the nodes (or states) of a directed graph (or Markov process). The distribution for state s is:\n\n    * $s_t = T*s_{t-1}$\n\n    Where:\n\n    T is the transition matrix. After calling.fit(), the adjacency matrix is the transition matrix. You can use .to_pandas() to see it.\n    $s_{t-1}$ is the previous state.\n\n    What .fit() does is scaling the non-zero rows to make them sum 1 as they are probability distributions and make the zero rows\n    recurrent states. A recurrent state is a final state, a state whose next state is itself.\n\n    Args:\n        G (Graph): A `mercury.graph` Graph.\n\n    Returns:\n        (self): Fitted self (or raises an error).\n\n    Note:\n        If created using NetworkX directly, the name of the weight must be 'weight' and must be positive. The recommended way\n        to create the graph is using .set_row() which will always name the weight as 'weight' but does not check the value.\n\n    \"\"\"\n    names = list(G.networkx.nodes)\n    adj_m = nx.adjacency_matrix(G.networkx, weight=\"weight\", dtype=float)\n\n    with warnings.catch_warnings():\n        warnings.simplefilter(\"ignore\")\n\n        for i in range(adj_m.shape[0]):\n            row = adj_m[[i], :]\n            tot = row.sum()\n\n            if tot == 0:\n                row[0, i] = 1\n            else:\n                row = row / tot\n\n            adj_m[[i], :] = row\n\n    df = pd.DataFrame(adj_m.todense(), index=names, columns=names)\n    self.fitted_graph_ = Graph(nx.from_pandas_adjacency(df, create_using=nx.DiGraph))\n\n    return self\n
    "},{"location":"reference/ml/#mercury.graph.ml.Transition.to_pandas","title":"to_pandas(num_iterations=1)","text":"

    Returns the adjacency (which is the transition matrix after fit() was called) for a given number of iterations as a pandas dataframe with labeled rows and columns.

    Parameters:

    Name Type Description Default num_iterations int

    If you want to compute the matrix for a different number of iterations, k, you can use this argument to raise the matrix to any non negative integer, since \\(s_{t+k} = T^k*s_t\\)

    1

    Returns:

    Type Description DataFrame

    The transition matrix for num_iterations.

    Note

    This method does not automatically call fit(). This allows inspecting the adjacency matrix as a pandas dataframe. The result of computing num_iterations will not make sense if fit() has not been called before to_pandas().

    Source code in mercury/graph/ml/transition.py
    def to_pandas(self, num_iterations=1):\n    \"\"\"\n    Returns the adjacency (which is the transition matrix after `fit()` was called) for a given number of iterations as a pandas\n    dataframe with labeled rows and columns.\n\n    Args:\n        num_iterations (int): If you want to compute the matrix for a different number of iterations, k, you can use this argument to\n            raise the matrix to any non negative integer, since $s_{t+k} = T^k*s_t$\n\n    Returns:\n        (pd.DataFrame): The transition matrix for num_iterations.\n\n    Note:\n        This method does not automatically call `fit()`. This allows inspecting the adjacency matrix as a pandas dataframe.\n        The result of computing num_iterations will not make sense if `fit()` has not been called before `to_pandas()`.\n\n    \"\"\"\n    if self.fitted_graph_ is None:\n        raise ValueError(\"Error: fit() must be called first.\")\n\n    names = list(self.fitted_graph_.networkx.nodes)\n    adj_m = nx.adjacency_matrix(self.fitted_graph_.networkx, weight=\"weight\").todense()\n\n    if num_iterations != 1:\n        adj_m = matrix_power(adj_m, num_iterations)\n\n    return pd.DataFrame(adj_m, index=names, columns=names)\n
    "},{"location":"reference/viz/","title":"mercury.graph.viz","text":""},{"location":"reference/viz/#mercury.graph.viz.Moebius","title":"mercury.graph.viz.Moebius(G)","text":"

    Moebius class for visualizing graphs using JavaScript and HTML.

    Usage
    from mercury.graph.viz import Moebius\n\nG = ... # A graph object\nmoebius = Moebius(G)\nmoebius.show()\n

    Attributes:

    Name Type Description G Graph

    The graph to be visualized.

    use_spark bool

    Flag indicating if Spark is used.

    front_pat str

    Path to the frontend resources.

    _int_id_map dict

    A dictionary mapping node IDs to integer IDs.

    name() dict

    The instance name of the object required by the JS callback mechanism.

    Source code in mercury/graph/viz/moebius.py
    def __init__(self, G):\n\n    if HTML is None:\n        raise ImportError('IPython is not installed')\n\n    self.G = G\n    self.use_spark = self.G._as_networkx is None\n    self.front_pat = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + '/frontend'\n    self._int_id_map = {node['id'] : i for i, node in enumerate(self.G.nodes)}\n
    "},{"location":"reference/viz/#mercury.graph.viz.Moebius.name","title":"name property","text":"

    Get the instance name of the object which is required by the JS callback mechanism.

    "},{"location":"reference/viz/#mercury.graph.viz.Moebius.FHT","title":"FHT(fn)","text":"

    Syntactic sugar for display(HTML(filename = fn))

    Source code in mercury/graph/viz/moebius.py
    def FHT(self, fn):\n    \"\"\"\n    Syntactic sugar for display(HTML(filename = fn))\n    \"\"\"\n\n    display(HTML(filename = fn))\n
    "},{"location":"reference/viz/#mercury.graph.viz.Moebius.FJS","title":"FJS(fn)","text":"

    Syntactic sugar for display(Javascript(filename = fn))

    Source code in mercury/graph/viz/moebius.py
    def FJS(self, fn):\n    \"\"\"\n    Syntactic sugar for display(Javascript(filename = fn))\n    \"\"\"\n\n    display(Javascript(filename = fn))\n
    "},{"location":"reference/viz/#mercury.graph.viz.Moebius.JS","title":"JS(s)","text":"

    Syntactic sugar for display(Javascript())

    Source code in mercury/graph/viz/moebius.py
    def JS(self, s):\n    \"\"\"\n    Syntactic sugar for display(Javascript())\n    \"\"\"\n\n    display(Javascript(s))\n
    "},{"location":"reference/viz/#mercury.graph.viz.Moebius.__getitem__","title":"__getitem__(item)","text":"

    Add support for the [] operator.

    Source code in mercury/graph/viz/moebius.py
    def __getitem__(self, item):\n    \"\"\"\n    Add support for the [] operator.\n    \"\"\"\n\n    return self._get_adjacent_nodes_moebius(item)\n
    "},{"location":"reference/viz/#mercury.graph.viz.Moebius.__str__","title":"__str__()","text":"

    Convert the object via str()

    Source code in mercury/graph/viz/moebius.py
    def __str__(self):\n    \"\"\"\n    Convert the object via str()\n    \"\"\"\n\n    return 'Moebius(%s)' % str(self.G)\n
    "},{"location":"reference/viz/#mercury.graph.viz.Moebius.node_or_edge_config","title":"node_or_edge_config(text_is=None, color_is=None, colors=None, size_is=None, size_range=None, size_scale='linear')","text":"

    Create a node_config or edge_config configuration dictionary for show() in an understandable way.

    Parameters:

    Name Type Description Default text_is str

    The node/edge attribute to be displayed as text. Use the string id to draw the node id (regardless of the column having another name) or any valid node attribute name.

    None color_is str

    A categorical node/edge attribute that can be represented as a color. This will also enable a legend interface where categories can be individually shown or hidden.

    None colors dict

    The colors for each category defined as a dictionary. The keys are possible outcomes of category. The values are html RGB strings. E.g., .draw(category = 'size', colors = {'big' : '#c0a080', 'small' : '#a0c080'}) where 'big' and 'small' are possible values of the category 'size'.

    None size_is str

    The node attribute to be displayed as the size of the nodes. Use the string id to set the node id (regardless of the column having another name) or any valid node attribute name. See the options in the Moebius configuration menu to set minimum, maximum sizes, linear or logarithmic scale, etc.

    None size_range List of two numbers

    Combined with edge_label, this parameter controls the values in the variable that correspond to the minimum and maximum displayed sizes. The values below or equal the first value will be displayed with the base radius (that depends on the zoom) and the values above or equal to the second value will be shown with the maximum radius.

    None size_scale (linear, power, sqrt or log)

    Combined with edge_label, the scale used to convert the value in the variable to the displayed radius.

    'linear'

    Returns:

    Type Description dict

    The node configuration dictionary

    Source code in mercury/graph/viz/moebius.py
    def node_or_edge_config(self, text_is = None, color_is = None, colors = None, size_is = None, size_range = None, size_scale = 'linear'):\n    \"\"\"\n    Create a `node_config` or `edge_config` configuration dictionary for `show()` in an understandable way.\n\n    Args:\n        text_is (str): The node/edge attribute to be displayed as text. Use the string `\u00ecd` to draw the node id (regardless of the\n            column having another name) or any valid node attribute name.\n        color_is (str): A categorical node/edge attribute that can be represented as a color. This will also enable a legend interface\n            where categories can be individually shown or hidden.\n        colors (dict): The colors for each category defined as a dictionary. The keys are possible outcomes of category.\n            The values are html RGB strings. E.g., .draw(category = 'size', colors = {'big' : '#c0a080', 'small' : '#a0c080'})\n            where 'big' and 'small' are possible values of the category 'size'.\n        size_is (str): The node attribute to be displayed as the size of the nodes. Use the string `id` to set the node id (regardless\n            of the column having another name) or any valid node attribute name. See the options in the Moebius configuration menu to\n            set minimum, maximum sizes, linear or logarithmic scale, etc.\n        size_range (List of two numbers): Combined with edge_label, this parameter controls the values in the variable that\n            correspond to the minimum and maximum displayed sizes. 
The values below or equal the first value will be displayed with the\n            base radius (that depends on the zoom) and the values above or equal to the second value will be shown with the maximum\n            radius.\n        size_scale ('linear', 'power', 'sqrt' or 'log'): Combined with edge_label, the scale used to convert the value in the variable\n            to the displayed radius.\n\n    Returns:\n        (dict): The node configuration dictionary\n    \"\"\"\n\n    config = {}\n\n    if text_is is not None:\n        config['label'] = text_is\n\n    if color_is is not None:\n        config['color'] = color_is\n\n    if colors is not None:\n        config['color_palette'] = colors\n    else:\n        config['color_palette'] = {}\n\n    if size_is is None:\n        config['size_thresholds'] = []\n    else:\n        config['size'] = size_is\n\n        if size_range is None:\n            config['size_thresholds'] = []\n        else:\n            assert type(size_range) == list and len(size_range) == 2\n            config['size_thresholds'] = size_range\n\n        if size_scale != 'linear':\n            assert size_scale in {'power', 'sqrt', 'log'}\n\n        config['scale'] = size_scale\n\n    return config\n
    "},{"location":"reference/viz/#mercury.graph.viz.Moebius.show","title":"show(initial_id=None, initial_depth=1, node_config=None, edge_config=None)","text":"

    Start the interactive graph visualization in a Jupyter notebook.

    Parameters:

    Name Type Description Default initial_id str

    The id of the node to start the visualization.

    None initial_depth int

    The initial depth of the graph (starting with initial_id as 0) to be shown.

    1 node_config dict

    A node configuration dictionary created by node_config().

    None edge_config dict

    An edge configuration dictionary created by edge_config().

    None Source code in mercury/graph/viz/moebius.py
    def show(self, initial_id = None, initial_depth = 1, node_config = None, edge_config = None):\n    \"\"\"\n    Start the interactive graph visualization in a Jupyter notebook.\n\n    Args:\n        initial_id (str): The id of the node to start the visualization.\n        initial_depth (int): The initial depth of the graph (starting with `initial_id` as 0) to be shown.\n        node_config (dict): A node configuration dictionary created by `node_config()`.\n        edge_config (dict): An edge configuration dictionary created by `edge_config()`.\n    \"\"\"\n\n    if initial_id is None:\n        initial_id = next(iter(self._int_id_map))\n\n    initial_json = self._get_adjacent_nodes_moebius(initial_id, depth = initial_depth)\n\n    if node_config is None:\n        node_config = self.node_or_edge_config()\n\n    if edge_config is None:\n        edge_config = self.node_or_edge_config()\n\n    self._load_moebius_js(initial_json, self.name, node_config, edge_config)\n
    "}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"mercury-graph","text":"

    mercury-graph is a Python library that offers graph analytics capabilities with a technology-agnostic API, enabling users to apply a curated range of performant and scalable algorithms and utilities regardless of the underlying data framework. The consistent, scikit-like interface abstracts away the complexities of internal transformations, allowing users to effortlessly switch between different graph representations to leverage optimized algorithms implemented using pure Python, numba, networkx and PySpark GraphFrames.

    Currently implemented submodules in mercury.graph include:

    • mercury.graph.core, with the main classes of the library that create and store the graphs' data and properties.

    • mercury.graph.ml, with graph theory and machine learning algorithms such as Louvain community detection, spectral clustering, Markov chains, spreading activation-based diffusion models and graph random walkers.

    • mercury.graph.embeddings, with classes that calculate graph embeddings in different ways, such as following the Node2Vec algorithm.

    • mercury.graph.viz, with capabilities for graph visualization.

    "},{"location":"#repository","title":"Repository","text":"

    The website for the GitHub repository can be found here.

    "},{"location":"reference/core/","title":"mercury.graph.core","text":""},{"location":"reference/core/#mercury.graph.core.Graph","title":"mercury.graph.core.Graph(data=None, keys=None, nodes=None)","text":"

    This is the main class in mercury.graph.

    This class seamlessly abstracts the underlying technology used to represent the graph. You can create a graph passing the following objects to the constructor:

    • A pandas DataFrame containing edges (with a keys dictionary to specify the columns and possibly a nodes DataFrame)
    • A pyspark DataFrame containing edges (with a keys dictionary to specify the columns and possibly a nodes DataFrame)
    • A networkx graph
    • A graphframes graph

    Bear in mind that the graph object is immutable. This means that you can't modify the graph object once it has been created. If you want to modify it, you have to create a new graph object.

    The graph object provides:

    • Properties to access the graph in different formats (networkx, graphframes, dgl)
    • Properties with metrics and summary information that are calculated on demand and technology independent.
    • It is inherited by other graph classes in mercury-graph providing ML algorithms such as graph embedding, visualization, etc.

    Using this class from the other classes in mercury-graph:

    The other classes in mercury-graph define models or functionalities that are based on graphs. They use a Scikit-learn-like API to interact with the graph object. This means that the graph object is passed to the class constructor and the class follow the Scikit-learn conventions. It is recommended to follow the same conventions when creating your own classes to work with mercury-graph.

    The conventions can be found here:

    • Scikit API
    • On scikit conventions

    Parameters:

    Name Type Description Default data (DataFrame, Graph or DataFrame)

    The data to create the graph from. It can be a pandas DataFrame, a networkx Graph, a pyspark DataFrame, or a Graphframe. In case it already contains a graph (networkx or graphframes), the keys and nodes arguments are ignored.

    None keys dict

    A dictionary with keys to specify the columns in the data DataFrame. The keys are:

    • 'src': The name of the column with the source node.
    • 'dst': The name of the column with the destination node.
    • 'id': The name of the column with the node id.
    • 'weight': The name of the column with the edge weight.
    • 'directed': A boolean to specify if the graph is directed. (Only for pyspark DataFrames)

    When the keys argument is not provided or the key is missing, the default values are:

    • 'src': 'src'
    • 'dst': 'dst'
    • 'id': 'id'
    • 'weight': 'weight'
    • 'directed': True
    None nodes DataFrame

    A pandas DataFrame or a pyspark DataFrame with the nodes data. (Only when data is pandas or pyspark DataFrame and with the same type as data) If not given, the nodes are inferred from the edges DataFrame.

    None Source code in mercury/graph/core/graph.py
    def __init__(self, data = None, keys = None, nodes = None):\n    self._as_networkx = None\n    self._as_graphframe = None\n    self._as_dgl = None\n    self._degree = None\n    self._in_degree = None\n    self._out_degree = None\n    self._closeness_centrality = None\n    self._betweenness_centrality = None\n    self._pagerank = None\n    self._connected_components = None\n    self._nodes_colnames = None\n    self._edges_colnames = None\n\n    self._number_of_nodes = 0\n    self._number_of_edges = 0\n    self._node_ix = 0\n    self._is_directed = False\n    self._is_weighted = False\n\n    self._init_values = {k: v for k, v in locals().items() if k in inspect.signature(self.__init__).parameters}\n\n    if type(data) == pd.core.frame.DataFrame:\n        self._from_pandas(data, nodes, keys)\n        return\n\n    if isinstance(data, nx.Graph):      # This is the most general case, including: ...Graph, ...DiGraph and ...MultiGraph\n        self._from_networkx(data)\n        return\n\n    spark_int = SparkInterface()\n\n    if pyspark_installed and graphframes_installed:\n        if type(data) == spark_int.type_spark_dataframe:\n            self._from_dataframe(data, nodes, keys)\n            return\n\n        if type(data) == spark_int.type_graphframe:\n            self._from_graphframes(data)\n            return\n\n    raise ValueError('Invalid input data. (Expected: pandas DataFrame, a networkx Graph, a pyspark DataFrame, a graphframes Graph.)')\n
    "},{"location":"reference/core/#mercury.graph.core.Graph.betweenness_centrality","title":"betweenness_centrality property","text":"

    Returns the betweenness centrality of each node in the graph as a Python dictionary.

    "},{"location":"reference/core/#mercury.graph.core.Graph.closeness_centrality","title":"closeness_centrality property","text":"

    Returns the closeness centrality of each node in the graph as a Python dictionary.

    "},{"location":"reference/core/#mercury.graph.core.Graph.connected_components","title":"connected_components property","text":"

    Returns the connected components of each node in the graph as a Python dictionary.

    "},{"location":"reference/core/#mercury.graph.core.Graph.degree","title":"degree property","text":"

    Returns the degree of each node in the graph as a Python dictionary.

    "},{"location":"reference/core/#mercury.graph.core.Graph.dgl","title":"dgl property","text":"

    Returns the graph as a DGL graph.

    If the graph has not been converted to a DGL graph yet, it will be converted and cached for future use.

    Returns:

    Type Description DGLGraph

    The graph represented as a DGL graph.

    "},{"location":"reference/core/#mercury.graph.core.Graph.edges","title":"edges property","text":"

    Returns an iterator over the edges in the graph.

    Returns:

    Type Description EdgeIterator

    An iterator object that allows iterating over the edges in the graph.

    "},{"location":"reference/core/#mercury.graph.core.Graph.edges_colnames","title":"edges_colnames property","text":"

    Returns the column names of the edges DataFrame.

    "},{"location":"reference/core/#mercury.graph.core.Graph.graphframe","title":"graphframe property","text":"

    Returns the graph as a GraphFrame.

    If the graph has not been converted to a GraphFrame yet, it will be converted and cached for future use.

    Returns:

    Type Description GraphFrame

    The graph represented as a GraphFrame.

    "},{"location":"reference/core/#mercury.graph.core.Graph.in_degree","title":"in_degree property","text":"

    Returns the in-degree of each node in the graph as a Python dictionary.

    "},{"location":"reference/core/#mercury.graph.core.Graph.is_directed","title":"is_directed property","text":"

    Returns True if the graph is directed, False otherwise.

    Note

    Graphs created using graphframes are always directed. The way around it is to add the reverse edges to the graph. This can be done by creating the Graph with pyspark DataFrame() and defining a key 'directed' set as False in the dict argument. Otherwise, the graph will be considered directed even if these reversed edges have been created by other means this class cannot be aware of.

    "},{"location":"reference/core/#mercury.graph.core.Graph.is_weighted","title":"is_weighted property","text":"

    Returns True if the graph is weighted, False otherwise.

    A graph is considered weighted if it has a column named 'weight' in the edges DataFrame or the column has a different name and that name is passed in the dict argument as the 'weight' key.

    "},{"location":"reference/core/#mercury.graph.core.Graph.networkx","title":"networkx property","text":"

    Returns the graph representation as a NetworkX graph.

    If the graph has not been converted to NetworkX format yet, it will be converted and cached for future use.

    Returns:

    Type Description Graph

    The graph representation as a NetworkX graph.

    "},{"location":"reference/core/#mercury.graph.core.Graph.nodes","title":"nodes property","text":"

    Returns an iterator over all the nodes in the graph.

    Returns:

    Type Description NodeIterator

    An iterator that yields each node in the graph.

    "},{"location":"reference/core/#mercury.graph.core.Graph.nodes_colnames","title":"nodes_colnames property","text":"

    Returns the column names of the nodes DataFrame.

    "},{"location":"reference/core/#mercury.graph.core.Graph.number_of_edges","title":"number_of_edges property","text":"

    Returns the number of edges in the graph.

    Returns:

    Type Description int

    The number of edges in the graph.

    "},{"location":"reference/core/#mercury.graph.core.Graph.number_of_nodes","title":"number_of_nodes property","text":"

    Returns the number of nodes in the graph.

    Returns:

    Type Description int

    The number of nodes in the graph.

    "},{"location":"reference/core/#mercury.graph.core.Graph.out_degree","title":"out_degree property","text":"

    Returns the out-degree of each node in the graph as a Python dictionary.

    "},{"location":"reference/core/#mercury.graph.core.Graph.pagerank","title":"pagerank property","text":"

    Returns the PageRank of each node in the graph as a Python dictionary.

    "},{"location":"reference/core/#mercury.graph.core.Graph.edges_as_dataframe","title":"edges_as_dataframe()","text":"

    Returns the edges as a pyspark DataFrame.

    If the graph is represented as a graphframes graph, the edges are extracted from it. Otherwise, the edges are converted from the pandas DataFrame representation. The columns used as the source and destination nodes are always named 'src' and 'dst', respectively, regardless of the original column names passed to the constructor.

    Source code in mercury/graph/core/graph.py
    def edges_as_dataframe(self):\n    \"\"\"\n    Returns the edges as a pyspark DataFrame.\n\n    If the graph is represented as a graphframes graph, the edges are extracted from it. Otherwise, the edges are converted from the\n    pandas DataFrame representation. The columns used as the source and destination nodes are always named 'src' and 'dst',\n    respectively, regardless of the original column names passed to the constructor.\n    \"\"\"\n    if self._as_graphframe is not None:\n        return self._as_graphframe.edges\n\n    return SparkInterface().spark.createDataFrame(self.edges_as_pandas())\n
    "},{"location":"reference/core/#mercury.graph.core.Graph.edges_as_pandas","title":"edges_as_pandas()","text":"

    Returns the edges as a pandas DataFrame.

    If the graph is represented as a networkx graph, the edges are extracted from it. Otherwise, the graphframes graph will be used. This dataset may differ from possible pandas DataFrame passed to the constructor in the column names and order. The columns used as the source and destination nodes are always named 'src' and 'dst', respectively.

    Source code in mercury/graph/core/graph.py
    def edges_as_pandas(self):\n    \"\"\"\n    Returns the edges as a pandas DataFrame.\n\n    If the graph is represented as a networkx graph, the edges are extracted from it. Otherwise, the graphframes graph will be used.\n    This dataset may differ from possible pandas DataFrame passed to the constructor in the column names and order. The columns used\n    as the source and destination nodes are always named 'src' and 'dst', respectively.\n    \"\"\"\n    if self._as_networkx is not None:\n        edges_data = self._as_networkx.edges(data = True)\n        edges_df   = pd.DataFrame([(src, dst, attr) for src, dst, attr in edges_data], columns = ['src', 'dst', 'attributes'])\n\n        attrs_df   = pd.json_normalize(edges_df['attributes'])\n\n        return pd.concat([edges_df.drop('attributes', axis = 1), attrs_df], axis = 1)\n\n    return self.graphframe.edges.toPandas()\n
    "},{"location":"reference/core/#mercury.graph.core.Graph.nodes_as_dataframe","title":"nodes_as_dataframe()","text":"

    Returns the nodes as a pyspark DataFrame.

    If the graph is represented as a graphframes graph, the nodes are extracted from it. Otherwise, the nodes are converted from the pandas DataFrame representation. The column used as the node id is always named 'id', regardless of the original column name passed to the constructor.

    Source code in mercury/graph/core/graph.py
    def nodes_as_dataframe(self):\n    \"\"\"\n    Returns the nodes as a pyspark DataFrame.\n\n    If the graph is represented as a graphframes graph, the nodes are extracted from it. Otherwise, the nodes are converted from the\n    pandas DataFrame representation. The column used as the node id is always named 'id', regardless of the original column name passed\n    to the constructor.\n    \"\"\"\n    if self._as_graphframe is not None:\n        return self._as_graphframe.vertices\n\n    return SparkInterface().spark.createDataFrame(self.nodes_as_pandas())\n
    "},{"location":"reference/core/#mercury.graph.core.Graph.nodes_as_pandas","title":"nodes_as_pandas()","text":"

    Returns the nodes as a pandas DataFrame.

    If the graph is represented as a networkx graph, the nodes are extracted from it. Otherwise, the graphframes graph will be used. This dataset may differ from possible pandas DataFrame passed to the constructor in the column names and order. The column used as the node id is always named 'id'.

    Source code in mercury/graph/core/graph.py
    def nodes_as_pandas(self):\n    \"\"\"\n    Returns the nodes as a pandas DataFrame.\n\n    If the graph is represented as a networkx graph, the nodes are extracted from it. Otherwise, the graphframes graph will be used.\n    This dataset may differ from possible pandas DataFrame passed to the constructor in the column names and order. The column used\n    as the node id is always named 'id'.\n    \"\"\"\n    if self._as_networkx is not None:\n        nodes_data = self._as_networkx.nodes(data = True)\n        nodes_df   = pd.DataFrame([(node, attr) for node, attr in nodes_data], columns = ['id', 'attributes'])\n\n        attrs_df = pd.json_normalize(nodes_df['attributes'])\n\n        return pd.concat([nodes_df.drop('attributes', axis = 1), attrs_df], axis = 1)\n\n    return self.graphframe.vertices.toPandas()\n
    "},{"location":"reference/core/#mercury.graph.core.SparkInterface","title":"mercury.graph.core.SparkInterface(config=None, session=None)","text":"

    A class that provides an interface for interacting with Apache Spark, graphframes and dgl.

    Attributes:

    Name Type Description _spark_session SparkSession

    The shared Spark session.

    _graphframes module

    The shared graphframes namespace.

    Methods:

    Name Description _create_spark_session

    Creates a Spark session.

    spark

    Property that returns the shared Spark session.

    pyspark

    Property that returns the pyspark namespace.

    graphframes

    Property that returns the shared graphframes namespace.

    dgl

    Property that returns the shared dgl namespace.

    read_csv

    Reads a CSV file into a DataFrame.

    read_parquet

    Reads a Parquet file into a DataFrame.

    read_json

    Reads a JSON file into a DataFrame.

    read_text

    Reads a text file into a DataFrame.

    read

    Reads a file into a DataFrame.

    sql

    Executes a SQL query.

    udf

    Registers a user-defined function (UDF).

    stop

    Stops the Spark session.

    Parameters:

    Name Type Description Default config dict

    A dictionary of Spark configuration options. If not provided, the configuration in the global variable default_spark_config will be used.

    None Source code in mercury/graph/core/spark_interface.py
    def __init__(self, config=None, session=None):\n    if SparkInterface._spark_session is None:\n        if session is not None:\n            SparkInterface._spark_session = session\n        else:\n            SparkInterface._spark_session = self._create_spark_session(config)\n            # Set checkpoint directory\n            SparkInterface._spark_session.sparkContext.setCheckpointDir(\".checkpoint\")\n\n    if SparkInterface._graphframes is None and graphframes_installed:\n        SparkInterface._graphframes = gf\n\n    if SparkInterface._dgl is None and dgl_installed:\n        SparkInterface._dgl = dgl\n
    "},{"location":"reference/embeddings/","title":"mercury.graph.embeddings","text":""},{"location":"reference/embeddings/#mercury.graph.embeddings.Embeddings","title":"mercury.graph.embeddings.Embeddings(dimension, num_elements=0, mean=0, sd=1, learn_step=3, bidirectional=False)","text":"

    Bases: BaseClass

    This class holds a matrix object that is interpreted as the embeddings for any list of objects, not only the nodes of a graph. You can see this class as the internal object holding the embedding for other classes such as class GraphEmbedding.

    Parameters:

    Name Type Description Default dimension int

    The number of columns in the embedding. See note below.

    required num_elements int

    The number of rows in the embedding. You can leave this empty on creation and then use initialize_as() to automatically match the nodes in a graph.

    0 mean float

    The (expected) mean of the initial values.

    0 sd float

    The (expected) standard deviation of the initial values.

    1 learn_step float

    The size of the learning step by which elements are approached or moved away. Units are degrees along an ellipse.

    3 bidirectional bool

    Should the changes apply only to the elements of first column (False) or to both.

    False Note

    On dimension: Embeddings cannot be zero (that is against the whole concept). Smaller dimension embeddings can only hold few elements without introducing spurious correlations by some form of 'birthday attack' phenomenon as elements increase. Later it is very hard to get rid of that spurious 'knowledge'.

    Solution: With many elements, you have to go to a high enough dimension even if the structure is simple. Pretending to fit many embeddings in a low dimension without them being correlated is like pretending to plot a trillion random points in a square centimeter while keeping them 1 mm apart from each other: It's simply impossible!

    Source code in mercury/graph/embeddings/embeddings.py
    def __init__(\n    self, dimension, num_elements=0, mean=0, sd=1, learn_step=3, bidirectional=False\n):\n    self.dimension = dimension\n    self.num_elements = num_elements\n    self.mean = mean\n    self.sd = sd\n    self.learn_step = learn_step\n    self.bidirectional = bidirectional\n\n    if self.num_elements > 0:\n        self.embeddings_matrix_ = np.random.normal(\n            self.mean, self.sd, (self.num_elements, self.dimension)\n        )\n
    "},{"location":"reference/embeddings/#mercury.graph.embeddings.Embeddings.as_numpy","title":"as_numpy()","text":"

    Return the embedding as a numpy matrix where each row is an embedding.

    Source code in mercury/graph/embeddings/embeddings.py
    def as_numpy(self):\n    \"\"\"\n    Return the embedding as a numpy matrix where each row is an embedding.\n    \"\"\"\n    if not hasattr(self, \"embeddings_matrix_\"):\n        return\n\n    return self.embeddings_matrix_\n
    "},{"location":"reference/embeddings/#mercury.graph.embeddings.Embeddings.fit","title":"fit(converge=None, diverge=None)","text":"

    Apply a learning step to the embedding.

    Parameters:

    Name Type Description Default converge numpy matrix of two columns

    A matrix of indices to elements meaning (first column) should be approached to (second column).

    None diverge numpy matrix of two columns

    A matrix of indices to elements meaning (first column) should be moved away from (second column).

    None

    Returns:

    Type Description self

    Fitted self (or raises an error)

    Note

    Embeddings start being randomly distributed and hold no structure other than spurious correlations. Each time you apply a learning step by calling this method, you are tweaking the embedding to approach some rows and/or move others away. You can use both converge and diverge or just one of them and call this as many times as you want with a varying learning step. A proxy of how much an embedding can learn can be estimated by measuring how row correlations are converging towards some asymptotic values.

    Source code in mercury/graph/embeddings/embeddings.py
    def fit(self, converge=None, diverge=None):\n    \"\"\"\n    Apply a learning step to the embedding.\n\n    Args:\n        converge (numpy matrix of two columns): A matrix of indices to elements meaning (first column) should be approached to\n            (second column).\n        diverge (numpy matrix of two columns): A matrix of indices to elements meaning (first column) should be moved away from\n            (second column).\n\n    Returns:\n        (self): Fitted self (or raises an error)\n\n    Note:\n        Embeddings start being randomly distributed and hold no structure other than spurious correlations. Each time you apply a\n        learning step by calling this method, you are tweaking the embedding to approach some rows and/or move others away. You can use\n        both converge and diverge or just one of them and call this as many times you want with varying learning step. A proxy of how\n        much an embedding can learn can be estimated by measuring how row correlations are converging towards some asymptotic values.\n    \"\"\"\n\n    w = self.learn_step * np.pi / 180\n\n    cos_w = np.cos(w)\n    sin_w = np.sin(w)\n\n    if converge is not None:\n        self.embeddings_matrix_ = _elliptic_rotate(\n            self.embeddings_matrix_, converge[:, 0], converge[:, 1], cos_w, sin_w\n        )\n\n        if self.bidirectional:\n            self.embeddings_matrix_ = _elliptic_rotate(\n                self.embeddings_matrix_,\n                converge[:, 1],\n                converge[:, 0],\n                cos_w,\n                sin_w,\n            )\n\n    if diverge is not None:\n        self.embeddings_matrix_ = _elliptic_rotate(\n            self.embeddings_matrix_, diverge[:, 0], diverge[:, 1], cos_w, -sin_w\n        )\n\n        if self.bidirectional:\n            self.embeddings_matrix_ = _elliptic_rotate(\n                self.embeddings_matrix_, diverge[:, 1], diverge[:, 0], cos_w, -sin_w\n            )\n\n    return self\n
    "},{"location":"reference/embeddings/#mercury.graph.embeddings.Embeddings.get_most_similar_embeddings","title":"get_most_similar_embeddings(index, k=5, metric='cosine')","text":"

    Given an index of a vector in the embedding matrix, returns the k most similar embeddings in the matrix

    Parameters:

    Name Type Description Default index int

    index of the vector in the matrix that we want to compute the similar embeddings

    required k int

    Number of most similar embeddings to return

    5 metric str

    metric to use as a similarity.

    'cosine'

    Returns:

    Type Description list

    list of k most similar nodes as indices and list of similarities of the most similar nodes

    Source code in mercury/graph/embeddings/embeddings.py
    def get_most_similar_embeddings(self, index, k=5, metric=\"cosine\"):\n    \"\"\"\n    Given an index of a vector in the embedding matrix, returns the k most similar embeddings in the matrix\n\n    Args:\n        index (int): index of the vector in the matrix that we want to compute the similar embeddings\n        k (int): Number of most similar embeddings to return\n        metric (str): metric to use as a similarity.\n\n    Returns:\n        (list): list of k most similar nodes as indices and list of similarities of the most similar nodes\n    \"\"\"\n    if metric == \"cosine\":\n        similarities = (\n            1\n            - cdist(\n                np.expand_dims(self.as_numpy()[index], axis=0),\n                self.as_numpy(),\n                \"cosine\",\n            )[0]\n        )\n\n    elif metric == \"euclidean\":\n        similarities = 1 / (\n            1\n            + cdist(\n                np.expand_dims(self.as_numpy()[index], axis=0),\n                self.as_numpy(),\n                \"euclidean\",\n            )[0]\n        )\n\n    else:\n        raise ValueError(\"Unknown Distance Metric: %s\" % metric)\n\n    ordered_indices = np.argsort(similarities)[::-1][1 : (k + 1)]\n    ordered_similarities = similarities[ordered_indices]\n\n    return ordered_indices, ordered_similarities\n
    "},{"location":"reference/embeddings/#mercury.graph.embeddings.GraphEmbedding","title":"mercury.graph.embeddings.GraphEmbedding(dimension=None, n_jumps=None, max_per_epoch=None, learn_step=3, bidirectional=False, load_file=None)","text":"

    Bases: BaseClass

    Create an embedding mapping the nodes of a graph.

    Includes contributions by David Muelas Recuenco.

    Parameters:

    Name Type Description Default dimension int

    The number of columns in the embedding. See the notes in Embeddings for details. (This parameter will be ignored when load_file is used.)

    None n_jumps int

    Number of random jumps from node to node.

    None max_per_epoch int

    Maximum number of consecutive random jumps without randomly jumping outside the edges. Note that normal random jumps are not going to explore outside a connected component.

    None learn_step float

    The size of the learning step by which elements are approached or moved away. Units are degrees along an ellipse.

    3 bidirectional bool

    Should the changes apply only to the elements of first column (False) or to both.

    False load_file str

    (optional) The full path to a binary file containing a serialized GraphEmbedding object. This file must be created using GraphEmbedding.save().

    None

    GraphEmbedding class constructor

    Source code in mercury/graph/embeddings/graphembeddings.py
    def __init__(\n    self,\n    dimension=None,\n    n_jumps=None,\n    max_per_epoch=None,\n    learn_step=3,\n    bidirectional=False,\n    load_file=None,\n):\n    \"\"\"GraphEmbedding class constructor\"\"\"\n    if load_file is None and (dimension is None or n_jumps is None):\n        raise ValueError(\n            \"Parameters dimension and n_jumps are required when load_file is None\"\n        )\n\n    self.dimension = dimension\n    self.n_jumps = n_jumps\n    self.max_per_epoch = max_per_epoch\n    self.learn_step = learn_step\n    self.bidirectional = bidirectional\n    self.load_file = load_file\n\n    if self.load_file is not None:\n        self._load(self.load_file)\n        return\n
    "},{"location":"reference/embeddings/#mercury.graph.embeddings.GraphEmbedding.__getitem__","title":"__getitem__(arg)","text":"

    Method to access rows in the embedding by ID.

    Parameters:

    Name Type Description Default arg same as node ids in the graph

    A node ID in the graph

    required

    Returns:

    Type Description matrix

    A numpy matrix of one row

    Source code in mercury/graph/embeddings/graphembeddings.py
    def __getitem__(self, arg):\n    \"\"\"\n    Method to access rows in the embedding by ID.\n\n    Args:\n        arg (same as node ids in the graph): A node ID in the graph\n\n    Returns:\n        (numpy.matrix): A numpy matrix of one row\n\n    \"\"\"\n    return self.embeddings_.embeddings_matrix_[self.node_ids.index(arg)]\n
    "},{"location":"reference/embeddings/#mercury.graph.embeddings.GraphEmbedding.embedding","title":"embedding()","text":"

    Return the internal Embeddings object.

    Returns:

    Type Description Embeddings

    The embedding which is a dense matrix of float that can be used with numpy functions.

    Source code in mercury/graph/embeddings/graphembeddings.py
    def embedding(self):\n    \"\"\"\n    Return the internal Embeddings object.\n\n    Returns:\n        (mercury.graph.embeddings.Embeddings): The embedding which is a dense matrix of `float` that can be used with `numpy` functions.\n    \"\"\"\n    if not hasattr(self, \"embeddings_\"):\n        return\n\n    return self.embeddings_\n
    "},{"location":"reference/embeddings/#mercury.graph.embeddings.GraphEmbedding.fit","title":"fit(g)","text":"

    Train the embedding by doing random walks.

    Parameters:

    Name Type Description Default g mercury.graph Graph asset

    A mercury.graph Graph object. The embedding will be created so that each row in the embedding maps a node ID in g.

    required

    Returns:

    Type Description self

    Fitted self (or raises an error)

    This does a number of random walks starting from a random node and selecting the edges with a probability that is proportional to the weight of the edge. If the destination node also has outgoing edges, the next step will start from it, otherwise, a new random node will be selected. The edges visited (concordant pairs) will get some reinforcement in the embedding while a randomly selected non-existent edges will get divergence instead (discordant pairs).

    Internally, this stores the node IDS of the node visited and calls Embeddings.fit() to transfer the structure to the embedding. Of course, it can be called many times on the same GraphEmbedding.

    Source code in mercury/graph/embeddings/graphembeddings.py
    def fit(self, g: Graph):\n    \"\"\"\n    Train the embedding by doing random walks.\n\n    Args:\n        g (mercury.graph Graph asset): A `mercury.graph` Graph object. The embedding will be created so that each row in the embedding maps\n            a node ID in g.\n\n    Returns:\n        (self): Fitted self (or raises an error)\n\n    This does a number of random walks starting from a random node and selecting the edges with a probability that is proportional to\n    the weight of the edge. If the destination node also has outgoing edges, the next step will start from it, otherwise, a new random\n    node will be selected. The edges visited (concordant pairs) will get some reinforcement in the embedding while a randomly selected\n    non-existent edges will get divergence instead (discordant pairs).\n\n    Internally, this stores the node IDS of the node visited and calls Embeddings.fit() to transfer the structure to the embedding.\n    Of course, it can be called many times on the same GraphEmbedding.\n\n    \"\"\"\n\n    self.node_ids = list(g.networkx.nodes)\n\n    j_matrix = nx.adjacency_matrix(g.networkx)\n\n    N = j_matrix.shape[1]\n    M = j_matrix.nnz\n\n    self.r_ini = np.zeros(N, dtype=int)\n    self.r_len = np.zeros(N, dtype=int)\n    self.r_sum = np.zeros(N, dtype=float)\n    self.r_col = np.zeros(M, dtype=int)\n    self.r_wgt = np.zeros(M, dtype=float)\n\n    i = 0\n    for r in range(N):\n        self.r_ini[r] = i\n\n        i_col = j_matrix[[r], :].nonzero()[1]\n        L = len(i_col)\n\n        self.r_len[r] = L\n\n        for k in range(L):\n            c = i_col[k]\n            w = j_matrix[r, c]\n\n            self.r_sum[r] += w\n            self.r_col[i] = c\n            self.r_wgt[i] = w\n\n            i += 1\n\n    self.TotW = sum(self.r_sum)\n\n    converge, diverge = _random_walks(\n        self.r_ini,\n        self.r_len,\n        self.r_sum,\n        self.r_col,\n        self.r_wgt,\n        self.TotW,\n        self.n_jumps,\n    
    self.max_per_epoch if self.max_per_epoch is not None else self.n_jumps,\n    )\n\n    self.embeddings_ = Embeddings(\n        dimension=self.dimension,\n        num_elements=len(self.node_ids),\n        learn_step=self.learn_step,\n        bidirectional=self.bidirectional,\n    )\n    self.embeddings_.fit(converge, diverge)\n\n    return self\n
    "},{"location":"reference/embeddings/#mercury.graph.embeddings.GraphEmbedding.get_most_similar_nodes","title":"get_most_similar_nodes(node_id, k=5, metric='cosine', return_as_indices=False)","text":"

    Returns the k most similar nodes and the similarities

    Parameters:

    Name Type Description Default node_id object

    Id of the node that we want to search the similar nodes.

    required k int

    Number of most similar nodes to return

    5 metric str

    metric to use as a similarity.

    'cosine' return_as_indices bool

    whether to return the nodes as indices (False), or as node ids (True)

    False

    Returns:

    Type Description list

    list of k most similar nodes and list of similarities of the most similar nodes

    DataFrame

    A list of k most similar nodes as a pd.DataFrame[word: string, similarity: double]

    Source code in mercury/graph/embeddings/graphembeddings.py
    def get_most_similar_nodes(\n    self, node_id, k=5, metric=\"cosine\", return_as_indices=False\n):\n    \"\"\"\n    Returns the k most similar nodes and the similarities\n\n    Args:\n        node_id (object): Id of the node that we want to search the similar nodes.\n        k (int): Number of most similar nodes to return\n        metric (str): metric to use as a similarity.\n        return_as_indices (bool): if return the nodes as indices (False), or as node ids (True)\n\n    Returns:\n        (list): list of k most similar nodes and list of similarities of the most similar nodes\n        (DataFrame): A list of k most similar nodes as a `pd.DataFrame[word: string, similarity: double]`\n    \"\"\"\n    node_index = self.node_ids.index(node_id)\n\n    ordered_indices, ordered_similarities = (\n        self.embeddings_.get_most_similar_embeddings(node_index, k, metric)\n    )\n\n    if not return_as_indices:\n        nodes = list(np.array(self.node_ids)[ordered_indices])\n    else:\n        nodes = list(ordered_indices)\n\n    return pd.DataFrame({\"word\": nodes, \"similarity\": ordered_similarities})\n
    "},{"location":"reference/embeddings/#mercury.graph.embeddings.GraphEmbedding.save","title":"save(file_name, save_embedding=False)","text":"

    Saves a GraphEmbedding to a compressed binary file with or without the embedding itself. It saves the graph's node names and the adjacency matrix as a sparse matrix.

    Parameters:

    Name Type Description Default file_name str

    The name of the file to which the GraphEmbedding will be saved.

    required save_embedding bool

    Since the embedding can be big and, if not trained, it is just a matrix of uniform random numbers it is possible avoiding saving it. In case it is not saved, loading the file will create a new random embedding. This parameter controls if the embedding is saved or not (the default value).

    False Source code in mercury/graph/embeddings/graphembeddings.py
    def save(self, file_name, save_embedding=False):\n    \"\"\"\n    Saves a GraphEmbedding to a compressed binary file with or without the embedding itself. It saves the graph's node names\n    and the adjacency matrix as a sparse matrix.\n\n    Args:\n        file_name (str): The name of the file to which the GraphEmbedding will be saved.\n        save_embedding (bool): Since the embedding can be big and, if not trained, it is just a matrix of uniform random numbers it is\n            possible avoiding saving it. In case it is not saved, loading the file will create a new random embedding. This parameter\n            controls if the embedding is saved or not (the default value).\n    \"\"\"\n    with bz2.BZ2File(file_name, \"w\") as f:\n        pickle.dump(GraphEmbedding.FILE_HEAD, f)\n        pickle.dump(save_embedding, f)\n        pickle.dump(self.embeddings_.dimension, f)\n\n        pickle.dump(self.node_ids, f)\n\n        np.save(f, self.r_ini)\n        np.save(f, self.r_len)\n        np.save(f, self.r_sum)\n        np.save(f, self.r_col)\n        np.save(f, self.r_wgt)\n\n        pickle.dump(self.TotW, f)\n\n        if save_embedding:\n            np.save(f, self.embeddings_.embeddings_matrix_)\n\n        pickle.dump(GraphEmbedding.FILE_END, f)\n
    "},{"location":"reference/embeddings/#mercury.graph.embeddings.SparkNode2Vec","title":"mercury.graph.embeddings.SparkNode2Vec(dimension=None, sampling_ratio=1.0, num_epochs=10, num_paths_per_node=1, batch_size=1000000, w2v_max_iter=1, w2v_num_partitions=1, w2v_step_size=0.025, w2v_min_count=5, path_cache=None, use_cached_rw=False, n_partitions_cache=10, load_file=None)","text":"

    Bases: BaseClass

    Create or reload a SparkNode2Vec embedding mapping the nodes of a graph.

    Parameters:

    Name Type Description Default dimension int

    The number of columns in the embedding. See the notes in Embeddings for details. (This parameter will be ignored when load_file is used.)

    None sampling_ratio float

    The proportion from the total number of nodes to be used in parallel at each step (whenever possible).

    1.0 num_epochs int

    Number of epochs. This is the total number of steps the iteration goes through. At each step, sampling_ratio times the total number of nodes paths will be computed in parallel.

    10 num_paths_per_node int

    The amount of random walks to source from each node.

    1 batch_size int

    This forces caching the random walks computed so far and breaks planning each time this number of epochs is reached. The default value is a high number to avoid this entering at all. In really large jobs, you may want to set this parameter to avoid possible overflows even if it can add some extra time to the process. Note that with a high number of epochs and nodes resource requirements for the active part of your random walks can be high. This allows to \"cache a continue\" so to say.

    1000000 w2v_max_iter int

    This is the Spark Word2Vec parameter maxIter, the default value is the original default value.

    1 w2v_num_partitions int

    This is the Spark Word2Vec parameter numPartitions, the default value is the original default value.

    1 w2v_step_size float

    This is the Spark Word2Vec parameter stepSize, the default value is the original default value.

    0.025 w2v_min_count int

    This is the Spark Word2Vec parameter minCount, the default value is the original default value (5). Is the minimum number of times that a node has to appear to generate an embedding.

    5 path_cache str

    Folder where random walks will be stored, the default value is None which entails that random walks will not be stored.

    None use_cached_rw bool

    Flag that indicates if random walks should be read from disk (hence, they will not be computed again). Setting this parameter to True requires a valid path_cache.

    False n_partitions_cache int

    Number of partitions that will be used when storing the random walks, to optimize read access. The default value is 10.

    10 load_file str

    (optional) The full path to a parquet file containing a serialized SparkNode2Vec object. This file must be created using SparkNode2Vec.save().

    None Source code in mercury/graph/embeddings/spark_node2vec.py
    def __init__(\n    self,\n    dimension=None,\n    sampling_ratio=1.0,\n    num_epochs=10,\n    num_paths_per_node=1,\n    batch_size=1000000,\n    w2v_max_iter=1,\n    w2v_num_partitions=1,\n    w2v_step_size=0.025,\n    w2v_min_count=5,\n    path_cache=None,\n    use_cached_rw=False,\n    n_partitions_cache=10,\n    load_file=None,\n):\n    \"\"\"\n    Create or reload a SparkNode2Vec embedding mapping the nodes of a graph.\n\n    Args:\n        dimension (int): The number of columns in the embedding. See note the notes in `Embeddings` for details. (This parameter will be\n            ignored when `load_file` is used.)\n        sampling_ratio (float): The proportion from the total number of nodes to be used in parallel at each step (whenever possible).\n        num_epochs (int): Number of epochs. This is the total number of steps the iteration goes through. At each step, sampling_ratio\n            times the total number of nodes paths will be computed in parallel.\n        num_paths_per_node (int): The amount of random walks to source from each node.\n        batch_size (int): This forces caching the random walks computed so far and breaks planning each time this number of epochs\n            is reached. The default value is a high number to avoid this entering at all. In really large jobs, you may want to\n            set this parameter to avoid possible overflows even if it can add some extra time to the process. Note that with a high\n            number of epochs and nodes resource requirements for the active part of your random walks can be high. 
This allows to\n            \"cache a continue\" so to say.\n        w2v_max_iter (int): This is the Spark Word2Vec parameter maxIter, the default value is the original default value.\n        w2v_num_partitions (int): This is the Spark Word2Vec parameter numPartitions, the default value is the original default value.\n        w2v_step_size (float): This is the Spark Word2Vec parameter stepSize, the default value is the original default value.\n        w2v_min_count (int): This is the Spark Word2Vec parameter minCount, the default value is the original default value (5). Is the\n            minimum number of times that a node has to appear to generate an embedding.\n        path_cache (str): Folder where random walks will be stored, the default value is None which entails that random walks will not\n            be stored.\n        use_cached_rw (bool): Flag that indicates if random walks should be read from disk (hence, they will not be computed again).\n            Setting this parameter to True requires a valid path_cache.\n        n_partitions_cache (int): Number of partitions that will be used when storing the random walks, to optimize read access.\n            The default value is 10.\n        load_file (str): (optional) The full path to a parquet file containing a serialized SparkNode2Vec object. 
This file must be created\n            using SparkNode2Vec.save().\n    \"\"\"\n    self.dimension = dimension\n    self.sampling_ratio = sampling_ratio\n    self.num_epochs = num_epochs\n    self.num_paths_per_node = num_paths_per_node\n    self.batch_size = batch_size\n    self.w2v_max_iter = w2v_max_iter\n    self.w2v_num_partitions = w2v_num_partitions\n    self.w2v_step_size = w2v_step_size\n    self.w2v_min_count = w2v_min_count\n    self.path_cache = path_cache\n    self.use_cached_rw = use_cached_rw\n    self.n_partitions_cache = n_partitions_cache\n    self.load_file = load_file\n\n    if self.load_file is not None:\n        self._load(self.load_file)\n        return\n
    "},{"location":"reference/embeddings/#mercury.graph.embeddings.SparkNode2Vec.embedding","title":"embedding()","text":"

    Return all embeddings.

    Returns:

    Type Description DataFrame

    All embeddings as a DataFrame[word: string, vector: vector].

    Source code in mercury/graph/embeddings/spark_node2vec.py
    def embedding(self):\n    \"\"\"\n    Return all embeddings.\n\n    Returns:\n        (DataFrame): All embeddings as a `DataFrame[word: string, vector: vector]`.\n    \"\"\"\n    if not hasattr(self, \"node2vec_\"):\n        return\n\n    return self.node2vec_.getVectors()\n
    "},{"location":"reference/embeddings/#mercury.graph.embeddings.SparkNode2Vec.fit","title":"fit(G)","text":"

    Train the embedding by doing random walks.

    Random walk paths are available in attribute paths_.

    Parameters:

    Name Type Description Default G Graph

    A mercury.graph Graph object. The embedding will be created so that each row in the embedding maps a node ID in G. (This parameter will be ignored when load_file is used.)

    required

    Returns:

    Type Description self

    Fitted self (or raises an error)

    Source code in mercury/graph/embeddings/spark_node2vec.py
    def fit(self, G: Graph):\n    \"\"\"\n    Train the embedding by doing random walks.\n\n    Random walk paths are available in attribute `paths_`.\n\n    Args:\n        G (mercury.graph.core.Graph): A `mercury.graph` Graph object. The embedding will be created so that each row in the embedding maps\n            a node ID in G. (This parameter will be ignored when `load_file` is used.)\n\n    Returns:\n        (self): Fitted self (or raises an error)\n    \"\"\"\n\n    if self.path_cache is None:\n        if self.use_cached_rw:\n            logging.warning(\n                \"Wrong options (use_cached_rw and no path_cache). \"\n                \"Paths will be recomputed.\"\n            )\n        self.use_cached_rw = False\n\n    if not self.use_cached_rw:\n        paths = (\n            self._run_rw(G)\n            .withColumn(\"size\", f.size(\"random_walks\"))\n            .where(f.col(\"size\") > 1)\n            .drop(\"size\")\n        )\n\n        if self.path_cache is not None:\n            (\n                paths.repartition(self.n_partitions_cache)\n                .write.mode(\"overwrite\")\n                .parquet(\"%s/block=0\" % self.path_cache)\n            )\n\n        if self.num_paths_per_node > 1:\n            for block_id in range(1, self.num_paths_per_node):\n                new_paths = (\n                    self._run_rw(G)\n                    .withColumn(\"size\", f.size(\"random_walks\"))\n                    .where(f.col(\"size\") > 1)\n                    .drop(\"size\")\n                )\n                if self.path_cache is None:\n                    paths = paths.unionByName(new_paths)\n                else:\n                    (\n                        new_paths.repartition(self.n_partitions_cache)\n                        .write.mode(\"overwrite\")\n                        .parquet(\"%s/block=%d\" % (self.path_cache, block_id))\n                    )\n                    # With this, we clear the persisted dataframe\n         
           new_paths.unpersist()\n\n    if self.path_cache is None:\n        self.paths_ = paths.persist()\n    else:\n        self.paths_ = (\n            SparkInterface()\n            .read_parquet(self.path_cache)\n            .drop(\"block\")\n            .repartition(self.n_partitions_cache)\n            .persist()\n        )\n\n    w2v = Word2Vec(\n        vectorSize=self.dimension,\n        maxIter=self.w2v_max_iter,\n        numPartitions=self.w2v_num_partitions,\n        stepSize=self.w2v_step_size,\n        inputCol=\"random_walks\",\n        outputCol=\"model\",\n        minCount=self.w2v_min_count,\n    )\n\n    self.node2vec_ = w2v.fit(self.paths_)\n\n    return self\n
    "},{"location":"reference/embeddings/#mercury.graph.embeddings.SparkNode2Vec.get_most_similar_nodes","title":"get_most_similar_nodes(node_id, k=5)","text":"

    Returns the k most similar nodes and a similarity measure.

    Parameters:

    Name Type Description Default node_id str

    Id of the node we want to search.

    required k int

    Number of most similar nodes to return

    5

    Returns:

    Type Description DataFrame

    A list of k most similar nodes (using cosine similarity) as a DataFrame[word: string, similarity: double]

    Source code in mercury/graph/embeddings/spark_node2vec.py
    def get_most_similar_nodes(self, node_id, k=5):\n    \"\"\"\n    Returns the k most similar nodes and a similarity measure.\n\n    Args:\n        node_id (str): Id of the node we want to search.\n        k (int): Number of most similar nodes to return\n\n    Returns:\n        (DataFrame): A list of k most similar nodes (using cosine similarity) as a `DataFrame[word: string, similarity: double]`\n    \"\"\"\n    if not hasattr(self, \"node2vec_\"):\n        return\n\n    return self.node2vec_.findSynonyms(node_id, k)\n
    "},{"location":"reference/embeddings/#mercury.graph.embeddings.SparkNode2Vec.model","title":"model()","text":"

    Returns the Spark Word2VecModel object.

    Returns:

    Type Description Word2VecModel

    The Spark Word2VecModel of the embedding to use its API directly.

    Source code in mercury/graph/embeddings/spark_node2vec.py
    def model(self):\n    \"\"\"\n    Returns the Spark Word2VecModel object.\n\n    Returns:\n        (pyspark.ml.feature.Word2VecModel): The Spark Word2VecModel of the embedding to use its API directly.\n    \"\"\"\n    if not hasattr(self, \"node2vec_\"):\n        return\n\n    return self.node2vec_\n
    "},{"location":"reference/embeddings/#mercury.graph.embeddings.SparkNode2Vec.save","title":"save(file_name)","text":"

    Saves the internal Word2VecModel as human-readable (JSON) model metadata together with a Parquet formatted data file.

    The model may be loaded using SparkNode2Vec(load_file='path/file')

    Parameters:

    Name Type Description Default file_name str

    The name of the file to which the Word2VecModel will be saved.

    required Source code in mercury/graph/embeddings/spark_node2vec.py
    def save(self, file_name):\n    \"\"\"\n    Saves the internal Word2VecModel to a human-readable (JSON) model metadata as a Parquet formatted data file.\n\n    The model may be loaded using SparkNode2Vec(load_file='path/file')\n\n    Args:\n        file_name (str): The name of the file to which the Word2VecModel will be saved.\n    \"\"\"\n    if not hasattr(self, \"node2vec_\"):\n        return\n\n    return self.node2vec_.save(file_name)\n
    "},{"location":"reference/ml/","title":"mercury.graph.ml","text":""},{"location":"reference/ml/#mercury.graph.ml.LouvainCommunities","title":"mercury.graph.ml.LouvainCommunities(min_modularity_gain=0.001, max_pass=2, max_iter=10, resolution=1, all_partitions=True, verbose=True)","text":"

    Bases: BaseClass

    Class that defines the functions that run a PySpark implementation of the Louvain algorithm to find the partition that maximizes the modularity of an undirected graph (as in 1).

    This version of the algorithm differs from 1 in that the reassignment of nodes to new communities is calculated in parallel, not sequentially. That is, all nodes are reassigned at the same time and conflicts (i.e., 1 -> C2 and 2 -> C1) are resolved with a simple tie-breaking rule. This version also introduces the resolution parameter gamma, as in 2.

    Contributed by Arturo Soberon Cedillo, Jose Antonio Guzman Vazquez and Isaac Dodanim Hernandez Garcia.

    1. Blondel V D, Guillaume J-L, Lambiotte R and Lefebvre E (2008). Fast unfolding of communities in large networks. Journal of Statistical Mechanics: Theory and Experiment, 2008. https://doi.org/10.1088/1742-5468/2008/10/p10008 \u21a9\u21a9

    2. Aynaud T, Blondel V D, Guillaume J-L and Lambiotte R (2013). Multilevel local optimization of modularity. Graph Partitioning (315--345), 2013.\u00a0\u21a9

    Parameters:

    Name Type Description Default min_modularity_gain float

    Modularity gain threshold between each pass. The algorithm stops if the gain in modularity between the current pass and the previous one is less than the given threshold.

    0.001 max_pass int

    Maximum number of passes.

    2 max_iter int

    Maximum number of iterations within each pass.

    10 resolution float

    The resolution parameter gamma. Its value must be greater or equal to zero. If resolution is less than 1, modularity favors larger communities, while values greater than 1 favor smaller communities.

    1 all_partitions bool

    If True, the function will return all the partitions found at each step of the algorithm (i.e., pass0, pass1, pass2, ..., pass20). If False, only the last (and best) partition will be returned.

    True verbose bool

    If True, print progress information during the Louvain algorithm execution. Defaults to True.

    True Source code in mercury/graph/ml/louvain.py
    def __init__(\n    self,\n    min_modularity_gain=1e-03,\n    max_pass=2,\n    max_iter=10,\n    resolution: Union[float, int] = 1,\n    all_partitions=True,\n    verbose=True,\n):\n    self.min_modularity_gain = min_modularity_gain\n    self.max_pass = max_pass\n    self.max_iter = max_iter\n    self.resolution = resolution\n    self.all_partitions = all_partitions\n    self.verbose = verbose\n\n    # Check resolution\n    if resolution < 0:\n        exceptionMsg = f\"Resolution value is {resolution} and cannot be < 0.\"\n        raise ValueError(exceptionMsg)\n
    "},{"location":"reference/ml/#mercury.graph.ml.LouvainCommunities.fit","title":"fit(g)","text":"

    Parameters:

    Name Type Description Default g Graph

    A mercury graph structure.

    required

    Returns:

    Type Description self

    Fitted self (or raises an error).

    Source code in mercury/graph/ml/louvain.py
    def fit(self, g: Graph):\n    \"\"\"\n    Args:\n        g (Graph): A mercury graph structure.\n\n    Returns:\n        (self): Fitted self (or raises an error).\n    \"\"\"\n    edges = g.graphframe.edges\n\n    # Verify edges input\n    self._verify_data(\n        df=edges,\n        expected_cols_grouping=[\"src\", \"dst\"],\n        expected_cols_others=[\"weight\"],\n    )\n\n    # Init dataframe to be returned\n    ret = (\n        edges.selectExpr(\"src as id\")\n        .unionByName(edges.selectExpr(\"dst as id\"))\n        .distinct()\n        .withColumn(\"pass0\", F.row_number().over(Window.orderBy(\"id\")))\n    ).checkpoint()\n\n    # Convert edges to anonymized src's and dst's\n    edges = (\n        edges.selectExpr(\"src as src0\", \"dst as dst0\", \"weight\")\n        .join(other=ret.selectExpr(\"id as src0\", \"pass0 as src\"), on=\"src0\")\n        .join(other=ret.selectExpr(\"id as dst0\", \"pass0 as dst\"), on=\"dst0\")\n        .select(\"src\", \"dst\", \"weight\")\n    ).checkpoint()\n\n    # Calculate m and initialize modularity\n    m = self._calculate_m(edges)\n    modularity0 = -1.0\n\n    # Begin pass\n    canPass, _pass = True, 0\n    while canPass:\n\n        # Declare naive partition\n        p1 = (\n            edges.selectExpr(\"src as id\")\n            .unionByName(edges.selectExpr(\"dst as id\"))\n            .distinct()\n            .withColumn(\"c\", F.col(\"id\"))\n        )\n\n        # Begin iterations within pass\n        canIter, _iter = True, 0\n        # Carry reference to previously cached p2 to call unpersist()\n        prev_p2 = None\n        while canIter:\n\n            if _iter >= self.max_iter:\n                break\n\n            # Print progress\n            if self.verbose:\n                print(f\"Starting Pass {_pass} Iteration {_iter}.\")\n\n            # Create new partition and check if movements were made\n            p2 = self._reassign_all(edges, p1)\n            # Break complex lineage caused by 
loops first\n            p2 = p2.checkpoint()\n            p2.cache()\n\n            canIter = len(p2.where(\"cx != cj\").take(1)) > 0\n            if canIter:\n                p1 = p2.selectExpr(\"id\", \"cj as c\")\n            if prev_p2 is not None:\n                prev_p2.unpersist()\n            prev_p2 = p2\n            _iter += 1\n\n        # Calculate new modularity and update pass counter\n        modularity1 = self._calculate_modularity(edges=edges, partition=p1, m=m)\n\n        # Declare stopping criterion and update old modularity\n        canPass = (modularity1 - modularity0 > self.min_modularity_gain) and (\n            _pass < self.max_pass\n        )\n        modularity0 = modularity1\n\n        self.modularity_ = modularity0\n\n        # Update ret and compress graph\n        if canPass:\n            ret = ret.join(\n                other=p1.selectExpr(f\"id as pass{_pass}\", f\"c as pass{_pass + 1}\"),\n                on=f\"pass{_pass}\",\n            ).checkpoint()\n\n            edges = (\n                self._label_edges(edges, p1)\n                .select(\"cSrc\", \"cDst\", \"weight\")\n                .groupBy(\"cSrc\", \"cDst\")\n                .agg(F.sum(\"weight\").alias(\"weight\"))\n                .selectExpr(\"cSrc as src\", \"cDst as dst\", \"weight\")\n            ).checkpoint()\n\n        prev_p2.unpersist()\n        _pass += 1\n\n    # Return final dataframe with sorted columns\n    if self.all_partitions:\n\n        # Return sorted columns\n        cols = self._sort_passes(ret)\n        ret = ret.select(cols)\n\n    # Return final dataframe with id & community\n    else:\n        _last = self._last_pass(ret)\n        ret = ret.selectExpr(\"id as node_id\", f\"{_last} as cluster\")\n\n    self.labels_ = ret\n\n    return self\n
    "},{"location":"reference/ml/#mercury.graph.ml.SparkRandomWalker","title":"mercury.graph.ml.SparkRandomWalker(num_epochs=10, batch_size=1, n_sampling_edges=None)","text":"

    Bases: BaseClass

    Class to perform random walks from a specific source_id node within a given Graph

    Parameters:

    Name Type Description Default num_epochs int

    Number of epochs. This is the total number of steps the iteration goes through.

    10 batch_size int

    This forces caching the random walks computed so far and breaks planning each time this number of epochs is reached. The default value is a high number to avoid this entering at all. In really large jobs, you may want to set this parameter to avoid possible overflows even if it can add some extra time to the process. Note that with a high number of epochs and nodes resource requirements for the active part of your random walks can be high. This allows to \"cache a continue\" so to say.

    1 n_sampling_edges int

    by setting this parameter you can limit at each timestep the number of new paths opened from each node. This is useful when the graph contains nodes with very high out-degree, where running the algorithm several epochs is not feasible. When using this parameter, the graph will consider only at most edge_sampling outgoing edges at each epoch for each path. If the last node of the path contains more than edge_sampling edges, the selected edges are sampled using its weight.

    None Source code in mercury/graph/ml/spark_randomwalker.py
    def __init__(self, num_epochs=10, batch_size=1, n_sampling_edges=None):\n    \"\"\"\n    Class to perform random walks from a specific source_id node within a given Graph\n\n    Args:\n        num_epochs (int): Number of epochs. This is the total number of steps the iteration goes through.\n        batch_size (int): This forces caching the random walks computed so far and breaks planning each time this number of epochs\n            is reached. The default value is a high number to avoid this entering at all. In really large jobs, you may want to\n            set this parameter to avoid possible overflows even if it can add some extra time to the process. Note that with a high\n            number of epochs and nodes resource requirements for the active part of your random walks can be high. This allows to\n            \"cache a continue\" so to say.\n        n_sampling_edges (int): by setting this parameter you can limit at each timestep the number of new paths opened from each node.\n            This is useful when the graph contains nodes with very high out-degree, where running the algorithm several epochs is\n            not feasible. When using this parameter, the graph will consider only at most `edge_sampling` outgoing edges at each\n            epoch for each path. If the last node of the path contains more than `edge_sampling` the selected edges are sampled\n            using its weight.\n    \"\"\"\n    self.num_epochs = num_epochs\n    self.batch_size = batch_size\n    self.n_sampling_edges = n_sampling_edges\n
    "},{"location":"reference/ml/#mercury.graph.ml.SparkRandomWalker.fit","title":"fit(G, source_id)","text":"

    Perform random walks from a specific source_id node within a given Graph

    Parameters:

    Name Type Description Default G mercury.graph Graph asset

    A mercury.graph Graph

    required source_id int / str / list

    the source vertex or list of vertices to start the random walks.

    required

    Returns:

    Type Description self

    Fitted self (or raises an error)

    Attribute paths_ contains a Spark Dataframe with a columns random_walks containing an array of the elements of the path walked and another column with the corresponding weights. The weights represent the probability of following that specific path starting from source_id.

    Source code in mercury/graph/ml/spark_randomwalker.py
    def fit(self, G: Graph, source_id):\n    \"\"\"\n    Perform random walks from a specific source_id node within a given Graph\n\n    Args:\n        G (mercury.graph Graph asset): A `mercury.graph` Graph\n        source_id (int/str/list): the source vertex or list for vertices to start the random walks.\n\n    Returns:\n        (self): Fitted self (or raises an error)\n\n    Attribute `paths_` contains a Spark Dataframe with a columns `random_walks` containing an array of the elements\n    of the path walked and another column with the corresponding weights. The weights represent the probability of\n    following that specific path starting from source_id.\n    \"\"\"\n    self.paths_ = self._run_rw(G, source_id)\n\n    return self\n
    "},{"location":"reference/ml/#mercury.graph.ml.SparkSpreadingActivation","title":"mercury.graph.ml.SparkSpreadingActivation(attribute='influence', spreading_factor=0.2, transfer_function='weighted', steps=1, influenced_by=False)","text":"

    Bases: BaseClass

    This class is a model that represents a \u201cword-of-mouth\u201d scenario where a node influences his neighbors, from where the influence spreads to other neighbors, and so on.

    At the end of the diffusion process, we inspect the amount of influence received by each node. Using a threshold-based technique, a node that is currently not influenced can be declared to be a potential future one, based on the influence that has been accumulated.

    The diffusion model is based on Spreading Activation (SPA) techniques proposed in cognitive psychology and later used for trust metric computations. For more details, please see paper entitled \"Social Ties and their Relevance to Churn in Mobile Telecom Networks\"

    Parameters:

    Name Type Description Default attribute str

    Column name which will store the amount of influence spread

    'influence' spreading_factor float

    Percentage of influence to distribute. Low values favor influence proximity to the source of injection, while high values allow the influence to also reach nodes which are further away. It must be a value in the range (0,1). Default value is 0.2

    0.2 transfer_function str

    Allowed values: \"weighted\" or \"unweighted\". Once a node decides what fraction of energy to distribute, the next step is to decide what fraction of the energy is transferred to each neighbor. This is controlled by the Transfer Function. If \"weighted\" then the energy distributed along the directed edge depends on its relatively weight compared to the sum of weights of all outgoing edges of X. If \"unweighted\", then the energy distributed along the edge is independent of its relatively weight. 'weighted' steps int

    Number of steps to perform

    1 influenced_by bool

    if True, an extra column \"influenced_by\" is calculated which contains the seed nodes that have spread some influence to a given node. When True, the ids of the nodes cannot contain commas \",\". Note that seed_nodes will have at least their own (remaining) influence

    False Source code in mercury/graph/ml/spark_spreadactivation.py
    def __init__(\n    self,\n    attribute: str = \"influence\",\n    spreading_factor: float = 0.2,\n    transfer_function: str = \"weighted\",\n    steps: int = 1,\n    influenced_by: bool = False,\n):\n    self.attribute = attribute\n    self.spreading_factor = spreading_factor\n    self.transfer_function = transfer_function\n    self.steps = steps\n    self.influenced_by = influenced_by\n
    "},{"location":"reference/ml/#mercury.graph.ml.SparkSpreadingActivation.fit","title":"fit(g, seed_nodes)","text":"

    Perform all iterations of spread_activation

    Parameters:

    Name Type Description Default g Graph

    A mercury.graph Graph object.

    required seed_nodes Union[List, DataFrame]

    Collection of nodes that are the \"seed\" or are the source to spread the influence. It must be a pyspark dataframe with column 'id' or a python list

    required

    Returns:

    Type Description self

    Fitted self

    Source code in mercury/graph/ml/spark_spreadactivation.py
    def fit(\n    self,\n    g: Graph,\n    seed_nodes: Union[List, \"pyspark.sql.DataFrame\"],\n):\n    \"\"\"\n    Perform all iterations of spread_activation\n\n    Args:\n        g (mercury.graph.core.Graph): A `mercury.graph` Graph object.\n        seed_nodes (Union[List, pyspark.sql.DataFrame]): Collection of nodes that are the \"seed\" or are the source to spread\n            the influence. It must be pyspark dataframe with column 'id' or python list\n\n    Returns:\n        (self): Fitted self\n    \"\"\"\n\n    # Set seed nodes which are the source of influence\n    g = self._set_seed_nodes(g, seed_nodes)\n\n    # Compute degrees\n    g = self._compute_degrees(g)\n\n    # Number of iterations specified for spread activation\n    for _ in range(0, self.steps, 1):\n        g = self._spread_activation_step(\n            g,\n        )\n\n    # Graph with updated attributes\n    self.fitted_graph_ = g\n    # Influences as DataFrame\n    self.influences_ = self.fitted_graph_.nodes_as_dataframe().select(\n        \"id\", \"influence\"\n    )\n\n    return self\n
    "},{"location":"reference/ml/#mercury.graph.ml.SpectralClustering","title":"mercury.graph.ml.SpectralClustering(n_clusters=2, mode='networkx', max_iterations=10, random_state=0)","text":"

    Bases: BaseClass

    Implementation of the spectral clustering algorithm which detect communities inside a graph.

    Contributed by Gibran Gabriel Otazo Sanchez.

    Parameters:

    Name Type Description Default n_clusters int

    The number of clusters that you want to detect.

    2 random_state int

    Seed for reproducibility

    0 mode str

    Calculation mode. Pass 'networkx' for using pandas + networkx or 'spark' for spark + graphframes

    'networkx' max_iterations int

    Max iterations parameter (only used if mode==spark)

    10 Source code in mercury/graph/ml/spectral.py
    def __init__(\n    self, n_clusters=2, mode=\"networkx\", max_iterations=10, random_state=0\n):\n    self.n_clusters = n_clusters\n    self.mode = mode\n    self.max_iterations = max_iterations\n    self.random_state = random_state\n\n    if self.mode not in (\"networkx\", \"spark\"):\n        raise ValueError(\"Error: Mode must be either 'networkx' or 'spark'\")\n
    "},{"location":"reference/ml/#mercury.graph.ml.SpectralClustering.fit","title":"fit(graph)","text":"

    Find the optimal clusters of a given graph. The function returns nothing, but saves the clusters and the modularity in the object self.

    Parameters:

    Name Type Description Default graph Graph

    A mercury graph structure.

    required

    Returns:

    Type Description self

    Fitted self (or raises an error)

    Source code in mercury/graph/ml/spectral.py
    def fit(self, graph: Graph):\n    \"\"\"\n    Find the optimal clusters of a given graph. The function returns nothing, but saves the clusters and\n    the modularity in the object self.\n\n    Args:\n        graph (Graph): A mercury graph structure.\n\n    Returns:\n        (self): Fitted self (or raises an error)\n\n    \"\"\"\n    if self.mode == \"networkx\":\n        self._fit_networkx(graph)\n    else:\n        self._fit_spark(graph)\n\n    return self\n
    "},{"location":"reference/ml/#mercury.graph.ml.Transition","title":"mercury.graph.ml.Transition()","text":"

    Bases: BaseClass

    Create an interface class to manage the adjacency matrix of a directed graph as a transition matrix. This enables computing distributions of probabilities over the nodes after a given number of iterations.

    Source code in mercury/graph/ml/transition.py
    def __init__(self):\n    self.fitted_graph_ = None\n
    "},{"location":"reference/ml/#mercury.graph.ml.Transition.fit","title":"fit(G)","text":"

    Converts the adjacency matrix into a transition matrix. Transition matrices are used to compute the distribution of probability of being in each of the nodes (or states) of a directed graph (or Markov process). The distribution for state s is:

    • \\(s_t = T*s_{t-1}\\)

    Where:

    T is the transition matrix. After calling .fit(), the adjacency matrix is the transition matrix. You can use .to_pandas() to see it. \(s_{t-1}\) is the previous state.

    What .fit() does is scaling the non-zero rows to make them sum 1 as they are probability distributions and make the zero rows recurrent states. A recurrent state is a final state, a state whose next state is itself.

    Parameters:

    Name Type Description Default G Graph

    A mercury.graph Graph.

    required

    Returns:

    Type Description self

    Fitted self (or raises an error).

    Note

    If created using NetworkX directly, the name of the weight must be 'weight' and must be positive. The recommended way to create the graph is using .set_row() which will always name the weight as 'weight' but does not check the value.

    Source code in mercury/graph/ml/transition.py
    def fit(self, G: Graph):\n    \"\"\"\n    Converts the adjacency matrix into a transition matrix. Transition matrices are used to compute the distribution of probability\n    of being in each of the nodes (or states) of a directed graph (or Markov process). The distribution for state s is:\n\n    * $s_t = T*s_{t-1}$\n\n    Where:\n\n    T is the transition matrix. After calling.fit(), the adjacency matrix is the transition matrix. You can use .to_pandas() to see it.\n    $s_{t-1}$ is the previous state.\n\n    What .fit() does is scaling the non-zero rows to make them sum 1 as they are probability distributions and make the zero rows\n    recurrent states. A recurrent state is a final state, a state whose next state is itself.\n\n    Args:\n        G (Graph): A `mercury.graph` Graph.\n\n    Returns:\n        (self): Fitted self (or raises an error).\n\n    Note:\n        If created using NetworkX directly, the name of the weight must be 'weight' and must be positive. The recommended way\n        to create the graph is using .set_row() which will always name the weight as 'weight' but does not check the value.\n\n    \"\"\"\n    names = list(G.networkx.nodes)\n    adj_m = nx.adjacency_matrix(G.networkx, weight=\"weight\", dtype=float)\n\n    with warnings.catch_warnings():\n        warnings.simplefilter(\"ignore\")\n\n        for i in range(adj_m.shape[0]):\n            row = adj_m[[i], :]\n            tot = row.sum()\n\n            if tot == 0:\n                row[0, i] = 1\n            else:\n                row = row / tot\n\n            adj_m[[i], :] = row\n\n    df = pd.DataFrame(adj_m.todense(), index=names, columns=names)\n    self.fitted_graph_ = Graph(nx.from_pandas_adjacency(df, create_using=nx.DiGraph))\n\n    return self\n
    "},{"location":"reference/ml/#mercury.graph.ml.Transition.to_pandas","title":"to_pandas(num_iterations=1)","text":"

    Returns the adjacency (which is the transition matrix after fit() was called) for a given number of iterations as a pandas dataframe with labeled rows and columns.

    Parameters:

    Name Type Description Default num_iterations int

    If you want to compute the matrix for a different number of iterations, k, you can use this argument to raise the matrix to any non negative integer, since \\(s_{t+k} = T^k*s_t\\)

    1

    Returns:

    Type Description DataFrame

    The transition matrix for num_iterations.

    Note

    This method does not automatically call fit(). This allows inspecting the adjacency matrix as a pandas dataframe. The result of computing num_iterations will not make sense if fit() has not been called before to_pandas().

    Source code in mercury/graph/ml/transition.py
    def to_pandas(self, num_iterations=1):\n    \"\"\"\n    Returns the adjacency (which is the transition matrix after `fit()` was called) for a given number of iterations as a pandas\n    dataframe with labeled rows and columns.\n\n    Args:\n        num_iterations (int): If you want to compute the matrix for a different number of iterations, k, you can use this argument to\n            raise the matrix to any non negative integer, since $s_{t+k} = T^k*s_t$\n\n    Returns:\n        (pd.DataFrame): The transition matrix for num_iterations.\n\n    Note:\n        This method does not automatically call `fit()`. This allows inspecting the adjacency matrix as a pandas dataframe.\n        The result of computing num_iterations will not make sense if `fit()` has not been called before `to_pandas()`.\n\n    \"\"\"\n    if self.fitted_graph_ is None:\n        raise ValueError(\"Error: fit() must be called first.\")\n\n    names = list(self.fitted_graph_.networkx.nodes)\n    adj_m = nx.adjacency_matrix(self.fitted_graph_.networkx, weight=\"weight\").todense()\n\n    if num_iterations != 1:\n        adj_m = matrix_power(adj_m, num_iterations)\n\n    return pd.DataFrame(adj_m, index=names, columns=names)\n
    "},{"location":"reference/viz/","title":"mercury.graph.viz","text":""},{"location":"reference/viz/#mercury.graph.viz.Moebius","title":"mercury.graph.viz.Moebius(G)","text":"

    Moebius class for visualizing graphs using JavaScript and HTML.

    Note

    Moebius is currently only compatible with Google Colab and Jupyter Notebooks Classic (prior to v7).

    Usage
    from mercury.graph.viz import Moebius\n\nG = ... # A graph object\nmoebius = Moebius(G)\nmoebius.show()\n

    Attributes:

    Name Type Description G Graph

    The graph to be visualized.

    use_spark bool

    Flag indicating if Spark is used.

    front_pat str

    Path to the frontend resources.

    _int_id_map dict

    A dictionary mapping node IDs to integer IDs.

    name() dict

    The instance name of the object required by the JS callback mechanism.

    Source code in mercury/graph/viz/moebius.py
    def __init__(self, G):\n\n    if HTML is None:\n        raise ImportError('IPython is not installed')\n\n    self.G = G\n    self.use_spark = self.G._as_networkx is None\n    self.front_pat = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + '/frontend'\n    self._int_id_map = {node['id'] : i for i, node in enumerate(self.G.nodes)}\n\n    # Define callback for JS interactions within Google Colab\n    if importlib.util.find_spec('google') is not None and importlib.util.find_spec('google.colab') is not None:\n        from google.colab import output\n        from IPython import get_ipython\n\n        def colab_execute_python(code):\n            # Use get_ipython() to access the Moebius object defined by the user in a Colab cell\n            get_ipython().run_cell(f\"_temp_colab_execute_python_result = {code}\")\n            return get_ipython().user_ns[\"_temp_colab_execute_python_result\"]\n\n        output.register_callback(\"notebook.colab_execute_python\", colab_execute_python)\n
    "},{"location":"reference/viz/#mercury.graph.viz.Moebius.name","title":"name property","text":"

    Get the instance name of the object which is required by the JS callback mechanism.

    "},{"location":"reference/viz/#mercury.graph.viz.Moebius.FHT","title":"FHT(fn)","text":"

    Syntactic sugar for display(HTML(filename = fn))

    Source code in mercury/graph/viz/moebius.py
    def FHT(self, fn):\n    \"\"\"\n    Syntactic sugar for display(HTML(filename = fn))\n    \"\"\"\n\n    display(HTML(filename = fn))\n
    "},{"location":"reference/viz/#mercury.graph.viz.Moebius.FJS","title":"FJS(fn)","text":"

    Syntactic sugar for display(Javascript(filename = fn))

    Source code in mercury/graph/viz/moebius.py
    def FJS(self, fn):\n    \"\"\"\n    Syntactic sugar for display(Javascript(filename = fn))\n    \"\"\"\n\n    display(Javascript(filename = fn))\n
    "},{"location":"reference/viz/#mercury.graph.viz.Moebius.JS","title":"JS(s)","text":"

    Syntactic sugar for display(Javascript())

    Source code in mercury/graph/viz/moebius.py
    def JS(self, s):\n    \"\"\"\n    Syntactic sugar for display(Javascript())\n    \"\"\"\n\n    display(Javascript(s))\n
    "},{"location":"reference/viz/#mercury.graph.viz.Moebius.__getitem__","title":"__getitem__(item)","text":"

    Add support for the [] operator.

    Source code in mercury/graph/viz/moebius.py
    def __getitem__(self, item):\n    \"\"\"\n    Add support for the [] operator.\n    \"\"\"\n\n    return self._get_adjacent_nodes_moebius(item)\n
    "},{"location":"reference/viz/#mercury.graph.viz.Moebius.__str__","title":"__str__()","text":"

    Convert the object via str()

    Source code in mercury/graph/viz/moebius.py
    def __str__(self):\n    \"\"\"\n    Convert the object via str()\n    \"\"\"\n\n    return 'Moebius(%s)' % str(self.G)\n
    "},{"location":"reference/viz/#mercury.graph.viz.Moebius.node_or_edge_config","title":"node_or_edge_config(text_is=None, color_is=None, colors=None, size_is=None, size_range=None, size_scale='linear')","text":"

    Create a node_config or edge_config configuration dictionary for show() in an understandable way.

    Parameters:

    Name Type Description Default text_is str

    The node/edge attribute to be displayed as text. Use the string \u00ecd to draw the node id (regardless of the column having another name) or any valid node attribute name.

    None color_is str

    A categorical node/edge attribute that can be represented as a color. This will also enable a legend interface where categories can be individually shown or hidden.

    None colors dict

    The colors for each category defined as a dictionary. The keys are possible outcomes of category. The values are html RGB strings. E.g., .draw(category = 'size', colors = {'big' : '#c0a080', 'small' : '#a0c080'}) where 'big' and 'small' are possible values of the category 'size'.

    None size_is str

    The node attribute to be displayed as the size of the nodes. Use the string id to set the node id (regardless of the column having another name) or any valid node attribute name. See the options in the Moebius configuration menu to set minimum, maximum sizes, linear or logarithmic scale, etc.

    None size_range List of two numbers

    Combined with edge_label, this parameter controls the values in the variable that correspond to the minimum and maximum displayed sizes. The values below or equal the first value will be displayed with the base radius (that depends on the zoom) and the values above or equal to the second value will be shown with the maximum radius.

    None size_scale (linear, power, sqrt or log)

    Combined with edge_label, the scale used to convert the value in the variable to the displayed radius.

    'linear'

    Returns:

    Type Description dict

    The node configuration dictionary

    Source code in mercury/graph/viz/moebius.py
    def node_or_edge_config(self, text_is = None, color_is = None, colors = None, size_is = None, size_range = None, size_scale = 'linear'):\n    \"\"\"\n    Create a `node_config` or `edge_config` configuration dictionary for `show()` in an understandable way.\n\n    Args:\n        text_is (str): The node/edge attribute to be displayed as text. Use the string `\u00ecd` to draw the node id (regardless of the\n            column having another name) or any valid node attribute name.\n        color_is (str): A categorical node/edge attribute that can be represented as a color. This will also enable a legend interface\n            where categories can be individually shown or hidden.\n        colors (dict): The colors for each category defined as a dictionary. The keys are possible outcomes of category.\n            The values are html RGB strings. E.g., .draw(category = 'size', colors = {'big' : '#c0a080', 'small' : '#a0c080'})\n            where 'big' and 'small' are possible values of the category 'size'.\n        size_is (str): The node attribute to be displayed as the size of the nodes. Use the string `id` to set the node id (regardless\n            of the column having another name) or any valid node attribute name. See the options in the Moebius configuration menu to\n            set minimum, maximum sizes, linear or logarithmic scale, etc.\n        size_range (List of two numbers): Combined with edge_label, this parameter controls the values in the variable that\n            correspond to the minimum and maximum displayed sizes. 
The values below or equal the first value will be displayed with the\n            base radius (that depends on the zoom) and the values above or equal to the second value will be shown with the maximum\n            radius.\n        size_scale ('linear', 'power', 'sqrt' or 'log'): Combined with edge_label, the scale used to convert the value in the variable\n            to the displayed radius.\n\n    Returns:\n        (dict): The node configuration dictionary\n    \"\"\"\n\n    config = {}\n\n    if text_is is not None:\n        config['label'] = text_is\n\n    if color_is is not None:\n        config['color'] = color_is\n\n    if colors is not None:\n        config['color_palette'] = colors\n    else:\n        config['color_palette'] = {}\n\n    if size_is is None:\n        config['size_thresholds'] = []\n    else:\n        config['size'] = size_is\n\n        if size_range is None:\n            config['size_thresholds'] = []\n        else:\n            assert type(size_range) == list and len(size_range) == 2\n            config['size_thresholds'] = size_range\n\n        if size_scale != 'linear':\n            assert size_scale in {'power', 'sqrt', 'log'}\n\n        config['scale'] = size_scale\n\n    return config\n
    "},{"location":"reference/viz/#mercury.graph.viz.Moebius.show","title":"show(initial_id=None, initial_depth=1, node_config=None, edge_config=None)","text":"

    Start the interactive graph visualization in a Jupyter notebook.

    Parameters:

    Name Type Description Default initial_id str

    The id of the node to start the visualization.

    None initial_depth int

    The initial depth of the graph (starting with initial_id as 0) to be shown.

    1 node_config dict

    A node configuration dictionary created by node_config().

    None edge_config dict

    An edge configuration dictionary created by edge_config().

    None Source code in mercury/graph/viz/moebius.py
    def show(self, initial_id = None, initial_depth = 1, node_config = None, edge_config = None):\n    \"\"\"\n    Start the interactive graph visualization in a Jupyter notebook.\n\n    Args:\n        initial_id (str): The id of the node to start the visualization.\n        initial_depth (int): The initial depth of the graph (starting with `initial_id` as 0) to be shown.\n        node_config (dict): A node configuration dictionary created by `node_config()`.\n        edge_config (dict): An edge configuration dictionary created by `edge_config()`.\n    \"\"\"\n\n    if initial_id is None:\n        initial_id = next(iter(self._int_id_map))\n\n    initial_json = self._get_adjacent_nodes_moebius(initial_id, depth = initial_depth)\n\n    if node_config is None:\n        node_config = self.node_or_edge_config()\n\n    if edge_config is None:\n        edge_config = self.node_or_edge_config()\n\n    self._load_moebius_js(initial_json, self.name, node_config, edge_config)\n
    "}]} \ No newline at end of file diff --git a/site/sitemap.xml.gz b/site/sitemap.xml.gz index 8c0754f..412ffb4 100644 Binary files a/site/sitemap.xml.gz and b/site/sitemap.xml.gz differ