From 0dfd68845979415306c4054bb854051ee9ef5ffe Mon Sep 17 00:00:00 2001
From: Nicolas Fernandez
Date: Tue, 7 Jan 2025 16:46:17 -0500
Subject: [PATCH 1/6] bringing in Clustergrammer-PY back-end

---
 src/celldega/__init__.py                 |    2 +
 src/celldega/clust/__init__.py           | 1504 ++++++++++++++++++++++
 src/celldega/clust/calc_clust.py         |  183 +++
 src/celldega/clust/cat_pval.py           |   98 ++
 src/celldega/clust/categories.py         |  346 +++++
 src/celldega/clust/data_formats.py       |  134 ++
 src/celldega/clust/downsample_fun.py     |  280 ++++
 src/celldega/clust/enrichr_functions.py  |  376 ++++++
 src/celldega/clust/export_data.py        |   57 +
 src/celldega/clust/iframe_web_app.py     |   32 +
 src/celldega/clust/initialize_net.py     |   97 ++
 src/celldega/clust/load_data.py          |   97 ++
 src/celldega/clust/load_vect_post.py     |   46 +
 src/celldega/clust/make_clust_fun.py     |   75 ++
 src/celldega/clust/make_sim_mat.py       |   71 +
 src/celldega/clust/make_unique_labels.py |   88 ++
 src/celldega/clust/make_viz.py           |   72 ++
 src/celldega/clust/normalize_fun.py      |  165 +++
 src/celldega/clust/proc_df_labels.py     |   62 +
 src/celldega/clust/run_filter.py         |  209 +++
 20 files changed, 3994 insertions(+)
 create mode 100644 src/celldega/clust/__init__.py
 create mode 100644 src/celldega/clust/calc_clust.py
 create mode 100644 src/celldega/clust/cat_pval.py
 create mode 100644 src/celldega/clust/categories.py
 create mode 100644 src/celldega/clust/data_formats.py
 create mode 100644 src/celldega/clust/downsample_fun.py
 create mode 100644 src/celldega/clust/enrichr_functions.py
 create mode 100644 src/celldega/clust/export_data.py
 create mode 100644 src/celldega/clust/iframe_web_app.py
 create mode 100644 src/celldega/clust/initialize_net.py
 create mode 100644 src/celldega/clust/load_data.py
 create mode 100644 src/celldega/clust/load_vect_post.py
 create mode 100644 src/celldega/clust/make_clust_fun.py
 create mode 100644 src/celldega/clust/make_sim_mat.py
 create mode 100644 src/celldega/clust/make_unique_labels.py
 create mode 100644 src/celldega/clust/make_viz.py
 create mode 100644 src/celldega/clust/normalize_fun.py
 create mode 100644 src/celldega/clust/proc_df_labels.py
 create mode 100644 src/celldega/clust/run_filter.py

diff --git a/src/celldega/__init__.py b/src/celldega/__init__.py
index d051f6ba..9bb3ac92 100644
--- a/src/celldega/__init__.py
+++ b/src/celldega/__init__.py
@@ -4,6 +4,8 @@
 from celldega.pre import landscape
 from celldega.nbhd import alpha_shape
 
+from celldega.clust import Network
+
 # temporary fix for libpysal warning
 import warnings
 warnings.filterwarnings("ignore", category=FutureWarning)
diff --git a/src/celldega/clust/__init__.py b/src/celldega/clust/__init__.py
new file mode 100644
index 00000000..28550be1
--- /dev/null
+++ b/src/celldega/clust/__init__.py
@@ -0,0 +1,1504 @@
+import numpy as np
+import pandas as pd
+from copy import deepcopy
+
+from . import initialize_net
+from . import load_data
+from . import export_data
+from . import load_vect_post
+from . import make_clust_fun
+from . import normalize_fun
+from . import data_formats
+from . import enrichr_functions as enr_fun
+from . import iframe_web_app
+from . import run_filter
+from . import downsample_fun
+from . import categories
+
+from scipy.stats import ttest_ind, mannwhitneyu
+from sklearn.metrics import pairwise_distances, roc_curve, auc
+from scipy.spatial.distance import pdist
+from sklearn.metrics import confusion_matrix
+import random
+from itertools import combinations
+import matplotlib.pyplot as plt
+import json
+import ipywidgets as widgets
+import statsmodels.stats.multitest as smm
+
+def cluster():
+    net = Network()
+
+    print('**********')
+
+class Network(object):
+    '''
+    Clustergrammer.py takes a matrix as input (either from a file or a Pandas
+    DataFrame), normalizes/filters, hierarchically clusters, and produces the
+    :ref:`visualization_json` for :ref:`clustergrammer_js`.
+
+    Networks have two states:
+
+      1. the data state, where they are stored as a matrix and nodes
+      2. the viz state, where they are stored as viz.links, viz.row_nodes, and
+         viz.col_nodes.
+
+    The goal is to start in a data-state and produce a viz-state of
+    the network that will be used as input to clustergram.js.
+    '''
+
+    def __init__(self, widget=None):
+        initialize_net.main(self, widget)
+
+    def reset(self):
+        '''
+        This re-initializes the Network object.
+        '''
+        initialize_net.main(self)
+
+    def load_file(self, filename):
+        '''
+        Load TSV file.
+        '''
+        load_data.load_file(self, filename)
+
+    def load_file_as_string(self, file_string, filename=''):
+        '''
+        Load file as a string.
+        '''
+        load_data.load_file_as_string(self, file_string, filename=filename)
+
+    def load_stdin(self):
+        '''
+        Load stdin TSV-formatted string.
+        '''
+        load_data.load_stdin(self)
+
+    def load_tsv_to_net(self, file_buffer, filename=None):
+        '''
+        This will load a TSV matrix file buffer; this is exposed so that it will
+        be possible to load data without having to read from a file.
+        '''
+        load_data.load_tsv_to_net(self, file_buffer, filename)
+
+    def load_vect_post_to_net(self, vect_post):
+        '''
+        Load data in the vector format JSON.
+        '''
+        load_vect_post.main(self, vect_post)
+
+    def load_data_file_to_net(self, filename):
+        '''
+        Load Clustergrammer's dat format (saved as JSON).
+        '''
+        inst_dat = self.load_json_to_dict(filename)
+        load_data.load_data_to_net(self, inst_dat)
+
+    def cluster(self, dist_type='cosine', run_clustering=True,
+                dendro=True, views=[],
+                linkage_type='average', sim_mat=False, filter_sim=0.0,
+                calc_cat_pval=False, run_enrichr=None, enrichrgram=None,
+                clust_library='scipy', min_samples=1, min_cluster_size=2):
+        '''
+        The main function performs hierarchical clustering, optionally generates
+        filtered views (e.g. row-filtered views), and generates the
+        :ref:`visualization_json`.
+
+        Note: ``views`` was formerly set to ``['N_row_sum', 'N_row_var']`` by
+        default.
+        '''
+        initialize_net.viz(self)
+
+        make_clust_fun.make_clust(self, dist_type=dist_type,
+                                  run_clustering=run_clustering,
+                                  dendro=dendro,
+                                  requested_views=views,
+                                  linkage_type=linkage_type,
+                                  sim_mat=sim_mat,
+                                  filter_sim=filter_sim,
+                                  calc_cat_pval=calc_cat_pval,
+                                  run_enrichr=run_enrichr,
+                                  enrichrgram=enrichrgram,
+                                  clust_library=clust_library,
+                                  min_samples=min_samples,
+                                  min_cluster_size=min_cluster_size)
+
+    def swap_nan_for_zero(self):
+        '''
+        Swaps all NaN (numpy NaN) instances for zero.
+        '''
+        self.dat['mat'][np.isnan(self.dat['mat'])] = 0
+
+    def load_df(self, df_ini, meta_col=None, meta_row=None, col_cats=None,
+                row_cats=None, is_downsampled=False, meta_ds_row=None,
+                meta_ds_col=None):
+        '''
+        Load Pandas DataFrame.
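+
+        A minimal usage sketch (the metadata and category names here are
+        illustrative, not fixed by the API)::
+
+            meta_col = pd.DataFrame({'Cell Type': cell_types}, index=df.columns)
+            net.load_df(df, meta_col=meta_col, col_cats=['Cell Type'])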
+        '''
+        self.reset()
+
+        # load dataframe
+        df = deepcopy(df_ini)
+
+        if is_downsampled:
+            if meta_ds_col is not None:
+                self.meta_ds_col = meta_ds_col
+            if meta_ds_row is not None:
+                self.meta_ds_row = meta_ds_row
+
+        # define downsampled status
+        self.is_downsampled = is_downsampled
+
+        if hasattr(self, 'meta_col') == False and hasattr(self, 'meta_row') == False:
+            self.meta_cat = False
+
+        # load metadata
+        if isinstance(meta_col, pd.DataFrame):
+            self.meta_col = meta_col
+
+            if col_cats is None:
+                self.col_cats = meta_col.columns.tolist()
+            else:
+                self.col_cats = col_cats
+
+            self.meta_cat = True
+
+        if isinstance(meta_row, pd.DataFrame):
+            self.meta_row = meta_row
+
+            if row_cats is None:
+                self.row_cats = meta_row.columns.tolist()
+            else:
+                self.row_cats = row_cats
+
+            self.meta_cat = True
+
+        data_formats.df_to_dat(self, df, define_cat_colors=True)
+
+    def export_df(self):
+        '''
+        Export Pandas DataFrame.
+        '''
+        df = data_formats.dat_to_df(self)
+
+        # drop tuple categories if downsampling
+        if self.is_downsampled:
+            df.columns = self.dat['nodes']['col']
+            df.index = self.dat['nodes']['row']
+
+        return df
+
+    def df_to_dat(self, df, define_cat_colors=False):
+        '''
+        Load Pandas DataFrame (will be deprecated).
+        '''
+        data_formats.df_to_dat(self, df, define_cat_colors)
+
+    def set_matrix_colors(self, pos='red', neg='blue'):
+
+        self.viz['matrix_colors'] = {}
+        self.viz['matrix_colors']['pos'] = pos
+        self.viz['matrix_colors']['neg'] = neg
+
+    def set_global_cat_colors(self, df_meta):
+
+        for inst_name in df_meta.index.tolist():
+            inst_color = df_meta.loc[inst_name, 'color']
+
+            self.viz['global_cat_colors'][inst_name] = inst_color
+
+    def set_cat_color(self, axis, cat_index, cat_name, inst_color):
+
+        if axis == 0:
+            axis = 'row'
+        if axis == 1:
+            axis = 'col'
+
+        try:
+            # process cat_index
+            cat_index = cat_index - 1
+            cat_index = 'cat-' + str(cat_index)
+
+            self.viz['cat_colors'][axis][cat_index][cat_name] = inst_color
+
+        except:
+            print('there was an error setting the category color')
+
+    def dat_to_df(self):
+        '''
+        Export Pandas DataFrames (will be deprecated).
+        '''
+        return data_formats.dat_to_df(self)
+
+    def export_net_json(self, net_type='viz', indent='no-indent'):
+        '''
+        Export dat or viz JSON.
+        '''
+        return export_data.export_net_json(self, net_type, indent)
+
+    def export_viz_to_widget(self, which_viz='viz'):
+        '''
+        Export viz JSON for use with clustergrammer_widget. This method was
+        formerly named ``widget``.
+        '''
+        return export_data.export_net_json(self, which_viz, 'no-indent')
+
+    def widget(self, which_viz='viz', link_net=None, link_net_js=None, clust_library='scipy',
+               min_samples=1, min_cluster_size=2):
+        '''
+        Generate a widget visualization. The export_viz_to_widget
+        method passes the visualization JSON to the instantiated widget, which is
+        returned and visualized on the front-end.
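+
+        A minimal sketch (assumes the Network was instantiated with a widget
+        class, e.g. ``Network(CGM2)`` when using clustergrammer2)::
+
+            net.load_df(df)
+            net.widget()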
+ ''' + # run clustering if necessary + if len(self.viz['row_nodes']) == 0: + self.cluster(clust_library=clust_library, min_samples=min_samples, + min_cluster_size=min_cluster_size) + + # add manual_category to viz json + if 'manual_category' in self.dat: + self.viz['manual_category'] = self.dat['manual_category'] + + # add pre-z-score data to viz + if 'pre_zscore' in self.dat: + self.viz['pre_zscore'] = self.dat['pre_zscore'] + + self.widget_instance = self.widget_class(network = self.export_viz_to_widget(which_viz)) + + # initialize manual category + if 'manual_category' in self.dat: + manual_cat = {} + axis = 'col' + manual_cat[axis] = {} + manual_cat[axis]['col_cat_colors'] = self.viz['cat_colors'][axis]['cat-0'] + + man_cat_name = self.dat['manual_category'][axis] + if axis == 'col': + manual_cat[axis][man_cat_name] = self.meta_col[man_cat_name].to_dict() + + self.widget_instance.manual_cat = json.dumps(manual_cat) + + self.widget_instance.observe(self.get_manual_category, names='manual_cat') + + # add link (python) + if link_net is not None: + inst_link = widgets.link( + (self.widget_instance, 'manual_cat'), + (link_net.widget_instance, 'manual_cat') + ) + self.widget_instance.link = inst_link + + # add jslink (JavaScript) + if link_net_js is not None: + + inst_link = widgets.jslink( + (self.widget_instance, 'manual_cat'), + (link_net_js.widget_instance, 'manual_cat') + ) + self.widget_instance.link = inst_link + + return self.widget_instance + + + def widget_df(self): + ''' + Export a DataFrame from the front-end visualization. For instance, a user + can filter to show only a single cluster using the dendrogram and then + get a dataframe of this cluster using the widget_df method. + ''' + + if hasattr(self, 'widget_instance') == True: + + if self.widget_instance.mat_string != '': + + tmp_net = deepcopy(Network()) + + df_string = self.widget_instance.mat_string + + tmp_net.load_file_as_string(df_string) + + df = tmp_net.export_df() + + return df + + else: + return self.export_df() + + else: + if hasattr(self, 'widget_class') == True: + print('Please make the widget before exporting the widget DataFrame.') + print('Do this using the widget method: net.widget()') + + else: + print('Can not make widget because Network has no attribute widget_class') + print('Please instantiate Network with clustergrammer_widget using: Network(clustergrammer_widget)') + + def write_json_to_file(self, net_type, filename, indent='no-indent'): + ''' + Save dat or viz as a JSON to file. + ''' + export_data.write_json_to_file(self, net_type, filename, indent) + + def write_matrix_to_tsv(self, filename=None, df=None): + ''' + Export data-matrix to file. + ''' + return export_data.write_matrix_to_tsv(self, filename, df) + + def filter_sum(self, threshold, take_abs=True, axis=None, inst_rc=None): + ''' + Filter a network's rows or columns based on the sum across rows or columns. + ''' + + if axis is None: + axis = inst_rc + print('warning inst_rc argument will be deprecated, please use axis') + + if inst_rc is None: + print('please provide axis argument') + + inst_df = self.dat_to_df() + if axis == 'row': + inst_df = run_filter.df_filter_row_sum(inst_df, threshold, take_abs) + elif axis == 'col': + inst_df = run_filter.df_filter_col_sum(inst_df, threshold, take_abs) + self.df_to_dat(inst_df) + + def filter_N_top(self, N_top, rank_type='sum', inst_rc=None, axis=None): + ''' + Filter the matrix rows or columns based on sum/variance, and only keep the top + N. 
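+
+        Illustrative example (the cutoff value is arbitrary)::
+
+            # keep the 250 most variable rows
+            net.filter_N_top(250, rank_type='var', axis='row')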
+        '''
+
+        if axis is None:
+            axis = inst_rc
+            print('warning inst_rc argument will be deprecated, please use axis')
+
+        if inst_rc is None:
+            print('please provide axis argument')
+
+        inst_df = self.dat_to_df()
+
+        inst_df = run_filter.filter_N_top(axis, inst_df, N_top, rank_type)
+
+        self.df_to_dat(inst_df)
+
+    def filter_threshold(self, threshold, num_occur=1, inst_rc=None, axis=None):
+        '''
+        Filter the matrix rows or columns based on num_occur values being above a
+        threshold (in absolute value).
+        '''
+
+        if axis is None:
+            axis = inst_rc
+            print('warning inst_rc argument will be deprecated, please use axis')
+
+        if inst_rc is None:
+            print('please provide axis argument')
+
+        inst_df = self.dat_to_df()
+
+        inst_df = run_filter.filter_threshold(inst_df, axis, threshold,
+                                              num_occur)
+
+        self.df_to_dat(inst_df)
+
+    def filter_cat(self, axis, cat_index, cat_name):
+        '''
+        Filter the matrix rows or columns based on their category. cat_index is
+        the index of the category; the first category has index=1.
+        '''
+        run_filter.filter_cat(self, axis, cat_index, cat_name)
+
+    def filter_names(self, axis, names):
+        '''
+        Filter the visualization using row/column names. The function takes axis
+        ('row'/'col') and names (a list of strings).
+        '''
+        run_filter.filter_names(self, axis, names)
+
+    def clip(self, lower=None, upper=None):
+        '''
+        Trim values at input thresholds using the pandas clip function.
+        '''
+        df = self.export_df()
+        df = df.clip(lower=lower, upper=upper)
+        self.load_df(df)
+
+    def normalize(self, df=None, norm_type='zscore', axis='row', z_clip=None):
+        '''
+        Normalize the matrix rows or columns using Z-score (zscore) or Quantile
+        Normalization (qn). Users can optionally pass in a DataFrame to be
+        normalized (and this will be incorporated into the Network object).
+        '''
+        normalize_fun.run_norm(self, df, norm_type, axis, z_clip)
+
+    def downsample(self, df=None, ds_type='kmeans', axis='row', num_samples=100,
+                   random_state=1000, ds_name='Downsample',
+                   ds_cluster_name='cluster'):
+        '''
+        Downsample the matrix rows or columns (currently supporting kmeans only).
+        Users can optionally pass in a DataFrame to be downsampled (and this will
+        be incorporated into the network object).
+        '''
+        return downsample_fun.main(self, df, ds_type, axis, num_samples, random_state, ds_name, ds_cluster_name)
+
+    def random_sample(self, num_samples, df=None, replace=False, weights=None, random_state=100, axis='row'):
+        '''
+        Return a random sample of the matrix.
+        '''
+
+        if df is None:
+            df = self.export_df()
+
+        if axis == 'row':
+            axis = 0
+        if axis == 'col':
+            axis = 1
+
+        df = df.sample(n=num_samples, replace=replace, weights=weights, random_state=random_state, axis=axis)
+
+        self.load_df(df)
+
+    def add_cats(self, axis, cat_data):
+        '''
+        Add categories to rows or columns using a cat_data array of objects. Each
+        object in cat_data is a dictionary with a category title ('title') and the
+        row/column names ('cats') that belong to this category. Categories will be
+        added onto the existing categories, in the order of the objects in the array.
+
+        Example ``cat_data``::
+
+            [
+              {
+                "title": "First Category",
+                "cats": {
+                  "true": [
+                    "ROS1",
+                    "AAK1"
+                  ]
+                }
+              },
+              {
+                "title": "Second Category",
+                "cats": {
+                  "something": [
+                    "PDK4"
+                  ]
+                }
+              }
+            ]
+        '''
+        for inst_data in cat_data:
+            categories.add_cats(self, axis, inst_data)
+
+    def Iframe_web_app(self, filename=None, width=1000, height=800):
+
+        link = iframe_web_app.main(self, filename, width, height)
+
+        return link
+
+    def enrichrgram(self, lib, axis='row'):
+        '''
+        Add Enrichr gene enrichment results to your visualization (where your rows
+        are genes). Run enrichrgram before clustering to include enrichment results
+        as row categories. Enrichrgram can also be run on the front-end using the
+        Enrichr logo at the top left.
+
+        Set lib to the Enrichr library that you want to use for enrichment analysis.
+        Libraries included:
+
+          * ChEA_2016
+          * KEA_2015
+          * ENCODE_TF_ChIP-seq_2015
+          * ENCODE_Histone_Modifications_2015
+          * Disease_Perturbations_from_GEO_up
+          * Disease_Perturbations_from_GEO_down
+          * GO_Molecular_Function_2015
+          * GO_Biological_Process_2015
+          * GO_Cellular_Component_2015
+          * Reactome_2016
+          * KEGG_2016
+          * MGI_Mammalian_Phenotype_Level_4
+          * LINCS_L1000_Chem_Pert_up
+          * LINCS_L1000_Chem_Pert_down
+        '''
+
+        df = self.export_df()
+        df, bar_info = enr_fun.add_enrichr_cats(df, axis, lib)
+        self.load_df(df)
+
+        self.dat['enrichrgram_lib'] = lib
+        self.dat['row_cat_bars'] = bar_info
+
+    @staticmethod
+    def load_gmt(filename):
+        return load_data.load_gmt(filename)
+
+    @staticmethod
+    def load_json_to_dict(filename):
+        return load_data.load_json_to_dict(filename)
+
+    @staticmethod
+    def save_dict_to_json(inst_dict, filename, indent='no-indent'):
+        export_data.save_dict_to_json(inst_dict, filename, indent)
+
+    @staticmethod
+    def load_gene_exp_to_df(inst_path):
+        '''
+        Loads gene expression data from 10x in sparse matrix format and returns a
+        Pandas dataframe.
+        '''
+
+        import pandas as pd
+        from scipy import io
+        from scipy import sparse
+        from ast import literal_eval as make_tuple
+
+        # matrix
+        Matrix = io.mmread(inst_path + 'matrix.mtx')
+        mat = Matrix.todense()
+
+        # genes
+        filename = inst_path + 'genes.tsv'
+        f = open(filename, 'r')
+        lines = f.readlines()
+        f.close()
+
+        # add unique id only to duplicate genes
+        ini_genes = []
+        for inst_line in lines:
+            inst_line = inst_line.strip().split()
+            if len(inst_line) > 1:
+                inst_gene = inst_line[1]
+            else:
+                inst_gene = inst_line[0]
+            ini_genes.append(inst_gene)
+
+        gene_name_count = pd.Series(ini_genes).value_counts()
+        duplicate_genes = gene_name_count[gene_name_count > 1].index.tolist()
+
+        dup_index = {}
+        genes = []
+        for inst_row in ini_genes:
+
+            # add index to non-unique genes
+            if inst_row in duplicate_genes:
+
+                # calc non-unique index
+                if inst_row not in dup_index:
+                    dup_index[inst_row] = 1
+                else:
+                    dup_index[inst_row] = dup_index[inst_row] + 1
+
+                new_row = inst_row + '_' + str(dup_index[inst_row])
+
+            else:
+                new_row = inst_row
+
+            genes.append(new_row)
+
+        # barcodes
+        filename = inst_path + 'barcodes.tsv'
+        f = open(filename, 'r')
+        lines = f.readlines()
+        f.close()
+
+        cell_barcodes = []
+        for inst_bc in lines:
+            inst_bc = inst_bc.strip().split('\t')
+
+            # remove dash from barcodes if necessary
+            if '-' in inst_bc[0]:
+                inst_bc[0] = inst_bc[0].split('-')[0]
+
+            cell_barcodes.append(inst_bc[0])
+
+        # parse tuples if necessary
+        try:
+            cell_barcodes = [make_tuple(x) for x in cell_barcodes]
+        except:
+            pass
+
+        try:
+            genes = [make_tuple(x) for x in genes]
+        except:
+            pass
+
+        # make dataframe
+        df = pd.DataFrame(mat, index=genes, columns=cell_barcodes)
+
+        return df
+
+    @staticmethod
+    def save_gene_exp_to_mtx_dir(inst_path, df):
+
+        import os
+        from scipy import io
+        from scipy import sparse
+
+        if not os.path.exists(inst_path):
+            os.makedirs(inst_path)
+
+        genes = df.index.tolist()
+        barcodes = df.columns.tolist()
+
+        # qualify the staticmethod calls with the class name so they resolve
+        Network.save_list_to_tsv(genes, inst_path + 'genes.tsv')
+        Network.save_list_to_tsv(barcodes, inst_path + 'barcodes.tsv')
+
+        mat_ge = df.values
+        mat_ge_sparse = sparse.coo_matrix(mat_ge)
+
+        io.mmwrite(inst_path + 'matrix.mtx', mat_ge_sparse)
+
+    @staticmethod
+    def save_list_to_tsv(inst_list, filename):
+
+        f = open(filename, 'w')
+        for inst_line in inst_list:
+            f.write(str(inst_line) + '\n')
+
+        f.close()
+
+    def sim_same_and_diff_category_samples(self, df, cat_index=1, dist_type='cosine',
+                                           equal_var=False, plot_roc=True,
+                                           precalc_dist=False, calc_roc=True):
+        '''
+        Calculate the similarity of samples from the same and different categories.
+        The cat_index gives the index of the category, where 1 is the first category.
+        '''
+
+        cols = df.columns.tolist()
+
+        if type(precalc_dist) == bool:
+            # compute distance between rows (transpose to get cols as rows)
+            dist_arr = 1 - pdist(df.transpose(), metric=dist_type)
+        else:
+            dist_arr = precalc_dist
+
+        # generate sample names with categories
+        sample_combos = list(combinations(range(df.shape[1]), 2))
+
+        sample_names = [str(ind) + '_same' if cols[x[0]][cat_index] == cols[x[1]][cat_index] else str(ind) + '_different' for ind, x in enumerate(sample_combos)]
+
+        ser_dist = pd.Series(data=dist_arr, index=sample_names)
+
+        # find same-cat sample comparisons
+        same_cat = [x for x in sample_names if x.split('_')[1] == 'same']
+
+        # find diff-cat sample comparisons
+        diff_cat = [x for x in sample_names if x.split('_')[1] == 'different']
+
+        # make series of same and diff category sample comparisons
+        ser_same = ser_dist[same_cat]
+        ser_same.name = 'Same Category'
+        ser_diff = ser_dist[diff_cat]
+        ser_diff.name = 'Different Category'
+
+        sim_dict = {}
+        roc_data = {}
+        sim_data = {}
+
+        sim_dict['same'] = ser_same
+        sim_dict['diff'] = ser_diff
+
+        pval_dict = {}
+        ttest_stat, pval_dict['ttest'] = ttest_ind(ser_diff, ser_same, equal_var=equal_var)
+
+        mw_stat, pval_dict['mannwhitney'] = mannwhitneyu(ser_diff, ser_same)
+
+        if calc_roc:
+            # calc AUC
+            true_index = list(np.ones(sim_dict['same'].shape[0]))
+            false_index = list(np.zeros(sim_dict['diff'].shape[0]))
+            y_true = true_index + false_index
+
+            true_val = list(sim_dict['same'].values)
+            false_val = list(sim_dict['diff'].values)
+            y_score = true_val + false_val
+
+            fpr, tpr, thresholds = roc_curve(y_true, y_score)
+
+            inst_auc = auc(fpr, tpr)
+
+            if plot_roc:
+                plt.figure(figsize=(10, 10))
+                plt.plot(fpr, tpr)
+                plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
+
+                print('AUC', inst_auc)
+
+            roc_data['true'] = y_true
+            roc_data['score'] = y_score
+            roc_data['fpr'] = fpr
+            roc_data['tpr'] = tpr
+            roc_data['thresholds'] = thresholds
+            roc_data['auc'] = inst_auc
+
+        sim_data['sim_dict'] = sim_dict
+        sim_data['pval_dict'] = pval_dict
+        sim_data['roc_data'] = roc_data
+
+        return sim_data
+
+    def generate_signatures(self, df_data, df_meta, category_name, pval_cutoff=0.05,
+                            num_top_dims=False, verbose=True, equal_var=False):
+        '''
+        Generate signatures for column categories. A t-test is run for each
+        category in a one-vs-all manner. The num_top_dims argument overrides the
+        P-value cutoff.
+        '''
+
+        df_t = df_data.transpose()
+
+        # remove columns (dimensions) with constant values
+        orig_num_cols = df_t.shape[1]
+        df_t = df_t.loc[:, (df_t != df_t.iloc[0]).any()]
+        if df_t.shape[1] < orig_num_cols:
+            print('dropped columns with constant values')
+
+        # make tuple rows
+        df_t.index = [(x, df_meta.loc[x, category_name]) for x in df_t.index.tolist()]
+        category_level = 1
+
+        df = self.row_tuple_to_multiindex(df_t)
+
+        cell_types = sorted(list(set(df.index.get_level_values(category_level).tolist())))
+
+        keep_genes = []
+        keep_genes_dict = {}
+        gene_pval_dict = {}
+        all_fold_info = {}
+
+        for inst_ct in cell_types:
+
+            inst_ct_mat = df.xs(key=inst_ct, level=category_level)
+            inst_other_mat = df.drop(inst_ct, level=category_level)
+
+            # save mean values and fold change
+            fold_info = {}
+            fold_info['cluster_mean'] = inst_ct_mat.mean()
+            fold_info['other_mean'] = inst_other_mat.mean()
+            fold_info['log2_fold'] = fold_info['cluster_mean']/fold_info['other_mean']
+            fold_info['log2_fold'] = fold_info['log2_fold'].apply(np.log2)
+            all_fold_info[inst_ct] = fold_info
+
+            inst_stats, inst_pvals = ttest_ind(inst_ct_mat, inst_other_mat, axis=0, equal_var=equal_var)
+
+            ser_pval = pd.Series(data=inst_pvals, index=df.columns.tolist()).sort_values()
+
+            if num_top_dims == False:
+                ser_pval_keep = ser_pval[ser_pval < pval_cutoff]
+            else:
+                ser_pval_keep = ser_pval[:num_top_dims]
+
+            gene_pval_dict[inst_ct] = ser_pval_keep
+
+            inst_keep = ser_pval_keep.index.tolist()
+            keep_genes.extend(inst_keep)
+            keep_genes_dict[inst_ct] = inst_keep
+
+        keep_genes = sorted(list(set(keep_genes)))
+
+        df_gbm = df.groupby(level=category_level).mean().transpose()
+        cols = df_gbm.columns.tolist()
+
+        df_sig = df_gbm.loc[keep_genes]
+
+        if len(keep_genes) == 0 and verbose:
+            print('found no informative dimensions')
+
+        df_gene_pval = pd.concat(gene_pval_dict, axis=1, sort=False)
+
+        df_diff = {}
+        for inst_col in df_gene_pval:
+
+            inst_pvals = df_gene_pval[inst_col]
+
+            # nans represent dimensions that did not meet pval or top threshold
+            inst_pvals = inst_pvals.dropna().sort_values()
+
+            inst_genes = inst_pvals.index.tolist()
+
+            # prevent failure if no cells have this category
+            if inst_pvals.shape[0] > 0:
+                rej, pval_corr = smm.multipletests(inst_pvals, 0.05, method='fdr_bh')[:2]
+
+                ser_pval = pd.Series(data=inst_pvals, index=inst_genes, name='P-values')
+                ser_bh_pval = pd.Series(data=pval_corr, index=inst_genes, name='BH P-values')
+                ser_log2_fold = all_fold_info[inst_col]['log2_fold'].loc[inst_genes]
+                ser_log2_fold.name = 'Log2 Fold Change'
+                ser_cluster_mean = all_fold_info[inst_col]['cluster_mean'].loc[inst_genes]
+                ser_cluster_mean.name = 'Cluster Mean'
+                ser_other_mean = all_fold_info[inst_col]['other_mean'].loc[inst_genes]
+                ser_other_mean.name = 'All Other Mean'
+                inst_df = pd.concat([ser_pval, ser_bh_pval, ser_log2_fold, ser_cluster_mean, ser_other_mean], axis=1)
+
+                df_diff[inst_col] = inst_df
+
+        return df_sig, df_diff
+
+    def old_generate_signatures(self, df_ini, category_level, pval_cutoff=0.05,
+                                num_top_dims=False, verbose=True, equal_var=False):
+
+        ''' Generate signatures for column categories '''
+
+        # change this to use category name and set category level to 1 always
+        ###################################
+
+        df_t = df_ini.transpose()
+
+        # remove 
columns with constant values + df_t = df_t.loc[:, (df_t != df_t.iloc[0]).any()] + + df = self.row_tuple_to_multiindex(df_t) + + cell_types = sorted(list(set(df.index.get_level_values(category_level).tolist()))) + + keep_genes = [] + keep_genes_dict = {} + gene_pval_dict = {} + all_fold_info = {} + + for inst_ct in cell_types: + + inst_ct_mat = df.xs(key=inst_ct, level=category_level) + inst_other_mat = df.drop(inst_ct, level=category_level) + + # save mean values and fold change + fold_info = {} + fold_info['cluster_mean'] = inst_ct_mat.mean() + fold_info['other_mean'] = inst_other_mat.mean() + fold_info['log2_fold'] = fold_info['cluster_mean']/fold_info['other_mean'] + fold_info['log2_fold'] = fold_info['log2_fold'].apply(np.log2) + all_fold_info[inst_ct] = fold_info + + inst_stats, inst_pvals = ttest_ind(inst_ct_mat, inst_other_mat, axis=0, equal_var=equal_var) + + ser_pval = pd.Series(data=inst_pvals, index=df.columns.tolist()).sort_values() + + if num_top_dims == False: + ser_pval_keep = ser_pval[ser_pval < pval_cutoff] + else: + ser_pval_keep = ser_pval[:num_top_dims] + + gene_pval_dict[inst_ct] = ser_pval_keep + + inst_keep = ser_pval_keep.index.tolist() + keep_genes.extend(inst_keep) + keep_genes_dict[inst_ct] = inst_keep + + keep_genes = sorted(list(set(keep_genes))) + + df_gbm = df.groupby(level=category_level).mean().transpose() + cols = df_gbm.columns.tolist() + new_cols = [] + for inst_col in cols: + new_col = (inst_col, category_level + ': ' + inst_col) + new_cols.append(new_col) + df_gbm.columns = new_cols + + df_sig = df_gbm.loc[keep_genes] + + if len(keep_genes) == 0 and verbose: + print('found no informative dimensions') + + df_gene_pval = pd.concat(gene_pval_dict, axis=1, sort=False) + + return df_sig, df_gene_pval, all_fold_info + + def predict_cats_from_sigs(self, df_data_ini, df_meta, df_sig_ini, + predict='Predicted Category', dist_type='cosine', + unknown_thresh=-1): + ''' Predict category using signature ''' + + keep_rows = df_sig_ini.index.tolist() + data_rows = df_data_ini.index.tolist() + + common_rows = list(set(data_rows).intersection(keep_rows)) + + df_data = deepcopy(df_data_ini.loc[common_rows]) + df_sig = deepcopy(df_sig_ini.loc[common_rows]) + + # calculate sim_mat of df_data and df_sig + cell_types = df_sig.columns.tolist() + barcodes = df_data.columns.tolist() + sim_mat = 1 - pairwise_distances(df_sig.transpose(), df_data.transpose(), metric=dist_type) + df_sim = pd.DataFrame(data=sim_mat, index=cell_types, columns=barcodes).transpose() + + # get the top column value (most similar signature) + df_sim_top = df_sim.idxmax(axis=1) + + # get the maximum similarity of a cell to a cell type definition + max_sim = df_sim.max(axis=1) + + unknown_cells = max_sim[max_sim < unknown_thresh].index.tolist() + + # assign unknown cells (need category of same name) + df_sim_top[unknown_cells] = 'Unknown' + + # add predicted category name to top list + # top_list = df_sim_top.values + df_sim = df_sim.transpose() + + df_meta[predict] = df_sim_top + + return df_sim, df_meta + + + def old_predict_cats_from_sigs(self, df_data_ini, df_sig_ini, dist_type='cosine', predict_level='Predict Category', + truth_level=1, unknown_thresh=-1): + ''' Predict category using signature ''' + + keep_rows = df_sig_ini.index.tolist() + data_rows = df_data_ini.index.tolist() + + common_rows = list(set(data_rows).intersection(keep_rows)) + + df_data = deepcopy(df_data_ini.loc[common_rows]) + df_sig = deepcopy(df_sig_ini.loc[common_rows]) + + # calculate sim_mat of df_data and df_sig + cell_types 
= df_sig.columns.tolist()
+        barcodes = df_data.columns.tolist()
+        sim_mat = 1 - pairwise_distances(df_sig.transpose(), df_data.transpose(), metric=dist_type)
+        df_sim = pd.DataFrame(data=sim_mat, index=cell_types, columns=barcodes).transpose()
+
+        # get the top column value (most similar signature)
+        df_sim_top = df_sim.idxmax(axis=1)
+
+        # get the maximum similarity of a cell to a cell type definition
+        max_sim = df_sim.max(axis=1)
+
+        unknown_cells = max_sim[max_sim < unknown_thresh].index.tolist()
+
+        # assign unknown cells (need category of same name)
+        df_sim_top[unknown_cells] = 'Unknown'
+
+        # add predicted category name to top list
+        top_list = df_sim_top.values
+        top_list = [predict_level + ': ' + x[0] if type(x) is tuple else predict_level + ': ' + x for x in top_list]
+
+        # add cell type category to input data
+        df_cat = deepcopy(df_data)
+        cols = df_cat.columns.tolist()
+        new_cols = []
+
+        # check whether the columns have the true category available
+        has_truth = False
+        if type(cols[0]) is tuple:
+            has_truth = True
+
+        if has_truth:
+            new_cols = [tuple(list(a) + [b]) for a, b in zip(cols, top_list)]
+        else:
+            new_cols = [tuple([a] + [b]) for a, b in zip(cols, top_list)]
+
+        # transfer new categories
+        df_cat.columns = new_cols
+
+        # keep track of true and predicted labels
+        y_info = {}
+        y_info['true'] = []
+        y_info['pred'] = []
+
+        if has_truth:
+            y_info['true'] = [x[truth_level].split(': ')[1] for x in cols]
+            y_info['pred'] = [x.split(': ')[1] for x in top_list]
+
+        return df_cat, df_sim.transpose(), y_info
+
+    def assess_prediction(self, df_meta, truth, pred):
+        ''' Generate confusion matrix from y_info '''
+
+        y_info = {}
+        y_info['true'] = df_meta[truth].values.tolist()
+        y_info['pred'] = df_meta[pred].values.tolist()
+
+        sorted_cats = sorted(list(set(y_info['true'] + y_info['pred'])))
+        conf_mat = confusion_matrix(y_info['true'], y_info['pred'], labels=sorted_cats)
+
+        # true columns and pred rows
+        df_conf = pd.DataFrame(conf_mat, index=sorted_cats, columns=sorted_cats).transpose()
+
+        total_correct = np.trace(df_conf)
+        total_pred = df_conf.sum().sum()
+        fraction_correct = total_correct/float(total_pred)
+
+        # calculate ser_correct
+        correct_list = []
+        cat_counts = df_conf.sum(axis=0)
+        all_cols = df_conf.columns.tolist()
+        for inst_cat in all_cols:
+            inst_correct = df_conf.loc[inst_cat, inst_cat] / cat_counts[inst_cat]
+            correct_list.append(inst_correct)
+
+        ser_correct = pd.Series(data=correct_list, index=all_cols)
+
+        return df_conf, ser_correct, fraction_correct
+
+    def old_confusion_matrix_and_correct_series(self, y_info):
+        ''' Generate confusion matrix from y_info '''
+
+        a = deepcopy(y_info['true'])
+        true_count = dict((i, a.count(i)) for i in set(a))
+
+        a = deepcopy(y_info['pred'])
+        pred_count = dict((i, a.count(i)) for i in set(a))
+
+        sorted_cats = sorted(list(set(y_info['true'] + y_info['pred'])))
+        # labels is keyword-only in current scikit-learn
+        conf_mat = confusion_matrix(y_info['true'], y_info['pred'], labels=sorted_cats)
+        df_conf = pd.DataFrame(conf_mat, index=sorted_cats, columns=sorted_cats)
+
+        total_correct = np.trace(df_conf)
+        total_pred = df_conf.sum().sum()
+        fraction_correct = total_correct/float(total_pred)
+
+        # calculate ser_correct
+        correct_list = []
+        cat_counts = df_conf.sum(axis=1)
+        all_cols = df_conf.columns.tolist()
+        for inst_cat in all_cols:
+            inst_correct = df_conf[inst_cat].loc[inst_cat] / cat_counts[inst_cat]
+            correct_list.append(inst_correct)
+
+        ser_correct = pd.Series(data=correct_list, index=all_cols)
+
+        populations = {}
+        populations['true'] = true_count
+        populations['pred'] = pred_count
+
+        return df_conf, populations, ser_correct, fraction_correct
+
+    def compare_performance_to_shuffled_labels(self, df_data, category_level, num_shuffles=100,
+                                               random_seed=99, pval_cutoff=0.05, dist_type='cosine',
+                                               num_top_dims=False, predict_level='Predict Category',
+                                               truth_level=1, unknown_thresh=-1, equal_var=False,
+                                               performance_type='prediction'):
+        random.seed(random_seed)
+
+        perform_list = []
+
+        # pre-calculate the distance matrix (similarity matrix) if necessary
+        if performance_type == 'cat_sim_auc':
+            dist_arr = 1 - pdist(df_data.transpose(), metric=dist_type)
+
+        for inst_run in range(num_shuffles + 1):
+
+            cols = df_data.columns.tolist()
+            rows = df_data.index.tolist()
+            mat = df_data.values
+
+            shuffled_cols = deepcopy(cols)
+            random.shuffle(shuffled_cols)
+
+            # do not perform shuffling the first time to confirm that we get the same
+            # results as the unshuffled dataset
+            if inst_run == 0:
+                df_shuffle = deepcopy(df_data)
+            else:
+                df_shuffle = pd.DataFrame(data=mat, columns=shuffled_cols, index=rows)
+
+            # generate signature on shuffled data
+            df_sig, df_gene_pval, all_fold_info = self.old_generate_signatures(df_shuffle,
+                                                      category_level,
+                                                      pval_cutoff=pval_cutoff,
+                                                      num_top_dims=num_top_dims,
+                                                      equal_var=equal_var)
+
+            # predictive performance
+            if performance_type == 'prediction':
+
+                # predict categories from signature
+                df_pred_cat, df_sig_sim, y_info = self.old_predict_cats_from_sigs(df_shuffle, df_sig,
+                    dist_type=dist_type, predict_level=predict_level, truth_level=truth_level,
+                    unknown_thresh=unknown_thresh)
+
+                # calc confusion matrix and performance
+                df_conf, populations, ser_correct, fraction_correct = self.old_confusion_matrix_and_correct_series(y_info)
+
+                # store performances of shuffles
+                if inst_run > 0:
+                    perform_list.append(fraction_correct)
+                else:
+                    real_performance = fraction_correct
+                    print('performance (fraction correct) of unshuffled: ' + str(fraction_correct))
+
+            elif performance_type == 'cat_sim_auc':
+
+                # measure similarity of samples with the same category
+                sim_data = self.sim_same_and_diff_category_samples(df_shuffle,
+                    cat_index=1, plot_roc=False, equal_var=equal_var, precalc_dist=dist_arr)
+                roc_data = sim_data['roc_data']
+
+                # store performances of shuffles
+                if inst_run > 0:
+                    perform_list.append(roc_data['auc'])
+                else:
+                    real_performance = roc_data['auc']
+                    print('performance (category similarity auc) of unshuffled: ' + str(roc_data['auc']))
+
+        perform_ser = pd.Series(perform_list)
+
+        in_top_fraction = perform_ser[perform_ser > real_performance].shape[0]/num_shuffles
+        print('real data performs in the top ' + str(in_top_fraction*100) + '% of shuffled labels\n')
+
+        return perform_ser
+
+    @staticmethod
+    def box_scatter_plot(df, group, columns=False, rand_seed=100, alpha=0.5,
+                         dot_color='red', num_row=None, num_col=1, figsize=(10, 10),
+                         start_title='Variable Measurements Across', end_title='Groups',
+                         group_list=False):
+
+        from scipy import stats
+        import pandas as pd
+
+        import matplotlib.pyplot as plt
+        # %matplotlib inline
+
+        if columns == False:
+            columns = df.columns.tolist()
+
+        plt.figure(figsize=figsize)
+        figure_title = start_title + ' ' + group + ' ' + end_title
+        plt.suptitle(figure_title, fontsize=20)
+
+        # list of arranged dataframes
+        dfs = {}
+
+        for col_num in range(len(columns)):
+            column = columns[col_num]
+            plot_id = col_num + 1
+
+            # group by column name or multiIndex name
+            if group in df.columns.tolist():
+                grouped = df.groupby(group)
+            else:
+                grouped = df.groupby(level=group)
+
+ names, vals, xs = [], [] ,[] + + if type(column) is tuple: + column_title = column[0] + else: + column_title = column + + for i, (name, subdf) in enumerate(grouped): + + names.append(name) + + inst_ser = subdf[column] + + column_name = column_title + '-' + str(name) + + inst_ser.name = column_name + vals.append(inst_ser) + + np.random.seed(rand_seed) + xs.append(np.random.normal(i+1, 0.04, subdf.shape[0])) + + ax = plt.subplot(num_row, num_col, plot_id) + + plt.boxplot(vals, labels=names) + + ngroup = len(vals) + + clevels = np.linspace(0., 1., ngroup) + + for x, val, clevel in zip(xs, vals, clevels): + + plt.subplot(num_row, num_col, plot_id) + plt.scatter(x, val, c=dot_color, alpha=alpha) + + + df_arranged = pd.concat(vals, axis=1) + + # anova + anova_data = [df_arranged[col].dropna() for col in df_arranged] + f_val, pval = stats.f_oneway(*anova_data) + + if pval < 0.01: + ax.set_title(column_title + ' P-val: ' + '{:.2e}'.format(pval)) + else: + pval = round(pval * 100000)/100000 + ax.set_title(column_title + ' P-val: ' + str(pval)) + + dfs[column] = df_arranged + + return dfs + + @staticmethod + def rank_cols_by_anova_pval(df, group, columns=False, rand_seed=100, alpha=0.5, dot_color='red', num_row=None, num_col=1, + figsize=(10,10)): + + from scipy import stats + import numpy as np + import pandas as pd + + # import matplotlib.pyplot as plt + # %matplotlib inline + + if columns == False: + columns = df.columns.tolist() + + # plt.figure(figsize=figsize) + + # list of arranged dataframes + dfs = {} + + pval_list = [] + + for col_num in range(len(columns)): + column = columns[col_num] + plot_id = col_num + 1 + + # group by column name or multiIndex name + if group in df.columns.tolist(): + grouped = df.groupby(group) + else: + grouped = df.groupby(level=group) + + names, vals, xs = [], [] ,[] + + if type(column) is tuple: + column_title = column[0] + else: + column_title = column + + for i, (name, subdf) in enumerate(grouped): + names.append(name) + + inst_ser = subdf[column] + + column_name = column_title + '-' + str(name) + + inst_ser.name = column_name + vals.append(inst_ser) + + np.random.seed(rand_seed) + xs.append(np.random.normal(i+1, 0.04, subdf.shape[0])) + + + ngroup = len(vals) + + df_arranged = pd.concat(vals, axis=1) + + # anova + anova_data = [df_arranged[col].dropna() for col in df_arranged] + f_val, pval = stats.f_oneway(*anova_data) + + pval_list.append(pval) + + pval_ser = pd.Series(data=pval_list, index=columns) + pval_ser = pval_ser.sort_values(ascending=True) + + return pval_ser + + + def row_tuple_to_multiindex(self, df): + + import pandas as pd + + from copy import deepcopy + df_mi = deepcopy(df) + rows = df_mi.index.tolist() + titles = [] + for inst_part in rows[0]: + + if ': ' in inst_part: + inst_title = inst_part.split(': ')[0] + else: + inst_title = 'Name' + titles.append(inst_title) + + new_rows = [] + for inst_row in rows: + inst_row = list(inst_row) + new_row = [] + for inst_part in inst_row: + if ': ' in inst_part: + inst_part = inst_part.split(': ')[1] + new_row.append(inst_part) + new_row = tuple(new_row) + new_rows.append(new_row) + + df_mi.index = new_rows + + df_mi.index = pd.MultiIndex.from_tuples(df_mi.index, names=titles) + + return df_mi + + def set_cat_colors(self, cat_colors, axis, cat_index, cat_title=False): + for inst_ct in cat_colors: + if cat_title != False: + cat_name = cat_title + ': ' + inst_ct + else: + cat_name = inst_ct + + inst_color = cat_colors[inst_ct] + self.set_cat_color(axis=axis, cat_index=cat_index, cat_name=cat_name, 
inst_color=inst_color) + + def set_manual_category(self, col=None, row=None, preferred_cats=None): + ''' + This method is used to tell Clustergrammer2 that the user wants to define + a manual category interactively using the dendrogram. + ''' + + self.dat['manual_category'] = {} + self.dat['manual_category']['col'] = col + self.dat['manual_category']['row'] = row + + if preferred_cats is not None: + pref_cats = [] + for inst_row in preferred_cats.index.tolist(): + inst_dict = {} + inst_dict['name'] = inst_row + inst_dict['color'] = preferred_cats.loc[inst_row, 'color'] + pref_cats.append(inst_dict) + + if col is not None: + self.dat['manual_category']['col_cats'] = pref_cats + + if row is not None: + self.dat['manual_category']['row_cats'] = pref_cats + + def ds_to_original_meta(self, axis): + clusters = self.meta_ds_col.index.tolist() + + man_cat_title = self.dat['manual_category'][axis] + + for inst_cluster in clusters: + + # get the manual category from the downsampled data + inst_man_cat = self.meta_ds_col.loc[inst_cluster, man_cat_title] + + # find original labels that are assigned to this cluster + found_labels = self.meta_col[self.meta_col[self.ds_name] == inst_cluster].index.tolist() + + # update manual category + self.meta_col.loc[found_labels, man_cat_title] = inst_man_cat + + def get_manual_category(self, tmp): + + + for axis in ['col']: + + try: + ########################################################### + + export_dict = {} + + inst_nodes = self.dat['nodes'][axis] + # print('get get_manual_category', len(inst_nodes)) + + cat_title = self.dat['manual_category'][axis] + + # Category Names + try: + ########################################################### + + export_dict[cat_title] = pd.Series(json.loads(self.widget_instance.manual_cat)[axis][cat_title]) + + if hasattr(self, 'meta_cat') == True: + + if axis == 'row': + if self.is_downsampled == False: + self.meta_row.loc[inst_nodes, cat_title] = export_dict[cat_title] + else: + self.meta_ds_row.loc[inst_nodes, cat_title] = export_dict[cat_title] + self.ds_to_original_meta(axis) + + elif axis == 'col': + if self.is_downsampled == False: + # print('> > > > > > > > > > > ') + # print(export_dict[cat_title]) + + # print('.. .. .. .. .. .. .. .. .. .. .. .. 
') + # print(self.meta_col) + + # # for some reason this is breaking when trying to sync metadata + # # switching to slower row based syncing + # self.meta_col.loc[inst_nodes, cat_title] = export_dict[cat_title] + + # manually looping through rows in metadata works + for inst_row in export_dict[cat_title].index.tolist(): + self.meta_col.loc[inst_row, cat_title] = export_dict[cat_title][inst_row] + + # print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>') + # print(self.meta_col) + + else: + self.meta_ds_col.loc[inst_nodes, cat_title] = export_dict[cat_title] + self.ds_to_original_meta(axis) + + + except: + print('unable to load', axis ,' category, please check title') + + # Category Colors + ####################### + export_dict['cat_colors'] = json.loads(self.widget_instance.manual_cat)['global_cat_colors'] + + # # drop title from category colors + # export_dict['cat_colors'] = {} + # for inst_cat in ini_new_colors: + # if (': ' in inst_cat): + # export_dict['cat_colors'][inst_cat.split(': ')[1]] = ini_new_colors[inst_cat] + # else: + # export_dict['cat_colors'][inst_cat] = ini_new_colors[inst_cat] + + # if hasattr(self, 'meta_cat') == False: + # return export_dict + + except: + # print('failed to parse manual_category') + pass + + + @staticmethod + def umi_norm(df): + # umi norm + barcode_umi_sum = df.sum() + df_umi = df.div(barcode_umi_sum) + return df_umi + + @staticmethod + def make_df_from_cols(cols): + inst_col = cols[0] + + cat_titles = [] + for inst_info in inst_col[1:]: + inst_title = inst_info.split(': ')[0] + cat_titles.append(inst_title) + + clean_cols = [] + for inst_col in cols: + inst_clean = [] + for inst_info in inst_col: + if ': ' in inst_info: + inst_clean.append(inst_info.split(': ')[1]) + else: + inst_clean.append(inst_info) + clean_cols.append(tuple(inst_clean)) + + df_ini = pd.DataFrame(data=clean_cols).set_index(0) + mat = df_ini.values + rows = df_ini.index.tolist() + + df_meta = pd.DataFrame(data=mat, index=rows, columns=cat_titles) + + return df_meta diff --git a/src/celldega/clust/calc_clust.py b/src/celldega/clust/calc_clust.py new file mode 100644 index 00000000..395b5fa0 --- /dev/null +++ b/src/celldega/clust/calc_clust.py @@ -0,0 +1,183 @@ +def cluster_row_and_col(net, dist_type='cosine', linkage_type='average', + dendro=True, run_clustering=True, run_rank=True, + ignore_cat=False, calc_cat_pval=False, links=False, + clust_library='scipy', min_samples=1, min_cluster_size=2): + ''' cluster net.dat and make visualization json, net.viz. + optionally leave out dendrogram colorbar groups with dendro argument ''' + + # import umap + import scipy + from copy import deepcopy + from scipy.spatial.distance import pdist + from . 
import categories, make_viz, cat_pval
+
+    dm = {}
+    for axis in ['row', 'col']:
+
+        # save directly to dat structure
+        node_info = net.dat['node_info'][axis]
+
+        node_info['ini'] = list(range(len(net.dat['nodes'][axis]), -1, -1))
+
+        tmp_mat = deepcopy(net.dat['mat'])
+
+        # calc distance matrix (hdbscan works from the data directly)
+        if clust_library != 'hdbscan':
+            dm[axis] = calc_distance_matrix(tmp_mat, axis, dist_type)
+        else:
+            dm[axis] = None
+
+        # cluster
+        if run_clustering is True:
+            node_info['clust'], node_info['Y'] = clust_and_group(net,
+                                                                 dm[axis],
+                                                                 axis,
+                                                                 tmp_mat,
+                                                                 dist_type=dist_type,
+                                                                 linkage_type=linkage_type,
+                                                                 clust_library=clust_library,
+                                                                 min_samples=min_samples,
+                                                                 min_cluster_size=min_cluster_size)
+        else:
+            dendro = False
+            node_info['clust'] = node_info['ini']
+
+        # sorting
+        if run_rank is True:
+            node_info['rank'] = sort_rank_nodes(net, axis, 'sum')
+            node_info['rankvar'] = sort_rank_nodes(net, axis, 'var')
+        else:
+            node_info['rank'] = node_info['ini']
+            node_info['rankvar'] = node_info['ini']
+
+        ##################################
+        if ignore_cat is False:
+            categories.calc_cat_clust_order(net, axis)
+
+    if calc_cat_pval is True:
+        cat_pval.main(net)
+
+    # make the visualization json
+    make_viz.viz_json(net, dendro, links)
+
+    return dm
+
+def calc_distance_matrix(tmp_mat, axis, dist_type='cosine'):
+    from scipy.spatial.distance import pdist
+    import numpy as np
+
+    if axis == 'row':
+        inst_dm = pdist(tmp_mat, metric=dist_type)
+    elif axis == 'col':
+        inst_dm = pdist(tmp_mat.transpose(), metric=dist_type)
+
+    inst_dm[inst_dm < 0] = float(0)
+
+    return inst_dm
+
+def clust_and_group(net, inst_dm, axis, mat, dist_type='cosine', linkage_type='average',
+                    clust_library='scipy', min_samples=1, min_cluster_size=2):
+
+    import scipy.cluster.hierarchy as hier
+    import pandas as pd
+
+    if clust_library == 'scipy':
+        Y = hier.linkage(inst_dm, method=linkage_type)
+
+    elif clust_library == 'fastcluster':
+        import fastcluster
+        Y = fastcluster.linkage(inst_dm, method=linkage_type)
+
+    elif clust_library == 'hdbscan':
+        import hdbscan
+
+        # pca-umap-hdbscan using data (no pre-calc distance matrix)
+        ######################################################
+        from sklearn.decomposition import PCA
+        clusterer = hdbscan.HDBSCAN(min_samples=min_samples,
+                                    min_cluster_size=min_cluster_size)
+
+        # rows are the data points, cols are dimensions
+        n_components = 50
+        if axis == 'row':
+            if mat.shape[1] > n_components:
+                low_d_mat = PCA(n_components=n_components).fit_transform(mat)
+            else:
+                low_d_mat = mat
+
+        elif axis == 'col':
+            if mat.shape[0] > n_components:
+                low_d_mat = PCA(n_components=n_components).fit_transform(mat.transpose())
+            else:
+                low_d_mat = mat.transpose()
+
+        # run UMAP on low_d_mat (after PCA)
+        import umap
+        umap_mat = umap.UMAP(
+            metric=dist_type,
+            n_neighbors=5,
+            min_dist=0.0,
+            n_components=2,
+            random_state=42,
+        ).fit_transform(low_d_mat)
+
+        umap_df = pd.DataFrame(umap_mat.transpose(),
+                               index=['x', 'y'],
+                               columns=net.dat['nodes'][axis])
+
+        net.umap[axis] = umap_df
+        clusterer.fit(umap_mat)
+
+        Y = clusterer.single_linkage_tree_.to_numpy()
+
+    Z = hier.dendrogram(Y, no_plot=True)
+    inst_clust_order = Z['leaves']
+
+    return inst_clust_order, Y
+
+def sort_rank_nodes(net, rowcol, rank_type):
+    import numpy as np
+    from operator import itemgetter
+    from copy import deepcopy
+
+    tmp_nodes = deepcopy(net.dat['nodes'][rowcol])
+    inst_mat = deepcopy(net.dat['mat'])
+
+    sum_term = []
+    for i in range(len(tmp_nodes)):
+        inst_dict = {}
+        inst_dict['name'] = tmp_nodes[i]
+
+        if rowcol == 'row':
+            if rank_type == 'sum':
+                inst_dict['rank'] = np.sum(inst_mat[i, :])
+            elif rank_type == 'var':
+                inst_dict['rank'] = np.var(inst_mat[i, :])
+        else:
+            if rank_type == 'sum':
+                inst_dict['rank'] = np.sum(inst_mat[:, i])
+            elif rank_type == 'var':
+                inst_dict['rank'] = np.var(inst_mat[:, i])
+
+        sum_term.append(inst_dict)
+
+    sum_term = sorted(sum_term, key=itemgetter('rank'), reverse=False)
+
+    tmp_sort_nodes = []
+    for inst_dict in sum_term:
+        tmp_sort_nodes.append(inst_dict['name'])
+
+    sort_index = []
+    for inst_node in tmp_nodes:
+        sort_index.append(tmp_sort_nodes.index(inst_node))
+
+    return sort_index
diff --git a/src/celldega/clust/cat_pval.py b/src/celldega/clust/cat_pval.py
new file mode 100644
index 00000000..7653ef8c
--- /dev/null
+++ b/src/celldega/clust/cat_pval.py
@@ -0,0 +1,98 @@
+import numpy as np
+import pandas as pd
+from copy import deepcopy
+
+def main(net):
+    '''
+    Calculate the p-value of category closeness.
+    '''
+    # calculate the distance between the data points within the same category and
+    # compare to null distribution
+    for inst_rc in ['row', 'col']:
+
+        inst_nodes = deepcopy(net.dat['nodes'][inst_rc])
+
+        inst_index = deepcopy(net.dat['node_info'][inst_rc]['clust'])
+
+        # reorder based on clustered order
+        inst_nodes = [inst_nodes[i] for i in inst_index]
+
+        # make distance matrix dataframe
+        dm = dist_matrix_lattice(inst_nodes)
+
+        node_infos = list(net.dat['node_info'][inst_rc].keys())
+
+        all_cats = []
+        for inst_info in node_infos:
+            if 'dict_cat_' in inst_info:
+                all_cats.append(inst_info)
+
+        for cat_dict in all_cats:
+
+            tmp_dict = net.dat['node_info'][inst_rc][cat_dict]
+
+            pval_name = cat_dict.replace('dict_', 'pval_')
+            net.dat['node_info'][inst_rc][pval_name] = {}
+
+            for cat_name in tmp_dict:
+
+                subset = tmp_dict[cat_name]
+
+                inst_median = calc_median_dist_subset(dm, subset)
+
+                hist = calc_hist_distances(dm, subset, inst_nodes)
+
+                pval = 0
+
+                for i in range(len(hist['prob'])):
+                    if i == 0:
+                        pval = hist['prob'][i]
+                    if i >= 1:
+                        if inst_median >= hist['bins'][i]:
+                            pval = pval + hist['prob'][i]
+
+                net.dat['node_info'][inst_rc][pval_name][cat_name] = pval
+
+def dist_matrix_lattice(names):
+    from scipy.spatial.distance import pdist, squareform
+
+    lattice_size = len(names)
+    mat = np.zeros([lattice_size, 1])
+    mat[:, 0] = list(range(lattice_size))
+
+    inst_dm = pdist(mat, metric='euclidean')
+
+    inst_dm[inst_dm < 0] = float(0)
+
+    inst_dm = squareform(inst_dm)
+
+    df = pd.DataFrame(data=inst_dm, columns=names, index=names)
+
+    return df
+
+def calc_median_dist_subset(dm, subset):
+    return np.median(dm[subset].loc[subset].values)
+
+def calc_hist_distances(dm, subset, inst_nodes):
+    np.random.seed(100)
+
+    num_null = 1000
+    num_points = len(subset)
+
+    median_dist = []
+    for i in range(num_null):
+        tmp = np.random.choice(inst_nodes, num_points, replace=False)
+        median_dist.append(np.median(dm[tmp].loc[tmp].values))
+
+    tmp_dist = sorted(deepcopy(median_dist))
+
+    median_dist = np.asarray(median_dist)
+    s1 = pd.Series(median_dist)
+    hist = np.histogram(s1, bins=30)
+
+    H = {}
+    # np.float was removed in NumPy 1.24+; use the builtin float instead
+    H['prob'] = hist[0]/float(num_null)
+    H['bins'] = hist[1]
+
+    return H
\ No newline at end of file
diff --git a/src/celldega/clust/categories.py b/src/celldega/clust/categories.py
new file mode 100644
index 00000000..2b1fcf0c
--- /dev/null
+++ 
b/src/celldega/clust/categories.py @@ -0,0 +1,346 @@ +def check_categories(lines): + ''' + find out how many row and col categories are available + when loading data from file + ''' + # count the number of row categories + rcat_line = lines[0].split('\t') + + # calc the number of row names and categories + num_rc = 0 + found_end = False + + # skip first tab + for inst_string in rcat_line[1:]: + if inst_string == '': + if found_end is False: + num_rc = num_rc + 1 + else: + found_end = True + + max_rcat = 15 + if max_rcat > len(lines): + max_rcat = len(lines) - 1 + + num_cc = 0 + for i in range(max_rcat): + ccat_line = lines[i + 1].split('\t') + + # make sure that line has length greater than one to prevent false cats from + # trailing new lines at end of matrix + if ccat_line[0] == '' and len(ccat_line) > 1: + num_cc = num_cc + 1 + + num_labels = {} + num_labels['row'] = num_rc + 1 + num_labels['col'] = num_cc + 1 + + return num_labels + +def dict_cat(net, define_cat_colors=False): + ''' + make a dictionary of node-category associations + ''' + net.persistent_cat_colors = True + + for inst_rc in ['row', 'col']: + inst_keys = list(net.dat['node_info'][inst_rc].keys()) + all_cats = [x for x in inst_keys if 'cat-' in x] + + for inst_name_cat in all_cats: + + dict_cat = {} + tmp_cats = net.dat['node_info'][inst_rc][inst_name_cat] + tmp_nodes = net.dat['nodes'][inst_rc] + + for i in range(len(tmp_cats)): + inst_cat = tmp_cats[i] + inst_node = tmp_nodes[i] + + if inst_cat not in dict_cat: + dict_cat[inst_cat] = [] + + dict_cat[inst_cat].append(inst_node) + + tmp_name = 'dict_' + inst_name_cat.replace('-', '_') + net.dat['node_info'][inst_rc][tmp_name] = dict_cat + + # merge with old cat_colors by default + cat_colors = net.viz['cat_colors'] + + if define_cat_colors == True: + cat_number = 0 + + for inst_rc in ['row', 'col']: + + inst_keys = list(net.dat['node_info'][inst_rc].keys()) + all_cats = [x for x in inst_keys if 'cat-' in x] + + for cat_index in all_cats: + + if cat_index not in cat_colors[inst_rc]: + cat_colors[inst_rc][cat_index] = {} + + cat_names = sorted(list(set(net.dat['node_info'][inst_rc][cat_index]))) + + # loop through each category name and assign a color + for tmp_name in cat_names: + + # using the same rules as the front-end to define cat_colors + inst_color = get_cat_color(cat_number + cat_names.index(tmp_name)) + + check_name = tmp_name + + # check if category is string type and non-numeric + try: + float(check_name) + is_string_cat = False + except: + is_string_cat = True + + if is_string_cat == True: + # check for default non-color + if ': ' in check_name: + check_name = check_name.split(': ')[1] + + # if check_name == 'False' or check_name == 'false': + if 'False' in check_name or 'false' in check_name: + inst_color = '#eee' + + if 'Not ' in check_name: + inst_color = '#eee' + + # do not overwrite old colors + if tmp_name not in cat_colors[inst_rc][cat_index] and is_string_cat: + + cat_colors[inst_rc][cat_index][tmp_name] = inst_color + # print('overwrite: ' + tmp_name + ' -> ' + str(inst_color)) + + cat_number = cat_number + 1 + + net.viz['cat_colors'] = cat_colors + + net.viz['global_cat_colors'] = {} + + global_cat_colors = {} + for axis in net.viz['cat_colors']: + for cat_index in net.viz['cat_colors'][axis]: + + inst_dict = net.viz['cat_colors'][axis][cat_index] + + for inst_full_name in inst_dict: + + inst_name = inst_full_name.split(': ')[1] + inst_color = inst_dict[inst_full_name] + + global_cat_colors[inst_name] = inst_color + + net.viz['global_cat_colors'] = 
global_cat_colors + + +def calc_cat_clust_order(net, inst_rc): + ''' + cluster category subset of data + ''' + from .__init__ import Network + from copy import deepcopy + from . import calc_clust, run_filter + + inst_keys = list(net.dat['node_info'][inst_rc].keys()) + all_cats = [x for x in inst_keys if 'cat-' in x] + + if len(all_cats) > 0: + + for inst_name_cat in all_cats: + + tmp_name = 'dict_' + inst_name_cat.replace('-', '_') + dict_cat = net.dat['node_info'][inst_rc][tmp_name] + + unordered_cats = dict_cat.keys() + + ordered_cats = order_categories(unordered_cats) + + # this is the ordering of the columns based on their category, not + # including their clustering ordering within category + all_cat_orders = [] + tmp_names_list = [] + for inst_cat in ordered_cats: + + inst_nodes = dict_cat[inst_cat] + + tmp_names_list.extend(inst_nodes) + + names_clust_list = tmp_names_list + + # calc category-cluster order + final_order = [] + + for i in range(len(net.dat['nodes'][inst_rc])): + + inst_node_name = net.dat['nodes'][inst_rc][i] + inst_node_num = names_clust_list.index(inst_node_name) + + final_order.append(inst_node_num) + + inst_index_cat = inst_name_cat.replace('-', '_') + '_index' + + net.dat['node_info'][inst_rc][inst_index_cat] = final_order + + +def order_categories(unordered_cats): + ''' + If categories are strings, then simple ordering is fine. + If categories are values then I'll need to order based on their values. + The final ordering is given as the original categories (including titles) in a + ordered list. + ''' + + no_titles = remove_titles(unordered_cats) + + all_are_numbers = check_all_numbers(no_titles) + + if all_are_numbers: + ordered_cats = order_cats_based_on_values(unordered_cats, no_titles) + else: + ordered_cats = sorted(unordered_cats) + + return ordered_cats + + +def order_cats_based_on_values(unordered_cats, values_list): + import pandas as pd + + try: + # convert values_list to values + values_list = [float(i) for i in values_list] + + inst_series = pd.Series(data=values_list, index=unordered_cats) + + inst_series.sort_values(inplace=True) + + ordered_cats = inst_series.index.tolist() + + # ordered_cats = unordered_cats + except: + # keep default ordering if error occurs + print('error sorting cats based on values ') + ordered_cats = unordered_cats + + return ordered_cats + +def check_all_numbers(no_titles): + all_numbers = True + for tmp in no_titles: + if is_number(tmp) == False: + all_numbers = False + + return all_numbers + +def remove_titles(cats): + from copy import deepcopy + + # check if all have titles + ########################### + all_have_titles = True + + for inst_cat in cats: + if is_number(inst_cat) == False: + if ': ' not in inst_cat: + all_have_titles = False + else: + all_have_titles = False + + if all_have_titles: + no_titles = cats + no_titles = [i.split(': ')[1] for i in no_titles] + + else: + no_titles = cats + + return no_titles + +def is_number(s): + try: + float(s) + return True + except ValueError: + return False + +def get_cat_color(cat_num): + + all_colors = [ "#393b79", "#aec7e8", "#ff7f0e", "#ffbb78", "#98df8a", "#bcbd22", + "#404040", "#ff9896", "#c5b0d5", "#8c564b", "#1f77b4", "#5254a3", "#FFDB58", + "#c49c94", "#e377c2", "#7f7f7f", "#2ca02c", "#9467bd", "#dbdb8d", "#17becf", + "#637939", "#6b6ecf", "#9c9ede", "#d62728", "#8ca252", "#8c6d31", "#bd9e39", + "#e7cb94", "#843c39", "#ad494a", "#d6616b", "#7b4173", "#a55194", "#ce6dbd", + "#de9ed6"]; + + inst_color = all_colors[cat_num % len(all_colors)] + + return inst_color + 
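+# Illustrative note (not part of the original module): get_cat_color cycles
+# through the 35-color palette above, so e.g. get_cat_color(0) and
+# get_cat_color(35) both return '#393b79'.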
+def add_cats(net, axis, cat_data): + + try: + df = net.export_df() + + if axis == 'row': + labels = df.index.tolist() + elif axis == 'col': + labels = df.columns.tolist() + + if 'title' in cat_data: + inst_title = cat_data['title'] + else: + inst_title = 'New Category' + + all_cats = cat_data['cats'] + + # loop through all labels + new_labels = [] + for inst_label in labels: + + if type(inst_label) is tuple: + check_name = inst_label[0] + found_tuple = True + else: + check_name = inst_label + found_tuple = False + + if ': ' in check_name: + check_name = check_name.split(': ')[1] + + # default to False for found cat, overwrite if necessary + found_cat = inst_title + ': False' + + # check all categories in cats + for inst_cat in all_cats: + + inst_names = all_cats[inst_cat] + + if check_name in inst_names: + found_cat = inst_title + ': ' + inst_cat + + # add category to label + if found_tuple is True: + new_label = inst_label + (found_cat,) + else: + new_label = (inst_label, found_cat) + + new_labels.append(new_label) + + + # add labels back to DataFrame + if axis == 'row': + df.index = new_labels + elif axis == 'col': + df.columns = new_labels + + net.load_df(df) + + except: + print('error adding new categories') + + + + diff --git a/src/celldega/clust/data_formats.py b/src/celldega/clust/data_formats.py new file mode 100644 index 00000000..24e37f9b --- /dev/null +++ b/src/celldega/clust/data_formats.py @@ -0,0 +1,134 @@ +from . import make_unique_labels +import pandas as pd +from . import categories + +def df_to_dat(net, df, define_cat_colors=False): + ''' + This is always run when data is loaded. + ''' + + # check if df has unique values + df = make_unique_labels.main(net, df) + + net.dat['mat'] = df.values + net.dat['nodes']['row'] = df.index.tolist() + net.dat['nodes']['col'] = df.columns.tolist() + + # print('checking ds status') + # print('is_downsampled', net.is_downsampled) + # print('meta_cat', net.meta_cat) + # print(hasattr(net, 'meta_ds_col')) + # print(hasattr(net, 'meta_ds_row')) + + # if net.meta_cat == False or net.is_downsampled: + if net.meta_cat == False: + + # tuple cats + ################################## + + for axis in ['row', 'col']: + + inst_nodes = net.dat['nodes'][axis] + + if type(inst_nodes[0]) is tuple: + + if axis == 'row': + net.dat['node_info'][axis]['full_names'] = df.index.tolist() + elif axis == 'col': + net.dat['node_info'][axis]['full_names'] = df.columns.tolist() + + # get the number of categories from the length of the tuple + # subtract 1 because the name is the first element of the tuple + num_cats = len(inst_nodes[0]) - 1 + for inst_cat in range(num_cats): + cat_name = 'cat-' + str(inst_cat) + cat_index = inst_cat + 1 + cat_values = [x[cat_index] for x in inst_nodes] + net.dat['node_info'][axis][cat_name] = cat_values + + # clean up nodes after parsing categories + net.dat['nodes'][axis] = [x[0] for x in inst_nodes] + + else: + + # meta_cats + ########################## + + for axis in ['row', 'col']: + + inst_nodes = net.dat['nodes'][axis] + + if axis == 'row': + net.dat['node_info'][axis]['full_names'] = df.index.tolist() + elif axis == 'col': + net.dat['node_info'][axis]['full_names'] = df.columns.tolist() + + inst_cats = [] + if axis == 'row': + # inst_cats = net.meta_row.columns.tolist() + if hasattr(net, 'row_cats'): + inst_cats = net.row_cats + else: + # inst_cats = net.meta_col.columns.tolist() + if hasattr(net, 'col_cats'): + inst_cats = net.col_cats + + num_cats = len(inst_cats) + + # if axis == 'row': + # num_cats = len(net.row_cats) + 
# elif axis == 'col': + # num_cats = len(net.col_cats) + + if num_cats > 0: + for inst_cat in range(num_cats): + cat_name = 'cat-' + str(inst_cat) + cat_index = inst_cat + 1 + + cat_title = inst_cats[inst_cat] + if axis == 'row': + if net.is_downsampled: + if hasattr(net, 'meta_ds_row'): + cat_values = net.meta_ds_row.loc[inst_nodes, cat_title].apply(lambda x: cat_title + ': ' + str(x)).values.tolist() + else: + cat_values = net.meta_row.loc[inst_nodes, cat_title].apply(lambda x: cat_title + ': ' + str(x)).values.tolist() + + # detault with no downsampling + else: + cat_values = net.meta_row.loc[inst_nodes, cat_title].apply(lambda x: cat_title + ': ' + str(x)).values.tolist() + else: + # cat_values = net.meta_col.loc[inst_nodes, cat_title].apply(lambda x: cat_title + ': ' + x).values.tolist() + if net.is_downsampled: + if hasattr(net, 'meta_ds_col'): + # print(inst_nodes) + + cat_values = net.meta_ds_col.loc[inst_nodes, cat_title].apply(lambda x: cat_title + ': ' + str(x)).values.tolist() + else: + cat_values = net.meta_col.loc[inst_nodes, cat_title].apply(lambda x: cat_title + ': ' + str(x)).values.tolist() + + # detault with no downsampling + else: + cat_values = net.meta_col.loc[inst_nodes, cat_title].apply(lambda x: cat_title + ': ' + str(x)).values.tolist() + + net.dat['node_info'][axis][cat_name] = cat_values + + categories.dict_cat(net, define_cat_colors=define_cat_colors) + +def dat_to_df(net): + + nodes = {} + for axis in ['row', 'col']: + if 'full_names' in net.dat['node_info'][axis]: + nodes[axis] = net.dat['node_info'][axis]['full_names'] + else: + nodes[axis] = net.dat['nodes'][axis] + + df = pd.DataFrame(data=net.dat['mat'], columns=nodes['col'], + index=nodes['row']) + + return df + +def mat_to_numpy_arr(self): + ''' convert list to numpy array - numpy arrays can not be saved as json ''' + import numpy as np + self.dat['mat'] = np.asarray(self.dat['mat']) \ No newline at end of file diff --git a/src/celldega/clust/downsample_fun.py b/src/celldega/clust/downsample_fun.py new file mode 100644 index 00000000..342b7b66 --- /dev/null +++ b/src/celldega/clust/downsample_fun.py @@ -0,0 +1,280 @@ +import pandas as pd +import numpy as np +from sklearn.cluster import MiniBatchKMeans +# string used to format titles +super_string = ': ' + +def main(net, df=None, ds_type='kmeans', axis='row', num_samples=100, + random_state=1000, ds_name='Downsample', ds_cluster_name='cluster'): + + # print('is meta cat 1 ??', net.meta_cat) + + if df is None: + df = net.export_df() + net.ds_name = ds_name + + # print('run k-means!!!!!!!!!!!!!') + ds_df, ds_data = run_kmeans_mini_batch(net, df, num_samples, axis, + random_state, ds_cluster_name) + + ds_data = [ds_cluster_name + '-' + str(x + 1) for x in ds_data] + + if axis == 'row': + labels = df.index.tolist() + else: + labels = df.columns.tolist() + + ser_ds = pd.Series(ds_data, index=labels) + + # generate downsampled metadata from tuples + if axis == 'col': + net.meta_ds_col = net.make_df_from_cols(ds_df.columns.tolist()) + else: + net.meta_ds_row = net.make_df_from_cols(ds_df.index.tolist()) + + # strip tuples + # print('strip tuples!!!!!!!!!!!!!!!!!!!!1') + if axis == 'col': + ds_df.columns = [x[0] for x in ds_df.columns.tolist()] + # print('after stripping col tuples') + # print(ds_df.columns.tolist()) + net.dat['nodes']['col'] = ds_df.columns.tolist() + + else: + ds_df.index = [x[0] for x in ds_df.index.tolist()] + net.dat['nodes']['row'] = ds_df.index.tolist() + + # print('is meta cat 1 ??', net.meta_cat) + + # load downsampled dataframe into 
net + # print('setting is_downsampled to True') + # print('load downsampled dataframe into net!!!!!!!!!!!!!!') + net.load_df(ds_df, is_downsampled=True) + + # print('is meta cat 3 ??', net.meta_cat) + + if net.meta_cat: + if axis == 'row': + net.meta_row[ds_name] = ser_ds + else: + net.meta_col[ds_name] = ser_ds + else: + return ser_ds + + + +def meta_cat_to_tuple(net, axis, orig_labels, inst_cats): + tuple_labels = [] + for inst_label in orig_labels: + new_label = [inst_label] + + for inst_cat_type in inst_cats: + if axis == 'col': + inst_cat = inst_cat_type + ': ' + net.meta_col.loc[inst_label, inst_cat_type] + else: + inst_cat = inst_cat_type + ': ' + net.meta_row.loc[inst_label, inst_cat_type] + new_label.append(inst_cat) + + new_label = tuple(new_label) + tuple_labels.append(new_label) + + return tuple_labels + +def run_kmeans_mini_batch(net, df, num_samples=100, axis='row', + random_state=1000, ds_cluster_name='cluster'): + + # gather downsampled axis information + if axis == 'row': + X = df + orig_labels = df.index.tolist() + non_ds_labels = df.columns.tolist() + + # print(orig_labels) + if net.meta_cat: + orig_labels = meta_cat_to_tuple(net, axis, orig_labels, net.row_cats) + + else: + X = df.transpose() + orig_labels = df.columns.tolist() + non_ds_labels = df.index.tolist() + + if net.meta_cat: + orig_labels = meta_cat_to_tuple(net, axis, orig_labels, net.col_cats) + + cat_index = 1 + + # run until the number of returned clusters with data-points is equal to the + # number of requested clusters + num_returned_clusters = 0 + while num_samples != num_returned_clusters: + + clusters, num_returned_clusters, cluster_data, cluster_pop = \ + calc_mbk_clusters(X, num_samples, random_state) + + random_state = random_state + random_state + + clust_numbers = range(num_returned_clusters) + clust_labels = [ds_cluster_name + '-' + str(i + 1) for i in clust_numbers] + + if type(orig_labels[0]) is tuple: + found_cats = True + else: + found_cats = False + + # Gather categories if necessary + ######################################## + # check if there are categories + if found_cats: + all_cats = generate_cat_data(cluster_data, orig_labels, num_samples) + + # genrate cluster labels, e.g. 
add number in each cluster and majority cat + # if necessary + cluster_labels = [] + for i in range(num_returned_clusters): + + inst_name = clust_labels[i] + num_in_clust_string = 'number in clust: '+ str(cluster_pop[i]) + + inst_tuple = (inst_name,) + + if found_cats: + for cat_data in all_cats: + cat_values = cat_data['counts'][i] + max_cat_fraction = cat_values.max() + max_index = np.where(cat_values == max_cat_fraction)[0][0] + max_cat_name = cat_data['types'][max_index] + + # add category title if available + cat_name_string = cat_data['title'] +': ' + max_cat_name + + inst_tuple = inst_tuple + (cat_name_string,) + + inst_tuple = inst_tuple + (num_in_clust_string,) + + cluster_labels.append(inst_tuple) + + # ds_df is always downsampling the rows, if the user wants to downsample the + # columns, the df will be switched back later + ds_df = pd.DataFrame(data=clusters, index=cluster_labels, columns=non_ds_labels) + + # swap back for downsampled columns + if axis == 'col': + ds_df = ds_df.transpose() + + return ds_df, cluster_data + +def generate_cat_data(cluster_data, orig_labels, num_samples): + + # generate an array of orig_labels, using an array so that I can gather + # label subsets using indices + orig_array = np.asarray(orig_labels) + + example_label = orig_labels[0] + + # find out how many string categories are available + num_cats = 0 + for i in range(len(example_label)): + + if i > 0: + inst_cat = example_label[i] + if super_string in inst_cat: + inst_cat = inst_cat.split(super_string)[1] + + string_cat = True + try: + float(inst_cat) + string_cat = False + except: + string_cat = True + + if string_cat: + num_cats = num_cats + 1 + + all_cats = [] + + for cat_index in range(num_cats): + + # index zero is for the names + cat_index = cat_index + 1 + + cat_data = {} + + if super_string in example_label[cat_index]: + cat_data['title'] = example_label[cat_index].split(super_string)[0] + else: + cat_data['title'] = 'Category' + + # if there are string categories, then keep track of how many of each category + # are found in each of the downsampled clusters. 
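+    # (these per-cluster counts are normalized to fractions further below)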
+ cat_data['types'] = [] + + # gather possible categories + for inst_label in orig_labels: + + inst_cat = inst_label[cat_index] + + if super_string in inst_cat: + inst_cat = inst_cat.split(super_string)[1] + + # get first category + cat_data['types'].append(inst_cat) + + cat_data['types'] = sorted(list(set(cat_data['types']))) + + num_cats = len(cat_data['types']) + + # initialize cat_data['counts'] dictionary + cat_data['counts'] = {} + for inst_clust in range(num_samples): + cat_data['counts'][inst_clust] = np.zeros([num_cats]) + + # populate cat_data['counts'] + for inst_clust in range(num_samples): + + # get the indicies of all original labels that fall in the cluster + found = np.where(cluster_data == inst_clust) + found_indicies = found[0] + + clust_names = orig_array[found_indicies] + + for inst_name in clust_names: + + # get first category name + inst_name = inst_name[cat_index] + + if super_string in inst_name: + inst_name = inst_name.split(super_string)[1] + + tmp_index = cat_data['types'].index(inst_name) + + cat_data['counts'][inst_clust][tmp_index] = cat_data['counts'][inst_clust][tmp_index] + 1 + + # calculate fractions + for inst_clust in range(num_samples): + # get array + counts = cat_data['counts'][inst_clust] + inst_total = np.sum(counts) + cat_data['counts'][inst_clust] = cat_data['counts'][inst_clust] / inst_total + + all_cats.append(cat_data) + + return all_cats + +def calc_mbk_clusters(X, n_clusters, random_state=1000): + + # kmeans is run with rows as data-points and columns as dimensions + mbk = MiniBatchKMeans(init='k-means++', n_clusters=n_clusters, + max_no_improvement=100, verbose=0, + random_state=random_state) + + # need to loop through each label (each k-means cluster) and count how many + # points were given this label. This will give the population size of each label + mbk.fit(X) + cluster_data = mbk.labels_ + clusters = mbk.cluster_centers_ + + mbk_cluster_names, cluster_pop = np.unique(cluster_data, return_counts=True) + + num_returned_clusters = len(cluster_pop) + + return clusters, num_returned_clusters, cluster_data, cluster_pop \ No newline at end of file diff --git a/src/celldega/clust/enrichr_functions.py b/src/celldega/clust/enrichr_functions.py new file mode 100644 index 00000000..91b46cd1 --- /dev/null +++ b/src/celldega/clust/enrichr_functions.py @@ -0,0 +1,376 @@ +def add_enrichr_cats(df, inst_rc, run_enrichr, num_terms=10): + from copy import deepcopy + + tmp_gene_list = deepcopy(df.index.tolist()) + + gene_list = [] + if type(tmp_gene_list[0]) is tuple: + for inst_tuple in tmp_gene_list: + gene_list.append(inst_tuple[0]) + else: + gene_list = tmp_gene_list + + orig_gene_list = deepcopy(gene_list) + + # set up for non-tuple case first + if ': ' in gene_list[0]: + # strip titles + gene_list = [inst_gene.split(': ')[1] for inst_gene in gene_list] + + # strip extra information (e.g. 
PTMs) + gene_list = [inst_gene.split('_')[0] for inst_gene in gene_list] + gene_list = [inst_gene.split(' ')[0] for inst_gene in gene_list] + gene_list = [inst_gene.split('-')[0] for inst_gene in gene_list] + + user_list_id = post_request(gene_list) + + enr, response_list = get_request(run_enrichr, user_list_id, max_terms=20) + + # p-value, adjusted pvalue, z-score, combined score, genes + # 1: Term + # 2: P-value + # 3: Z-score + # 4: Combined Score + # 5: Genes + # 6: pval_bh + + # while generating categories store as list of lists, then convert to list of + # tuples + + bar_info = [] + cat_list = [] + for inst_gene in orig_gene_list: + cat_list.append([inst_gene]) + + for inst_enr in response_list[0:num_terms]: + inst_term = inst_enr[1] + inst_pval = inst_enr[2] + inst_cs = inst_enr[4] + inst_list = inst_enr[5] + + pval_string = '
<p>Pval ' + str(inst_pval) + '</p>
' + + bar_info.append(inst_cs) + + for inst_info in cat_list: + + # strip titles + gene_name = inst_info[0] + + if ': ' in gene_name: + gene_name = gene_name.split(': ')[1] + + # strip extra information (e.g. PTMs) + gene_name = gene_name.split('_')[0] + gene_name = gene_name.split(' ')[0] + gene_name = gene_name.split('-')[0] + + if gene_name in inst_list: + inst_info.append(inst_term+': True'+ pval_string) + else: + inst_info.append(inst_term+': False'+pval_string) + + cat_list = [tuple(x) for x in cat_list] + + df.index = cat_list + + return df, bar_info + +def clust_from_response(response_list): + from clustergrammer import Network + import scipy + import json + import pandas as pd + import math + from copy import deepcopy + + # print('----------------------') + # print('enrichr_clust_from_response') + # print('----------------------') + + ini_enr = transfer_to_enr_dict( response_list ) + + enr = [] + scores = {} + score_types = ['combined_score','pval','zscore'] + + for score_type in score_types: + scores[score_type] = pd.Series() + + for inst_enr in ini_enr: + if inst_enr['combined_score'] > 0: + + # make series of enriched terms with scores + for score_type in score_types: + + # collect the scores of the enriched terms + if score_type == 'combined_score': + scores[score_type][inst_enr['name']] = inst_enr[score_type] + if score_type == 'pval': + scores[score_type][inst_enr['name']] = -math.log(inst_enr[score_type]) + if score_type == 'zscore': + scores[score_type][inst_enr['name']] = -inst_enr[score_type] + + # keep enrichement values + enr.append(inst_enr) + + # sort and normalize the scores + for score_type in score_types: + scores[score_type] = scores[score_type]/scores[score_type].max() + scores[score_type].sort_values(ascending=False) + + number_of_enriched_terms = len(scores['combined_score']) + + enr_score_types = ['combined_score','pval','zscore'] + + if number_of_enriched_terms <10: + num_dict = {'ten':10} + elif number_of_enriched_terms <20: + num_dict = {'ten':10, 'twenty':20} + else: + num_dict = {'ten':10, 'twenty':20, 'thirty':30} + + # gather lists of top scores + top_terms = {} + for enr_type in enr_score_types: + top_terms[enr_type] = {} + for num_terms in list(num_dict.keys()): + inst_num = num_dict[num_terms] + top_terms[enr_type][num_terms] = scores[enr_type].index.tolist()[: inst_num] + + # gather the terms that should be kept - they are at the top of the score list + keep_terms = [] + for inst_enr_score in top_terms: + for tmp_num in list(num_dict.keys()): + keep_terms.extend( top_terms[inst_enr_score][tmp_num] ) + + keep_terms = list(set(keep_terms)) + + # keep enriched terms that are at the top 10 based on at least one score + keep_enr = [] + for inst_enr in enr: + if inst_enr['name'] in keep_terms: + keep_enr.append(inst_enr) + + + # fill in full matrix + ####################### + + # genes + row_node_names = [] + # enriched terms + col_node_names = [] + + # gather information from the list of enriched terms + for inst_enr in keep_enr: + col_node_names.append(inst_enr['name']) + row_node_names.extend(inst_enr['int_genes']) + + row_node_names = sorted(list(set(row_node_names))) + + net = Network() + net.dat['nodes']['row'] = row_node_names + net.dat['nodes']['col'] = col_node_names + net.dat['mat'] = scipy.zeros([len(row_node_names),len(col_node_names)]) + + for inst_enr in keep_enr: + + inst_term = inst_enr['name'] + col_index = col_node_names.index(inst_term) + + # use combined score for full matrix - will not be seen in viz + tmp_score = 
scores['combined_score'][inst_term] + net.dat['node_info']['col']['value'].append(tmp_score) + + for inst_gene in inst_enr['int_genes']: + row_index = row_node_names.index(inst_gene) + + # save association + net.dat['mat'][row_index, col_index] = 1 + + # cluster full matrix + ############################# + # do not make multiple views + views = [''] + + if len(net.dat['nodes']['row']) > 1: + net.cluster(dist_type='jaccard', views=views, dendro=False) + else: + net.cluster(dist_type='jaccard', views=views, dendro=False, run_clustering=False) + + # get dataframe from full matrix + df = net.dat_to_df() + + for score_type in score_types: + + for num_terms in num_dict: + + inst_df = deepcopy(df) + inst_net = deepcopy(Network()) + + inst_df = inst_df[top_terms[score_type][num_terms]] + + # load back into net + inst_net.df_to_dat(inst_df) + + # make views + if len(net.dat['nodes']['row']) > 1: + inst_net.cluster(dist_type='jaccard', views=['N_row_sum'], dendro=False) + else: + inst_net.cluster(dist_type='jaccard', views=['N_row_sum'], dendro=False, run_clustering = False) + + inst_views = inst_net.viz['views'] + + # add score_type to views + for inst_view in inst_views: + + inst_view['N_col_sum'] = num_dict[num_terms] + + inst_view['enr_score_type'] = score_type + + # add values to col_nodes and order according to rank + for inst_col in inst_view['nodes']['col_nodes']: + + inst_col['rank'] = len(top_terms[score_type][num_terms]) - top_terms[score_type][num_terms].index(inst_col['name']) + + inst_name = inst_col['name'] + inst_col['value'] = scores[score_type][inst_name] + + # add views to main network + net.viz['views'].extend(inst_views) + + return net + +# make the get request to enrichr using the requests library +# this is done before making the get request with the lib name +def post_request(input_genes, meta=''): + # get metadata + import requests + import json + + # stringify list + input_genes = '\n'.join(input_genes) + + # define post url + post_url = 'http://amp.pharm.mssm.edu/Enrichr/addList' + + # define parameters + params = {'list':input_genes, 'description':''} + + # make request: post the gene list + post_response = requests.post( post_url, files=params) + + # load json + inst_dict = json.loads( post_response.text ) + userListId = str(inst_dict['userListId']) + + # print(userListId) + + # return the userListId that is needed to reference the list later + return userListId + +# make the get request to enrichr using the requests library +# this is done after submitting post request with the input gene list +def get_request(lib, userListId, max_terms=50 ): + import requests + import json + + # convert userListId to string + userListId = str(userListId) + + # define the get url + get_url = 'http://amp.pharm.mssm.edu/Enrichr/enrich' + + # get parameters + params = {'backgroundType':lib,'userListId':userListId} + + # try get request until status code is 200 + inst_status_code = 400 + + # wait until okay status code is returned + num_try = 0 + + # print(('\tEnrichr enrichment get req userListId: '+str(userListId))) + + while inst_status_code == 400 and num_try < 100: + num_try = num_try +1 + try: + # make the get request to get the enrichr results + + try: + get_response = requests.get( get_url, params=params ) + + # get status_code + inst_status_code = get_response.status_code + + except: + print('retry get request') + + except: + print('get requests failed') + + # load as dictionary + resp_json = json.loads( get_response.text ) + + # get the key + only_key = list(resp_json.keys())[0] 
+ + # get response_list + response_list = resp_json[only_key] + + # transfer the response_list to the enr_dict + enr = transfer_to_enr_dict( response_list, max_terms ) + + # return enrichment json and userListId + return enr, response_list + +# transfer the response_list to a list of dictionaries +def transfer_to_enr_dict(response_list, max_terms=50): + + # # reduce the number of enriched terms if necessary + # if len(response_list) < num_terms: + # num_terms = len(response_list) + + # p-value, adjusted pvalue, z-score, combined score, genes + # 1: Term + # 2: P-value + # 3: Z-score + # 4: Combined Score + # 5: Genes + # 6: pval_bh + + num_enr_term = len(response_list) + if num_enr_term > max_terms: + num_enr_term = max_terms + + # transfer response_list to enr structure + # and only keep the top terms + # + # initialize enr + enr = [] + for i in range(num_enr_term): + + # get list element + inst_enr = response_list[i] + + # initialize dict + inst_dict = {} + + # transfer term + inst_dict['name'] = inst_enr[1] + # transfer pval + inst_dict['pval'] = inst_enr[2] + # transfer zscore + inst_dict['zscore'] = inst_enr[3] + # transfer combined_score + inst_dict['combined_score'] = inst_enr[4] + # transfer int_genes + inst_dict['int_genes'] = inst_enr[5] + # adjusted pval + inst_dict['pval_bh'] = inst_enr[6] + + # append dict + enr.append(inst_dict) + + return enr + diff --git a/src/celldega/clust/export_data.py b/src/celldega/clust/export_data.py new file mode 100644 index 00000000..5d51ffea --- /dev/null +++ b/src/celldega/clust/export_data.py @@ -0,0 +1,57 @@ +def export_net_json(net, net_type, indent='no-indent'): + ''' export json string of dat ''' + import json + from copy import deepcopy + + if net_type == 'dat': + exp_dict = deepcopy(net.dat) + + if type(exp_dict['mat']) is not list: + exp_dict['mat'] = exp_dict['mat'].tolist() + + elif net_type == 'viz': + exp_dict = net.viz + + elif net_type == 'sim_row': + exp_dict = net.sim['row'] + + elif net_type == 'sim_col': + exp_dict = net.sim['col'] + + # make json + if indent == 'indent': + exp_json = json.dumps(exp_dict, indent=2) + else: + exp_json = json.dumps(exp_dict) + + return exp_json + +def write_matrix_to_tsv(net, filename=None, df=None): + ''' + This will export the matrix in net.dat or a dataframe (optional df in + arguments) as a tsv file. Row/column categories will be saved as tuples in + tsv, which can be read back into the network object. 
+ ''' + import pandas as pd + + if df is None: + df = net.dat_to_df() + + return df.to_csv(filename, sep='\t') + +def write_json_to_file(net, net_type, filename, indent='no-indent'): + + exp_json = net.export_net_json(net_type, indent) + + fw = open(filename, 'w') + fw.write(exp_json) + fw.close() + +def save_dict_to_json(inst_dict, filename, indent='no-indent'): + import json + fw = open(filename, 'w') + if indent == 'indent': + fw.write(json.dumps(inst_dict, indent=2)) + else: + fw.write(json.dumps(inst_dict)) + fw.close() \ No newline at end of file diff --git a/src/celldega/clust/iframe_web_app.py b/src/celldega/clust/iframe_web_app.py new file mode 100644 index 00000000..548b5136 --- /dev/null +++ b/src/celldega/clust/iframe_web_app.py @@ -0,0 +1,32 @@ +def main(net, filename=None, width=1000, height=800): + import requests, json + # from io import StringIO + from IPython.display import IFrame, display + + try: + from StringIO import StringIO + except ImportError: + from io import StringIO + + clustergrammer_url = 'http://amp.pharm.mssm.edu/clustergrammer/matrix_upload/' + + if filename is None: + file_string = net.write_matrix_to_tsv() + file_obj = StringIO(file_string) + + if net.dat['filename'] is None: + fake_filename = 'Network.txt' + else: + fake_filename = net.dat['filename'] + + r = requests.post(clustergrammer_url, files={'file': (fake_filename, file_obj)}) + else: + file_obj = open(filename, 'r') + r = requests.post(clustergrammer_url, files={'file': file_obj}) + + + link = r.text + + display(IFrame(link, width=width, height=height)) + + return link \ No newline at end of file diff --git a/src/celldega/clust/initialize_net.py b/src/celldega/clust/initialize_net.py new file mode 100644 index 00000000..2a2ee0e1 --- /dev/null +++ b/src/celldega/clust/initialize_net.py @@ -0,0 +1,97 @@ +def main(self, widget=None): + + if hasattr(self, 'meta_cat') == False: + self.meta_cat = False + + self.dat = {} + self.dat['nodes'] = {} + self.dat['nodes']['row'] = [] + self.dat['nodes']['col'] = [] + self.dat['mat'] = [] + + self.dat['node_info'] = {} + for inst_rc in self.dat['nodes']: + self.dat['node_info'][inst_rc] = {} + self.dat['node_info'][inst_rc]['ini'] = [] + self.dat['node_info'][inst_rc]['clust'] = [] + self.dat['node_info'][inst_rc]['rank'] = [] + self.dat['node_info'][inst_rc]['info'] = [] + self.dat['node_info'][inst_rc]['cat'] = [] + self.dat['node_info'][inst_rc]['value'] = [] + + # check if net has categories predefined + if hasattr(self, 'persistent_cat_colors') == False: + self.persistent_cat_colors = False + found_cats = False + else: + found_cats = True + inst_cat_colors = self.viz['cat_colors'] + inst_global_cat_colors = self.viz['global_cat_colors'] + + # initialize matrix colors + ########################### + has_matrix_colors = False + if hasattr(self, 'viz'): + if 'matrix_colors' in self.viz: + has_matrix_colors = True + + if has_matrix_colors: + matrix_colors = self.viz['matrix_colors'] + else: + matrix_colors = {} + matrix_colors['pos'] = 'red' + matrix_colors['neg'] = 'blue' + + # add widget if necessary + if widget != None: + self.widget_class = widget + + self.is_downsampled = False + + self.viz = {} + self.viz['row_nodes'] = [] + self.viz['col_nodes'] = [] + self.viz['links'] = [] + self.viz['mat'] = [] + + if found_cats == False: + self.viz['cat_colors'] = {} + self.viz['cat_colors']['row'] = {} + self.viz['cat_colors']['col'] = {} + self.viz['global_cat_colors'] = {} + else: + self.viz['cat_colors'] = inst_cat_colors + self.viz['global_cat_colors'] = 
inst_global_cat_colors + + self.viz['matrix_colors'] = matrix_colors + + self.sim = {} + + self.umap = {} + + +def viz(self, reset_cat_colors=False): + + # keep track of old cat_colors + old_cat_colors = self.viz['cat_colors'] + old_global_cat_colors = self.viz['global_cat_colors'] + + if 'matrix_colors' in self.viz: + matrix_colors = self.viz['matrix_colors'] + + self.viz = {} + self.viz['row_nodes'] = [] + self.viz['col_nodes'] = [] + self.viz['links'] = [] + self.viz['mat'] = [] + + if reset_cat_colors == True: + self.viz['cat_colors'] = {} + self.viz['cat_colors']['row'] = {} + self.viz['cat_colors']['col'] = {} + self.viz['global_cat_colors'] = {} + else: + self.viz['cat_colors'] = old_cat_colors + self.viz['global_cat_colors'] = old_global_cat_colors + + self.viz['matrix_colors'] = matrix_colors diff --git a/src/celldega/clust/load_data.py b/src/celldega/clust/load_data.py new file mode 100644 index 00000000..0149d9d6 --- /dev/null +++ b/src/celldega/clust/load_data.py @@ -0,0 +1,97 @@ +import io, sys +import json +import pandas as pd +from . import categories +from . import proc_df_labels +from . import data_formats +from . import make_unique_labels + +try: + from StringIO import StringIO +except ImportError: + from io import StringIO + +def load_file(net, filename): + # reset network when loaing file, prevents errors when loading new file + # have persistent categories + + net.reset() + + f = open(filename, 'r') + + file_string = f.read() + f.close() + + load_file_as_string(net, file_string, filename) + +def load_file_as_string(net, file_string, filename=''): + + if (sys.version_info > (3, 0)): + # python 3 + #################### + file_string = str(file_string) + else: + # python 2 + #################### + file_string = unicode(file_string) + + buff = io.StringIO(file_string) + + if '/' in filename: + filename = filename.split('/')[-1] + + net.load_tsv_to_net(buff, filename) + +def load_stdin(net): + data = '' + + for line in sys.stdin: + data = data + line + + data = StringIO.StringIO(data) + + net.load_tsv_to_net(data) + +def load_tsv_to_net(net, file_buffer, filename=None): + lines = file_buffer.getvalue().split('\n') + num_labels = categories.check_categories(lines) + + row_arr = list(range(num_labels['row'])) + col_arr = list(range(num_labels['col'])) + + # use header if there are col categories + if len(col_arr) > 1: + df = pd.read_table(file_buffer, index_col=row_arr, + header=col_arr) + else: + df = pd.read_table(file_buffer, index_col=row_arr) + + df = proc_df_labels.main(df) + + net.df_to_dat(df, True) + net.dat['filename'] = filename + +def load_json_to_dict(filename): + f = open(filename, 'r') + inst_dict = json.load(f) + f.close() + return inst_dict + +def load_gmt(filename): + f = open(filename, 'r') + lines = f.readlines() + f.close() + gmt = {} + for i in range(len(lines)): + inst_line = lines[i].rstrip() + inst_term = inst_line.split('\t')[0] + inst_elems = inst_line.split('\t')[2:] + gmt[inst_term] = inst_elems + + return gmt + +def load_data_to_net(net, inst_net): + ''' load data into nodes and mat, also convert mat to numpy array''' + net.dat['nodes'] = inst_net['nodes'] + net.dat['mat'] = inst_net['mat'] + data_formats.mat_to_numpy_arr(net) \ No newline at end of file diff --git a/src/celldega/clust/load_vect_post.py b/src/celldega/clust/load_vect_post.py new file mode 100644 index 00000000..5396b4f7 --- /dev/null +++ b/src/celldega/clust/load_vect_post.py @@ -0,0 +1,46 @@ +def main(real_net, vect_post): + import numpy as np + from copy import deepcopy + from 
.__init__ import Network + from . import proc_df_labels + + net = deepcopy(Network()) + + sigs = vect_post['columns'] + + all_rows = [] + all_sigs = [] + for inst_sig in sigs: + all_sigs.append(inst_sig['col_name']) + + col_data = inst_sig['data'] + + for inst_row_data in col_data: + all_rows.append(inst_row_data['row_name']) + + all_rows = sorted(list(set(all_rows))) + all_sigs = sorted(list(set(all_sigs))) + + net.dat['nodes']['row'] = all_rows + net.dat['nodes']['col'] = all_sigs + + net.dat['mat'] = np.empty((len(all_rows), len(all_sigs))) + net.dat['mat'][:] = np.nan + + for inst_sig in sigs: + inst_sig_name = inst_sig['col_name'] + col_data = inst_sig['data'] + + for inst_row_data in col_data: + inst_row = inst_row_data['row_name'] + inst_value = inst_row_data['val'] + + row_index = all_rows.index(inst_row) + col_index = all_sigs.index(inst_sig_name) + + net.dat['mat'][row_index, col_index] = inst_value + + tmp_df = net.dat_to_df() + tmp_df = proc_df_labels.main(tmp_df) + + real_net.df_to_dat(tmp_df) diff --git a/src/celldega/clust/make_clust_fun.py b/src/celldega/clust/make_clust_fun.py new file mode 100644 index 00000000..8e52aa91 --- /dev/null +++ b/src/celldega/clust/make_clust_fun.py @@ -0,0 +1,75 @@ +from copy import deepcopy +import scipy +from . import calc_clust, run_filter, make_sim_mat, cat_pval +from . import enrichr_functions as enr_fun + +def make_clust(net, dist_type='cosine', run_clustering=True, dendro=True, + requested_views=['pct_row_sum', 'N_row_sum'], + linkage_type='average', sim_mat=False, filter_sim=0.0, + calc_cat_pval=False, sim_mat_views=['N_row_sum'], + run_enrichr=None, enrichrgram=None, clust_library='scipy', + min_samples=1, min_cluster_size=2): + ''' + This will perform hierarchical clustering + ''' + + # threshold = 0.0001 + # df = run_filter.df_filter_row_sum(df, threshold) + # df = run_filter.df_filter_col_sum(df, threshold) + + if run_enrichr is not None: + df = net.dat_to_df() + df = enr_fun.add_enrichr_cats(df, 'row', run_enrichr) + define_cat_colors = True + net.df_to_dat(df, define_cat_colors=True) + + inst_dm = calc_clust.cluster_row_and_col(net, dist_type=dist_type, + linkage_type=linkage_type, + run_clustering=run_clustering, + dendro=dendro, ignore_cat=False, + calc_cat_pval=calc_cat_pval, + clust_library=clust_library, + min_samples=min_samples, + min_cluster_size=min_cluster_size) + + which_sim = [] + + if sim_mat == True: + which_sim = ['row', 'col'] + elif sim_mat == 'row': + which_sim = ['row'] + elif sim_mat == 'col': + which_sim = ['col'] + + if sim_mat is not False: + sim_net = make_sim_mat.main(net, inst_dm, which_sim, filter_sim, sim_mat_views) + + net.sim = {} + + for inst_rc in which_sim: + net.sim[inst_rc] = sim_net[inst_rc].viz + + if inst_rc == 'row': + other_rc = 'col' + elif inst_rc == 'col': + other_rc = 'row' + + # keep track of cat_colors + net.sim[inst_rc]['cat_colors'][inst_rc] = net.viz['cat_colors'][inst_rc] + net.sim[inst_rc]['cat_colors'][other_rc] = net.viz['cat_colors'][inst_rc] + + else: + net.sim = {} + + net.viz['views'] = [] + + if enrichrgram != None: + # toggle enrichrgram functionality from back-end + net.viz['enrichrgram'] = enrichrgram + + if 'enrichrgram_lib' in net.dat: + net.viz['enrichrgram'] = True + net.viz['enrichrgram_lib'] = net.dat['enrichrgram_lib'] + + if 'row_cat_bars' in net.dat: + net.viz['row_cat_bars'] = net.dat['row_cat_bars'] diff --git a/src/celldega/clust/make_sim_mat.py b/src/celldega/clust/make_sim_mat.py new file mode 100644 index 00000000..28384097 --- /dev/null +++ 
b/src/celldega/clust/make_sim_mat.py @@ -0,0 +1,71 @@ +def main(net, inst_dm, which_sim, filter_sim, sim_mat_views=['N_row_sum']): + from .__init__ import Network + from copy import deepcopy + from . import calc_clust + + sim_dict = {} + + for inst_rc in which_sim: + + sim_dict[inst_rc] = dm_to_sim(inst_dm[inst_rc], make_squareform=True, + filter_sim=filter_sim) + + sim_net = {} + + for inst_rc in which_sim: + + sim_net[inst_rc] = deepcopy(Network()) + + sim_net[inst_rc].dat['mat'] = sim_dict[inst_rc] + + sim_net[inst_rc].dat['nodes']['row'] = net.dat['nodes'][inst_rc] + sim_net[inst_rc].dat['nodes']['col'] = net.dat['nodes'][inst_rc] + + sim_net[inst_rc].dat['node_info']['row'] = net.dat['node_info'][inst_rc] + sim_net[inst_rc].dat['node_info']['col'] = net.dat['node_info'][inst_rc] + + calc_clust.cluster_row_and_col(sim_net[inst_rc]) + + all_views = [] + df = sim_net[inst_rc].dat_to_df() + send_df = deepcopy(df) + + sim_net[inst_rc].viz['views'] = all_views + + return sim_net + +def dm_to_sim(inst_dm, make_squareform=False, filter_sim=0): + import numpy as np + from scipy.spatial.distance import squareform + + if make_squareform is True: + inst_dm = squareform(inst_dm) + + inst_sim_mat = 1 - inst_dm + + if filter_sim > 0: + filter_sim = adjust_filter_sim(inst_sim_mat, filter_sim) + inst_sim_mat[ np.abs(inst_sim_mat) < filter_sim] = 0 + + return inst_sim_mat + +def adjust_filter_sim(inst_dm, filter_sim, keep_top=20000): + import pandas as pd + import numpy as np + + inst_df = pd.DataFrame(inst_dm) + val_vect = np.abs(inst_df.values.flatten()) + + val_vect = val_vect[val_vect > 0.01] + + if len(val_vect) > keep_top: + + + inst_series = pd.Series(val_vect) + inst_series.sort_values(ascending=False) + + sort_values = inst_series.values + + filter_sim = sort_values[keep_top] + + return filter_sim \ No newline at end of file diff --git a/src/celldega/clust/make_unique_labels.py b/src/celldega/clust/make_unique_labels.py new file mode 100644 index 00000000..d1f9d6eb --- /dev/null +++ b/src/celldega/clust/make_unique_labels.py @@ -0,0 +1,88 @@ +import pandas as pd + +# make_unique_labels + +def main(net, df=None): + ''' + Run in load_data module (which runs when file is loaded or dataframe is loaded), + check for duplicate row/col names, and add index to names if necesary + ''' + if df is None: + df = net.export_df() + + # rows + ############# + rows = df.index.tolist() + if type(rows[0]) is str: + + if len(rows) != len(list(set(rows))): + print('warning: making row names unique') + new_rows = add_index_list(rows) + df.index = new_rows + + elif type(rows[0]) is tuple: + + row_names = [] + for inst_row in rows: + row_names.append(inst_row[0]) + + if len(row_names) != len(list(set(row_names))): + print('warning: making row names unique') + row_names = add_index_list(row_names) + + # add back to tuple + new_rows = [] + for inst_index in range(len(rows)): + inst_row = rows[inst_index] + new_row = list(inst_row) + new_row[0] = row_names[inst_index] + new_row = tuple(new_row) + new_rows.append(new_row) + + df.index = new_rows + + # cols + ############# + cols = df.columns.tolist() + if type(cols[0]) is str: + + # list column names + if len(cols) != len(list(set(cols))): + print('warning: making col names unique') + new_cols = add_index_list(cols) + df.columns = new_cols + + elif type(cols[0]) is tuple: + + col_names = [] + for inst_col in cols: + col_names.append(inst_col[0]) + + if len(col_names) != len(list(set(col_names))): + print('warning: making col names unique') + col_names = 
add_index_list(col_names) + + # add back to tuple + new_cols = [] + for inst_index in range(len(cols)): + inst_col = cols[inst_index] + new_col = list(inst_col) + new_col[0] = col_names[inst_index] + new_col = tuple(new_col) + new_cols.append(new_col) + + df.columns = new_cols + + # return dataframe with unique names + return df + +def add_index_list(nodes): + + new_nodes = [] + for i in range(len(nodes)): + index = i + 1 + inst_node = nodes[i] + new_node = inst_node + '-' + str(index) + new_nodes.append(new_node) + + return new_nodes diff --git a/src/celldega/clust/make_viz.py b/src/celldega/clust/make_viz.py new file mode 100644 index 00000000..a14d8719 --- /dev/null +++ b/src/celldega/clust/make_viz.py @@ -0,0 +1,72 @@ +def viz_json(net, dendro=True, links=False): + ''' make the dictionary for the clustergram.js visualization ''' + from . import calc_clust + import numpy as np + + # linkage information + net.viz['linkage'] = {} + net.viz['linkage']['row'] = net.dat['node_info']['row']['Y'].tolist() + net.viz['linkage']['col'] = net.dat['node_info']['col']['Y'].tolist() + + # node information + for inst_rc in net.dat['nodes']: + + inst_keys = net.dat['node_info'][inst_rc] + all_cats = [x for x in inst_keys if 'cat-' in x] + + for i in range(len(net.dat['nodes'][inst_rc])): + inst_dict = {} + inst_dict['name'] = net.dat['nodes'][inst_rc][i] + inst_dict['ini'] = net.dat['node_info'][inst_rc]['ini'][i] + inst_dict['clust'] = net.dat['node_info'][inst_rc]['clust'].index(i) + inst_dict['rank'] = net.dat['node_info'][inst_rc]['rank'][i] + + if 'rankvar' in inst_keys: + inst_dict['rankvar'] = net.dat['node_info'][inst_rc]['rankvar'][i] + + # fix for similarity matrix + if len(all_cats) > 0: + + for inst_name_cat in all_cats: + + actual_cat_name = net.dat['node_info'][inst_rc][inst_name_cat][i] + inst_dict[inst_name_cat] = actual_cat_name + + check_pval = 'pval_'+inst_name_cat.replace('-','_') + + if check_pval in net.dat['node_info'][inst_rc]: + tmp_pval_name = inst_name_cat.replace('-','_') + '_pval' + inst_dict[tmp_pval_name] = net.dat['node_info'][inst_rc][check_pval][actual_cat_name] + + tmp_index_name = inst_name_cat.replace('-', '_') + '_index' + + inst_dict[tmp_index_name] = net.dat['node_info'][inst_rc] \ + [tmp_index_name][i] + + + if len(net.dat['node_info'][inst_rc]['value']) > 0: + inst_dict['value'] = net.dat['node_info'][inst_rc]['value'][i] + + if len(net.dat['node_info'][inst_rc]['info']) > 0: + inst_dict['info'] = net.dat['node_info'][inst_rc]['info'][i] + + net.viz[inst_rc + '_nodes'].append(inst_dict) + + # save data as links or mat + ########################### + if links is True: + for i in range(len(net.dat['nodes']['row'])): + for j in range(len(net.dat['nodes']['col'])): + + inst_dict = {} + inst_dict['source'] = i + inst_dict['target'] = j + inst_dict['value'] = float(net.dat['mat'][i, j]) + + if np.isnan(inst_dict['value_orig']): + inst_dict['value_orig'] = 'NaN' + + net.viz['links'].append(inst_dict) + + else: + net.viz['mat'] = net.dat['mat'].tolist() diff --git a/src/celldega/clust/normalize_fun.py b/src/celldega/clust/normalize_fun.py new file mode 100644 index 00000000..8a02d040 --- /dev/null +++ b/src/celldega/clust/normalize_fun.py @@ -0,0 +1,165 @@ +import pandas as pd +import numpy as np +from copy import deepcopy + +def run_norm(net, df=None, norm_type='zscore', axis='row', z_clip=None): + ''' + A dataframe can be passed to run_norm and a normalization will be run ( + e.g. 
zscore) on either the rows or columns + ''' + + if df is None: + df = net.dat_to_df() + + if norm_type == 'zscore': + df, ser_mean, ser_std = zscore_df(df, axis, z_clip=z_clip) + + net.dat['pre_zscore'] = {} + net.dat['pre_zscore']['mean'] = ser_mean.values.tolist() + net.dat['pre_zscore']['std'] = ser_std.values.tolist() + + if norm_type == 'qn': + df = qn_df(df, axis) + + if norm_type == 'umi': + df = umi_norm(df) + + net.df_to_dat(df) + + # if norm_type == 'zscore' and axis == 'row': + # net.dat['pre_zscore'] = {} + # net.dat['pre_zscore']['mean'] = ser_mean + # net.dat['pre_zscore']['std'] = ser_std + +def qn_df(df, axis='row'): + ''' + do quantile normalization of a dataframe dictionary, does not write to net + ''' + # using transpose to do row qn + if axis == 'row': + df = df.transpose() + + missing_values = df.isnull().values.any() + + # make mask of missing values + if missing_values: + + # get nan mask + missing_mask = pd.isnull(df) + + # tmp fill in na with zero, will not affect qn + df = df.fillna(value=0) + + # calc common distribution + common_dist = calc_common_dist(df) + + # swap in common distribution + df = swap_in_common_dist(df, common_dist) + + # swap back in missing values + if missing_values: + df = df.mask(missing_mask, other=np.nan) + + # using transpose to do row qn + if axis == 'row': + df = df.transpose() + + df_qn = df + + return df_qn + +def swap_in_common_dist(df, common_dist): + + col_names = df.columns.tolist() + + qn_arr = np.array([]) + orig_rows = df.index.tolist() + + # loop through each column + for inst_col in col_names: + + # get the sorted list of row names for the given column + tmp_series = deepcopy(df[inst_col]) + tmp_series = tmp_series.sort_values(ascending=False) + sorted_names = tmp_series.index.tolist() + + qn_vect = np.array([]) + for inst_row in orig_rows: + inst_index = sorted_names.index(inst_row) + inst_val = common_dist[inst_index] + qn_vect = np.hstack((qn_vect, inst_val)) + + if qn_arr.shape[0] == 0: + qn_arr = qn_vect + else: + qn_arr = np.vstack((qn_arr, qn_vect)) + + # transpose (because of vstacking) + qn_arr = qn_arr.transpose() + + qn_df = pd.DataFrame(data=qn_arr, columns=col_names, index=orig_rows) + + return qn_df + +def calc_common_dist(df): + ''' + calculate a common distribution (for col qn only) that will be used to qn + ''' + + # axis is col + tmp_arr = np.array([]) + + col_names = df.columns.tolist() + + for inst_col in col_names: + + # sort column + tmp_vect = df[inst_col].sort_values(ascending=False).values + + # stacking rows vertically (will transpose) + if tmp_arr.shape[0] == 0: + tmp_arr = tmp_vect + else: + tmp_arr = np.vstack((tmp_arr, tmp_vect)) + + tmp_arr = tmp_arr.transpose() + + common_dist = tmp_arr.mean(axis=1) + + return common_dist + +def zscore_df(df, axis='row', z_clip=None): + ''' + take the zscore of a dataframe dictionary, does not write to net (self) + ''' + + if axis == 'row': + df = df.transpose() + + ser_mean = df.mean() + ser_std = df.std() + + df_z = (df - ser_mean)/ser_std + + if axis == 'row': + df_z = df_z.transpose() + + if z_clip is not None: + df_z = z_clip_fun(df_z, lower=-z_clip, upper=z_clip) + + return df_z, ser_mean, ser_std + +def umi_norm(df): + # umi norm + barcode_umi_sum = df.sum() + df_umi = df.div(barcode_umi_sum) + return df_umi + + +def z_clip_fun(df_z, lower=None, upper=None): + ''' + Trim values at input thresholds using pandas function + ''' + df_z = df_z.clip(lower=lower, upper=upper) + + return df_z \ No newline at end of file diff --git 
a/src/celldega/clust/proc_df_labels.py b/src/celldega/clust/proc_df_labels.py new file mode 100644 index 00000000..1caed5ca --- /dev/null +++ b/src/celldega/clust/proc_df_labels.py @@ -0,0 +1,62 @@ +def main(df): + ''' + 1) check that rows are strings (in case of numerical names) + 2) check for tuples, and in that case load tuples to categories + ''' + import numpy as np + from ast import literal_eval as make_tuple + + test = {} + test['row'] = df.index.tolist() + test['col'] = df.columns.tolist() + + # if type( test_row ) is not str and type( test_row ) is not tuple: + + found_tuple = {} + found_number = {} + for inst_rc in ['row','col']: + + inst_name = test[inst_rc][0] + + found_tuple[inst_rc] = False + found_number[inst_rc] = False + + if type(inst_name) != tuple: + + if type(inst_name) is int or type(inst_name) is float or type(inst_name) is np.int64: + found_number[inst_rc] = True + + else: + check_open = inst_name[0] + check_comma = inst_name.find(',') + check_close = inst_name[-1] + + if check_open == '(' and check_close == ')' and check_comma > 0 \ + and check_comma < len(inst_name): + found_tuple[inst_rc] = True + + # convert to tuple if necessary + ################################################# + if found_tuple['row']: + row_names = df.index.tolist() + row_names = [make_tuple(x) for x in row_names] + df.index = row_names + + if found_tuple['col']: + col_names = df.columns.tolist() + col_names = [make_tuple(x) for x in col_names] + df.columns = col_names + + # convert numbers to string if necessary + ################################################# + if found_number['row']: + row_names = df.index.tolist() + row_names = [str(x) for x in row_names] + df.index = row_names + + if found_number['col']: + col_names = df.columns.tolist() + col_names = [str(x) for x in col_names] + df.columns = col_names + + return df \ No newline at end of file diff --git a/src/celldega/clust/run_filter.py b/src/celldega/clust/run_filter.py new file mode 100644 index 00000000..1fc506fb --- /dev/null +++ b/src/celldega/clust/run_filter.py @@ -0,0 +1,209 @@ +def df_filter_row_sum(df, threshold, take_abs=True): + ''' filter rows in matrix at some threshold + and remove columns that have a sum below this threshold ''' + + from copy import deepcopy + from .__init__ import Network + net = Network() + + if take_abs is True: + df_copy = deepcopy(df.abs()) + else: + df_copy = deepcopy(df) + + ini_rows = df_copy.index.values.tolist() + df_copy = df_copy.transpose() + tmp_sum = df_copy.sum(axis=0) + tmp_sum = tmp_sum.abs() + tmp_sum.sort_values(inplace=True, ascending=False) + + tmp_sum = tmp_sum[tmp_sum > threshold] + keep_rows = sorted(tmp_sum.index.values.tolist()) + + if len(keep_rows) < len(ini_rows): + df = grab_df_subset(df, keep_rows=keep_rows) + + return df + +def df_filter_col_sum(df, threshold, take_abs=True): + ''' filter columns in matrix at some threshold + and remove rows that have all zero values ''' + + from copy import deepcopy + from .__init__ import Network + net = Network() + + if take_abs is True: + df_copy = deepcopy(df.abs()) + else: + df_copy = deepcopy(df) + + df_copy = df_copy.transpose() + df_copy = df_copy[df_copy.sum(axis=1) > threshold] + df_copy = df_copy.transpose() + df_copy = df_copy[df_copy.sum(axis=1) > 0] + + if take_abs is True: + inst_rows = df_copy.index.tolist() + inst_cols = df_copy.columns.tolist() + df = grab_df_subset(df, inst_rows, inst_cols) + + else: + df = df_copy + + return df + +def grab_df_subset(df, keep_rows='all', keep_cols='all'): + if keep_cols != 'all': 
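+        # select the requested columns by label; rows are subset below via .loc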
+ df = df[keep_cols] + if keep_rows != 'all': + df = df.loc[keep_rows] + return df + +def get_sorted_rows(df, rank_type='sum'): + from copy import deepcopy + + inst_df = deepcopy(df) + inst_df = inst_df.transpose() + + if rank_type == 'sum': + tmp_sum = inst_df.sum(axis=0) + elif rank_type == 'var': + tmp_sum = inst_df.var(axis=0) + + tmp_sum = tmp_sum.abs() + tmp_sum.sort_values(inplace=True, ascending=False) + rows_sorted = tmp_sum.index.values.tolist() + + return rows_sorted + +def filter_N_top(inst_rc, df, N_top, rank_type='sum'): + + if inst_rc == 'col': + for inst_type in df: + df[inst_type] = df[inst_type].transpose() + + rows_sorted = get_sorted_rows(df, rank_type) + + keep_rows = rows_sorted[:N_top] + + df = df.loc[keep_rows] + + if inst_rc == 'col': + for inst_type in df: + df[inst_type] = df[inst_type].transpose() + + return df + +def filter_threshold(df, inst_rc, threshold, num_occur=1): + ''' + Filter a network's rows or cols based on num_occur values being above a + threshold (in absolute_value) + ''' + from copy import deepcopy + + inst_df = deepcopy(df) + + if inst_rc == 'col': + inst_df = inst_df.transpose() + + inst_df = inst_df.abs() + + ini_rows = inst_df.index.values.tolist() + + inst_df[inst_df < threshold] = 0 + inst_df[inst_df >= threshold] = 1 + + tmp_sum = inst_df.sum(axis=1) + + tmp_sum = tmp_sum[tmp_sum >= num_occur] + + keep_names = tmp_sum.index.values.tolist() + + if inst_rc == 'row': + if len(keep_names) < len(ini_rows): + df = grab_df_subset(df, keep_rows=keep_names) + + elif inst_rc == 'col': + inst_df = inst_df.transpose() + + inst_rows = inst_df.index.values.tolist() + inst_cols = keep_names + + df = grab_df_subset(df, inst_rows, inst_cols) + + return df + +def filter_cat(net, axis, cat_index, cat_name): + + try: + df = net.export_df() + + # DataFrame filtering will be run always be run on columns if the user + # wants to filter rows, transpose the matrix before and after + if axis == 'row': + df = df.transpose() + + all_names = df.columns.tolist() + + found_names = [i for i in all_names if i[cat_index] == cat_name] + + if len(found_names) > 0: + df = df[found_names] + + if axis == 'row': + df = df.transpose() + else: + print('no ' + axis + 's were found with this category and filtering was not run') + + net.load_df(df) + + except: + print('category filtering did not run\n check that your category filtering is set up correctly') + + +def filter_names(net, axis, names): + + print('filter_names') + print(names) + + try: + + df = net.export_df() + + # Dataframe filtering will always be run on the columns. If the user wants to filter rows, then it will transpose back and forth. 
+ + if axis == 'row': + df = df.transpose() + + all_names = df.columns.tolist() + + found_names = [] + for inst_name in all_names: + + if type(inst_name) is tuple: + check_name = inst_name[0] + else: + check_name = inst_name + + if ': ' in check_name: + check_name = check_name.split(': ')[1] + + if check_name in names: + found_names.append(inst_name) + + if len(found_names) > 0: + df = df[found_names] + + if axis == 'row': + df = df.transpose() + + net.load_df(df) + + else: + print('no ' + axis + 's were found with these names') + + except: + print('error in filtering names') + + print(found_names) \ No newline at end of file From fbe32fbbf19d29b6489fcf9cfaeba6571e7092af Mon Sep 17 00:00:00 2001 From: Nicolas Fernandez Date: Tue, 7 Jan 2025 17:01:01 -0500 Subject: [PATCH 2/6] working on clust api for hierarchical clustering --- src/celldega/clust/__init__.py | 46 ++++++++++++++++++++++++++++++++-- src/celldega/viz/widget.py | 2 ++ 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/src/celldega/clust/__init__.py b/src/celldega/clust/__init__.py index 28550be1..d768760a 100644 --- a/src/celldega/clust/__init__.py +++ b/src/celldega/clust/__init__.py @@ -1,3 +1,30 @@ +# The Celldega Matrix Vizualization Method is being built using the approaches +# and code adaptations from the Clustergrammer-GL library, which is available at +# github.com/ismms-himc/clustergrammer2 +# and being used under the license +# +# MIT License + +# Copyright (c) 2021 Nicolas Fernandez + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + import numpy as np import pandas as pd from copy import deepcopy @@ -26,10 +53,25 @@ import ipywidgets as widgets import statsmodels.stats.multitest as smm -def cluster(): +def hc(df, filter_N_top=None, norm_col='total', norm_row='zscore'): net = Network() - print('**********') + net.load_df(df) + + if filter_N_top is not None: + net.filter_N_top(axis='row', N_top=5000) + + if norm_col == 'total': + net.normalize(axis='col', norm_type='umi') + + if norm_row == 'zscore': + net.normalize(axis='row', norm_type='zscore') + + net.cluster() + + network = net.viz + + return network class Network(object): ''' diff --git a/src/celldega/viz/widget.py b/src/celldega/viz/widget.py index bef366c7..e96d4bed 100644 --- a/src/celldega/viz/widget.py +++ b/src/celldega/viz/widget.py @@ -82,12 +82,14 @@ class Matrix(anywidget.AnyWidget): Returns: Matrix: A widget for visualizing a hierarchically clustered matrix. 
""" + _esm = pathlib.Path(__file__).parent / "../static" / "widget.js" _css = pathlib.Path(__file__).parent / "../static" / "widget.css" value = traitlets.Int(0).tag(sync=True) component = traitlets.Unicode("Matrix").tag(sync=True) network = traitlets.Dict({}).tag(sync=True) + width = traitlets.Int(600).tag(sync=True) height = traitlets.Int(600).tag(sync=True) click_info = traitlets.Dict({}).tag(sync=True) From aafb2df9fa0a489901cebbc30651affd33e47bed Mon Sep 17 00:00:00 2001 From: Nicolas Fernandez Date: Wed, 8 Jan 2025 13:45:47 -0500 Subject: [PATCH 3/6] quick fix for tooltip cutoff in matrix and ist landscape --- js/deck-gl/make_tooltip.js | 16 +++++++++++++--- js/deck-gl/matrix/matrix_tooltip.js | 13 +++++++++++-- js/viz/landscape_ist.js | 2 ++ js/viz/matrix_viz.js | 10 ++++++++++ 4 files changed, 36 insertions(+), 5 deletions(-) diff --git a/js/deck-gl/make_tooltip.js b/js/deck-gl/make_tooltip.js index bbb9f4fe..46267dbc 100644 --- a/js/deck-gl/make_tooltip.js +++ b/js/deck-gl/make_tooltip.js @@ -8,7 +8,9 @@ export const make_tooltip = (viz_state, info) => { let inst_name = '' let inst_cat = '' - console.log(info.layer.id) + // console.log(info.layer.id) + + if (info.layer.id.startsWith('cell-layer') || info.layer.id.startsWith('path-layer')) { inst_name = info.layer.id.startsWith('cell-layer') ? viz_state.cats.cell_names_array[info.index] : viz_state.cats.polygon_cell_names[info.index] @@ -26,8 +28,16 @@ export const make_tooltip = (viz_state, info) => { inst_html = `
neighborhood: ${inst_name}
cluster: ${inst_cat}
`
         }
 
-    d3.selectAll('.deck-tooltip')
-        .style('margin-top', '75px')
+    // d3.selectAll('.deck-tooltip')
+    //     .style('margin-top', '75px')
+
+    // console.log(viz_state.root)
+
+    // find .deck-tooltip within viz_state.root and un-position its grandparent to avoid clipping
+    const tooltipContainer = viz_state.root.querySelector('.deck-tooltip');
+    tooltipContainer.style.marginTop = '50px'
+    const tooltipParent = tooltipContainer.parentElement.parentElement;
+    tooltipParent.style.position = 'unset'
 
     return {
         html: inst_html,
diff --git a/js/deck-gl/matrix/matrix_tooltip.js b/js/deck-gl/matrix/matrix_tooltip.js
index 3100939f..dad2f1be 100644
--- a/js/deck-gl/matrix/matrix_tooltip.js
+++ b/js/deck-gl/matrix/matrix_tooltip.js
@@ -4,8 +4,17 @@ export const get_tooltip = (viz_state, params) => {
 
     const {object, layer} = params;
 
-    d3.selectAll('.deck-tooltip')
-        .style('margin-top', '50px')
+    // d3.selectAll('.deck-tooltip')
+    //     .style('margin-top', '50px')
+
+    console.log('get_tooltip')
+    // find .deck-tooltip within viz_state.root and un-position its grandparent to avoid clipping
+    const tooltipContainer = viz_state.root.querySelector('.deck-tooltip');
+    const tooltipParent = tooltipContainer.parentElement.parentElement;
+    tooltipParent.style.position = 'unset'
+
+    tooltipContainer.style.marginTop = '50px'
+
 
     if (object) {
 
         // Check which layer the tooltip is currently over
diff --git a/js/viz/landscape_ist.js b/js/viz/landscape_ist.js
index 69da0fdb..e4fba98d 100644
--- a/js/viz/landscape_ist.js
+++ b/js/viz/landscape_ist.js
@@ -51,6 +51,8 @@ export const landscape_ist = async (
 
     let viz_state = {}
 
+    viz_state.root = el
+
     viz_state.buttons = {}
     viz_state.buttons.blue = '#8797ff'
     viz_state.buttons.gray = 'gray'
diff --git a/js/viz/matrix_viz.js b/js/viz/matrix_viz.js
index e577ae5f..aaab9214 100644
--- a/js/viz/matrix_viz.js
+++ b/js/viz/matrix_viz.js
@@ -128,4 +128,14 @@ export const matrix_viz = async (
 
     el.appendChild(viz_state.root)
 
+    const tooltipContainer = viz_state.root.querySelector('.deck-tooltip');
+
+
+    console.log(viz_state.root)
+    console.log(tooltipContainer)
+
+    // select canvas element within viz_state.root
+    const canvas = viz_state.root.querySelector('canvas')
+    console.log(canvas)
+
 }
\ No newline at end of file

From 68591f785d2bef36fd179dd2d8e6919193748584 Mon Sep 17 00:00:00 2001
From: Nicolas Fernandez
Date: Wed, 8 Jan 2025 14:11:57 -0500
Subject: [PATCH 4/6] added link to example from within docstring

---
 docs/python/viz/api.md              |  2 +-
 js/deck-gl/make_tooltip.js          |  4 ----
 js/deck-gl/matrix/matrix_tooltip.js |  8 +-------
 mkdocs.yaml                         |  1 +
 src/celldega/viz/__init__.py        | 33 +++++++++++++++++++++++++++++-
 5 files changed, 35 insertions(+), 13 deletions(-)

diff --git a/docs/python/viz/api.md b/docs/python/viz/api.md
index 8220848a..4ea1745c 100644
--- a/docs/python/viz/api.md
+++ b/docs/python/viz/api.md
@@ -2,5 +2,5 @@
 
 ## Widget Classes
 
-::: celldega.viz.widget
+::: celldega.viz
 
diff --git a/js/deck-gl/make_tooltip.js b/js/deck-gl/make_tooltip.js
index 46267dbc..ae69f5a8 100644
--- a/js/deck-gl/make_tooltip.js
+++ b/js/deck-gl/make_tooltip.js
@@ -8,10 +8,6 @@ export const make_tooltip = (viz_state, info) => {
     let inst_name = ''
     let inst_cat = ''
 
-    // console.log(info.layer.id)
-
-
     if (info.layer.id.startsWith('cell-layer') || info.layer.id.startsWith('path-layer')) {
 
         inst_name = info.layer.id.startsWith('cell-layer') ?
            viz_state.cats.cell_names_array[info.index] : viz_state.cats.polygon_cell_names[info.index]
 
         inst_cat = viz_state.cats.dict_cell_cats[inst_name]
 
diff --git a/js/deck-gl/matrix/matrix_tooltip.js b/js/deck-gl/matrix/matrix_tooltip.js
index dad2f1be..2c506a31 100644
--- a/js/deck-gl/matrix/matrix_tooltip.js
+++ b/js/deck-gl/matrix/matrix_tooltip.js
@@ -4,18 +4,12 @@ export const get_tooltip = (viz_state, params) => {
 
     const {object, layer} = params;
 
-    // d3.selectAll('.deck-tooltip')
-    //     .style('margin-top', '50px')
-
-    console.log('get_tooltip')
     // find .deck-tooltip within viz_state.root and un-position its grandparent to avoid clipping
     const tooltipContainer = viz_state.root.querySelector('.deck-tooltip');
+    tooltipContainer.style.marginTop = '50px'
     const tooltipParent = tooltipContainer.parentElement.parentElement;
     tooltipParent.style.position = 'unset'
 
-    tooltipContainer.style.marginTop = '50px'
-
-
     if (object) {
 
         // Check which layer the tooltip is currently over
         if (layer.id === 'row-label-layer') {
diff --git a/mkdocs.yaml b/mkdocs.yaml
index f7ed281d..f9a309de 100644
--- a/mkdocs.yaml
+++ b/mkdocs.yaml
@@ -39,6 +39,7 @@ nav:
       - examples/index.md
       - Brief Notebooks:
           - Landscape View Xenium: examples/brief_notebooks/Landscape_View_Xenium.ipynb
+          - Landscape-Matrix View Xenium: examples/brief_notebooks/Landscape-Matrix_Xenium.ipynb
          - Pre-Process LandscapeFiles: examples/brief_notebooks/Pre-process_Xenium_V1_human_Pancreas_FFPE_outs.ipynb
   # - Tutorials:
   #     - examples/Landscape_View_Xenium.ipynb
diff --git a/src/celldega/viz/__init__.py b/src/celldega/viz/__init__.py
index 17e406d9..86ee1ee2 100644
--- a/src/celldega/viz/__init__.py
+++ b/src/celldega/viz/__init__.py
@@ -3,5 +3,36 @@
 """
 
 from .widget import Landscape, Matrix
+from ipywidgets import jslink, HBox, Layout
+from IPython.display import display
 
-__all__ = ["Landscape", "Matrix"]
+def landscape_matrix(landscape, mat, width='600px', height='700px'):
+    """
+    Display a `Landscape` widget and a `Matrix` widget side by side.
+
+    Args:
+        landscape (Landscape): A `Landscape` widget.
+        mat (Matrix): A `Matrix` widget.
+        width (str): The width of each widget.
+        height (str): The height of the `Landscape` widget.
+
+    Returns:
+        None: The linked widgets are displayed side by side with `IPython.display.display`.
+
+    Example:
+        See the [Landscape-Matrix_Xenium](../../../examples/brief_notebooks/Landscape-Matrix_Xenium) example notebook.
+    """
+
+    # Use `jslink` to directly link `click_info` from `mat` to `update_trigger` on `landscape`
+    jslink((mat, 'click_info'), (landscape, 'update_trigger'))
+
+    # Set layouts for the widgets
+    mat.layout = Layout(width=width)
+    landscape.layout = Layout(width=width, height=height)
+
+    # Display widgets side by side
+    widgets_side_by_side = HBox([landscape, mat])
+
+    display(widgets_side_by_side)
+
+__all__ = ["Landscape", "Matrix", "landscape_matrix"]

From 1d17729a397033be53c5457338adcaa4edae8a63 Mon Sep 17 00:00:00 2001
From: Nicolas Fernandez
Date: Wed, 8 Jan 2025 14:45:49 -0500
Subject: [PATCH 5/6] cleaned documentation for main modules

---
 docs/python/clust/api.md       |  3 +++
 docs/python/index.md           | 10 +++++-----
 docs/python/nbhd/api.md        |  2 --
 mkdocs.yaml                    |  2 ++
 pyproject.toml                 |  1 -
 src/celldega/clust/__init__.py | 25 +++++++++++++++++++++++++
 src/celldega/nbhd/__init__.py  |  4 ++++
 7 files changed, 39 insertions(+), 8 deletions(-)
 create mode 100644 docs/python/clust/api.md

diff --git a/docs/python/clust/api.md b/docs/python/clust/api.md
new file mode 100644
index 00000000..32f61dc7
--- /dev/null
+++ b/docs/python/clust/api.md
@@ -0,0 +1,3 @@
+# Clust Module API Reference
+
+::: celldega.clust
\ No newline at end of file
diff --git a/docs/python/index.md b/docs/python/index.md
index 0d76eee1..ca85bedb 100644
--- a/docs/python/index.md
+++ b/docs/python/index.md
@@ -1,13 +1,13 @@
 # Python API Overview
 
-## [Neighborhood Overview](nbhd/api)
-The `nbhd` module contains methods for calculating tissue neighborhoods.
-
 ## [Pre Module Overview](pre/api)
-
 The `pre` module contains methods for pre-processing LandscapeFiles.
 
+## [Clust Module Overview](clust/api)
+The `clust` module contains methods for clustering high-dimensional data.
 
-## [Viz Module Overview](viz/api)
+## [Neighborhood Overview](nbhd/api)
+The `nbhd` module contains methods for calculating tissue neighborhoods.
 
+## [Viz Module Overview](viz/api)
 The `viz` module contains functions and classes for data visualization.
diff --git a/docs/python/nbhd/api.md b/docs/python/nbhd/api.md
index f738d069..2e846793 100644
--- a/docs/python/nbhd/api.md
+++ b/docs/python/nbhd/api.md
@@ -1,5 +1,3 @@
 # Neighborhood Module API Reference
 
-
-
 ::: celldega.nbhd
\ No newline at end of file
diff --git a/mkdocs.yaml b/mkdocs.yaml
index f9a309de..1ca6237e 100644
--- a/mkdocs.yaml
+++ b/mkdocs.yaml
@@ -22,6 +22,8 @@ nav:
   - Python API:
       - python/index.md
       - pre: python/pre/api.md
+      - clust: python/clust/api.md
+      - nbhd: python/nbhd/api.md
       - viz: python/viz/api.md
 
   - JavaScript API:
diff --git a/pyproject.toml b/pyproject.toml
index c2bf049f..c2a82a0f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,7 +17,6 @@ dependencies = [
     "imagecodecs~=2024.1.1",
     "scanpy~=1.10.2",
     "squidpy~=1.5.0",
-    "clustergrammer2==0.18.0",
     "shapely~=2.0.5",
     "polars~=1.10.0",
     "libpysal~=4.8.1"
diff --git a/src/celldega/clust/__init__.py b/src/celldega/clust/__init__.py
index d768760a..a9b5b2f8 100644
--- a/src/celldega/clust/__init__.py
+++ b/src/celldega/clust/__init__.py
@@ -1,3 +1,7 @@
+"""
+Module for clustering high-dimensional data.
+"""
+
 # The Celldega Matrix Visualization Method is being built using the approaches
 # and code adaptations from the Clustergrammer-GL library, which is available at
 # github.com/ismms-himc/clustergrammer2
@@ -54,6 +58,27 @@ import statsmodels.stats.multitest as smm
 
 def hc(df, filter_N_top=None, norm_col='total', norm_row='zscore'):
+
+    """
+    Perform hierarchical clustering on the rows and columns of a DataFrame
+    and return the visualization JSON for the Celldega Matrix widget.
+    This function adapts approaches and code from the
+    Clustergrammer2 project.
+
+    Args:
+        df (pd.DataFrame): The matrix to cluster.
+        filter_N_top (int): If not None, keep only the top N rows when filtering.
+        norm_col (str): The type of normalization to apply to the columns.
+        norm_row (str): The type of normalization to apply to the rows.
+
+    Returns:
+        dict: The visualization JSON for the Celldega Matrix widget.
+
+
+    Example:
+        See the [Landscape-Matrix_Xenium](../../../examples/brief_notebooks/Landscape-Matrix_Xenium) example notebook.
+    """
+
     net = Network()
 
     net.load_df(df)
diff --git a/src/celldega/nbhd/__init__.py b/src/celldega/nbhd/__init__.py
index 3b818674..26f9c420 100644
--- a/src/celldega/nbhd/__init__.py
+++ b/src/celldega/nbhd/__init__.py
@@ -1,3 +1,7 @@
+"""
+Module for performing neighborhood analysis.
+"""
+
 from libpysal.cg import alpha_shape as libpysal_alpha_shape
 import geopandas as gpd
 from shapely import Point, MultiPoint, MultiPolygon

From b24b6feffd99c144815719ca376a4bfd21523f21 Mon Sep 17 00:00:00 2001
From: Nicolas Fernandez
Date: Wed, 8 Jan 2025 14:55:42 -0500
Subject: [PATCH 6/6] removed unnecessary changes

---
 js/viz/matrix_viz.js | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/js/viz/matrix_viz.js b/js/viz/matrix_viz.js
index aaab9214..e577ae5f 100644
--- a/js/viz/matrix_viz.js
+++ b/js/viz/matrix_viz.js
@@ -128,14 +128,4 @@ export const matrix_viz = async (
 
     el.appendChild(viz_state.root)
 
-    const tooltipContainer = viz_state.root.querySelector('.deck-tooltip');
-
-
-    console.log(viz_state.root)
-    console.log(tooltipContainer)
-
-    // select canvas element within viz_state.root
-    const canvas = viz_state.root.querySelector('canvas')
-    console.log(canvas)
-
 }
\ No newline at end of file
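
To make the new API surface concrete, below is a minimal end-to-end sketch of how the pieces introduced in this series fit together: `hc` from `celldega.clust` produces the visualization JSON, the `Matrix` widget syncs it through its `network` traitlet, and `landscape_matrix` links and displays the two widgets. The toy DataFrame and the argument-less `Landscape()` call are hypothetical placeholders, not part of the patch; real usage loads a dataset as shown in the Landscape-Matrix_Xenium notebook.

    import pandas as pd

    from celldega.clust import hc
    from celldega.viz import Landscape, Matrix, landscape_matrix

    # toy gene-by-cell count matrix (placeholder data, any numeric DataFrame works)
    df = pd.DataFrame(
        [[5, 0, 2], [1, 3, 0], [0, 4, 4]],
        index=['gene-A', 'gene-B', 'gene-C'],
        columns=['cell-1', 'cell-2', 'cell-3'],
    )

    # total-count normalize columns, z-score rows, hierarchically cluster,
    # and return the visualization JSON
    network = hc(df)

    # pass the visualization JSON to the Matrix widget's `network` traitlet
    mat = Matrix(network=network)

    # Landscape construction is dataset-specific; arguments are omitted here
    landscape = Landscape()

    # jslink `click_info` -> `update_trigger` and display the widgets side by side
    landscape_matrix(landscape, mat)

Passing the dict returned by `hc` straight into `Matrix(network=...)` relies on standard ipywidgets trait-kwarg construction; `landscape_matrix` then wires clicks in the matrix to updates in the landscape without any Python round trip, since `jslink` runs entirely in the front end.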