From 0dfd68845979415306c4054bb854051ee9ef5ffe Mon Sep 17 00:00:00 2001
From: Nicolas Fernandez
Date: Tue, 7 Jan 2025 16:46:17 -0500
Subject: [PATCH 1/6] bringing in Clustergrammer-PY back-end

---
 src/celldega/__init__.py                 |    2 +
 src/celldega/clust/__init__.py           | 1504 ++++++++++++++++++++++
 src/celldega/clust/calc_clust.py         |  183 +++
 src/celldega/clust/cat_pval.py           |   98 ++
 src/celldega/clust/categories.py         |  346 +++++
 src/celldega/clust/data_formats.py       |  134 ++
 src/celldega/clust/downsample_fun.py     |  280 ++++
 src/celldega/clust/enrichr_functions.py  |  376 ++++++
 src/celldega/clust/export_data.py        |   57 +
 src/celldega/clust/iframe_web_app.py     |   32 +
 src/celldega/clust/initialize_net.py     |   97 ++
 src/celldega/clust/load_data.py          |   97 ++
 src/celldega/clust/load_vect_post.py     |   46 +
 src/celldega/clust/make_clust_fun.py     |   75 ++
 src/celldega/clust/make_sim_mat.py       |   71 +
 src/celldega/clust/make_unique_labels.py |   88 ++
 src/celldega/clust/make_viz.py           |   72 ++
 src/celldega/clust/normalize_fun.py      |  165 +++
 src/celldega/clust/proc_df_labels.py     |   62 +
 src/celldega/clust/run_filter.py         |  209 +++
 20 files changed, 3994 insertions(+)
 create mode 100644 src/celldega/clust/__init__.py
 create mode 100644 src/celldega/clust/calc_clust.py
 create mode 100644 src/celldega/clust/cat_pval.py
 create mode 100644 src/celldega/clust/categories.py
 create mode 100644 src/celldega/clust/data_formats.py
 create mode 100644 src/celldega/clust/downsample_fun.py
 create mode 100644 src/celldega/clust/enrichr_functions.py
 create mode 100644 src/celldega/clust/export_data.py
 create mode 100644 src/celldega/clust/iframe_web_app.py
 create mode 100644 src/celldega/clust/initialize_net.py
 create mode 100644 src/celldega/clust/load_data.py
 create mode 100644 src/celldega/clust/load_vect_post.py
 create mode 100644 src/celldega/clust/make_clust_fun.py
 create mode 100644 src/celldega/clust/make_sim_mat.py
 create mode 100644 src/celldega/clust/make_unique_labels.py
 create mode 100644 src/celldega/clust/make_viz.py
 create mode 100644 src/celldega/clust/normalize_fun.py
 create mode 100644 src/celldega/clust/proc_df_labels.py
 create mode 100644 src/celldega/clust/run_filter.py

diff --git a/src/celldega/__init__.py b/src/celldega/__init__.py
index d051f6ba..9bb3ac92 100644
--- a/src/celldega/__init__.py
+++ b/src/celldega/__init__.py
@@ -4,6 +4,8 @@
 from celldega.pre import landscape
 from celldega.nbhd import alpha_shape
 
+from celldega.clust import Network
+
 # temporary fix for libpysal warning
 import warnings
 warnings.filterwarnings("ignore", category=FutureWarning)
diff --git a/src/celldega/clust/__init__.py b/src/celldega/clust/__init__.py
new file mode 100644
index 00000000..28550be1
--- /dev/null
+++ b/src/celldega/clust/__init__.py
@@ -0,0 +1,1504 @@
+import numpy as np
+import pandas as pd
+from copy import deepcopy
+
+from . import initialize_net
+from . import load_data
+from . import export_data
+from . import load_vect_post
+from . import make_clust_fun
+from . import normalize_fun
+from . import data_formats
+from . import enrichr_functions as enr_fun
+from . import iframe_web_app
+from . import run_filter
+from . import downsample_fun
+from . import categories
+
+from scipy.stats import ttest_ind, mannwhitneyu
+from sklearn.metrics import pairwise_distances, roc_curve, auc
+from scipy.spatial.distance import pdist
+from sklearn.metrics import confusion_matrix
+import random
+from itertools import combinations
+import matplotlib.pyplot as plt
+import json
+import ipywidgets as widgets
+import statsmodels.stats.multitest as smm
+
+def cluster():
+    net = Network()
+
+    print('**********')
+
+class Network(object):
+    '''
+    Clustergrammer.py takes a matrix as input (either from a file or a Pandas
+    DataFrame), normalizes/filters, hierarchically clusters, and produces the
+    :ref:`visualization_json` for :ref:`clustergrammer_js`.
+
+    Networks have two states:
+
+      1. the data state, where they are stored as a matrix and nodes
+      2. the viz state, where they are stored as viz.links, viz.row_nodes, and
+         viz.col_nodes.
+
+    The goal is to start in a data-state and produce a viz-state of
+    the network that will be used as input to clustergram.js.
+    '''
+
+    def __init__(self, widget=None):
+        initialize_net.main(self, widget)
+
+    def reset(self):
+        '''
+        This re-initializes the Network object.
+        '''
+        initialize_net.main(self)
+
+    def load_file(self, filename):
+        '''
+        Load TSV file.
+        '''
+        load_data.load_file(self, filename)
+
+    def load_file_as_string(self, file_string, filename=''):
+        '''
+        Load file as a string.
+        '''
+        load_data.load_file_as_string(self, file_string, filename=filename)
+
+    def load_stdin(self):
+        '''
+        Load stdin TSV-formatted string.
+        '''
+        load_data.load_stdin(self)
+
+    def load_tsv_to_net(self, file_buffer, filename=None):
+        '''
+        This will load a TSV matrix file buffer; this is exposed so that it will
+        be possible to load data without having to read from a file.
+        '''
+        load_data.load_tsv_to_net(self, file_buffer, filename)
+
+    def load_vect_post_to_net(self, vect_post):
+        '''
+        Load data in the vector format JSON.
+        '''
+        load_vect_post.main(self, vect_post)
+
+    def load_data_file_to_net(self, filename):
+        '''
+        Load Clustergrammer's dat format (saved as JSON).
+        '''
+        inst_dat = self.load_json_to_dict(filename)
+        load_data.load_data_to_net(self, inst_dat)
+
+    def cluster(self, dist_type='cosine', run_clustering=True,
+                dendro=True, views=[],
+                linkage_type='average', sim_mat=False, filter_sim=0.0,
+                calc_cat_pval=False, run_enrichr=None, enrichrgram=None,
+                clust_library='scipy', min_samples=1, min_cluster_size=2):
+        '''
+        The main function performs hierarchical clustering, optionally generates
+        filtered views (e.g. row-filtered views), and generates the
+        :ref:`visualization_json`.
+
+        Note: ``views`` was formerly set to ``['N_row_sum', 'N_row_var']`` by
+        default.
+        '''
+        initialize_net.viz(self)
+
+        make_clust_fun.make_clust(self, dist_type=dist_type,
+                                  run_clustering=run_clustering,
+                                  dendro=dendro,
+                                  requested_views=views,
+                                  linkage_type=linkage_type,
+                                  sim_mat=sim_mat,
+                                  filter_sim=filter_sim,
+                                  calc_cat_pval=calc_cat_pval,
+                                  run_enrichr=run_enrichr,
+                                  enrichrgram=enrichrgram,
+                                  clust_library=clust_library,
+                                  min_samples=min_samples,
+                                  min_cluster_size=min_cluster_size)
+
+    def swap_nan_for_zero(self):
+        '''
+        Swaps all NaN (numpy NaN) instances for zero.
+        '''
+        self.dat['mat'][np.isnan(self.dat['mat'])] = 0
+
+    def load_df(self, df_ini, meta_col=None, meta_row=None, col_cats=None,
+                row_cats=None, is_downsampled=False, meta_ds_row=None,
+                meta_ds_col=None):
+        '''
+        Load Pandas DataFrame.
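+
+        A minimal usage sketch (the metadata and category names here are
+        illustrative, not fixed by the API)::
+
+            meta_col = pd.DataFrame({'Cell Type': cell_types}, index=df.columns)
+            net.load_df(df, meta_col=meta_col, col_cats=['Cell Type'])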
+        '''
+        self.reset()
+
+        # load dataframe
+        df = deepcopy(df_ini)
+
+        if is_downsampled:
+            if meta_ds_col is not None:
+                self.meta_ds_col = meta_ds_col
+            if meta_ds_row is not None:
+                self.meta_ds_row = meta_ds_row
+
+        # define downsampled status
+        self.is_downsampled = is_downsampled
+
+        if hasattr(self, 'meta_col') == False and hasattr(self, 'meta_row') == False:
+            self.meta_cat = False
+
+        # load metadata
+        if isinstance(meta_col, pd.DataFrame):
+            self.meta_col = meta_col
+
+            if col_cats is None:
+                self.col_cats = meta_col.columns.tolist()
+            else:
+                self.col_cats = col_cats
+
+            self.meta_cat = True
+
+        if isinstance(meta_row, pd.DataFrame):
+            self.meta_row = meta_row
+
+            if row_cats is None:
+                self.row_cats = meta_row.columns.tolist()
+            else:
+                self.row_cats = row_cats
+
+            self.meta_cat = True
+
+        data_formats.df_to_dat(self, df, define_cat_colors=True)
+
+    def export_df(self):
+        '''
+        Export Pandas DataFrame.
+        '''
+        df = data_formats.dat_to_df(self)
+
+        # drop tuple categories if downsampling
+        if self.is_downsampled:
+            df.columns = self.dat['nodes']['col']
+            df.index = self.dat['nodes']['row']
+
+        return df
+
+    def df_to_dat(self, df, define_cat_colors=False):
+        '''
+        Load Pandas DataFrame (will be deprecated).
+        '''
+        data_formats.df_to_dat(self, df, define_cat_colors)
+
+    def set_matrix_colors(self, pos='red', neg='blue'):
+
+        self.viz['matrix_colors'] = {}
+        self.viz['matrix_colors']['pos'] = pos
+        self.viz['matrix_colors']['neg'] = neg
+
+    def set_global_cat_colors(self, df_meta):
+
+        for inst_name in df_meta.index.tolist():
+            inst_color = df_meta.loc[inst_name, 'color']
+
+            self.viz['global_cat_colors'][inst_name] = inst_color
+
+    def set_cat_color(self, axis, cat_index, cat_name, inst_color):
+
+        if axis == 0:
+            axis = 'row'
+        if axis == 1:
+            axis = 'col'
+
+        try:
+            # process cat_index
+            cat_index = cat_index - 1
+            cat_index = 'cat-' + str(cat_index)
+
+            self.viz['cat_colors'][axis][cat_index][cat_name] = inst_color
+
+        except:
+            print('there was an error setting the category color')
+
+    def dat_to_df(self):
+        '''
+        Export Pandas DataFrames (will be deprecated).
+        '''
+        return data_formats.dat_to_df(self)
+
+    def export_net_json(self, net_type='viz', indent='no-indent'):
+        '''
+        Export dat or viz JSON.
+        '''
+        return export_data.export_net_json(self, net_type, indent)
+
+    def export_viz_to_widget(self, which_viz='viz'):
+        '''
+        Export viz JSON for use with clustergrammer_widget. This method was
+        formerly named ``widget``.
+        '''
+        return export_data.export_net_json(self, which_viz, 'no-indent')
+
+    def widget(self, which_viz='viz', link_net=None, link_net_js=None, clust_library='scipy',
+               min_samples=1, min_cluster_size=2):
+        '''
+        Generate a widget visualization. The export_viz_to_widget
+        method passes the visualization JSON to the instantiated widget, which is
+        returned and visualized on the front-end.
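+
+        A minimal sketch (assumes the Network was instantiated with a widget
+        class, e.g. ``Network(CGM2)`` when using clustergrammer2)::
+
+            net.load_df(df)
+            net.widget()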
+ ''' + # run clustering if necessary + if len(self.viz['row_nodes']) == 0: + self.cluster(clust_library=clust_library, min_samples=min_samples, + min_cluster_size=min_cluster_size) + + # add manual_category to viz json + if 'manual_category' in self.dat: + self.viz['manual_category'] = self.dat['manual_category'] + + # add pre-z-score data to viz + if 'pre_zscore' in self.dat: + self.viz['pre_zscore'] = self.dat['pre_zscore'] + + self.widget_instance = self.widget_class(network = self.export_viz_to_widget(which_viz)) + + # initialize manual category + if 'manual_category' in self.dat: + manual_cat = {} + axis = 'col' + manual_cat[axis] = {} + manual_cat[axis]['col_cat_colors'] = self.viz['cat_colors'][axis]['cat-0'] + + man_cat_name = self.dat['manual_category'][axis] + if axis == 'col': + manual_cat[axis][man_cat_name] = self.meta_col[man_cat_name].to_dict() + + self.widget_instance.manual_cat = json.dumps(manual_cat) + + self.widget_instance.observe(self.get_manual_category, names='manual_cat') + + # add link (python) + if link_net is not None: + inst_link = widgets.link( + (self.widget_instance, 'manual_cat'), + (link_net.widget_instance, 'manual_cat') + ) + self.widget_instance.link = inst_link + + # add jslink (JavaScript) + if link_net_js is not None: + + inst_link = widgets.jslink( + (self.widget_instance, 'manual_cat'), + (link_net_js.widget_instance, 'manual_cat') + ) + self.widget_instance.link = inst_link + + return self.widget_instance + + + def widget_df(self): + ''' + Export a DataFrame from the front-end visualization. For instance, a user + can filter to show only a single cluster using the dendrogram and then + get a dataframe of this cluster using the widget_df method. + ''' + + if hasattr(self, 'widget_instance') == True: + + if self.widget_instance.mat_string != '': + + tmp_net = deepcopy(Network()) + + df_string = self.widget_instance.mat_string + + tmp_net.load_file_as_string(df_string) + + df = tmp_net.export_df() + + return df + + else: + return self.export_df() + + else: + if hasattr(self, 'widget_class') == True: + print('Please make the widget before exporting the widget DataFrame.') + print('Do this using the widget method: net.widget()') + + else: + print('Can not make widget because Network has no attribute widget_class') + print('Please instantiate Network with clustergrammer_widget using: Network(clustergrammer_widget)') + + def write_json_to_file(self, net_type, filename, indent='no-indent'): + ''' + Save dat or viz as a JSON to file. + ''' + export_data.write_json_to_file(self, net_type, filename, indent) + + def write_matrix_to_tsv(self, filename=None, df=None): + ''' + Export data-matrix to file. + ''' + return export_data.write_matrix_to_tsv(self, filename, df) + + def filter_sum(self, threshold, take_abs=True, axis=None, inst_rc=None): + ''' + Filter a network's rows or columns based on the sum across rows or columns. + ''' + + if axis is None: + axis = inst_rc + print('warning inst_rc argument will be deprecated, please use axis') + + if inst_rc is None: + print('please provide axis argument') + + inst_df = self.dat_to_df() + if axis == 'row': + inst_df = run_filter.df_filter_row_sum(inst_df, threshold, take_abs) + elif axis == 'col': + inst_df = run_filter.df_filter_col_sum(inst_df, threshold, take_abs) + self.df_to_dat(inst_df) + + def filter_N_top(self, N_top, rank_type='sum', inst_rc=None, axis=None): + ''' + Filter the matrix rows or columns based on sum/variance, and only keep the top + N. 
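+
+        Illustrative example (the cutoff value is arbitrary)::
+
+            # keep the 250 most variable rows
+            net.filter_N_top(250, rank_type='var', axis='row')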
+        '''
+
+        if axis is None:
+            axis = inst_rc
+            print('warning inst_rc argument will be deprecated, please use axis')
+
+        if inst_rc is None:
+            print('please provide axis argument')
+
+        inst_df = self.dat_to_df()
+
+        inst_df = run_filter.filter_N_top(axis, inst_df, N_top, rank_type)
+
+        self.df_to_dat(inst_df)
+
+    def filter_threshold(self, threshold, num_occur=1, inst_rc=None, axis=None):
+        '''
+        Filter the matrix rows or columns based on num_occur values being above a
+        threshold (in absolute value).
+        '''
+
+        if axis is None:
+            axis = inst_rc
+            print('warning inst_rc argument will be deprecated, please use axis')
+
+        if inst_rc is None:
+            print('please provide axis argument')
+
+        inst_df = self.dat_to_df()
+
+        inst_df = run_filter.filter_threshold(inst_df, axis, threshold,
+                                              num_occur)
+
+        self.df_to_dat(inst_df)
+
+    def filter_cat(self, axis, cat_index, cat_name):
+        '''
+        Filter the matrix rows or columns based on their category. cat_index is
+        the index of the category; the first category has index=1.
+        '''
+        run_filter.filter_cat(self, axis, cat_index, cat_name)
+
+    def filter_names(self, axis, names):
+        '''
+        Filter the visualization using row/column names. The function takes axis
+        ('row'/'col') and names (a list of strings).
+        '''
+        run_filter.filter_names(self, axis, names)
+
+    def clip(self, lower=None, upper=None):
+        '''
+        Trim values at input thresholds using the pandas clip function.
+        '''
+        df = self.export_df()
+        df = df.clip(lower=lower, upper=upper)
+        self.load_df(df)
+
+    def normalize(self, df=None, norm_type='zscore', axis='row', z_clip=None):
+        '''
+        Normalize the matrix rows or columns using Z-score (zscore) or Quantile
+        Normalization (qn). Users can optionally pass in a DataFrame to be
+        normalized (and this will be incorporated into the Network object).
+        '''
+        normalize_fun.run_norm(self, df, norm_type, axis, z_clip)
+
+    def downsample(self, df=None, ds_type='kmeans', axis='row', num_samples=100,
+                   random_state=1000, ds_name='Downsample',
+                   ds_cluster_name='cluster'):
+        '''
+        Downsample the matrix rows or columns (currently supporting kmeans only).
+        Users can optionally pass in a DataFrame to be downsampled (and this will
+        be incorporated into the network object).
+        '''
+        return downsample_fun.main(self, df, ds_type, axis, num_samples, random_state, ds_name, ds_cluster_name)
+
+    def random_sample(self, num_samples, df=None, replace=False, weights=None, random_state=100, axis='row'):
+        '''
+        Return a random sample of the matrix.
+        '''
+
+        if df is None:
+            df = self.export_df()
+
+        if axis == 'row':
+            axis = 0
+        if axis == 'col':
+            axis = 1
+
+        df = df.sample(n=num_samples, replace=replace, weights=weights, random_state=random_state, axis=axis)
+
+        self.load_df(df)
+
+    def add_cats(self, axis, cat_data):
+        '''
+        Add categories to rows or columns using a cat_data array of objects. Each
+        object in cat_data is a dictionary with a category title ('title') and the
+        row/column names ('cats') that belong to this category. Categories will be
+        added onto the existing categories, in the order of the objects in the array.
+
+        Example ``cat_data``::
+
+            [
+              {
+                "title": "First Category",
+                "cats": {
+                  "true": [
+                    "ROS1",
+                    "AAK1"
+                  ]
+                }
+              },
+              {
+                "title": "Second Category",
+                "cats": {
+                  "something": [
+                    "PDK4"
+                  ]
+                }
+              }
+            ]
+        '''
+        for inst_data in cat_data:
+            categories.add_cats(self, axis, inst_data)
+
+    def Iframe_web_app(self, filename=None, width=1000, height=800):
+
+        link = iframe_web_app.main(self, filename, width, height)
+
+        return link
+
+    def enrichrgram(self, lib, axis='row'):
+        '''
+        Add Enrichr gene enrichment results to your visualization (where your rows
+        are genes). Run enrichrgram before clustering to include enrichment results
+        as row categories. Enrichrgram can also be run on the front-end using the
+        Enrichr logo at the top left.
+
+        Set lib to the Enrichr library that you want to use for enrichment analysis.
+        Libraries included:
+
+          * ChEA_2016
+          * KEA_2015
+          * ENCODE_TF_ChIP-seq_2015
+          * ENCODE_Histone_Modifications_2015
+          * Disease_Perturbations_from_GEO_up
+          * Disease_Perturbations_from_GEO_down
+          * GO_Molecular_Function_2015
+          * GO_Biological_Process_2015
+          * GO_Cellular_Component_2015
+          * Reactome_2016
+          * KEGG_2016
+          * MGI_Mammalian_Phenotype_Level_4
+          * LINCS_L1000_Chem_Pert_up
+          * LINCS_L1000_Chem_Pert_down
+        '''
+
+        df = self.export_df()
+        df, bar_info = enr_fun.add_enrichr_cats(df, axis, lib)
+        self.load_df(df)
+
+        self.dat['enrichrgram_lib'] = lib
+        self.dat['row_cat_bars'] = bar_info
+
+    @staticmethod
+    def load_gmt(filename):
+        return load_data.load_gmt(filename)
+
+    @staticmethod
+    def load_json_to_dict(filename):
+        return load_data.load_json_to_dict(filename)
+
+    @staticmethod
+    def save_dict_to_json(inst_dict, filename, indent='no-indent'):
+        export_data.save_dict_to_json(inst_dict, filename, indent)
+
+    @staticmethod
+    def load_gene_exp_to_df(inst_path):
+        '''
+        Loads gene expression data from 10x in sparse matrix format and returns a
+        Pandas dataframe.
+        '''
+
+        import pandas as pd
+        from scipy import io
+        from scipy import sparse
+        from ast import literal_eval as make_tuple
+
+        # matrix
+        Matrix = io.mmread(inst_path + 'matrix.mtx')
+        mat = Matrix.todense()
+
+        # genes
+        filename = inst_path + 'genes.tsv'
+        f = open(filename, 'r')
+        lines = f.readlines()
+        f.close()
+
+        # add unique id only to duplicate genes
+        ini_genes = []
+        for inst_line in lines:
+            inst_line = inst_line.strip().split()
+            if len(inst_line) > 1:
+                inst_gene = inst_line[1]
+            else:
+                inst_gene = inst_line[0]
+            ini_genes.append(inst_gene)
+
+        gene_name_count = pd.Series(ini_genes).value_counts()
+        duplicate_genes = gene_name_count[gene_name_count > 1].index.tolist()
+
+        dup_index = {}
+        genes = []
+        for inst_row in ini_genes:
+
+            # add index to non-unique genes
+            if inst_row in duplicate_genes:
+
+                # calc non-unique index
+                if inst_row not in dup_index:
+                    dup_index[inst_row] = 1
+                else:
+                    dup_index[inst_row] = dup_index[inst_row] + 1
+
+                new_row = inst_row + '_' + str(dup_index[inst_row])
+
+            else:
+                new_row = inst_row
+
+            genes.append(new_row)
+
+        # barcodes
+        filename = inst_path + 'barcodes.tsv'
+        f = open(filename, 'r')
+        lines = f.readlines()
+        f.close()
+
+        cell_barcodes = []
+        for inst_bc in lines:
+            inst_bc = inst_bc.strip().split('\t')
+
+            # remove dash from barcodes if necessary
+            if '-' in inst_bc[0]:
+                inst_bc[0] = inst_bc[0].split('-')[0]
+
+            cell_barcodes.append(inst_bc[0])
+
+        # parse tuples if necessary
+        try:
+            cell_barcodes = [make_tuple(x) for x in cell_barcodes]
+        except:
+            pass
+
+        try:
+            genes = [make_tuple(x) for x in genes]
+        except:
+            pass
+
+        # make dataframe
+        df = pd.DataFrame(mat, index=genes, columns=cell_barcodes)
+
+        return df
+
+    @staticmethod
+    def save_gene_exp_to_mtx_dir(inst_path, df):
+
+        import os
+        from scipy import io
+        from scipy import sparse
+
+        if not os.path.exists(inst_path):
+            os.makedirs(inst_path)
+
+        genes = df.index.tolist()
+        barcodes = df.columns.tolist()
+
+        # qualify the staticmethod calls with the class name so they resolve
+        Network.save_list_to_tsv(genes, inst_path + 'genes.tsv')
+        Network.save_list_to_tsv(barcodes, inst_path + 'barcodes.tsv')
+
+        mat_ge = df.values
+        mat_ge_sparse = sparse.coo_matrix(mat_ge)
+
+        io.mmwrite(inst_path + 'matrix.mtx', mat_ge_sparse)
+
+    @staticmethod
+    def save_list_to_tsv(inst_list, filename):
+
+        f = open(filename, 'w')
+        for inst_line in inst_list:
+            f.write(str(inst_line) + '\n')
+
+        f.close()
+
+    def sim_same_and_diff_category_samples(self, df, cat_index=1, dist_type='cosine',
+                                           equal_var=False, plot_roc=True,
+                                           precalc_dist=False, calc_roc=True):
+        '''
+        Calculate the similarity of samples from the same and different categories.
+        The cat_index gives the index of the category, where 1 is the first category.
+        '''
+
+        cols = df.columns.tolist()
+
+        if type(precalc_dist) == bool:
+            # compute distance between rows (transpose to get cols as rows)
+            dist_arr = 1 - pdist(df.transpose(), metric=dist_type)
+        else:
+            dist_arr = precalc_dist
+
+        # generate sample names with categories
+        sample_combos = list(combinations(range(df.shape[1]), 2))
+
+        sample_names = [str(ind) + '_same' if cols[x[0]][cat_index] == cols[x[1]][cat_index] else str(ind) + '_different' for ind, x in enumerate(sample_combos)]
+
+        ser_dist = pd.Series(data=dist_arr, index=sample_names)
+
+        # find same-cat sample comparisons
+        same_cat = [x for x in sample_names if x.split('_')[1] == 'same']
+
+        # find diff-cat sample comparisons
+        diff_cat = [x for x in sample_names if x.split('_')[1] == 'different']
+
+        # make series of same and diff category sample comparisons
+        ser_same = ser_dist[same_cat]
+        ser_same.name = 'Same Category'
+        ser_diff = ser_dist[diff_cat]
+        ser_diff.name = 'Different Category'
+
+        sim_dict = {}
+        roc_data = {}
+        sim_data = {}
+
+        sim_dict['same'] = ser_same
+        sim_dict['diff'] = ser_diff
+
+        pval_dict = {}
+        ttest_stat, pval_dict['ttest'] = ttest_ind(ser_diff, ser_same, equal_var=equal_var)
+
+        mw_stat, pval_dict['mannwhitney'] = mannwhitneyu(ser_diff, ser_same)
+
+        if calc_roc:
+            # calc AUC
+            true_index = list(np.ones(sim_dict['same'].shape[0]))
+            false_index = list(np.zeros(sim_dict['diff'].shape[0]))
+            y_true = true_index + false_index
+
+            true_val = list(sim_dict['same'].values)
+            false_val = list(sim_dict['diff'].values)
+            y_score = true_val + false_val
+
+            fpr, tpr, thresholds = roc_curve(y_true, y_score)
+
+            inst_auc = auc(fpr, tpr)
+
+            if plot_roc:
+                plt.figure(figsize=(10, 10))
+                plt.plot(fpr, tpr)
+                plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
+
+                print('AUC', inst_auc)
+
+            roc_data['true'] = y_true
+            roc_data['score'] = y_score
+            roc_data['fpr'] = fpr
+            roc_data['tpr'] = tpr
+            roc_data['thresholds'] = thresholds
+            roc_data['auc'] = inst_auc
+
+        sim_data['sim_dict'] = sim_dict
+        sim_data['pval_dict'] = pval_dict
+        sim_data['roc_data'] = roc_data
+
+        return sim_data
+
+    def generate_signatures(self, df_data, df_meta, category_name, pval_cutoff=0.05,
+                            num_top_dims=False, verbose=True, equal_var=False):
+        '''
+        Generate signatures for column categories. A t-test is run for each
+        category in a one-vs-all manner. The num_top_dims argument overrides the
+        P-value cutoff.
+        '''
+
+        df_t = df_data.transpose()
+
+        # remove columns (dimensions) with constant values
+        orig_num_cols = df_t.shape[1]
+        df_t = df_t.loc[:, (df_t != df_t.iloc[0]).any()]
+        if df_t.shape[1] < orig_num_cols:
+            print('dropped columns with constant values')
+
+        # make tuple rows
+        df_t.index = [(x, df_meta.loc[x, category_name]) for x in df_t.index.tolist()]
+        category_level = 1
+
+        df = self.row_tuple_to_multiindex(df_t)
+
+        cell_types = sorted(list(set(df.index.get_level_values(category_level).tolist())))
+
+        keep_genes = []
+        keep_genes_dict = {}
+        gene_pval_dict = {}
+        all_fold_info = {}
+
+        for inst_ct in cell_types:
+
+            inst_ct_mat = df.xs(key=inst_ct, level=category_level)
+            inst_other_mat = df.drop(inst_ct, level=category_level)
+
+            # save mean values and fold change
+            fold_info = {}
+            fold_info['cluster_mean'] = inst_ct_mat.mean()
+            fold_info['other_mean'] = inst_other_mat.mean()
+            fold_info['log2_fold'] = fold_info['cluster_mean']/fold_info['other_mean']
+            fold_info['log2_fold'] = fold_info['log2_fold'].apply(np.log2)
+            all_fold_info[inst_ct] = fold_info
+
+            inst_stats, inst_pvals = ttest_ind(inst_ct_mat, inst_other_mat, axis=0, equal_var=equal_var)
+
+            ser_pval = pd.Series(data=inst_pvals, index=df.columns.tolist()).sort_values()
+
+            if num_top_dims == False:
+                ser_pval_keep = ser_pval[ser_pval < pval_cutoff]
+            else:
+                ser_pval_keep = ser_pval[:num_top_dims]
+
+            gene_pval_dict[inst_ct] = ser_pval_keep
+
+            inst_keep = ser_pval_keep.index.tolist()
+            keep_genes.extend(inst_keep)
+            keep_genes_dict[inst_ct] = inst_keep
+
+        keep_genes = sorted(list(set(keep_genes)))
+
+        df_gbm = df.groupby(level=category_level).mean().transpose()
+        cols = df_gbm.columns.tolist()
+
+        df_sig = df_gbm.loc[keep_genes]
+
+        if len(keep_genes) == 0 and verbose:
+            print('found no informative dimensions')
+
+        df_gene_pval = pd.concat(gene_pval_dict, axis=1, sort=False)
+
+        df_diff = {}
+        for inst_col in df_gene_pval:
+
+            inst_pvals = df_gene_pval[inst_col]
+
+            # nans represent dimensions that did not meet pval or top threshold
+            inst_pvals = inst_pvals.dropna().sort_values()
+
+            inst_genes = inst_pvals.index.tolist()
+
+            # prevent failure if no cells have this category
+            if inst_pvals.shape[0] > 0:
+                rej, pval_corr = smm.multipletests(inst_pvals, 0.05, method='fdr_bh')[:2]
+
+                ser_pval = pd.Series(data=inst_pvals, index=inst_genes, name='P-values')
+                ser_bh_pval = pd.Series(data=pval_corr, index=inst_genes, name='BH P-values')
+                ser_log2_fold = all_fold_info[inst_col]['log2_fold'].loc[inst_genes]
+                ser_log2_fold.name = 'Log2 Fold Change'
+                ser_cluster_mean = all_fold_info[inst_col]['cluster_mean'].loc[inst_genes]
+                ser_cluster_mean.name = 'Cluster Mean'
+                ser_other_mean = all_fold_info[inst_col]['other_mean'].loc[inst_genes]
+                ser_other_mean.name = 'All Other Mean'
+                inst_df = pd.concat([ser_pval, ser_bh_pval, ser_log2_fold, ser_cluster_mean, ser_other_mean], axis=1)
+
+                df_diff[inst_col] = inst_df
+
+        return df_sig, df_diff
+
+    def old_generate_signatures(self, df_ini, category_level, pval_cutoff=0.05,
+                                num_top_dims=False, verbose=True, equal_var=False):
+
+        ''' Generate signatures for column categories '''
+
+        # change this to use category name and set category level to 1 always
+        ###################################
+
+        df_t = df_ini.transpose()
+
+        # remove 
columns with constant values + df_t = df_t.loc[:, (df_t != df_t.iloc[0]).any()] + + df = self.row_tuple_to_multiindex(df_t) + + cell_types = sorted(list(set(df.index.get_level_values(category_level).tolist()))) + + keep_genes = [] + keep_genes_dict = {} + gene_pval_dict = {} + all_fold_info = {} + + for inst_ct in cell_types: + + inst_ct_mat = df.xs(key=inst_ct, level=category_level) + inst_other_mat = df.drop(inst_ct, level=category_level) + + # save mean values and fold change + fold_info = {} + fold_info['cluster_mean'] = inst_ct_mat.mean() + fold_info['other_mean'] = inst_other_mat.mean() + fold_info['log2_fold'] = fold_info['cluster_mean']/fold_info['other_mean'] + fold_info['log2_fold'] = fold_info['log2_fold'].apply(np.log2) + all_fold_info[inst_ct] = fold_info + + inst_stats, inst_pvals = ttest_ind(inst_ct_mat, inst_other_mat, axis=0, equal_var=equal_var) + + ser_pval = pd.Series(data=inst_pvals, index=df.columns.tolist()).sort_values() + + if num_top_dims == False: + ser_pval_keep = ser_pval[ser_pval < pval_cutoff] + else: + ser_pval_keep = ser_pval[:num_top_dims] + + gene_pval_dict[inst_ct] = ser_pval_keep + + inst_keep = ser_pval_keep.index.tolist() + keep_genes.extend(inst_keep) + keep_genes_dict[inst_ct] = inst_keep + + keep_genes = sorted(list(set(keep_genes))) + + df_gbm = df.groupby(level=category_level).mean().transpose() + cols = df_gbm.columns.tolist() + new_cols = [] + for inst_col in cols: + new_col = (inst_col, category_level + ': ' + inst_col) + new_cols.append(new_col) + df_gbm.columns = new_cols + + df_sig = df_gbm.loc[keep_genes] + + if len(keep_genes) == 0 and verbose: + print('found no informative dimensions') + + df_gene_pval = pd.concat(gene_pval_dict, axis=1, sort=False) + + return df_sig, df_gene_pval, all_fold_info + + def predict_cats_from_sigs(self, df_data_ini, df_meta, df_sig_ini, + predict='Predicted Category', dist_type='cosine', + unknown_thresh=-1): + ''' Predict category using signature ''' + + keep_rows = df_sig_ini.index.tolist() + data_rows = df_data_ini.index.tolist() + + common_rows = list(set(data_rows).intersection(keep_rows)) + + df_data = deepcopy(df_data_ini.loc[common_rows]) + df_sig = deepcopy(df_sig_ini.loc[common_rows]) + + # calculate sim_mat of df_data and df_sig + cell_types = df_sig.columns.tolist() + barcodes = df_data.columns.tolist() + sim_mat = 1 - pairwise_distances(df_sig.transpose(), df_data.transpose(), metric=dist_type) + df_sim = pd.DataFrame(data=sim_mat, index=cell_types, columns=barcodes).transpose() + + # get the top column value (most similar signature) + df_sim_top = df_sim.idxmax(axis=1) + + # get the maximum similarity of a cell to a cell type definition + max_sim = df_sim.max(axis=1) + + unknown_cells = max_sim[max_sim < unknown_thresh].index.tolist() + + # assign unknown cells (need category of same name) + df_sim_top[unknown_cells] = 'Unknown' + + # add predicted category name to top list + # top_list = df_sim_top.values + df_sim = df_sim.transpose() + + df_meta[predict] = df_sim_top + + return df_sim, df_meta + + + def old_predict_cats_from_sigs(self, df_data_ini, df_sig_ini, dist_type='cosine', predict_level='Predict Category', + truth_level=1, unknown_thresh=-1): + ''' Predict category using signature ''' + + keep_rows = df_sig_ini.index.tolist() + data_rows = df_data_ini.index.tolist() + + common_rows = list(set(data_rows).intersection(keep_rows)) + + df_data = deepcopy(df_data_ini.loc[common_rows]) + df_sig = deepcopy(df_sig_ini.loc[common_rows]) + + # calculate sim_mat of df_data and df_sig + cell_types 
= df_sig.columns.tolist()
+        barcodes = df_data.columns.tolist()
+        sim_mat = 1 - pairwise_distances(df_sig.transpose(), df_data.transpose(), metric=dist_type)
+        df_sim = pd.DataFrame(data=sim_mat, index=cell_types, columns=barcodes).transpose()
+
+        # get the top column value (most similar signature)
+        df_sim_top = df_sim.idxmax(axis=1)
+
+        # get the maximum similarity of a cell to a cell type definition
+        max_sim = df_sim.max(axis=1)
+
+        unknown_cells = max_sim[max_sim < unknown_thresh].index.tolist()
+
+        # assign unknown cells (need category of same name)
+        df_sim_top[unknown_cells] = 'Unknown'
+
+        # add predicted category name to top list
+        top_list = df_sim_top.values
+        top_list = [predict_level + ': ' + x[0] if type(x) is tuple else predict_level + ': ' + x for x in top_list]
+
+        # add cell type category to input data
+        df_cat = deepcopy(df_data)
+        cols = df_cat.columns.tolist()
+        new_cols = []
+
+        # check whether the columns have the true category available
+        has_truth = False
+        if type(cols[0]) is tuple:
+            has_truth = True
+
+        if has_truth:
+            new_cols = [tuple(list(a) + [b]) for a, b in zip(cols, top_list)]
+        else:
+            new_cols = [tuple([a] + [b]) for a, b in zip(cols, top_list)]
+
+        # transfer new categories
+        df_cat.columns = new_cols
+
+        # keep track of true and predicted labels
+        y_info = {}
+        y_info['true'] = []
+        y_info['pred'] = []
+
+        if has_truth:
+            y_info['true'] = [x[truth_level].split(': ')[1] for x in cols]
+            y_info['pred'] = [x.split(': ')[1] for x in top_list]
+
+        return df_cat, df_sim.transpose(), y_info
+
+    def assess_prediction(self, df_meta, truth, pred):
+        ''' Generate confusion matrix from y_info '''
+
+        y_info = {}
+        y_info['true'] = df_meta[truth].values.tolist()
+        y_info['pred'] = df_meta[pred].values.tolist()
+
+        sorted_cats = sorted(list(set(y_info['true'] + y_info['pred'])))
+        conf_mat = confusion_matrix(y_info['true'], y_info['pred'], labels=sorted_cats)
+
+        # true columns and pred rows
+        df_conf = pd.DataFrame(conf_mat, index=sorted_cats, columns=sorted_cats).transpose()
+
+        total_correct = np.trace(df_conf)
+        total_pred = df_conf.sum().sum()
+        fraction_correct = total_correct/float(total_pred)
+
+        # calculate ser_correct
+        correct_list = []
+        cat_counts = df_conf.sum(axis=0)
+        all_cols = df_conf.columns.tolist()
+        for inst_cat in all_cols:
+            inst_correct = df_conf.loc[inst_cat, inst_cat] / cat_counts[inst_cat]
+            correct_list.append(inst_correct)
+
+        ser_correct = pd.Series(data=correct_list, index=all_cols)
+
+        return df_conf, ser_correct, fraction_correct
+
+    def old_confusion_matrix_and_correct_series(self, y_info):
+        ''' Generate confusion matrix from y_info '''
+
+        a = deepcopy(y_info['true'])
+        true_count = dict((i, a.count(i)) for i in set(a))
+
+        a = deepcopy(y_info['pred'])
+        pred_count = dict((i, a.count(i)) for i in set(a))
+
+        sorted_cats = sorted(list(set(y_info['true'] + y_info['pred'])))
+        # labels is keyword-only in current scikit-learn
+        conf_mat = confusion_matrix(y_info['true'], y_info['pred'], labels=sorted_cats)
+        df_conf = pd.DataFrame(conf_mat, index=sorted_cats, columns=sorted_cats)
+
+        total_correct = np.trace(df_conf)
+        total_pred = df_conf.sum().sum()
+        fraction_correct = total_correct/float(total_pred)
+
+        # calculate ser_correct
+        correct_list = []
+        cat_counts = df_conf.sum(axis=1)
+        all_cols = df_conf.columns.tolist()
+        for inst_cat in all_cols:
+            inst_correct = df_conf[inst_cat].loc[inst_cat] / cat_counts[inst_cat]
+            correct_list.append(inst_correct)
+
+        ser_correct = pd.Series(data=correct_list, index=all_cols)
+
+        populations = {}
+        populations['true'] = true_count
+        populations['pred'] = pred_count
+
+        return df_conf, populations, ser_correct, fraction_correct
+
+    def compare_performance_to_shuffled_labels(self, df_data, category_level, num_shuffles=100,
+                                               random_seed=99, pval_cutoff=0.05, dist_type='cosine',
+                                               num_top_dims=False, predict_level='Predict Category',
+                                               truth_level=1, unknown_thresh=-1, equal_var=False,
+                                               performance_type='prediction'):
+        random.seed(random_seed)
+
+        perform_list = []
+
+        # pre-calculate the distance matrix (similarity matrix) if necessary
+        if performance_type == 'cat_sim_auc':
+            dist_arr = 1 - pdist(df_data.transpose(), metric=dist_type)
+
+        for inst_run in range(num_shuffles + 1):
+
+            cols = df_data.columns.tolist()
+            rows = df_data.index.tolist()
+            mat = df_data.values
+
+            shuffled_cols = deepcopy(cols)
+            random.shuffle(shuffled_cols)
+
+            # do not perform shuffling the first time to confirm that we get the same
+            # results as the unshuffled dataset
+            if inst_run == 0:
+                df_shuffle = deepcopy(df_data)
+            else:
+                df_shuffle = pd.DataFrame(data=mat, columns=shuffled_cols, index=rows)
+
+            # generate signature on shuffled data
+            df_sig, df_gene_pval, all_fold_info = self.old_generate_signatures(df_shuffle,
+                                                      category_level,
+                                                      pval_cutoff=pval_cutoff,
+                                                      num_top_dims=num_top_dims,
+                                                      equal_var=equal_var)
+
+            # predictive performance
+            if performance_type == 'prediction':
+
+                # predict categories from signature
+                df_pred_cat, df_sig_sim, y_info = self.old_predict_cats_from_sigs(df_shuffle, df_sig,
+                    dist_type=dist_type, predict_level=predict_level, truth_level=truth_level,
+                    unknown_thresh=unknown_thresh)
+
+                # calc confusion matrix and performance
+                df_conf, populations, ser_correct, fraction_correct = self.old_confusion_matrix_and_correct_series(y_info)
+
+                # store performances of shuffles
+                if inst_run > 0:
+                    perform_list.append(fraction_correct)
+                else:
+                    real_performance = fraction_correct
+                    print('performance (fraction correct) of unshuffled: ' + str(fraction_correct))
+
+            elif performance_type == 'cat_sim_auc':
+
+                # measure similarity of samples with the same category
+                sim_data = self.sim_same_and_diff_category_samples(df_shuffle,
+                    cat_index=1, plot_roc=False, equal_var=equal_var, precalc_dist=dist_arr)
+                roc_data = sim_data['roc_data']
+
+                # store performances of shuffles
+                if inst_run > 0:
+                    perform_list.append(roc_data['auc'])
+                else:
+                    real_performance = roc_data['auc']
+                    print('performance (category similarity auc) of unshuffled: ' + str(roc_data['auc']))
+
+        perform_ser = pd.Series(perform_list)
+
+        in_top_fraction = perform_ser[perform_ser > real_performance].shape[0]/num_shuffles
+        print('real data performs in the top ' + str(in_top_fraction*100) + '% of shuffled labels\n')
+
+        return perform_ser
+
+    @staticmethod
+    def box_scatter_plot(df, group, columns=False, rand_seed=100, alpha=0.5,
+                         dot_color='red', num_row=None, num_col=1, figsize=(10, 10),
+                         start_title='Variable Measurements Across', end_title='Groups',
+                         group_list=False):
+
+        from scipy import stats
+        import pandas as pd
+
+        import matplotlib.pyplot as plt
+        # %matplotlib inline
+
+        if columns == False:
+            columns = df.columns.tolist()
+
+        plt.figure(figsize=figsize)
+        figure_title = start_title + ' ' + group + ' ' + end_title
+        plt.suptitle(figure_title, fontsize=20)
+
+        # list of arranged dataframes
+        dfs = {}
+
+        for col_num in range(len(columns)):
+            column = columns[col_num]
+            plot_id = col_num + 1
+
+            # group by column name or multiIndex name
+            if group in df.columns.tolist():
+                grouped = df.groupby(group)
+            else:
+                grouped = df.groupby(level=group)
+
+ names, vals, xs = [], [] ,[] + + if type(column) is tuple: + column_title = column[0] + else: + column_title = column + + for i, (name, subdf) in enumerate(grouped): + + names.append(name) + + inst_ser = subdf[column] + + column_name = column_title + '-' + str(name) + + inst_ser.name = column_name + vals.append(inst_ser) + + np.random.seed(rand_seed) + xs.append(np.random.normal(i+1, 0.04, subdf.shape[0])) + + ax = plt.subplot(num_row, num_col, plot_id) + + plt.boxplot(vals, labels=names) + + ngroup = len(vals) + + clevels = np.linspace(0., 1., ngroup) + + for x, val, clevel in zip(xs, vals, clevels): + + plt.subplot(num_row, num_col, plot_id) + plt.scatter(x, val, c=dot_color, alpha=alpha) + + + df_arranged = pd.concat(vals, axis=1) + + # anova + anova_data = [df_arranged[col].dropna() for col in df_arranged] + f_val, pval = stats.f_oneway(*anova_data) + + if pval < 0.01: + ax.set_title(column_title + ' P-val: ' + '{:.2e}'.format(pval)) + else: + pval = round(pval * 100000)/100000 + ax.set_title(column_title + ' P-val: ' + str(pval)) + + dfs[column] = df_arranged + + return dfs + + @staticmethod + def rank_cols_by_anova_pval(df, group, columns=False, rand_seed=100, alpha=0.5, dot_color='red', num_row=None, num_col=1, + figsize=(10,10)): + + from scipy import stats + import numpy as np + import pandas as pd + + # import matplotlib.pyplot as plt + # %matplotlib inline + + if columns == False: + columns = df.columns.tolist() + + # plt.figure(figsize=figsize) + + # list of arranged dataframes + dfs = {} + + pval_list = [] + + for col_num in range(len(columns)): + column = columns[col_num] + plot_id = col_num + 1 + + # group by column name or multiIndex name + if group in df.columns.tolist(): + grouped = df.groupby(group) + else: + grouped = df.groupby(level=group) + + names, vals, xs = [], [] ,[] + + if type(column) is tuple: + column_title = column[0] + else: + column_title = column + + for i, (name, subdf) in enumerate(grouped): + names.append(name) + + inst_ser = subdf[column] + + column_name = column_title + '-' + str(name) + + inst_ser.name = column_name + vals.append(inst_ser) + + np.random.seed(rand_seed) + xs.append(np.random.normal(i+1, 0.04, subdf.shape[0])) + + + ngroup = len(vals) + + df_arranged = pd.concat(vals, axis=1) + + # anova + anova_data = [df_arranged[col].dropna() for col in df_arranged] + f_val, pval = stats.f_oneway(*anova_data) + + pval_list.append(pval) + + pval_ser = pd.Series(data=pval_list, index=columns) + pval_ser = pval_ser.sort_values(ascending=True) + + return pval_ser + + + def row_tuple_to_multiindex(self, df): + + import pandas as pd + + from copy import deepcopy + df_mi = deepcopy(df) + rows = df_mi.index.tolist() + titles = [] + for inst_part in rows[0]: + + if ': ' in inst_part: + inst_title = inst_part.split(': ')[0] + else: + inst_title = 'Name' + titles.append(inst_title) + + new_rows = [] + for inst_row in rows: + inst_row = list(inst_row) + new_row = [] + for inst_part in inst_row: + if ': ' in inst_part: + inst_part = inst_part.split(': ')[1] + new_row.append(inst_part) + new_row = tuple(new_row) + new_rows.append(new_row) + + df_mi.index = new_rows + + df_mi.index = pd.MultiIndex.from_tuples(df_mi.index, names=titles) + + return df_mi + + def set_cat_colors(self, cat_colors, axis, cat_index, cat_title=False): + for inst_ct in cat_colors: + if cat_title != False: + cat_name = cat_title + ': ' + inst_ct + else: + cat_name = inst_ct + + inst_color = cat_colors[inst_ct] + self.set_cat_color(axis=axis, cat_index=cat_index, cat_name=cat_name, 
inst_color=inst_color) + + def set_manual_category(self, col=None, row=None, preferred_cats=None): + ''' + This method is used to tell Clustergrammer2 that the user wants to define + a manual category interactively using the dendrogram. + ''' + + self.dat['manual_category'] = {} + self.dat['manual_category']['col'] = col + self.dat['manual_category']['row'] = row + + if preferred_cats is not None: + pref_cats = [] + for inst_row in preferred_cats.index.tolist(): + inst_dict = {} + inst_dict['name'] = inst_row + inst_dict['color'] = preferred_cats.loc[inst_row, 'color'] + pref_cats.append(inst_dict) + + if col is not None: + self.dat['manual_category']['col_cats'] = pref_cats + + if row is not None: + self.dat['manual_category']['row_cats'] = pref_cats + + def ds_to_original_meta(self, axis): + clusters = self.meta_ds_col.index.tolist() + + man_cat_title = self.dat['manual_category'][axis] + + for inst_cluster in clusters: + + # get the manual category from the downsampled data + inst_man_cat = self.meta_ds_col.loc[inst_cluster, man_cat_title] + + # find original labels that are assigned to this cluster + found_labels = self.meta_col[self.meta_col[self.ds_name] == inst_cluster].index.tolist() + + # update manual category + self.meta_col.loc[found_labels, man_cat_title] = inst_man_cat + + def get_manual_category(self, tmp): + + + for axis in ['col']: + + try: + ########################################################### + + export_dict = {} + + inst_nodes = self.dat['nodes'][axis] + # print('get get_manual_category', len(inst_nodes)) + + cat_title = self.dat['manual_category'][axis] + + # Category Names + try: + ########################################################### + + export_dict[cat_title] = pd.Series(json.loads(self.widget_instance.manual_cat)[axis][cat_title]) + + if hasattr(self, 'meta_cat') == True: + + if axis == 'row': + if self.is_downsampled == False: + self.meta_row.loc[inst_nodes, cat_title] = export_dict[cat_title] + else: + self.meta_ds_row.loc[inst_nodes, cat_title] = export_dict[cat_title] + self.ds_to_original_meta(axis) + + elif axis == 'col': + if self.is_downsampled == False: + # print('> > > > > > > > > > > ') + # print(export_dict[cat_title]) + + # print('.. .. .. .. .. .. .. .. .. .. .. .. 
') + # print(self.meta_col) + + # # for some reason this is breaking when trying to sync metadata + # # switching to slower row based syncing + # self.meta_col.loc[inst_nodes, cat_title] = export_dict[cat_title] + + # manually looping through rows in metadata works + for inst_row in export_dict[cat_title].index.tolist(): + self.meta_col.loc[inst_row, cat_title] = export_dict[cat_title][inst_row] + + # print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>') + # print(self.meta_col) + + else: + self.meta_ds_col.loc[inst_nodes, cat_title] = export_dict[cat_title] + self.ds_to_original_meta(axis) + + + except: + print('unable to load', axis ,' category, please check title') + + # Category Colors + ####################### + export_dict['cat_colors'] = json.loads(self.widget_instance.manual_cat)['global_cat_colors'] + + # # drop title from category colors + # export_dict['cat_colors'] = {} + # for inst_cat in ini_new_colors: + # if (': ' in inst_cat): + # export_dict['cat_colors'][inst_cat.split(': ')[1]] = ini_new_colors[inst_cat] + # else: + # export_dict['cat_colors'][inst_cat] = ini_new_colors[inst_cat] + + # if hasattr(self, 'meta_cat') == False: + # return export_dict + + except: + # print('failed to parse manual_category') + pass + + + @staticmethod + def umi_norm(df): + # umi norm + barcode_umi_sum = df.sum() + df_umi = df.div(barcode_umi_sum) + return df_umi + + @staticmethod + def make_df_from_cols(cols): + inst_col = cols[0] + + cat_titles = [] + for inst_info in inst_col[1:]: + inst_title = inst_info.split(': ')[0] + cat_titles.append(inst_title) + + clean_cols = [] + for inst_col in cols: + inst_clean = [] + for inst_info in inst_col: + if ': ' in inst_info: + inst_clean.append(inst_info.split(': ')[1]) + else: + inst_clean.append(inst_info) + clean_cols.append(tuple(inst_clean)) + + df_ini = pd.DataFrame(data=clean_cols).set_index(0) + mat = df_ini.values + rows = df_ini.index.tolist() + + df_meta = pd.DataFrame(data=mat, index=rows, columns=cat_titles) + + return df_meta diff --git a/src/celldega/clust/calc_clust.py b/src/celldega/clust/calc_clust.py new file mode 100644 index 00000000..395b5fa0 --- /dev/null +++ b/src/celldega/clust/calc_clust.py @@ -0,0 +1,183 @@ +def cluster_row_and_col(net, dist_type='cosine', linkage_type='average', + dendro=True, run_clustering=True, run_rank=True, + ignore_cat=False, calc_cat_pval=False, links=False, + clust_library='scipy', min_samples=1, min_cluster_size=2): + ''' cluster net.dat and make visualization json, net.viz. + optionally leave out dendrogram colorbar groups with dendro argument ''' + + # import umap + import scipy + from copy import deepcopy + from scipy.spatial.distance import pdist + from . 
import categories, make_viz, cat_pval
+
+    dm = {}
+    for axis in ['row', 'col']:
+
+        # save directly to dat structure
+        node_info = net.dat['node_info'][axis]
+
+        node_info['ini'] = list(range(len(net.dat['nodes'][axis]), -1, -1))
+
+        tmp_mat = deepcopy(net.dat['mat'])
+
+        # calc distance matrix (hdbscan works from the data directly)
+        if clust_library != 'hdbscan':
+            dm[axis] = calc_distance_matrix(tmp_mat, axis, dist_type)
+        else:
+            dm[axis] = None
+
+        # cluster
+        if run_clustering is True:
+            node_info['clust'], node_info['Y'] = clust_and_group(net,
+                                                                 dm[axis],
+                                                                 axis,
+                                                                 tmp_mat,
+                                                                 dist_type=dist_type,
+                                                                 linkage_type=linkage_type,
+                                                                 clust_library=clust_library,
+                                                                 min_samples=min_samples,
+                                                                 min_cluster_size=min_cluster_size)
+        else:
+            dendro = False
+            node_info['clust'] = node_info['ini']
+
+        # sorting
+        if run_rank is True:
+            node_info['rank'] = sort_rank_nodes(net, axis, 'sum')
+            node_info['rankvar'] = sort_rank_nodes(net, axis, 'var')
+        else:
+            node_info['rank'] = node_info['ini']
+            node_info['rankvar'] = node_info['ini']
+
+        ##################################
+        if ignore_cat is False:
+            categories.calc_cat_clust_order(net, axis)
+
+    if calc_cat_pval is True:
+        cat_pval.main(net)
+
+    # make the visualization json
+    make_viz.viz_json(net, dendro, links)
+
+    return dm
+
+def calc_distance_matrix(tmp_mat, axis, dist_type='cosine'):
+    from scipy.spatial.distance import pdist
+    import numpy as np
+
+    if axis == 'row':
+        inst_dm = pdist(tmp_mat, metric=dist_type)
+    elif axis == 'col':
+        inst_dm = pdist(tmp_mat.transpose(), metric=dist_type)
+
+    inst_dm[inst_dm < 0] = float(0)
+
+    return inst_dm
+
+def clust_and_group(net, inst_dm, axis, mat, dist_type='cosine', linkage_type='average',
+                    clust_library='scipy', min_samples=1, min_cluster_size=2):
+
+    import scipy.cluster.hierarchy as hier
+    import pandas as pd
+
+    if clust_library == 'scipy':
+        Y = hier.linkage(inst_dm, method=linkage_type)
+
+    elif clust_library == 'fastcluster':
+        import fastcluster
+        Y = fastcluster.linkage(inst_dm, method=linkage_type)
+
+    elif clust_library == 'hdbscan':
+        import hdbscan
+
+        # pca-umap-hdbscan using data (no pre-calc distance matrix)
+        ######################################################
+        from sklearn.decomposition import PCA
+        clusterer = hdbscan.HDBSCAN(min_samples=min_samples,
+                                    min_cluster_size=min_cluster_size)
+
+        # rows are the data points, cols are dimensions
+        n_components = 50
+        if axis == 'row':
+            if mat.shape[1] > n_components:
+                low_d_mat = PCA(n_components=n_components).fit_transform(mat)
+            else:
+                low_d_mat = mat
+
+        elif axis == 'col':
+            if mat.shape[0] > n_components:
+                low_d_mat = PCA(n_components=n_components).fit_transform(mat.transpose())
+            else:
+                low_d_mat = mat.transpose()
+
+        # run UMAP on low_d_mat (after PCA)
+        import umap
+        umap_mat = umap.UMAP(
+            metric=dist_type,
+            n_neighbors=5,
+            min_dist=0.0,
+            n_components=2,
+            random_state=42,
+        ).fit_transform(low_d_mat)
+
+        umap_df = pd.DataFrame(umap_mat.transpose(),
+                               index=['x', 'y'],
+                               columns=net.dat['nodes'][axis])
+
+        net.umap[axis] = umap_df
+        clusterer.fit(umap_mat)
+
+        Y = clusterer.single_linkage_tree_.to_numpy()
+
+    Z = hier.dendrogram(Y, no_plot=True)
+    inst_clust_order = Z['leaves']
+
+    return inst_clust_order, Y
+
+def sort_rank_nodes(net, rowcol, rank_type):
+    import numpy as np
+    from operator import itemgetter
+    from copy import deepcopy
+
+    tmp_nodes = deepcopy(net.dat['nodes'][rowcol])
+    inst_mat = deepcopy(net.dat['mat'])
+
+    sum_term = []
+    for i in range(len(tmp_nodes)):
+        inst_dict = {}
+        inst_dict['name'] = tmp_nodes[i]
+
+        if rowcol == 'row':
+            if rank_type == 'sum':
+                inst_dict['rank'] = np.sum(inst_mat[i, :])
+            elif rank_type == 'var':
+                inst_dict['rank'] = np.var(inst_mat[i, :])
+        else:
+            if rank_type == 'sum':
+                inst_dict['rank'] = np.sum(inst_mat[:, i])
+            elif rank_type == 'var':
+                inst_dict['rank'] = np.var(inst_mat[:, i])
+
+        sum_term.append(inst_dict)
+
+    sum_term = sorted(sum_term, key=itemgetter('rank'), reverse=False)
+
+    tmp_sort_nodes = []
+    for inst_dict in sum_term:
+        tmp_sort_nodes.append(inst_dict['name'])
+
+    sort_index = []
+    for inst_node in tmp_nodes:
+        sort_index.append(tmp_sort_nodes.index(inst_node))
+
+    return sort_index
diff --git a/src/celldega/clust/cat_pval.py b/src/celldega/clust/cat_pval.py
new file mode 100644
index 00000000..7653ef8c
--- /dev/null
+++ b/src/celldega/clust/cat_pval.py
@@ -0,0 +1,98 @@
+import numpy as np
+import pandas as pd
+from copy import deepcopy
+
+def main(net):
+    '''
+    Calculate the p-value of category closeness.
+    '''
+    # calculate the distance between the data points within the same category and
+    # compare to null distribution
+    for inst_rc in ['row', 'col']:
+
+        inst_nodes = deepcopy(net.dat['nodes'][inst_rc])
+
+        inst_index = deepcopy(net.dat['node_info'][inst_rc]['clust'])
+
+        # reorder based on clustered order
+        inst_nodes = [inst_nodes[i] for i in inst_index]
+
+        # make distance matrix dataframe
+        dm = dist_matrix_lattice(inst_nodes)
+
+        node_infos = list(net.dat['node_info'][inst_rc].keys())
+
+        all_cats = []
+        for inst_info in node_infos:
+            if 'dict_cat_' in inst_info:
+                all_cats.append(inst_info)
+
+        for cat_dict in all_cats:
+
+            tmp_dict = net.dat['node_info'][inst_rc][cat_dict]
+
+            pval_name = cat_dict.replace('dict_', 'pval_')
+            net.dat['node_info'][inst_rc][pval_name] = {}
+
+            for cat_name in tmp_dict:
+
+                subset = tmp_dict[cat_name]
+
+                inst_median = calc_median_dist_subset(dm, subset)
+
+                hist = calc_hist_distances(dm, subset, inst_nodes)
+
+                pval = 0
+
+                for i in range(len(hist['prob'])):
+                    if i == 0:
+                        pval = hist['prob'][i]
+                    if i >= 1:
+                        if inst_median >= hist['bins'][i]:
+                            pval = pval + hist['prob'][i]
+
+                net.dat['node_info'][inst_rc][pval_name][cat_name] = pval
+
+def dist_matrix_lattice(names):
+    from scipy.spatial.distance import pdist, squareform
+
+    lattice_size = len(names)
+    mat = np.zeros([lattice_size, 1])
+    mat[:, 0] = list(range(lattice_size))
+
+    inst_dm = pdist(mat, metric='euclidean')
+
+    inst_dm[inst_dm < 0] = float(0)
+
+    inst_dm = squareform(inst_dm)
+
+    df = pd.DataFrame(data=inst_dm, columns=names, index=names)
+
+    return df
+
+def calc_median_dist_subset(dm, subset):
+    return np.median(dm[subset].loc[subset].values)
+
+def calc_hist_distances(dm, subset, inst_nodes):
+    np.random.seed(100)
+
+    num_null = 1000
+    num_points = len(subset)
+
+    median_dist = []
+    for i in range(num_null):
+        tmp = np.random.choice(inst_nodes, num_points, replace=False)
+        median_dist.append(np.median(dm[tmp].loc[tmp].values))
+
+    tmp_dist = sorted(deepcopy(median_dist))
+
+    median_dist = np.asarray(median_dist)
+    s1 = pd.Series(median_dist)
+    hist = np.histogram(s1, bins=30)
+
+    H = {}
+    # np.float was removed in NumPy 1.24+; use the builtin float instead
+    H['prob'] = hist[0]/float(num_null)
+    H['bins'] = hist[1]
+
+    return H
\ No newline at end of file
diff --git a/src/celldega/clust/categories.py b/src/celldega/clust/categories.py
new file mode 100644
index 00000000..2b1fcf0c
--- /dev/null
+++ 
b/src/celldega/clust/categories.py @@ -0,0 +1,346 @@ +def check_categories(lines): + ''' + find out how many row and col categories are available + when loading data from file + ''' + # count the number of row categories + rcat_line = lines[0].split('\t') + + # calc the number of row names and categories + num_rc = 0 + found_end = False + + # skip first tab + for inst_string in rcat_line[1:]: + if inst_string == '': + if found_end is False: + num_rc = num_rc + 1 + else: + found_end = True + + max_rcat = 15 + if max_rcat > len(lines): + max_rcat = len(lines) - 1 + + num_cc = 0 + for i in range(max_rcat): + ccat_line = lines[i + 1].split('\t') + + # make sure that line has length greater than one to prevent false cats from + # trailing new lines at end of matrix + if ccat_line[0] == '' and len(ccat_line) > 1: + num_cc = num_cc + 1 + + num_labels = {} + num_labels['row'] = num_rc + 1 + num_labels['col'] = num_cc + 1 + + return num_labels + +def dict_cat(net, define_cat_colors=False): + ''' + make a dictionary of node-category associations + ''' + net.persistent_cat_colors = True + + for inst_rc in ['row', 'col']: + inst_keys = list(net.dat['node_info'][inst_rc].keys()) + all_cats = [x for x in inst_keys if 'cat-' in x] + + for inst_name_cat in all_cats: + + dict_cat = {} + tmp_cats = net.dat['node_info'][inst_rc][inst_name_cat] + tmp_nodes = net.dat['nodes'][inst_rc] + + for i in range(len(tmp_cats)): + inst_cat = tmp_cats[i] + inst_node = tmp_nodes[i] + + if inst_cat not in dict_cat: + dict_cat[inst_cat] = [] + + dict_cat[inst_cat].append(inst_node) + + tmp_name = 'dict_' + inst_name_cat.replace('-', '_') + net.dat['node_info'][inst_rc][tmp_name] = dict_cat + + # merge with old cat_colors by default + cat_colors = net.viz['cat_colors'] + + if define_cat_colors == True: + cat_number = 0 + + for inst_rc in ['row', 'col']: + + inst_keys = list(net.dat['node_info'][inst_rc].keys()) + all_cats = [x for x in inst_keys if 'cat-' in x] + + for cat_index in all_cats: + + if cat_index not in cat_colors[inst_rc]: + cat_colors[inst_rc][cat_index] = {} + + cat_names = sorted(list(set(net.dat['node_info'][inst_rc][cat_index]))) + + # loop through each category name and assign a color + for tmp_name in cat_names: + + # using the same rules as the front-end to define cat_colors + inst_color = get_cat_color(cat_number + cat_names.index(tmp_name)) + + check_name = tmp_name + + # check if category is string type and non-numeric + try: + float(check_name) + is_string_cat = False + except: + is_string_cat = True + + if is_string_cat == True: + # check for default non-color + if ': ' in check_name: + check_name = check_name.split(': ')[1] + + # if check_name == 'False' or check_name == 'false': + if 'False' in check_name or 'false' in check_name: + inst_color = '#eee' + + if 'Not ' in check_name: + inst_color = '#eee' + + # do not overwrite old colors + if tmp_name not in cat_colors[inst_rc][cat_index] and is_string_cat: + + cat_colors[inst_rc][cat_index][tmp_name] = inst_color + # print('overwrite: ' + tmp_name + ' -> ' + str(inst_color)) + + cat_number = cat_number + 1 + + net.viz['cat_colors'] = cat_colors + + net.viz['global_cat_colors'] = {} + + global_cat_colors = {} + for axis in net.viz['cat_colors']: + for cat_index in net.viz['cat_colors'][axis]: + + inst_dict = net.viz['cat_colors'][axis][cat_index] + + for inst_full_name in inst_dict: + + inst_name = inst_full_name.split(': ')[1] + inst_color = inst_dict[inst_full_name] + + global_cat_colors[inst_name] = inst_color + + net.viz['global_cat_colors'] = 
global_cat_colors + + +def calc_cat_clust_order(net, inst_rc): + ''' + cluster category subset of data + ''' + from .__init__ import Network + from copy import deepcopy + from . import calc_clust, run_filter + + inst_keys = list(net.dat['node_info'][inst_rc].keys()) + all_cats = [x for x in inst_keys if 'cat-' in x] + + if len(all_cats) > 0: + + for inst_name_cat in all_cats: + + tmp_name = 'dict_' + inst_name_cat.replace('-', '_') + dict_cat = net.dat['node_info'][inst_rc][tmp_name] + + unordered_cats = dict_cat.keys() + + ordered_cats = order_categories(unordered_cats) + + # this is the ordering of the columns based on their category, not + # including their clustering ordering within category + all_cat_orders = [] + tmp_names_list = [] + for inst_cat in ordered_cats: + + inst_nodes = dict_cat[inst_cat] + + tmp_names_list.extend(inst_nodes) + + names_clust_list = tmp_names_list + + # calc category-cluster order + final_order = [] + + for i in range(len(net.dat['nodes'][inst_rc])): + + inst_node_name = net.dat['nodes'][inst_rc][i] + inst_node_num = names_clust_list.index(inst_node_name) + + final_order.append(inst_node_num) + + inst_index_cat = inst_name_cat.replace('-', '_') + '_index' + + net.dat['node_info'][inst_rc][inst_index_cat] = final_order + + +def order_categories(unordered_cats): + ''' + If categories are strings, then simple ordering is fine. + If categories are values then I'll need to order based on their values. + The final ordering is given as the original categories (including titles) in a + ordered list. + ''' + + no_titles = remove_titles(unordered_cats) + + all_are_numbers = check_all_numbers(no_titles) + + if all_are_numbers: + ordered_cats = order_cats_based_on_values(unordered_cats, no_titles) + else: + ordered_cats = sorted(unordered_cats) + + return ordered_cats + + +def order_cats_based_on_values(unordered_cats, values_list): + import pandas as pd + + try: + # convert values_list to values + values_list = [float(i) for i in values_list] + + inst_series = pd.Series(data=values_list, index=unordered_cats) + + inst_series.sort_values(inplace=True) + + ordered_cats = inst_series.index.tolist() + + # ordered_cats = unordered_cats + except: + # keep default ordering if error occurs + print('error sorting cats based on values ') + ordered_cats = unordered_cats + + return ordered_cats + +def check_all_numbers(no_titles): + all_numbers = True + for tmp in no_titles: + if is_number(tmp) == False: + all_numbers = False + + return all_numbers + +def remove_titles(cats): + from copy import deepcopy + + # check if all have titles + ########################### + all_have_titles = True + + for inst_cat in cats: + if is_number(inst_cat) == False: + if ': ' not in inst_cat: + all_have_titles = False + else: + all_have_titles = False + + if all_have_titles: + no_titles = cats + no_titles = [i.split(': ')[1] for i in no_titles] + + else: + no_titles = cats + + return no_titles + +def is_number(s): + try: + float(s) + return True + except ValueError: + return False + +def get_cat_color(cat_num): + + all_colors = [ "#393b79", "#aec7e8", "#ff7f0e", "#ffbb78", "#98df8a", "#bcbd22", + "#404040", "#ff9896", "#c5b0d5", "#8c564b", "#1f77b4", "#5254a3", "#FFDB58", + "#c49c94", "#e377c2", "#7f7f7f", "#2ca02c", "#9467bd", "#dbdb8d", "#17becf", + "#637939", "#6b6ecf", "#9c9ede", "#d62728", "#8ca252", "#8c6d31", "#bd9e39", + "#e7cb94", "#843c39", "#ad494a", "#d6616b", "#7b4173", "#a55194", "#ce6dbd", + "#de9ed6"]; + + inst_color = all_colors[cat_num % len(all_colors)] + + return inst_color + 
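+# Illustrative note (not part of the original module): get_cat_color cycles
+# through the 35-color palette above, so e.g. get_cat_color(0) and
+# get_cat_color(35) both return '#393b79'.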
+def add_cats(net, axis, cat_data): + + try: + df = net.export_df() + + if axis == 'row': + labels = df.index.tolist() + elif axis == 'col': + labels = df.columns.tolist() + + if 'title' in cat_data: + inst_title = cat_data['title'] + else: + inst_title = 'New Category' + + all_cats = cat_data['cats'] + + # loop through all labels + new_labels = [] + for inst_label in labels: + + if type(inst_label) is tuple: + check_name = inst_label[0] + found_tuple = True + else: + check_name = inst_label + found_tuple = False + + if ': ' in check_name: + check_name = check_name.split(': ')[1] + + # default to False for found cat, overwrite if necessary + found_cat = inst_title + ': False' + + # check all categories in cats + for inst_cat in all_cats: + + inst_names = all_cats[inst_cat] + + if check_name in inst_names: + found_cat = inst_title + ': ' + inst_cat + + # add category to label + if found_tuple is True: + new_label = inst_label + (found_cat,) + else: + new_label = (inst_label, found_cat) + + new_labels.append(new_label) + + + # add labels back to DataFrame + if axis == 'row': + df.index = new_labels + elif axis == 'col': + df.columns = new_labels + + net.load_df(df) + + except: + print('error adding new categories') + + + + diff --git a/src/celldega/clust/data_formats.py b/src/celldega/clust/data_formats.py new file mode 100644 index 00000000..24e37f9b --- /dev/null +++ b/src/celldega/clust/data_formats.py @@ -0,0 +1,134 @@ +from . import make_unique_labels +import pandas as pd +from . import categories + +def df_to_dat(net, df, define_cat_colors=False): + ''' + This is always run when data is loaded. + ''' + + # check if df has unique values + df = make_unique_labels.main(net, df) + + net.dat['mat'] = df.values + net.dat['nodes']['row'] = df.index.tolist() + net.dat['nodes']['col'] = df.columns.tolist() + + # print('checking ds status') + # print('is_downsampled', net.is_downsampled) + # print('meta_cat', net.meta_cat) + # print(hasattr(net, 'meta_ds_col')) + # print(hasattr(net, 'meta_ds_row')) + + # if net.meta_cat == False or net.is_downsampled: + if net.meta_cat == False: + + # tuple cats + ################################## + + for axis in ['row', 'col']: + + inst_nodes = net.dat['nodes'][axis] + + if type(inst_nodes[0]) is tuple: + + if axis == 'row': + net.dat['node_info'][axis]['full_names'] = df.index.tolist() + elif axis == 'col': + net.dat['node_info'][axis]['full_names'] = df.columns.tolist() + + # get the number of categories from the length of the tuple + # subtract 1 because the name is the first element of the tuple + num_cats = len(inst_nodes[0]) - 1 + for inst_cat in range(num_cats): + cat_name = 'cat-' + str(inst_cat) + cat_index = inst_cat + 1 + cat_values = [x[cat_index] for x in inst_nodes] + net.dat['node_info'][axis][cat_name] = cat_values + + # clean up nodes after parsing categories + net.dat['nodes'][axis] = [x[0] for x in inst_nodes] + + else: + + # meta_cats + ########################## + + for axis in ['row', 'col']: + + inst_nodes = net.dat['nodes'][axis] + + if axis == 'row': + net.dat['node_info'][axis]['full_names'] = df.index.tolist() + elif axis == 'col': + net.dat['node_info'][axis]['full_names'] = df.columns.tolist() + + inst_cats = [] + if axis == 'row': + # inst_cats = net.meta_row.columns.tolist() + if hasattr(net, 'row_cats'): + inst_cats = net.row_cats + else: + # inst_cats = net.meta_col.columns.tolist() + if hasattr(net, 'col_cats'): + inst_cats = net.col_cats + + num_cats = len(inst_cats) + + # if axis == 'row': + # num_cats = len(net.row_cats) + 
# elif axis == 'col': + # num_cats = len(net.col_cats) + + if num_cats > 0: + for inst_cat in range(num_cats): + cat_name = 'cat-' + str(inst_cat) + cat_index = inst_cat + 1 + + cat_title = inst_cats[inst_cat] + if axis == 'row': + if net.is_downsampled: + if hasattr(net, 'meta_ds_row'): + cat_values = net.meta_ds_row.loc[inst_nodes, cat_title].apply(lambda x: cat_title + ': ' + str(x)).values.tolist() + else: + cat_values = net.meta_row.loc[inst_nodes, cat_title].apply(lambda x: cat_title + ': ' + str(x)).values.tolist() + + # detault with no downsampling + else: + cat_values = net.meta_row.loc[inst_nodes, cat_title].apply(lambda x: cat_title + ': ' + str(x)).values.tolist() + else: + # cat_values = net.meta_col.loc[inst_nodes, cat_title].apply(lambda x: cat_title + ': ' + x).values.tolist() + if net.is_downsampled: + if hasattr(net, 'meta_ds_col'): + # print(inst_nodes) + + cat_values = net.meta_ds_col.loc[inst_nodes, cat_title].apply(lambda x: cat_title + ': ' + str(x)).values.tolist() + else: + cat_values = net.meta_col.loc[inst_nodes, cat_title].apply(lambda x: cat_title + ': ' + str(x)).values.tolist() + + # detault with no downsampling + else: + cat_values = net.meta_col.loc[inst_nodes, cat_title].apply(lambda x: cat_title + ': ' + str(x)).values.tolist() + + net.dat['node_info'][axis][cat_name] = cat_values + + categories.dict_cat(net, define_cat_colors=define_cat_colors) + +def dat_to_df(net): + + nodes = {} + for axis in ['row', 'col']: + if 'full_names' in net.dat['node_info'][axis]: + nodes[axis] = net.dat['node_info'][axis]['full_names'] + else: + nodes[axis] = net.dat['nodes'][axis] + + df = pd.DataFrame(data=net.dat['mat'], columns=nodes['col'], + index=nodes['row']) + + return df + +def mat_to_numpy_arr(self): + ''' convert list to numpy array - numpy arrays can not be saved as json ''' + import numpy as np + self.dat['mat'] = np.asarray(self.dat['mat']) \ No newline at end of file diff --git a/src/celldega/clust/downsample_fun.py b/src/celldega/clust/downsample_fun.py new file mode 100644 index 00000000..342b7b66 --- /dev/null +++ b/src/celldega/clust/downsample_fun.py @@ -0,0 +1,280 @@ +import pandas as pd +import numpy as np +from sklearn.cluster import MiniBatchKMeans +# string used to format titles +super_string = ': ' + +def main(net, df=None, ds_type='kmeans', axis='row', num_samples=100, + random_state=1000, ds_name='Downsample', ds_cluster_name='cluster'): + + # print('is meta cat 1 ??', net.meta_cat) + + if df is None: + df = net.export_df() + net.ds_name = ds_name + + # print('run k-means!!!!!!!!!!!!!') + ds_df, ds_data = run_kmeans_mini_batch(net, df, num_samples, axis, + random_state, ds_cluster_name) + + ds_data = [ds_cluster_name + '-' + str(x + 1) for x in ds_data] + + if axis == 'row': + labels = df.index.tolist() + else: + labels = df.columns.tolist() + + ser_ds = pd.Series(ds_data, index=labels) + + # generate downsampled metadata from tuples + if axis == 'col': + net.meta_ds_col = net.make_df_from_cols(ds_df.columns.tolist()) + else: + net.meta_ds_row = net.make_df_from_cols(ds_df.index.tolist()) + + # strip tuples + # print('strip tuples!!!!!!!!!!!!!!!!!!!!1') + if axis == 'col': + ds_df.columns = [x[0] for x in ds_df.columns.tolist()] + # print('after stripping col tuples') + # print(ds_df.columns.tolist()) + net.dat['nodes']['col'] = ds_df.columns.tolist() + + else: + ds_df.index = [x[0] for x in ds_df.index.tolist()] + net.dat['nodes']['row'] = ds_df.index.tolist() + + # print('is meta cat 1 ??', net.meta_cat) + + # load downsampled dataframe into 
net + # print('setting is_downsampled to True') + # print('load downsampled dataframe into net!!!!!!!!!!!!!!') + net.load_df(ds_df, is_downsampled=True) + + # print('is meta cat 3 ??', net.meta_cat) + + if net.meta_cat: + if axis == 'row': + net.meta_row[ds_name] = ser_ds + else: + net.meta_col[ds_name] = ser_ds + else: + return ser_ds + + + +def meta_cat_to_tuple(net, axis, orig_labels, inst_cats): + tuple_labels = [] + for inst_label in orig_labels: + new_label = [inst_label] + + for inst_cat_type in inst_cats: + if axis == 'col': + inst_cat = inst_cat_type + ': ' + net.meta_col.loc[inst_label, inst_cat_type] + else: + inst_cat = inst_cat_type + ': ' + net.meta_row.loc[inst_label, inst_cat_type] + new_label.append(inst_cat) + + new_label = tuple(new_label) + tuple_labels.append(new_label) + + return tuple_labels + +def run_kmeans_mini_batch(net, df, num_samples=100, axis='row', + random_state=1000, ds_cluster_name='cluster'): + + # gather downsampled axis information + if axis == 'row': + X = df + orig_labels = df.index.tolist() + non_ds_labels = df.columns.tolist() + + # print(orig_labels) + if net.meta_cat: + orig_labels = meta_cat_to_tuple(net, axis, orig_labels, net.row_cats) + + else: + X = df.transpose() + orig_labels = df.columns.tolist() + non_ds_labels = df.index.tolist() + + if net.meta_cat: + orig_labels = meta_cat_to_tuple(net, axis, orig_labels, net.col_cats) + + cat_index = 1 + + # run until the number of returned clusters with data-points is equal to the + # number of requested clusters + num_returned_clusters = 0 + while num_samples != num_returned_clusters: + + clusters, num_returned_clusters, cluster_data, cluster_pop = \ + calc_mbk_clusters(X, num_samples, random_state) + + random_state = random_state + random_state + + clust_numbers = range(num_returned_clusters) + clust_labels = [ds_cluster_name + '-' + str(i + 1) for i in clust_numbers] + + if type(orig_labels[0]) is tuple: + found_cats = True + else: + found_cats = False + + # Gather categories if necessary + ######################################## + # check if there are categories + if found_cats: + all_cats = generate_cat_data(cluster_data, orig_labels, num_samples) + + # genrate cluster labels, e.g. 
add number in each cluster and majority cat + # if necessary + cluster_labels = [] + for i in range(num_returned_clusters): + + inst_name = clust_labels[i] + num_in_clust_string = 'number in clust: '+ str(cluster_pop[i]) + + inst_tuple = (inst_name,) + + if found_cats: + for cat_data in all_cats: + cat_values = cat_data['counts'][i] + max_cat_fraction = cat_values.max() + max_index = np.where(cat_values == max_cat_fraction)[0][0] + max_cat_name = cat_data['types'][max_index] + + # add category title if available + cat_name_string = cat_data['title'] +': ' + max_cat_name + + inst_tuple = inst_tuple + (cat_name_string,) + + inst_tuple = inst_tuple + (num_in_clust_string,) + + cluster_labels.append(inst_tuple) + + # ds_df is always downsampling the rows, if the user wants to downsample the + # columns, the df will be switched back later + ds_df = pd.DataFrame(data=clusters, index=cluster_labels, columns=non_ds_labels) + + # swap back for downsampled columns + if axis == 'col': + ds_df = ds_df.transpose() + + return ds_df, cluster_data + +def generate_cat_data(cluster_data, orig_labels, num_samples): + + # generate an array of orig_labels, using an array so that I can gather + # label subsets using indices + orig_array = np.asarray(orig_labels) + + example_label = orig_labels[0] + + # find out how many string categories are available + num_cats = 0 + for i in range(len(example_label)): + + if i > 0: + inst_cat = example_label[i] + if super_string in inst_cat: + inst_cat = inst_cat.split(super_string)[1] + + string_cat = True + try: + float(inst_cat) + string_cat = False + except: + string_cat = True + + if string_cat: + num_cats = num_cats + 1 + + all_cats = [] + + for cat_index in range(num_cats): + + # index zero is for the names + cat_index = cat_index + 1 + + cat_data = {} + + if super_string in example_label[cat_index]: + cat_data['title'] = example_label[cat_index].split(super_string)[0] + else: + cat_data['title'] = 'Category' + + # if there are string categories, then keep track of how many of each category + # are found in each of the downsampled clusters. 
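+    # (these per-cluster counts are normalized to fractions further below)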
+ cat_data['types'] = [] + + # gather possible categories + for inst_label in orig_labels: + + inst_cat = inst_label[cat_index] + + if super_string in inst_cat: + inst_cat = inst_cat.split(super_string)[1] + + # get first category + cat_data['types'].append(inst_cat) + + cat_data['types'] = sorted(list(set(cat_data['types']))) + + num_cats = len(cat_data['types']) + + # initialize cat_data['counts'] dictionary + cat_data['counts'] = {} + for inst_clust in range(num_samples): + cat_data['counts'][inst_clust] = np.zeros([num_cats]) + + # populate cat_data['counts'] + for inst_clust in range(num_samples): + + # get the indicies of all original labels that fall in the cluster + found = np.where(cluster_data == inst_clust) + found_indicies = found[0] + + clust_names = orig_array[found_indicies] + + for inst_name in clust_names: + + # get first category name + inst_name = inst_name[cat_index] + + if super_string in inst_name: + inst_name = inst_name.split(super_string)[1] + + tmp_index = cat_data['types'].index(inst_name) + + cat_data['counts'][inst_clust][tmp_index] = cat_data['counts'][inst_clust][tmp_index] + 1 + + # calculate fractions + for inst_clust in range(num_samples): + # get array + counts = cat_data['counts'][inst_clust] + inst_total = np.sum(counts) + cat_data['counts'][inst_clust] = cat_data['counts'][inst_clust] / inst_total + + all_cats.append(cat_data) + + return all_cats + +def calc_mbk_clusters(X, n_clusters, random_state=1000): + + # kmeans is run with rows as data-points and columns as dimensions + mbk = MiniBatchKMeans(init='k-means++', n_clusters=n_clusters, + max_no_improvement=100, verbose=0, + random_state=random_state) + + # need to loop through each label (each k-means cluster) and count how many + # points were given this label. This will give the population size of each label + mbk.fit(X) + cluster_data = mbk.labels_ + clusters = mbk.cluster_centers_ + + mbk_cluster_names, cluster_pop = np.unique(cluster_data, return_counts=True) + + num_returned_clusters = len(cluster_pop) + + return clusters, num_returned_clusters, cluster_data, cluster_pop \ No newline at end of file diff --git a/src/celldega/clust/enrichr_functions.py b/src/celldega/clust/enrichr_functions.py new file mode 100644 index 00000000..91b46cd1 --- /dev/null +++ b/src/celldega/clust/enrichr_functions.py @@ -0,0 +1,376 @@ +def add_enrichr_cats(df, inst_rc, run_enrichr, num_terms=10): + from copy import deepcopy + + tmp_gene_list = deepcopy(df.index.tolist()) + + gene_list = [] + if type(tmp_gene_list[0]) is tuple: + for inst_tuple in tmp_gene_list: + gene_list.append(inst_tuple[0]) + else: + gene_list = tmp_gene_list + + orig_gene_list = deepcopy(gene_list) + + # set up for non-tuple case first + if ': ' in gene_list[0]: + # strip titles + gene_list = [inst_gene.split(': ')[1] for inst_gene in gene_list] + + # strip extra information (e.g. 
PTMs) + gene_list = [inst_gene.split('_')[0] for inst_gene in gene_list] + gene_list = [inst_gene.split(' ')[0] for inst_gene in gene_list] + gene_list = [inst_gene.split('-')[0] for inst_gene in gene_list] + + user_list_id = post_request(gene_list) + + enr, response_list = get_request(run_enrichr, user_list_id, max_terms=20) + + # p-value, adjusted pvalue, z-score, combined score, genes + # 1: Term + # 2: P-value + # 3: Z-score + # 4: Combined Score + # 5: Genes + # 6: pval_bh + + # while generating categories store as list of lists, then convert to list of + # tuples + + bar_info = [] + cat_list = [] + for inst_gene in orig_gene_list: + cat_list.append([inst_gene]) + + for inst_enr in response_list[0:num_terms]: + inst_term = inst_enr[1] + inst_pval = inst_enr[2] + inst_cs = inst_enr[4] + inst_list = inst_enr[5] + + pval_string = '
<p>Pval ' + str(inst_pval) + '</p>
' + + bar_info.append(inst_cs) + + for inst_info in cat_list: + + # strip titles + gene_name = inst_info[0] + + if ': ' in gene_name: + gene_name = gene_name.split(': ')[1] + + # strip extra information (e.g. PTMs) + gene_name = gene_name.split('_')[0] + gene_name = gene_name.split(' ')[0] + gene_name = gene_name.split('-')[0] + + if gene_name in inst_list: + inst_info.append(inst_term+': True'+ pval_string) + else: + inst_info.append(inst_term+': False'+pval_string) + + cat_list = [tuple(x) for x in cat_list] + + df.index = cat_list + + return df, bar_info + +def clust_from_response(response_list): + from clustergrammer import Network + import scipy + import json + import pandas as pd + import math + from copy import deepcopy + + # print('----------------------') + # print('enrichr_clust_from_response') + # print('----------------------') + + ini_enr = transfer_to_enr_dict( response_list ) + + enr = [] + scores = {} + score_types = ['combined_score','pval','zscore'] + + for score_type in score_types: + scores[score_type] = pd.Series() + + for inst_enr in ini_enr: + if inst_enr['combined_score'] > 0: + + # make series of enriched terms with scores + for score_type in score_types: + + # collect the scores of the enriched terms + if score_type == 'combined_score': + scores[score_type][inst_enr['name']] = inst_enr[score_type] + if score_type == 'pval': + scores[score_type][inst_enr['name']] = -math.log(inst_enr[score_type]) + if score_type == 'zscore': + scores[score_type][inst_enr['name']] = -inst_enr[score_type] + + # keep enrichement values + enr.append(inst_enr) + + # sort and normalize the scores + for score_type in score_types: + scores[score_type] = scores[score_type]/scores[score_type].max() + scores[score_type].sort_values(ascending=False) + + number_of_enriched_terms = len(scores['combined_score']) + + enr_score_types = ['combined_score','pval','zscore'] + + if number_of_enriched_terms <10: + num_dict = {'ten':10} + elif number_of_enriched_terms <20: + num_dict = {'ten':10, 'twenty':20} + else: + num_dict = {'ten':10, 'twenty':20, 'thirty':30} + + # gather lists of top scores + top_terms = {} + for enr_type in enr_score_types: + top_terms[enr_type] = {} + for num_terms in list(num_dict.keys()): + inst_num = num_dict[num_terms] + top_terms[enr_type][num_terms] = scores[enr_type].index.tolist()[: inst_num] + + # gather the terms that should be kept - they are at the top of the score list + keep_terms = [] + for inst_enr_score in top_terms: + for tmp_num in list(num_dict.keys()): + keep_terms.extend( top_terms[inst_enr_score][tmp_num] ) + + keep_terms = list(set(keep_terms)) + + # keep enriched terms that are at the top 10 based on at least one score + keep_enr = [] + for inst_enr in enr: + if inst_enr['name'] in keep_terms: + keep_enr.append(inst_enr) + + + # fill in full matrix + ####################### + + # genes + row_node_names = [] + # enriched terms + col_node_names = [] + + # gather information from the list of enriched terms + for inst_enr in keep_enr: + col_node_names.append(inst_enr['name']) + row_node_names.extend(inst_enr['int_genes']) + + row_node_names = sorted(list(set(row_node_names))) + + net = Network() + net.dat['nodes']['row'] = row_node_names + net.dat['nodes']['col'] = col_node_names + net.dat['mat'] = scipy.zeros([len(row_node_names),len(col_node_names)]) + + for inst_enr in keep_enr: + + inst_term = inst_enr['name'] + col_index = col_node_names.index(inst_term) + + # use combined score for full matrix - will not be seen in viz + tmp_score = 
scores['combined_score'][inst_term] + net.dat['node_info']['col']['value'].append(tmp_score) + + for inst_gene in inst_enr['int_genes']: + row_index = row_node_names.index(inst_gene) + + # save association + net.dat['mat'][row_index, col_index] = 1 + + # cluster full matrix + ############################# + # do not make multiple views + views = [''] + + if len(net.dat['nodes']['row']) > 1: + net.cluster(dist_type='jaccard', views=views, dendro=False) + else: + net.cluster(dist_type='jaccard', views=views, dendro=False, run_clustering=False) + + # get dataframe from full matrix + df = net.dat_to_df() + + for score_type in score_types: + + for num_terms in num_dict: + + inst_df = deepcopy(df) + inst_net = deepcopy(Network()) + + inst_df = inst_df[top_terms[score_type][num_terms]] + + # load back into net + inst_net.df_to_dat(inst_df) + + # make views + if len(net.dat['nodes']['row']) > 1: + inst_net.cluster(dist_type='jaccard', views=['N_row_sum'], dendro=False) + else: + inst_net.cluster(dist_type='jaccard', views=['N_row_sum'], dendro=False, run_clustering = False) + + inst_views = inst_net.viz['views'] + + # add score_type to views + for inst_view in inst_views: + + inst_view['N_col_sum'] = num_dict[num_terms] + + inst_view['enr_score_type'] = score_type + + # add values to col_nodes and order according to rank + for inst_col in inst_view['nodes']['col_nodes']: + + inst_col['rank'] = len(top_terms[score_type][num_terms]) - top_terms[score_type][num_terms].index(inst_col['name']) + + inst_name = inst_col['name'] + inst_col['value'] = scores[score_type][inst_name] + + # add views to main network + net.viz['views'].extend(inst_views) + + return net + +# make the get request to enrichr using the requests library +# this is done before making the get request with the lib name +def post_request(input_genes, meta=''): + # get metadata + import requests + import json + + # stringify list + input_genes = '\n'.join(input_genes) + + # define post url + post_url = 'http://amp.pharm.mssm.edu/Enrichr/addList' + + # define parameters + params = {'list':input_genes, 'description':''} + + # make request: post the gene list + post_response = requests.post( post_url, files=params) + + # load json + inst_dict = json.loads( post_response.text ) + userListId = str(inst_dict['userListId']) + + # print(userListId) + + # return the userListId that is needed to reference the list later + return userListId + +# make the get request to enrichr using the requests library +# this is done after submitting post request with the input gene list +def get_request(lib, userListId, max_terms=50 ): + import requests + import json + + # convert userListId to string + userListId = str(userListId) + + # define the get url + get_url = 'http://amp.pharm.mssm.edu/Enrichr/enrich' + + # get parameters + params = {'backgroundType':lib,'userListId':userListId} + + # try get request until status code is 200 + inst_status_code = 400 + + # wait until okay status code is returned + num_try = 0 + + # print(('\tEnrichr enrichment get req userListId: '+str(userListId))) + + while inst_status_code == 400 and num_try < 100: + num_try = num_try +1 + try: + # make the get request to get the enrichr results + + try: + get_response = requests.get( get_url, params=params ) + + # get status_code + inst_status_code = get_response.status_code + + except: + print('retry get request') + + except: + print('get requests failed') + + # load as dictionary + resp_json = json.loads( get_response.text ) + + # get the key + only_key = list(resp_json.keys())[0] 
+ + # get response_list + response_list = resp_json[only_key] + + # transfer the response_list to the enr_dict + enr = transfer_to_enr_dict( response_list, max_terms ) + + # return enrichment json and userListId + return enr, response_list + +# transfer the response_list to a list of dictionaries +def transfer_to_enr_dict(response_list, max_terms=50): + + # # reduce the number of enriched terms if necessary + # if len(response_list) < num_terms: + # num_terms = len(response_list) + + # p-value, adjusted pvalue, z-score, combined score, genes + # 1: Term + # 2: P-value + # 3: Z-score + # 4: Combined Score + # 5: Genes + # 6: pval_bh + + num_enr_term = len(response_list) + if num_enr_term > max_terms: + num_enr_term = max_terms + + # transfer response_list to enr structure + # and only keep the top terms + # + # initialize enr + enr = [] + for i in range(num_enr_term): + + # get list element + inst_enr = response_list[i] + + # initialize dict + inst_dict = {} + + # transfer term + inst_dict['name'] = inst_enr[1] + # transfer pval + inst_dict['pval'] = inst_enr[2] + # transfer zscore + inst_dict['zscore'] = inst_enr[3] + # transfer combined_score + inst_dict['combined_score'] = inst_enr[4] + # transfer int_genes + inst_dict['int_genes'] = inst_enr[5] + # adjusted pval + inst_dict['pval_bh'] = inst_enr[6] + + # append dict + enr.append(inst_dict) + + return enr + diff --git a/src/celldega/clust/export_data.py b/src/celldega/clust/export_data.py new file mode 100644 index 00000000..5d51ffea --- /dev/null +++ b/src/celldega/clust/export_data.py @@ -0,0 +1,57 @@ +def export_net_json(net, net_type, indent='no-indent'): + ''' export json string of dat ''' + import json + from copy import deepcopy + + if net_type == 'dat': + exp_dict = deepcopy(net.dat) + + if type(exp_dict['mat']) is not list: + exp_dict['mat'] = exp_dict['mat'].tolist() + + elif net_type == 'viz': + exp_dict = net.viz + + elif net_type == 'sim_row': + exp_dict = net.sim['row'] + + elif net_type == 'sim_col': + exp_dict = net.sim['col'] + + # make json + if indent == 'indent': + exp_json = json.dumps(exp_dict, indent=2) + else: + exp_json = json.dumps(exp_dict) + + return exp_json + +def write_matrix_to_tsv(net, filename=None, df=None): + ''' + This will export the matrix in net.dat or a dataframe (optional df in + arguments) as a tsv file. Row/column categories will be saved as tuples in + tsv, which can be read back into the network object. 
+ ''' + import pandas as pd + + if df is None: + df = net.dat_to_df() + + return df.to_csv(filename, sep='\t') + +def write_json_to_file(net, net_type, filename, indent='no-indent'): + + exp_json = net.export_net_json(net_type, indent) + + fw = open(filename, 'w') + fw.write(exp_json) + fw.close() + +def save_dict_to_json(inst_dict, filename, indent='no-indent'): + import json + fw = open(filename, 'w') + if indent == 'indent': + fw.write(json.dumps(inst_dict, indent=2)) + else: + fw.write(json.dumps(inst_dict)) + fw.close() \ No newline at end of file diff --git a/src/celldega/clust/iframe_web_app.py b/src/celldega/clust/iframe_web_app.py new file mode 100644 index 00000000..548b5136 --- /dev/null +++ b/src/celldega/clust/iframe_web_app.py @@ -0,0 +1,32 @@ +def main(net, filename=None, width=1000, height=800): + import requests, json + # from io import StringIO + from IPython.display import IFrame, display + + try: + from StringIO import StringIO + except ImportError: + from io import StringIO + + clustergrammer_url = 'http://amp.pharm.mssm.edu/clustergrammer/matrix_upload/' + + if filename is None: + file_string = net.write_matrix_to_tsv() + file_obj = StringIO(file_string) + + if net.dat['filename'] is None: + fake_filename = 'Network.txt' + else: + fake_filename = net.dat['filename'] + + r = requests.post(clustergrammer_url, files={'file': (fake_filename, file_obj)}) + else: + file_obj = open(filename, 'r') + r = requests.post(clustergrammer_url, files={'file': file_obj}) + + + link = r.text + + display(IFrame(link, width=width, height=height)) + + return link \ No newline at end of file diff --git a/src/celldega/clust/initialize_net.py b/src/celldega/clust/initialize_net.py new file mode 100644 index 00000000..2a2ee0e1 --- /dev/null +++ b/src/celldega/clust/initialize_net.py @@ -0,0 +1,97 @@ +def main(self, widget=None): + + if hasattr(self, 'meta_cat') == False: + self.meta_cat = False + + self.dat = {} + self.dat['nodes'] = {} + self.dat['nodes']['row'] = [] + self.dat['nodes']['col'] = [] + self.dat['mat'] = [] + + self.dat['node_info'] = {} + for inst_rc in self.dat['nodes']: + self.dat['node_info'][inst_rc] = {} + self.dat['node_info'][inst_rc]['ini'] = [] + self.dat['node_info'][inst_rc]['clust'] = [] + self.dat['node_info'][inst_rc]['rank'] = [] + self.dat['node_info'][inst_rc]['info'] = [] + self.dat['node_info'][inst_rc]['cat'] = [] + self.dat['node_info'][inst_rc]['value'] = [] + + # check if net has categories predefined + if hasattr(self, 'persistent_cat_colors') == False: + self.persistent_cat_colors = False + found_cats = False + else: + found_cats = True + inst_cat_colors = self.viz['cat_colors'] + inst_global_cat_colors = self.viz['global_cat_colors'] + + # initialize matrix colors + ########################### + has_matrix_colors = False + if hasattr(self, 'viz'): + if 'matrix_colors' in self.viz: + has_matrix_colors = True + + if has_matrix_colors: + matrix_colors = self.viz['matrix_colors'] + else: + matrix_colors = {} + matrix_colors['pos'] = 'red' + matrix_colors['neg'] = 'blue' + + # add widget if necessary + if widget != None: + self.widget_class = widget + + self.is_downsampled = False + + self.viz = {} + self.viz['row_nodes'] = [] + self.viz['col_nodes'] = [] + self.viz['links'] = [] + self.viz['mat'] = [] + + if found_cats == False: + self.viz['cat_colors'] = {} + self.viz['cat_colors']['row'] = {} + self.viz['cat_colors']['col'] = {} + self.viz['global_cat_colors'] = {} + else: + self.viz['cat_colors'] = inst_cat_colors + self.viz['global_cat_colors'] = 
inst_global_cat_colors + + self.viz['matrix_colors'] = matrix_colors + + self.sim = {} + + self.umap = {} + + +def viz(self, reset_cat_colors=False): + + # keep track of old cat_colors + old_cat_colors = self.viz['cat_colors'] + old_global_cat_colors = self.viz['global_cat_colors'] + + if 'matrix_colors' in self.viz: + matrix_colors = self.viz['matrix_colors'] + + self.viz = {} + self.viz['row_nodes'] = [] + self.viz['col_nodes'] = [] + self.viz['links'] = [] + self.viz['mat'] = [] + + if reset_cat_colors == True: + self.viz['cat_colors'] = {} + self.viz['cat_colors']['row'] = {} + self.viz['cat_colors']['col'] = {} + self.viz['global_cat_colors'] = {} + else: + self.viz['cat_colors'] = old_cat_colors + self.viz['global_cat_colors'] = old_global_cat_colors + + self.viz['matrix_colors'] = matrix_colors diff --git a/src/celldega/clust/load_data.py b/src/celldega/clust/load_data.py new file mode 100644 index 00000000..0149d9d6 --- /dev/null +++ b/src/celldega/clust/load_data.py @@ -0,0 +1,97 @@ +import io, sys +import json +import pandas as pd +from . import categories +from . import proc_df_labels +from . import data_formats +from . import make_unique_labels + +try: + from StringIO import StringIO +except ImportError: + from io import StringIO + +def load_file(net, filename): + # reset network when loaing file, prevents errors when loading new file + # have persistent categories + + net.reset() + + f = open(filename, 'r') + + file_string = f.read() + f.close() + + load_file_as_string(net, file_string, filename) + +def load_file_as_string(net, file_string, filename=''): + + if (sys.version_info > (3, 0)): + # python 3 + #################### + file_string = str(file_string) + else: + # python 2 + #################### + file_string = unicode(file_string) + + buff = io.StringIO(file_string) + + if '/' in filename: + filename = filename.split('/')[-1] + + net.load_tsv_to_net(buff, filename) + +def load_stdin(net): + data = '' + + for line in sys.stdin: + data = data + line + + data = StringIO.StringIO(data) + + net.load_tsv_to_net(data) + +def load_tsv_to_net(net, file_buffer, filename=None): + lines = file_buffer.getvalue().split('\n') + num_labels = categories.check_categories(lines) + + row_arr = list(range(num_labels['row'])) + col_arr = list(range(num_labels['col'])) + + # use header if there are col categories + if len(col_arr) > 1: + df = pd.read_table(file_buffer, index_col=row_arr, + header=col_arr) + else: + df = pd.read_table(file_buffer, index_col=row_arr) + + df = proc_df_labels.main(df) + + net.df_to_dat(df, True) + net.dat['filename'] = filename + +def load_json_to_dict(filename): + f = open(filename, 'r') + inst_dict = json.load(f) + f.close() + return inst_dict + +def load_gmt(filename): + f = open(filename, 'r') + lines = f.readlines() + f.close() + gmt = {} + for i in range(len(lines)): + inst_line = lines[i].rstrip() + inst_term = inst_line.split('\t')[0] + inst_elems = inst_line.split('\t')[2:] + gmt[inst_term] = inst_elems + + return gmt + +def load_data_to_net(net, inst_net): + ''' load data into nodes and mat, also convert mat to numpy array''' + net.dat['nodes'] = inst_net['nodes'] + net.dat['mat'] = inst_net['mat'] + data_formats.mat_to_numpy_arr(net) \ No newline at end of file diff --git a/src/celldega/clust/load_vect_post.py b/src/celldega/clust/load_vect_post.py new file mode 100644 index 00000000..5396b4f7 --- /dev/null +++ b/src/celldega/clust/load_vect_post.py @@ -0,0 +1,46 @@ +def main(real_net, vect_post): + import numpy as np + from copy import deepcopy + from 
.__init__ import Network + from . import proc_df_labels + + net = deepcopy(Network()) + + sigs = vect_post['columns'] + + all_rows = [] + all_sigs = [] + for inst_sig in sigs: + all_sigs.append(inst_sig['col_name']) + + col_data = inst_sig['data'] + + for inst_row_data in col_data: + all_rows.append(inst_row_data['row_name']) + + all_rows = sorted(list(set(all_rows))) + all_sigs = sorted(list(set(all_sigs))) + + net.dat['nodes']['row'] = all_rows + net.dat['nodes']['col'] = all_sigs + + net.dat['mat'] = np.empty((len(all_rows), len(all_sigs))) + net.dat['mat'][:] = np.nan + + for inst_sig in sigs: + inst_sig_name = inst_sig['col_name'] + col_data = inst_sig['data'] + + for inst_row_data in col_data: + inst_row = inst_row_data['row_name'] + inst_value = inst_row_data['val'] + + row_index = all_rows.index(inst_row) + col_index = all_sigs.index(inst_sig_name) + + net.dat['mat'][row_index, col_index] = inst_value + + tmp_df = net.dat_to_df() + tmp_df = proc_df_labels.main(tmp_df) + + real_net.df_to_dat(tmp_df) diff --git a/src/celldega/clust/make_clust_fun.py b/src/celldega/clust/make_clust_fun.py new file mode 100644 index 00000000..8e52aa91 --- /dev/null +++ b/src/celldega/clust/make_clust_fun.py @@ -0,0 +1,75 @@ +from copy import deepcopy +import scipy +from . import calc_clust, run_filter, make_sim_mat, cat_pval +from . import enrichr_functions as enr_fun + +def make_clust(net, dist_type='cosine', run_clustering=True, dendro=True, + requested_views=['pct_row_sum', 'N_row_sum'], + linkage_type='average', sim_mat=False, filter_sim=0.0, + calc_cat_pval=False, sim_mat_views=['N_row_sum'], + run_enrichr=None, enrichrgram=None, clust_library='scipy', + min_samples=1, min_cluster_size=2): + ''' + This will perform hierarchical clustering + ''' + + # threshold = 0.0001 + # df = run_filter.df_filter_row_sum(df, threshold) + # df = run_filter.df_filter_col_sum(df, threshold) + + if run_enrichr is not None: + df = net.dat_to_df() + df = enr_fun.add_enrichr_cats(df, 'row', run_enrichr) + define_cat_colors = True + net.df_to_dat(df, define_cat_colors=True) + + inst_dm = calc_clust.cluster_row_and_col(net, dist_type=dist_type, + linkage_type=linkage_type, + run_clustering=run_clustering, + dendro=dendro, ignore_cat=False, + calc_cat_pval=calc_cat_pval, + clust_library=clust_library, + min_samples=min_samples, + min_cluster_size=min_cluster_size) + + which_sim = [] + + if sim_mat == True: + which_sim = ['row', 'col'] + elif sim_mat == 'row': + which_sim = ['row'] + elif sim_mat == 'col': + which_sim = ['col'] + + if sim_mat is not False: + sim_net = make_sim_mat.main(net, inst_dm, which_sim, filter_sim, sim_mat_views) + + net.sim = {} + + for inst_rc in which_sim: + net.sim[inst_rc] = sim_net[inst_rc].viz + + if inst_rc == 'row': + other_rc = 'col' + elif inst_rc == 'col': + other_rc = 'row' + + # keep track of cat_colors + net.sim[inst_rc]['cat_colors'][inst_rc] = net.viz['cat_colors'][inst_rc] + net.sim[inst_rc]['cat_colors'][other_rc] = net.viz['cat_colors'][inst_rc] + + else: + net.sim = {} + + net.viz['views'] = [] + + if enrichrgram != None: + # toggle enrichrgram functionality from back-end + net.viz['enrichrgram'] = enrichrgram + + if 'enrichrgram_lib' in net.dat: + net.viz['enrichrgram'] = True + net.viz['enrichrgram_lib'] = net.dat['enrichrgram_lib'] + + if 'row_cat_bars' in net.dat: + net.viz['row_cat_bars'] = net.dat['row_cat_bars'] diff --git a/src/celldega/clust/make_sim_mat.py b/src/celldega/clust/make_sim_mat.py new file mode 100644 index 00000000..28384097 --- /dev/null +++ 
b/src/celldega/clust/make_sim_mat.py @@ -0,0 +1,71 @@ +def main(net, inst_dm, which_sim, filter_sim, sim_mat_views=['N_row_sum']): + from .__init__ import Network + from copy import deepcopy + from . import calc_clust + + sim_dict = {} + + for inst_rc in which_sim: + + sim_dict[inst_rc] = dm_to_sim(inst_dm[inst_rc], make_squareform=True, + filter_sim=filter_sim) + + sim_net = {} + + for inst_rc in which_sim: + + sim_net[inst_rc] = deepcopy(Network()) + + sim_net[inst_rc].dat['mat'] = sim_dict[inst_rc] + + sim_net[inst_rc].dat['nodes']['row'] = net.dat['nodes'][inst_rc] + sim_net[inst_rc].dat['nodes']['col'] = net.dat['nodes'][inst_rc] + + sim_net[inst_rc].dat['node_info']['row'] = net.dat['node_info'][inst_rc] + sim_net[inst_rc].dat['node_info']['col'] = net.dat['node_info'][inst_rc] + + calc_clust.cluster_row_and_col(sim_net[inst_rc]) + + all_views = [] + df = sim_net[inst_rc].dat_to_df() + send_df = deepcopy(df) + + sim_net[inst_rc].viz['views'] = all_views + + return sim_net + +def dm_to_sim(inst_dm, make_squareform=False, filter_sim=0): + import numpy as np + from scipy.spatial.distance import squareform + + if make_squareform is True: + inst_dm = squareform(inst_dm) + + inst_sim_mat = 1 - inst_dm + + if filter_sim > 0: + filter_sim = adjust_filter_sim(inst_sim_mat, filter_sim) + inst_sim_mat[ np.abs(inst_sim_mat) < filter_sim] = 0 + + return inst_sim_mat + +def adjust_filter_sim(inst_dm, filter_sim, keep_top=20000): + import pandas as pd + import numpy as np + + inst_df = pd.DataFrame(inst_dm) + val_vect = np.abs(inst_df.values.flatten()) + + val_vect = val_vect[val_vect > 0.01] + + if len(val_vect) > keep_top: + + + inst_series = pd.Series(val_vect) + inst_series.sort_values(ascending=False) + + sort_values = inst_series.values + + filter_sim = sort_values[keep_top] + + return filter_sim \ No newline at end of file diff --git a/src/celldega/clust/make_unique_labels.py b/src/celldega/clust/make_unique_labels.py new file mode 100644 index 00000000..d1f9d6eb --- /dev/null +++ b/src/celldega/clust/make_unique_labels.py @@ -0,0 +1,88 @@ +import pandas as pd + +# make_unique_labels + +def main(net, df=None): + ''' + Run in load_data module (which runs when file is loaded or dataframe is loaded), + check for duplicate row/col names, and add index to names if necesary + ''' + if df is None: + df = net.export_df() + + # rows + ############# + rows = df.index.tolist() + if type(rows[0]) is str: + + if len(rows) != len(list(set(rows))): + print('warning: making row names unique') + new_rows = add_index_list(rows) + df.index = new_rows + + elif type(rows[0]) is tuple: + + row_names = [] + for inst_row in rows: + row_names.append(inst_row[0]) + + if len(row_names) != len(list(set(row_names))): + print('warning: making row names unique') + row_names = add_index_list(row_names) + + # add back to tuple + new_rows = [] + for inst_index in range(len(rows)): + inst_row = rows[inst_index] + new_row = list(inst_row) + new_row[0] = row_names[inst_index] + new_row = tuple(new_row) + new_rows.append(new_row) + + df.index = new_rows + + # cols + ############# + cols = df.columns.tolist() + if type(cols[0]) is str: + + # list column names + if len(cols) != len(list(set(cols))): + print('warning: making col names unique') + new_cols = add_index_list(cols) + df.columns = new_cols + + elif type(cols[0]) is tuple: + + col_names = [] + for inst_col in cols: + col_names.append(inst_col[0]) + + if len(col_names) != len(list(set(col_names))): + print('warning: making col names unique') + col_names = 
add_index_list(col_names) + + # add back to tuple + new_cols = [] + for inst_index in range(len(cols)): + inst_col = cols[inst_index] + new_col = list(inst_col) + new_col[0] = col_names[inst_index] + new_col = tuple(new_col) + new_cols.append(new_col) + + df.columns = new_cols + + # return dataframe with unique names + return df + +def add_index_list(nodes): + + new_nodes = [] + for i in range(len(nodes)): + index = i + 1 + inst_node = nodes[i] + new_node = inst_node + '-' + str(index) + new_nodes.append(new_node) + + return new_nodes diff --git a/src/celldega/clust/make_viz.py b/src/celldega/clust/make_viz.py new file mode 100644 index 00000000..a14d8719 --- /dev/null +++ b/src/celldega/clust/make_viz.py @@ -0,0 +1,72 @@ +def viz_json(net, dendro=True, links=False): + ''' make the dictionary for the clustergram.js visualization ''' + from . import calc_clust + import numpy as np + + # linkage information + net.viz['linkage'] = {} + net.viz['linkage']['row'] = net.dat['node_info']['row']['Y'].tolist() + net.viz['linkage']['col'] = net.dat['node_info']['col']['Y'].tolist() + + # node information + for inst_rc in net.dat['nodes']: + + inst_keys = net.dat['node_info'][inst_rc] + all_cats = [x for x in inst_keys if 'cat-' in x] + + for i in range(len(net.dat['nodes'][inst_rc])): + inst_dict = {} + inst_dict['name'] = net.dat['nodes'][inst_rc][i] + inst_dict['ini'] = net.dat['node_info'][inst_rc]['ini'][i] + inst_dict['clust'] = net.dat['node_info'][inst_rc]['clust'].index(i) + inst_dict['rank'] = net.dat['node_info'][inst_rc]['rank'][i] + + if 'rankvar' in inst_keys: + inst_dict['rankvar'] = net.dat['node_info'][inst_rc]['rankvar'][i] + + # fix for similarity matrix + if len(all_cats) > 0: + + for inst_name_cat in all_cats: + + actual_cat_name = net.dat['node_info'][inst_rc][inst_name_cat][i] + inst_dict[inst_name_cat] = actual_cat_name + + check_pval = 'pval_'+inst_name_cat.replace('-','_') + + if check_pval in net.dat['node_info'][inst_rc]: + tmp_pval_name = inst_name_cat.replace('-','_') + '_pval' + inst_dict[tmp_pval_name] = net.dat['node_info'][inst_rc][check_pval][actual_cat_name] + + tmp_index_name = inst_name_cat.replace('-', '_') + '_index' + + inst_dict[tmp_index_name] = net.dat['node_info'][inst_rc] \ + [tmp_index_name][i] + + + if len(net.dat['node_info'][inst_rc]['value']) > 0: + inst_dict['value'] = net.dat['node_info'][inst_rc]['value'][i] + + if len(net.dat['node_info'][inst_rc]['info']) > 0: + inst_dict['info'] = net.dat['node_info'][inst_rc]['info'][i] + + net.viz[inst_rc + '_nodes'].append(inst_dict) + + # save data as links or mat + ########################### + if links is True: + for i in range(len(net.dat['nodes']['row'])): + for j in range(len(net.dat['nodes']['col'])): + + inst_dict = {} + inst_dict['source'] = i + inst_dict['target'] = j + inst_dict['value'] = float(net.dat['mat'][i, j]) + + if np.isnan(inst_dict['value_orig']): + inst_dict['value_orig'] = 'NaN' + + net.viz['links'].append(inst_dict) + + else: + net.viz['mat'] = net.dat['mat'].tolist() diff --git a/src/celldega/clust/normalize_fun.py b/src/celldega/clust/normalize_fun.py new file mode 100644 index 00000000..8a02d040 --- /dev/null +++ b/src/celldega/clust/normalize_fun.py @@ -0,0 +1,165 @@ +import pandas as pd +import numpy as np +from copy import deepcopy + +def run_norm(net, df=None, norm_type='zscore', axis='row', z_clip=None): + ''' + A dataframe can be passed to run_norm and a normalization will be run ( + e.g. 
zscore) on either the rows or columns + ''' + + if df is None: + df = net.dat_to_df() + + if norm_type == 'zscore': + df, ser_mean, ser_std = zscore_df(df, axis, z_clip=z_clip) + + net.dat['pre_zscore'] = {} + net.dat['pre_zscore']['mean'] = ser_mean.values.tolist() + net.dat['pre_zscore']['std'] = ser_std.values.tolist() + + if norm_type == 'qn': + df = qn_df(df, axis) + + if norm_type == 'umi': + df = umi_norm(df) + + net.df_to_dat(df) + + # if norm_type == 'zscore' and axis == 'row': + # net.dat['pre_zscore'] = {} + # net.dat['pre_zscore']['mean'] = ser_mean + # net.dat['pre_zscore']['std'] = ser_std + +def qn_df(df, axis='row'): + ''' + do quantile normalization of a dataframe dictionary, does not write to net + ''' + # using transpose to do row qn + if axis == 'row': + df = df.transpose() + + missing_values = df.isnull().values.any() + + # make mask of missing values + if missing_values: + + # get nan mask + missing_mask = pd.isnull(df) + + # tmp fill in na with zero, will not affect qn + df = df.fillna(value=0) + + # calc common distribution + common_dist = calc_common_dist(df) + + # swap in common distribution + df = swap_in_common_dist(df, common_dist) + + # swap back in missing values + if missing_values: + df = df.mask(missing_mask, other=np.nan) + + # using transpose to do row qn + if axis == 'row': + df = df.transpose() + + df_qn = df + + return df_qn + +def swap_in_common_dist(df, common_dist): + + col_names = df.columns.tolist() + + qn_arr = np.array([]) + orig_rows = df.index.tolist() + + # loop through each column + for inst_col in col_names: + + # get the sorted list of row names for the given column + tmp_series = deepcopy(df[inst_col]) + tmp_series = tmp_series.sort_values(ascending=False) + sorted_names = tmp_series.index.tolist() + + qn_vect = np.array([]) + for inst_row in orig_rows: + inst_index = sorted_names.index(inst_row) + inst_val = common_dist[inst_index] + qn_vect = np.hstack((qn_vect, inst_val)) + + if qn_arr.shape[0] == 0: + qn_arr = qn_vect + else: + qn_arr = np.vstack((qn_arr, qn_vect)) + + # transpose (because of vstacking) + qn_arr = qn_arr.transpose() + + qn_df = pd.DataFrame(data=qn_arr, columns=col_names, index=orig_rows) + + return qn_df + +def calc_common_dist(df): + ''' + calculate a common distribution (for col qn only) that will be used to qn + ''' + + # axis is col + tmp_arr = np.array([]) + + col_names = df.columns.tolist() + + for inst_col in col_names: + + # sort column + tmp_vect = df[inst_col].sort_values(ascending=False).values + + # stacking rows vertically (will transpose) + if tmp_arr.shape[0] == 0: + tmp_arr = tmp_vect + else: + tmp_arr = np.vstack((tmp_arr, tmp_vect)) + + tmp_arr = tmp_arr.transpose() + + common_dist = tmp_arr.mean(axis=1) + + return common_dist + +def zscore_df(df, axis='row', z_clip=None): + ''' + take the zscore of a dataframe dictionary, does not write to net (self) + ''' + + if axis == 'row': + df = df.transpose() + + ser_mean = df.mean() + ser_std = df.std() + + df_z = (df - ser_mean)/ser_std + + if axis == 'row': + df_z = df_z.transpose() + + if z_clip is not None: + df_z = z_clip_fun(df_z, lower=-z_clip, upper=z_clip) + + return df_z, ser_mean, ser_std + +def umi_norm(df): + # umi norm + barcode_umi_sum = df.sum() + df_umi = df.div(barcode_umi_sum) + return df_umi + + +def z_clip_fun(df_z, lower=None, upper=None): + ''' + Trim values at input thresholds using pandas function + ''' + df_z = df_z.clip(lower=lower, upper=upper) + + return df_z \ No newline at end of file diff --git 
a/src/celldega/clust/proc_df_labels.py b/src/celldega/clust/proc_df_labels.py new file mode 100644 index 00000000..1caed5ca --- /dev/null +++ b/src/celldega/clust/proc_df_labels.py @@ -0,0 +1,62 @@ +def main(df): + ''' + 1) check that rows are strings (in case of numerical names) + 2) check for tuples, and in that case load tuples to categories + ''' + import numpy as np + from ast import literal_eval as make_tuple + + test = {} + test['row'] = df.index.tolist() + test['col'] = df.columns.tolist() + + # if type( test_row ) is not str and type( test_row ) is not tuple: + + found_tuple = {} + found_number = {} + for inst_rc in ['row','col']: + + inst_name = test[inst_rc][0] + + found_tuple[inst_rc] = False + found_number[inst_rc] = False + + if type(inst_name) != tuple: + + if type(inst_name) is int or type(inst_name) is float or type(inst_name) is np.int64: + found_number[inst_rc] = True + + else: + check_open = inst_name[0] + check_comma = inst_name.find(',') + check_close = inst_name[-1] + + if check_open == '(' and check_close == ')' and check_comma > 0 \ + and check_comma < len(inst_name): + found_tuple[inst_rc] = True + + # convert to tuple if necessary + ################################################# + if found_tuple['row']: + row_names = df.index.tolist() + row_names = [make_tuple(x) for x in row_names] + df.index = row_names + + if found_tuple['col']: + col_names = df.columns.tolist() + col_names = [make_tuple(x) for x in col_names] + df.columns = col_names + + # convert numbers to string if necessary + ################################################# + if found_number['row']: + row_names = df.index.tolist() + row_names = [str(x) for x in row_names] + df.index = row_names + + if found_number['col']: + col_names = df.columns.tolist() + col_names = [str(x) for x in col_names] + df.columns = col_names + + return df \ No newline at end of file diff --git a/src/celldega/clust/run_filter.py b/src/celldega/clust/run_filter.py new file mode 100644 index 00000000..1fc506fb --- /dev/null +++ b/src/celldega/clust/run_filter.py @@ -0,0 +1,209 @@ +def df_filter_row_sum(df, threshold, take_abs=True): + ''' filter rows in matrix at some threshold + and remove columns that have a sum below this threshold ''' + + from copy import deepcopy + from .__init__ import Network + net = Network() + + if take_abs is True: + df_copy = deepcopy(df.abs()) + else: + df_copy = deepcopy(df) + + ini_rows = df_copy.index.values.tolist() + df_copy = df_copy.transpose() + tmp_sum = df_copy.sum(axis=0) + tmp_sum = tmp_sum.abs() + tmp_sum.sort_values(inplace=True, ascending=False) + + tmp_sum = tmp_sum[tmp_sum > threshold] + keep_rows = sorted(tmp_sum.index.values.tolist()) + + if len(keep_rows) < len(ini_rows): + df = grab_df_subset(df, keep_rows=keep_rows) + + return df + +def df_filter_col_sum(df, threshold, take_abs=True): + ''' filter columns in matrix at some threshold + and remove rows that have all zero values ''' + + from copy import deepcopy + from .__init__ import Network + net = Network() + + if take_abs is True: + df_copy = deepcopy(df.abs()) + else: + df_copy = deepcopy(df) + + df_copy = df_copy.transpose() + df_copy = df_copy[df_copy.sum(axis=1) > threshold] + df_copy = df_copy.transpose() + df_copy = df_copy[df_copy.sum(axis=1) > 0] + + if take_abs is True: + inst_rows = df_copy.index.tolist() + inst_cols = df_copy.columns.tolist() + df = grab_df_subset(df, inst_rows, inst_cols) + + else: + df = df_copy + + return df + +def grab_df_subset(df, keep_rows='all', keep_cols='all'): + if keep_cols != 'all': 
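+        # select the requested columns by label; rows are subset below via .loc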
+ df = df[keep_cols] + if keep_rows != 'all': + df = df.loc[keep_rows] + return df + +def get_sorted_rows(df, rank_type='sum'): + from copy import deepcopy + + inst_df = deepcopy(df) + inst_df = inst_df.transpose() + + if rank_type == 'sum': + tmp_sum = inst_df.sum(axis=0) + elif rank_type == 'var': + tmp_sum = inst_df.var(axis=0) + + tmp_sum = tmp_sum.abs() + tmp_sum.sort_values(inplace=True, ascending=False) + rows_sorted = tmp_sum.index.values.tolist() + + return rows_sorted + +def filter_N_top(inst_rc, df, N_top, rank_type='sum'): + + if inst_rc == 'col': + for inst_type in df: + df[inst_type] = df[inst_type].transpose() + + rows_sorted = get_sorted_rows(df, rank_type) + + keep_rows = rows_sorted[:N_top] + + df = df.loc[keep_rows] + + if inst_rc == 'col': + for inst_type in df: + df[inst_type] = df[inst_type].transpose() + + return df + +def filter_threshold(df, inst_rc, threshold, num_occur=1): + ''' + Filter a network's rows or cols based on num_occur values being above a + threshold (in absolute_value) + ''' + from copy import deepcopy + + inst_df = deepcopy(df) + + if inst_rc == 'col': + inst_df = inst_df.transpose() + + inst_df = inst_df.abs() + + ini_rows = inst_df.index.values.tolist() + + inst_df[inst_df < threshold] = 0 + inst_df[inst_df >= threshold] = 1 + + tmp_sum = inst_df.sum(axis=1) + + tmp_sum = tmp_sum[tmp_sum >= num_occur] + + keep_names = tmp_sum.index.values.tolist() + + if inst_rc == 'row': + if len(keep_names) < len(ini_rows): + df = grab_df_subset(df, keep_rows=keep_names) + + elif inst_rc == 'col': + inst_df = inst_df.transpose() + + inst_rows = inst_df.index.values.tolist() + inst_cols = keep_names + + df = grab_df_subset(df, inst_rows, inst_cols) + + return df + +def filter_cat(net, axis, cat_index, cat_name): + + try: + df = net.export_df() + + # DataFrame filtering will be run always be run on columns if the user + # wants to filter rows, transpose the matrix before and after + if axis == 'row': + df = df.transpose() + + all_names = df.columns.tolist() + + found_names = [i for i in all_names if i[cat_index] == cat_name] + + if len(found_names) > 0: + df = df[found_names] + + if axis == 'row': + df = df.transpose() + else: + print('no ' + axis + 's were found with this category and filtering was not run') + + net.load_df(df) + + except: + print('category filtering did not run\n check that your category filtering is set up correctly') + + +def filter_names(net, axis, names): + + print('filter_names') + print(names) + + try: + + df = net.export_df() + + # Dataframe filtering will always be run on the columns. If the user wants to filter rows, then it will transpose back and forth. 
+ + if axis == 'row': + df = df.transpose() + + all_names = df.columns.tolist() + + found_names = [] + for inst_name in all_names: + + if type(inst_name) is tuple: + check_name = inst_name[0] + else: + check_name = inst_name + + if ': ' in check_name: + check_name = check_name.split(': ')[1] + + if check_name in names: + found_names.append(inst_name) + + if len(found_names) > 0: + df = df[found_names] + + if axis == 'row': + df = df.transpose() + + net.load_df(df) + + else: + print('no ' + axis + 's were found with these names') + + except: + print('error in filtering names') + + print(found_names) \ No newline at end of file From fbe32fbbf19d29b6489fcf9cfaeba6571e7092af Mon Sep 17 00:00:00 2001 From: Nicolas Fernandez Date: Tue, 7 Jan 2025 17:01:01 -0500 Subject: [PATCH 2/6] working on clust api for hierarchical clustering --- src/celldega/clust/__init__.py | 46 ++++++++++++++++++++++++++++++++-- src/celldega/viz/widget.py | 2 ++ 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/src/celldega/clust/__init__.py b/src/celldega/clust/__init__.py index 28550be1..d768760a 100644 --- a/src/celldega/clust/__init__.py +++ b/src/celldega/clust/__init__.py @@ -1,3 +1,30 @@ +# The Celldega Matrix Vizualization Method is being built using the approaches +# and code adaptations from the Clustergrammer-GL library, which is available at +# github.com/ismms-himc/clustergrammer2 +# and being used under the license +# +# MIT License + +# Copyright (c) 2021 Nicolas Fernandez + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + import numpy as np import pandas as pd from copy import deepcopy @@ -26,10 +53,25 @@ import ipywidgets as widgets import statsmodels.stats.multitest as smm -def cluster(): +def hc(df, filter_N_top=None, norm_col='total', norm_row='zscore'): net = Network() - print('**********') + net.load_df(df) + + if filter_N_top is not None: + net.filter_N_top(axis='row', N_top=5000) + + if norm_col == 'total': + net.normalize(axis='col', norm_type='umi') + + if norm_row == 'zscore': + net.normalize(axis='row', norm_type='zscore') + + net.cluster() + + network = net.viz + + return network class Network(object): ''' diff --git a/src/celldega/viz/widget.py b/src/celldega/viz/widget.py index bef366c7..e96d4bed 100644 --- a/src/celldega/viz/widget.py +++ b/src/celldega/viz/widget.py @@ -82,12 +82,14 @@ class Matrix(anywidget.AnyWidget): Returns: Matrix: A widget for visualizing a hierarchically clustered matrix. 
""" + _esm = pathlib.Path(__file__).parent / "../static" / "widget.js" _css = pathlib.Path(__file__).parent / "../static" / "widget.css" value = traitlets.Int(0).tag(sync=True) component = traitlets.Unicode("Matrix").tag(sync=True) network = traitlets.Dict({}).tag(sync=True) + width = traitlets.Int(600).tag(sync=True) height = traitlets.Int(600).tag(sync=True) click_info = traitlets.Dict({}).tag(sync=True) From aafb2df9fa0a489901cebbc30651affd33e47bed Mon Sep 17 00:00:00 2001 From: Nicolas Fernandez Date: Wed, 8 Jan 2025 13:45:47 -0500 Subject: [PATCH 3/6] quick fix for tooltip cutoff in matrix and ist landscape --- js/deck-gl/make_tooltip.js | 16 +++++++++++++--- js/deck-gl/matrix/matrix_tooltip.js | 13 +++++++++++-- js/viz/landscape_ist.js | 2 ++ js/viz/matrix_viz.js | 10 ++++++++++ 4 files changed, 36 insertions(+), 5 deletions(-) diff --git a/js/deck-gl/make_tooltip.js b/js/deck-gl/make_tooltip.js index bbb9f4fe..46267dbc 100644 --- a/js/deck-gl/make_tooltip.js +++ b/js/deck-gl/make_tooltip.js @@ -8,7 +8,9 @@ export const make_tooltip = (viz_state, info) => { let inst_name = '' let inst_cat = '' - console.log(info.layer.id) + // console.log(info.layer.id) + + if (info.layer.id.startsWith('cell-layer') || info.layer.id.startsWith('path-layer')) { inst_name = info.layer.id.startsWith('cell-layer') ? viz_state.cats.cell_names_array[info.index] : viz_state.cats.polygon_cell_names[info.index] @@ -26,8 +28,16 @@ export const make_tooltip = (viz_state, info) => { inst_html = `
neighborhood: ${inst_name}
cluster: ${inst_cat}
`
         }
 
-    d3.selectAll('.deck-tooltip')
-        .style('margin-top', '75px')
+    // d3.selectAll('.deck-tooltip')
+    //     .style('margin-top', '75px')
+
+    // console.log(viz_state.root)
+
+    // find .deck-tooltip within viz_state.root and un-position its grandparent to avoid clipping
+    const tooltipContainer = viz_state.root.querySelector('.deck-tooltip');
+    tooltipContainer.style.marginTop = '50px'
+    const tooltipParent = tooltipContainer.parentElement.parentElement;
+    tooltipParent.style.position = 'unset'
 
     return {
         html: inst_html,
diff --git a/js/deck-gl/matrix/matrix_tooltip.js b/js/deck-gl/matrix/matrix_tooltip.js
index 3100939f..dad2f1be 100644
--- a/js/deck-gl/matrix/matrix_tooltip.js
+++ b/js/deck-gl/matrix/matrix_tooltip.js
@@ -4,8 +4,17 @@ export const get_tooltip = (viz_state, params) => {
 
     const {object, layer} = params;
 
-    d3.selectAll('.deck-tooltip')
-        .style('margin-top', '50px')
+    // d3.selectAll('.deck-tooltip')
+    //     .style('margin-top', '50px')
+
+    console.log('get_tooltip')
+    // find .deck-tooltip within viz_state.root and un-position its grandparent to avoid clipping
+    const tooltipContainer = viz_state.root.querySelector('.deck-tooltip');
+    const tooltipParent = tooltipContainer.parentElement.parentElement;
+    tooltipParent.style.position = 'unset'
+
+    tooltipContainer.style.marginTop = '50px'
+
 
     if (object) {
 
         // Check which layer the tooltip is currently over
diff --git a/js/viz/landscape_ist.js b/js/viz/landscape_ist.js
index 69da0fdb..e4fba98d 100644
--- a/js/viz/landscape_ist.js
+++ b/js/viz/landscape_ist.js
@@ -51,6 +51,8 @@ export const landscape_ist = async (
 
     let viz_state = {}
 
+    viz_state.root = el
+
     viz_state.buttons = {}
     viz_state.buttons.blue = '#8797ff'
     viz_state.buttons.gray = 'gray'
diff --git a/js/viz/matrix_viz.js b/js/viz/matrix_viz.js
index e577ae5f..aaab9214 100644
--- a/js/viz/matrix_viz.js
+++ b/js/viz/matrix_viz.js
@@ -128,4 +128,14 @@ export const matrix_viz = async (
 
     el.appendChild(viz_state.root)
 
+    const tooltipContainer = viz_state.root.querySelector('.deck-tooltip');
+
+
+    console.log(viz_state.root)
+    console.log(tooltipContainer)
+
+    // select canvas element within viz_state.root
+    const canvas = viz_state.root.querySelector('canvas')
+    console.log(canvas)
+
 }
\ No newline at end of file

From 68591f785d2bef36fd179dd2d8e6919193748584 Mon Sep 17 00:00:00 2001
From: Nicolas Fernandez
Date: Wed, 8 Jan 2025 14:11:57 -0500
Subject: [PATCH 4/6] added link to example from within docstring

---
 docs/python/viz/api.md              |  2 +-
 js/deck-gl/make_tooltip.js          |  4 ----
 js/deck-gl/matrix/matrix_tooltip.js |  8 +-------
 mkdocs.yaml                         |  1 +
 src/celldega/viz/__init__.py        | 33 +++++++++++++++++++++++++++++-
 5 files changed, 35 insertions(+), 13 deletions(-)

diff --git a/docs/python/viz/api.md b/docs/python/viz/api.md
index 8220848a..4ea1745c 100644
--- a/docs/python/viz/api.md
+++ b/docs/python/viz/api.md
@@ -2,5 +2,5 @@
 
 ## Widget Classes
 
-::: celldega.viz.widget
+::: celldega.viz
 
diff --git a/js/deck-gl/make_tooltip.js b/js/deck-gl/make_tooltip.js
index 46267dbc..ae69f5a8 100644
--- a/js/deck-gl/make_tooltip.js
+++ b/js/deck-gl/make_tooltip.js
@@ -8,10 +8,6 @@ export const make_tooltip = (viz_state, info) => {
     let inst_name = ''
     let inst_cat = ''
 
-    // console.log(info.layer.id)
-
-
     if (info.layer.id.startsWith('cell-layer') || info.layer.id.startsWith('path-layer')) {
 
         inst_name = info.layer.id.startsWith('cell-layer') ?
            viz_state.cats.cell_names_array[info.index] : viz_state.cats.polygon_cell_names[info.index]
 
         inst_cat = viz_state.cats.dict_cell_cats[inst_name]
 
diff --git a/js/deck-gl/matrix/matrix_tooltip.js b/js/deck-gl/matrix/matrix_tooltip.js
index dad2f1be..2c506a31 100644
--- a/js/deck-gl/matrix/matrix_tooltip.js
+++ b/js/deck-gl/matrix/matrix_tooltip.js
@@ -4,18 +4,12 @@ export const get_tooltip = (viz_state, params) => {
 
     const {object, layer} = params;
 
-    // d3.selectAll('.deck-tooltip')
-    //     .style('margin-top', '50px')
-
-    console.log('get_tooltip')
     // find .deck-tooltip within viz_state.root and un-position its grandparent to avoid clipping
     const tooltipContainer = viz_state.root.querySelector('.deck-tooltip');
+    tooltipContainer.style.marginTop = '50px'
     const tooltipParent = tooltipContainer.parentElement.parentElement;
     tooltipParent.style.position = 'unset'
 
-    tooltipContainer.style.marginTop = '50px'
-
-
     if (object) {
 
         // Check which layer the tooltip is currently over
         if (layer.id === 'row-label-layer') {
diff --git a/mkdocs.yaml b/mkdocs.yaml
index f7ed281d..f9a309de 100644
--- a/mkdocs.yaml
+++ b/mkdocs.yaml
@@ -39,6 +39,7 @@ nav:
       - examples/index.md
       - Brief Notebooks:
           - Landscape View Xenium: examples/brief_notebooks/Landscape_View_Xenium.ipynb
+          - Landscape-Matrix View Xenium: examples/brief_notebooks/Landscape-Matrix_Xenium.ipynb
          - Pre-Process LandscapeFiles: examples/brief_notebooks/Pre-process_Xenium_V1_human_Pancreas_FFPE_outs.ipynb
   # - Tutorials:
   #     - examples/Landscape_View_Xenium.ipynb
diff --git a/src/celldega/viz/__init__.py b/src/celldega/viz/__init__.py
index 17e406d9..86ee1ee2 100644
--- a/src/celldega/viz/__init__.py
+++ b/src/celldega/viz/__init__.py
@@ -3,5 +3,36 @@
 """
 
 from .widget import Landscape, Matrix
+from ipywidgets import jslink, HBox, Layout
+from IPython.display import display
 
-__all__ = ["Landscape", "Matrix"]
+def landscape_matrix(landscape, mat, width='600px', height='700px'):
+    """
+    Display a `Landscape` widget and a `Matrix` widget side by side.
+
+    Args:
+        landscape (Landscape): A `Landscape` widget.
+        mat (Matrix): A `Matrix` widget.
+        width (str): The width of each widget.
+        height (str): The height of the `Landscape` widget.
+
+    Returns:
+        None: The linked widgets are displayed side by side with `IPython.display.display`.
+
+    Example:
+        See the [Landscape-Matrix_Xenium](../../../examples/brief_notebooks/Landscape-Matrix_Xenium) example notebook.
+    """
+
+    # Use `jslink` to directly link `click_info` from `mat` to `update_trigger` on `landscape`
+    jslink((mat, 'click_info'), (landscape, 'update_trigger'))
+
+    # Set layouts for the widgets
+    mat.layout = Layout(width=width)
+    landscape.layout = Layout(width=width, height=height)
+
+    # Display widgets side by side
+    widgets_side_by_side = HBox([landscape, mat])
+
+    display(widgets_side_by_side)
+
+__all__ = ["Landscape", "Matrix", "landscape_matrix"]

From 1d17729a397033be53c5457338adcaa4edae8a63 Mon Sep 17 00:00:00 2001
From: Nicolas Fernandez
Date: Wed, 8 Jan 2025 14:45:49 -0500
Subject: [PATCH 5/6] cleaned documentation for main modules

---
 docs/python/clust/api.md       |  3 +++
 docs/python/index.md           | 10 +++++-----
 docs/python/nbhd/api.md        |  2 --
 mkdocs.yaml                    |  2 ++
 pyproject.toml                 |  1 -
 src/celldega/clust/__init__.py | 25 +++++++++++++++++++++++++
 src/celldega/nbhd/__init__.py  |  4 ++++
 7 files changed, 39 insertions(+), 8 deletions(-)
 create mode 100644 docs/python/clust/api.md

diff --git a/docs/python/clust/api.md b/docs/python/clust/api.md
new file mode 100644
index 00000000..32f61dc7
--- /dev/null
+++ b/docs/python/clust/api.md
@@ -0,0 +1,3 @@
+# Clust Module API Reference
+
+::: celldega.clust
\ No newline at end of file
diff --git a/docs/python/index.md b/docs/python/index.md
index 0d76eee1..ca85bedb 100644
--- a/docs/python/index.md
+++ b/docs/python/index.md
@@ -1,13 +1,13 @@
 # Python API Overview
 
-## [Neighborhood Overview](nbhd/api)
-The `nbhd` module contains methods for calculating tissue neighborhoods.
-
 ## [Pre Module Overview](pre/api)
-
 The `pre` module contains methods for pre-processing LandscapeFiles.
 
+## [Clust Module Overview](clust/api)
+The `clust` module contains methods for clustering high-dimensional data.
 
-## [Viz Module Overview](viz/api)
+## [Neighborhood Overview](nbhd/api)
+The `nbhd` module contains methods for calculating tissue neighborhoods.
 
+## [Viz Module Overview](viz/api)
 The `viz` module contains functions and classes for data visualization.
diff --git a/docs/python/nbhd/api.md b/docs/python/nbhd/api.md
index f738d069..2e846793 100644
--- a/docs/python/nbhd/api.md
+++ b/docs/python/nbhd/api.md
@@ -1,5 +1,3 @@
 # Neighborhood Module API Reference
 
-
-
 ::: celldega.nbhd
\ No newline at end of file
diff --git a/mkdocs.yaml b/mkdocs.yaml
index f9a309de..1ca6237e 100644
--- a/mkdocs.yaml
+++ b/mkdocs.yaml
@@ -22,6 +22,8 @@ nav:
   - Python API:
       - python/index.md
       - pre: python/pre/api.md
+      - clust: python/clust/api.md
+      - nbhd: python/nbhd/api.md
       - viz: python/viz/api.md
 
   - JavaScript API:
diff --git a/pyproject.toml b/pyproject.toml
index c2bf049f..c2a82a0f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,7 +17,6 @@ dependencies = [
     "imagecodecs~=2024.1.1",
     "scanpy~=1.10.2",
     "squidpy~=1.5.0",
-    "clustergrammer2==0.18.0",
     "shapely~=2.0.5",
     "polars~=1.10.0",
     "libpysal~=4.8.1"
diff --git a/src/celldega/clust/__init__.py b/src/celldega/clust/__init__.py
index d768760a..a9b5b2f8 100644
--- a/src/celldega/clust/__init__.py
+++ b/src/celldega/clust/__init__.py
@@ -1,3 +1,7 @@
+"""
+Module for clustering high-dimensional data.
+"""
+
 # The Celldega Matrix Visualization Method is being built using the approaches
 # and code adaptations from the Clustergrammer-GL library, which is available at
 # github.com/ismms-himc/clustergrammer2
@@ -54,6 +58,27 @@ import statsmodels.stats.multitest as smm
 
 def hc(df, filter_N_top=None, norm_col='total', norm_row='zscore'):
+
+    """
+    Perform hierarchical clustering on the rows and columns of a DataFrame
+    and return the visualization JSON for the Celldega Matrix widget.
+    This function adapts approaches and code from the
+    Clustergrammer2 project.
+
+    Args:
+        df (pd.DataFrame): The matrix to cluster.
+        filter_N_top (int): If not None, keep only the top N rows when filtering.
+        norm_col (str): The type of normalization to apply to the columns.
+        norm_row (str): The type of normalization to apply to the rows.
+
+    Returns:
+        dict: The visualization JSON for the Celldega Matrix widget.
+
+
+    Example:
+        See the [Landscape-Matrix_Xenium](../../../examples/brief_notebooks/Landscape-Matrix_Xenium) example notebook.
+    """
+
     net = Network()
 
     net.load_df(df)
diff --git a/src/celldega/nbhd/__init__.py b/src/celldega/nbhd/__init__.py
index 3b818674..26f9c420 100644
--- a/src/celldega/nbhd/__init__.py
+++ b/src/celldega/nbhd/__init__.py
@@ -1,3 +1,7 @@
+"""
+Module for performing neighborhood analysis.
+"""
+
 from libpysal.cg import alpha_shape as libpysal_alpha_shape
 import geopandas as gpd
 from shapely import Point, MultiPoint, MultiPolygon

From b24b6feffd99c144815719ca376a4bfd21523f21 Mon Sep 17 00:00:00 2001
From: Nicolas Fernandez
Date: Wed, 8 Jan 2025 14:55:42 -0500
Subject: [PATCH 6/6] removed unnecessary changes

---
 js/viz/matrix_viz.js | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/js/viz/matrix_viz.js b/js/viz/matrix_viz.js
index aaab9214..e577ae5f 100644
--- a/js/viz/matrix_viz.js
+++ b/js/viz/matrix_viz.js
@@ -128,14 +128,4 @@ export const matrix_viz = async (
 
     el.appendChild(viz_state.root)
 
-    const tooltipContainer = viz_state.root.querySelector('.deck-tooltip');
-
-
-    console.log(viz_state.root)
-    console.log(tooltipContainer)
-
-    // select canvas element within viz_state.root
-    const canvas = viz_state.root.querySelector('canvas')
-    console.log(canvas)
-
 }
\ No newline at end of file
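
To make the new API surface concrete, below is a minimal end-to-end sketch of how the pieces introduced in this series fit together: `hc` from `celldega.clust` produces the visualization JSON, the `Matrix` widget syncs it through its `network` traitlet, and `landscape_matrix` links and displays the two widgets. The toy DataFrame and the argument-less `Landscape()` call are hypothetical placeholders, not part of the patch; real usage loads a dataset as shown in the Landscape-Matrix_Xenium notebook.

    import pandas as pd

    from celldega.clust import hc
    from celldega.viz import Landscape, Matrix, landscape_matrix

    # toy gene-by-cell count matrix (placeholder data, any numeric DataFrame works)
    df = pd.DataFrame(
        [[5, 0, 2], [1, 3, 0], [0, 4, 4]],
        index=['gene-A', 'gene-B', 'gene-C'],
        columns=['cell-1', 'cell-2', 'cell-3'],
    )

    # total-count normalize columns, z-score rows, hierarchically cluster,
    # and return the visualization JSON
    network = hc(df)

    # pass the visualization JSON to the Matrix widget's `network` traitlet
    mat = Matrix(network=network)

    # Landscape construction is dataset-specific; arguments are omitted here
    landscape = Landscape()

    # jslink `click_info` -> `update_trigger` and display the widgets side by side
    landscape_matrix(landscape, mat)

Passing the dict returned by `hc` straight into `Matrix(network=...)` relies on standard ipywidgets trait-kwarg construction; `landscape_matrix` then wires clicks in the matrix to updates in the landscape without any Python round trip, since `jslink` runs entirely in the front end.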