making consensus biclusters based on ARI instead of Jaccard

ozolotareva · Oct 7, 2024 · 6ee609f · 6ee609f
1 parent 8864dd9
commit 6ee609f
Showing 1 changed file with 99 additions and 12 deletions.
diff --git a/unpast/utils/consensus.py b/unpast/utils/consensus.py
@@ -280,17 +280,19 @@ def make_consensus_biclusters(
         )
     return pval
 
+#### experimental #### 
+
 def calc_signif_bicluster_similarities(
     biclusters_dict,
     exprs,
+    similarity_measure = "ARI", #"Jaccard"  or "ARI"
     similarity="both",
     adj_pval_thr=0.05,
     plot=True,
     figsize=(10, 10),
     labels=False,
     colorbar_off=True,
 ):
-
     """ WILL REPLACE find_best_matching_biclusters(), currently is not used.
     Calculates Jaccard similarities of significanly overlapping bicluster paires based on gene and/or sample overlaps, and optionally visualize the results.
 
@@ -309,6 +311,9 @@ def calc_signif_bicluster_similarities(
             - "samples": only compute similarities based on sample overlaps.
             - "both": compute similarities based on both gene and sample overlaps.
 
+        similarity_measure : str, optional, default="ARI"
+            Adjusted Rand index or Jaccard similarity. If genes are accounted in similarity computations, Jaccard similarity will be used.
+
         adj_pval_thr : float, optional, default=0.05
             Adjusted p-value threshold for significance testing. Similarity values will be stored only for bicluster pairs which significantly overlap. Statistical significance of the overlaps is tested using the chi-squared test.
 
@@ -330,34 +335,52 @@ def calc_signif_bicluster_similarities(
             A DataFrame containing pairwise Jaccard similarity values between biclusters, adjusted for statistical significance.
     """ 
 
+    if similarity_measure == "ARI":
+        from sklearn.metrics import adjusted_rand_score
+        all_samples = sorted(exprs.columns.values)
+        if not similarity=="samples":
+            print("Jaccard similarity is used instead of ARI when 'similarity=%s'"%similarity,file=sys.stderr)
+            similarity_measure = "Jaccard"
+
+
+
     if similarity not in ["genes", "samples", "both"]:
         print(
             "Similarity must be 'genes','samples','both'. Set to 'both'.",
             file=sys.stderr,
         )
         similarity = "both"
 
-    J_heatmap = {}
+    sim_heatmap = {}
     s = set(exprs.columns.values)
     g = set(exprs.index.values)
     N_bics = len(biclusters_dict.keys())
-    ### plot pairwise comparisons:
+    ### do pairwise comparisons:
     for bic_n1 in biclusters_dict.keys():
         b1 = biclusters_dict[bic_n1]
         g1 = b1["genes"]
         s1 = b1["samples"]
-        J_heatmap[bic_n1] = {}
+        if similarity_measure == "ARI":
+            sb1 = pd.Series(np.zeros(exprs.shape[1]),index = all_samples)
+            sb1[s1] = 1
+        sim_heatmap[bic_n1] = {}
         for bic_n2 in biclusters_dict.keys():
             b2 = biclusters_dict[bic_n2]
             g2 = b2["genes"]
             s2 = b2["samples"]
+            if similarity_measure == "ARI":
+                sb2 = pd.Series(np.zeros(exprs.shape[1]),index = all_samples)
+                sb2[s2] = 1
             g_overlap = g1.intersection(g2)
             g_union = g1.union(g2)
             s_overlap = s1.intersection(s2)
             s_union = s1.union(s2)
             J_g = len(g_overlap) / len(g_union)
-            J_s = len(s_overlap) / len(s_union)
-            J_heatmap[bic_n1][bic_n2] = 0
+            if similarity_measure == "ARI":
+                J_s = max(0,adjusted_rand_score(sb1,sb2))
+            else:
+                J_s = len(s_overlap) / len(s_union)
+            sim_heatmap[bic_n1][bic_n2] = 0
 
             # significance of gene overlap
             if similarity != "samples":
@@ -386,10 +409,10 @@ def calc_signif_bicluster_similarities(
 
             if similarity == "genes":
                 if p_g * (N_bics - 1) * N_bics / 2 < adj_pval_thr:
-                    J_heatmap[bic_n1][bic_n2] = J_g
+                    sim_heatmap[bic_n1][bic_n2] = J_g
             elif similarity == "samples":
                 if p_s * (N_bics - 1) * N_bics / 2 < adj_pval_thr:
-                    J_heatmap[bic_n1][bic_n2] = J_s
+                    sim_heatmap[bic_n1][bic_n2] = J_s
             elif similarity == "both":  # both genes and samples considered
                 # consider significant overlaps in g and s and save max. J
                 if (
@@ -399,15 +422,15 @@ def calc_signif_bicluster_similarities(
                     p_s * (N_bics - 1) * N_bics / 2 < adj_pval_thr
                     and len(g_overlap) > 0
                 ):
-                    J_heatmap[bic_n1][bic_n2] = max(J_s, J_g)
+                    sim_heatmap[bic_n1][bic_n2] = max(J_s, J_g)
 
-    J_heatmap = pd.DataFrame.from_dict(J_heatmap)
+    sim_heatmap = pd.DataFrame.from_dict(sim_heatmap)
 
     if plot:
         import seaborn as sns
 
         g = sns.clustermap(
-            J_heatmap,
+            sim_heatmap,
             yticklabels=labels,
             xticklabels=labels,
             linewidths=0,
@@ -423,5 +446,69 @@ def calc_signif_bicluster_similarities(
             g.cax.set_visible(False)
         plt.show()
 
-    return J_heatmap
+
+def make_consensus_biclusters_ari(biclusters_dict, 
+     data,
+     similarity_measure = "ARI",  # ARI or Jaccard
+     min_similarity=0.8, 
+     min_n_samples=5, 
+     seed=42, 
+     method="kmeans", 
+     verbose=True
+):
+    """WILL REPLACE make_consensus_biclusters() """
+    bic_similarity = calc_signif_bicluster_similarities(
+                    biclusters_dict,
+                    data,
+                    similarity_measure = similarity_measure,
+                    similarity="samples",
+                    adj_pval_thr=0.05,
+                    plot=False,#plot=True,
+                    figsize=(10, 10),
+                    labels=False,
+                    colorbar_off=True )
+
+    # find groups of biclusters including the same sample sets
+    merged, not_merged, similarity_cutoff = run_Louvain(
+            bic_similarity,
+            similarity_cutoffs=np.arange(min_similarity, 0.95, 0.05),
+            m=False,
+            verbose=verbose,
+            plot=False,
+            modularity_measure="potts"
+        )
+
+    if len(merged) == 0 and verbose:
+        print("No biclusters to merge", file=sys.stdout)
+        return biclusters_dict
+
+    merged_biclusters = {}
+    # add biclusters with no changes
+    for bic_id in not_merged:
+        merged_biclusters[bic_id] = biclusters_dict[bic_id]
+
+    # merge biclusters with overlapping sample sets
+    for bic_group in merged:
+        bic_group = sorted(bic_group)
+        if verbose:
+            print("merged biclustres", bic_group, file=sys.stderr)
+        new_bicluster = biclusters_dict[bic_group[0]]
+        # update genes
+        for bic_id in bic_group[1:]:
+            bic2 = biclusters_dict[bic_id]
+            new_bicluster["genes"] = new_bicluster["genes"] | bic2["genes"]
+            new_bicluster["n_genes"] = len(new_bicluster["genes"])
+        # update sample set for new bicluster
+        # cluster samples in a space of selected genes
+        new_bicluster.update(
+            cluster_samples(
+                data.loc[list(new_bicluster["genes"]), :].T,
+                min_n_samples=min_n_samples,
+                seed=seed,
+                method=method,
+            )
+        )
+        new_bicluster["n_samples"] = len(new_bicluster["sample_indexes"])
+        merged_biclusters[bic_group[0]] = new_bicluster
+    return merged_biclusters