Run func renamed #7

Merged 3 commits on Oct 7, 2024
Changes from all commits

README.md (1 addition, 1 deletion)
@@ -79,7 +79,7 @@ python -m unpast.run_unpast --exprs unpast/tests/scenario_B500.exprs.tsv.gz --ba
python run_unpast.py -h
```
* Real data example. Analysis of a subset of 200 samples randomly chosen from the TCGA-BRCA dataset, including consensus biclustering and visualization:
[jupyter-notebook](https://github.com/ozolotareva/unpast/blob/main/notebooks/UnPaSt_examples.ipynb).
[jupyter-notebook](https://github.com/ozolotareva/unpast_paper/blob/main/notebooks/UnPaSt_examples.ipynb).

## Outputs
`<basename>.[parameters].biclusters.tsv` - A `.tsv` file containing the identified biclusters with the following structure:
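The `Outputs` section above describes the biclusters table. As a minimal sketch, such a table can be loaded with the project's own reader, `read_bic_table` from `unpast.utils.io`, which the notebooks in this PR import the same way (the result file name below is illustrative):

```
# Minimal sketch: load an UnPaSt biclusters table for downstream analysis.
# The file name is illustrative; read_bic_table is the project's own reader.
from unpast.utils.io import read_bic_table

biclusters = read_bic_table("results/TCGA.kmeans.biclusters.tsv")
print(biclusters.shape)  # one row per identified bicluster
print(biclusters.head())
```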
notebooks/UnPaSt_examples.ipynb (408 additions, 120 deletions)

Large diffs are not rendered by default.

notebooks/consensus.ipynb (4 additions, 7 deletions)
@@ -7,19 +7,16 @@
"metadata": {},
"outputs": [],
"source": [
"import os, sys\n",
"import pandas as pd\n",
"import numpy as np\n",
"import random\n",
"\n",
"import sys\n",
"sys.path.insert(0, '..')\n",
"\n",
"from unpast.utils.method import zscore, prepare_input_matrix\n",
"from unpast.utils.io import read_bic_table, write_bic_table\n",
"from unpast.utils.consensus import make_consensus_biclusters\n",
"\n",
"import matplotlib.pyplot as plt\n",
"\n",
"import sys\n",
"sys.path.insert(0, '..')\n",
"%matplotlib inline"
]
},
@@ -206,7 +203,7 @@
"outputs": [],
"source": [
"from unpast.utils.eval import make_ref_groups\n",
"from unpast.utils.eval import calculate_perfromance, compare_gene_clusters\n",
"from unpast.utils.eval import calculate_perfromance\n",
"# TCGA\n",
"t_subtypes = pd.read_csv(\"data/preprocessed_v6/TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.subtypes_and_signatures_v6.tsv\",sep = \"\\t\",index_col=0)\n",
"t_annotation = pd.read_csv(\"data/preprocessed_v6/TCGA-BRCA_1079.Xena_TCGA_PanCan.annotation_v6.tsv\",sep = \"\\t\",index_col=0)\n",
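Reassembled from the hunk above, the reworked import cell of `consensus.ipynb` reads roughly as follows (a reconstruction, assuming the notebook runs one directory below the repo root):

```
import pandas as pd
import numpy as np
import random

import sys
sys.path.insert(0, '..')  # make the in-repo unpast package importable

from unpast.utils.method import zscore, prepare_input_matrix
from unpast.utils.io import read_bic_table, write_bic_table
from unpast.utils.consensus import make_consensus_biclusters

import matplotlib.pyplot as plt
%matplotlib inline
```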
notebooks/simulated_data.ipynb (20 additions, 22 deletions)

Large diffs are not rendered by default.

paper/evaluation/clusterings_evaluation_on_real_data.ipynb (19 additions, 21 deletions)
@@ -9,19 +9,18 @@
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import sys,os\n",
"import os\n",
"import random\n",
"import copy\n",
"from time import time\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"import sys\n",
"sys.path.insert(0, '../..')\n",
"\n",
"from utils.method import read_bic_table\n",
"\n",
"from utils.eval import make_ref_groups\n",
"from utils.eval import calculate_perfromance, compare_gene_clusters"
"from unpast.utils.io import read_bic_table\n",
"\n",
"from unpast.utils.eval import make_ref_groups\n",
"from unpast.utils.eval import calculate_perfromance"
]
},
{
@@ -4087,30 +4086,29 @@
},
{
"cell_type": "code",
"execution_count": 43,
"execution_count": 1,
"id": "chicken-champion",
"metadata": {},
"outputs": [
{
"ename": "ImportError",
"evalue": "cannot import name 'HDBSCAN' from 'sklearn.cluster' (/home/olya/anaconda3/lib/python3.8/site-packages/sklearn/cluster/__init__.py)",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-43-3359491e4cd4>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcluster\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mHDBSCAN\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mmin_samples\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;31m#[3,5,10,20]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mcluster_selection_epsilon\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;36m0.0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m150\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0malphas\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;36m1.0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;31m#[1.0,0.5,1.5]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mImportError\u001b[0m: cannot import name 'HDBSCAN' from 'sklearn.cluster' (/home/olya/anaconda3/lib/python3.8/site-packages/sklearn/cluster/__init__.py)"
]
"data": {
"text/plain": [
"'from sklearn.cluster import HDBSCAN\\n\\nmin_samples = [5]#[3,5,10,20]\\ncluster_selection_epsilon = [0.0,150]\\nalphas = [1.0]#[1.0,0.5,1.5]\\n\\nlen(min_samples)*len(alphas)'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.cluster import HDBSCAN\n",
"\"\"\"from sklearn.cluster import HDBSCAN\n",
"\n",
"min_samples = [5]#[3,5,10,20]\n",
"cluster_selection_epsilon = [0.0,150]\n",
"alphas = [1.0]#[1.0,0.5,1.5]\n",
"\n",
"len(min_samples)*len(alphas)"
"len(min_samples)*len(alphas)\"\"\""
]
},
{
@@ -14113,7 +14111,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
"version": "3.11.5"
}
},
"nbformat": 4,
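The cell above originally died with an `ImportError` because the recorded environment's scikit-learn predates version 1.3, which introduced `sklearn.cluster.HDBSCAN`; this PR keeps the code as an inert string rather than deleting it. A sketch of the same cell that runs under scikit-learn >= 1.3 (on older versions the standalone `hdbscan` package is the usual fallback):

```
# Runnable with scikit-learn >= 1.3, where sklearn.cluster.HDBSCAN was added.
from sklearn.cluster import HDBSCAN

min_samples = [5]                       # e.g. [3, 5, 10, 20]
cluster_selection_epsilon = [0.0, 150]
alphas = [1.0]                          # e.g. [1.0, 0.5, 1.5]

# Size of the (min_samples, alpha) grid to evaluate.
print(len(min_samples) * len(alphas))
```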
paper/evaluation/evaluation_on_real_data.ipynb (11 additions, 14 deletions)
@@ -2,26 +2,23 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 5,
"id": "sticky-quality",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import sys,os\n",
"import sys\n",
"import random\n",
"import copy\n",
"from time import time\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"import sys\n",
"sys.path.insert(0, '../..')\n",
"\n",
"from utils.method import read_bic_table\n",
"from unpast.utils.io import read_bic_table\n",
"\n",
"from utils.eval import make_ref_groups\n",
"from utils.eval import calculate_perfromance, compare_gene_clusters"
"from unpast.utils.eval import make_ref_groups\n",
"from unpast.utils.eval import calculate_perfromance, compare_gene_clusters"
]
},
{
@@ -231,8 +228,8 @@
"subt_m = [] # Perfoemances for METABRIC\n",
"clustering_similarities = [] # Similarities of gene clusters found in TCGA and METABRIC\n",
"\n",
"from unpast.run_unpast import unpast\n",
"# UnPaSt parameters \n",
"from run_unpast import run\n",
"rpath=\"/home/olya/anaconda3/envs/r4_env/bin/\"\n",
"out_dir= \"results_on_real_data_WGCNA2/\"\n",
"basename_t = \"TCGA\"\n",
@@ -402,7 +399,7 @@
" result_t = read_bic_table(fname)\n",
" except:\n",
" print(\"not found\")\n",
" \"\"\"result_t = run(exprs_file_t, basename_t, out_dir=out_dir,\n",
" \"\"\"result_t = unpast(exprs_file_t, basename_t, out_dir=out_dir,\n",
" save=True, load = True,\n",
" min_n_samples = 5,\n",
" bin_method = bin_method, pval = pval,\n",
@@ -437,7 +434,7 @@
" result_m = read_bic_table(fname)\n",
" except:\n",
" print(fname)\n",
" \"\"\"result_m = run(exprs_file_m, basename_m, out_dir=out_dir,\n",
" \"\"\"result_m = unpast(exprs_file_m, basename_m, out_dir=out_dir,\n",
" save=True, load = True,\n",
" min_n_samples = 5,\n",
" bin_method = bin_method, pval = pval,\n",
@@ -628,7 +625,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.11.5"
}
},
"nbformat": 4,
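The hunks above switch this notebook to a load-or-recompute pattern around the renamed entry point: read a cached bicluster table if one exists, otherwise call `unpast` directly. In outline (paths, the cache file name, and parameter values are illustrative; the keyword arguments are the ones visible in this diff):

```
import os
from unpast.utils.io import read_bic_table
from unpast.run_unpast import unpast

out_dir = "results_on_real_data/"                # illustrative
exprs_file, basename = "TCGA.exprs.tsv", "TCGA"  # illustrative
bin_method, pval = "kmeans", 0.01

fname = os.path.join(out_dir, basename + ".biclusters.tsv")  # assumed cache name
if os.path.exists(fname):
    result = read_bic_table(fname)  # reuse a previously saved run
else:
    result = unpast(exprs_file, basename, out_dir=out_dir,
                    save=True, load=True, min_n_samples=5,
                    bin_method=bin_method, pval=pval)
```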
paper/evaluation/evaluation_on_simulated_data.ipynb (12 additions, 13 deletions)
@@ -11,17 +11,16 @@
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import sys,os\n",
"import random\n",
"import copy\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"from utils.eval import calculate_perfromance, generate_exprs\n",
"from utils.method import read_bic_table"
"import sys\n",
"sys.path.insert(0, '../..')\n",
"\n",
"from unpast.utils.io import read_bic_table\n",
"\n",
"from unpast.utils.eval import calculate_perfromance"
]
},
{
@@ -690,7 +689,7 @@
}
},
"source": [
"# Running DESMOND2 with multiple parameter combinations"
"# Running UnPaSt with multiple parameter combinations"
]
},
{
@@ -703,13 +702,13 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 2,
"id": "vietnamese-bracelet",
"metadata": {},
"outputs": [],
"source": [
"from time import time\n",
"from run_unpast import run\n",
"from unpast.run_unpast import unpast\n",
"\n",
"out_dir= \"results_on_simulated_data/\"\n",
"fname_prefix = \"data/simulated_m=4,std=1/\"\n",
@@ -1328,7 +1327,7 @@
" if os.path.exists(fname):\n",
" result = read_bic_table(fname)\n",
" else:\n",
" result = run(exprs_file, basename , out_dir=out_dir,\n",
" result = unpast(exprs_file, basename , out_dir=out_dir,\n",
" save=True, load = True,\n",
" min_n_samples = 5,\n",
" bin_method = bin_method, pval = pval,\n",
@@ -1515,7 +1514,7 @@
" if os.path.exists(fname):\n",
" result = read_bic_table(fname)\n",
" else:\n",
" result = run(exprs_file, basename , out_dir=out_dir,\n",
" result = unpast(exprs_file, basename , out_dir=out_dir,\n",
" save=True, load = True,\n",
" min_n_samples = 5,\n",
" bin_method = bin_method, pval = pval,\n",
@@ -2173,7 +2172,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
"version": "3.11.5"
}
},
"nbformat": 4,
unpast/requirements.txt (1 addition, 3 deletions)
@@ -1,6 +1,4 @@
fisher==0.1.9
jenkspy==0.2.0
matplotlib-venn==0.11.6
numba==0.51.2
numpy==1.22.3
seaborn==0.11.1
@@ -10,4 +8,4 @@ scipy==1.7.1
statsmodels==0.13.2
pandas==1.4.2
python-louvain==0.15
lifelines==0.27.4
kneed==0.8.1
unpast/run_unpast.py (7 additions, 7 deletions)
@@ -3,7 +3,7 @@
import numpy as np
import pandas as pd

def run(exprs_file: pd.DataFrame,
def unpast(exprs_file: pd.DataFrame,
basename: str ='',
out_dir: str ="./",
save: bool =True,
@@ -25,7 +25,7 @@ def run(exprs_file: pd.DataFrame,
#cluster_binary: bool = False,
merge: float = 1,
seed: int = 42,
verbose: bool = True,
verbose: bool = False,
plot_all: bool = False,
e_dist_size: int = 10000,
standradize: bool = True):
@@ -226,10 +226,10 @@ def parse_args():
help="Absolute threshold for z-scores. For example, when set to 3, z-scores greater than 3 are set to 3 and z-scores less than -3 are set to -3. No ceiling if set to 0.")
parser.add_argument('-s','--min_n_samples', metavar=5, default=5, type=int, help = 'The minimal number of samples in a bicluster `min_n_samples` must be >= 2 and not greater than half of the cohort size.')
parser.add_argument('-b','--binarization', metavar="kmeans", default="kmeans", type=str,
choices=["kmeans","ward",'GMM', 'Jenks'], help='binarization method')
choices=["kmeans","ward",'GMM'], help='binarization method')
parser.add_argument('-p','--pval', metavar=0.01, default=0.01, type=float, help = 'binarization p-value')
parser.add_argument('-c','--clustering', metavar="WGCNA", default="WGCNA", type=str,
choices=['Louvain', 'WGCNA','iWGCNA'], help='feature clustering method')
choices=['Louvain', 'WGCNA'], help='feature clustering method')
# Louvain parameters
parser.add_argument('-m','--modularity', default=1/3, metavar="1/3", type=float, help='Modularity corresponding to a cutoff for similarity matrix (Louvain clustering)')
parser.add_argument('-r','--similarity_cutoffs', default=-1, metavar="-1", type=float, help='A cutoff or a list of cuttofs for similarity matrix (Louvain clustering). If set to -1, will be chosen authomatically from [1/5,4/5] using elbow method.')
parser.add_argument('--dch', default=0.995, metavar="0.995", type=float, help='dynamicTreeCut parameter, see WGCNA documentation')
parser.add_argument('--bidirectional', action='store_true', help='Whether to cluster up- and down-regulated features together.')
parser.add_argument('--rpath', default="", metavar="", type=str, help='Full path to Rscript.')
parser.add_argument('--merge', default=1, metavar="1", type=float,help = "Whether to merge biclustres similar in samples with Jaccard index not less then the specified.")
#parser.add_argument('--merge', default=1, metavar="1", type=float,help = "Whether to merge biclustres similar in samples with Jaccard index not less than the specified.")
parser.add_argument('--load_binary', action='store_true', help = "loads binarized features from <basename>.<bin_method>.seed=42.binarized.tsv, statistics from *.binarization_stats.tsv and the background SNR distribution from <basename>.<bin_method>.n=<e_dist_size>.seed=42.background.tsv")
parser.add_argument('--save_binary', action='store_true', help = "saves binarized features to a file named as <basename>.<bin_method>.seed=42.binarized.tsv. When feature clustering method is WGCNA, binarized features will be always saved. Also, files *.binarization_stats.tsv and *.background.tsv with binarization statistincs and background SNR distributions respectively will be created")
parser.add_argument('-v','--verbose', action='store_true')
if args.bidirectional:
directions = ["BOTH"]

biclusters = run(args.exprs, args.basename, out_dir=args.out_dir,
biclusters = unpast(args.exprs, args.basename, out_dir=args.out_dir,
save = args.save_binary, load = args.load_binary,
ceiling = args.ceiling,
bin_method = args.binarization,
Expand All @@ -267,7 +267,7 @@ def parse_args():
modularity = args.modularity, similarity_cutoffs = args.similarity_cutoffs, # for Louvain
ds = args.ds, dch = args.dch, rpath=args.rpath, precluster=True, # for WGCNA
cluster_binary = False,
merge = args.merge,
#merge = args.merge,
seed = args.seed,
#plot_all = args.plot,
verbose = args.verbose)
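Besides the `run` -> `unpast` rename, this file narrows the CLI choices (dropping `Jenks` and `iWGCNA`), retires `--merge`, and flips the `verbose` default to `False`. For programmatic callers the update is a one-line import change; a sketch of the new call, using only keyword arguments that appear in this diff (the input file name is illustrative):

```
from unpast.run_unpast import unpast  # was: from unpast.run_unpast import run

biclusters = unpast("exprs.tsv", "mydata", out_dir="./results/",
                    min_n_samples=5, bin_method="kmeans", pval=0.01,
                    seed=42, verbose=True)  # verbose now defaults to False
```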
unpast/tests/test_run_unpast.py (2 additions, 3 deletions)
@@ -1,6 +1,5 @@
"""Tests for run_unpast, and hence all the core code. Usage: python -m pytest test/test_run_unpast.py"""
import os
import sys
import pandas as pd
import pytest

RESULTS_DIR = "/tmp/unpast/results"
REFERENCE_OUTPUT_DIR = os.path.join(TEST_DIR, "test_reference_output")

from unpast.run_unpast import run
from unpast.run_unpast import unpast


### Helper functions ###


def run_unpast_on_file(filename, basename, *args, **kwargs):
run(
unpast(
os.path.join(TEST_DIR, filename),
out_dir=RESULTS_DIR,
basename=basename,
unpast/tests/utils/test_method.py (1 addition, 2 deletions)
@@ -1,7 +1,6 @@
import pandas as pd
import numpy as np
import warnings
from unpast.utils.method import zscore, prepare_input_matrix, get_trend, calc_SNR
from unpast.utils.method import get_trend, calc_SNR


def test_get_trend_single_point():
unpast/utils/consensus.py (1 addition, 1 deletion)
@@ -23,7 +23,7 @@ def make_consensus_biclusters(
min_n_genes=2,
min_n_samples=5,
min_n_times_detected=2,
modularity_measure="potts",
modularity_measure="newman",
method="kmeans", # sample clustering method
seed=-1,
plot=False,
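A sketch of calling `make_consensus_biclusters` with the new `"newman"` default. The keyword defaults are the ones shown in the hunk above; the positional arguments (an expression matrix plus biclusters pooled from repeated runs) and all file paths are assumptions based on the notebook usage earlier in this PR:

```
import pandas as pd
from unpast.utils.io import read_bic_table
from unpast.utils.consensus import make_consensus_biclusters

# Assumed inputs: a z-scored expression matrix and the biclusters pooled
# from several independent UnPaSt runs (paths and call shape illustrative).
exprs = pd.read_csv("exprs.zscored.tsv", sep="\t", index_col=0)
runs = [read_bic_table(f"results/run{i}.biclusters.tsv") for i in range(5)]
biclusters = pd.concat(runs)

consensus = make_consensus_biclusters(
    exprs, biclusters,               # assumed positional arguments
    min_n_genes=2, min_n_samples=5,  # defaults shown in the diff
    min_n_times_detected=2,          # keep biclusters recurring in >= 2 runs
    modularity_measure="newman",     # the new default introduced here
    method="kmeans", seed=42,
)
```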