From 653cff2adcc00cc3bd08aaebb42aa80707b0038b Mon Sep 17 00:00:00 2001
From: Kayla Meyer
Date: Mon, 15 Jul 2024 17:29:55 +0200
Subject: [PATCH 01/31] add center & scaling feature for burden scores

---
 deeprvat/deeprvat/associate.py                | 57 ++++++++++++++++++-
 .../association_testing/burdens.snakefile     |  2 +
 .../association_testing_pretrained.snakefile  |  1 +
 ...ation_testing_pretrained_regenie.snakefile |  1 +
 .../training_association_testing.snakefile    |  1 +
 ...ning_association_testing_regenie.snakefile |  3 +-
 6 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/deeprvat/deeprvat/associate.py b/deeprvat/deeprvat/associate.py
index 9f61e3b5..8bb4e193 100644
--- a/deeprvat/deeprvat/associate.py
+++ b/deeprvat/deeprvat/associate.py
@@ -26,6 +26,7 @@ from tqdm import tqdm, trange
 import zarr
 import re
+import dask.array as da
 
 import deeprvat.deeprvat.models as deeprvat_models
 from deeprvat.data import DenseGTDataset
@@ -846,6 +847,7 @@ def load_models(
 @click.option("--chunk", type=int)
 @click.option("--dataset-file", type=click.Path(exists=True))
 @click.option("--link-burdens", type=click.Path())
+@click.option("--center-scale-burdens", is_flag=True)
 @click.argument("data-config-file", type=click.Path(exists=True))
 @click.argument("model-config-file", type=click.Path(exists=True))
 @click.argument("checkpoint-files", type=click.Path(exists=True), nargs=-1)
@@ -857,6 +859,7 @@ def compute_burdens(
     chunk: Optional[int],
     dataset_file: Optional[str],
     link_burdens: Optional[str],
+    center_scale_burdens: bool,
     data_config_file: str,
     model_config_file: str,
     checkpoint_files: Tuple[str],
@@ -877,6 +880,8 @@ def compute_burdens(
     :type dataset_file: Optional[str]
     :param link_burdens: Path to burden.zarr file to link.
     :type link_burdens: Optional[str]
+    :param center_scale_burdens: Flag to enable calculation of centering and scaling parameters for the burden scores.
+    :type center_scale_burdens: bool
     :param data_config_file: Path to the data configuration file.
     :type data_config_file: str
     :param model_config_file: Path to the model configuration file.
@@ -932,6 +937,25 @@ def compute_burdens(
         skip_burdens=(link_burdens is not None),
     )
 
+    if center_scale_burdens and not link_burdens:
+        if (chunk == 0) or not chunk:
+            empty_batch = {
+                "rare_variant_annotations": torch.zeros(1,1,34,1),
+                "y": None,
+                "x_phenotypes": None,
+                "sample": None,
+            }
+            this_mode, _, _, _ = get_burden(
+                empty_batch, agg_models, device=device, skip_burdens=False, compute_mode=True
+            )
+            this_mode = this_mode.flatten()
+            center_scale_df = pd.DataFrame(columns=["max","mode"])
+            for r in range(len(agg_models)):
+                center_scale_df.loc[r,"max"] = None
+                center_scale_df.loc[r,"mode"] = this_mode[r]
+            pprint(f"Calculated Zero-Effect Burden Score :\n {this_mode}")
+            center_scale_df.to_csv(f"{Path(out_dir)}/computed_burdens_stats.csv", index=False)
+
     logger.info("Saving computed burdens, corresponding genes, and targets")
     np.save(Path(out_dir) / "genes.npy", genes)
     if link_burdens is not None:
@@ -958,7 +982,6 @@ def regress_on_gene_scoretest(
     :rtype: Tuple[List[str], List[float], List[float]]
     """
     burdens = burdens.reshape(burdens.shape[0], -1)
-    assert np.all(burdens != 0)  # because DeepRVAT burdens are corrently all non-zero
 
     logger.info(f"Burdens shape: {burdens.shape}")
     if np.all(np.abs(burdens) < 1e-6):
@@ -1300,6 +1323,7 @@ def combine_regression_results(
 
 @cli.command()
+@click.option("--center-scale-burdens", is_flag=True)
 @click.option("--n-chunks", type=int)
 @click.option("--chunk", type=int)
 @click.option("-r", "--repeats", multiple=True, type=int)
@@ -1307,6 +1331,7 @@ def combine_regression_results(
 @click.argument("burden-file", type=click.Path(exists=True))
 @click.argument("burden-out-file", type=click.Path())
 def average_burdens(
+    center_scale_burdens: bool,
     repeats: Tuple,
     burden_file: str,
     burden_out_file: str,
@@ -1319,6 +1344,19 @@ def average_burdens(
     logger.info(f"Reading burdens to aggregate from {burden_file}")
     burdens = zarr.open(burden_file)
     n_total_samples = burdens.shape[0]
+
+    if center_scale_burdens:
+        center_scale_params_file = Path(os.path.split(burden_out_file)[0]) / "computed_burdens_stats.csv"
+        center_scale_df = pd.read_csv(center_scale_params_file)
+        if (chunk == 0) or not chunk:
+            xd = da.from_zarr(burden_file,chunks=(1000,1000,1))
+            for r in range(len(repeats)):
+                repeat_max = xd[:,:,r].max().compute() # compute max across each repeat
+                center_scale_df.loc[r,"max"] = repeat_max
+
+            pprint(f"Center and scaling values extracted from computed burdens :\n{center_scale_df}")
+            center_scale_df.to_csv(center_scale_params_file, index=False)
+
     if chunk is not None:
         if n_chunks is None:
             raise ValueError("n_chunks must be specified if chunk is not None")
@@ -1366,6 +1404,23 @@ def average_burdens(
             end_idx = min(start_idx + batch_size, chunk_end)
             print(start_idx, end_idx)
             this_burdens = np.take(burdens[start_idx:end_idx, :, :], repeats, axis=2)
+
+            #Double-check zarr creation - no computed burdens should equal zero
+            assert np.all(this_burdens != 0)
+
+            if center_scale_burdens:
+                print("Centering and Scaling Burdens before aggregating")
+                for r in range(len(repeats)):
+                    zero_effect_val = center_scale_df.loc[r,"mode"]
+                    repeat_max = center_scale_df.loc[r,"max"]
+                    # Subtract off zero effect burden value (mode)
+                    this_burdens[:,:,r] -= zero_effect_val
+                    adjusted_max = repeat_max - zero_effect_val
+                    # Scale only values >= mode. Scale between 0 and 1.
+                    pos_mask = this_burdens[:,:,r] > 0
+                    # (this_burdens[:,:,r][pos_mask] - this_burdens[:,:,r][pos_mask].min()) / (adjusted_max - this_burdens[:,:,r][pos_mask].min())
+                    this_burdens[:,:,r][pos_mask] /= adjusted_max
+
             this_burdens = AGG_FCT[agg_fct](this_burdens, axis=2)
 
             burdens_new[start_idx:end_idx, :, 0] = this_burdens
diff --git a/pipelines/association_testing/burdens.snakefile b/pipelines/association_testing/burdens.snakefile
index 23d5b34a..e00ba6fa 100644
--- a/pipelines/association_testing/burdens.snakefile
+++ b/pipelines/association_testing/burdens.snakefile
@@ -19,6 +19,7 @@ rule average_burdens:
     shell:
         ' && '.join([
            ('deeprvat_associate average-burdens '
+            + center_scale_burdens +
             '--n-chunks '+ str(n_avg_chunks) + ' '
             '--chunk {wildcards.chunk} '
             '{params.repeats} '
@@ -83,6 +84,7 @@ rule compute_burdens:
         ' && '.join([
            ('deeprvat_associate compute-burdens '
             + debug +
+            + center_scale_burdens +
             ' --n-chunks '+ str(n_burden_chunks) + ' '
             '--chunk {wildcards.chunk} '
             '--dataset-file {input.dataset} '
diff --git a/pipelines/association_testing_pretrained.snakefile b/pipelines/association_testing_pretrained.snakefile
index 76ee4670..13a64afc 100644
--- a/pipelines/association_testing_pretrained.snakefile
+++ b/pipelines/association_testing_pretrained.snakefile
@@ -18,6 +18,7 @@ training_phenotypes = []
 n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2
 n_regression_chunks = config.get('n_regression_chunks', 40) if not debug_flag else 2
 n_avg_chunks = config.get('n_avg_chunks', 1)
+center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', False) else ''
 n_trials = config['hyperparameter_optimization']['n_trials']
 n_bags = config['training']['n_bags'] if not debug_flag else 3
 n_repeats = config['n_repeats']
diff --git a/pipelines/association_testing_pretrained_regenie.snakefile b/pipelines/association_testing_pretrained_regenie.snakefile
index 97addd12..1514eaf5 100644
--- a/pipelines/association_testing_pretrained_regenie.snakefile
+++ b/pipelines/association_testing_pretrained_regenie.snakefile
@@ -18,6 +18,7 @@ training_phenotypes = []
 n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2
 n_regression_chunks = config.get('n_regression_chunks', 40) if not debug_flag else 2
 n_avg_chunks = config.get('n_avg_chunks', 1)
+center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', False) else ''
 n_bags = config['training']['n_bags'] if not debug_flag else 3
 n_repeats = config['n_repeats']
 debug = '--debug ' if debug_flag else ''
diff --git a/pipelines/training_association_testing.snakefile b/pipelines/training_association_testing.snakefile
index 3acd75c1..6f17795d 100644
--- a/pipelines/training_association_testing.snakefile
+++ b/pipelines/training_association_testing.snakefile
@@ -18,6 +18,7 @@ training_phenotypes = config["training"].get("phenotypes", phenotypes)
 n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2
 n_regression_chunks = config.get('n_regression_chunks', 40) if not debug_flag else 2
 n_avg_chunks = config.get('n_avg_chunks', 1)
+center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', False) else ''
 n_trials = config['hyperparameter_optimization']['n_trials']
 n_bags = config['training']['n_bags'] if not debug_flag else 3
 n_repeats = config['n_repeats']
diff --git a/pipelines/training_association_testing_regenie.snakefile b/pipelines/training_association_testing_regenie.snakefile
index cd081b71..863f2e2d 100644
--- a/pipelines/training_association_testing_regenie.snakefile
+++ b/pipelines/training_association_testing_regenie.snakefile
@@ -18,6 +18,7 @@ training_phenotypes = config["training"].get("phenotypes", phenotypes)
 n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2
 n_regression_chunks = config.get('n_regression_chunks', 40) if not debug_flag else 2
 n_avg_chunks = config.get('n_avg_chunks', 1)
+center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', False) else ''
 n_trials = config['hyperparameter_optimization']['n_trials']
 n_bags = config['training']['n_bags'] if not debug_flag else 3
 n_repeats = config['n_repeats']
@@ -26,7 +27,7 @@ do_scoretest = '--do-scoretest ' if config.get('do_scoretest', False) else ''
 tensor_compression_level = config['training'].get('tensor_compression_level', 1)
 model_path = Path("models")
 n_parallel_training_jobs = config["training"].get("n_parallel_jobs", 1)
-cv_exp = False
+cv_exp = config.get('cv_exp', False)
 
 wildcard_constraints:
     repeat="\d+",

From 71c3488124e17cb75787ee6732df031eb49a09d4 Mon Sep 17 00:00:00 2001
From: Kayla Meyer
Date: Wed, 17 Jul 2024 09:48:53 +0200
Subject: [PATCH 02/31] fixup snakemake rule spacing

---
 pipelines/association_testing/burdens.snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pipelines/association_testing/burdens.snakefile b/pipelines/association_testing/burdens.snakefile
index e00ba6fa..4e839834 100644
--- a/pipelines/association_testing/burdens.snakefile
+++ b/pipelines/association_testing/burdens.snakefile
@@ -18,7 +18,7 @@ rule average_burdens:
     priority: 10,
     shell:
         ' && '.join([
-                ('deeprvat_associate average-burdens '
+            ('deeprvat_associate average-burdens '
             + center_scale_burdens +
             '--n-chunks '+ str(n_avg_chunks) + ' '
             '--chunk {wildcards.chunk} '
@@ -84,10 +84,10 @@ rule compute_burdens:
         ' && '.join([
            ('deeprvat_associate compute-burdens '
             + debug +
-            + center_scale_burdens +
             ' --n-chunks '+ str(n_burden_chunks) + ' '
             '--chunk {wildcards.chunk} '
             '--dataset-file {input.dataset} '
+            + center_scale_burdens +
             '{input.data_config} '
             '{input.model_config} '
             '{input.checkpoints} '

From 950468cf2d6a1b1afeadd446309d85aafd804f33 Mon Sep 17 00:00:00 2001
From: Kayla Meyer
Date: Wed, 17 Jul 2024 10:32:36 +0200
Subject: [PATCH 03/31] Add in center_scale_burdens option to config
 generation

---
 deeprvat/deeprvat/config.py                                 | 1 +
 example/config/deeprvat_input_config.yaml                   | 1 +
 example/config/deeprvat_input_pretrained_models_config.yaml | 1 +
 3 files changed, 3 insertions(+)

diff --git a/deeprvat/deeprvat/config.py b/deeprvat/deeprvat/config.py
index 4c7c818e..e64ea743 100644
--- a/deeprvat/deeprvat/config.py
+++ b/deeprvat/deeprvat/config.py
@@ -309,6 +309,7 @@ def create_main_config(
         "correction_method"
     ]
     full_config["evaluation"]["alpha"] = input_config["evaluation"]["alpha"]
+    full_config["center_scale_burdens"] = input_config["evaluation"]["center_scale_burdens"]
 
     # DeepRVAT model
     full_config["n_repeats"] = input_config["n_repeats"]
diff --git a/example/config/deeprvat_input_config.yaml b/example/config/deeprvat_input_config.yaml
index f14b3c72..58297c51 100644
--- a/example/config/deeprvat_input_config.yaml
+++ b/example/config/deeprvat_input_config.yaml
@@ -151,6 +151,7 @@ y_transformation: quantile_transform
 evaluation:
   correction_method: Bonferroni
   alpha: 0.05
+  center_scale_burdens: False
 
 # Subsetting samples for training or association testing
 #sample_files:
diff --git a/example/config/deeprvat_input_pretrained_models_config.yaml b/example/config/deeprvat_input_pretrained_models_config.yaml
index 869f3c00..5fc81137 100644
--- a/example/config/deeprvat_input_pretrained_models_config.yaml
+++ b/example/config/deeprvat_input_pretrained_models_config.yaml
@@ -72,6 +72,7 @@ y_transformation: quantile_transform
 evaluation:
   correction_method: Bonferroni
   alpha: 0.05
+  center_scale_burdens: False
 
 # Subsetting samples for association testing
 #sample_files:

From 9626e01044e9278905e59c55b3500cd16546e90b Mon Sep 17 00:00:00 2001
From: PMBio
Date: Wed, 17 Jul 2024 08:35:00 +0000
Subject: [PATCH 04/31] fixup! Format Python code with psf/black pull_request

---
 deeprvat/deeprvat/associate.py | 58 ++++++++++++++++++++--------------
 deeprvat/deeprvat/config.py    |  4 ++-
 2 files changed, 38 insertions(+), 24 deletions(-)

diff --git a/deeprvat/deeprvat/associate.py b/deeprvat/deeprvat/associate.py
index 8bb4e193..23ac2b3e 100644
--- a/deeprvat/deeprvat/associate.py
+++ b/deeprvat/deeprvat/associate.py
@@ -940,21 +940,27 @@ def compute_burdens(
     if center_scale_burdens and not link_burdens:
         if (chunk == 0) or not chunk:
             empty_batch = {
-                "rare_variant_annotations": torch.zeros(1,1,34,1),
+                "rare_variant_annotations": torch.zeros(1, 1, 34, 1),
                 "y": None,
                 "x_phenotypes": None,
                 "sample": None,
             }
             this_mode, _, _, _ = get_burden(
-                empty_batch, agg_models, device=device, skip_burdens=False, compute_mode=True
-            )
+                empty_batch,
+                agg_models,
+                device=device,
+                skip_burdens=False,
+                compute_mode=True,
+            )
             this_mode = this_mode.flatten()
-            center_scale_df = pd.DataFrame(columns=["max","mode"])
+            center_scale_df = pd.DataFrame(columns=["max", "mode"])
             for r in range(len(agg_models)):
-                center_scale_df.loc[r,"max"] = None
-                center_scale_df.loc[r,"mode"] = this_mode[r]
+                center_scale_df.loc[r, "max"] = None
+                center_scale_df.loc[r, "mode"] = this_mode[r]
             pprint(f"Calculated Zero-Effect Burden Score :\n {this_mode}")
-            center_scale_df.to_csv(f"{Path(out_dir)}/computed_burdens_stats.csv", index=False)
+            center_scale_df.to_csv(
+                f"{Path(out_dir)}/computed_burdens_stats.csv", index=False
+            )
 
     logger.info("Saving computed burdens, corresponding genes, and targets")
     np.save(Path(out_dir) / "genes.npy", genes)
@@ -1344,19 +1350,25 @@ def average_burdens(
     logger.info(f"Reading burdens to aggregate from {burden_file}")
     burdens = zarr.open(burden_file)
     n_total_samples = burdens.shape[0]
-
+
     if center_scale_burdens:
-        center_scale_params_file = Path(os.path.split(burden_out_file)[0]) / "computed_burdens_stats.csv"
+        center_scale_params_file = (
+            Path(os.path.split(burden_out_file)[0]) / "computed_burdens_stats.csv"
+        )
         center_scale_df = pd.read_csv(center_scale_params_file)
         if (chunk == 0) or not chunk:
-            xd = da.from_zarr(burden_file,chunks=(1000,1000,1))
+            xd = da.from_zarr(burden_file, chunks=(1000, 1000, 1))
             for r in range(len(repeats)):
-                repeat_max = xd[:,:,r].max().compute() # compute max across each repeat
+                repeat_max = (
+                    xd[:, :, r].max().compute()
+                )  # compute max across each repeat
                 center_scale_df.loc[r, "max"] = repeat_max
-
-            pprint(f"Center and scaling values extracted from computed burdens :\n{center_scale_df}")
+
+            pprint(
+                f"Center and scaling values extracted from computed burdens :\n{center_scale_df}"
+            )
             center_scale_df.to_csv(center_scale_params_file, index=False)
-
+
     if chunk is not None:
         if n_chunks is None:
             raise ValueError("n_chunks must be specified if chunk is not None")
@@ -1404,22 +1416,22 @@
             end_idx = min(start_idx + batch_size, chunk_end)
             print(start_idx, end_idx)
             this_burdens = np.take(burdens[start_idx:end_idx, :, :], repeats, axis=2)
-
-            #Double-check zarr creation - no computed burdens should equal zero
+
+            # Double-check zarr creation - no computed burdens should equal zero
             assert np.all(this_burdens != 0)
 
             if center_scale_burdens:
                 print("Centering and Scaling Burdens before aggregating")
                 for r in range(len(repeats)):
-                    zero_effect_val = center_scale_df.loc[r,"mode"]
-                    repeat_max = center_scale_df.loc[r,"max"]
+                    zero_effect_val = center_scale_df.loc[r, "mode"]
+                    repeat_max = center_scale_df.loc[r, "max"]
                     # Subtract off zero effect burden value (mode)
-                    this_burdens[:,:,r] -= zero_effect_val
+                    this_burdens[:, :, r] -= zero_effect_val
                     adjusted_max = repeat_max - zero_effect_val
                     # Scale only values >= mode. Scale between 0 and 1.
-                    pos_mask = this_burdens[:,:,r] > 0
+                    pos_mask = this_burdens[:, :, r] > 0
                     # (this_burdens[:,:,r][pos_mask] - this_burdens[:,:,r][pos_mask].min()) / (adjusted_max - this_burdens[:,:,r][pos_mask].min())
-                    this_burdens[:,:,r][pos_mask] /= adjusted_max
+                    this_burdens[:, :, r][pos_mask] /= adjusted_max
 
             this_burdens = AGG_FCT[agg_fct](this_burdens, axis=2)
 
diff --git a/deeprvat/deeprvat/config.py b/deeprvat/deeprvat/config.py
index e64ea743..5e840496 100644
--- a/deeprvat/deeprvat/config.py
+++ b/deeprvat/deeprvat/config.py
@@ -309,7 +309,9 @@ def create_main_config(
         "correction_method"
     ]
     full_config["evaluation"]["alpha"] = input_config["evaluation"]["alpha"]
-    full_config["center_scale_burdens"] = input_config["evaluation"]["center_scale_burdens"]
+    full_config["center_scale_burdens"] = input_config["evaluation"][
+        "center_scale_burdens"
+    ]
 
     # DeepRVAT model
     full_config["n_repeats"] = input_config["n_repeats"]

From 542e19067eb9562e14a45a0cddbb83e9953da51d Mon Sep 17 00:00:00 2001
From: Kayla Meyer
Date: Thu, 18 Jul 2024 11:07:19 +0200
Subject: [PATCH 05/31] remove unneeded function argument

---
 deeprvat/deeprvat/associate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deeprvat/deeprvat/associate.py b/deeprvat/deeprvat/associate.py
index 8bb4e193..292e89f1 100644
--- a/deeprvat/deeprvat/associate.py
+++ b/deeprvat/deeprvat/associate.py
@@ -946,7 +946,7 @@ def compute_burdens(
                 "sample": None,
             }
             this_mode, _, _, _ = get_burden(
-                empty_batch, agg_models, device=device, skip_burdens=False, compute_mode=True
+                empty_batch, agg_models, device=device, skip_burdens=False
             )
             this_mode = this_mode.flatten()

From 8afef137ae4f6e425a6749c5ac521948f30e0822 Mon Sep 17 00:00:00 2001
From: PMBio
Date: Thu, 18 Jul 2024 09:09:48 +0000
Subject: [PATCH 06/31] fixup! Format Python code with psf/black pull_request

---
 deeprvat/deeprvat/associate.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/deeprvat/deeprvat/associate.py b/deeprvat/deeprvat/associate.py
index e547b1fc..44553e84 100644
--- a/deeprvat/deeprvat/associate.py
+++ b/deeprvat/deeprvat/associate.py
@@ -946,11 +946,11 @@ def compute_burdens(
                 "sample": None,
             }
             this_mode, _, _, _ = get_burden(
-                    empty_batch,
-                    agg_models,
-                    device=device,
-                    skip_burdens=False,
-            )
+                empty_batch,
+                agg_models,
+                device=device,
+                skip_burdens=False,
+            )
             this_mode = this_mode.flatten()

From 583f54bca5884d033cc08f994b7bc9fc0b892d51 Mon Sep 17 00:00:00 2001
From: Kayla Meyer
Date: Mon, 22 Jul 2024 12:13:00 +0200
Subject: [PATCH 07/31] reduce max computation and set to 1.0

---
 deeprvat/deeprvat/associate.py | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/deeprvat/deeprvat/associate.py b/deeprvat/deeprvat/associate.py
index e547b1fc..2248d9c6 100644
--- a/deeprvat/deeprvat/associate.py
+++ b/deeprvat/deeprvat/associate.py
@@ -954,7 +954,7 @@ def compute_burdens(
             this_mode = this_mode.flatten()
             center_scale_df = pd.DataFrame(columns=["max", "mode"])
             for r in range(len(agg_models)):
-                center_scale_df.loc[r, "max"] = None
+                center_scale_df.loc[r, "max"] = 1.0 #Set to max DeepRVAT sigmoid output
                 center_scale_df.loc[r, "mode"] = this_mode[r]
             pprint(f"Calculated Zero-Effect Burden Score :\n {this_mode}")
             center_scale_df.to_csv(
@@ -1355,18 +1355,6 @@ def average_burdens(
             Path(os.path.split(burden_out_file)[0]) / "computed_burdens_stats.csv"
         )
         center_scale_df = pd.read_csv(center_scale_params_file)
-        if (chunk == 0) or not chunk:
-            xd = da.from_zarr(burden_file, chunks=(1000, 1000, 1))
-            for r in range(len(repeats)):
-                repeat_max = (
-                    xd[:, :, r].max().compute()
-                )  # compute max across each repeat
-                center_scale_df.loc[r, "max"] = repeat_max
-
-            pprint(
-                f"Center and scaling values extracted from computed burdens :\n{center_scale_df}"
-            )
-            center_scale_df.to_csv(center_scale_params_file, index=False)
 
     if chunk is not None:
         if n_chunks is None:
             raise ValueError("n_chunks must be specified if chunk is not None")
@@ -1386,7 +1374,7 @@ def average_burdens(
         f"Computing result for chunk {chunk} out of {n_chunks} in range {chunk_start}, {chunk_end}"
     )
 
-    batch_size = 100
+    batch_size = 1000
     logger.info(f"Batch size: {batch_size}")
     n_batches = n_samples // batch_size + (n_samples % batch_size != 0)

From 7e85215fe72ef8b27b3f3f3d4e620b7f9a296cd4 Mon Sep 17 00:00:00 2001
From: PMBio
Date: Mon, 22 Jul 2024 10:13:35 +0000
Subject: [PATCH 08/31] fixup! Format Python code with psf/black pull_request

---
 deeprvat/deeprvat/associate.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/deeprvat/deeprvat/associate.py b/deeprvat/deeprvat/associate.py
index 6b175744..1740b4a3 100644
--- a/deeprvat/deeprvat/associate.py
+++ b/deeprvat/deeprvat/associate.py
@@ -954,7 +954,9 @@ def compute_burdens(
             this_mode = this_mode.flatten()
             center_scale_df = pd.DataFrame(columns=["max", "mode"])
             for r in range(len(agg_models)):
-                center_scale_df.loc[r, "max"] = 1.0 #Set to max DeepRVAT sigmoid output
+                center_scale_df.loc[r, "max"] = (
+                    1.0  # Set to max DeepRVAT sigmoid output
+                )
                 center_scale_df.loc[r, "mode"] = this_mode[r]
             pprint(f"Calculated Zero-Effect Burden Score :\n {this_mode}")
             center_scale_df.to_csv(

From 18b5ba34b057e32e8c6ffc61813f39acc4d5601d Mon Sep 17 00:00:00 2001
From: Kayla Meyer
Date: Tue, 20 Aug 2024 09:00:12 +0200
Subject: [PATCH 09/31] move max computation to compute_burdens; update
 average_burdens to scale all values -1 to 1

---
 deeprvat/deeprvat/associate.py | 34 +++++++++++++++++++++++++---------
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/deeprvat/deeprvat/associate.py b/deeprvat/deeprvat/associate.py
index 1740b4a3..3540ab26 100644
--- a/deeprvat/deeprvat/associate.py
+++ b/deeprvat/deeprvat/associate.py
@@ -358,6 +358,17 @@ def compute_burdens_(
         if not skip_burdens:
             burdens[chunk_start:chunk_end] = chunk_burden
 
+            #Calculate Max for this chunk and store for later
+            max_df = pd.DataFrame(columns=["max"])
+            for r in range(len(agg_models)):
+                #print(chunk_burden.shape) #samples x genes x repeats
+                chunk_max = np.max(chunk_burden[:,:,r])
+                max_df.loc[r, "max"] = chunk_max
+            print(f"Saving Burden Max Scores")
+            max_df.to_csv(
+                f"{Path(cache_dir)}/chunk{chunk}_max.csv", index=False
+            )
+
         y[chunk_start:chunk_end] = chunk_y
         x[chunk_start:chunk_end] = chunk_x
         sample_ids[chunk_start:chunk_end] = chunk_sampleid
@@ -939,6 +950,7 @@ def compute_burdens(
     if center_scale_burdens and not link_burdens:
         if (chunk == 0) or not chunk:
+            # Calculate Mode
             empty_batch = {
                 "rare_variant_annotations": torch.zeros(1, 1, 34, 1),
                 "y": None,
                 "x_phenotypes": None,
                 "sample": None,
             }
             this_mode, _, _, _ = get_burden(
@@ -952,11 +964,8 @@ def compute_burdens(
                 skip_burdens=False,
             )
             this_mode = this_mode.flatten()
-            center_scale_df = pd.DataFrame(columns=["max", "mode"])
+            center_scale_df = pd.DataFrame(columns=["mode"])
             for r in range(len(agg_models)):
-                center_scale_df.loc[r, "max"] = (
-                    1.0  # Set to max DeepRVAT sigmoid output
-                )
                 center_scale_df.loc[r, "mode"] = this_mode[r]
             pprint(f"Calculated Zero-Effect Burden Score :\n {this_mode}")
             center_scale_df.to_csv(
@@ -1358,6 +1367,14 @@ def average_burdens(
         )
         center_scale_df = pd.read_csv(center_scale_params_file)
 
+        max_dfs = pd.DataFrame()
+        max_files_path = Path(os.path.split(burden_out_file)[0]).glob("chunk*_max.csv")
+        for i, filename in enumerate(max_files_path):
+            max_dfs[f"Max_Chunk{i}"] = pd.read_csv(filename)["max"]
+        #compute max across all chunks
+        max_dfs["max"] = max_dfs.max(axis=1)
+
+
     if chunk is not None:
         if n_chunks is None:
             raise ValueError("n_chunks must be specified if chunk is not None")
@@ -1413,14 +1430,13 @@ def average_burdens(
                 print("Centering and Scaling Burdens before aggregating")
                 for r in range(len(repeats)):
                     zero_effect_val = center_scale_df.loc[r, "mode"]
-                    repeat_max = center_scale_df.loc[r, "max"]
+                    repeat_max = max_dfs.loc[r, "max"]
                     # Subtract off zero effect burden value (mode)
                     this_burdens[:, :, r] -= zero_effect_val
                     adjusted_max = repeat_max - zero_effect_val
-                    # Scale only values >= mode. Scale between 0 and 1.
-                    pos_mask = this_burdens[:, :, r] > 0
-                    # (this_burdens[:,:,r][pos_mask] - this_burdens[:,:,r][pos_mask].min()) / (adjusted_max - this_burdens[:,:,r][pos_mask].min())
-                    this_burdens[:, :, r][pos_mask] /= adjusted_max
+                    min_val = this_burdens[:,:,r].min()
+                    # Scale values between -1 and 1
+                    this_burdens[:, :, r] = 2*((this_burdens[:,:,r] - min_val) / (adjusted_max - min_val)) - 1
 
             this_burdens = AGG_FCT[agg_fct](this_burdens, axis=2)

From 9a0934e0b9e0ca76b2784374013bd8e9c0a83e36 Mon Sep 17 00:00:00 2001
From: PMBio
Date: Tue, 20 Aug 2024 07:00:46 +0000
Subject: [PATCH 10/31] fixup! Format Python code with psf/black pull_request

---
 deeprvat/deeprvat/associate.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/deeprvat/deeprvat/associate.py b/deeprvat/deeprvat/associate.py
index 3540ab26..00fd9126 100644
--- a/deeprvat/deeprvat/associate.py
+++ b/deeprvat/deeprvat/associate.py
@@ -358,16 +358,14 @@ def compute_burdens_(
         if not skip_burdens:
             burdens[chunk_start:chunk_end] = chunk_burden
 
-            #Calculate Max for this chunk and store for later
+            # Calculate Max for this chunk and store for later
             max_df = pd.DataFrame(columns=["max"])
             for r in range(len(agg_models)):
-                #print(chunk_burden.shape) #samples x genes x repeats
-                chunk_max = np.max(chunk_burden[:,:,r])
+                # print(chunk_burden.shape) #samples x genes x repeats
+                chunk_max = np.max(chunk_burden[:, :, r])
                 max_df.loc[r, "max"] = chunk_max
             print(f"Saving Burden Max Scores")
-            max_df.to_csv(
-                f"{Path(cache_dir)}/chunk{chunk}_max.csv", index=False
-            )
+            max_df.to_csv(f"{Path(cache_dir)}/chunk{chunk}_max.csv", index=False)
 
         y[chunk_start:chunk_end] = chunk_y
         x[chunk_start:chunk_end] = chunk_x
@@ -1371,10 +1369,9 @@ def average_burdens(
         max_files_path = Path(os.path.split(burden_out_file)[0]).glob("chunk*_max.csv")
         for i, filename in enumerate(max_files_path):
             max_dfs[f"Max_Chunk{i}"] = pd.read_csv(filename)["max"]
-        #compute max across all chunks
+        # compute max across all chunks
         max_dfs["max"] = max_dfs.max(axis=1)
-
 
     if chunk is not None:
         if n_chunks is None:
             raise ValueError("n_chunks must be specified if chunk is not None")
@@ -1434,9 +1431,12 @@ def average_burdens(
                     # Subtract off zero effect burden value (mode)
                     this_burdens[:, :, r] -= zero_effect_val
                     adjusted_max = repeat_max - zero_effect_val
-                    min_val = this_burdens[:,:,r].min()
+                    min_val = this_burdens[:, :, r].min()
                     # Scale values between -1 and 1
-                    this_burdens[:, :, r] = 2*((this_burdens[:,:,r] - min_val) / (adjusted_max - min_val)) - 1
+                    this_burdens[:, :, r] = (
+                        2 * ((this_burdens[:, :, r] - min_val) / (adjusted_max - min_val))
+                        - 1
+                    )
 
             this_burdens = AGG_FCT[agg_fct](this_burdens, axis=2)

From 4ddbb5afcd93cbe71bdc85fee9fac3d3e8de428c Mon Sep 17 00:00:00 2001
From: Kayla Meyer
Date: Wed, 21 Aug 2024 13:11:01 +0200
Subject: [PATCH 11/31] update default to perform score calibration of burdens

---
 pipelines/association_testing_pretrained.snakefile         | 2 +-
 pipelines/association_testing_pretrained_regenie.snakefile | 2 +-
 pipelines/training_association_testing.snakefile           | 2 +-
 pipelines/training_association_testing_regenie.snakefile   | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pipelines/association_testing_pretrained.snakefile b/pipelines/association_testing_pretrained.snakefile
index 13a64afc..0f6b55ff 100644
--- a/pipelines/association_testing_pretrained.snakefile
+++ b/pipelines/association_testing_pretrained.snakefile
@@ -18,7 +18,7 @@ training_phenotypes = []
 n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2
 n_regression_chunks = config.get('n_regression_chunks', 40) if not debug_flag else 2
 n_avg_chunks = config.get('n_avg_chunks', 1)
-center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', False) else ''
+center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', True) else ''
 n_trials = config['hyperparameter_optimization']['n_trials']
 n_bags = config['training']['n_bags'] if not debug_flag else 3
 n_repeats = config['n_repeats']
diff --git a/pipelines/association_testing_pretrained_regenie.snakefile b/pipelines/association_testing_pretrained_regenie.snakefile
index 1514eaf5..731e9bbc 100644
--- a/pipelines/association_testing_pretrained_regenie.snakefile
+++ b/pipelines/association_testing_pretrained_regenie.snakefile
@@ -18,7 +18,7 @@ training_phenotypes = []
 n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2
 n_regression_chunks = config.get('n_regression_chunks', 40) if not debug_flag else 2
 n_avg_chunks = config.get('n_avg_chunks', 1)
-center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', False) else ''
+center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', True) else ''
 n_bags = config['training']['n_bags'] if not debug_flag else 3
 n_repeats = config['n_repeats']
 debug = '--debug ' if debug_flag else ''
diff --git a/pipelines/training_association_testing.snakefile b/pipelines/training_association_testing.snakefile
index 6f17795d..a0337ec4 100644
--- a/pipelines/training_association_testing.snakefile
+++ b/pipelines/training_association_testing.snakefile
@@ -18,7 +18,7 @@ training_phenotypes = config["training"].get("phenotypes", phenotypes)
 n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2
 n_regression_chunks = config.get('n_regression_chunks', 40) if not debug_flag else 2
 n_avg_chunks = config.get('n_avg_chunks', 1)
-center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', False) else ''
+center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', True) else ''
 n_trials = config['hyperparameter_optimization']['n_trials']
 n_bags = config['training']['n_bags'] if not debug_flag else 3
 n_repeats = config['n_repeats']
diff --git a/pipelines/training_association_testing_regenie.snakefile b/pipelines/training_association_testing_regenie.snakefile
index 863f2e2d..49b1cc4e 100644
--- a/pipelines/training_association_testing_regenie.snakefile
+++ b/pipelines/training_association_testing_regenie.snakefile
@@ -18,7 +18,7 @@ training_phenotypes = config["training"].get("phenotypes", phenotypes)
 n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2
 n_regression_chunks = config.get('n_regression_chunks', 40) if not debug_flag else 2
 n_avg_chunks = config.get('n_avg_chunks', 1)
-center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', False) else ''
+center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', True) else ''
 n_trials = config['hyperparameter_optimization']['n_trials']
 n_bags = config['training']['n_bags'] if not debug_flag else 3
 n_repeats = config['n_repeats']

From 9c8901fb0c9e8e47532f998068dac00f796bdae8 Mon Sep 17 00:00:00 2001
From: Kayla Meyer
Date: Mon, 7 Oct 2024 11:58:38 +0200
Subject: [PATCH 12/31] remove lsf dir

---
 lsf/lsf.yaml | 131 ---------------------------------------------------
 1 file changed, 131 deletions(-)
 delete mode 100644 lsf/lsf.yaml
diff --git a/lsf/lsf.yaml b/lsf/lsf.yaml
deleted file mode 100644
index b230b39b..00000000
--- a/lsf/lsf.yaml
+++ /dev/null
@@ -1,131 +0,0 @@
-__default__:
-  - "-q medium"
-  - "-R \"select[(hname != 'odcf-cn11u15' && hname != 'odcf-cn11u17' && hname != 'odcf-cn33u24s03' && hname != 'odcf-cn23u25' && hname != 'odcf-cn11u13' && hname != 'odcf-cn31u13' && hname != 'odcf-cn31u21' && hname != 'odcf-cn23u23')]\""
-
-
-# For association testing pipelines
-
-config:
-  - "-q short"
-
-training_dataset:
-  - "-q long"
-
-delete_burden_cache:
-  - "-q short"
-
-choose_training_genes:
-  - "-q short"
-
-best_cv_run:
-  - "-q short"
-link_avg_burdens:
-  - "-q short"
-best_bagging_run:
-  - "-q short"
-
-train:
-  - "-q gpu"
-  - "-gpu num=1:j_exclusive=yes:gmem=10.7G"
-  - "-R \"select[(hname != 'e230-dgx2-1' && hname != 'e230-dgx2-2' && hname != 'e230-dgxa100-1' && hname != 'e230-dgxa100-2' && hname != 'e230-dgxa100-3' && hname != 'e230-dgxa100-4' && hname != 'e071-gpu06')]\""
-  # - "-R tensorcore"
-  # - "-L /bin/bash"
-
-compute_burdens:
-  - "-q gpu"
-  - "-gpu num=1:j_exclusive=yes:gmem=15.7G"
-  - "-R \"select[(hname != 'e230-dgx2-1' && hname != 'e230-dgx2-2' && hname != 'e230-dgxa100-1' && hname != 'e230-dgxa100-2' && hname != 'e230-dgxa100-3' && hname != 'e230-dgxa100-4' && hname != 'e071-gpu06')]\""
-  - "-W 180"
-  # - "-R tensorcore"
-  # - "-L /bin/bash"
-
-link_burdens:
-  - "-q medium"
-
-compute_plof_burdens:
-  - "-q medium"
-
-regress:
-  - "-q long"
-
-combine_regression_chunks:
-  - "-q short"
-
-regenie_step1_splitl0:
-  - "-q short"
-
-regenie_step1_runl0:
-  - "-q medium"
-
-regenie_step1_runl1:
-  - "-q medium"
-
-regenie_step1:
-  - "-q verylong"
-
-regenie_step2:
-  - "-q verylong"
-
-
-# For CV (phenotype prediction) pipeline
-
-deeprvat_config:
-  - "-q short"
-
-deeprvat_plof_config:
-  - "-q short"
-
-deeprvat_training_dataset:
-  - "-q long"
-
-deeprvat_delete_burden_cache:
-  - "-q short"
-
-deeprvat_best_cv_run:
-  - "-q short"
-
-deeprvat_train:
-  - "-q gpu"
-  - "-gpu num=1:j_exclusive=yes:gmem=10.7G"
-  - "-R \"select[(hname != 'e230-dgx2-1' && hname != 'e230-dgx2-2' && hname != 'e230-dgxa100-1' && hname != 'e230-dgxa100-2' && hname != 'e230-dgxa100-3' && hname != 'e230-dgxa100-4' && hname != 'e071-gpu06')]\""
-  # - "-R tensorcore"
-  # - "-L /bin/bash"
-
-
-deeprvat_compute_burdens:
-  - "-q gpu-lowprio"
-  - "-gpu num=1:j_exclusive=yes:gmem=10.7G"
-  - "-R \"select[(hname != 'e230-dgx2-1' && hname != 'e230-dgx2-2' && hname != 'e230-dgxa100-1' && hname != 'e230-dgxa100-2' && hname != 'e230-dgxa100-3' && hname != 'e230-dgxa100-4' && hname != 'e071-gpu06')]\""
-  - "-W 180"
-  # - "-R tensorcore"
-  # - "-L /bin/bash"
-
-prepare_genotypes_per_gene:
-  - "-q long"
-deeprvat_regress:
-  - "-q long"
-
-average_burdens:
-  - "-q long"
-
-regress_missense:
-  - "-q long"
-regress_plof:
-  - "-q long"
-
-seed_gene_regress_missense:
-  - "-q long"
-
-seed_gene_association_dataset:
-  - "-q long"
-association_dataset:
-  - "-q long"
-
-seed_gene_regress_plof:
-  - "-q medium"
-
-deeprvat_compute_plof_burdens:
-  - "-q medium"
-
-deeprvat_regress:
-  - "-q long"

From 4879d9bd102da10f929a7a8a578d1b998ab5cc6f Mon Sep 17 00:00:00 2001
From: PMBio
Date: Mon, 7 Oct 2024 09:59:10 +0000
Subject: [PATCH 13/31] fixup! Format Python code with psf/black pull_request

---
 deeprvat/deeprvat/config.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/deeprvat/deeprvat/config.py b/deeprvat/deeprvat/config.py
index a5a671c0..3586bf70 100644
--- a/deeprvat/deeprvat/config.py
+++ b/deeprvat/deeprvat/config.py
@@ -383,7 +383,9 @@ def create_main_config(
         "alpha": input_config["evaluation"]["alpha"],
     }
     if "center_scale_burdens" in input_config["evaluation"]:
-        full_config["center_scale_burdens"] = input_config["evaluation"]["center_scale_burdens"]
+        full_config["center_scale_burdens"] = input_config["evaluation"][
+            "center_scale_burdens"
+        ]
 
     if pretrained_setup:
         full_config.update(

From c39cf8d93c2cd1f9f5b31c382b8e13c56540c722 Mon Sep 17 00:00:00 2001
From: Kayla Meyer
Date: Mon, 7 Oct 2024 12:52:16 +0200
Subject: [PATCH 14/31] bugfix compute_burdens rule

---
 pipelines/association_testing/burdens.snakefile | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/pipelines/association_testing/burdens.snakefile b/pipelines/association_testing/burdens.snakefile
index 2ba4bd88..1af44f2c 100644
--- a/pipelines/association_testing/burdens.snakefile
+++ b/pipelines/association_testing/burdens.snakefile
@@ -74,7 +74,7 @@ rule compute_burdens:
         mem_mb = 32000,
         gpus = 1
     shell:
-        ' && '.join([
+        ' '.join([
        'deeprvat_associate compute-burdens '
        + debug +
        ' --n-chunks '+ str(n_burden_chunks) + ' '
        '--chunk {wildcards.chunk} '
        '--dataset-file {input.dataset} '
        + center_scale_burdens +
        '{input.data_config} '
        '{input.model_config} '
        '{input.checkpoints} '
-       '{params.prefix}/{wildcards.phenotype}/deeprvat/burdens'),
-       'touch {output}'
-       ])
+       '{params.prefix}/burdens'],
+       )

 rule reverse_models:
     input:

From 49e06f6f1f45bf5951d7025b0a04e1a1fa4f254b Mon Sep 17 00:00:00 2001
From: Kayla Meyer
Date: Mon, 7 Oct 2024 12:56:40 +0200
Subject: [PATCH 15/31] bugfix

---
 .../association_testing/burdens.snakefile     | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/pipelines/association_testing/burdens.snakefile b/pipelines/association_testing/burdens.snakefile
index 1af44f2c..a274a382 100644
--- a/pipelines/association_testing/burdens.snakefile
+++ b/pipelines/association_testing/burdens.snakefile
@@ -75,16 +75,16 @@ rule compute_burdens:
         gpus = 1
     shell:
         ' '.join([
-       'deeprvat_associate compute-burdens '
-       + debug +
-       ' --n-chunks '+ str(n_burden_chunks) + ' '
-       '--chunk {wildcards.chunk} '
-       '--dataset-file {input.dataset} '
-       + center_scale_burdens +
-       '{input.data_config} '
-       '{input.model_config} '
-       '{input.checkpoints} '
-       '{params.prefix}/burdens'],
+            'deeprvat_associate compute-burdens '
+            + debug +
+            ' --n-chunks '+ str(n_burden_chunks) + ' '
+            '--chunk {wildcards.chunk} '
+            '--dataset-file {input.dataset} '
+            + center_scale_burdens +
+            '{input.data_config} '
+            '{input.model_config} '
+            '{input.checkpoints} '
+            '{params.prefix}/burdens'],
        )

 rule reverse_models:
     input:

From ee887585eb0c2ef8877605383c116eb543d6b3d8 Mon Sep 17 00:00:00 2001
From: Kayla Meyer
Date: Mon, 7 Oct 2024 14:28:59 +0200
Subject: [PATCH 16/31] remove skip_burdens from get_burden

---
 deeprvat/deeprvat/associate.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/deeprvat/deeprvat/associate.py b/deeprvat/deeprvat/associate.py
index a0fbdf98..a70195ef 100644
--- a/deeprvat/deeprvat/associate.py
+++ b/deeprvat/deeprvat/associate.py
@@ -64,10 +64,8 @@ def get_burden(
     :type agg_models: Dict[str, List[nn.Module]]
     :param device: Device to perform computations on, defaults to "cpu".
     :type device: torch.device
-    :param skip_burdens: Flag to skip burden computation, defaults to False.
-    :type skip_burdens: bool
     :return: Tuple containing burden scores, target y phenotype values, x phenotypes and sample ids.
-    :rtype: Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
+    :rtype: Tuple[torch.Tensor, torch.Tensor]
 
     .. note::
         Checkpoint models all corresponding to the same repeat are averaged for that repeat.
@@ -1038,11 +1036,10 @@ def compute_burdens(
                 "x_phenotypes": None,
                 "sample": None,
             }
-            this_mode, _, _, _ = get_burden(
+            this_mode, _ = get_burden(
                 empty_batch,
                 agg_models,
                 device=device,
-                skip_burdens=False,
             )
             this_mode = this_mode.flatten()

From 9a112ddf7819c3f101c9860402138435c1e44255 Mon Sep 17 00:00:00 2001
From: Kayla Meyer
Date: Mon, 7 Oct 2024 15:29:54 +0200
Subject: [PATCH 17/31] bugfix max computation from main branch merge

---
 deeprvat/deeprvat/associate.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/deeprvat/deeprvat/associate.py b/deeprvat/deeprvat/associate.py
index a70195ef..c923d3dd 100644
--- a/deeprvat/deeprvat/associate.py
+++ b/deeprvat/deeprvat/associate.py
@@ -922,6 +922,16 @@ def compute_burdens_(
         if bottleneck and i > 20:
             break
 
+    #Calculate Max for this chunk and store for later
+    max_df = pd.DataFrame(columns=["max"])
+    for r in range(len(agg_models)):
+        chunk_max = np.max(chunk_burden[:,:,r])  #samples x genes x repeats
+        max_df.loc[r, "max"] = chunk_max
+    print(f"Saving Burden Max Scores")
+    max_df.to_csv(
+        f"{Path(cache_dir)}/chunk{chunk}_max.csv", index=False
+    )
+
     burdens[:] = chunk_burden[:]
     sample_ids[:] = chunk_sampleid[:]

From df40898ee0742b29f74f6bf233259d17546b5fd8 Mon Sep 17 00:00:00 2001
From: PMBio
Date: Mon, 7 Oct 2024 13:30:25 +0000
Subject: [PATCH 18/31] fixup! Format Python code with psf/black pull_request

---
 deeprvat/deeprvat/associate.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/deeprvat/deeprvat/associate.py b/deeprvat/deeprvat/associate.py
index c923d3dd..e285087f 100644
--- a/deeprvat/deeprvat/associate.py
+++ b/deeprvat/deeprvat/associate.py
@@ -922,15 +922,13 @@ def compute_burdens_(
         if bottleneck and i > 20:
             break
 
-    #Calculate Max for this chunk and store for later
+    # Calculate Max for this chunk and store for later
     max_df = pd.DataFrame(columns=["max"])
     for r in range(len(agg_models)):
-        chunk_max = np.max(chunk_burden[:,:,r])  #samples x genes x repeats
+        chunk_max = np.max(chunk_burden[:, :, r])  # samples x genes x repeats
         max_df.loc[r, "max"] = chunk_max
     print(f"Saving Burden Max Scores")
-    max_df.to_csv(
-        f"{Path(cache_dir)}/chunk{chunk}_max.csv", index=False
-    )
+    max_df.to_csv(f"{Path(cache_dir)}/chunk{chunk}_max.csv", index=False)
 
     burdens[:] = chunk_burden[:]
     sample_ids[:] = chunk_sampleid[:]

From ae8450aa541864ba37d883b71304909220cae839 Mon Sep 17 00:00:00 2001
From: Kayla Meyer
Date: Tue, 8 Oct 2024 15:13:34 +0200
Subject: [PATCH 19/31] github action test

---
 pipelines/association_testing_pretrained.snakefile         | 2 +-
 pipelines/association_testing_pretrained_regenie.snakefile | 2 +-
 pipelines/training_association_testing.snakefile           | 2 +-
 pipelines/training_association_testing_regenie.snakefile   | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pipelines/association_testing_pretrained.snakefile b/pipelines/association_testing_pretrained.snakefile
index 96bd08ac..a2fb82a0 100644
--- a/pipelines/association_testing_pretrained.snakefile
+++ b/pipelines/association_testing_pretrained.snakefile
@@ -13,7 +13,7 @@ training_phenotypes = []
 n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2
 n_regression_chunks = config.get('n_regression_chunks', 40) if not debug_flag else 2
 n_avg_chunks = config.get('n_avg_chunks', 1)
-center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', True) else ''
+center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', False) else ''
 n_trials = config['hyperparameter_optimization']['n_trials']
 n_bags = config['training']['n_bags'] if not debug_flag else 3
 n_repeats = config['n_repeats']
diff --git a/pipelines/association_testing_pretrained_regenie.snakefile b/pipelines/association_testing_pretrained_regenie.snakefile
index 2ec5f923..d6e32574 100644
--- a/pipelines/association_testing_pretrained_regenie.snakefile
+++ b/pipelines/association_testing_pretrained_regenie.snakefile
@@ -13,7 +13,7 @@ training_phenotypes = []
 n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2
 n_regression_chunks = config.get('n_regression_chunks', 40) if not debug_flag else 2
 n_avg_chunks = config.get('n_avg_chunks', 1)
-center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', True) else ''
+center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', False) else ''
 n_bags = config['training']['n_bags'] if not debug_flag else 3
 n_repeats = config['n_repeats']
 debug = '--debug ' if debug_flag else ''
diff --git a/pipelines/training_association_testing.snakefile b/pipelines/training_association_testing.snakefile
index cf6a20fa..34f86176 100644
--- a/pipelines/training_association_testing.snakefile
+++ b/pipelines/training_association_testing.snakefile
@@ -17,7 +17,7 @@ training_phenotypes = list(training_phenotypes.keys()) if type(training_phenotyp
 n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2
 n_regression_chunks = config.get('n_regression_chunks', 40) if not debug_flag else 2
 n_avg_chunks = config.get('n_avg_chunks', 1)
-center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', True) else ''
+center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', False) else ''
 n_trials = config['hyperparameter_optimization']['n_trials']
 n_bags = config['training']['n_bags'] if not debug_flag else 3
 n_repeats = config['n_repeats']
diff --git a/pipelines/training_association_testing_regenie.snakefile b/pipelines/training_association_testing_regenie.snakefile
index 66c8b179..535a63a9 100644
--- a/pipelines/training_association_testing_regenie.snakefile
+++ b/pipelines/training_association_testing_regenie.snakefile
@@ -16,7 +16,7 @@ training_phenotypes = list(training_phenotypes.keys()) if type(training_phenotyp
 n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2
 n_regression_chunks = config.get('n_regression_chunks', 40) if not debug_flag else 2
 n_avg_chunks = config.get('n_avg_chunks', 1)
-center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', True) else ''
+center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', False) else ''
 n_trials = config['hyperparameter_optimization']['n_trials']
 n_bags = config['training']['n_bags'] if not debug_flag else 3
 n_repeats = config['n_repeats']

From ccc5d56acadcc93590077160e998d95c271272ab Mon Sep 17 00:00:00 2001
From: Kayla Meyer
Date: Tue, 8 Oct 2024 17:00:16 +0200
Subject: [PATCH 20/31] specify upload-artifact version

---
 .github/workflows/run-pipeline.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/run-pipeline.yml b/.github/workflows/run-pipeline.yml
index f6c9322e..a3d4ae43 100644
--- a/.github/workflows/run-pipeline.yml
+++ b/.github/workflows/run-pipeline.yml
@@ -134,7 +134,7 @@ jobs:
       - name: Upload Training Outputs
         id: uploaded_training_outputs
         if: inputs.upload_training_outputs
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v4.4.0
         with:
           name: completed_training_outputs
           path: |
@@ -148,7 +148,7 @@ jobs:
       - name: Upload Pretrained Outputs
         id: uploaded_pretrained_outputs
         if: inputs.upload_pretrained_outputs
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v4.4.0
        with:
           name: completed_pretrained_outputs
           path: |
@@ -163,7 +163,7 @@ jobs:
       - name: Upload Regenie Outputs
         id: uploaded_regenie_outputs
         if: inputs.upload_regenie_outputs
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v4.4.0
         with:
           name: completed_regenie_outputs
           path: |

From e28f8737c29892ae0f6f27da9fa4d2f5f2e4b11a Mon Sep 17 00:00:00 2001
From: Kayla Meyer
Date: Wed, 9 Oct 2024 08:28:44 +0200
Subject: [PATCH 21/31] Reset default of center_scale_burdens to True

---
 pipelines/association_testing_pretrained.snakefile         | 2 +-
 pipelines/association_testing_pretrained_regenie.snakefile | 2 +-
 pipelines/training_association_testing.snakefile           | 2 +-
 pipelines/training_association_testing_regenie.snakefile   | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pipelines/association_testing_pretrained.snakefile b/pipelines/association_testing_pretrained.snakefile
index a2fb82a0..96bd08ac 100644
--- a/pipelines/association_testing_pretrained.snakefile
+++ b/pipelines/association_testing_pretrained.snakefile
@@ -13,7 +13,7 @@ training_phenotypes = []
 n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2
 n_regression_chunks = config.get('n_regression_chunks', 40) if not debug_flag else 2
 n_avg_chunks = config.get('n_avg_chunks', 1)
-center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', False) else ''
+center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', True) else ''
 n_trials = config['hyperparameter_optimization']['n_trials']
 n_bags = config['training']['n_bags'] if not debug_flag else 3
 n_repeats = config['n_repeats']
diff --git a/pipelines/association_testing_pretrained_regenie.snakefile b/pipelines/association_testing_pretrained_regenie.snakefile
index d6e32574..2ec5f923 100644
--- a/pipelines/association_testing_pretrained_regenie.snakefile
+++ b/pipelines/association_testing_pretrained_regenie.snakefile
@@ -13,7 +13,7 @@ training_phenotypes = []
 n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2
 n_regression_chunks = config.get('n_regression_chunks', 40) if not debug_flag else 2
 n_avg_chunks = config.get('n_avg_chunks', 1)
-center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', False) else ''
+center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', True) else ''
 n_bags = config['training']['n_bags'] if not debug_flag else 3
 n_repeats = config['n_repeats']
 debug = '--debug ' if debug_flag else ''
diff --git a/pipelines/training_association_testing.snakefile b/pipelines/training_association_testing.snakefile
index 34f86176..cf6a20fa 100644
--- a/pipelines/training_association_testing.snakefile
+++ b/pipelines/training_association_testing.snakefile
@@ -17,7 +17,7 @@ training_phenotypes = list(training_phenotypes.keys()) if type(training_phenotyp
 n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2
 n_regression_chunks = config.get('n_regression_chunks', 40) if not debug_flag else 2
 n_avg_chunks = config.get('n_avg_chunks', 1)
-center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', False) else ''
+center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', True) else ''
 n_trials = config['hyperparameter_optimization']['n_trials']
 n_bags = config['training']['n_bags'] if not debug_flag else 3
 n_repeats = config['n_repeats']
diff --git a/pipelines/training_association_testing_regenie.snakefile b/pipelines/training_association_testing_regenie.snakefile
index 535a63a9..66c8b179 100644
--- a/pipelines/training_association_testing_regenie.snakefile
+++ b/pipelines/training_association_testing_regenie.snakefile
@@ -16,7 +16,7 @@ training_phenotypes = list(training_phenotypes.keys()) if type(training_phenotyp
 n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2
 n_regression_chunks = config.get('n_regression_chunks', 40) if not debug_flag else 2
 n_avg_chunks = config.get('n_avg_chunks', 1)
-center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', False) else ''
+center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', True) else ''
 n_trials = config['hyperparameter_optimization']['n_trials']
 n_bags = config['training']['n_bags'] if not debug_flag else 3
 n_repeats = config['n_repeats']

From 042aa426cbcdd49629e9c828e8ae8466c9c53024 Mon Sep 17 00:00:00 2001
From: Kayla Meyer
Date: Mon, 14 Oct 2024 10:28:33 +0200
Subject: [PATCH 22/31] remove unused module

---
 deeprvat/deeprvat/associate.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/deeprvat/deeprvat/associate.py b/deeprvat/deeprvat/associate.py
index e285087f..bd625768 100644
--- a/deeprvat/deeprvat/associate.py
+++ b/deeprvat/deeprvat/associate.py
@@ -25,7 +25,6 @@ from tqdm import tqdm, trange
 import zarr
 import re
-import dask.array as da
 
 import deeprvat.deeprvat.models as deeprvat_models
 from deeprvat.data import DenseGTDataset

From 9587f90ec6d9f28624031de7e4b67db9916796e8 Mon Sep 17 00:00:00 2001
From: Kayla Meyer
Date: Mon, 14 Oct 2024 10:30:12 +0200
Subject: [PATCH 23/31] remove extra space

---
 deeprvat/deeprvat/associate.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/deeprvat/deeprvat/associate.py b/deeprvat/deeprvat/associate.py
index bd625768..5e06fdb1 100644
--- a/deeprvat/deeprvat/associate.py
+++ b/deeprvat/deeprvat/associate.py
@@ -25,7 +25,6 @@ from tqdm import tqdm, trange
 import zarr
 import re
-
 import deeprvat.deeprvat.models as deeprvat_models
 from deeprvat.data import DenseGTDataset

From 68aabb37bed6b8d2f67ddf62d3ac05b5bfca4378 Mon Sep 17 00:00:00 2001
From: Kayla Meyer
Date: Mon, 14 Oct 2024 10:31:28 +0200
Subject: [PATCH 24/31] remove extra space

---
 deeprvat/deeprvat/associate.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/deeprvat/deeprvat/associate.py b/deeprvat/deeprvat/associate.py
index 5e06fdb1..9dc67537 100644
--- a/deeprvat/deeprvat/associate.py
+++ b/deeprvat/deeprvat/associate.py
@@ -8,7 +8,6 @@ from pathlib import Path
 from pprint import pprint
 from typing import Dict, List, Optional, Tuple
-
 import click
 import numpy as np
 import pandas as pd

From 08e01ddb0171b574b04ec8e511622610ea35a997 Mon Sep 17 00:00:00 2001
From: Brian Clarke
Date: Mon, 14 Oct 2024 15:23:03 +0200
Subject: [PATCH 25/31] add release notes

---
 docs/index.rst | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/docs/index.rst b/docs/index.rst
index c9d1700f..bc92c6c5 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -8,8 +8,6 @@ Welcome to DeepRVAT's documentation!
 
 Rare variant association testing using deep learning and data-driven gene impairment scores.
 
-_Coming soon:_ Overview of the DeepRVAT methodaster
-
 How to use this documentation
 ===================================
 
@@ -39,6 +37,25 @@ If you use this package, please cite:
 Clarke, Holtkamp et al., “Integration of Variant Annotations Using Deep Set Networks Boosts Rare Variant Association Testing.” Nature Genetics. https://www.nature.com/articles/s41588-024-01919-z
 
+Release notes
+====================================
+
+v1.1.0
+------------------------------------
+
+Adjusts the calibration of DeepRVAT scores to differ from previous versions. Each model in the ensemble has its scores linearly adjusted to lie between -1 and 1, with the "no variant" score set to 0. (Previously, scores were between 0 and 1 with "no variant" close to, but not exactly, 0.5.)
+
+This can change the results of association testing and phenotype prediction, giving (in our testing) a slight boost in yield and replication of significant gene-trait associations.
+
+
+v1.0.0
+------------------------------------
+
+First release version.
+
+The results of the DeepRVAT publication can be reproduced using this version of the package.
+
+
 Contact
 ====================================

From 1b626fc40f051241ddcba230617bd6e276fde377 Mon Sep 17 00:00:00 2001
From: Kayla Meyer
Date: Thu, 17 Oct 2024 08:25:25 +0200
Subject: [PATCH 26/31] update scaling function

---
 deeprvat/deeprvat/associate.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/deeprvat/deeprvat/associate.py b/deeprvat/deeprvat/associate.py
index 9dc67537..746c79f9 100644
--- a/deeprvat/deeprvat/associate.py
+++ b/deeprvat/deeprvat/associate.py
@@ -1625,14 +1625,10 @@ def average_burdens(
                 for r in range(len(repeats)):
                     zero_effect_val = center_scale_df.loc[r, "mode"]
                     repeat_max = max_dfs.loc[r, "max"]
-                    # Subtract off zero effect burden value (mode)
-                    this_burdens[:, :, r] -= zero_effect_val
                     adjusted_max = repeat_max - zero_effect_val
-                    min_val = this_burdens[:, :, r].min()
-                    # Scale values between -1 and 1
+                    # Subtract off zero effect burden value (mode) and scale
                     this_burdens[:, :, r] = (
-                        2 * ((this_burdens[:, :, r] - min_val) / (adjusted_max - min_val))
-                        - 1
+                        (this_burdens[:, :, r] - zero_effect_val) / adjusted_max
                     )
 
             this_burdens = AGG_FCT[agg_fct](this_burdens, axis=2)

From a260b8b96c476a7ec27f020e2d7554120578bc97 Mon Sep 17 00:00:00 2001
From: PMBio
Date: Thu, 17 Oct 2024 06:26:34 +0000
Subject: [PATCH 27/31] fixup! Format Python code with psf/black pull_request

---
 deeprvat/deeprvat/associate.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/deeprvat/deeprvat/associate.py b/deeprvat/deeprvat/associate.py
index 746c79f9..6369ae1c 100644
--- a/deeprvat/deeprvat/associate.py
+++ b/deeprvat/deeprvat/associate.py
@@ -1628,8 +1628,8 @@ def average_burdens(
                     adjusted_max = repeat_max - zero_effect_val
                     # Subtract off zero effect burden value (mode) and scale
                     this_burdens[:, :, r] = (
-                        (this_burdens[:, :, r] - zero_effect_val) / adjusted_max
-                    )
+                        this_burdens[:, :, r] - zero_effect_val
+                    ) / adjusted_max

From 7c5672068a53af8029c957c50c2894a495b85c49 Mon Sep 17 00:00:00 2001
From: Kayla Meyer
Date: Thu, 17 Oct 2024 10:56:07 +0200
Subject: [PATCH 28/31] specify center_scale_burdens option in example configs

---
 example/config/deeprvat_input_config.yaml                   | 2 +-
 example/config/deeprvat_input_config_regenie.yaml           | 1 +
 example/config/deeprvat_input_pretrained_models_config.yaml | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/example/config/deeprvat_input_config.yaml b/example/config/deeprvat_input_config.yaml
index 10cde104..36f92618 100644
--- a/example/config/deeprvat_input_config.yaml
+++ b/example/config/deeprvat_input_config.yaml
@@ -129,7 +129,7 @@ y_transformation: quantile_transform
 evaluation:
   correction_method: Bonferroni
   alpha: 0.05
-  center_scale_burdens: False
+  center_scale_burdens: True
 
 # Subsetting samples for training or association testing
 #sample_files:
diff --git a/example/config/deeprvat_input_config_regenie.yaml b/example/config/deeprvat_input_config_regenie.yaml
index f74814fb..131c0765 100644
--- a/example/config/deeprvat_input_config_regenie.yaml
+++ b/example/config/deeprvat_input_config_regenie.yaml
@@ -129,6 +129,7 @@ y_transformation: quantile_transform
 evaluation:
   correction_method: Bonferroni
   alpha: 0.05
+  center_scale_burdens: True
 
 # Subsetting samples for training or association testing
 #sample_files:
diff --git a/example/config/deeprvat_input_pretrained_models_config.yaml b/example/config/deeprvat_input_pretrained_models_config.yaml
index 1dc4cfde..939f5df0 100644
--- a/example/config/deeprvat_input_pretrained_models_config.yaml
+++ b/example/config/deeprvat_input_pretrained_models_config.yaml
@@ -52,7 +52,7 @@ y_transformation: quantile_transform
 evaluation:
   correction_method: Bonferroni
   alpha: 0.05
-  center_scale_burdens: False
+  center_scale_burdens: True
 
 # Subsetting samples for association testing
 #sample_files:

From e4d31d6094e20425fe7cec71c8647aebb43df256 Mon Sep 17 00:00:00 2001
From: Kayla Meyer
Date: Tue, 22 Oct 2024 09:26:45 +0200
Subject: [PATCH 29/31] remove hard-coded annotation length

---
 deeprvat/deeprvat/associate.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/deeprvat/deeprvat/associate.py b/deeprvat/deeprvat/associate.py
index 6369ae1c..eb8ee1f2 100644
--- a/deeprvat/deeprvat/associate.py
+++ b/deeprvat/deeprvat/associate.py
@@ -1035,8 +1035,9 @@ def compute_burdens(
     if center_scale_burdens:
         if (chunk == 0) or not chunk:
             # Calculate Mode
+            anno_len = len(data_config[data_key]['dataset_config']['rare_embedding']['config']['annotations'])
             empty_batch = {
-                "rare_variant_annotations": torch.zeros(1, 1, 34, 1),
+                "rare_variant_annotations": torch.zeros(1, 1, anno_len, 1),
                 "y": None,
                 "x_phenotypes": None,
                 "sample": None,
             }

From c86757beff744578ce951b70213c243e0d657122 Mon Sep 17 00:00:00 2001
From: PMBio
Date: Tue, 22 Oct 2024 07:27:21 +0000
Subject: [PATCH 30/31] fixup! Format Python code with psf/black pull_request

---
 deeprvat/deeprvat/associate.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/deeprvat/deeprvat/associate.py b/deeprvat/deeprvat/associate.py
index eb8ee1f2..370fadc1 100644
--- a/deeprvat/deeprvat/associate.py
+++ b/deeprvat/deeprvat/associate.py
@@ -1035,7 +1035,11 @@ def compute_burdens(
     if center_scale_burdens:
         if (chunk == 0) or not chunk:
             # Calculate Mode
-            anno_len = len(data_config[data_key]['dataset_config']['rare_embedding']['config']['annotations'])
+            anno_len = len(
+                data_config[data_key]["dataset_config"]["rare_embedding"]["config"][
+                    "annotations"
+                ]
+            )
             empty_batch = {
                 "rare_variant_annotations": torch.zeros(1, 1, anno_len, 1),
                 "y": None,
                 "x_phenotypes": None,
                 "sample": None,
             }

From b77a98aefcfb958a0398be42105d024859d46d33 Mon Sep 17 00:00:00 2001
From: Kayla Meyer
Date: Mon, 4 Nov 2024 11:39:46 +0100
Subject: [PATCH 31/31] add in center_scale_burdens option from config

---
 pipelines/cv_training/cv_training_association_testing.snakefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pipelines/cv_training/cv_training_association_testing.snakefile b/pipelines/cv_training/cv_training_association_testing.snakefile
index 9f507b5b..1b7d275e 100644
--- a/pipelines/cv_training/cv_training_association_testing.snakefile
+++ b/pipelines/cv_training/cv_training_association_testing.snakefile
@@ -18,6 +18,7 @@ burden_phenotype = phenotypes[0]
 n_burden_chunks = config.get("n_burden_chunks", 1) if not debug_flag else 2
 n_regression_chunks = config.get("n_regression_chunks", 40) if not debug_flag else 2
 n_avg_chunks = config.get('n_avg_chunks', 1)
+center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', True) else ''
 n_trials = config["hyperparameter_optimization"]["n_trials"]
 n_bags = config["training"]["n_bags"] if not debug_flag else 3
 n_repeats = config["n_repeats"]
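
Appendix: a minimal NumPy sketch of the burden-score calibration that PATCH 09 through PATCH 27 converge on — per repeat, subtract the zero-effect ("mode") score the model assigns to a no-variant input, then divide by the per-repeat maximum minus that mode, so "no variant" maps to 0 and the largest observed burden maps to 1. Everything below (array shapes, the 0.5 mode values, variable names) is hypothetical and for illustration only; it is not code from the patches.

import numpy as np

rng = np.random.default_rng(0)
# Hypothetical burden scores: samples x genes x repeats
burdens = rng.uniform(0.4, 0.9, size=(100, 50, 6))

# Per repeat: "mode" = score for an all-zero (no-variant) annotation batch;
# "max" = maximum burden across all chunks (stands in for chunk*_max.csv).
mode = np.full(6, 0.5)                  # hypothetical zero-effect scores
repeat_max = burdens.max(axis=(0, 1))   # per-repeat maxima

# Shift so the zero-effect score maps to 0, then scale by (max - mode):
# the observed maximum maps to 1, and scores below the mode go negative.
scaled = (burdens - mode) / (repeat_max - mode)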