diff --git a/computerome_calc_tnfs_almeida.sh b/computerome_calc_tnfs_almeida.sh
deleted file mode 100755
index 950fc95c..00000000
--- a/computerome_calc_tnfs_almeida.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/usr/bin/bash
-module load tools computerome_utils/2.0
-module load anaconda3/2023.03
-module unload gcc
-module load gcc/11.1.0
-module load minimap2/2.17r941 samtools/1.10
-
-source ~/.bashrc
-conda init bash
-conda activate /home/projects/cpr_10006/people/svekut/.conda/vamb
-
-python3 calc_tnf.py
-
diff --git a/computerome_mmseq_search.sh b/computerome_mmseq_search.sh
deleted file mode 100755
index 4b1d8cbe..00000000
--- a/computerome_mmseq_search.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/usr/bin/bash
-module load tools computerome_utils/2.0
-module load anaconda3/2023.03
-module unload gcc
-module load gcc/11.1.0
-module load minimap2/2.17r941 samtools/1.10
-
-source ~/.bashrc
-conda init bash
-conda activate /home/projects/cpr_10006/people/svekut/.conda/vamb
-
-/home/projects/cpr_10006/people/svekut/.conda/vamb/bin/mmseqs databases GTDB gtdb tmp
-
diff --git a/computerome_semisupervised_mmseq_genus.sh b/computerome_semisupervised_mmseq_genus.sh
deleted file mode 100755
index 4f43acc1..00000000
--- a/computerome_semisupervised_mmseq_genus.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/bin/bash
-module load tools computerome_utils/2.0
-module load anaconda3/2023.03
-module unload gcc
-module load gcc/11.1.0
-module load minimap2/2.17r941 samtools/1.10
-
-source ~/.bashrc
-conda init bash
-conda activate /home/projects/cpr_10006/people/svekut/.conda/vamb
-
-nepoch=$2
-dataset=$1
-
-python3 train_mmseq_genus.py --nepoch $nepoch --cuda --dataset $dataset
\ No newline at end of file
diff --git a/computerome_semisupervised_mmseq_genus_almeida.sh b/computerome_semisupervised_mmseq_genus_almeida.sh
deleted file mode 100755
index 5e72befb..00000000
--- a/computerome_semisupervised_mmseq_genus_almeida.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/bin/bash
-module load tools computerome_utils/2.0
-module unload gcc
-module load gcc/11.1.0
-module unload anaconda3/4.4.0
-module load anaconda3/2023.03
-module load minimap2/2.17r941 samtools/1.10
-
-source ~/.bashrc
-conda init bash
-conda activate /home/projects/cpr_10006/people/svekut/.conda/vamb
-
-nepoch=200
-
-python3 train_mmseq_hloss_vamb2label_almeida.py --nepoch $nepoch --cuda --dataset almeida
\ No newline at end of file
diff --git a/computerome_semisupervised_mmseq_genus_binning.sh b/computerome_semisupervised_mmseq_genus_binning.sh
deleted file mode 100755
index 908158be..00000000
--- a/computerome_semisupervised_mmseq_genus_binning.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/bin/bash
-module load tools computerome_utils/2.0
-module load anaconda3/2023.03
-module unload gcc
-module load gcc/11.1.0
-module load minimap2/2.17r941 samtools/1.10
-
-source ~/.bashrc
-conda init bash
-conda activate /home/projects/cpr_10006/people/svekut/.conda/vamb
-
-nepoch=500
-dataset=$1
-
-python3 train_binning.py --dataset $dataset
\ No newline at end of file
diff --git a/computerome_semisupervised_mmseq_genus_svae.sh b/computerome_semisupervised_mmseq_genus_svae.sh
deleted file mode 100755
index 484f555f..00000000
--- a/computerome_semisupervised_mmseq_genus_svae.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/bin/bash
-module load tools computerome_utils/2.0
-module load anaconda3/2023.03
-module unload gcc
-module load gcc/11.1.0
-module load minimap2/2.17r941 samtools/1.10
-
-source ~/.bashrc
-conda init bash
-conda activate /home/projects/cpr_10006/people/svekut/.conda/vamb
-
-nepoch=500
-dataset=$1
-
-python3 train_mmseq_genus_svae.py --nepoch $nepoch --cuda --dataset $dataset
\ No newline at end of file
diff --git a/computerome_semisupervised_mmseq_hloss.sh b/computerome_semisupervised_mmseq_hloss.sh
deleted file mode 100755
index abe28fae..00000000
--- a/computerome_semisupervised_mmseq_hloss.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/bin/bash
-module load tools computerome_utils/2.0
-module load anaconda3/2023.03
-module unload gcc
-module load gcc/11.1.0
-module load minimap2/2.17r941 samtools/1.10
-
-source ~/.bashrc
-conda init bash
-conda activate /home/projects/cpr_10006/people/svekut/.conda/vamb
-
-nepoch=200
-dataset=$1
-
-python3 train_mmseq_hloss_vamb2label.py --nepoch $nepoch --cuda --dataset $dataset
\ No newline at end of file
diff --git a/computerome_semisupervised_mmseq_hloss_concat.sh b/computerome_semisupervised_mmseq_hloss_concat.sh
deleted file mode 100755
index 7f5f4be2..00000000
--- a/computerome_semisupervised_mmseq_hloss_concat.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/bin/bash
-module load tools computerome_utils/2.0
-module load anaconda3/2023.03
-module unload gcc
-module load gcc/11.1.0
-module load minimap2/2.17r941 samtools/1.10
-
-source ~/.bashrc
-conda init bash
-conda activate /home/projects/cpr_10006/people/svekut/.conda/vamb
-
-nepoch=500
-dataset=$1
-
-python3 train_mmseq_hloss_concat.py --nepoch $nepoch --cuda --dataset $dataset
\ No newline at end of file
diff --git a/computerome_semisupervised_mmseq_hloss_inds.sh b/computerome_semisupervised_mmseq_hloss_inds.sh
deleted file mode 100755
index 0321f35a..00000000
--- a/computerome_semisupervised_mmseq_hloss_inds.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/bin/bash
-module load tools computerome_utils/2.0
-module load anaconda3/2023.03
-module unload gcc
-module load gcc/11.1.0
-module load minimap2/2.17r941 samtools/1.10
-
-source ~/.bashrc
-conda init bash
-conda activate /home/projects/cpr_10006/people/svekut/.conda/vamb
-
-nepoch=500
-dataset=$1
-
-python3 train_mmseq_hloss_mmseq_inds.py --nepoch $nepoch --cuda --dataset $dataset
\ No newline at end of file
diff --git a/computerome_semisupervised_mmseq_hloss_labels.sh b/computerome_semisupervised_mmseq_hloss_labels.sh
deleted file mode 100755
index 23b34264..00000000
--- a/computerome_semisupervised_mmseq_hloss_labels.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/bin/bash
-module load tools computerome_utils/2.0
-module load anaconda3/2023.03
-module unload gcc
-module load gcc/11.1.0
-module load minimap2/2.17r941 samtools/1.10
-
-source ~/.bashrc
-conda init bash
-conda activate /home/projects/cpr_10006/people/svekut/.conda/vamb
-
-nepoch=500
-dataset=$1
-
-python3 train_mmseq_hloss_labels.py --nepoch $nepoch --cuda --dataset $dataset
\ No newline at end of file
diff --git a/computerome_semisupervised_mmseq_hloss_longread.sh b/computerome_semisupervised_mmseq_hloss_longread.sh
deleted file mode 100755
index 5d377af5..00000000
--- a/computerome_semisupervised_mmseq_hloss_longread.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/bin/bash
-module load tools computerome_utils/2.0
-module unload gcc
-module load gcc/11.1.0
-module unload anaconda3/4.4.0
-module load anaconda3/2023.03
-module load minimap2/2.17r941 samtools/1.10
-
-source ~/.bashrc
-conda init bash
-conda activate /home/projects/cpr_10006/people/svekut/.conda/vamb
-
-nepoch=1000
-
-python3 train_mmseq_hloss_vamb2label_longread.py --nepoch $nepoch --cuda
\ No newline at end of file
diff --git a/computerome_semisupervised_mmseq_hloss_walktrap.sh b/computerome_semisupervised_mmseq_hloss_walktrap.sh
deleted file mode 100755
index d3f0f7e4..00000000
--- a/computerome_semisupervised_mmseq_hloss_walktrap.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/bin/bash
-module load tools computerome_utils/2.0
-module load anaconda3/2023.03
-module unload gcc
-module load gcc/11.1.0
-module load minimap2/2.17r941 samtools/1.10
-
-source ~/.bashrc
-conda init bash
-conda activate /home/projects/cpr_10006/people/svekut/.conda/vamb
-
-nepoch=200
-dataset=$1
-
-python3 train_mmseq_hloss_vamb2label_walktrap.py --nepoch $nepoch --cuda --dataset $dataset
\ No newline at end of file
diff --git a/computerome_semisupervised_vamb.sh b/computerome_semisupervised_vamb.sh
deleted file mode 100755
index 0bbba98b..00000000
--- a/computerome_semisupervised_vamb.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/bin/bash
-module load tools computerome_utils/2.0
-module load anaconda3/2023.03
-module unload gcc
-module load gcc/11.1.0
-module load minimap2/2.17r941 samtools/1.10
-
-source ~/.bashrc
-conda init bash
-conda activate /home/projects/cpr_10006/people/svekut/.conda/vamb
-
-nepoch=1000
-dataset=$1
-
-python3 train_mmseq_vamb.py --nepoch $nepoch --cuda --dataset $dataset
\ No newline at end of file
diff --git a/computerome_snakemake.sh b/computerome_snakemake.sh
deleted file mode 100755
index 8356a3dc..00000000
--- a/computerome_snakemake.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/bin/bash
-module load tools computerome_utils/2.0
-module unload gcc
-module load gcc/11.1.0
-module load anaconda3/4.4.0
-module load minimap2/2.17r941 samtools/1.10
-
-source ~/.bashrc
-conda init bash
-conda activate /home/projects/cpr_10006/people/svekut/.conda/vamb
-
-nepoch=500
-dataset=$1
-
-python3 train_mmseq_genus.py --nepoch $nepoch --cuda --dataset $dataset
\ No newline at end of file
diff --git a/computerome_test_knud.sh b/computerome_test_knud.sh
deleted file mode 100755
index c47c3ab8..00000000
--- a/computerome_test_knud.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/bin/bash
-module load tools computerome_utils/2.0
-module load anaconda3/2023.03
-module unload gcc
-module load gcc/11.1.0
-module load minimap2/2.17r941 samtools/1.10
-
-source ~/.bashrc
-conda init bash
-conda activate /home/projects/cpr_10006/people/svekut/.conda/vamb
-
-nepoch=500
-dataset=$1
-PATH='/home/projects/ku_00200/data/matrix/mgx22_assembly/vamb'
-/home/projects/cpr_10006/people/svekut/.conda/vamb/bin/vamb --outdir test_knud2 --fasta ${PATH}/outdir/contigs.flt.fna.gz -p 36 --rpkm ${PATH}/outdir/abundance.npz -m 2000 --minfasta 200000 --cuda -o C -m 2000 --minfasta 200000 --model vae-aae
diff --git a/train_binning.py b/train_binning.py
deleted file mode 100644
index 924d16de..00000000
--- a/train_binning.py
+++ /dev/null
@@ -1,101 +0,0 @@
-import vamb
-import sys
-import argparse
-import pickle
-import numpy as np
-import pandas as pd
-import os
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--path", type=str, default='/Users/nmb127/Documents/vamb_data/data')
-parser.add_argument("--nepoch", type=int, default=500)
-parser.add_argument('--cuda', action=argparse.BooleanOptionalAction)
-parser.add_argument("--dataset", type=str, default='airways')
-parser.add_argument("--exp", type=str, default='')
-parser.add_argument("--supervision", type=float, default=1.)
-
-args = vars(parser.parse_args())
-print(args)
-
-SUP = args['supervision']
-CUDA = bool(args['cuda'])
-DATASET = args['dataset']
-DEPTH_PATH = f'/home/projects/cpr_10006/projects/vamb/data/datasets/cami2_{DATASET}'
-PATH_CONTIGS = f'{DEPTH_PATH}/contigs_2kbp.fna.gz'
-BAM_PATH = f'{DEPTH_PATH}/bam_sorted'
-ABUNDANCE_PATH = f'{DEPTH_PATH}/depths.npz'
-N_EPOCHS = args['nepoch']
-MMSEQ_PATH = f'/home/projects/cpr_10006/people/svekut/mmseq2/{DATASET}_taxonomy.tsv'
-
-# PATH_CONTIGS = f'/home/projects/cpr_10006/projects/vamb/analysis/almeida/data/almeida.fa.gz'
-# ABUNDANCE_PATH = f'/home/projects/cpr_10006/projects/vamb/analysis/almeida/data/almeida.jgi.depth.npz'
-
-
-# exp_id = args['exp']
-exp_id = '_species_filter'
-MODEL_PATH = f'model_semisupervised_mmseq_genus_{DATASET}{exp_id}.pt'
-# MODEL_PATH = f'model_semisupervised_mmseq_almeida.pt'
-
-with vamb.vambtools.Reader(PATH_CONTIGS) as contigfile:
-    composition = vamb.parsecontigs.Composition.from_file(contigfile)
-
-rpkms = vamb.vambtools.read_npz(ABUNDANCE_PATH)
-tnfs, lengths = composition.matrix, composition.metadata.lengths
-contignames = composition.metadata.identifiers
-
-dataloader, mask = vamb.encode.make_dataloader(
-    rpkms,
-    composition.matrix,
-    composition.metadata.lengths,
-)
-
-# latent_vamb = np.load(f'latent_trained_semisupervised_mmseq_genus_vamb_{DATASET}{exp_id}.npy')
-# latent_both = np.load(f'latent_trained_semisupervised_mmseq_genus_both_{DATASET}{exp_id}.npy')
-# latent_labels = np.load(f'latent_trained_semisupervised_mmseq_genus_labels_{DATASET}{exp_id}.npy')
-
-latent_vamb = np.load(f'latent_trained_lengths_mmseq_genus_vamb_{DATASET}{exp_id}.npy')
-latent_both = np.load(f'latent_trained_lengths_mmseq_genus_both_{DATASET}{exp_id}.npy')
-latent_labels = np.load(f'latent_trained_lengths_mmseq_genus_labels_{DATASET}{exp_id}.npy')
-inds = np.array(range(latent_vamb.shape[0]))
-with open(f'indices_mmseq_genus_{DATASET}{exp_id}.pickle', 'rb') as fp:
-    indices_both = pickle.load(fp)
-new_indices = [np.argwhere(inds == i)[0][0] for i in indices_both]
-print(latent_both.shape, len(new_indices), len(indices_both))
-latent_vamb_new = latent_vamb.copy()
-latent_vamb_new[new_indices] = latent_both
-
-composition.metadata.filter_mask(mask)
-names = composition.metadata.identifiers
-
-with open(f'clusters_mmseq_species_{DATASET}_v4{exp_id}.tsv', 'w') as binfile:
-    iterator = vamb.cluster.ClusterGenerator(latent_vamb_new)
-    iterator = vamb.vambtools.binsplit(
-        (
-            (names[cluster.medoid], {names[m] for m in cluster.members})
-            for cluster in iterator
-        ),
-        "C"
-    )
-    vamb.vambtools.write_clusters(binfile, iterator)
-
-with open(f'clusters_mmseq_species_vamb_{DATASET}_v4{exp_id}.tsv', 'w') as binfile:
-    iterator = vamb.cluster.ClusterGenerator(latent_vamb)
-    iterator = vamb.vambtools.binsplit(
-        (
-            (names[cluster.medoid], {names[m] for m in cluster.members})
-            for cluster in iterator
-        ),
-        "C"
-    )
-    vamb.vambtools.write_clusters(binfile, iterator)
-
-with open(f'clusters_mmseq_species_labels_{DATASET}_v4{exp_id}.tsv', 'w') as binfile:
-    iterator = vamb.cluster.ClusterGenerator(latent_labels)
-    iterator = vamb.vambtools.binsplit(
-        (
-            (names[cluster.medoid], {names[m] for m in cluster.members})
-            for cluster in iterator
-        ),
-        "C"
-    )
-    vamb.vambtools.write_clusters(binfile, iterator)
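Note on the pattern train_binning.py (above) and most of the removed trainers repeat: the unsupervised latent covers every contig, while the joint latent covers only the taxonomically labelled subset, so the labelled rows are overwritten before clustering. A minimal sketch of that merge-and-cluster step; the file names (`latent_vamb.npy` etc.) are placeholders, while the vamb calls mirror the deleted script:

    # Merge the joint latent into the full unsupervised latent, then cluster.
    import pickle
    import numpy as np
    import vamb

    latent_vamb = np.load('latent_vamb.npy')   # one row per contig
    latent_both = np.load('latent_both.npy')   # rows for labelled contigs only
    with open('indices.pickle', 'rb') as fp:
        labelled = pickle.load(fp)             # row indices of labelled contigs
    names = np.load('names.npy', allow_pickle=True)

    merged = latent_vamb.copy()
    merged[labelled] = latent_both             # labelled rows get the joint encoding

    with open('clusters.tsv', 'w') as binfile:
        clusters = vamb.cluster.ClusterGenerator(merged)
        named = ((names[c.medoid], {names[m] for m in c.members}) for c in clusters)
        vamb.vambtools.write_clusters(binfile, vamb.vambtools.binsplit(named, "C"))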
diff --git a/train_binning_folders.py b/train_binning_folders.py
deleted file mode 100644
index f8804f26..00000000
--- a/train_binning_folders.py
+++ /dev/null
@@ -1,62 +0,0 @@
-import vamb
-import sys
-import argparse
-import pickle
-import numpy as np
-import pandas as pd
-import os
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--path", type=str, default='/Users/nmb127/Documents/vamb_data/data')
-parser.add_argument("--nepoch", type=int, default=500)
-parser.add_argument('--cuda', action=argparse.BooleanOptionalAction)
-parser.add_argument("--dataset", type=str, default='airways')
-parser.add_argument("--exp", type=str, default='')
-parser.add_argument("--supervision", type=float, default=1.)
-
-args = vars(parser.parse_args())
-print(args)
-
-SUP = args['supervision']
-CUDA = bool(args['cuda'])
-DATASET = args['dataset']
-DEPTH_PATH = f'/home/projects/cpr_10006/projects/vamb/data/datasets/cami2_{DATASET}'
-# PATH_CONTIGS = f'{DEPTH_PATH}/contigs_2kbp.fna.gz'
-BAM_PATH = f'{DEPTH_PATH}/bam_sorted'
-# ABUNDANCE_PATH = f'{DEPTH_PATH}/depths.npz'
-N_EPOCHS = args['nepoch']
-MMSEQ_PATH = f'/home/projects/cpr_10006/people/svekut/mmseq2/{DATASET}_taxonomy.tsv'
-
-PATH_CONTIGS = f'/home/projects/cpr_10006/projects/vamb/analysis/almeida/data/almeida.fa.gz'
-ABUNDANCE_PATH = f'/home/projects/cpr_10006/projects/vamb/analysis/almeida/data/almeida.jgi.depth.npz'
-
-
-exp_id = args['exp']
-# MODEL_PATH = f'model_semisupervised_mmseq_genus_{DATASET}{exp_id}.pt'
-MODEL_PATH = f'model_semisupervised_mmseq_almeida.pt'
-
-with vamb.vambtools.Reader(PATH_CONTIGS) as contigfile:
-    composition = vamb.parsecontigs.Composition.from_file(contigfile)
-
-rpkms = vamb.vambtools.read_npz(ABUNDANCE_PATH)
-tnfs, lengths = composition.matrix, composition.metadata.lengths
-contignames = composition.metadata.identifiers
-lengthof = dict(zip(contignames, lengths))
-
-with open(f'clusters_mmseq_genus_{DATASET}_v4{exp_id}.tsv') as file:
-    clusters = vamb.vambtools.read_clusters(file)
-filtered_clusters = dict()
-
-for cluster, contigs in clusters.items():
-    size = sum(lengthof[contig] for contig in contigs)
-    if size >= 200000:
-        filtered_clusters[cluster] = clusters[cluster]
-
-with vamb.vambtools.Reader(PATH_CONTIGS) as file:
-    vamb.vambtools.write_bins(
-        f'bins_almeida',
-        filtered_clusters,
-        file,
-        maxbins=None,
-        separator='C',
-    )
diff --git a/train_mmseq_almeida.py b/train_mmseq_almeida.py
deleted file mode 100644
index 49cfde73..00000000
--- a/train_mmseq_almeida.py
+++ /dev/null
@@ -1,81 +0,0 @@
-import vamb
-import sys
-import argparse
-import pickle
-import numpy as np
-import pandas as pd
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--nepoch", type=int, default=500)
-parser.add_argument('--cuda', action=argparse.BooleanOptionalAction)
-parser.add_argument("--dataset", type=str, default='airways')
-parser.add_argument("--supervision", type=float, default=1.)
-
-args = vars(parser.parse_args())
-print(args)
-
-SUP = args['supervision']
-CUDA = bool(args['cuda'])
-DATASET = args['dataset']
-PATH_CONTIGS = f'/home/projects/cpr_10006/projects/vamb/analysis/almeida/data/almeida.fa.gz'
-ABUNDANCE_PATH = f'/home/projects/cpr_10006/projects/vamb/analysis/almeida/data/almeida.jgi.depth.npz'
-MODEL_PATH = f'model_semisupervised_mmseq_almeida.pt'
-N_EPOCHS = args['nepoch']
-MMSEQ_PATH = f'/home/projects/cpr_10006/people/svekut/mmseq2/{DATASET}_taxonomy.tsv'
-
-with vamb.vambtools.Reader(PATH_CONTIGS) as contigfile:
-    composition = vamb.parsecontigs.Composition.from_file(contigfile)
-
-rpkms = vamb.vambtools.read_npz(ABUNDANCE_PATH)
-tnfs, lengths = composition.matrix, composition.metadata.lengths
-contignames = composition.metadata.identifiers
-
-df_mmseq = pd.read_csv(MMSEQ_PATH, delimiter='\t', header=None)
-df_mmseq_genus = df_mmseq[(df_mmseq[2] == 'genus') | (df_mmseq[2] == 'species')]
-df_mmseq_genus['genus'] = df_mmseq_genus[8].str.split(';').str[5]
-
-vc = df_mmseq_genus['genus'].value_counts()
-good_labels = set(vc[vc > 1000].index)
-df_mmseq_genus = df_mmseq_genus[df_mmseq_genus[3].isin(good_labels)]
-
-ind_map = {c: i for i, c in enumerate(contignames)}
-indices_mmseq = [ind_map[c] for c in df_mmseq_genus[0]]
-
-classes_order = list(df_mmseq_genus['genus'])
-vae = vamb.semisupervised_encode.VAEVAE(nsamples=rpkms.shape[1], nlabels=len(set(classes_order)), cuda=CUDA)
-
-with open(f'indices_mmseq_genus_{DATASET}.pickle', 'wb') as handle:
-    pickle.dump(indices_mmseq, handle, protocol=pickle.HIGHEST_PROTOCOL)
-
-dataloader_vamb, mask = vamb.encode.make_dataloader(rpkms, tnfs, lengths, batchsize=256)
-dataloader_joint, mask = vamb.semisupervised_encode.make_dataloader_concat(rpkms[indices_mmseq], tnfs[indices_mmseq], lengths[indices_mmseq], classes_order, batchsize=256)
-dataloader_labels, mask = vamb.semisupervised_encode.make_dataloader_labels(rpkms[indices_mmseq], tnfs[indices_mmseq], lengths[indices_mmseq], classes_order, batchsize=256)
-shapes = (rpkms.shape[1], 103, 1, len(set(classes_order)))
-print('shapes', shapes)
-dataloader = vamb.semisupervised_encode.make_dataloader_semisupervised(dataloader_joint, dataloader_vamb, dataloader_labels, shapes, batchsize=256)
-print('Dataloaders created')
-
-with open(MODEL_PATH, 'wb') as modelfile:
-    print('training')
-    vae.trainmodel(
-        dataloader,
-        nepochs=N_EPOCHS,
-        modelfile=modelfile,
-        logfile=sys.stdout,
-    )
-    print('training')
-
-latent = vae.VAEVamb.encode(dataloader_vamb)
-LATENT_PATH = f'latent_trained_semisupervised_mmseq_genus_vamb_{DATASET}.npy'
-print('Saving latent space: Vamb')
-np.save(LATENT_PATH, latent)
-
-latent = vae.VAEJoint.encode(dataloader_joint)
-LATENT_PATH = f'latent_trained_semisupervised_mmseq_genus_both_{DATASET}.npy'
-print('Saving latent space: Both')
-np.save(LATENT_PATH, latent)
-
-latent = vae.VAELabels.encode(dataloader_labels)
-LATENT_PATH = f'latent_trained_semisupervised_mmseq_genus_labels_{DATASET}.npy'
-print('Saving latent space: Labels')
-np.save(LATENT_PATH, latent)
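The VAEVAE trainers above all share the same three-dataloader recipe: one unsupervised loader over every contig, one joint loader and one labels-only loader over the labelled subset, stitched together into a semi-supervised loader. A condensed sketch; the signatures mirror the deleted code, and `rpkms`, `tnfs`, `lengths`, `labels`, and `idx` are assumed to be already in memory:

    # Semi-supervised VAEVAE setup, condensed from the scripts above.
    import sys
    import vamb

    vae = vamb.semisupervised_encode.VAEVAE(
        nsamples=rpkms.shape[1], nlabels=len(set(labels)), cuda=False)
    dataloader_vamb, _ = vamb.encode.make_dataloader(rpkms, tnfs, lengths)
    dataloader_joint, _ = vamb.semisupervised_encode.make_dataloader_concat(
        rpkms[idx], tnfs[idx], lengths[idx], labels)
    dataloader_labels, _ = vamb.semisupervised_encode.make_dataloader_labels(
        rpkms[idx], tnfs[idx], lengths[idx], labels)
    shapes = (rpkms.shape[1], 103, 1, len(set(labels)))  # 103 = TNF feature dimension
    dataloader = vamb.semisupervised_encode.make_dataloader_semisupervised(
        dataloader_joint, dataloader_vamb, dataloader_labels, shapes)
    with open('model.pt', 'wb') as modelfile:
        vae.trainmodel(dataloader, nepochs=500, modelfile=modelfile, logfile=sys.stdout)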
diff --git a/train_mmseq_genus.py b/train_mmseq_genus.py
deleted file mode 100644
index 2c7c60be..00000000
--- a/train_mmseq_genus.py
+++ /dev/null
@@ -1,147 +0,0 @@
-import vamb
-import sys
-import argparse
-import pickle
-import numpy as np
-import pandas as pd
-import os
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--path", type=str, default='/Users/nmb127/Documents/vamb_data/data')
-parser.add_argument("--nepoch", type=int, default=500)
-parser.add_argument('--cuda', action=argparse.BooleanOptionalAction)
-parser.add_argument("--dataset", type=str, default='airways')
-parser.add_argument("--supervision", type=float, default=1.)
-
-args = vars(parser.parse_args())
-print(args)
-
-SUP = args['supervision']
-CUDA = bool(args['cuda'])
-DATASET = args['dataset']
-DEPTH_PATH = f'/home/projects/cpr_10006/projects/vamb/data/datasets/cami2_{DATASET}'
-PATH_CONTIGS = f'{DEPTH_PATH}/contigs_2kbp.fna.gz'
-BAM_PATH = f'{DEPTH_PATH}/bam_sorted'
-ABUNDANCE_PATH = f'{DEPTH_PATH}/depths.npz'
-MODEL_PATH = f'model_semisupervised_mmseq_genus_{DATASET}.pt'
-N_EPOCHS = args['nepoch']
-MMSEQ_PATH = f'/home/projects/cpr_10006/people/svekut/mmseq2/{DATASET}_taxonomy_2023.tsv'
-GT_PATH = f'gt_tax_{DATASET}.csv'
-
-exp_id = f'_mmseq_onehot_genus_{N_EPOCHS}'
-
-with vamb.vambtools.Reader(PATH_CONTIGS) as contigfile:
-    composition = vamb.parsecontigs.Composition.from_file(contigfile)
-
-rpkms = vamb.vambtools.read_npz(ABUNDANCE_PATH)
-tnfs, lengths = composition.matrix, composition.metadata.lengths
-contignames = composition.metadata.identifiers
-
-df_mmseq = pd.read_csv(MMSEQ_PATH, delimiter='\t', header=None)
-
-# df_mmseq_family = df_mmseq[(df_mmseq[2] == 'family') |(df_mmseq[2] == 'genus') | (df_mmseq[2] == 'species')]
-# df_mmseq_family['family'] = df_mmseq_family[8].str.split(';').str[4]
-
-df_mmseq_genus = df_mmseq[(df_mmseq[2] == 'genus') | (df_mmseq[2] == 'species')]
-df_mmseq_genus['genus'] = df_mmseq_genus[8].str.split(';').str[5]
-
-# df_mmseq_genus = df_mmseq[df_mmseq[2] == 'species']
-# df_mmseq_genus['genus'] = df_mmseq_genus[8].str.split(';').str[6]
-
-# df_mmseq_species = df_mmseq[df_mmseq[2] == 'species']
-# vc = df_mmseq_species[3].value_counts()
-# good_labels = set(vc[vc > 100].index)
-# df_mmseq_species = df_mmseq_species[df_mmseq_species[3].isin(good_labels)]
-
-# df_gt = pd.read_csv(GT_PATH)
-
-ind_map = {c: i for i, c in enumerate(contignames)}
-indices_mmseq = [ind_map[c] for c in df_mmseq_genus[0]]
-
-# indices_mmseq = [ind_map[c] for c in df_gt['contigs']]
-# classes_order = np.array(list(df_gt['tax'].str.split(';').str[-1]))[indices_mmseq]
-classes_order = list(df_mmseq_genus['genus'])
-# classes_order = list(df_mmseq_family['family'])
-
-vae = vamb.semisupervised_encode.VAEVAE(nsamples=rpkms.shape[1], nlabels=len(set(classes_order)), cuda=CUDA)
-
-with open(f'indices_mmseq_genus_{DATASET}{exp_id}.pickle', 'wb') as handle:
-    pickle.dump(indices_mmseq, handle, protocol=pickle.HIGHEST_PROTOCOL)
-
-dataloader_vamb, mask_vamb = vamb.encode.make_dataloader(rpkms, tnfs, lengths)
-dataloader_joint, mask = vamb.semisupervised_encode.make_dataloader_concat(rpkms[indices_mmseq], tnfs[indices_mmseq], lengths[indices_mmseq], classes_order)
-dataloader_labels, mask = vamb.semisupervised_encode.make_dataloader_labels(rpkms[indices_mmseq], tnfs[indices_mmseq], lengths[indices_mmseq], classes_order)
-shapes = (rpkms.shape[1], 103, 1, len(set(classes_order)))
-dataloader = vamb.semisupervised_encode.make_dataloader_semisupervised(dataloader_joint, dataloader_vamb, dataloader_labels, shapes)
-with open(MODEL_PATH, 'wb') as modelfile:
-    print('training')
-    vae.trainmodel(
-        dataloader,
-        nepochs=N_EPOCHS,
-        modelfile=modelfile,
-        logfile=sys.stdout,
-        batchsteps=[25, 75, 150],
-    )
-    print('training')
-
-latent_vamb = vae.VAEVamb.encode(dataloader_vamb)
-LATENT_PATH = f'latent_trained_lengths_mmseq_genus_vamb_{DATASET}{exp_id}.npy'
-print('Saving latent space: Vamb')
-np.save(LATENT_PATH, latent_vamb)
-
-latent_labels = vae.VAELabels.encode(dataloader_labels)
-LATENT_PATH = f'latent_trained_lengths_mmseq_genus_labels_{DATASET}{exp_id}.npy'
-print('Saving latent space: Labels')
-np.save(LATENT_PATH, latent_labels)
-
-latent_both = vae.VAEJoint.encode(dataloader_joint)
-LATENT_PATH = f'latent_trained_lengths_mmseq_genus_both_{DATASET}{exp_id}.npy'
-print('Saving latent space: Both')
-np.save(LATENT_PATH, latent_both)
-
-inds = np.array(range(latent_vamb.shape[0]))
-# new_indices = [np.argwhere(inds == i)[0][0] for i in indices_mmseq]
-# print(latent_both.shape, len(new_indices), len(indices_mmseq))
-latent_vamb_new = latent_vamb.copy()
-latent_vamb_new[indices_mmseq] = latent_both
-
-composition.metadata.filter_mask(mask_vamb)
-names = composition.metadata.identifiers
-print('len after mask', len(names))
-print('vamb shape', latent_vamb.shape)
-
-latent_vamb_new_labels = latent_vamb.copy()
-latent_vamb_new_labels[indices_mmseq] = latent_labels
-
-with open(f'clusters_mmseq_{DATASET}_v4{exp_id}.tsv', 'w') as binfile:
-    iterator = vamb.cluster.ClusterGenerator(latent_vamb_new)
-    iterator = vamb.vambtools.binsplit(
-        (
-            (names[cluster.medoid], {names[m] for m in cluster.members})
-            for cluster in iterator
-        ),
-        "C"
-    )
-    vamb.vambtools.write_clusters(binfile, iterator)
-
-with open(f'clusters_mmseq_vamb_{DATASET}_v4{exp_id}.tsv', 'w') as binfile:
-    iterator = vamb.cluster.ClusterGenerator(latent_vamb)
-    iterator = vamb.vambtools.binsplit(
-        (
-            (names[cluster.medoid], {names[m] for m in cluster.members})
-            for cluster in iterator
-        ),
-        "C"
-    )
-    vamb.vambtools.write_clusters(binfile, iterator)
-
-with open(f'clusters_mmseq_labels_{DATASET}_v4{exp_id}.tsv', 'w') as binfile:
-    iterator = vamb.cluster.ClusterGenerator(latent_vamb_new_labels)
-    iterator = vamb.vambtools.binsplit(
-        (
-            (names[cluster.medoid], {names[m] for m in cluster.members})
-            for cluster in iterator
-        ),
-        "C"
-    )
-    vamb.vambtools.write_clusters(binfile, iterator)
\ No newline at end of file
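train_mmseq_genus.py (above) derives its genus labels from the mmseqs2 taxonomy TSV by keeping genus- and species-rank rows and taking field 5 of the ';'-joined lineage string. A runnable toy version with synthetic rows; the `.copy()` is an addition that avoids the pandas SettingWithCopyWarning the original chained assignment triggers:

    # Toy mmseqs2 taxonomy parsing: column 2 is the assigned rank,
    # column 8 the full lineage string from domain downwards.
    import pandas as pd

    df_mmseq = pd.DataFrame({
        0: ['S1C1', 'S1C2', 'S1C3'],
        2: ['species', 'genus', 'no rank'],
        8: ['d_B;p_X;c_Y;o_Z;f_F;g_G;s_S', 'd_B;p_X;c_Y;o_Z;f_F;g_G', ''],
    })
    df_genus = df_mmseq[(df_mmseq[2] == 'genus') | (df_mmseq[2] == 'species')].copy()
    df_genus['genus'] = df_genus[8].str.split(';').str[5]
    print(df_genus['genus'].tolist())  # ['g_G', 'g_G']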
diff --git a/train_mmseq_genus_svae.py b/train_mmseq_genus_svae.py
deleted file mode 100644
index 60e9b1eb..00000000
--- a/train_mmseq_genus_svae.py
+++ /dev/null
@@ -1,145 +0,0 @@
-import vamb
-import sys
-import argparse
-import pickle
-import numpy as np
-import pandas as pd
-import os
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--path", type=str, default='/Users/nmb127/Documents/vamb_data/data')
-parser.add_argument("--nepoch", type=int, default=500)
-parser.add_argument('--cuda', action=argparse.BooleanOptionalAction)
-parser.add_argument("--dataset", type=str, default='airways')
-parser.add_argument("--supervision", type=float, default=1.)
-
-args = vars(parser.parse_args())
-print(args)
-
-SUP = args['supervision']
-CUDA = bool(args['cuda'])
-DATASET = args['dataset']
-DEPTH_PATH = f'/home/projects/cpr_10006/projects/vamb/data/datasets/cami2_{DATASET}'
-PATH_CONTIGS = f'{DEPTH_PATH}/contigs_2kbp.fna.gz'
-BAM_PATH = f'{DEPTH_PATH}/bam_sorted'
-ABUNDANCE_PATH = f'{DEPTH_PATH}/depths.npz'
-N_EPOCHS = args['nepoch']
-MMSEQ_PATH = f'/home/projects/cpr_10006/people/svekut/mmseq2/{DATASET}_taxonomy.tsv'
-
-exp_id = 'svae'
-MODEL_PATH = f'model_semisupervised_mmseq_genus_{DATASET}_{exp_id}.pt'
-
-with vamb.vambtools.Reader(PATH_CONTIGS) as contigfile:
-    composition = vamb.parsecontigs.Composition.from_file(contigfile)
-
-rpkms = vamb.vambtools.read_npz(ABUNDANCE_PATH)
-tnfs, lengths = composition.matrix, composition.metadata.lengths
-contignames = composition.metadata.identifiers
-
-df_mmseq = pd.read_csv(MMSEQ_PATH, delimiter='\t', header=None)
-df_mmseq_genus = df_mmseq[(df_mmseq[2] == 'genus') | (df_mmseq[2] == 'species')]
-df_mmseq_genus['genus'] = df_mmseq_genus[8].str.split(';').str[5]
-
-ind_map = {c: i for i, c in enumerate(contignames)}
-indices_mmseq = [ind_map[c] for c in df_mmseq_genus[0]]
-
-classes_order = list(df_mmseq_genus['genus'])
-
-vae = vamb.semisupervised_encode.SVAE(nsamples=rpkms.shape[1], nlabels=len(set(classes_order)), cuda=CUDA)
-
-with open(f'indices_mmseq_genus_{DATASET}_{exp_id}.pickle', 'wb') as handle:
-    pickle.dump(indices_mmseq, handle, protocol=pickle.HIGHEST_PROTOCOL)
-
-dataloader_vamb, mask1 = vamb.encode.make_dataloader(rpkms, tnfs, lengths)
-dataloader_joint, mask = vamb.semisupervised_encode.make_dataloader_concat(rpkms[indices_mmseq], tnfs[indices_mmseq], lengths[indices_mmseq], classes_order)
-dataloader_labels, mask = vamb.semisupervised_encode.make_dataloader_labels(rpkms[indices_mmseq], tnfs[indices_mmseq], lengths[indices_mmseq], classes_order)
-shapes = (rpkms.shape[1], 103, 1, len(set(classes_order)))
-dataloader = vamb.semisupervised_encode.make_dataloader_semisupervised(dataloader_joint, dataloader_vamb, dataloader_labels, shapes)
-
-vae._VAEVamb.train()
-vae._VAELabels.train()
-vae._VAEVamb_joint.train()
-vae._VAELabels_joint.train()
-vae.VAEVamb.train()
-vae.VAELabels.train()
-with open(MODEL_PATH, 'wb') as modelfile:
-    print('training')
-    vae.trainmodel(
-        dataloader,
-        nepochs=N_EPOCHS,
-        modelfile=modelfile,
-        logfile=sys.stdout,
-    )
-    print('training')
-
-
-vae._VAEVamb.eval()
-vae._VAELabels.eval()
-vae._VAEVamb_joint.eval()
-vae._VAELabels_joint.eval()
-vae.VAEVamb.eval()
-vae.VAELabels.eval()
-latent = vae.VAEVamb.encode(dataloader_vamb)
-LATENT_PATH = f'latent_trained_lengths_mmseq_genus_vamb_{DATASET}_{exp_id}.npy'
-print('Saving latent space: Vamb')
-np.save(LATENT_PATH, latent)
-
-latent = vae.VAELabels.encode(dataloader_labels)
-LATENT_PATH = f'latent_trained_lengths_mmseq_genus_labels_{DATASET}_{exp_id}.npy'
-print('Saving latent space: Labels')
-np.save(LATENT_PATH, latent)
-
-latent = vae.VAEJoint.encode(dataloader_joint)
-LATENT_PATH = f'latent_trained_lengths_mmseq_genus_both_{DATASET}_{exp_id}.npy'
-print('Saving latent space: Both')
-np.save(LATENT_PATH, latent)
-
-latent_vamb = np.load(f'latent_trained_lengths_mmseq_genus_vamb_{DATASET}_{exp_id}.npy')
-latent_both = np.load(f'latent_trained_lengths_mmseq_genus_both_{DATASET}_{exp_id}.npy')
-latent_labels = np.load(f'latent_trained_lengths_mmseq_genus_labels_{DATASET}_{exp_id}.npy')
-
-print('Clustering')
-
-inds = np.array(range(latent_vamb.shape[0]))
-with open(f'indices_mmseq_genus_{DATASET}_{exp_id}.pickle', 'rb') as fp:
-    indices_both = pickle.load(fp)
-new_indices = [np.argwhere(inds == i)[0][0] for i in indices_both]
-print(latent_both.shape, len(new_indices), len(indices_both))
-latent_vamb_new = latent_vamb.copy()
-latent_vamb_new[new_indices] = latent_both
-
-composition.metadata.filter_mask(mask1)
-names = composition.metadata.identifiers
-
-with open(f'clusters_mmseq_genus_{DATASET}_v4_{exp_id}.tsv', 'w') as binfile:
-    iterator = vamb.cluster.ClusterGenerator(latent_vamb_new)
-    iterator = vamb.vambtools.binsplit(
-        (
-            (names[cluster.medoid], {names[m] for m in cluster.members})
-            for cluster in iterator
-        ),
-        "C"
-    )
-    vamb.vambtools.write_clusters(binfile, iterator)
-
-with open(f'clusters_mmseq_genus_vamb_{DATASET}_v4_{exp_id}.tsv', 'w') as binfile:
-    iterator = vamb.cluster.ClusterGenerator(latent_vamb)
-    iterator = vamb.vambtools.binsplit(
-        (
-            (names[cluster.medoid], {names[m] for m in cluster.members})
-            for cluster in iterator
-        ),
-        "C"
-    )
-    vamb.vambtools.write_clusters(binfile, iterator)
-
-with open(f'clusters_mmseq_genus_labels_{DATASET}_v4_{exp_id}.tsv', 'w') as binfile:
-    iterator = vamb.cluster.ClusterGenerator(latent_labels)
-    iterator = vamb.vambtools.binsplit(
-        (
-            (names[cluster.medoid], {names[m] for m in cluster.members})
-            for cluster in iterator
-        ),
-        "C"
-    )
-    vamb.vambtools.write_clusters(binfile, iterator)
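A small performance note on the index remapping that train_binning.py, the SVAE script above, and the hloss-inds script below all share: `np.argwhere(inds == i)[0][0]` scans the whole array once per lookup, which is quadratic overall, and since `inds` is `arange(n)` it is the identity map anyway. A runnable demonstration that a dict lookup gives the same answer in linear time, even if `inds` were ever a permutation:

    import numpy as np

    inds = np.array(range(10))          # as in the scripts: arange over latent rows
    indices_both = [3, 7, 2]
    new_indices_slow = [np.argwhere(inds == i)[0][0] for i in indices_both]
    pos = {v: j for j, v in enumerate(inds)}  # O(n) once
    new_indices_fast = [pos[i] for i in indices_both]
    assert list(new_indices_slow) == new_indices_fast == indices_both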
diff --git a/train_mmseq_hloss.py b/train_mmseq_hloss.py
deleted file mode 100644
index a83d265b..00000000
--- a/train_mmseq_hloss.py
+++ /dev/null
@@ -1,159 +0,0 @@
-import vamb
-import sys
-import argparse
-import pickle
-import numpy as np
-import pandas as pd
-import os
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--path", type=str, default='/Users/nmb127/Documents/vamb_data/data')
-parser.add_argument("--nepoch", type=int, default=500)
-parser.add_argument('--cuda', action=argparse.BooleanOptionalAction)
-parser.add_argument("--dataset", type=str, default='airways')
-parser.add_argument("--supervision", type=float, default=1.)
-
-args = vars(parser.parse_args())
-print(args)
-
-exp_id = '_hloss_labels'
-
-SUP = args['supervision']
-CUDA = bool(args['cuda'])
-DATASET = args['dataset']
-DEPTH_PATH = f'/home/projects/cpr_10006/projects/vamb/data/datasets/cami2_{DATASET}'
-PATH_CONTIGS = f'{DEPTH_PATH}/contigs_2kbp.fna.gz'
-BAM_PATH = f'{DEPTH_PATH}/bam_sorted'
-ABUNDANCE_PATH = f'{DEPTH_PATH}/depths.npz'
-MODEL_PATH = f'model_semisupervised_mmseq_genus_{DATASET}.pt'
-N_EPOCHS = args['nepoch']
-MMSEQ_PATH = f'/home/projects/cpr_10006/people/svekut/mmseq2/{DATASET}_taxonomy.tsv'
-GT_PATH = f'gt_tax_{DATASET}.csv'
-
-with vamb.vambtools.Reader(PATH_CONTIGS) as contigfile:
-    composition = vamb.parsecontigs.Composition.from_file(contigfile)
-
-rpkms = vamb.vambtools.read_npz(ABUNDANCE_PATH)
-tnfs, lengths = composition.matrix, composition.metadata.lengths
-contignames = composition.metadata.identifiers
-
-df_mmseq = pd.read_csv(MMSEQ_PATH, delimiter='\t', header=None)
-
-df_mmseq = df_mmseq[~(df_mmseq[2] == 'no rank')]
-# df_mmseq_genus = df_mmseq[(df_mmseq[2] == 'genus') | (df_mmseq[2] == 'species')]
-# df_mmseq_genus = df_mmseq_genus[df_mmseq_genus[7] > 0.8]
-# classes_order = list(df_mmseq_genus[8].str.split(';').str[-1])
-
-df_gt = pd.read_csv(GT_PATH)
-df_gt['tax'] = df_gt['tax'].str.replace('Bacteria', 'd_Bacteria', regex=False)
-df_gt['tax'] = df_gt['tax'].str.replace('Archaea', 'd_Archaea', regex=False)
-df_gt['gt_genus'] = df_gt['tax'].str.split(';').str[:6].map(lambda x: ';'.join(x))
-df_gt['gt_species'] = df_gt['tax'].str.split(';').str[:7].map(lambda x: ';'.join(x))
-
-df_merged = pd.merge(df_mmseq, df_gt, left_on=0, right_on='contigs')
-df_merged['gt_tree'] = df_merged['gt_species']
-df_merged.loc[(df_merged[2] == 'genus'), 'gt_tree'] = df_merged.loc[(df_merged[2] == 'genus'), 'gt_genus']
-df_merged['len'] = df_merged['gt_tree'].str.split(';').str[:].map(lambda x: len(x))
-print(df_merged['len'].value_counts(), flush=True)
-
-df_mmseq_genus = df_merged[(df_merged[2] == 'genus') | (df_merged[2] == 'species')]
-
-ind_map = {c: i for i, c in enumerate(contignames)}
-indices_mmseq = [ind_map[c] for c in df_mmseq_genus[0]]
-
-graph_column = df_mmseq_genus['gt_tree']
-nodes, ind_nodes, table_parent = vamb.h_loss.make_graph(graph_column.unique())
-
-# indices_mmseq = [ind_map[c] for c in df_gt['contigs']]
-classes_order = np.array(list(graph_column.str.split(';').str[-1]))
-targets = [ind_nodes[i] for i in classes_order]
-
-vae = vamb.h_loss.VAEVAEHLoss(
-    rpkms.shape[1],
-    len(nodes),
-    nodes,
-    table_parent,
-    cuda=CUDA,
-    logfile=sys.stdout,
-)
-
-with open(f'indices_mmseq_genus_{DATASET}{exp_id}.pickle', 'wb') as handle:
-    pickle.dump(indices_mmseq, handle, protocol=pickle.HIGHEST_PROTOCOL)
-
-dataloader_vamb, mask_vamb = vamb.encode.make_dataloader(rpkms, tnfs, lengths)
-dataloader_joint, mask = vamb.h_loss.make_dataloader_concat_hloss(rpkms[indices_mmseq], tnfs[indices_mmseq], lengths[indices_mmseq], targets, len(nodes), table_parent)
-dataloader_labels, mask = vamb.h_loss.make_dataloader_labels_hloss(rpkms[indices_mmseq], tnfs[indices_mmseq], lengths[indices_mmseq], targets, len(nodes), table_parent)
-
-shapes = (rpkms.shape[1], 103, 1, len(nodes))
-dataloader = vamb.h_loss.make_dataloader_semisupervised_hloss(dataloader_joint, dataloader_vamb, dataloader_labels, len(nodes), table_parent, shapes)
-with open(MODEL_PATH, 'wb') as modelfile:
-    print('training')
-    vae.trainmodel(
-        dataloader,
-        nepochs=N_EPOCHS,
-        modelfile=modelfile,
-        logfile=sys.stdout,
-        # batchsteps=[25, 75, 150],
-        batchsteps=[],
-    )
-    print('training')
-
-latent_vamb = vae.VAEVamb.encode(dataloader_vamb)
-LATENT_PATH = f'latent_trained_lengths_mmseq_genus_vamb_{DATASET}{exp_id}.npy'
-print('Saving latent space: Vamb')
-np.save(LATENT_PATH, latent_vamb)
-
-latent_labels = vae.VAELabels.encode(dataloader_labels)
-LATENT_PATH = f'latent_trained_lengths_mmseq_genus_labels_{DATASET}{exp_id}.npy'
-print('Saving latent space: Labels')
-np.save(LATENT_PATH, latent_labels)
-
-latent_both = vae.VAEJoint.encode(dataloader_joint)
-LATENT_PATH = f'latent_trained_lengths_mmseq_genus_both_{DATASET}{exp_id}.npy'
-print('Saving latent space: Both')
-np.save(LATENT_PATH, latent_both)
-
-inds = np.array(range(latent_vamb.shape[0]))
-# new_indices = [np.argwhere(inds == i)[0][0] for i in indices_mmseq]
-# print(latent_both.shape, len(new_indices), len(indices_mmseq))
-latent_vamb_new = latent_vamb.copy()
-latent_vamb_new[indices_mmseq] = latent_both
-
-composition.metadata.filter_mask(mask_vamb)
-names = composition.metadata.identifiers
-
-latent_vamb_new_labels = latent_vamb.copy()
-latent_vamb_new_labels[indices_mmseq] = latent_labels
-
-with open(f'clusters_mmseq_{DATASET}_v4{exp_id}.tsv', 'w') as binfile:
-    iterator = vamb.cluster.ClusterGenerator(latent_vamb_new)
-    iterator = vamb.vambtools.binsplit(
-        (
-            (names[cluster.medoid], {names[m] for m in cluster.members})
-            for cluster in iterator
-        ),
-        "C"
-    )
-    vamb.vambtools.write_clusters(binfile, iterator)
-
-with open(f'clusters_mmseq_vamb_{DATASET}_v4{exp_id}.tsv', 'w') as binfile:
-    iterator = vamb.cluster.ClusterGenerator(latent_vamb)
-    iterator = vamb.vambtools.binsplit(
-        (
-            (names[cluster.medoid], {names[m] for m in cluster.members})
-            for cluster in iterator
-        ),
-        "C"
-    )
-    vamb.vambtools.write_clusters(binfile, iterator)
-
-with open(f'clusters_mmseq_labels_{DATASET}_v4{exp_id}.tsv', 'w') as binfile:
-    iterator = vamb.cluster.ClusterGenerator(latent_vamb_new_labels)
-    iterator = vamb.vambtools.binsplit(
-        (
-            (names[cluster.medoid], {names[m] for m in cluster.members})
-            for cluster in iterator
-        ),
-        "C"
-    )
-    vamb.vambtools.write_clusters(binfile, iterator)
\ No newline at end of file
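The hierarchical-loss scripts turn ';'-joined lineage strings into a taxonomy graph and train against node indices rather than flat one-hot classes. A sketch of that target construction; the three-field `make_graph` return is the signature used by the deleted script above (an older six-field variant appears further down), and the lineage strings are illustrative:

    # Build the taxonomy graph and per-contig hierarchy targets.
    import numpy as np
    import vamb

    lineages = np.array([
        'd_Bacteria;p_X;c_Y;o_Z;f_F;g_G;s_S1',
        'd_Bacteria;p_X;c_Y;o_Z;f_F;g_G',   # a genus-rank hit stops one level higher
    ])
    nodes, ind_nodes, table_parent = vamb.h_loss.make_graph(np.unique(lineages))
    # The training target for each contig is the node index of its deepest label.
    classes_order = [lin.split(';')[-1] for lin in lineages]
    targets = [ind_nodes[c] for c in classes_order]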
diff --git a/train_mmseq_hloss_concat.py b/train_mmseq_hloss_concat.py
deleted file mode 100644
index 577f1e0f..00000000
--- a/train_mmseq_hloss_concat.py
+++ /dev/null
@@ -1,86 +0,0 @@
-import vamb
-import sys
-import argparse
-import pickle
-import numpy as np
-import pandas as pd
-import os
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--path", type=str, default='/Users/nmb127/Documents/vamb_data/data')
-parser.add_argument("--nepoch", type=int, default=500)
-parser.add_argument('--cuda', action=argparse.BooleanOptionalAction)
-parser.add_argument("--dataset", type=str, default='airways')
-parser.add_argument("--supervision", type=float, default=1.)
-
-args = vars(parser.parse_args())
-print(args)
-
-exp_id = '_hloss_mmseq_predict'
-
-SUP = args['supervision']
-CUDA = bool(args['cuda'])
-DATASET = args['dataset']
-DEPTH_PATH = f'/home/projects/cpr_10006/projects/vamb/data/datasets/cami2_{DATASET}'
-PATH_CONTIGS = f'{DEPTH_PATH}/contigs_2kbp.fna.gz'
-BAM_PATH = f'{DEPTH_PATH}/bam_sorted'
-ABUNDANCE_PATH = f'{DEPTH_PATH}/depths.npz'
-MODEL_PATH = f'model_semisupervised_mmseq_{DATASET}{exp_id}.pt'
-N_EPOCHS = args['nepoch']
-MMSEQ_PATH = f'/home/projects/cpr_10006/people/svekut/mmseq2/{DATASET}_taxonomy.tsv'
-GT_PATH = f'gt_tax_{DATASET}.csv'
-
-with vamb.vambtools.Reader(PATH_CONTIGS) as contigfile:
-    composition = vamb.parsecontigs.Composition.from_file(contigfile)
-
-rpkms = vamb.vambtools.read_npz(ABUNDANCE_PATH)
-tnfs, lengths = composition.matrix, composition.metadata.lengths
-contignames = composition.metadata.identifiers
-
-df_gt = pd.read_csv(GT_PATH)
-graph_column = df_gt[f'predictions{exp_id}']
-nodes, ind_nodes, table_parent = vamb.h_loss.make_graph(graph_column.unique())
-
-classes_order = np.array(list(graph_column.str.split(';').str[-1]))
-targets = [ind_nodes[i] for i in classes_order]
-
-vae = vamb.h_loss.VAEConcatHLoss(
-    rpkms.shape[1],
-    len(nodes),
-    nodes,
-    table_parent,
-    cuda=CUDA,
-    logfile=sys.stdout,
-)
-
-dataloader_vamb, mask_vamb = vamb.encode.make_dataloader(rpkms, tnfs, lengths)
-dataloader_joint, mask = vamb.h_loss.make_dataloader_concat_hloss(rpkms, tnfs, lengths, targets, len(nodes), table_parent)
-with open(MODEL_PATH, 'wb') as modelfile:
-    print('training')
-    vae.trainmodel(
-        dataloader_joint,
-        nepochs=N_EPOCHS,
-        modelfile=modelfile,
-        logfile=sys.stdout,
-        batchsteps=set(),
-    )
-    print('training')
-
-latent_both = vae.encode(dataloader_joint)
-LATENT_PATH = f'latent_concat_both_{DATASET}{exp_id}.npy'
-print('Saving latent space: Both')
-np.save(LATENT_PATH, latent_both)
-
-composition.metadata.filter_mask(mask_vamb)
-names = composition.metadata.identifiers
-
-with open(f'clusters_concat_{DATASET}_v4{exp_id}.tsv', 'w') as binfile:
-    iterator = vamb.cluster.ClusterGenerator(latent_both)
-    iterator = vamb.vambtools.binsplit(
-        (
-            (names[cluster.medoid], {names[m] for m in cluster.members})
-            for cluster in iterator
-        ),
-        "C"
-    )
-    vamb.vambtools.write_clusters(binfile, iterator)
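Unlike the VAEVAE scripts, the concat variant above trains a single model on the joint (abundance + composition + labels) dataloader and clusters its one latent space. A condensed sketch with the same calls, assuming `rpkms`, `tnfs`, `lengths`, `targets`, `nodes`, and `table_parent` are already built as in the deleted script:

    # Single-model hierarchical-loss training on the joint dataloader.
    import sys
    import vamb

    vae = vamb.h_loss.VAEConcatHLoss(
        rpkms.shape[1], len(nodes), nodes, table_parent,
        cuda=False, logfile=sys.stdout)
    dataloader_joint, mask = vamb.h_loss.make_dataloader_concat_hloss(
        rpkms, tnfs, lengths, targets, len(nodes), table_parent)
    with open('model_concat.pt', 'wb') as modelfile:
        vae.trainmodel(dataloader_joint, nepochs=500, modelfile=modelfile,
                       logfile=sys.stdout, batchsteps=set())
    latent = vae.encode(dataloader_joint)  # one latent space to cluster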
diff --git a/train_mmseq_hloss_labels.py b/train_mmseq_hloss_labels.py
deleted file mode 100644
index 0670473b..00000000
--- a/train_mmseq_hloss_labels.py
+++ /dev/null
@@ -1,107 +0,0 @@
-import vamb
-import sys
-import argparse
-import pickle
-import numpy as np
-import pandas as pd
-import os
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--path", type=str, default='/Users/nmb127/Documents/vamb_data/data')
-parser.add_argument("--nepoch", type=int, default=500)
-parser.add_argument('--cuda', action=argparse.BooleanOptionalAction)
-parser.add_argument("--dataset", type=str, default='airways')
-parser.add_argument("--supervision", type=float, default=1.)
-
-args = vars(parser.parse_args())
-print(args)
-
-exp_id = '_hloss_gt'
-
-SUP = args['supervision']
-CUDA = bool(args['cuda'])
-DATASET = args['dataset']
-DEPTH_PATH = f'/home/projects/cpr_10006/projects/vamb/data/datasets/cami2_{DATASET}'
-PATH_CONTIGS = f'{DEPTH_PATH}/contigs_2kbp.fna.gz'
-BAM_PATH = f'{DEPTH_PATH}/bam_sorted'
-ABUNDANCE_PATH = f'{DEPTH_PATH}/depths.npz'
-MODEL_PATH = f'model_semisupervised_mmseq_{DATASET}{exp_id}.pt'
-N_EPOCHS = args['nepoch']
-MMSEQ_PATH = f'/home/projects/cpr_10006/people/svekut/mmseq2/{DATASET}_taxonomy.tsv'
-GT_PATH = f'gt_tax_{DATASET}.csv'
-
-with vamb.vambtools.Reader(PATH_CONTIGS) as contigfile:
-    composition = vamb.parsecontigs.Composition.from_file(contigfile)
-
-rpkms = vamb.vambtools.read_npz(ABUNDANCE_PATH)
-tnfs, lengths = composition.matrix, composition.metadata.lengths
-contignames = composition.metadata.identifiers
-
-# df_mmseq = pd.read_csv(MMSEQ_PATH, delimiter='\t', header=None)
-# df_mmseq_genus = df_mmseq[(df_mmseq[2] == 'genus') | (df_mmseq[2] == 'species')]
-# df_mmseq[8] = df_mmseq[8].fillna('d_Bacteria')
-# df_mmseq.loc[df_mmseq[8].isin(['-_root']), 8] = 'd_Bacteria'
-
-# df_mmseq_family = df_mmseq[(df_mmseq[2] == 'family') |(df_mmseq[2] == 'genus') | (df_mmseq[2] == 'species')]
-df_gt = pd.read_csv(GT_PATH)
-df_gt['tax'] = df_gt['tax'].str.replace('Bacteria', 'd_Bacteria', regex=False)
-df_gt['tax'] = df_gt['tax'].str.replace('Archaea', 'd_Archaea', regex=False)
-
-classes_order = list(df_gt['tax'].str.split(';').str[-1])
-
-ind_map = {c: i for i, c in enumerate(contignames)}
-indices_mmseq = [ind_map[c] for c in df_gt['contigs']]
-
-nodes, ind_nodes, table_indices, table_true, table_walkdown, table_parent = vamb.h_loss.make_graph(df_gt['tax'].unique())
-
-targets = [ind_nodes[i] for i in classes_order]
-
-# vae = vamb.semisupervised_encode.VAELabels(
-#     len(set(classes_order)),
-#     cuda=CUDA,
-#     logfile=sys.stdout,
-# )
-
-vae = vamb.h_loss.VAELabelsHLoss(
-    len(nodes),
-    nodes,
-    table_parent,
-    cuda=CUDA,
-    logfile=sys.stdout,
-)
-
-with open(f'indices_mmseq_genus_{DATASET}{exp_id}.pickle', 'wb') as handle:
-    pickle.dump(indices_mmseq, handle, protocol=pickle.HIGHEST_PROTOCOL)
-
-dataloader_vamb, mask_vamb = vamb.encode.make_dataloader(rpkms, tnfs, lengths)
-# dataloader_labels, mask = vamb.semisupervised_encode.make_dataloader_labels(rpkms[indices_mmseq], tnfs[indices_mmseq], lengths[indices_mmseq], classes_order)
-dataloader_labels, mask = vamb.h_loss.make_dataloader_labels_hloss(rpkms[indices_mmseq], tnfs[indices_mmseq], lengths[indices_mmseq], targets, len(nodes), table_parent)
-
-with open(MODEL_PATH, 'wb') as modelfile:
-    print('training')
-    vae.trainmodel(
-        dataloader_labels,
-        nepochs=N_EPOCHS,
-        modelfile=modelfile,
-        logfile=sys.stdout,
-    )
-    print('training')
-
-latent_both = vae.encode(dataloader_labels)
-LATENT_PATH = f'latent_trained_lengths_mmseq_both_{DATASET}{exp_id}.npy'
-print('Saving latent space: Both')
-np.save(LATENT_PATH, latent_both)
-
-composition.metadata.filter_mask(mask_vamb)
-names = composition.metadata.identifiers
-
-with open(f'clusters_mmseq_{DATASET}_v4{exp_id}.tsv', 'w') as binfile:
-    iterator = vamb.cluster.ClusterGenerator(latent_both)
-    iterator = vamb.vambtools.binsplit(
-        (
-            (names[cluster.medoid], {names[m] for m in cluster.members})
-            for cluster in iterator
-        ),
-        "C"
-    )
-    vamb.vambtools.write_clusters(binfile, iterator)
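Several of these scripts normalise the ground-truth lineages before building the graph: domain names get a `d_` prefix, and lineages are cut at six fields for genus and seven for species. A runnable toy version of that pandas logic:

    import pandas as pd

    df_gt = pd.DataFrame({'tax': ['Bacteria;p_X;c_Y;o_Z;f_F;g_G;s_S']})
    df_gt['tax'] = df_gt['tax'].str.replace('Bacteria', 'd_Bacteria', regex=False)
    df_gt['tax'] = df_gt['tax'].str.replace('Archaea', 'd_Archaea', regex=False)
    df_gt['gt_genus'] = df_gt['tax'].str.split(';').str[:6].map(';'.join)
    df_gt['gt_species'] = df_gt['tax'].str.split(';').str[:7].map(';'.join)
    print(df_gt.iloc[0]['gt_genus'])   # d_Bacteria;p_X;c_Y;o_Z;f_F;g_G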
diff --git a/train_mmseq_hloss_mmseq_inds.py b/train_mmseq_hloss_mmseq_inds.py
deleted file mode 100644
index 560e3586..00000000
--- a/train_mmseq_hloss_mmseq_inds.py
+++ /dev/null
@@ -1,158 +0,0 @@
-import vamb
-import sys
-import argparse
-import pickle
-import numpy as np
-import pandas as pd
-import os
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--path", type=str, default='/Users/nmb127/Documents/vamb_data/data')
-parser.add_argument("--nepoch", type=int, default=500)
-parser.add_argument('--cuda', action=argparse.BooleanOptionalAction)
-parser.add_argument("--dataset", type=str, default='airways')
-parser.add_argument("--supervision", type=float, default=1.)
-
-args = vars(parser.parse_args())
-print(args)
-
-exp_id = '_hloss_mmseq'
-
-SUP = args['supervision']
-CUDA = bool(args['cuda'])
-DATASET = args['dataset']
-DEPTH_PATH = f'/home/projects/cpr_10006/projects/vamb/data/datasets/cami2_{DATASET}'
-PATH_CONTIGS = f'{DEPTH_PATH}/contigs_2kbp.fna.gz'
-BAM_PATH = f'{DEPTH_PATH}/bam_sorted'
-ABUNDANCE_PATH = f'{DEPTH_PATH}/depths.npz'
-MODEL_PATH = f'model_semisupervised_mmseq_genus_{DATASET}.pt'
-N_EPOCHS = args['nepoch']
-MMSEQ_PATH = f'/home/projects/cpr_10006/people/svekut/mmseq2/{DATASET}_taxonomy.tsv'
-GT_PATH = f'gt_tax_{DATASET}.csv'
-
-with vamb.vambtools.Reader(PATH_CONTIGS) as contigfile:
-    composition = vamb.parsecontigs.Composition.from_file(contigfile)
-
-rpkms = vamb.vambtools.read_npz(ABUNDANCE_PATH)
-tnfs, lengths = composition.matrix, composition.metadata.lengths
-contignames = composition.metadata.identifiers
-
-df_mmseq = pd.read_csv(MMSEQ_PATH, delimiter='\t', header=None)
-
-# df_mmseq_genus = df_mmseq[~(df_mmseq[2] == 'no rank')]
-
-df_mmseq_genus = df_mmseq[(df_mmseq[2] == 'genus') | (df_mmseq[2] == 'species')]
-classes_order = list(df_mmseq_genus[8].str.split(';').str[-1])
-
-# df_gt = pd.read_csv(GT_PATH)
-# df_gt['tax'] = df_gt['tax'].str.replace('Bacteria', 'd_Bacteria', regex=False)
-# df_gt['tax'] = df_gt['tax'].str.replace('Archaea', 'd_Archaea', regex=False)
-
-ind_map = {c: i for i, c in enumerate(contignames)}
-indices_mmseq = [ind_map[c] for c in df_mmseq_genus[0]]
-
-# nodes, ind_nodes, table_indices, table_true, table_walkdown, table_parent = vamb.h_loss.make_graph(df_gt['tax'].unique())
-
-nodes, ind_nodes, table_indices, table_true, table_walkdown, table_parent = vamb.h_loss.make_graph(df_mmseq_genus[8].unique())
-
-# indices_mmseq = [ind_map[c] for c in df_gt['contigs']]
-# classes_order = np.array(list(df_gt['tax'].str.split(';').str[-1]))[indices_mmseq]
-targets = [ind_nodes[i] for i in classes_order]
-
-vae = vamb.h_loss.VAEVAEHLoss(
-    rpkms.shape[1],
-    len(nodes),
-    table_indices,
-    table_true,
-    table_walkdown,
-    nodes,
-    table_parent,
-    cuda=CUDA,
-    logfile=sys.stdout,
-)
-
-with open(f'indices_mmseq_genus_{DATASET}{exp_id}.pickle', 'wb') as handle:
-    pickle.dump(indices_mmseq, handle, protocol=pickle.HIGHEST_PROTOCOL)
-
-dataloader_vamb, mask_vamb = vamb.encode.make_dataloader(rpkms, tnfs, lengths)
-dataloader_joint, mask = vamb.h_loss.make_dataloader_concat_hloss(rpkms[indices_mmseq], tnfs[indices_mmseq], lengths[indices_mmseq], targets, len(nodes), table_parent)
-dataloader_labels, mask = vamb.h_loss.make_dataloader_labels_hloss(rpkms[indices_mmseq], tnfs[indices_mmseq], lengths[indices_mmseq], targets, len(nodes), table_parent)
-
-shapes = (rpkms.shape[1], 103, 1, len(nodes))
-dataloader = vamb.h_loss.make_dataloader_semisupervised_hloss(dataloader_joint, dataloader_vamb, dataloader_labels, len(nodes), table_parent, shapes)
-with open(MODEL_PATH, 'wb') as modelfile:
-    print('training')
-    vae.trainmodel(
-        dataloader,
-        nepochs=N_EPOCHS,
-        modelfile=modelfile,
-        logfile=sys.stdout,
-    )
-    print('training')
-
-# vae = vamb.h_loss.VAEVAEHLoss.load(
-#     MODEL_PATH,
-#     table_indices,
-#     table_true,
-#     table_walkdown,
-#     nodes,
-#     table_parent,
-#     evaluate=True,
-# )
-
-latent_vamb = vae.VAEVamb.encode(dataloader_vamb)
-LATENT_PATH = f'latent_trained_lengths_mmseq_genus_vamb_{DATASET}{exp_id}.npy'
-print('Saving latent space: Vamb')
-np.save(LATENT_PATH, latent_vamb)
-
-latent_labels = vae.VAELabels.encode(dataloader_labels)
-LATENT_PATH = f'latent_trained_lengths_mmseq_genus_labels_{DATASET}{exp_id}.npy'
-print('Saving latent space: Labels')
-np.save(LATENT_PATH, latent_labels)
-
-latent_both = vae.VAEJoint.encode(dataloader_joint)
-LATENT_PATH = f'latent_trained_lengths_mmseq_genus_both_{DATASET}{exp_id}.npy'
-print('Saving latent space: Both')
-np.save(LATENT_PATH, latent_both)
-
-inds = np.array(range(latent_vamb.shape[0]))
-new_indices = [np.argwhere(inds == i)[0][0] for i in indices_mmseq]
-print(latent_both.shape, len(new_indices), len(indices_mmseq))
-latent_vamb_new = latent_vamb.copy()
-latent_vamb_new[new_indices] = latent_both
-
-composition.metadata.filter_mask(mask_vamb)
-names = composition.metadata.identifiers
-
-with open(f'clusters_mmseq_{DATASET}_v4{exp_id}.tsv', 'w') as binfile:
-    iterator = vamb.cluster.ClusterGenerator(latent_vamb_new)
-    iterator = vamb.vambtools.binsplit(
-        (
-            (names[cluster.medoid], {names[m] for m in cluster.members})
-            for cluster in iterator
-        ),
-        "C"
-    )
-    vamb.vambtools.write_clusters(binfile, iterator)
-
-with open(f'clusters_mmseq_vamb_{DATASET}_v4{exp_id}.tsv', 'w') as binfile:
-    iterator = vamb.cluster.ClusterGenerator(latent_vamb)
-    iterator = vamb.vambtools.binsplit(
-        (
-            (names[cluster.medoid], {names[m] for m in cluster.members})
-            for cluster in iterator
-        ),
-        "C"
-    )
-    vamb.vambtools.write_clusters(binfile, iterator)
-
-with open(f'clusters_mmseq_labels_{DATASET}_v4{exp_id}.tsv', 'w') as binfile:
-    iterator = vamb.cluster.ClusterGenerator(latent_labels)
-    iterator = vamb.vambtools.binsplit(
-        (
-            (names[cluster.medoid], {names[m] for m in cluster.members})
-            for cluster in iterator
-        ),
-        "C"
-    )
-    vamb.vambtools.write_clusters(binfile, iterator)
\ No newline at end of file
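Note that train_mmseq_hloss_mmseq_inds.py (above) and train_mmseq_hloss_labels.py unpack a six-field return from vamb.h_loss.make_graph, while the other hloss scripts unpack three, so these files target different revisions of the vamb API. The contig-to-row lookup they all share is easy to isolate; a runnable standalone version:

    # Taxonomy rows are matched back to rpkm/tnf rows by contig name.
    import numpy as np

    contignames = np.array(['S1C1', 'S1C2', 'S2C1'])   # composition order
    hits = ['S2C1', 'S1C2']                            # e.g. df_mmseq[0]
    ind_map = {c: i for i, c in enumerate(contignames)}
    indices_mmseq = [ind_map[c] for c in hits]
    assert indices_mmseq == [2, 1]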
diff --git a/train_mmseq_hloss_vamb2label.py b/train_mmseq_hloss_vamb2label.py
deleted file mode 100644
index cec2db98..00000000
--- a/train_mmseq_hloss_vamb2label.py
+++ /dev/null
@@ -1,209 +0,0 @@
-import vamb
-import sys
-import argparse
-import pickle
-import numpy as np
-import pandas as pd
-import os
-from collections import defaultdict
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--path", type=str, default='/Users/nmb127/Documents/vamb_data/data')
-parser.add_argument("--nepoch", type=int, default=500)
-parser.add_argument('--cuda', action=argparse.BooleanOptionalAction)
-parser.add_argument("--dataset", type=str, default='airways')
-parser.add_argument("--supervision", type=float, default=1.)
-
-args = vars(parser.parse_args())
-print(args)
-
-exp_id = '_hloss_mmseq_predict_replace'
-
-SUP = args['supervision']
-CUDA = bool(args['cuda'])
-DATASET = args['dataset']
-DEPTH_PATH = f'/home/projects/cpr_10006/projects/vamb/data/datasets/cami2_{DATASET}'
-PATH_CONTIGS = f'{DEPTH_PATH}/contigs_2kbp.fna.gz'
-BAM_PATH = f'{DEPTH_PATH}/bam_sorted'
-ABUNDANCE_PATH = f'{DEPTH_PATH}/depths.npz'
-MODEL_PATH = f'model_semisupervised_mmseq_genus_{DATASET}.pt'
-N_EPOCHS = args['nepoch']
-MMSEQ_PATH = f'/home/projects/cpr_10006/people/svekut/mmseq2/{DATASET}_taxonomy.tsv'
-GT_PATH = f'gt_tax_{DATASET}.csv'
-
-with vamb.vambtools.Reader(PATH_CONTIGS) as contigfile:
-    composition = vamb.parsecontigs.Composition.from_file(contigfile)
-
-rpkms = vamb.vambtools.read_npz(ABUNDANCE_PATH)
-tnfs, lengths = composition.matrix, composition.metadata.lengths
-contignames = composition.metadata.identifiers
-
-df_mmseq = pd.read_csv(MMSEQ_PATH, delimiter='\t', header=None)
-df_mmseq = df_mmseq[~(df_mmseq[2] == 'no rank')]
-df_mmseq_genus = df_mmseq
-
-ind_map = {c: i for i, c in enumerate(contignames)}
-indices_mmseq = [ind_map[c] for c in df_mmseq_genus[0]]
-
-graph_column = df_mmseq_genus[8]
-nodes, ind_nodes, table_parent = vamb.h_loss.make_graph(graph_column.unique())
-
-classes_order = np.array(list(graph_column.str.split(';').str[-1]))
-targets = [ind_nodes[i] for i in classes_order]
-
-model = vamb.h_loss.VAMB2Label(
-    rpkms.shape[1],
-    len(nodes),
-    nodes,
-    table_parent,
-    cuda=CUDA,
-)
-
-with open(f'indices_mmseq_genus_{DATASET}{exp_id}.pickle', 'wb') as handle:
-    pickle.dump(indices_mmseq, handle, protocol=pickle.HIGHEST_PROTOCOL)
-
-dataloader_vamb, mask_vamb = vamb.encode.make_dataloader(rpkms, tnfs, lengths)
-dataloader_joint, mask = vamb.h_loss.make_dataloader_concat_hloss(rpkms[indices_mmseq], tnfs[indices_mmseq], lengths[indices_mmseq], targets, len(nodes), table_parent)
-
-shapes = (rpkms.shape[1], 103, 1, len(nodes))
-with open(MODEL_PATH, 'wb') as modelfile:
-    print('training')
-    model.trainmodel(
-        dataloader_joint,
-        nepochs=100,
-        modelfile=modelfile,
-        logfile=sys.stdout,
-        batchsteps=[25, 75],
-    )
-    print('training')
-
-latent_vamb = model.predict(dataloader_vamb)
-LATENT_PATH = f'latent_predict_vamb_{DATASET}{exp_id}.npy'
-print('Saving latent space: Vamb', latent_vamb.shape)
-np.save(LATENT_PATH, latent_vamb)
-
-print('Saving the tree', len(nodes))
-TREE_PATH = f'tree_predict_vamb_{DATASET}{exp_id}.npy'
-np.save(TREE_PATH, np.array(nodes))
-PARENT_PATH = f'parents_predict_vamb_{DATASET}{exp_id}.npy'
-np.save(PARENT_PATH, np.array(table_parent))
-
-print('Getting the predictions')
-df_gt = pd.read_csv(GT_PATH)
-predictions = []
-for i in range(len(df_gt)):
-    predictions.append(';'.join(np.array(nodes)[latent_vamb[i] > 0.5][1:]))
-
-df_gt[f'predictions{exp_id}'] = predictions
-
-df_mmseq_sp = df_mmseq[(df_mmseq[2] == 'species')]
-mmseq_map = {k: v for k, v in zip(df_mmseq_sp[0], df_mmseq_sp[8])}
-counters = defaultdict(lambda: 0)
-preds = []
-for i, r in df_gt.iterrows():
-    pred_line = r[f'predictions{exp_id}'].split(';')
-    try:
-        mmseq_line = mmseq_map.get(r['contigs'], '').split(';')
-    except AttributeError:
-        preds.append(';'.join(pred_line))
-        continue
-    if mmseq_line[0] != '':
-        for i in range(len(mmseq_line)):
-            if i < len(pred_line):
-                pred_line[i] = mmseq_line[i]
-    preds.append(';'.join(pred_line))
-df_gt[f'predictions{exp_id}_replace'] = preds
-
-df_gt.to_csv(GT_PATH, index=None)
-
-print('Starting the VAE')
-
-# graph_column = df_gt[f'predictions{exp_id}']
-# for i in range(len(latent_vamb)):
-#     predictions.append(';'.join(np.array(nodes)[latent_vamb[i] > 0.5][1:]))
-
-# df_preds = pd.DataFrame({'contigs': contignames, f'predictions{exp_id}': predictions})
-# df_mmseq_genus = pd.merge(df_mmseq, df_preds, left_on=0, right_on='contigs')
-
-graph_column = df_gt[f'predictions{exp_id}_replace']
-nodes, ind_nodes, table_parent = vamb.h_loss.make_graph(graph_column.unique())
-
-classes_order = np.array(list(graph_column.str.split(';').str[-1]))
-targets = [ind_nodes[i] for i in classes_order]
-
-vae = vamb.h_loss.VAEVAEHLoss(
-    rpkms.shape[1],
-    len(nodes),
-    nodes,
-    table_parent,
-    cuda=CUDA,
-    logfile=sys.stdout,
-)
-
-dataloader_vamb, mask_vamb = vamb.encode.make_dataloader(rpkms, tnfs, lengths)
-dataloader_joint, mask = vamb.h_loss.make_dataloader_concat_hloss(rpkms, tnfs, lengths, targets, len(nodes), table_parent)
-dataloader_labels, mask = vamb.h_loss.make_dataloader_labels_hloss(rpkms, tnfs, lengths, targets, len(nodes), table_parent)
-
-shapes = (rpkms.shape[1], 103, 1, len(nodes))
-dataloader = vamb.h_loss.make_dataloader_semisupervised_hloss(dataloader_joint, dataloader_vamb, dataloader_labels, len(nodes), table_parent, shapes)
-with open(MODEL_PATH, 'wb') as modelfile:
-    print('training')
-    vae.trainmodel(
-        dataloader,
-        nepochs=N_EPOCHS,
-        modelfile=modelfile,
-        logfile=sys.stdout,
-        batchsteps=[25, 75, 150],
-    )
-    print('training')
-
-latent_vamb = vae.VAEVamb.encode(dataloader_vamb)
-LATENT_PATH = f'latent_trained_lengths_mmseq_genus_vamb_{DATASET}{exp_id}.npy'
-print('Saving latent space: Vamb')
-np.save(LATENT_PATH, latent_vamb)
-
-latent_labels = vae.VAELabels.encode(dataloader_labels)
-LATENT_PATH = f'latent_trained_lengths_mmseq_genus_labels_{DATASET}{exp_id}.npy'
-print('Saving latent space: Labels')
-np.save(LATENT_PATH, latent_labels)
-
-latent_both = vae.VAEJoint.encode(dataloader_joint)
-LATENT_PATH = f'latent_trained_lengths_mmseq_genus_both_{DATASET}{exp_id}.npy'
-print('Saving latent space: Both')
-np.save(LATENT_PATH, latent_both)
-
-composition.metadata.filter_mask(mask_vamb)
-names = composition.metadata.identifiers
-
-with open(f'clusters_mmseq_{DATASET}_v4{exp_id}.tsv', 'w') as binfile:
-    iterator = vamb.cluster.ClusterGenerator(latent_both)
-    iterator = vamb.vambtools.binsplit(
-        (
-            (names[cluster.medoid], {names[m] for m in cluster.members})
-            for cluster in iterator
-        ),
-        "C"
-    )
-    vamb.vambtools.write_clusters(binfile, iterator)
-
-with open(f'clusters_mmseq_vamb_{DATASET}_v4{exp_id}.tsv', 'w') as binfile:
-    iterator = vamb.cluster.ClusterGenerator(latent_vamb)
-    iterator = vamb.vambtools.binsplit(
-        (
-            (names[cluster.medoid], {names[m] for m in cluster.members})
-            for cluster in iterator
-        ),
-        "C"
-    )
-    vamb.vambtools.write_clusters(binfile, iterator)
-
-with open(f'clusters_mmseq_labels_{DATASET}_v4{exp_id}.tsv', 'w') as binfile:
-    iterator = vamb.cluster.ClusterGenerator(latent_labels)
-    iterator = vamb.vambtools.binsplit(
-        (
-            (names[cluster.medoid], {names[m] for m in cluster.members})
-            for cluster in iterator
-        ),
-        "C"
-    )
-    vamb.vambtools.write_clusters(binfile, iterator)
\ No newline at end of file
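The vamb2label scripts decode the per-node probability vector by keeping every node above 0.5, dropping the root, and then overwriting the predicted prefix with the mmseqs2 species lineage where one exists. A runnable toy version of that decoding, with made-up nodes and probabilities:

    import numpy as np

    nodes = np.array(['root', 'd_B', 'p_X', 'g_G', 's_S'])
    row = np.array([1.0, 0.9, 0.8, 0.6, 0.2])      # per-node probabilities for one contig
    pred_line = list(nodes[row > 0.5][1:])         # threshold, drop root -> ['d_B', 'p_X', 'g_G']
    mmseq_line = 'd_B;p_X'.split(';')              # an mmseqs2 hit for this contig
    for i in range(len(mmseq_line)):               # mmseqs fields override the prefix
        if i < len(pred_line):
            pred_line[i] = mmseq_line[i]
    print(';'.join(pred_line))                     # d_B;p_X;g_G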
-parser.add_argument("--path", type=str, default='/Users/nmb127/Documents/vamb_data/data') -parser.add_argument("--nepoch", type=int, default=500) -parser.add_argument('--cuda', action=argparse.BooleanOptionalAction) -parser.add_argument("--dataset", type=str, default='airways') -parser.add_argument("--supervision", type=float, default=1.) - -args = vars(parser.parse_args()) -print(args) - -exp_id = '_almeida_hloss_pred_short' - -SUP = args['supervision'] -CUDA = bool(args['cuda']) -DATASET = args['dataset'] -PATH_CONTIGS = f'/home/projects/cpr_10006/projects/vamb/analysis/almeida/data/almeida.fa.gz' -ABUNDANCE_PATH = f'/home/projects/cpr_10006/projects/vamb/analysis/almeida/data/almeida.jgi.depth.npz' -MODEL_PATH = f'model_semisupervised_mmseq_almeida.pt' -N_EPOCHS = args['nepoch'] -MMSEQ_PATH = f'/home/projects/cpr_10006/people/svekut/mmseq2/{DATASET}_taxonomy.tsv' - -rpkms = vamb.vambtools.read_npz(ABUNDANCE_PATH) -tnfs = vamb.vambtools.read_npz('almeida_tnfs.npz') -lengths = vamb.vambtools.read_npz('almeida_lengths.npz') -contignames = np.load('almeida_contignames.npz', allow_pickle=True)["arr_0"] - -print('N contigs', len(contignames)) - -df_mmseq = pd.read_csv(MMSEQ_PATH, delimiter='\t', header=None) - -df_mmseq = df_mmseq[~(df_mmseq[2] == 'no rank')] - -print('Total mmseq hits', len(df_mmseq)) -print('Genus', len(df_mmseq[(df_mmseq[2] == 'genus') | (df_mmseq[2] == 'species')])) -print('Species', len(df_mmseq[(df_mmseq[2] == 'species')])) - -ind_map = {c: i for i, c in enumerate(contignames)} -indices_mmseq = [ind_map[c] for c in df_mmseq[0]] - -species_dict = df_mmseq[df_mmseq[2] == 'species'][3].value_counts() -non_abundant_species = set() -for k, v in species_dict.items(): - if v < 1000: - non_abundant_species.add(k) -df_mmseq['tax'] = df_mmseq[8] -df_mmseq.loc[df_mmseq[3].isin(non_abundant_species), 'tax'] = \ - df_mmseq.loc[df_mmseq[3].isin(non_abundant_species), 8].str.split(';').str[:1].map(lambda x: ';'.join(x)) - -graph_column = df_mmseq['tax'] -nodes, ind_nodes, table_parent = vamb.h_loss.make_graph(graph_column.unique()) - -print('N nodes', len(nodes)) - -classes_order = np.array(list(graph_column.str.split(';').str[-1])) -targets = [ind_nodes[i] for i in classes_order] - -model = vamb.h_loss.VAMB2Label( - rpkms.shape[1], - len(nodes), - nodes, - table_parent, - cuda=CUDA, -) - -with open(f'indices_mmseq_genus_{DATASET}{exp_id}.pickle', 'wb') as handle: - pickle.dump(indices_mmseq, handle, protocol=pickle.HIGHEST_PROTOCOL) -batchsize = 128 -dataloader_vamb, mask_vamb = vamb.encode.make_dataloader(rpkms, tnfs, lengths, batchsize=batchsize) -dataloader_joint, mask = vamb.h_loss.make_dataloader_concat_hloss( - rpkms[indices_mmseq], - tnfs[indices_mmseq], - lengths[indices_mmseq], - targets, - len(nodes), - table_parent, - batchsize=batchsize, - ) - -shapes = (rpkms.shape[1], 103, 1, len(nodes)) -with open(MODEL_PATH, 'wb') as modelfile: - print('training') - model.trainmodel( - dataloader_joint, - nepochs=5, - modelfile=modelfile, - logfile=sys.stdout, - batchsteps=[], - ) - print('training') - -latent_vamb = model.predict(dataloader_vamb) -LATENT_PATH = f'latent_predict_vamb_{DATASET}{exp_id}.npy' -print('Saving latent space: Vamb', latent_vamb.shape) -np.save(LATENT_PATH, latent_vamb) - -print('Saving the tree', len(nodes)) -TREE_PATH = f'tree_predict_vamb_{DATASET}{exp_id}.npy' -np.save(TREE_PATH, np.array(nodes)) -PARENT_PATH = f'parents_predict_vamb_{DATASET}{exp_id}.npy' -np.save(PARENT_PATH, np.array(table_parent)) - -print('Getting the predictions') -df_gt = 
pd.DataFrame({'contigs': contignames}) -nodes_ar = np.array(nodes) -predictions = [] -for i in range(len(df_gt)): - predictions.append(';'.join(nodes_ar[latent_vamb[i] > 0.5][1:])) -df_gt[f'predictions{exp_id}'] = predictions - -GT_PATH = 'almeida_predictions.csv' -df_gt.to_csv(GT_PATH, index=None) - -df_mmseq_sp = df_mmseq[(df_mmseq[2] == 'species')] -mmseq_map = {k: v for k, v in zip(df_mmseq_sp[0], df_mmseq_sp[8])} -counters = defaultdict(lambda: 0) -preds = [] -for i, r in df_gt.iterrows(): - pred_line = r[f'predictions{exp_id}'].split(';') - try: - mmseq_line = mmseq_map.get(r['contigs'], '').split(';') - except AttributeError: - preds.append(';'.join(pred_line)) - continue - if mmseq_line[0] != '': - for i in range(len(mmseq_line)): - if i < len(pred_line): - pred_line[i] = mmseq_line[i] - preds.append(';'.join(pred_line)) -df_gt[f'predictions{exp_id}_replace'] = preds - -GT_PATH = 'almeida_predictions.csv' -df_gt.to_csv(GT_PATH, index=None) - -print('Starting the VAE') - -graph_column = df_gt[f'predictions{exp_id}_replace'] -nodes, ind_nodes, table_parent = vamb.h_loss.make_graph(graph_column.unique()) - -classes_order = np.array(list(graph_column.str.split(';').str[-1])) -targets = [ind_nodes[i] for i in classes_order] - -vae = vamb.h_loss.VAEVAEHLoss( - rpkms.shape[1], - len(nodes), - nodes, - table_parent, - cuda=CUDA, - logfile=sys.stdout, -) - -dataloader_vamb, mask_vamb = vamb.encode.make_dataloader(rpkms, tnfs, lengths, batchsize=batchsize) -dataloader_joint, mask = vamb.h_loss.make_dataloader_concat_hloss(rpkms, tnfs, lengths, targets, len(nodes), table_parent, batchsize=batchsize) -dataloader_labels, mask = vamb.h_loss.make_dataloader_labels_hloss(rpkms, tnfs, lengths, targets, len(nodes), table_parent, batchsize=batchsize) - -shapes = (rpkms.shape[1], 103, 1, len(nodes)) -dataloader = vamb.h_loss.make_dataloader_semisupervised_hloss(dataloader_joint, dataloader_vamb, dataloader_labels, len(nodes), table_parent, shapes, batchsize=batchsize) -with open(MODEL_PATH, 'wb') as modelfile: - print('training') - vae.trainmodel( - dataloader, - nepochs=N_EPOCHS, - modelfile=modelfile, - logfile=sys.stdout, - # batchsteps=[25, 75, 150], - batchsteps=[], - ) - print('training') - -latent_vamb = vae.VAEVamb.encode(dataloader_vamb) -LATENT_PATH = f'latent_trained_lengths_mmseq_genus_vamb_{DATASET}{exp_id}.npy' -print('Saving latent space: Vamb') -np.save(LATENT_PATH, latent_vamb) - -latent_labels = vae.VAELabels.encode(dataloader_labels) -LATENT_PATH = f'latent_trained_lengths_mmseq_genus_labels_{DATASET}{exp_id}.npy' -print('Saving latent space: Labels') -np.save(LATENT_PATH, latent_labels) - -latent_both = vae.VAEJoint.encode(dataloader_joint) -LATENT_PATH = f'latent_trained_lengths_mmseq_genus_both_{DATASET}{exp_id}.npy' -print('Saving latent space: Both') -np.save(LATENT_PATH, latent_both) - -latent_vamb = np.load(f'latent_trained_lengths_mmseq_genus_vamb_{DATASET}{exp_id}.npy') -latent_both = np.load(f'latent_trained_lengths_mmseq_genus_both_{DATASET}{exp_id}.npy') -latent_labels = np.load(f'latent_trained_lengths_mmseq_genus_labels_{DATASET}{exp_id}.npy') - -names = contignames - -with open(f'clusters_mmseq_{DATASET}_v4{exp_id}.tsv', 'w') as binfile: - iterator = vamb.cluster.ClusterGenerator(latent_both) - iterator = vamb.vambtools.binsplit( - ( - (names[cluster.medoid], {names[m] for m in cluster.members}) - for cluster in iterator - ), - "C" - ) - vamb.vambtools.write_clusters(binfile, iterator) - -with open(f'clusters_mmseq_vamb_{DATASET}_v4{exp_id}.tsv', 'w') as binfile: - 
iterator = vamb.cluster.ClusterGenerator(latent_vamb) - iterator = vamb.vambtools.binsplit( - ( - (names[cluster.medoid], {names[m] for m in cluster.members}) - for cluster in iterator - ), - "C" - ) - vamb.vambtools.write_clusters(binfile, iterator) - -with open(f'clusters_mmseq_labels_{DATASET}_v4{exp_id}.tsv', 'w') as binfile: - iterator = vamb.cluster.ClusterGenerator(latent_labels) - iterator = vamb.vambtools.binsplit( - ( - (names[cluster.medoid], {names[m] for m in cluster.members}) - for cluster in iterator - ), - "C" - ) - vamb.vambtools.write_clusters(binfile, iterator) \ No newline at end of file diff --git a/train_mmseq_hloss_vamb2label_aug.py b/train_mmseq_hloss_vamb2label_aug.py deleted file mode 100644 index 96421811..00000000 --- a/train_mmseq_hloss_vamb2label_aug.py +++ /dev/null @@ -1,240 +0,0 @@ -import vamb -import sys -import argparse -import pickle -import numpy as np -import pandas as pd -from collections import defaultdict - -parser = argparse.ArgumentParser() -parser.add_argument("--path", type=str, default='/Users/nmb127/Documents/vamb_data/data') -parser.add_argument("--nepoch", type=int, default=500) -parser.add_argument('--cuda', action=argparse.BooleanOptionalAction) -parser.add_argument("--dataset", type=str, default='airways') -parser.add_argument("--supervision", type=float, default=1.) - -args = vars(parser.parse_args()) -print(args) - -exp_id = '_hloss_mmseq_predict_aug_overlap' - -SUP = args['supervision'] -CUDA = bool(args['cuda']) -DATASET = args['dataset'] -DEPTH_PATH = f'/home/projects/cpr_10006/projects/vamb/data/datasets/cami2_{DATASET}' -PATH_CONTIGS = f'{DEPTH_PATH}/contigs_2kbp.fna.gz' -BAM_PATH = f'{DEPTH_PATH}/bam_sorted' -ABUNDANCE_PATH = f'{DEPTH_PATH}/depths.npz' -MODEL_PATH = f'model_semisupervised_mmseq_genus_{DATASET}.pt' -N_EPOCHS = args['nepoch'] -MMSEQ_PATH = f'/home/projects/cpr_10006/people/svekut/mmseq2/{DATASET}_taxonomy.tsv' -GT_PATH = f'gt_tax_{DATASET}.csv' - -AUGMENTED_PATH = f'/home/projects/cpr_10006/projects/semi_vamb/data/augmented/{DATASET}' -rpkms_aug = np.load(f'{AUGMENTED_PATH}/new_ab_overlap.npz', allow_pickle=True)['matrix'] -comp_aug = np.load(f'{AUGMENTED_PATH}/new_comp_overlap.npz', allow_pickle=True) -tnfs_aug, lengths_aug = comp_aug['matrix'], comp_aug['lengths'] - -with vamb.vambtools.Reader(PATH_CONTIGS) as contigfile: - composition = vamb.parsecontigs.Composition.from_file(contigfile) - -rpkms = vamb.vambtools.read_npz(ABUNDANCE_PATH) -tnfs, lengths = composition.matrix, composition.metadata.lengths -contignames = composition.metadata.identifiers - -ori_contigs = np.array(list(set([c.split('_')[0] for c in comp_aug['identifiers'] if len(c.split('_')) > 1]))) -ind_map = {c: i for i, c in enumerate(contignames)} -inds_ori = [ind_map[c] for c in ori_contigs] - -rpkms_train = np.concatenate([rpkms_aug, rpkms[inds_ori]]) -tnfs_train = np.concatenate([tnfs_aug, tnfs[inds_ori]]) -lengths_train = np.concatenate([lengths_aug, lengths[inds_ori]]) -contignames_train = np.concatenate([comp_aug['identifiers'], ori_contigs]) - -df_mmseq = pd.read_csv(MMSEQ_PATH, delimiter='\t', header=None) - -df_mmseq = df_mmseq[~(df_mmseq[2] == 'no rank')] - -data = { - 0: [], - 2: [], - 8: [], -} -mmseq_map = dict(zip(df_mmseq[0], df_mmseq[8])) -mmseq_map_genus = dict(zip(df_mmseq[0], df_mmseq[2])) -added = set() -for c in contignames_train: - c_map = c.split('_')[0] - if c_map in mmseq_map: - data[0].append(c) - data[8].append(mmseq_map[c_map]) - data[2].append(mmseq_map_genus[c_map]) -df_mmseq_aug = pd.DataFrame(data) 
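# The frame just assembled propagates MMseqs2 annotations from source
# contigs to their augmented copies: augmented names are '<source>_<n>',
# so splitting on '_' recovers the source identifier. A toy illustration
# with made-up contig names and lineages (columns 0/2/8 mimic the MMseqs2
# TSV layout used throughout these scripts):
import pandas as pd

toy_tax = {'C1': 'Bacteria;Firmicutes', 'C2': 'Archaea'}   # column 8
toy_rank = {'C1': 'species', 'C2': 'genus'}                # column 2
toy_names = ['C1_1', 'C1_2', 'C2_1', 'C3_1', 'C1']         # training contigs

toy_rows = [(n, toy_rank[n.split('_')[0]], toy_tax[n.split('_')[0]])
            for n in toy_names if n.split('_')[0] in toy_tax]
print(pd.DataFrame(toy_rows, columns=[0, 2, 8]))  # 'C3_1' has no hit and is dropped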
-df_mmseq_aug.to_csv(f'mmseq_aug_{DATASET}.csv', index=None) - -ind_map = {c: i for i, c in enumerate(contignames_train)} -indices_mmseq = [ind_map[c] for c in df_mmseq_aug[0]] - -graph_column = df_mmseq_aug[8] -nodes, ind_nodes, table_parent = vamb.h_loss.make_graph(graph_column.unique()) - -classes_order = np.array(list(graph_column.str.split(';').str[-1])) -targets = [ind_nodes[i] for i in classes_order] - -model = vamb.h_loss.VAMB2Label( - rpkms.shape[1], - len(nodes), - nodes, - table_parent, - cuda=CUDA, -) - -with open(f'indices_mmseq_genus_{DATASET}{exp_id}.pickle', 'wb') as handle: - pickle.dump(indices_mmseq, handle, protocol=pickle.HIGHEST_PROTOCOL) - -dataloader_joint, mask = vamb.h_loss.make_dataloader_concat_hloss(rpkms_train[indices_mmseq], tnfs_train[indices_mmseq], lengths_train[indices_mmseq], targets, len(nodes), table_parent) - -shapes = (rpkms_train.shape[1], 103, 1, len(nodes)) -with open(MODEL_PATH, 'wb') as modelfile: - print('training') - model.trainmodel( - dataloader_joint, - nepochs=100, - modelfile=modelfile, - logfile=sys.stdout, - batchsteps=[], - ) - print('training') - -dataloader_vamb, mask_vamb = vamb.encode.make_dataloader(rpkms, tnfs, lengths) - -latent_vamb = model.predict(dataloader_vamb) -LATENT_PATH = f'latent_predict_vamb_{DATASET}{exp_id}.npy' -# print('Saving latent space: Vamb', latent_vamb.shape) -# np.save(LATENT_PATH, latent_vamb) - -print('Saving the tree', len(nodes)) -TREE_PATH = f'tree_predict_vamb_{DATASET}{exp_id}.npy' -np.save(TREE_PATH, np.array(nodes)) -PARENT_PATH = f'parents_predict_vamb_{DATASET}{exp_id}.npy' -np.save(PARENT_PATH, np.array(table_parent)) - -print('Getting the predictions') -df_gt = pd.read_csv(GT_PATH) -predictions = [] -for i in range(len(df_gt)): - predictions.append(';'.join(np.array(nodes)[latent_vamb[i] > 0.5][1:])) -df_gt[f'predictions{exp_id}'] = predictions -df_gt.to_csv(GT_PATH, index=None) - -print('Getting augmented predictions') -dataloader_train, mask_vamb = vamb.encode.make_dataloader(rpkms_train, tnfs_train, lengths_train) - -latent_vamb = model.predict(dataloader_train) -LATENT_PATH = f'latent_predict_vamb_aug_{DATASET}{exp_id}.npy' -# print('Saving latent space: Vamb', latent_vamb.shape) -# np.save(LATENT_PATH, latent_vamb) - -df_train = pd.DataFrame({'contigs': contignames_train}) -predictions = [] -for i in range(len(df_train)): - predictions.append(';'.join(np.array(nodes)[latent_vamb[i] > 0.5][1:])) -df_train[f'predictions{exp_id}'] = predictions -df_train.to_csv(f'train_{DATASET}.csv', index=None) - -print('Starting the VAE') - -graph_column = df_train[f'predictions{exp_id}'] -nodes, ind_nodes, table_parent = vamb.h_loss.make_graph(graph_column.unique()) - -classes_order = np.array(list(graph_column.str.split(';').str[-1])) -targets = [ind_nodes[i] for i in classes_order] -dataloader_joint, mask = vamb.h_loss.make_dataloader_concat_hloss(rpkms_train, tnfs_train, lengths_train, targets, len(nodes), table_parent) -dataloader_labels, mask = vamb.h_loss.make_dataloader_labels_hloss(rpkms_train, tnfs_train, lengths_train, targets, len(nodes), table_parent) - -vae = vamb.h_loss.VAEVAEHLoss( - rpkms.shape[1], - len(nodes), - nodes, - table_parent, - cuda=CUDA, - logfile=sys.stdout, -) - -shapes = (rpkms.shape[1], 103, 1, len(nodes)) -dataloader = vamb.h_loss.make_dataloader_semisupervised_hloss(dataloader_joint, dataloader_train, dataloader_labels, len(nodes), table_parent, shapes) -with 
open(MODEL_PATH, 'wb') as modelfile: - print('training') - vae.trainmodel( - dataloader, - nepochs=N_EPOCHS, - modelfile=modelfile, - logfile=sys.stdout, - batchsteps=[25, 75, 150], - ) - print('training') - -graph_column = df_gt[f'predictions{exp_id}'] -classes_order = np.array(list(graph_column.str.split(';').str[-1])) -targets = [ind_nodes[i] for i in classes_order] - -dataloader_vamb, mask_vamb = vamb.encode.make_dataloader(rpkms, tnfs, lengths) -dataloader_joint, mask = vamb.h_loss.make_dataloader_concat_hloss(rpkms, tnfs, lengths, targets, len(nodes), table_parent) -dataloader_labels, mask = vamb.h_loss.make_dataloader_labels_hloss(rpkms, tnfs, lengths, targets, len(nodes), table_parent) - -latent_vamb = vae.VAEVamb.encode(dataloader_vamb) -LATENT_PATH = f'latent_trained_lengths_mmseq_genus_vamb_{DATASET}{exp_id}.npy' -print('Saving latent space: Vamb') -np.save(LATENT_PATH, latent_vamb) - -latent_labels = vae.VAELabels.encode(dataloader_labels) -LATENT_PATH = f'latent_trained_lengths_mmseq_genus_labels_{DATASET}{exp_id}.npy' -print('Saving latent space: Labels') -np.save(LATENT_PATH, latent_labels) - -latent_both = vae.VAEJoint.encode(dataloader_joint) -LATENT_PATH = f'latent_trained_lengths_mmseq_genus_both_{DATASET}{exp_id}.npy' -print('Saving latent space: Both') -np.save(LATENT_PATH, latent_both) - -composition.metadata.filter_mask(mask_vamb) -names = composition.metadata.identifiers - -with open(f'clusters_mmseq_{DATASET}_v4{exp_id}.tsv', 'w') as binfile: - iterator = vamb.cluster.ClusterGenerator(latent_both) - iterator = vamb.vambtools.binsplit( - ( - (names[cluster.medoid], {names[m] for m in cluster.members}) - for cluster in iterator - ), - "C" - ) - vamb.vambtools.write_clusters(binfile, iterator) - -with open(f'clusters_mmseq_vamb_{DATASET}_v4{exp_id}.tsv', 'w') as binfile: - iterator = vamb.cluster.ClusterGenerator(latent_vamb) - iterator = vamb.vambtools.binsplit( - ( - (names[cluster.medoid], {names[m] for m in cluster.members}) - for cluster in iterator - ), - "C" - ) - vamb.vambtools.write_clusters(binfile, iterator) - -with open(f'clusters_mmseq_labels_{DATASET}_v4{exp_id}.tsv', 'w') as binfile: - iterator = vamb.cluster.ClusterGenerator(latent_labels) - iterator = vamb.vambtools.binsplit( - ( - (names[cluster.medoid], {names[m] for m in cluster.members}) - for cluster in iterator - ), - "C" - ) - vamb.vambtools.write_clusters(binfile, iterator) \ No newline at end of file diff --git a/train_mmseq_hloss_vamb2label_longread.py b/train_mmseq_hloss_vamb2label_longread.py deleted file mode 100644 index 69016199..00000000 --- a/train_mmseq_hloss_vamb2label_longread.py +++ /dev/null @@ -1,220 +0,0 @@ -import vamb -import sys -import argparse -import pickle -import numpy as np -import pandas as pd -import os -from collections import defaultdict - -parser = argparse.ArgumentParser() -parser.add_argument("--path", type=str, default='/Users/nmb127/Documents/vamb_data/data') -parser.add_argument("--nepoch", type=int, default=500) -parser.add_argument('--cuda', action=argparse.BooleanOptionalAction) -parser.add_argument("--dataset", type=str, default='airways') -parser.add_argument("--supervision", type=float, default=1.) 
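# The code below repeats the hierarchy bookkeeping used by every script in
# this diff: lineage strings become a node list plus a parent table, and a
# contig's training target is the index of its deepest resolved rank. The
# toy function here only illustrates the assumed shape of those structures;
# it is not vamb.h_loss.make_graph, and it assumes rank names are unique
# across lineages.
def toy_make_graph(lineages):
    nodes, ind_nodes, table_parent = ['root'], {'root': 0}, [-1]
    for lineage in lineages:
        parent = 0
        for rank in lineage.split(';'):
            if rank not in ind_nodes:          # first time this rank is seen
                ind_nodes[rank] = len(nodes)
                nodes.append(rank)
                table_parent.append(parent)
            parent = ind_nodes[rank]           # descend one level
    return nodes, ind_nodes, table_parent

toy_lineages = ['Bacteria;Firmicutes;Bacilli', 'Bacteria;Proteobacteria']
toy_nodes, toy_ind, toy_parents = toy_make_graph(toy_lineages)
toy_targets = [l.split(';')[-1] for l in toy_lineages]
toy_targets = [toy_ind[t] for t in toy_targets]  # deepest rank per contig
print(toy_nodes)    # ['root', 'Bacteria', 'Firmicutes', 'Bacilli', 'Proteobacteria']
print(toy_parents)  # [-1, 0, 1, 2, 1]
print(toy_targets)  # [3, 4]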
- -args = vars(parser.parse_args()) -print(args) - -exp_id = '_longread_1000_64' - -SUP = args['supervision'] -CUDA = bool(args['cuda']) -DATASET = 'longread' -DEPTH_PATH = f'/home/projects/cpr_10006/projects/semi_vamb/data/human_longread/vambout' -COMPOSITION_PATH = f'{DEPTH_PATH}/composition.npz' -ABUNDANCE_PATH = f'{DEPTH_PATH}/abundance.npz' -MODEL_PATH = f'model_semisupervised_mmseq_genus_{DATASET}.pt' -N_EPOCHS = args['nepoch'] -MMSEQ_PATH = f'/home/projects/cpr_10006/people/svekut/mmseq2/{DATASET}_taxonomy_2023.tsv' -GT_PATH = f'gt_tax_{DATASET}.csv' - -rpkms = np.load(ABUNDANCE_PATH, mmap_mode='r')['matrix'] -composition = np.load(COMPOSITION_PATH, mmap_mode='r', allow_pickle=True) -tnfs, lengths = composition['matrix'], composition['lengths'] -contignames = composition['identifiers'] - -dataloader_vamb, mask_vamb = vamb.encode.make_dataloader(rpkms, tnfs, lengths) - -print('N contigs', len(contignames)) -print('rpkms shape', rpkms.shape) -print('tnfs shape', tnfs.shape) - -print('N contigs after mask', len(contignames[mask_vamb])) -print('rpkms shape after mask', rpkms[mask_vamb].shape) -print('tnfs shape after mask', tnfs[mask_vamb].shape) - -rpkms = rpkms[mask_vamb] -tnfs, lengths = tnfs[mask_vamb], lengths[mask_vamb] -contignames = contignames[mask_vamb] -all_contigs = set(contignames) - -df_mmseq = pd.read_csv(MMSEQ_PATH, delimiter='\t', header=None) -df_mmseq = df_mmseq[~(df_mmseq[2] == 'no rank')] -df_mmseq = df_mmseq[df_mmseq[0].isin(all_contigs)] -print('Total mmseq hits', len(df_mmseq)) -print('Genus', len(df_mmseq[(df_mmseq[2] == 'genus') | (df_mmseq[2] == 'species')])) -print('Species', len(df_mmseq[(df_mmseq[2] == 'species')])) - -df_mmseq_genus = df_mmseq - -ind_map = {c: i for i, c in enumerate(contignames)} -indices_mmseq = [ind_map[c] for c in df_mmseq_genus[0]] - -graph_column = df_mmseq_genus[8] -nodes, ind_nodes, table_parent = vamb.h_loss.make_graph(graph_column.unique()) - -classes_order = np.array(list(graph_column.str.split(';').str[-1])) -targets = [ind_nodes[i] for i in classes_order] - -model = vamb.h_loss.VAMB2Label( - rpkms.shape[1], - len(nodes), - nodes, - table_parent, - cuda=CUDA, -) - -with open(f'indices_mmseq_genus_{DATASET}{exp_id}.pickle', 'wb') as handle: - pickle.dump(indices_mmseq, handle, protocol=pickle.HIGHEST_PROTOCOL) - -dataloader_vamb, mask_vamb = vamb.encode.make_dataloader(rpkms, tnfs, lengths) -dataloader_joint, mask = vamb.h_loss.make_dataloader_concat_hloss(rpkms[indices_mmseq], tnfs[indices_mmseq], lengths[indices_mmseq], targets, len(nodes), table_parent) - -shapes = (rpkms.shape[1], 103, 1, len(nodes)) -with open(MODEL_PATH, 'wb') as modelfile: - print('training') - model.trainmodel( - dataloader_joint, - nepochs=100, - modelfile=modelfile, - logfile=sys.stdout, - batchsteps=[], - ) - print('training') - -latent_vamb = model.predict(dataloader_vamb) -LATENT_PATH = f'latent_predict_vamb_{DATASET}{exp_id}.npy' -# print('Saving latent space: Vamb', latent_vamb.shape) -# np.save(LATENT_PATH, latent_vamb) - -print('Saving the tree', len(nodes)) -TREE_PATH = f'tree_predict_vamb_{DATASET}{exp_id}.npy' -np.save(TREE_PATH, np.array(nodes)) -PARENT_PATH = f'parents_predict_vamb_{DATASET}{exp_id}.npy' -np.save(PARENT_PATH, np.array(table_parent)) - -print('Getting the predictions') -df_gt = pd.DataFrame({'contigs': contignames}) -nodes_ar = np.array(nodes) -predictions = [] -for i in range(len(df_gt)): - predictions.append(';'.join(nodes_ar[latent_vamb[i] > 0.5][1:])) -df_gt[f'predictions{exp_id}'] = predictions - -df_mmseq_sp = 
df_mmseq[(df_mmseq[2] == 'species')] -mmseq_map = {k: v for k, v in zip(df_mmseq_sp[0], df_mmseq_sp[8])} -counters = defaultdict(lambda: 0) -preds = [] -for i, r in df_gt.iterrows(): - pred_line = r[f'predictions{exp_id}'].split(';') - try: - mmseq_line = mmseq_map.get(r['contigs'], '').split(';') - except AttributeError: - preds.append(';'.join(pred_line)) - continue - if mmseq_line[0] != '': - for i in range(len(mmseq_line)): - if i < len(pred_line): - pred_line[i] = mmseq_line[i] - preds.append(';'.join(pred_line)) -df_gt[f'predictions{exp_id}_replace'] = preds - -df_gt.to_csv(GT_PATH, index=None) - -print('Starting the VAE') - -graph_column = df_gt[f'predictions{exp_id}_replace'] -nodes, ind_nodes, table_parent = vamb.h_loss.make_graph(graph_column.unique()) - -classes_order = np.array(list(graph_column.str.split(';').str[-1])) -targets = [ind_nodes[i] for i in classes_order] - -vae = vamb.h_loss.VAEVAEHLoss( - rpkms.shape[1], - len(nodes), - nodes, - table_parent, - nlatent=64, - cuda=CUDA, - logfile=sys.stdout, -) - -dataloader_vamb, mask_vamb = vamb.encode.make_dataloader(rpkms, tnfs, lengths) -dataloader_joint, mask = vamb.h_loss.make_dataloader_concat_hloss(rpkms, tnfs, lengths, targets, len(nodes), table_parent) -dataloader_labels, mask = vamb.h_loss.make_dataloader_labels_hloss(rpkms, tnfs, lengths, targets, len(nodes), table_parent) - -shapes = (rpkms.shape[1], 103, 1, len(nodes)) -dataloader = vamb.h_loss.make_dataloader_semisupervised_hloss(dataloader_joint, dataloader_vamb, dataloader_labels, len(nodes), table_parent, shapes) -with open(MODEL_PATH, 'wb') as modelfile: - print('training') - vae.trainmodel( - dataloader, - nepochs=N_EPOCHS, - modelfile=modelfile, - logfile=sys.stdout, - batchsteps=[], - # batchsteps=[25, 75, 150], - ) - print('training') - -latent_vamb = vae.VAEVamb.encode(dataloader_vamb) -LATENT_PATH = f'latent_trained_lengths_mmseq_genus_vamb_{DATASET}{exp_id}.npy' -print('Saving latent space: Vamb') -np.save(LATENT_PATH, latent_vamb) - -latent_labels = vae.VAELabels.encode(dataloader_labels) -LATENT_PATH = f'latent_trained_lengths_mmseq_genus_labels_{DATASET}{exp_id}.npy' -print('Saving latent space: Labels') -np.save(LATENT_PATH, latent_labels) - -latent_both = vae.VAEJoint.encode(dataloader_joint) -LATENT_PATH = f'latent_trained_lengths_mmseq_genus_both_{DATASET}{exp_id}.npy' -print('Saving latent space: Both') -np.save(LATENT_PATH, latent_both) - -names = contignames - -with open(f'clusters_mmseq_{DATASET}_v4{exp_id}.tsv', 'w') as binfile: - iterator = vamb.cluster.ClusterGenerator(latent_both) - iterator = vamb.vambtools.binsplit( - ( - (names[cluster.medoid], {names[m] for m in cluster.members}) - for cluster in iterator - ), - "C" - ) - vamb.vambtools.write_clusters(binfile, iterator) - -with open(f'clusters_mmseq_vamb_{DATASET}_v4{exp_id}.tsv', 'w') as binfile: - iterator = vamb.cluster.ClusterGenerator(latent_vamb) - iterator = vamb.vambtools.binsplit( - ( - (names[cluster.medoid], {names[m] for m in cluster.members}) - for cluster in iterator - ), - "C" - ) - vamb.vambtools.write_clusters(binfile, iterator) - -with open(f'clusters_mmseq_labels_{DATASET}_v4{exp_id}.tsv', 'w') as binfile: - iterator = vamb.cluster.ClusterGenerator(latent_labels) - iterator = vamb.vambtools.binsplit( - ( - (names[cluster.medoid], {names[m] for m in cluster.members}) - for cluster in iterator - ), - "C" - ) - vamb.vambtools.write_clusters(binfile, iterator) \ No newline at end of file diff --git a/train_mmseq_hloss_vamb2label_walktrap.py 
b/train_mmseq_hloss_vamb2label_walktrap.py deleted file mode 100644 index de395175..00000000 --- a/train_mmseq_hloss_vamb2label_walktrap.py +++ /dev/null @@ -1,227 +0,0 @@ -import vamb -import sys -import argparse -import pickle -import numpy as np -import pandas as pd -import os -from collections import defaultdict - -parser = argparse.ArgumentParser() -parser.add_argument("--path", type=str, default='/Users/nmb127/Documents/vamb_data/data') -parser.add_argument("--nepoch", type=int, default=500) -parser.add_argument('--cuda', action=argparse.BooleanOptionalAction) -parser.add_argument("--dataset", type=str, default='airways') -parser.add_argument("--supervision", type=float, default=1.) - -args = vars(parser.parse_args()) -print(args) - -exp_id = '_hloss_mmseq_predict_replace_walktrap_no_components' - -DATASET_MAP = { - 'airways': 'Airways', - 'gi': 'Gastrointestinal', - 'oral': 'Oral', - 'skin': 'Skin', - 'urog': 'Urogenital', -} - -SUP = args['supervision'] -CUDA = bool(args['cuda']) -DATASET = args['dataset'] -DEPTH_PATH = f'/home/projects/cpr_10006/projects/vamb/data/datasets/cami2_{DATASET}' -PATH_CONTIGS = f'{DEPTH_PATH}/contigs_2kbp.fna.gz' -BAM_PATH = f'{DEPTH_PATH}/bam_sorted' -ABUNDANCE_PATH = f'{DEPTH_PATH}/depths.npz' -MODEL_PATH = f'model_semisupervised_mmseq_genus_{DATASET}{exp_id}.pt' -N_EPOCHS = args['nepoch'] -MMSEQ_PATH = f'/home/projects/cpr_10006/people/paupie/vaevae/mmseq2_annotations/ptracker/{DATASET_MAP[DATASET]}_taxonomy.tsv' -GT_PATH = f'gt_tax_{DATASET}{exp_id}.csv' -COMPOSITION_PATH = f'/home/projects/cpr_10006/people/paupie/vaevae/abundances_compositions/{DATASET}/composition.npz' -ABUNDANCE_PATH = f'/home/projects/cpr_10006/people/paupie/vaevae/abundances_compositions/{DATASET}/abundance.npz' -WALKTRAP_PATH = f'/home/projects/cpr_10006/people/paupie/vaevae/graph/labels_walktrap/{DATASET_MAP[DATASET]}/walktrap.tsv' - -rpkms = np.load(ABUNDANCE_PATH, mmap_mode='r')['matrix'] -composition = np.load(COMPOSITION_PATH, mmap_mode='r', allow_pickle=True) -tnfs, lengths = composition['matrix'], composition['lengths'] -contignames = composition['identifiers'] - -df_walktrap = pd.read_csv(WALKTRAP_PATH, delimiter='\t', header=None) -print('Len walktrap', len(df_walktrap)) -walktrap_map = {c: i+1 for i, c in enumerate(set(df_walktrap[0]))} -df_walktrap[2] = df_walktrap[0].map(walktrap_map) -labels_assembly_map = {k: v for k, v in zip(df_walktrap[1], df_walktrap[2])} -labels_assembly = np.array([labels_assembly_map.get(c, 0) for c in contignames]) -n_walktraps = len(set(labels_assembly_map.values())) + 1 - -def one_hot(a, num_classes): - return np.squeeze(np.eye(num_classes)[a.reshape(-1)]) - -ar = one_hot(labels_assembly, num_classes=n_walktraps) -# rpkms = np.concatenate([rpkms, ar], axis=1).astype(np.float32) - -print('N contigs', len(contignames)) -print('rpkms shape', rpkms.shape) - -df_mmseq = pd.read_csv(MMSEQ_PATH, delimiter='\t', header=None) -df_mmseq = df_mmseq[~(df_mmseq[2] == 'no rank')] -df_mmseq_genus = df_mmseq - -ind_map = {c: i for i, c in enumerate(contignames)} -indices_mmseq = [ind_map[c] for c in df_mmseq_genus[0]] - -graph_column = df_mmseq_genus[8] -nodes, ind_nodes, table_parent = vamb.h_loss.make_graph(graph_column.unique()) - -classes_order = np.array(list(graph_column.str.split(';').str[-1])) -targets = [ind_nodes[i] for i in classes_order] - -model = vamb.h_loss.VAMB2Label( - rpkms.shape[1], - len(nodes), - nodes, - table_parent, - cuda=CUDA, -) - -with open(f'indices_mmseq_genus_{DATASET}{exp_id}.pickle', 'wb') as handle: - 
pickle.dump(indices_mmseq, handle, protocol=pickle.HIGHEST_PROTOCOL) - -dataloader_vamb, mask_vamb = vamb.encode.make_dataloader(rpkms, tnfs, lengths) -dataloader_joint, mask = vamb.h_loss.make_dataloader_concat_hloss(rpkms[indices_mmseq], tnfs[indices_mmseq], lengths[indices_mmseq], targets, len(nodes), table_parent) - -shapes = (rpkms.shape[1], 103, 1, len(nodes)) -with open(MODEL_PATH, 'wb') as modelfile: - print('training') - model.trainmodel( - dataloader_joint, - nepochs=100, - modelfile=modelfile, - logfile=sys.stdout, - batchsteps=[25, 75], - ) - print('training') - -latent_vamb = model.predict(dataloader_vamb) -LATENT_PATH = f'latent_predict_vamb_{DATASET}{exp_id}.npy' -# print('Saving latent space: Vamb', latent_vamb.shape) -# np.save(LATENT_PATH, latent_vamb) - -print('Saving the tree', len(nodes)) -TREE_PATH = f'tree_predict_vamb_{DATASET}{exp_id}.npy' -np.save(TREE_PATH, np.array(nodes)) -PARENT_PATH = f'parents_predict_vamb_{DATASET}{exp_id}.npy' -np.save(PARENT_PATH, np.array(table_parent)) - -print('Getting the predictions') -df_gt = pd.DataFrame({'contigs': contignames}) -nodes_ar = np.array(nodes) -predictions = [] -for i in range(len(df_gt)): - predictions.append(';'.join(nodes_ar[latent_vamb[i] > 0.5][1:])) -df_gt[f'predictions{exp_id}'] = predictions - -df_mmseq_sp = df_mmseq[(df_mmseq[2] == 'species')] -mmseq_map = {k: v for k, v in zip(df_mmseq_sp[0], df_mmseq_sp[8])} -counters = defaultdict(lambda: 0) -preds = [] -for i, r in df_gt.iterrows(): - pred_line = r[f'predictions{exp_id}'].split(';') - try: - mmseq_line = mmseq_map.get(r['contigs'], '').split(';') - except AttributeError: - preds.append(';'.join(pred_line)) - continue - if mmseq_line[0] != '': - for i in range(len(mmseq_line)): - if i < len(pred_line): - pred_line[i] = mmseq_line[i] - preds.append(';'.join(pred_line)) -df_gt[f'predictions{exp_id}_replace'] = preds - -df_gt.to_csv(GT_PATH, index=None) - -print('Starting the VAE') - -graph_column = df_gt[f'predictions{exp_id}_replace'] -nodes, ind_nodes, table_parent = vamb.h_loss.make_graph(graph_column.unique()) - -classes_order = np.array(list(graph_column.str.split(';').str[-1])) -targets = [ind_nodes[i] for i in classes_order] - -vae = vamb.h_loss.VAEVAEHLoss( - rpkms.shape[1], - len(nodes), - nodes, - table_parent, - cuda=CUDA, - logfile=sys.stdout, -) - -dataloader_vamb, mask_vamb = vamb.encode.make_dataloader(rpkms, tnfs, lengths) -dataloader_joint, mask = vamb.h_loss.make_dataloader_concat_hloss(rpkms, tnfs, lengths, targets, len(nodes), table_parent) -dataloader_labels, mask = vamb.h_loss.make_dataloader_labels_hloss(rpkms, tnfs, lengths, targets, len(nodes), table_parent) - -shapes = (rpkms.shape[1], 103, 1, len(nodes)) -dataloader = vamb.h_loss.make_dataloader_semisupervised_hloss(dataloader_joint, dataloader_vamb, dataloader_labels, len(nodes), table_parent, shapes) -with open(MODEL_PATH, 'wb') as modelfile: - print('training') - vae.trainmodel( - dataloader, - nepochs=N_EPOCHS, - modelfile=modelfile, - logfile=sys.stdout, - batchsteps=[25, 75, 150], - ) - print('training') - -latent_vamb = vae.VAEVamb.encode(dataloader_vamb) -LATENT_PATH = f'latent_trained_lengths_mmseq_genus_vamb_{DATASET}{exp_id}.npy' -print('Saving latent space: Vamb') -np.save(LATENT_PATH, latent_vamb) - -latent_labels = vae.VAELabels.encode(dataloader_labels) -LATENT_PATH = f'latent_trained_lengths_mmseq_genus_labels_{DATASET}{exp_id}.npy' -print('Saving latent space: Labels') -np.save(LATENT_PATH, latent_labels) - -latent_both = vae.VAEJoint.encode(dataloader_joint) 
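# The three latent spaces saved below come from the three branches of the
# VAEVAE model: VAEVamb encodes only abundance+TNF, VAELabels only the
# taxonomy targets, and VAEJoint the combined view. Each is written out and
# clustered separately so the three representations can be benchmarked
# against each other downstream.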
-LATENT_PATH = f'latent_trained_lengths_mmseq_genus_both_{DATASET}{exp_id}.npy' -print('Saving latent space: Both') -np.save(LATENT_PATH, latent_both) - -names = contignames - -with open(f'clusters_mmseq_{DATASET}_v4{exp_id}.tsv', 'w') as binfile: - iterator = vamb.cluster.ClusterGenerator(latent_both) - iterator = vamb.vambtools.binsplit( - ( - (names[cluster.medoid], {names[m] for m in cluster.members}) - for cluster in iterator - ), - "C" - ) - vamb.vambtools.write_clusters(binfile, iterator) - -with open(f'clusters_mmseq_vamb_{DATASET}_v4{exp_id}.tsv', 'w') as binfile: - iterator = vamb.cluster.ClusterGenerator(latent_vamb) - iterator = vamb.vambtools.binsplit( - ( - (names[cluster.medoid], {names[m] for m in cluster.members}) - for cluster in iterator - ), - "C" - ) - vamb.vambtools.write_clusters(binfile, iterator) - -with open(f'clusters_mmseq_labels_{DATASET}_v4{exp_id}.tsv', 'w') as binfile: - iterator = vamb.cluster.ClusterGenerator(latent_labels) - iterator = vamb.vambtools.binsplit( - ( - (names[cluster.medoid], {names[m] for m in cluster.members}) - for cluster in iterator - ), - "C" - ) - vamb.vambtools.write_clusters(binfile, iterator) \ No newline at end of file diff --git a/train_mmseq_vamb.py b/train_mmseq_vamb.py deleted file mode 100644 index 7f8ffffd..00000000 --- a/train_mmseq_vamb.py +++ /dev/null @@ -1,66 +0,0 @@ -import vamb -import sys -import argparse -import pickle -import numpy as np -import pandas as pd -import os - -parser = argparse.ArgumentParser() -parser.add_argument("--path", type=str, default='/Users/nmb127/Documents/vamb_data/data') -parser.add_argument("--nepoch", type=int, default=500) -parser.add_argument('--cuda', action=argparse.BooleanOptionalAction) -parser.add_argument("--dataset", type=str, default='airways') -parser.add_argument("--supervision", type=float, default=1.) - -args = vars(parser.parse_args()) -print(args) - -exp_id = '_longread_1000_64' -SUP = args['supervision'] -CUDA = bool(args['cuda']) -DATASET = 'longread' -DEPTH_PATH = f'/home/projects/cpr_10006/projects/semi_vamb/data/human_longread/vambout' -COMPOSITION_PATH = f'{DEPTH_PATH}/composition.npz' -ABUNDANCE_PATH = f'{DEPTH_PATH}/abundance.npz' -MODEL_PATH = f'model_semisupervised_mmseq_genus_{DATASET}{exp_id}.pt' -N_EPOCHS = args['nepoch'] -MMSEQ_PATH = f'/home/projects/cpr_10006/people/svekut/mmseq2/{DATASET}_taxonomy_2023.tsv' - -rpkms = np.load(ABUNDANCE_PATH, mmap_mode='r')['matrix'] -composition = np.load(COMPOSITION_PATH, mmap_mode='r', allow_pickle=True) -tnfs, lengths = composition['matrix'], composition['lengths'] -contignames = composition['identifiers'] - -vae = vamb.encode.VAE(nsamples=rpkms.shape[1], nlatent=64, cuda=CUDA) - -dataloader_vamb, mask = vamb.encode.make_dataloader(rpkms, tnfs, lengths) -with open(MODEL_PATH, 'wb') as modelfile: - print('training') - vae.trainmodel( - dataloader_vamb, - nepochs=N_EPOCHS, - modelfile=modelfile, - logfile=sys.stdout, - batchsteps=[25, 75, 150], - ) - print('training') - -latent = vae.encode(dataloader_vamb) -LATENT_PATH = f'latent_trained_lengths_vamb_{DATASET}{exp_id}.npy' -print('Saving latent space: Vamb') -np.save(LATENT_PATH, latent) - -latent = np.load(LATENT_PATH) -names = contignames[mask] - -with open(f'clusters_vamb_{DATASET}_v4{exp_id}.tsv', 'w') as binfile: - iterator = vamb.cluster.ClusterGenerator(latent) - iterator = vamb.vambtools.binsplit( - ( - (names[cluster.medoid], {names[m] for m in cluster.members}) - for cluster in iterator - ), - "C" - ) - vamb.vambtools.write_clusters(binfile, iterator)
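For reference, the clustering epilogue shared by all of the deleted scripts can be exercised in isolation. The sketch below mirrors the vamb calls used above on a synthetic latent space; it assumes the same vamb version as these scripts, and the contig names and output path are illustrative, following the '<sample>C<contig>' naming that the "C" binsplit separator expects.

import numpy as np
import vamb

# Stand-in for a trained latent space: 100 contigs, 32 latent dimensions.
rng = np.random.default_rng(0)
latent = rng.normal(size=(100, 32)).astype(np.float32)
names = [f'S{i % 4}C{i}' for i in range(100)]  # 4 samples, split on 'C'

with open('clusters_example.tsv', 'w') as binfile:
    iterator = vamb.cluster.ClusterGenerator(latent)
    iterator = vamb.vambtools.binsplit(
        (
            (names[cluster.medoid], {names[m] for m in cluster.members})
            for cluster in iterator
        ),
        "C",
    )
    vamb.vambtools.write_clusters(binfile, iterator)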