From 7f20a2b0af2fc3fed685d8af7f1dd676e0f5e989 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Mon, 9 Dec 2024 11:36:17 -0500 Subject: [PATCH 01/17] Initial commit --- .../scripts/IntegrateGQ.sh | 52 +++++++++---------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh index 429fc5f61..f82f320b7 100755 --- a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh +++ b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh @@ -33,14 +33,14 @@ zcat $RD_melted_genotypes \ ##Deletions, need to PE-SR genotypes to match RD format (2==ref)## ##PE## zcat $pegeno_indiv_file \ - | { fgrep -wf <(awk '{if ($5=="DEL") print $4}' int.bed) || true; } \ + |fgrep -wf <(awk '{if ($5=="DEL") print $4}' int.bed) || true \ |awk '{if ($4>1) print $1"@"$2,$1,$2,$4,0,$5;else if ($4==1) print $1"@"$2,$1,$2,$4,1,$5 ;else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \ |awk '!seen[$1"@"$2]++' \ >pe_indiv_geno.txt ##Duplications and other events, need to PE-SR genotypes to match RD (2==ref)## zcat $pegeno_indiv_file \ - | { fgrep -wf <(awk '{if ($5!="DEL") print $4}' int.bed) || true; } \ + |fgrep -wf <(awk '{if ($5!="DEL") print $4}' int.bed) || true \ |awk '{if ($4>0) print $1"@"$2,$1,$2,$4,$4+2,$5 ;else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \ |awk '!seen[$1"@"$2]++' \ >>pe_indiv_geno.txt @@ -50,14 +50,14 @@ rm pe_indiv_geno.txt ##SR## zcat $srgeno_indiv_file \ - | { fgrep -wf <(awk '{if ($5=="DEL") print $4}' int.bed) || true; } \ + |fgrep -wf <(awk '{if ($5=="DEL") print $4}' int.bed) || true \ |awk '{if ($4>1) print $1"@"$2,$1,$2,$4,0,$5;else if ($4==1) print $1"@"$2,$1,$2,$4,1,$5 ;else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \ |awk '!seen[$1"@"$2]++' \ >sr_indiv_geno.txt ##Duplications and other events, need to PE-SR genotysrs to match RD (2==ref)## zcat $srgeno_indiv_file \ - | { fgrep -wf <(awk '{if ($5!="DEL") print $4}' int.bed) || true; } \ + |fgrep -wf <(awk '{if ($5!="DEL") print $4}' int.bed) || true \ |awk '{if ($4>0) print $1"@"$2,$1,$2,$4,$4+2,$5 ;else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \ |awk '!seen[$1"@"$2]++' \ >>sr_indiv_geno.txt @@ -127,7 +127,7 @@ touch genotype.indiv.txt if [ $(cat depthonly.del.ids.txt|wc -l) -gt 0 ] then zcat RDall.combined.files.txt.gz \ - | { fgrep -wf depthonly.del.ids.txt || true; } \ + |fgrep -wf depthonly.del.ids.txt || true \ |awk '{if ($4>=2) print $0 "\t" 0 "\t" $5"\t" "RD"; \ else if ($4==1) print $0 "\t" 1 "\t" $5"\t" "RD"; \ else if ($4==0) print $0 "\t" 2 "\t" $5"\t" "RD" }' \ @@ -136,7 +136,7 @@ then >genotype.indiv.txt zcat RDall.variants.combined.files.txt.gz \ - | { fgrep -wf depthonly.del.ids.txt || true; } \ + |fgrep -wf depthonly.del.ids.txt || true \ |awk '{print $0 "\t" $2}'\ >genotype.variant.txt fi @@ -144,7 +144,7 @@ fi if [ $(cat depthonly.dup.ids.txt|wc -l) -gt 0 ] then zcat RDall.combined.files.txt.gz \ - | { fgrep -wf depthonly.dup.ids.txt || true; } \ + |fgrep -wf depthonly.dup.ids.txt || true \ |awk '{if ($4<=2) print $0 "\t" 0 "\t" $5"\t" "RD"; \ else print $0 "\t" $4-2 "\t" $5"\t" "RD"}' \ |cut -f1-6,8-9,11,15- \ @@ -152,7 +152,7 @@ then >>genotype.indiv.txt zcat RDall.variants.combined.files.txt.gz \ - | { fgrep -wf depthonly.dup.ids.txt || true; } \ + |fgrep -wf depthonly.dup.ids.txt || true \ |awk '{print $0 "\t" $2}'\ >>genotype.variant.txt fi @@ -168,7 +168,7 @@ then ##Pull out highest GQ if an event has a genotype and lowest if it doesn't## #take max of PE/SR for genotype when they have a genotype## \ zcat PESRall.combined.files.txt.gz \ - | { fgrep -wf noncnv.ids.txt || true; } \ + |fgrep -wf noncnv.ids.txt || true \ |awk '{if ($6==$8 && $7>=$9) print $0 "\t" $6 "\t" $7 "\t" "PE,SR"; \ else if ($6==$8 && $7<$9) print $0 "\t" $8 "\t" $9 "\t" "PE,SR"; \ else if ($6>0 && $8==0) print $0 "\t" $6 "\t" $7 "\t" "PE"; \ @@ -178,7 +178,7 @@ then >>genotype.indiv.txt zcat PESRall.variants.combined.files.txt.gz \ - | { fgrep -wf noncnv.ids.txt || true; } \ + |fgrep -wf noncnv.ids.txt || true \ |awk '{if ($3>=$4)print $0 "\t" $3 ;else print $0 "\t" $4 }' \ |tr ' ' '\t' \ >>genotype.variant.txt @@ -189,13 +189,13 @@ rm PESRall.combined.files.txt.gz ##CNV## ##Recode RD to match PE/SR## ##CNV >5kb and removing any CNV with no depth genotype### -{ fgrep -wvf <(zcat RDall.combined.files.txt.gz \ - |awk -F'\t' '{if ($4==".") print $2}') int.bed || true; } \ +fgrep -wvf <(zcat RDall.combined.files.txt.gz \ + |awk -F'\t' '{if ($4==".") print $2}') int.bed || true \ |awk '{if (($5=="DEL") && $3-$2>=5000 ) print $4}' \ >gt5kbcnv.del.ids.txt -{ fgrep -wvf <(zcat RDall.combined.files.txt.gz \ - |awk -F'\t' '{if ($4==".") print $2}') int.bed || true; } \ +fgrep -wvf <(zcat RDall.combined.files.txt.gz \ + |awk -F'\t' '{if ($4==".") print $2}') int.bed || true \ |awk -F'\t' '{if (($5=="DUP" ) && $3-$2>=5000 ) print $4}'\ >gt5kbcnv.dup.ids.txt @@ -204,7 +204,7 @@ if [ $(cat gt5kbcnv.del.ids.txt|wc -l) -gt 0 ] then zcat RDall.combined.files.txt.gz \ - | { fgrep -wf gt5kbcnv.del.ids.txt || true; } \ + |fgrep -wf gt5kbcnv.del.ids.txt || true \ |awk '{if ($5==999) print $0,$4,999; \ else if ($4==$13) print $0,$4,$5+((999-$5) * ($14/999)*0.5); \ else print $0,$4,$5-(($5-1) * ($14/999)*0.5) }' OFS='\t' \ @@ -224,7 +224,7 @@ then >>genotype.indiv.txt zcat PESRall.variants.combined.files.txt.gz \ - | { fgrep -wf gt5kbcnv.del.ids.txt || true; } \ + |fgrep -wf gt5kbcnv.del.ids.txt || true \ |awk '{if ($2==999) print $0,999; \ else if ($3>=$4) print $0,$2+((999-$2) * ($3/999)*0.5); \ else if ($4>$3) print $0,$2+((999-$2) * ($4/999)*0.5); }' \ @@ -237,7 +237,7 @@ if [ $(cat gt5kbcnv.dup.ids.txt|wc -l) -gt 0 ] then zcat RDall.combined.files.txt.gz \ - | { fgrep -wf gt5kbcnv.dup.ids.txt || true; } \ + |fgrep -wf gt5kbcnv.dup.ids.txt || true \ |awk '{if ($5==999) print $0,$4,999; \ else if ($4==$13) print $0,$4,$5+((999-$5) * ($14/999)*0.5); \ else print $0,$4,$5-(($5-1) * ($14/999)*0.5) }' OFS='\t' \ @@ -256,7 +256,7 @@ then >>genotype.indiv.txt zcat PESRall.variants.combined.files.txt.gz \ - | { fgrep -wf gt5kbcnv.dup.ids.txt || true; } \ + |fgrep -wf gt5kbcnv.dup.ids.txt || true \ |awk '{if ($2==999) print $0,999; \ else if ($3>=$4) print $0,$2+((999-$2) * ($3/999)*0.5); \ else if ($4>$3) print $0,$2+((999-$2) * ($4/999)*0.5); }' \ @@ -265,8 +265,8 @@ then fi ##CNV 1-5kb and removing any CNV with no depth genotype### -{ fgrep -wvf <(zcat RDall.combined.files.txt.gz \ - |awk -F'\t' '{if ($4==".") print $2}') int.bed || true; } \ +fgrep -wvf <(zcat RDall.combined.files.txt.gz \ + |awk -F'\t' '{if ($4==".") print $2}') int.bed || true \ |awk -F'\t' '{if (($5=="DUP" || $5=="DEL") && $3-$2<5000 && $3-$2>=1000 ) print $4}' \ >gt1_5kbcnv.ids.txt @@ -275,7 +275,7 @@ if [ $(cat gt1_5kbcnv.ids.txt|wc -l) -gt 0 ] then zcat RDall.combined.files.txt.gz \ - | { fgrep -wf gt1_5kbcnv.ids.txt || true; } \ + |fgrep -wf gt1_5kbcnv.ids.txt || true \ |awk '{if ($14==999) print $0,$12,999; \ else if ($4==$13) print $0,$12,$14+((999-$14) * ($5/999)*0.5); \ else print $0,$12,$14-(($14-1) * ($5/999)*0.5) }' OFS='\t' \ @@ -296,7 +296,7 @@ then >>genotype.indiv.txt zcat PESRall.variants.combined.files.txt.gz \ - | { fgrep -wf gt1_5kbcnv.ids.txt || true; } \ + |fgrep -wf gt1_5kbcnv.ids.txt || true \ |awk '{if ($3==999 || $4==999) print $0,999; \ else if ($3>=$4) print $0,$3+((999-$3) * ($2/999)*0.5); \ else if ($4>$3) print $0,$4+((999-$4) * ($2/999)*0.5); }' \ @@ -305,8 +305,8 @@ then fi ###CNV <1kb and removing any CNV with no depth genotype#### -{ fgrep -wvf <(zcat RDall.combined.files.txt.gz \ - |awk -F'\t' '{if ($4==".") print $2}') int.bed || true; } \ +fgrep -wvf <(zcat RDall.combined.files.txt.gz \ + |awk -F'\t' '{if ($4==".") print $2}') int.bed || true \ |awk -F'\t' '{if (($5=="DUP" || $5=="DEL") && $3-$2<1000 ) print $4}' \ >lt1kbcnv.ids.txt @@ -314,7 +314,7 @@ if [ $(cat lt1kbcnv.ids.txt|wc -l) -gt 0 ] then zcat RDall.combined.files.txt.gz \ - | { fgrep -wf lt1kbcnv.ids.txt || true; } \ + |fgrep -wf lt1kbcnv.ids.txt || true \ |awk '{if ($14==999) print $0,$12,999; \ else if ($7==$10 && $8>=$11) print $0,$12,$14+((999-$14) * ($11/999)*0.5); \ else if ($7==$10 && $11>=$8) print $0,$12,$14+((999-$14) * ($8/999)*0.5); \ @@ -336,7 +336,7 @@ then >>genotype.indiv.txt zcat PESRall.variants.combined.files.txt.gz \ - | { fgrep -wf lt1kbcnv.ids.txt || true; } \ + |fgrep -wf lt1kbcnv.ids.txt || true \ |awk '{if ($3==999 || $4==999) print $0,999; \ else if ($3>=$4) print $0,$3+((999-$3) * ($2/999)*0.5); \ else if ($4>$3) print $0,$4+((999-$4) * ($2/999)*0.5); }' \ From 519426bb856471b1d2ab7a3f06f210f4f025636c Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Mon, 9 Dec 2024 20:38:20 -0500 Subject: [PATCH 02/17] Added safe joining - include all from each side if none present --- .../04_variant_resolution/scripts/IntegrateGQ.sh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh index f82f320b7..ff083543b 100755 --- a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh +++ b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh @@ -76,10 +76,10 @@ fi ##All PE/SR .'s for samples missing RD## ##Add max PE/SR to use for future GQ integration## -join -j 1 -a 1 -e "." -o 1.1 1.2 1.3 1.4 1.5 2.4 2.5 2.6 \ +join -j 1 -a 1 -a 2 -e "." -o 1.1 1.2 1.3 1.4 1.5 2.4 2.5 2.6 \ <(zcat rd_indiv_geno.txt.gz) <(zcat pe_indiv_geno.txt.gz) \ |tr ' ' '\t' \ - |join -j 1 -e "." -o 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 2.4 2.5 2.6 - <(zcat sr_indiv_geno.txt.gz) \ + |join -j 1 -a 1 -a 2 -e "." -o 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 2.4 2.5 2.6 - <(zcat sr_indiv_geno.txt.gz) \ |tr ' ' '\t' \ | awk '{if ($6==$9 && $8>=$11) print $0 "\t" $6 "\t" $7 "\t" $8 ; \ else if ($6==$9 && $8<$11) print $0 "\t" $9 "\t" $10 "\t" $11 ; \ @@ -91,26 +91,26 @@ join -j 1 -a 1 -e "." -o 1.1 1.2 1.3 1.4 1.5 2.4 2.5 2.6 \ >RDall.combined.files.txt.gz ##variant combine## -join -j 1 -a 1 -e "." -o 1.1 1.2 2.2 <(cut -f4- $RD_melted_variants_gentoypes|fgrep -v variant_gq|sort -k1,1 ) \ +join -j 1 -a 1 -a 2 -e "." -o 1.1 1.2 2.2 <(cut -f4- $RD_melted_variants_gentoypes|fgrep -v variant_gq|sort -k1,1 ) \ <(zcat $pegeno_variants_file|sort -k1,1 ) \ - |join -j 1 -a 1 -e "." -o 1.1 1.2 1.3 2.2 - <(zcat $srgeno_variants_file|sort -k1,1) \ + |join -j 1 -a 1 -a 2 -e "." -o 1.1 1.2 1.3 2.2 - <(zcat $srgeno_variants_file|sort -k1,1) \ |tr ' ' '\t' \ |gzip \ >RDall.variants.combined.files.txt.gz ##All RD NA's for samples missing RD## -join -j 1 -a 2 -e "." -o 2.1 2.2 2.3 1.4 1.5 2.4 2.6 <(zcat rd_indiv_geno.txt.gz|sort -k1,1) \ +join -j 1 -a 1 -a 2 -e "." -o 2.1 2.2 2.3 1.4 1.5 2.4 2.6 <(zcat rd_indiv_geno.txt.gz|sort -k1,1) \ <(zcat pe_indiv_geno.txt.gz|sort -k1,1) \ |tr ' ' '\t' \ - |join -j 1 -o 1.1 1.2 1.3 1.4 1.5 1.6 1.7 2.4 2.6 - <(zcat sr_indiv_geno.txt.gz|sort -k1,1) \ + |join -a 1 -a 2 -e "." -j 1 -o 1.1 1.2 1.3 1.4 1.5 1.6 1.7 2.4 2.6 - <(zcat sr_indiv_geno.txt.gz|sort -k1,1) \ |tr ' ' '\t' \ |gzip \ >PESRall.combined.files.txt.gz ##variant combine## -join -j 1 -a 2 -e "." -o 2.1 1.2 2.2 <(cut -f4- $RD_melted_variants_gentoypes|fgrep -v variant_gq|sort -k1,1) \ +join -j 1 -a 1 -a 2 -e "." -o 2.1 1.2 2.2 <(cut -f4- $RD_melted_variants_gentoypes|fgrep -v variant_gq|sort -k1,1) \ <(zcat $pegeno_variants_file|sort -k1,1 ) \ - |join -j 1 -a 2 -e "." -o 1.1 1.2 1.3 2.2 - <(zcat $srgeno_variants_file|sort -k1,1 ) \ + |join -j 1 -a 1 -a 2 -e "." -o 1.1 1.2 1.3 2.2 - <(zcat $srgeno_variants_file|sort -k1,1 ) \ |tr ' ' '\t' \ |gzip \ >PESRall.variants.combined.files.txt.gz From 7140508c2ead5122640aad733567cd581f7db17a Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Tue, 10 Dec 2024 10:45:15 -0500 Subject: [PATCH 03/17] Removed join -a commands --- .../04_variant_resolution/scripts/IntegrateGQ.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh index ff083543b..f407637bb 100755 --- a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh +++ b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh @@ -76,10 +76,10 @@ fi ##All PE/SR .'s for samples missing RD## ##Add max PE/SR to use for future GQ integration## -join -j 1 -a 1 -a 2 -e "." -o 1.1 1.2 1.3 1.4 1.5 2.4 2.5 2.6 \ +join -j 1 -a 1 -e "." -o 1.1 1.2 1.3 1.4 1.5 2.4 2.5 2.6 \ <(zcat rd_indiv_geno.txt.gz) <(zcat pe_indiv_geno.txt.gz) \ |tr ' ' '\t' \ - |join -j 1 -a 1 -a 2 -e "." -o 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 2.4 2.5 2.6 - <(zcat sr_indiv_geno.txt.gz) \ + |join -j 1 -e "." -o 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 2.4 2.5 2.6 - <(zcat sr_indiv_geno.txt.gz) \ |tr ' ' '\t' \ | awk '{if ($6==$9 && $8>=$11) print $0 "\t" $6 "\t" $7 "\t" $8 ; \ else if ($6==$9 && $8<$11) print $0 "\t" $9 "\t" $10 "\t" $11 ; \ @@ -91,15 +91,15 @@ join -j 1 -a 1 -a 2 -e "." -o 1.1 1.2 1.3 1.4 1.5 2.4 2.5 2.6 \ >RDall.combined.files.txt.gz ##variant combine## -join -j 1 -a 1 -a 2 -e "." -o 1.1 1.2 2.2 <(cut -f4- $RD_melted_variants_gentoypes|fgrep -v variant_gq|sort -k1,1 ) \ +join -j 1 -a 1 -e "." -o 1.1 1.2 2.2 <(cut -f4- $RD_melted_variants_gentoypes|fgrep -v variant_gq|sort -k1,1 ) \ <(zcat $pegeno_variants_file|sort -k1,1 ) \ - |join -j 1 -a 1 -a 2 -e "." -o 1.1 1.2 1.3 2.2 - <(zcat $srgeno_variants_file|sort -k1,1) \ + |join -j 1 -a 1 -e "." -o 1.1 1.2 1.3 2.2 - <(zcat $srgeno_variants_file|sort -k1,1) \ |tr ' ' '\t' \ |gzip \ >RDall.variants.combined.files.txt.gz ##All RD NA's for samples missing RD## -join -j 1 -a 1 -a 2 -e "." -o 2.1 2.2 2.3 1.4 1.5 2.4 2.6 <(zcat rd_indiv_geno.txt.gz|sort -k1,1) \ +join -j 1 -a 2 -e "." -o 2.1 2.2 2.3 1.4 1.5 2.4 2.6 <(zcat rd_indiv_geno.txt.gz|sort -k1,1) \ <(zcat pe_indiv_geno.txt.gz|sort -k1,1) \ |tr ' ' '\t' \ |join -a 1 -a 2 -e "." -j 1 -o 1.1 1.2 1.3 1.4 1.5 1.6 1.7 2.4 2.6 - <(zcat sr_indiv_geno.txt.gz|sort -k1,1) \ @@ -110,7 +110,7 @@ join -j 1 -a 1 -a 2 -e "." -o 2.1 2.2 2.3 1.4 1.5 2.4 2.6 <(zcat rd_indiv_geno. ##variant combine## join -j 1 -a 1 -a 2 -e "." -o 2.1 1.2 2.2 <(cut -f4- $RD_melted_variants_gentoypes|fgrep -v variant_gq|sort -k1,1) \ <(zcat $pegeno_variants_file|sort -k1,1 ) \ - |join -j 1 -a 1 -a 2 -e "." -o 1.1 1.2 1.3 2.2 - <(zcat $srgeno_variants_file|sort -k1,1 ) \ + |join -j 1 -a 2 -e "." -o 1.1 1.2 1.3 2.2 - <(zcat $srgeno_variants_file|sort -k1,1 ) \ |tr ' ' '\t' \ |gzip \ >PESRall.variants.combined.files.txt.gz From 6a4f25dd8218a0def6d7f8faabba0605c17369e8 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Tue, 10 Dec 2024 10:49:06 -0500 Subject: [PATCH 04/17] Additional join i missed before --- src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh index f407637bb..db52efadc 100755 --- a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh +++ b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh @@ -99,7 +99,7 @@ join -j 1 -a 1 -e "." -o 1.1 1.2 2.2 <(cut -f4- $RD_melted_variants_gentoypes|fg >RDall.variants.combined.files.txt.gz ##All RD NA's for samples missing RD## -join -j 1 -a 2 -e "." -o 2.1 2.2 2.3 1.4 1.5 2.4 2.6 <(zcat rd_indiv_geno.txt.gz|sort -k1,1) \ +join -j 1 -o 2.1 2.2 2.3 1.4 1.5 2.4 2.6 <(zcat rd_indiv_geno.txt.gz|sort -k1,1) \ <(zcat pe_indiv_geno.txt.gz|sort -k1,1) \ |tr ' ' '\t' \ |join -a 1 -a 2 -e "." -j 1 -o 1.1 1.2 1.3 1.4 1.5 1.6 1.7 2.4 2.6 - <(zcat sr_indiv_geno.txt.gz|sort -k1,1) \ @@ -108,7 +108,7 @@ join -j 1 -a 2 -e "." -o 2.1 2.2 2.3 1.4 1.5 2.4 2.6 <(zcat rd_indiv_geno.txt.g >PESRall.combined.files.txt.gz ##variant combine## -join -j 1 -a 1 -a 2 -e "." -o 2.1 1.2 2.2 <(cut -f4- $RD_melted_variants_gentoypes|fgrep -v variant_gq|sort -k1,1) \ +join -j 1 -a 2 -e "." -o 2.1 1.2 2.2 <(cut -f4- $RD_melted_variants_gentoypes|fgrep -v variant_gq|sort -k1,1) \ <(zcat $pegeno_variants_file|sort -k1,1 ) \ |join -j 1 -a 2 -e "." -o 1.1 1.2 1.3 2.2 - <(zcat $srgeno_variants_file|sort -k1,1 ) \ |tr ' ' '\t' \ From 98c6592ef06d1d262a1e21e9c7afa35a27771add Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Tue, 10 Dec 2024 10:50:26 -0500 Subject: [PATCH 05/17] Final set of joins missed --- src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh index db52efadc..194e99cc7 100755 --- a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh +++ b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh @@ -108,9 +108,9 @@ join -j 1 -o 2.1 2.2 2.3 1.4 1.5 2.4 2.6 <(zcat rd_indiv_geno.txt.gz|sort -k1,1 >PESRall.combined.files.txt.gz ##variant combine## -join -j 1 -a 2 -e "." -o 2.1 1.2 2.2 <(cut -f4- $RD_melted_variants_gentoypes|fgrep -v variant_gq|sort -k1,1) \ +join -j 1 -o 2.1 1.2 2.2 <(cut -f4- $RD_melted_variants_gentoypes|fgrep -v variant_gq|sort -k1,1) \ <(zcat $pegeno_variants_file|sort -k1,1 ) \ - |join -j 1 -a 2 -e "." -o 1.1 1.2 1.3 2.2 - <(zcat $srgeno_variants_file|sort -k1,1 ) \ + |join -j 1 -o 1.1 1.2 1.3 2.2 - <(zcat $srgeno_variants_file|sort -k1,1 ) \ |tr ' ' '\t' \ |gzip \ >PESRall.variants.combined.files.txt.gz From 5c64eef4079dc32ecb5e822f643765b015a68324 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Tue, 10 Dec 2024 10:53:07 -0500 Subject: [PATCH 06/17] All join changes resolved --- .../04_variant_resolution/scripts/IntegrateGQ.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh index 194e99cc7..1f59dce01 100755 --- a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh +++ b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh @@ -99,18 +99,18 @@ join -j 1 -a 1 -e "." -o 1.1 1.2 2.2 <(cut -f4- $RD_melted_variants_gentoypes|fg >RDall.variants.combined.files.txt.gz ##All RD NA's for samples missing RD## -join -j 1 -o 2.1 2.2 2.3 1.4 1.5 2.4 2.6 <(zcat rd_indiv_geno.txt.gz|sort -k1,1) \ +join -j 1 -a 2 -e "." -o 2.1 2.2 2.3 1.4 1.5 2.4 2.6 <(zcat rd_indiv_geno.txt.gz|sort -k1,1) \ <(zcat pe_indiv_geno.txt.gz|sort -k1,1) \ |tr ' ' '\t' \ - |join -a 1 -a 2 -e "." -j 1 -o 1.1 1.2 1.3 1.4 1.5 1.6 1.7 2.4 2.6 - <(zcat sr_indiv_geno.txt.gz|sort -k1,1) \ + |join -j 1 -o 1.1 1.2 1.3 1.4 1.5 1.6 1.7 2.4 2.6 - <(zcat sr_indiv_geno.txt.gz|sort -k1,1) \ |tr ' ' '\t' \ |gzip \ >PESRall.combined.files.txt.gz ##variant combine## -join -j 1 -o 2.1 1.2 2.2 <(cut -f4- $RD_melted_variants_gentoypes|fgrep -v variant_gq|sort -k1,1) \ +join -j 1 -a 2 -e "." -o 2.1 1.2 2.2 <(cut -f4- $RD_melted_variants_gentoypes|fgrep -v variant_gq|sort -k1,1) \ <(zcat $pegeno_variants_file|sort -k1,1 ) \ - |join -j 1 -o 1.1 1.2 1.3 2.2 - <(zcat $srgeno_variants_file|sort -k1,1 ) \ + |join -j 1 -a 2 -e "." -o 1.1 1.2 1.3 2.2 - <(zcat $srgeno_variants_file|sort -k1,1 ) \ |tr ' ' '\t' \ |gzip \ >PESRall.variants.combined.files.txt.gz From 8b94e8529b3d27fa2e3bf24c5abf0c5bd2392e0e Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Tue, 10 Dec 2024 16:53:59 -0500 Subject: [PATCH 07/17] Verify no join difference in integrategq --- .../scripts/IntegrateGQ.sh | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh index 1f59dce01..06d0da797 100755 --- a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh +++ b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh @@ -33,14 +33,14 @@ zcat $RD_melted_genotypes \ ##Deletions, need to PE-SR genotypes to match RD format (2==ref)## ##PE## zcat $pegeno_indiv_file \ - |fgrep -wf <(awk '{if ($5=="DEL") print $4}' int.bed) || true \ + | { fgrep -wf <(awk '{if ($5=="DEL") print $4}' int.bed) || true; } \ |awk '{if ($4>1) print $1"@"$2,$1,$2,$4,0,$5;else if ($4==1) print $1"@"$2,$1,$2,$4,1,$5 ;else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \ |awk '!seen[$1"@"$2]++' \ >pe_indiv_geno.txt ##Duplications and other events, need to PE-SR genotypes to match RD (2==ref)## zcat $pegeno_indiv_file \ - |fgrep -wf <(awk '{if ($5!="DEL") print $4}' int.bed) || true \ + | fgrep -wf <(awk '{if ($5!="DEL") print $4}' int.bed) || true \ |awk '{if ($4>0) print $1"@"$2,$1,$2,$4,$4+2,$5 ;else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \ |awk '!seen[$1"@"$2]++' \ >>pe_indiv_geno.txt @@ -50,7 +50,7 @@ rm pe_indiv_geno.txt ##SR## zcat $srgeno_indiv_file \ - |fgrep -wf <(awk '{if ($5=="DEL") print $4}' int.bed) || true \ + | { fgrep -wf <(awk '{if ($5=="DEL") print $4}' int.bed) || true; } \ |awk '{if ($4>1) print $1"@"$2,$1,$2,$4,0,$5;else if ($4==1) print $1"@"$2,$1,$2,$4,1,$5 ;else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \ |awk '!seen[$1"@"$2]++' \ >sr_indiv_geno.txt @@ -99,7 +99,7 @@ join -j 1 -a 1 -e "." -o 1.1 1.2 2.2 <(cut -f4- $RD_melted_variants_gentoypes|fg >RDall.variants.combined.files.txt.gz ##All RD NA's for samples missing RD## -join -j 1 -a 2 -e "." -o 2.1 2.2 2.3 1.4 1.5 2.4 2.6 <(zcat rd_indiv_geno.txt.gz|sort -k1,1) \ +join -j 1 -a 2 -e "." -o 2.1 2.2 2.3 1.4 1.5 2.4 2.6 <(zcat rd_indiv_geno.txt.gz|sort -k1,1) \ <(zcat pe_indiv_geno.txt.gz|sort -k1,1) \ |tr ' ' '\t' \ |join -j 1 -o 1.1 1.2 1.3 1.4 1.5 1.6 1.7 2.4 2.6 - <(zcat sr_indiv_geno.txt.gz|sort -k1,1) \ @@ -189,13 +189,14 @@ rm PESRall.combined.files.txt.gz ##CNV## ##Recode RD to match PE/SR## ##CNV >5kb and removing any CNV with no depth genotype### -fgrep -wvf <(zcat RDall.combined.files.txt.gz \ - |awk -F'\t' '{if ($4==".") print $2}') int.bed || true \ +{ fgrep -wvf <(zcat RDall.combined.files.txt.gz \ + |awk -F'\t' '{if ($4==".") print $2}') int.bed || true; } \ |awk '{if (($5=="DEL") && $3-$2>=5000 ) print $4}' \ >gt5kbcnv.del.ids.txt -fgrep -wvf <(zcat RDall.combined.files.txt.gz \ - |awk -F'\t' '{if ($4==".") print $2}') int.bed || true \ + +{ fgrep -wvf <(zcat RDall.combined.files.txt.gz \ + |awk -F'\t' '{if ($4==".") print $2}') int.bed || true; } \ |awk -F'\t' '{if (($5=="DUP" ) && $3-$2>=5000 ) print $4}'\ >gt5kbcnv.dup.ids.txt @@ -265,8 +266,8 @@ then fi ##CNV 1-5kb and removing any CNV with no depth genotype### -fgrep -wvf <(zcat RDall.combined.files.txt.gz \ - |awk -F'\t' '{if ($4==".") print $2}') int.bed || true \ +{ fgrep -wvf <(zcat RDall.combined.files.txt.gz \ + |awk -F'\t' '{if ($4==".") print $2}') int.bed || true; } \ |awk -F'\t' '{if (($5=="DUP" || $5=="DEL") && $3-$2<5000 && $3-$2>=1000 ) print $4}' \ >gt1_5kbcnv.ids.txt @@ -305,8 +306,8 @@ then fi ###CNV <1kb and removing any CNV with no depth genotype#### -fgrep -wvf <(zcat RDall.combined.files.txt.gz \ - |awk -F'\t' '{if ($4==".") print $2}') int.bed || true \ +{ fgrep -wvf <(zcat RDall.combined.files.txt.gz \ + |awk -F'\t' '{if ($4==".") print $2}') int.bed || true; } \ |awk -F'\t' '{if (($5=="DUP" || $5=="DEL") && $3-$2<1000 ) print $4}' \ >lt1kbcnv.ids.txt From 0bad4ff8d23a6f59987681ca40745e3f88517fd8 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Tue, 10 Dec 2024 17:39:23 -0500 Subject: [PATCH 08/17] Further changes --- src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh index 06d0da797..208b31a1e 100755 --- a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh +++ b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh @@ -194,7 +194,6 @@ rm PESRall.combined.files.txt.gz |awk '{if (($5=="DEL") && $3-$2>=5000 ) print $4}' \ >gt5kbcnv.del.ids.txt - { fgrep -wvf <(zcat RDall.combined.files.txt.gz \ |awk -F'\t' '{if ($4==".") print $2}') int.bed || true; } \ |awk -F'\t' '{if (($5=="DUP" ) && $3-$2>=5000 ) print $4}'\ @@ -307,7 +306,7 @@ fi ###CNV <1kb and removing any CNV with no depth genotype#### { fgrep -wvf <(zcat RDall.combined.files.txt.gz \ - |awk -F'\t' '{if ($4==".") print $2}') int.bed || true; } \ + |awk -F'\t' '{if ($4==".") print $2}') int.bed || true; } \ |awk -F'\t' '{if (($5=="DUP" || $5=="DEL") && $3-$2<1000 ) print $4}' \ >lt1kbcnv.ids.txt From 7521c6a457408d3ceaf2c17c364200a081fb377d Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Tue, 10 Dec 2024 17:49:07 -0500 Subject: [PATCH 09/17] Reset to init for ease of understanding --- .../scripts/IntegrateGQ.sh | 37 +++++++++---------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh index 208b31a1e..68e928e7e 100755 --- a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh +++ b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh @@ -33,14 +33,14 @@ zcat $RD_melted_genotypes \ ##Deletions, need to PE-SR genotypes to match RD format (2==ref)## ##PE## zcat $pegeno_indiv_file \ - | { fgrep -wf <(awk '{if ($5=="DEL") print $4}' int.bed) || true; } \ + |fgrep -wf <(awk '{if ($5=="DEL") print $4}' int.bed) || true \ |awk '{if ($4>1) print $1"@"$2,$1,$2,$4,0,$5;else if ($4==1) print $1"@"$2,$1,$2,$4,1,$5 ;else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \ |awk '!seen[$1"@"$2]++' \ >pe_indiv_geno.txt ##Duplications and other events, need to PE-SR genotypes to match RD (2==ref)## zcat $pegeno_indiv_file \ - | fgrep -wf <(awk '{if ($5!="DEL") print $4}' int.bed) || true \ + |fgrep -wf <(awk '{if ($5!="DEL") print $4}' int.bed) || true \ |awk '{if ($4>0) print $1"@"$2,$1,$2,$4,$4+2,$5 ;else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \ |awk '!seen[$1"@"$2]++' \ >>pe_indiv_geno.txt @@ -50,7 +50,7 @@ rm pe_indiv_geno.txt ##SR## zcat $srgeno_indiv_file \ - | { fgrep -wf <(awk '{if ($5=="DEL") print $4}' int.bed) || true; } \ + |fgrep -wf <(awk '{if ($5=="DEL") print $4}' int.bed) || true \ |awk '{if ($4>1) print $1"@"$2,$1,$2,$4,0,$5;else if ($4==1) print $1"@"$2,$1,$2,$4,1,$5 ;else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \ |awk '!seen[$1"@"$2]++' \ >sr_indiv_geno.txt @@ -127,7 +127,7 @@ touch genotype.indiv.txt if [ $(cat depthonly.del.ids.txt|wc -l) -gt 0 ] then zcat RDall.combined.files.txt.gz \ - |fgrep -wf depthonly.del.ids.txt || true \ + | { fgrep -wf depthonly.del.ids.txt || true; } \ |awk '{if ($4>=2) print $0 "\t" 0 "\t" $5"\t" "RD"; \ else if ($4==1) print $0 "\t" 1 "\t" $5"\t" "RD"; \ else if ($4==0) print $0 "\t" 2 "\t" $5"\t" "RD" }' \ @@ -136,7 +136,7 @@ then >genotype.indiv.txt zcat RDall.variants.combined.files.txt.gz \ - |fgrep -wf depthonly.del.ids.txt || true \ + | { fgrep -wf depthonly.del.ids.txt || true; } \ |awk '{print $0 "\t" $2}'\ >genotype.variant.txt fi @@ -144,7 +144,7 @@ fi if [ $(cat depthonly.dup.ids.txt|wc -l) -gt 0 ] then zcat RDall.combined.files.txt.gz \ - |fgrep -wf depthonly.dup.ids.txt || true \ + | { fgrep -wf depthonly.dup.ids.txt || true; } \ |awk '{if ($4<=2) print $0 "\t" 0 "\t" $5"\t" "RD"; \ else print $0 "\t" $4-2 "\t" $5"\t" "RD"}' \ |cut -f1-6,8-9,11,15- \ @@ -152,7 +152,7 @@ then >>genotype.indiv.txt zcat RDall.variants.combined.files.txt.gz \ - |fgrep -wf depthonly.dup.ids.txt || true \ + | { fgrep -wf depthonly.dup.ids.txt || true; } \ |awk '{print $0 "\t" $2}'\ >>genotype.variant.txt fi @@ -168,7 +168,7 @@ then ##Pull out highest GQ if an event has a genotype and lowest if it doesn't## #take max of PE/SR for genotype when they have a genotype## \ zcat PESRall.combined.files.txt.gz \ - |fgrep -wf noncnv.ids.txt || true \ + | { fgrep -wf noncnv.ids.txt || true; } \ |awk '{if ($6==$8 && $7>=$9) print $0 "\t" $6 "\t" $7 "\t" "PE,SR"; \ else if ($6==$8 && $7<$9) print $0 "\t" $8 "\t" $9 "\t" "PE,SR"; \ else if ($6>0 && $8==0) print $0 "\t" $6 "\t" $7 "\t" "PE"; \ @@ -178,7 +178,7 @@ then >>genotype.indiv.txt zcat PESRall.variants.combined.files.txt.gz \ - |fgrep -wf noncnv.ids.txt || true \ + | { fgrep -wf noncnv.ids.txt || true; } \ |awk '{if ($3>=$4)print $0 "\t" $3 ;else print $0 "\t" $4 }' \ |tr ' ' '\t' \ >>genotype.variant.txt @@ -204,7 +204,7 @@ if [ $(cat gt5kbcnv.del.ids.txt|wc -l) -gt 0 ] then zcat RDall.combined.files.txt.gz \ - |fgrep -wf gt5kbcnv.del.ids.txt || true \ + | { fgrep -wf gt5kbcnv.del.ids.txt || true; } \ |awk '{if ($5==999) print $0,$4,999; \ else if ($4==$13) print $0,$4,$5+((999-$5) * ($14/999)*0.5); \ else print $0,$4,$5-(($5-1) * ($14/999)*0.5) }' OFS='\t' \ @@ -224,7 +224,7 @@ then >>genotype.indiv.txt zcat PESRall.variants.combined.files.txt.gz \ - |fgrep -wf gt5kbcnv.del.ids.txt || true \ + | { fgrep -wf gt5kbcnv.del.ids.txt || true; } \ |awk '{if ($2==999) print $0,999; \ else if ($3>=$4) print $0,$2+((999-$2) * ($3/999)*0.5); \ else if ($4>$3) print $0,$2+((999-$2) * ($4/999)*0.5); }' \ @@ -237,7 +237,7 @@ if [ $(cat gt5kbcnv.dup.ids.txt|wc -l) -gt 0 ] then zcat RDall.combined.files.txt.gz \ - |fgrep -wf gt5kbcnv.dup.ids.txt || true \ + | { fgrep -wf gt5kbcnv.dup.ids.txt || true; } \ |awk '{if ($5==999) print $0,$4,999; \ else if ($4==$13) print $0,$4,$5+((999-$5) * ($14/999)*0.5); \ else print $0,$4,$5-(($5-1) * ($14/999)*0.5) }' OFS='\t' \ @@ -256,7 +256,7 @@ then >>genotype.indiv.txt zcat PESRall.variants.combined.files.txt.gz \ - |fgrep -wf gt5kbcnv.dup.ids.txt || true \ + | { fgrep -wf gt5kbcnv.dup.ids.txt || true; } \ |awk '{if ($2==999) print $0,999; \ else if ($3>=$4) print $0,$2+((999-$2) * ($3/999)*0.5); \ else if ($4>$3) print $0,$2+((999-$2) * ($4/999)*0.5); }' \ @@ -275,7 +275,7 @@ if [ $(cat gt1_5kbcnv.ids.txt|wc -l) -gt 0 ] then zcat RDall.combined.files.txt.gz \ - |fgrep -wf gt1_5kbcnv.ids.txt || true \ + | { fgrep -wf gt1_5kbcnv.ids.txt || true; } \ |awk '{if ($14==999) print $0,$12,999; \ else if ($4==$13) print $0,$12,$14+((999-$14) * ($5/999)*0.5); \ else print $0,$12,$14-(($14-1) * ($5/999)*0.5) }' OFS='\t' \ @@ -296,7 +296,7 @@ then >>genotype.indiv.txt zcat PESRall.variants.combined.files.txt.gz \ - |fgrep -wf gt1_5kbcnv.ids.txt || true \ + | { fgrep -wf gt1_5kbcnv.ids.txt || true; } \ |awk '{if ($3==999 || $4==999) print $0,999; \ else if ($3>=$4) print $0,$3+((999-$3) * ($2/999)*0.5); \ else if ($4>$3) print $0,$4+((999-$4) * ($2/999)*0.5); }' \ @@ -306,7 +306,7 @@ fi ###CNV <1kb and removing any CNV with no depth genotype#### { fgrep -wvf <(zcat RDall.combined.files.txt.gz \ - |awk -F'\t' '{if ($4==".") print $2}') int.bed || true; } \ + |awk -F'\t' '{if ($4==".") print $2}') int.bed || true; } \ |awk -F'\t' '{if (($5=="DUP" || $5=="DEL") && $3-$2<1000 ) print $4}' \ >lt1kbcnv.ids.txt @@ -314,7 +314,7 @@ if [ $(cat lt1kbcnv.ids.txt|wc -l) -gt 0 ] then zcat RDall.combined.files.txt.gz \ - |fgrep -wf lt1kbcnv.ids.txt || true \ + | { fgrep -wf lt1kbcnv.ids.txt || true; } \ |awk '{if ($14==999) print $0,$12,999; \ else if ($7==$10 && $8>=$11) print $0,$12,$14+((999-$14) * ($11/999)*0.5); \ else if ($7==$10 && $11>=$8) print $0,$12,$14+((999-$14) * ($8/999)*0.5); \ @@ -336,7 +336,7 @@ then >>genotype.indiv.txt zcat PESRall.variants.combined.files.txt.gz \ - |fgrep -wf lt1kbcnv.ids.txt || true \ + | { fgrep -wf lt1kbcnv.ids.txt || true; } \ |awk '{if ($3==999 || $4==999) print $0,999; \ else if ($3>=$4) print $0,$3+((999-$3) * ($2/999)*0.5); \ else if ($4>$3) print $0,$4+((999-$4) * ($2/999)*0.5); }' \ @@ -377,4 +377,3 @@ cat genotype.variant.txt \ |awk '{if ($2==0) $2=1;if ($3==0) $3=1; if ($4==0) $4=1; if ($5==0) $5=1; print}' OFS="\t" \ |gzip \ >genotype.variant.txt.gz - From 66326053bcf5872932f5348ccdb454eef5c9aba5 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 11 Dec 2024 09:00:16 -0500 Subject: [PATCH 10/17] Minor updates, still WIP --- .../scripts/IntegrateGQ.sh | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh index 68e928e7e..4e400deef 100755 --- a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh +++ b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh @@ -32,11 +32,12 @@ zcat $RD_melted_genotypes \ ##Deletions, need to PE-SR genotypes to match RD format (2==ref)## ##PE## -zcat $pegeno_indiv_file \ - |fgrep -wf <(awk '{if ($5=="DEL") print $4}' int.bed) || true \ - |awk '{if ($4>1) print $1"@"$2,$1,$2,$4,0,$5;else if ($4==1) print $1"@"$2,$1,$2,$4,1,$5 ;else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \ - |awk '!seen[$1"@"$2]++' \ - >pe_indiv_geno.txt +( + zcat $pegeno_indiv_file \ + |fgrep -wf <(awk '{if ($5=="DEL") print $4}' int.bed) || true \ + |awk '{if ($4>1) print $1"@"$2,$1,$2,$4,0,$5;else if ($4==1) print $1"@"$2,$1,$2,$4,1,$5 ;else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \ + |awk '!seen[$1"@"$2]++' +) >pe_indiv_geno.txt ##Duplications and other events, need to PE-SR genotypes to match RD (2==ref)## zcat $pegeno_indiv_file \ @@ -49,11 +50,12 @@ sort -k1,1 pe_indiv_geno.txt|gzip>pe_indiv_geno.txt.gz rm pe_indiv_geno.txt ##SR## -zcat $srgeno_indiv_file \ - |fgrep -wf <(awk '{if ($5=="DEL") print $4}' int.bed) || true \ - |awk '{if ($4>1) print $1"@"$2,$1,$2,$4,0,$5;else if ($4==1) print $1"@"$2,$1,$2,$4,1,$5 ;else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \ - |awk '!seen[$1"@"$2]++' \ - >sr_indiv_geno.txt +( + zcat $srgeno_indiv_file \ + |fgrep -wf <(awk '{if ($5=="DEL") print $4}' int.bed) || true \ + |awk '{if ($4>1) print $1"@"$2,$1,$2,$4,0,$5;else if ($4==1) print $1"@"$2,$1,$2,$4,1,$5 ;else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \ + |awk '!seen[$1"@"$2]++' +) >sr_indiv_geno.txt ##Duplications and other events, need to PE-SR genotysrs to match RD (2==ref)## zcat $srgeno_indiv_file \ From 5764f0272421144d658d557d04d449377dd9ee32 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 22 Jan 2025 18:37:56 -0500 Subject: [PATCH 11/17] Minor edits --- .../scripts/IntegrateGQ.sh | 92 ++++++++++--------- 1 file changed, 48 insertions(+), 44 deletions(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh index 4e400deef..9f45ed3d7 100755 --- a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh +++ b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh @@ -3,7 +3,7 @@ # IntegrateGQ.sh # -set -euo pipefail +# set -euo pipefail vcf=$1 RD_melted_genotypes=$2 @@ -32,37 +32,41 @@ zcat $RD_melted_genotypes \ ##Deletions, need to PE-SR genotypes to match RD format (2==ref)## ##PE## -( - zcat $pegeno_indiv_file \ - |fgrep -wf <(awk '{if ($5=="DEL") print $4}' int.bed) || true \ - |awk '{if ($4>1) print $1"@"$2,$1,$2,$4,0,$5;else if ($4==1) print $1"@"$2,$1,$2,$4,1,$5 ;else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \ - |awk '!seen[$1"@"$2]++' -) >pe_indiv_geno.txt - -##Duplications and other events, need to PE-SR genotypes to match RD (2==ref)## zcat $pegeno_indiv_file \ - |fgrep -wf <(awk '{if ($5!="DEL") print $4}' int.bed) || true \ - |awk '{if ($4>0) print $1"@"$2,$1,$2,$4,$4+2,$5 ;else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \ + | { fgrep -wf <(awk '{if ($5=="DEL") print $4}' int.bed) || true; } \ + |awk '{if ($4>1) print $1"@"$2,$1,$2,$4,0,$5;else if ($4==1) print $1"@"$2,$1,$2,$4,1,$5 ;else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \ |awk '!seen[$1"@"$2]++' \ - >>pe_indiv_geno.txt + >pe_indiv_geno.txt + +##Duplications and other events, need to PE-SR genotypes to match RD (2==ref)## +count_nonDEL=$(awk '{if ($5!="DEL") print $4}' int.bed | wc -l) +if [ "$count_nonDEL" -gt 0 ]; then + zcat "$pegeno_indiv_file" \ + | { fgrep -wf <(awk '{if ($5!="DEL") print $4}' int.bed) || true; } \ + | awk '{if ($4>0) print $1"@"$2,$1,$2,$4,$4+2,$5; else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \ + | awk '!seen[$1"@"$2]++' \ + >> pe_indiv_geno.txt +fi sort -k1,1 pe_indiv_geno.txt|gzip>pe_indiv_geno.txt.gz rm pe_indiv_geno.txt ##SR## -( - zcat $srgeno_indiv_file \ - |fgrep -wf <(awk '{if ($5=="DEL") print $4}' int.bed) || true \ - |awk '{if ($4>1) print $1"@"$2,$1,$2,$4,0,$5;else if ($4==1) print $1"@"$2,$1,$2,$4,1,$5 ;else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \ - |awk '!seen[$1"@"$2]++' -) >sr_indiv_geno.txt - -##Duplications and other events, need to PE-SR genotysrs to match RD (2==ref)## zcat $srgeno_indiv_file \ - |fgrep -wf <(awk '{if ($5!="DEL") print $4}' int.bed) || true \ - |awk '{if ($4>0) print $1"@"$2,$1,$2,$4,$4+2,$5 ;else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \ + | { fgrep -wf <(awk '{if ($5=="DEL") print $4}' int.bed) || true; } \ + |awk '{if ($4>1) print $1"@"$2,$1,$2,$4,0,$5;else if ($4==1) print $1"@"$2,$1,$2,$4,1,$5 ;else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \ |awk '!seen[$1"@"$2]++' \ - >>sr_indiv_geno.txt + >sr_indiv_geno.txt + +##Duplications and other events, need to PE-SR genotysrs to match RD (2==ref)## +count_nonDEL2=$(awk '{if ($5!="DEL") print $4}' int.bed | wc -l) +if [ "$count_nonDEL2" -gt 0 ]; then + zcat "$srgeno_indiv_file" \ + | { fgrep -wf <(awk '{if ($5!="DEL") print $4}' int.bed) || true; } \ + | awk '{if ($4>0) print $1"@"$2,$1,$2,$4,$4+2,$5; else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \ + | awk '!seen[$1"@"$2]++' \ + >> sr_indiv_geno.txt +fi sort -k1,1 sr_indiv_geno.txt|gzip>sr_indiv_geno.txt.gz @@ -70,11 +74,11 @@ rm sr_indiv_geno.txt ##check to make sure PE and SR are same size which they should be## -if [ $(zcat sr_indiv_geno.txt.gz|wc -l) != $(zcat pe_indiv_geno.txt.gz|wc -l) ] -then - echo "ERROR: PE and SR genotype files have different sizes" - exit -fi +# if [ $(zcat sr_indiv_geno.txt.gz|wc -l) != $(zcat pe_indiv_geno.txt.gz|wc -l) ] +# then +# echo "ERROR: PE and SR genotype files have different sizes" +# exit +# fi ##All PE/SR .'s for samples missing RD## ##Add max PE/SR to use for future GQ integration## @@ -88,7 +92,7 @@ join -j 1 -a 1 -e "." -o 1.1 1.2 1.3 1.4 1.5 2.4 2.5 2.6 \ else if ($6>0 && $9==0) print $0 "\t" $6 "\t" $7 "\t" $8 ; \ else if ($6==0 && $9>0) print $0 "\t" $9 "\t" $10 "\t" $11 ; \ else if ($8>=$11) print $0 "\t"$6 "\t" $7 "\t" $8 ; \ - else if ($11>$8) print $0 "\t" $9 "\t" $10 "\t" $11 }' \ + else if ($11>$8) print $0 "\t" $9 "\t" $10 "\t" $11 }' || true \ |gzip \ >RDall.combined.files.txt.gz @@ -106,7 +110,7 @@ join -j 1 -a 2 -e "." -o 2.1 2.2 2.3 1.4 1.5 2.4 2.6 <(zcat rd_indiv_geno.txt.g |tr ' ' '\t' \ |join -j 1 -o 1.1 1.2 1.3 1.4 1.5 1.6 1.7 2.4 2.6 - <(zcat sr_indiv_geno.txt.gz|sort -k1,1) \ |tr ' ' '\t' \ - |gzip \ + |gzip || true \ >PESRall.combined.files.txt.gz ##variant combine## @@ -209,7 +213,7 @@ then | { fgrep -wf gt5kbcnv.del.ids.txt || true; } \ |awk '{if ($5==999) print $0,$4,999; \ else if ($4==$13) print $0,$4,$5+((999-$5) * ($14/999)*0.5); \ - else print $0,$4,$5-(($5-1) * ($14/999)*0.5) }' OFS='\t' \ + else print $0,$4,$5-(($5-1) * ($14/999)*0.5) }' OFS='\t' || true \ |awk '{if ($4==$7 && $4==$10) print $0 "\t" "RD,PE,SR"; \ else if ($4==2 && $4==$7 && $4!=$10) print $0 "\t" "RD,PE" ; \ else if ($4==2 && $4!=$7 && $4==$10) print $0 "\t" "RD,SR" ; \ @@ -217,11 +221,11 @@ then else if ($4!=2 && $7!=2 && $10!=2) print $0 "\t" "RD,PE,SR" ; \ else if ($4!=2 && $7!=2 && $10==2) print $0 "\t" "RD,PE" ; \ else if ($4!=2 && $7==2 && $10!=2) print $0 "\t" "RD,SR" ; \ - else if ($4!=2 && $7==2 && $10==2) print $0 "\t" "RD" }' \ + else if ($4!=2 && $7==2 && $10==2) print $0 "\t" "RD" }' || true \ |cut -f1-6,8-9,11,15- \ |awk '{if ($4>=2) print $1,$2,$3,$4,$5,$6,$7,$8,$9,0,$11,$12; \ else if ($4==1) print $1,$2,$3,$4,$5,$6,$7,$8,$9,1,$11,$12; \ - else if ($4==0) print $1,$2,$3,$4,$5,$6,$7,$8,$9,2,$11,$12; }' \ + else if ($4==0) print $1,$2,$3,$4,$5,$6,$7,$8,$9,2,$11,$12; }' || true \ |tr ' ' '\t' \ >>genotype.indiv.txt @@ -229,7 +233,7 @@ then | { fgrep -wf gt5kbcnv.del.ids.txt || true; } \ |awk '{if ($2==999) print $0,999; \ else if ($3>=$4) print $0,$2+((999-$2) * ($3/999)*0.5); \ - else if ($4>$3) print $0,$2+((999-$2) * ($4/999)*0.5); }' \ + else if ($4>$3) print $0,$2+((999-$2) * ($4/999)*0.5); }' || true \ |tr ' ' '\t' \ >>genotype.variant.txt fi @@ -242,7 +246,7 @@ then | { fgrep -wf gt5kbcnv.dup.ids.txt || true; } \ |awk '{if ($5==999) print $0,$4,999; \ else if ($4==$13) print $0,$4,$5+((999-$5) * ($14/999)*0.5); \ - else print $0,$4,$5-(($5-1) * ($14/999)*0.5) }' OFS='\t' \ + else print $0,$4,$5-(($5-1) * ($14/999)*0.5) }' OFS='\t' || true \ |awk '{if ($4==$7 && $4==$10) print $0 "\t" "RD,PE,SR"; \ else if ($4==2 && $4==$7 && $4!=$10) print $0 "\t" "RD,PE" ; \ else if ($4==2 && $4!=$7 && $4==$10) print $0 "\t" "RD,SR" ; \ @@ -250,10 +254,10 @@ then else if ($4!=2 && $7!=2 && $10!=2) print $0 "\t" "RD,PE,SR" ; \ else if ($4!=2 && $7!=2 && $10==2) print $0 "\t" "RD,PE" ; \ else if ($4!=2 && $7==2 && $10!=2) print $0 "\t" "RD,SR" ; \ - else if ($4!=2 && $7==2 && $10==2) print $0 "\t" "RD" }' \ + else if ($4!=2 && $7==2 && $10==2) print $0 "\t" "RD" }' || true \ |cut -f1-6,8-9,11,15- \ |awk '{if ($4<=2) print $1,$2,$3,$4,$5,$6,$7,$8,$9,0,$11,$12; \ - else if ($4>2) print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10-2,$11,$12}' \ + else if ($4>2) print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10-2,$11,$12}' || true \ |tr ' ' '\t' \ >>genotype.indiv.txt @@ -261,7 +265,7 @@ then | { fgrep -wf gt5kbcnv.dup.ids.txt || true; } \ |awk '{if ($2==999) print $0,999; \ else if ($3>=$4) print $0,$2+((999-$2) * ($3/999)*0.5); \ - else if ($4>$3) print $0,$2+((999-$2) * ($4/999)*0.5); }' \ + else if ($4>$3) print $0,$2+((999-$2) * ($4/999)*0.5); }' || true \ |tr ' ' '\t' \ >>genotype.variant.txt fi @@ -280,7 +284,7 @@ then | { fgrep -wf gt1_5kbcnv.ids.txt || true; } \ |awk '{if ($14==999) print $0,$12,999; \ else if ($4==$13) print $0,$12,$14+((999-$14) * ($5/999)*0.5); \ - else print $0,$12,$14-(($14-1) * ($5/999)*0.5) }' OFS='\t' \ + else print $0,$12,$14-(($14-1) * ($5/999)*0.5) }' OFS='\t' || true \ |awk '{if ($4==$7 && $4==$10) print $0 "\t" "RD,PE,SR"; \ else if ($13==2 && $4==2 && $7==2 && $10!=2) print $0 "\t" "RD,PE" ; \ else if ($13==2 && $4==2 && $7!=2 && $10==2) print $0 "\t" "RD,SR" ; \ @@ -292,7 +296,7 @@ then else if ($13!=2 && $4!=2 && $7==2 && $10!=2) print $0 "\t" "RD,SR" ; \ else if ($13!=2 && $4==2 && $7==2 && $10!=2) print $0 "\t" "SR" ; \ else if ($13!=2 && $4==2 && $7!=2 && $10==2) print $0 "\t" "PE"; \ - else if ($13!=2 && $4==2 && $7!=2 && $10!=2) print $0 "\t" "PE,SR" }' \ + else if ($13!=2 && $4==2 && $7!=2 && $10!=2) print $0 "\t" "PE,SR" }' || true \ |cut -f1-6,8-9,11,15- \ |tr ' ' '\t' \ >>genotype.indiv.txt @@ -309,7 +313,7 @@ fi ###CNV <1kb and removing any CNV with no depth genotype#### { fgrep -wvf <(zcat RDall.combined.files.txt.gz \ |awk -F'\t' '{if ($4==".") print $2}') int.bed || true; } \ - |awk -F'\t' '{if (($5=="DUP" || $5=="DEL") && $3-$2<1000 ) print $4}' \ + |awk -F'\t' '{if (($5=="DUP" || $5=="DEL") && $3-$2<1000 ) print $4}' || true \ >lt1kbcnv.ids.txt if [ $(cat lt1kbcnv.ids.txt|wc -l) -gt 0 ] @@ -320,7 +324,7 @@ then |awk '{if ($14==999) print $0,$12,999; \ else if ($7==$10 && $8>=$11) print $0,$12,$14+((999-$14) * ($11/999)*0.5); \ else if ($7==$10 && $11>=$8) print $0,$12,$14+((999-$14) * ($8/999)*0.5); \ - else print $0 "\t" $12 "\t" $14}' OFS='\t' \ + else print $0 "\t" $12 "\t" $14}' OFS='\t' || true \ |awk '{if ($4==$7 && $4==$10) print $0 "\t" "RD,PE,SR"; \ else if ($13==2 && $4==2 && $7==2 && $10!=2) print $0 "\t" "RD,PE" ; \ else if ($13==2 && $4==2 && $7!=2 && $10==2) print $0 "\t" "RD,SR" ; \ @@ -332,7 +336,7 @@ then else if ($13!=2 && $4==$13 && $7==2 && $10!=2) print $0 "\t" "RD,SR" ; \ else if ($13!=2 && $7==2 && $10!=2) print $0 "\t" "SR" ; \ else if ($13!=2 && $7!=2 && $10==2) print $0 "\t" "PE"; \ - else if ($13!=2 && $7!=2 && $10!=2) print $0 "\t" "PE,SR" }' \ + else if ($13!=2 && $7!=2 && $10!=2) print $0 "\t" "PE,SR" }' || true \ |cut -f1-6,8-9,11,15- \ |tr ' ' '\t' \ >>genotype.indiv.txt @@ -341,7 +345,7 @@ then | { fgrep -wf lt1kbcnv.ids.txt || true; } \ |awk '{if ($3==999 || $4==999) print $0,999; \ else if ($3>=$4) print $0,$3+((999-$3) * ($2/999)*0.5); \ - else if ($4>$3) print $0,$4+((999-$4) * ($2/999)*0.5); }' \ + else if ($4>$3) print $0,$4+((999-$4) * ($2/999)*0.5); }' || true \ |tr ' ' '\t' \ >>genotype.variant.txt fi From aa6c3f386c0ce5f2de53909a78e31003524ce3f0 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 23 Jan 2025 10:49:30 -0500 Subject: [PATCH 12/17] Separated piped commands into distinct steps --- .../scripts/IntegrateGQ.sh | 134 ++++++++++-------- 1 file changed, 78 insertions(+), 56 deletions(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh index 9f45ed3d7..e76772f73 100755 --- a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh +++ b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh @@ -3,7 +3,7 @@ # IntegrateGQ.sh # -# set -euo pipefail +set -euo pipefail vcf=$1 RD_melted_genotypes=$2 @@ -30,55 +30,77 @@ zcat $RD_melted_genotypes \ |gzip \ >rd_indiv_geno.txt.gz -##Deletions, need to PE-SR genotypes to match RD format (2==ref)## ##PE## -zcat $pegeno_indiv_file \ - | { fgrep -wf <(awk '{if ($5=="DEL") print $4}' int.bed) || true; } \ - |awk '{if ($4>1) print $1"@"$2,$1,$2,$4,0,$5;else if ($4==1) print $1"@"$2,$1,$2,$4,1,$5 ;else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \ - |awk '!seen[$1"@"$2]++' \ - >pe_indiv_geno.txt - +zcat "$pegeno_indiv_file" > tmp.pe.zcat + +##Deletions, need to PE-SR genotypes to match RD format (2==ref)## +fgrep -wf <(awk '{if ($5 == "DEL") print $4}' int.bed) tmp.pe.zcat \ + > tmp.pe.del.txt || true + +awk '{ + if ($4>1) print $1"@"$2, $1,$2, $4, 0, $5; + else if ($4==1)print $1"@"$2, $1,$2, $4, 1, $5; + else print $1"@"$2, $1,$2, $4, 2, $5 +}' OFS="\t" tmp.pe.del.txt \ +| awk '!seen[$1"@"$2]++' \ +> pe_indiv_geno.del.txt + ##Duplications and other events, need to PE-SR genotypes to match RD (2==ref)## -count_nonDEL=$(awk '{if ($5!="DEL") print $4}' int.bed | wc -l) -if [ "$count_nonDEL" -gt 0 ]; then - zcat "$pegeno_indiv_file" \ - | { fgrep -wf <(awk '{if ($5!="DEL") print $4}' int.bed) || true; } \ - | awk '{if ($4>0) print $1"@"$2,$1,$2,$4,$4+2,$5; else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \ - | awk '!seen[$1"@"$2]++' \ - >> pe_indiv_geno.txt -fi -sort -k1,1 pe_indiv_geno.txt|gzip>pe_indiv_geno.txt.gz +fgrep -wf <(awk '{if ($5 != "DEL") print $4}' int.bed) tmp.pe.zcat \ + > tmp.pe.nodel.txt || true + +awk '{ + if ($4>0) print $1"@"$2, $1,$2, $4, $4+2, $5; + else print $1"@"$2, $1,$2, $4, 2, $5 +}' OFS="\t" tmp.pe.nodel.txt \ +| awk '!seen[$1"@"$2]++' \ +> pe_indiv_geno.nodel.txt + +cat pe_indiv_geno.del.txt pe_indiv_geno.nodel.txt > pe_indiv_geno.txt +sort -k1,1 pe_indiv_geno.txt | gzip > pe_indiv_geno.txt.gz + +rm -f tmp.pe.zcat tmp.pe.del.txt tmp.pe.nodel.txt pe_indiv_geno.del.txt pe_indiv_geno.nodel.txt pe_indiv_geno.txt -rm pe_indiv_geno.txt ##SR## -zcat $srgeno_indiv_file \ - | { fgrep -wf <(awk '{if ($5=="DEL") print $4}' int.bed) || true; } \ - |awk '{if ($4>1) print $1"@"$2,$1,$2,$4,0,$5;else if ($4==1) print $1"@"$2,$1,$2,$4,1,$5 ;else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \ - |awk '!seen[$1"@"$2]++' \ - >sr_indiv_geno.txt +zcat "$srgeno_indiv_file" > tmp.sr.zcat + +##Deletion events, need to PE-SR genotypes to match RD format (2==ref)## +fgrep -wf <(awk '{if ($5=="DEL") print $4}' int.bed) tmp.sr.zcat \ + > tmp.sr.del.txt || true + +awk '{ + if ($4>1) print $1"@"$2,$1,$2,$4,0,$5; + else if ($4==1)print $1"@"$2,$1,$2,$4,1,$5; + else print $1"@"$2,$1,$2,$4,2,$5 +}' OFS="\t" tmp.sr.del.txt \ +| awk '!seen[$1"@"$2]++' \ +> sr_indiv_geno.del.txt ##Duplications and other events, need to PE-SR genotysrs to match RD (2==ref)## -count_nonDEL2=$(awk '{if ($5!="DEL") print $4}' int.bed | wc -l) -if [ "$count_nonDEL2" -gt 0 ]; then - zcat "$srgeno_indiv_file" \ - | { fgrep -wf <(awk '{if ($5!="DEL") print $4}' int.bed) || true; } \ - | awk '{if ($4>0) print $1"@"$2,$1,$2,$4,$4+2,$5; else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \ - | awk '!seen[$1"@"$2]++' \ - >> sr_indiv_geno.txt -fi - -sort -k1,1 sr_indiv_geno.txt|gzip>sr_indiv_geno.txt.gz +fgrep -wf <(awk '{if ($5!="DEL") print $4}' int.bed) tmp.sr.zcat \ + > tmp.sr.nodel.txt || true + +awk '{ + if ($4>0) print $1"@"$2,$1,$2,$4,$4+2,$5; + else print $1"@"$2,$1,$2,$4,2,$5 +}' OFS="\t" tmp.sr.nodel.txt \ +| awk '!seen[$1"@"$2]++' \ +> sr_indiv_geno.nodel.txt + +cat sr_indiv_geno.del.txt sr_indiv_geno.nodel.txt > sr_indiv_geno.txt +sort -k1,1 sr_indiv_geno.txt | gzip > sr_indiv_geno.txt.gz + +rm -f tmp.sr.zcat tmp.sr.nodel.txt tmp.sr.del.txt sr_indiv_geno.del.txt sr_indiv_geno.nodel.txt sr_indiv_geno.txt -rm sr_indiv_geno.txt ##check to make sure PE and SR are same size which they should be## -# if [ $(zcat sr_indiv_geno.txt.gz|wc -l) != $(zcat pe_indiv_geno.txt.gz|wc -l) ] -# then -# echo "ERROR: PE and SR genotype files have different sizes" -# exit -# fi +if [ $(zcat sr_indiv_geno.txt.gz|wc -l) != $(zcat pe_indiv_geno.txt.gz|wc -l) ] +then + echo "ERROR: PE and SR genotype files have different sizes" + exit +fi ##All PE/SR .'s for samples missing RD## ##Add max PE/SR to use for future GQ integration## @@ -92,7 +114,7 @@ join -j 1 -a 1 -e "." -o 1.1 1.2 1.3 1.4 1.5 2.4 2.5 2.6 \ else if ($6>0 && $9==0) print $0 "\t" $6 "\t" $7 "\t" $8 ; \ else if ($6==0 && $9>0) print $0 "\t" $9 "\t" $10 "\t" $11 ; \ else if ($8>=$11) print $0 "\t"$6 "\t" $7 "\t" $8 ; \ - else if ($11>$8) print $0 "\t" $9 "\t" $10 "\t" $11 }' || true \ + else if ($11>$8) print $0 "\t" $9 "\t" $10 "\t" $11 }' \ |gzip \ >RDall.combined.files.txt.gz @@ -110,7 +132,7 @@ join -j 1 -a 2 -e "." -o 2.1 2.2 2.3 1.4 1.5 2.4 2.6 <(zcat rd_indiv_geno.txt.g |tr ' ' '\t' \ |join -j 1 -o 1.1 1.2 1.3 1.4 1.5 1.6 1.7 2.4 2.6 - <(zcat sr_indiv_geno.txt.gz|sort -k1,1) \ |tr ' ' '\t' \ - |gzip || true \ + |gzip \ >PESRall.combined.files.txt.gz ##variant combine## @@ -213,7 +235,7 @@ then | { fgrep -wf gt5kbcnv.del.ids.txt || true; } \ |awk '{if ($5==999) print $0,$4,999; \ else if ($4==$13) print $0,$4,$5+((999-$5) * ($14/999)*0.5); \ - else print $0,$4,$5-(($5-1) * ($14/999)*0.5) }' OFS='\t' || true \ + else print $0,$4,$5-(($5-1) * ($14/999)*0.5) }' OFS='\t' \ |awk '{if ($4==$7 && $4==$10) print $0 "\t" "RD,PE,SR"; \ else if ($4==2 && $4==$7 && $4!=$10) print $0 "\t" "RD,PE" ; \ else if ($4==2 && $4!=$7 && $4==$10) print $0 "\t" "RD,SR" ; \ @@ -221,11 +243,11 @@ then else if ($4!=2 && $7!=2 && $10!=2) print $0 "\t" "RD,PE,SR" ; \ else if ($4!=2 && $7!=2 && $10==2) print $0 "\t" "RD,PE" ; \ else if ($4!=2 && $7==2 && $10!=2) print $0 "\t" "RD,SR" ; \ - else if ($4!=2 && $7==2 && $10==2) print $0 "\t" "RD" }' || true \ + else if ($4!=2 && $7==2 && $10==2) print $0 "\t" "RD" }' \ |cut -f1-6,8-9,11,15- \ |awk '{if ($4>=2) print $1,$2,$3,$4,$5,$6,$7,$8,$9,0,$11,$12; \ else if ($4==1) print $1,$2,$3,$4,$5,$6,$7,$8,$9,1,$11,$12; \ - else if ($4==0) print $1,$2,$3,$4,$5,$6,$7,$8,$9,2,$11,$12; }' || true \ + else if ($4==0) print $1,$2,$3,$4,$5,$6,$7,$8,$9,2,$11,$12; }' \ |tr ' ' '\t' \ >>genotype.indiv.txt @@ -233,7 +255,7 @@ then | { fgrep -wf gt5kbcnv.del.ids.txt || true; } \ |awk '{if ($2==999) print $0,999; \ else if ($3>=$4) print $0,$2+((999-$2) * ($3/999)*0.5); \ - else if ($4>$3) print $0,$2+((999-$2) * ($4/999)*0.5); }' || true \ + else if ($4>$3) print $0,$2+((999-$2) * ($4/999)*0.5); }' \ |tr ' ' '\t' \ >>genotype.variant.txt fi @@ -246,7 +268,7 @@ then | { fgrep -wf gt5kbcnv.dup.ids.txt || true; } \ |awk '{if ($5==999) print $0,$4,999; \ else if ($4==$13) print $0,$4,$5+((999-$5) * ($14/999)*0.5); \ - else print $0,$4,$5-(($5-1) * ($14/999)*0.5) }' OFS='\t' || true \ + else print $0,$4,$5-(($5-1) * ($14/999)*0.5) }' OFS='\t' \ |awk '{if ($4==$7 && $4==$10) print $0 "\t" "RD,PE,SR"; \ else if ($4==2 && $4==$7 && $4!=$10) print $0 "\t" "RD,PE" ; \ else if ($4==2 && $4!=$7 && $4==$10) print $0 "\t" "RD,SR" ; \ @@ -254,10 +276,10 @@ then else if ($4!=2 && $7!=2 && $10!=2) print $0 "\t" "RD,PE,SR" ; \ else if ($4!=2 && $7!=2 && $10==2) print $0 "\t" "RD,PE" ; \ else if ($4!=2 && $7==2 && $10!=2) print $0 "\t" "RD,SR" ; \ - else if ($4!=2 && $7==2 && $10==2) print $0 "\t" "RD" }' || true \ + else if ($4!=2 && $7==2 && $10==2) print $0 "\t" "RD" }' \ |cut -f1-6,8-9,11,15- \ |awk '{if ($4<=2) print $1,$2,$3,$4,$5,$6,$7,$8,$9,0,$11,$12; \ - else if ($4>2) print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10-2,$11,$12}' || true \ + else if ($4>2) print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10-2,$11,$12}' \ |tr ' ' '\t' \ >>genotype.indiv.txt @@ -265,7 +287,7 @@ then | { fgrep -wf gt5kbcnv.dup.ids.txt || true; } \ |awk '{if ($2==999) print $0,999; \ else if ($3>=$4) print $0,$2+((999-$2) * ($3/999)*0.5); \ - else if ($4>$3) print $0,$2+((999-$2) * ($4/999)*0.5); }' || true \ + else if ($4>$3) print $0,$2+((999-$2) * ($4/999)*0.5); }' \ |tr ' ' '\t' \ >>genotype.variant.txt fi @@ -284,7 +306,7 @@ then | { fgrep -wf gt1_5kbcnv.ids.txt || true; } \ |awk '{if ($14==999) print $0,$12,999; \ else if ($4==$13) print $0,$12,$14+((999-$14) * ($5/999)*0.5); \ - else print $0,$12,$14-(($14-1) * ($5/999)*0.5) }' OFS='\t' || true \ + else print $0,$12,$14-(($14-1) * ($5/999)*0.5) }' OFS='\t' \ |awk '{if ($4==$7 && $4==$10) print $0 "\t" "RD,PE,SR"; \ else if ($13==2 && $4==2 && $7==2 && $10!=2) print $0 "\t" "RD,PE" ; \ else if ($13==2 && $4==2 && $7!=2 && $10==2) print $0 "\t" "RD,SR" ; \ @@ -296,7 +318,7 @@ then else if ($13!=2 && $4!=2 && $7==2 && $10!=2) print $0 "\t" "RD,SR" ; \ else if ($13!=2 && $4==2 && $7==2 && $10!=2) print $0 "\t" "SR" ; \ else if ($13!=2 && $4==2 && $7!=2 && $10==2) print $0 "\t" "PE"; \ - else if ($13!=2 && $4==2 && $7!=2 && $10!=2) print $0 "\t" "PE,SR" }' || true \ + else if ($13!=2 && $4==2 && $7!=2 && $10!=2) print $0 "\t" "PE,SR" }' \ |cut -f1-6,8-9,11,15- \ |tr ' ' '\t' \ >>genotype.indiv.txt @@ -313,7 +335,7 @@ fi ###CNV <1kb and removing any CNV with no depth genotype#### { fgrep -wvf <(zcat RDall.combined.files.txt.gz \ |awk -F'\t' '{if ($4==".") print $2}') int.bed || true; } \ - |awk -F'\t' '{if (($5=="DUP" || $5=="DEL") && $3-$2<1000 ) print $4}' || true \ + |awk -F'\t' '{if (($5=="DUP" || $5=="DEL") && $3-$2<1000 ) print $4}' \ >lt1kbcnv.ids.txt if [ $(cat lt1kbcnv.ids.txt|wc -l) -gt 0 ] @@ -324,7 +346,7 @@ then |awk '{if ($14==999) print $0,$12,999; \ else if ($7==$10 && $8>=$11) print $0,$12,$14+((999-$14) * ($11/999)*0.5); \ else if ($7==$10 && $11>=$8) print $0,$12,$14+((999-$14) * ($8/999)*0.5); \ - else print $0 "\t" $12 "\t" $14}' OFS='\t' || true \ + else print $0 "\t" $12 "\t" $14}' OFS='\t' \ |awk '{if ($4==$7 && $4==$10) print $0 "\t" "RD,PE,SR"; \ else if ($13==2 && $4==2 && $7==2 && $10!=2) print $0 "\t" "RD,PE" ; \ else if ($13==2 && $4==2 && $7!=2 && $10==2) print $0 "\t" "RD,SR" ; \ @@ -336,7 +358,7 @@ then else if ($13!=2 && $4==$13 && $7==2 && $10!=2) print $0 "\t" "RD,SR" ; \ else if ($13!=2 && $7==2 && $10!=2) print $0 "\t" "SR" ; \ else if ($13!=2 && $7!=2 && $10==2) print $0 "\t" "PE"; \ - else if ($13!=2 && $7!=2 && $10!=2) print $0 "\t" "PE,SR" }' || true \ + else if ($13!=2 && $7!=2 && $10!=2) print $0 "\t" "PE,SR" }' \ |cut -f1-6,8-9,11,15- \ |tr ' ' '\t' \ >>genotype.indiv.txt @@ -345,7 +367,7 @@ then | { fgrep -wf lt1kbcnv.ids.txt || true; } \ |awk '{if ($3==999 || $4==999) print $0,999; \ else if ($3>=$4) print $0,$3+((999-$3) * ($2/999)*0.5); \ - else if ($4>$3) print $0,$4+((999-$4) * ($2/999)*0.5); }' || true \ + else if ($4>$3) print $0,$4+((999-$4) * ($2/999)*0.5); }' \ |tr ' ' '\t' \ >>genotype.variant.txt fi @@ -382,4 +404,4 @@ cat genotype.indiv.txt \ cat genotype.variant.txt \ |awk '{if ($2==0) $2=1;if ($3==0) $3=1; if ($4==0) $4=1; if ($5==0) $5=1; print}' OFS="\t" \ |gzip \ - >genotype.variant.txt.gz + >genotype.variant.txt.gz \ No newline at end of file From ed2a5b1ca5af4cd2ea19b0f0f770960fe9c40781 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 23 Jan 2025 10:50:50 -0500 Subject: [PATCH 13/17] Added extra line --- src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh index e76772f73..e01f260c8 100755 --- a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh +++ b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh @@ -404,4 +404,5 @@ cat genotype.indiv.txt \ cat genotype.variant.txt \ |awk '{if ($2==0) $2=1;if ($3==0) $3=1; if ($4==0) $4=1; if ($5==0) $5=1; print}' OFS="\t" \ |gzip \ - >genotype.variant.txt.gz \ No newline at end of file + >genotype.variant.txt.gz + \ No newline at end of file From 82239a0061d1281c336b2a1ec01090c36b3b595b Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 24 Jan 2025 11:18:55 -0500 Subject: [PATCH 14/17] Modified to use ARGIND --- .../scripts/IntegrateGQ.sh | 107 ++++++++---------- 1 file changed, 48 insertions(+), 59 deletions(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh index e01f260c8..4b618a38b 100755 --- a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh +++ b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh @@ -31,67 +31,56 @@ zcat $RD_melted_genotypes \ >rd_indiv_geno.txt.gz ##PE## -zcat "$pegeno_indiv_file" > tmp.pe.zcat - -##Deletions, need to PE-SR genotypes to match RD format (2==ref)## -fgrep -wf <(awk '{if ($5 == "DEL") print $4}' int.bed) tmp.pe.zcat \ - > tmp.pe.del.txt || true - -awk '{ - if ($4>1) print $1"@"$2, $1,$2, $4, 0, $5; - else if ($4==1)print $1"@"$2, $1,$2, $4, 1, $5; - else print $1"@"$2, $1,$2, $4, 2, $5 -}' OFS="\t" tmp.pe.del.txt \ -| awk '!seen[$1"@"$2]++' \ -> pe_indiv_geno.del.txt - -##Duplications and other events, need to PE-SR genotypes to match RD (2==ref)## -fgrep -wf <(awk '{if ($5 != "DEL") print $4}' int.bed) tmp.pe.zcat \ - > tmp.pe.nodel.txt || true - -awk '{ - if ($4>0) print $1"@"$2, $1,$2, $4, $4+2, $5; - else print $1"@"$2, $1,$2, $4, 2, $5 -}' OFS="\t" tmp.pe.nodel.txt \ -| awk '!seen[$1"@"$2]++' \ -> pe_indiv_geno.nodel.txt - -cat pe_indiv_geno.del.txt pe_indiv_geno.nodel.txt > pe_indiv_geno.txt -sort -k1,1 pe_indiv_geno.txt | gzip > pe_indiv_geno.txt.gz - -rm -f tmp.pe.zcat tmp.pe.del.txt tmp.pe.nodel.txt pe_indiv_geno.del.txt pe_indiv_geno.nodel.txt pe_indiv_geno.txt - +zcat "$pegeno_indiv_file" | \ +awk -v OFS="\t" ' + ARGIND==1 { + if ($5 == "DEL") { + del[$4] # e.g. del["variant_id"] + } else { + nodel[$4] + } + next + } + + ARGIND==2 { + if ($1 in del) { + final_gt = ($4>1 ? 0 : ($4==1 ? 1 : 2)) + print $1"@"$2, $1, $2, $4, final_gt, $5 + } else if ($1 in nodel) { + final_gt = ($4>0 ? $4+2 : 2) + print $1"@"$2, $1, $2, $4, final_gt, $5 + } + # If $1 not in either array, do nothing. + } +' int.bed - \ +| awk '!seen[$1]++' \ +| sort -k1,1 \ +| gzip > pe_indiv_geno.txt.gz ##SR## -zcat "$srgeno_indiv_file" > tmp.sr.zcat - -##Deletion events, need to PE-SR genotypes to match RD format (2==ref)## -fgrep -wf <(awk '{if ($5=="DEL") print $4}' int.bed) tmp.sr.zcat \ - > tmp.sr.del.txt || true - -awk '{ - if ($4>1) print $1"@"$2,$1,$2,$4,0,$5; - else if ($4==1)print $1"@"$2,$1,$2,$4,1,$5; - else print $1"@"$2,$1,$2,$4,2,$5 -}' OFS="\t" tmp.sr.del.txt \ -| awk '!seen[$1"@"$2]++' \ -> sr_indiv_geno.del.txt - -##Duplications and other events, need to PE-SR genotysrs to match RD (2==ref)## -fgrep -wf <(awk '{if ($5!="DEL") print $4}' int.bed) tmp.sr.zcat \ - > tmp.sr.nodel.txt || true - -awk '{ - if ($4>0) print $1"@"$2,$1,$2,$4,$4+2,$5; - else print $1"@"$2,$1,$2,$4,2,$5 -}' OFS="\t" tmp.sr.nodel.txt \ -| awk '!seen[$1"@"$2]++' \ -> sr_indiv_geno.nodel.txt - -cat sr_indiv_geno.del.txt sr_indiv_geno.nodel.txt > sr_indiv_geno.txt -sort -k1,1 sr_indiv_geno.txt | gzip > sr_indiv_geno.txt.gz - -rm -f tmp.sr.zcat tmp.sr.nodel.txt tmp.sr.del.txt sr_indiv_geno.del.txt sr_indiv_geno.nodel.txt sr_indiv_geno.txt +zcat "$srgeno_indiv_file" | \ +awk -v OFS="\t" ' + ARGIND==1 { + if ($5 == "DEL") { + del[$4] + } else { + nodel[$4] + } + next + } + ARGIND==2 { + if ($1 in del) { + final_gt = ($4>1 ? 0 : ($4==1 ? 1 : 2)) + print $1"@"$2, $1, $2, $4, final_gt, $5 + } else if ($1 in nodel) { + final_gt = ($4>0 ? $4+2 : 2) + print $1"@"$2, $1, $2, $4, final_gt, $5 + } + } +' int.bed - \ +| awk '!seen[$1]++' \ +| sort -k1,1 \ +| gzip > sr_indiv_geno.txt.gz ##check to make sure PE and SR are same size which they should be## From 73497801bf670eeb25cd83ecd83ef3335b1cb190 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 24 Jan 2025 18:50:58 -0500 Subject: [PATCH 15/17] Removed extra line in pe test --- src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh index 4b618a38b..f54493b4b 100755 --- a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh +++ b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh @@ -41,7 +41,6 @@ awk -v OFS="\t" ' } next } - ARGIND==2 { if ($1 in del) { final_gt = ($4>1 ? 0 : ($4==1 ? 1 : 2)) From 6512861a92d0ac009fff4f0f4b7f958ae231b733 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 24 Jan 2025 18:51:26 -0500 Subject: [PATCH 16/17] Removed extra space at the end of script --- src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh index f54493b4b..2e5f66df5 100755 --- a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh +++ b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh @@ -393,4 +393,3 @@ cat genotype.variant.txt \ |awk '{if ($2==0) $2=1;if ($3==0) $3=1; if ($4==0) $4=1; if ($5==0) $5=1; print}' OFS="\t" \ |gzip \ >genotype.variant.txt.gz - \ No newline at end of file From bd4473fe6d1024b521c14154d4aba9c64ddee6f3 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Tue, 28 Jan 2025 15:16:33 -0500 Subject: [PATCH 17/17] Readded comments for user clarity --- .../04_variant_resolution/scripts/IntegrateGQ.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh index 2e5f66df5..3bdea0fb3 100755 --- a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh +++ b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh @@ -35,7 +35,7 @@ zcat "$pegeno_indiv_file" | \ awk -v OFS="\t" ' ARGIND==1 { if ($5 == "DEL") { - del[$4] # e.g. del["variant_id"] + del[$4] } else { nodel[$4] } @@ -43,13 +43,14 @@ awk -v OFS="\t" ' } ARGIND==2 { if ($1 in del) { + ##Deletions, need to PE-SR genotypes to match RD format (2==ref)## final_gt = ($4>1 ? 0 : ($4==1 ? 1 : 2)) print $1"@"$2, $1, $2, $4, final_gt, $5 } else if ($1 in nodel) { + ##Duplications and other events, need to PE-SR genotypes to match RD (2==ref)## final_gt = ($4>0 ? $4+2 : 2) print $1"@"$2, $1, $2, $4, final_gt, $5 } - # If $1 not in either array, do nothing. } ' int.bed - \ | awk '!seen[$1]++' \ @@ -69,9 +70,11 @@ awk -v OFS="\t" ' } ARGIND==2 { if ($1 in del) { + ##Deletions, need to PE-SR genotypes to match RD format (2==ref)## final_gt = ($4>1 ? 0 : ($4==1 ? 1 : 2)) print $1"@"$2, $1, $2, $4, final_gt, $5 } else if ($1 in nodel) { + ##Duplications and other events, need to PE-SR genotysrs to match RD (2==ref)## final_gt = ($4>0 ? $4+2 : 2) print $1"@"$2, $1, $2, $4, final_gt, $5 }