Skip to content
This repository has been archived by the owner on Mar 11, 2019. It is now read-only.

Commit

Permalink
Added tests for VCF
Browse files Browse the repository at this point in the history
  • Loading branch information
Luis Francisco Hernández Sánchez authored and Luis Francisco Hernández Sánchez committed Mar 23, 2018
1 parent 6ff56aa commit 24eb20f
Show file tree
Hide file tree
Showing 3 changed files with 218 additions and 21 deletions.
186 changes: 186 additions & 0 deletions resources/input/GeneticVariants/VCF/CysticFibrosis.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
##fileformat=VCFv4.3
##fileDate=20171205
##source=theImputationPipeline
##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta
##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="Homo sapiens",taxonomy=x>
##phasing=partial
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
##FILTER=<ID=q10,Description="Quality below 10">
##FILTER=<ID=s50,Description="Less than 50% of samples have data">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
1 94508323 . C
1 94517254 . C
1 203152801 . T
1 205899595 . C
1 205914757 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
1 223285200 . A
1 223285200 . A
1 223285200 . A
2 113588793 . C
2 113590390 . A
3 195529118 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
3 195529118 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
4 38798648 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
4 38798648 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
5 428236 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 . C
5 428236 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 . C
5 518434 . C
5 142779317 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
5 142779317 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
5 142779317 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
5 142779317 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
5 142779317 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
5 142779317 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
5 142779317 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
5 142779317 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
5 148206473 . C
6 26091179 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
6 26091179 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
6 26091179 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
6 26091179 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
6 26091179 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
6 26091179 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
6 26091179 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
6 26091179 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
6 26091179 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
7 73118196 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 . C
7 73118196 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 . C
7 73122923 . C
7 73122923 . C
7 117144378 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
7 117144378 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
7 117149143 . C
7 117149143 . C
7 117149144 . C
7 117149144 . C
7 117149147 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
7 117149147 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
7 117171028 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
7 117171028 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
7 117171029 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
7 117171029 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
7 117171122 . C
7 117171122 . C
7 117171169 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
7 117171169 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
7 117175339 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
7 117175339 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
7 117175372 . C
7 117175372 . C
7 117180174 . C
7 117180174 . C
7 117180186 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 . C
7 117180186 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 . C
7 117180297 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
7 117180297 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
7 117180327 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
7 117180327 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
7 117180330 . A
7 117180330 . A
7 117188812 . C
7 117188812 . C
7 117188849 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
7 117188849 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
7 117199648 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
7 117199648 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
7 117199683 . C
7 117199683 . C
7 117227792 . C
7 117227792 . C
7 117227832 . C
7 117227832 . C
7 117227860 . A
7 117227860 . A
7 117227874 . C
7 117227874 . C
7 117230283 . C
7 117230283 . C
7 117230454 . C
7 117230454 . C
7 117232086 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 . C
7 117232086 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 . C
7 117232223 . C
7 117232223 . C
7 117232470 . A
7 117232470 . A
7 117232481 . T
7 117232481 . T
7 117234999 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
7 117234999 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
7 117243663 . C
7 117243663 . C
7 117250575 . C
7 117250575 . C
7 117251649 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
7 117251649 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
7 117251704 . C
7 117251704 . C
7 117254753 . C
7 117254753 . C
7 117254767 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
7 117254767 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
7 117267556 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 . C
7 117267556 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 . C
7 117267812 . C
7 117267812 . C
7 117282582 . C
7 117282582 . C
7 117282620 . A
7 117282620 . A
7 117304834 . C
7 117304834 . C
7 117306991 . T
7 117306991 . T
10 114758349 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
10 114758349 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
10 114758349 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
10 114758349 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
10 114758349 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
10 114758349 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
10 114758349 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
10 114758349 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
10 114758349 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
11 72945341 . T
11 72945341 . T
11 72945341 . T
11 72946140 . A
11 72946140 . A
11 72946140 . A
11 72946204 . C
11 72946204 . C
11 72946204 . C
12 1866204 . C
12 6458350 . A
12 6458350 . A
12 6458350 . A
13 47469940 . A
13 47469940 . A
13 47469940 . A
14 70517183 . C
14 70517183 . C
14 70517183 . C
14 70517183 . C
14 70517183 . C
14 70517183 . C
14 70517183 . C
14 70517183 . C
16 23200921 . T
19 6919624 . C
19 6919624 . C
19 6919624 . C
19 6919624 . C
19 6919624 . C
19 41858921 . T
19 41858921 . T
19 41860296 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 . C
19 41860296 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 . C
20 31876681 . T
34 changes: 15 additions & 19 deletions src/main/java/no/uib/pap/pathwaymatcher/PathwayMatcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSetMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.SetMultimap;
import com.google.common.collect.TreeMultimap;
import com.google.common.io.Files;
import no.uib.pap.methods.analysis.ora.Analysis;
Expand All @@ -24,6 +24,7 @@
import static no.uib.pap.model.Error.sendError;
import static no.uib.pap.model.InputPatterns.matches_ChrBp;
import static no.uib.pap.model.InputPatterns.matches_Rsid;
import static no.uib.pap.model.InputPatterns.matches_Vcf_Record;
import static no.uib.pap.model.Warning.EMPTY_ROW;
import static no.uib.pap.model.Warning.INVALID_ROW;
import static no.uib.pap.model.Warning.sendWarning;
Expand Down Expand Up @@ -218,6 +219,7 @@ public static void main(String args[]) {
break;
case RSID:
case RSIDS:

HashSet<String> rsIdSet = new HashSet<>();
// Get the unique set of Variants
int row = 0;
Expand All @@ -234,7 +236,7 @@ public static void main(String args[]) {
rsIdSet.add(rsid);
}

outputSearchWithRsid();
outputSearchWithRsidHeader();
for (int chr = 1; chr <= 22; chr++) {
System.out.println("Loading data for chromosome " + chr);
imapRsIdsToProteins = (ImmutableSetMultimap<String, String>) getSerializedObject("imapRsIdsToProteins" + chr + ".gz");
Expand All @@ -252,23 +254,28 @@ public static void main(String args[]) {
break;
case CHRBP:
case CHRBPS:
case VCF:
TreeMultimap<Integer, Long> chrBpMap = TreeMultimap.create();
Snp snp = null;
row = 0;
for (String line : input) {
row++;
if (line.isEmpty()) {
sendWarning(EMPTY_ROW, row);
continue;
}
if (!matches_ChrBp(line)) {
if (line.startsWith("#")) {
continue;
}
if (!matches_ChrBp(line) && !matches_Vcf_Record(line)) {
sendWarning(INVALID_ROW, row);
continue;
}
Snp snp = getSnpFromChrBp(line);
snp = getSnpFromChrBp(line);
chrBpMap.put(snp.getChr(), snp.getBp());
}
outputSearchWithChrBp();
for (int chr = 1; chr <= 22; chr++) {
outputSearchWithChrBpHeader();
for(int chr : chrBpMap.keySet()){
System.out.println("Loading data for chromosome " + chr);
imapChrBpToProteins = (ImmutableSetMultimap<Long, String>) getSerializedObject("imapChrBpToProteins" + chr + ".gz");
searchResult = Search.searchWithChrBp(chr, chrBpMap.get(chr), iReactions, iPathways, imapChrBpToProteins,
Expand All @@ -281,17 +288,6 @@ public static void main(String args[]) {
analysisResult = Analysis.analysis(iPathways, imapProteinsToReactions.keySet().size(),
hitProteins, hitPathways);
break;
case VCF:
imapChrBpToProteins = (ImmutableSetMultimap<Long, String>) getSerializedObject("imapChrBpToProteins.gz");
searchResult = Search.searchWithVCF(input, iReactions, iPathways, imapChrBpToProteins,
imapProteinsToReactions, imapReactionsToPathways, imapPathwaysToTopLevelPathways,
commandLine.hasOption("tlp"), hitProteins, hitPathways);
outputSearchWithUniProt(searchResult.getKey());
System.out.println("Matching results writen to: " + outputPath + "search.csv");
System.out.println("Starting ORA analysis...");
analysisResult = Analysis.analysis(iPathways, imapProteinsToReactions.keySet().size(),
hitProteins, hitPathways);
break;
case PEPTIDE:
case PEPTIDES:
searchResult = Search.searchWithPeptide(input, iReactions, iPathways, imapProteinsToReactions,
Expand Down Expand Up @@ -549,7 +545,7 @@ private static void outputSearchWithUniProt(List<String[]> searchResult) throws
writeSearchResults(searchResult);
}

private static void outputSearchWithRsid() throws IOException {
private static void outputSearchWithRsidHeader() throws IOException {

outputSearch.write("RSID" + separator + "UNIPROT" + separator + "REACTION_STID" + separator + "REACTION_DISPLAY_NAME" + separator
+ "PATHWAY_STID" + separator + "PATHWAY_DISPLAY_NAME");
Expand All @@ -560,7 +556,7 @@ private static void outputSearchWithRsid() throws IOException {
outputSearch.newLine();
}

private static void outputSearchWithChrBp() throws IOException {
private static void outputSearchWithChrBpHeader() throws IOException {

outputSearch.write("CHROMOSOME" + separator + "BASE_PAIR" + separator + "UNIPROT" + separator + "REACTION_STID" + separator + "REACTION_DISPLAY_NAME" + separator
+ "PATHWAY_STID" + separator + "PATHWAY_DISPLAY_NAME");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,7 @@ class PathwayMatcherGeneticVariantsTest {
public void GIANTTest() throws IOException {
String[] args = {"-t", "rsids",
"-o", "output/",
"-i", "resources/input/GeneticVariants/RsId/GIANT.csv",
"-tlp"};
"-i", "resources/input/GeneticVariants/RsId/GIANT.csv"};
PathwayMatcher.main(args);

// Check the output file
Expand Down Expand Up @@ -63,6 +62,22 @@ public void cysticFibrosisWithChrAndBpTest() throws IOException {
assertEquals(206, statistics.size());
}

/**
 * Runs {@code PathwayMatcher.main} on the cystic fibrosis variants given as a VCF file
 * and checks how many lines end up in the search and analysis output files.
 *
 * @throws IOException if one of the output files cannot be read
 */
@Test
public void cysticFibrosisWithVCFTest() throws IOException {
    // Input type "vcf", the VCF fixture as input, "output/" as output directory, plus the "-tlp" flag.
    PathwayMatcher.main(new String[]{
            "-t", "vcf",
            "-i", "resources/input/GeneticVariants/VCF/CysticFibrosis.txt",
            "-o", "output/",
            "-tlp"});

    // Verify the number of lines in the search output
    File searchFile = new File("output/search.tsv");
    List<String> searchLines = Files.readLines(searchFile, Charset.defaultCharset());
    assertEquals(6104, searchLines.size());

    // Verify the number of lines in the analysis output
    File analysisFile = new File("output/analysis.tsv");
    List<String> analysisLines = Files.readLines(analysisFile, Charset.defaultCharset());
    assertEquals(206, analysisLines.size());
}

@Test
public void diabetesTest() throws IOException {
String[] args = {"-t", "rsids",
Expand Down

0 comments on commit 24eb20f

Please sign in to comment.