diff --git a/code/pamu.cpp b/code/pamu.cpp index b05381f..9cc34cc 100644 --- a/code/pamu.cpp +++ b/code/pamu.cpp @@ -55,6 +55,7 @@ int main(int argc, char* argv[]) ////input data if (par::read_bitfile) P.readBinData(); else if(par::read_ped) P.readPedData(); + else if(par::read_vcf) P.readVcfData(); ///get covariate data if(par::cov_read) P.readCovData(); @@ -167,6 +168,11 @@ void Pamu::readPedData() _data->readGenMapPed(); } +void Pamu::readVcfData() +{ + _data->readVcfData(); +} + void Pamu::readCovData() { _data->readCovFile(par::cov_file); diff --git a/code/pamu.h b/code/pamu.h index f44237a..fb30097 100644 --- a/code/pamu.h +++ b/code/pamu.h @@ -53,6 +53,7 @@ class Pamu void printLOG(string); void readBinData(); void readPedData(); + void readVcfData(); void readCovData(); void readPheData(); void readPheMat(); diff --git a/code/parset.cpp b/code/parset.cpp index 04befc8..cbf641f 100644 --- a/code/parset.cpp +++ b/code/parset.cpp @@ -83,6 +83,13 @@ void gfun::setPar(CArg &a) par::genfile = par::fileroot + ".gen"; } + if (a.find("--vcf")) + { + par::read_vcf = true; + par::fileroot = a.value("--vcf"); + par::vcffile = par::fileroot + ".vcf"; + } + if(a.find("--hwu")) { par::hwu_run=true; @@ -642,6 +649,8 @@ bool par::silent=false; double par::replace0_cor=0.5; double par::replace0_sml=0.00001; bool par::read_bitfile=false; + + string par::fileroot=""; string par::bitfilename=""; string par::bitfilename_map=""; @@ -665,6 +674,9 @@ string par::mapfile = ""; string par::genfile = ""; bool par:: recode=false; +bool par::read_vcf = false; +string par::vcffile = ""; + int par::most_nsnp=10; double par::largest_auc=0.9; bool par::out_nom_LR=false; diff --git a/code/parset.h b/code/parset.h index 524bd4c..eaf8f9e 100644 --- a/code/parset.h +++ b/code/parset.h @@ -40,11 +40,16 @@ class par static double replace0_sml;// =0.00001 static bool read_bitfile; + static string fileroot; static string bitfilename; static string bitfilename_map; static string famfile; + static bool read_vcf; + static string vcffile; + + static bool qt;//quantitative trait static bool bt;//binary trait diff --git a/code/snpdt.cpp b/code/snpdt.cpp index 10ec0ab..92f1661 100644 --- a/code/snpdt.cpp +++ b/code/snpdt.cpp @@ -1,4 +1,16 @@ #include "snpdt.h" +#include + + +vector split(const string& s, char delimiter) { + vector tokens; + string token; + istringstream tokenStream(s); + while (getline(tokenStream, token, delimiter)) { + tokens.push_back(token); + } + return tokens; +} snpdt::snpdt() { @@ -36,6 +48,31 @@ void snpdt::test() //add code here... } +int chromosomeStringToInt(const std::string& chromStr) +{ + if (chromStr == "X") + return 23; + else if (chromStr == "Y") + return 24; + else if (chromStr == "MT" || chromStr == "M") + return 25; + else if (chromStr == "XY") + return 26; // If needed + else + { + // Try to convert to integer + try + { + return std::stoi(chromStr); + } + catch (std::invalid_argument&) + { + return 0; // Unknown chromosome + } + } +} + + void snpdt::readBinData() { // We cannot assume the file will be in order @@ -257,6 +294,227 @@ void snpdt::readBinData() +// Version 1 of the readVcfData - Most closest to the Output +void snpdt::readVcfData() +{ + // Check if the VCF file exists + gfun::checkFileExists(par::vcffile); + + gfun::printLOG("Reading [ " + par::vcffile + " ] \n"); + + vector ordered; + + ifstream VCF(par::vcffile.c_str(), ios::in); + VCF.clear(); + + int c = 0; + string line; + while (std::getline(VCF, line)) + { + // Skip header lines + if (line[0] == '#') + continue; + + std::stringstream ss(line); + std::string field; + Locus* loc = new Locus; + + // Read chromosome as a string first, then convert to integer + std::getline(ss, field, '\t'); + loc->chr = std::stoi(field); // Convert to integer + + std::getline(ss, field, '\t'); // Position + loc->bp = std::stoi(field); // Convert to integer for base-pair position + + std::getline(ss, loc->name, '\t'); // SNP ID + std::getline(ss, loc->allele2, '\t'); // Reference allele + std::getline(ss, loc->allele1, '\t'); // Alternate allele + + + // Skip unnecessary VCF columns (QUAL, FILTER, INFO) + for (int i = 0; i < 3; i++) + std::getline(ss, field, '\t'); + + // Store the order information temporarily + loc->freq = c++; + + // Check if locus name (ID) is present + if (!loc->name.empty()) + { + _locus.push_back(loc); + ordered.push_back(*loc); + } + else + { + delete loc; + } + } + + gfun::printLOG(int2str(_locus.size()) + " markers in [ " + par::vcffile + " ]\n"); + + VCF.clear(); + VCF.close(); + + if (_locus.size() == 0) + gfun::shutdown(); + + stable_sort(_locus.begin(), _locus.end(), less()); + stable_sort(ordered.begin(), ordered.end()); + + c = 0; + for (int i = 0; i < _locus.size(); i++) + { + ordered[i].bp = static_cast(ordered[i].freq); + ordered[i].chr = 1; + ordered[i].freq = c++; + } + + stable_sort(ordered.begin(), ordered.end()); + + vector include(0); + int nl_actual = _locus.size(); + for (int j = 0; j < ordered.size(); j++) + include.push_back(static_cast(ordered[j].freq)); + + readVCFHeader(par::vcffile); + + // Allocate space for SNPs + for (int i = 0; i < nl_actual; i++) + { + CSNP* newlocus = new CSNP; + newlocus->one.resize(_sample.size()); + newlocus->two.resize(_sample.size()); + _SNP.push_back(newlocus); + } + + // Read genotype data from the VCF + ifstream VCFGenotypes(par::vcffile.c_str(), ios::in); + while (getline(VCFGenotypes, line)) + { + // Skip header lines + if (line[0] == '#') + continue; + + stringstream ss(line); + string field; + static int s = 0; // Keep track of SNP index + + // Skip the first 9 columns (VCF metadata fields) + for (int i = 0; i < 9; i++) + getline(ss, field, '\t'); + + // Loop through the samples/individuals + for (int indx = 0; indx < _sample.size(); indx++) + { + getline(ss, field, '\t'); + + if (s >= include.size() || include[s] >= _SNP.size()) { + std::cerr << "Error: s = " << s << ", include[s] = " << include[s] + << ", _SNP.size() = " << _SNP.size() << std::endl; + exit(1); + } + + if (include[s] > -1) + { + CSNP* snp = _SNP[include[s]]; + if (snp == nullptr) { + std::cerr << "Error: snp is nullptr at s = " << s << std::endl; + exit(1); + } + + // std::cout << "Processing sample " << indx << ", SNP " << s << std::endl; + + if (field[0] == '0' || field[0] == '.') + snp->one[indx] = 0; // Reference allele (0) + else + snp->one[indx] = 1; // Alternate allele (1) + + if (field[2] == '0' || field[2] == '.') + snp->two[indx] = 0; // Reference allele (0) + else + snp->two[indx] = 1; // Alternate allele (1) + } + } + + s++; // Increment SNP index only after processing all samples for this SNP + } + + + VCFGenotypes.clear(); + VCFGenotypes.close(); + + gfun::printLOG("Finished reading VCF data\n"); +} + + +void snpdt::readVCFHeader(string vcfFilename) +{ + gfun::printLOG("Reading [ " + vcfFilename + " ] \n"); + + gfun::checkFileExists(vcfFilename); + + ifstream VCF; + VCF.open(vcfFilename.c_str()); + VCF.clear(); + + string line; + int c = 0; + + // Read through the VCF file to find the header line + while (std::getline(VCF, line)) + { + // Skip lines that are not the header + if (line.substr(0, 6) != "#CHROM") + continue; + + std::stringstream ss(line); + string field; + + // Skip the first 9 columns (standard VCF columns) + for (int i = 0; i < 9; ++i) + std::getline(ss, field, '\t'); + + // Read individual sample IDs from the header + while (std::getline(ss, field, '\t')) + { + Individual* person = new Individual; + + // Assign the sample ID (individual ID) + person->iid = field; + + // Default values for family ID, parents, sex, and phenotype + person->fid = "0"; // Assign a default family ID + person->pat = "0"; // No paternal ID available in VCF + person->mat = "0"; // No maternal ID available in VCF + person->sex = par::missing_int; // Sex unknown by default. Set to -9 + person->pheno_str = "0"; // Default phenotype (can be adjusted later) + person->aff = -1; // Undefined affection status + + // Set the family number (for this example, default to 0) + person->nfid = 0; + + // Increase person counter + c++; + + // Add individual to list + _sample.push_back(person); + } + + // Break after processing the header + break; + } + + VCF.clear(); + VCF.close(); + + gfun::printLOG(int2str(c) + " individuals found in VCF file [ " + vcfFilename + " ] \n"); + + if (_sample.size() == 0) + gfun::shutdown(); +} + + + bool snpdt::openBinaryFile(string s, ifstream & BIT) { @@ -619,6 +877,9 @@ vector snpdt::readExtSnpF(string filename) } + + + void snpdt::readPheFile(string filename) { ////////////////////////////// diff --git a/code/snpdt.h b/code/snpdt.h index ca8b6ba..17eb189 100644 --- a/code/snpdt.h +++ b/code/snpdt.h @@ -510,10 +510,12 @@ class snpdt void test(); void readBinData(); + void readVcfData(); bool openBinaryFile(string, ifstream &); void readFamFile(string filename); + void readVCFHeader(string filename); void getFamInfo(vector & fam); void readPopuFile(string filename); void getPopuInfo(vector & popu); @@ -528,6 +530,8 @@ class snpdt void readSnpSet(); void readSnpSet2Cols(); + + vector readExtSnpF(string filename); void clear(); @@ -584,7 +588,18 @@ class snpdt inline Individual * getIndividual(int indi); inline int fID(int indi); + // Getter methods for accessing private members + const std::vector& getSampleIds() const { return sample_ids; } + const std::vector& getVariantIds() const { return variant_ids; } + const std::vector>& getGenotypes() const { return genotypes; } + private: + + // Private members + std::vector sample_ids; // Sample IDs from the VCF file + std::vector variant_ids; // Variant IDs (e.g., rsIDs) + std::vector> genotypes; // Genotypes for each sample and variant + // Genotype/phenotype per individual file vector _sample; vector _cov_name;