Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions code/pamu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ int main(int argc, char* argv[])
////input data
if (par::read_bitfile) P.readBinData();
else if(par::read_ped) P.readPedData();
else if(par::read_vcf) P.readVcfData();

///get covariate data
if(par::cov_read) P.readCovData();
Expand Down Expand Up @@ -167,6 +168,11 @@ void Pamu::readPedData()
_data->readGenMapPed();
}

void Pamu::readVcfData()
{
_data->readVcfData();
}

void Pamu::readCovData()
{
_data->readCovFile(par::cov_file);
Expand Down
1 change: 1 addition & 0 deletions code/pamu.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ class Pamu
void printLOG(string);
void readBinData();
void readPedData();
void readVcfData();
void readCovData();
void readPheData();
void readPheMat();
Expand Down
12 changes: 12 additions & 0 deletions code/parset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,13 @@ void gfun::setPar(CArg &a)
par::genfile = par::fileroot + ".gen";
}

if (a.find("--vcf"))
{
par::read_vcf = true;
par::fileroot = a.value("--vcf");
par::vcffile = par::fileroot + ".vcf";
}

if(a.find("--hwu"))
{
par::hwu_run=true;
Expand Down Expand Up @@ -642,6 +649,8 @@ bool par::silent=false;
double par::replace0_cor=0.5;
double par::replace0_sml=0.00001;
bool par::read_bitfile=false;


string par::fileroot="";
string par::bitfilename="";
string par::bitfilename_map="";
Expand All @@ -665,6 +674,9 @@ string par::mapfile = "";
string par::genfile = "";
bool par:: recode=false;

bool par::read_vcf = false;
string par::vcffile = "";

int par::most_nsnp=10;
double par::largest_auc=0.9;
bool par::out_nom_LR=false;
Expand Down
5 changes: 5 additions & 0 deletions code/parset.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,16 @@ class par
static double replace0_sml;// =0.00001

static bool read_bitfile;

static string fileroot;
static string bitfilename;
static string bitfilename_map;
static string famfile;

static bool read_vcf;
static string vcffile;


static bool qt;//quantitative trait
static bool bt;//binary trait

Expand Down
261 changes: 261 additions & 0 deletions code/snpdt.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,16 @@
#include "snpdt.h"
#include <htslib/vcf.h>


vector<string> split(const string& s, char delimiter) {
vector<string> tokens;
string token;
istringstream tokenStream(s);
while (getline(tokenStream, token, delimiter)) {
tokens.push_back(token);
}
return tokens;
}

snpdt::snpdt()
{
Expand Down Expand Up @@ -36,6 +48,31 @@ void snpdt::test()
//add code here...
}

int chromosomeStringToInt(const std::string& chromStr)
{
if (chromStr == "X")
return 23;
else if (chromStr == "Y")
return 24;
else if (chromStr == "MT" || chromStr == "M")
return 25;
else if (chromStr == "XY")
return 26; // If needed
else
{
// Try to convert to integer
try
{
return std::stoi(chromStr);
}
catch (std::invalid_argument&)
{
return 0; // Unknown chromosome
}
}
}


void snpdt::readBinData()
{
// We cannot assume the file will be in order
Expand Down Expand Up @@ -257,6 +294,227 @@ void snpdt::readBinData()



// Version 1 of the readVcfData - Most closest to the Output
void snpdt::readVcfData()
{
// Check if the VCF file exists
gfun::checkFileExists(par::vcffile);

gfun::printLOG("Reading [ " + par::vcffile + " ] \n");

vector<Locus> ordered;

ifstream VCF(par::vcffile.c_str(), ios::in);
VCF.clear();

int c = 0;
string line;
while (std::getline(VCF, line))
{
// Skip header lines
if (line[0] == '#')
continue;

std::stringstream ss(line);
std::string field;
Locus* loc = new Locus;

// Read chromosome as a string first, then convert to integer
std::getline(ss, field, '\t');
loc->chr = std::stoi(field); // Convert to integer

std::getline(ss, field, '\t'); // Position
loc->bp = std::stoi(field); // Convert to integer for base-pair position

std::getline(ss, loc->name, '\t'); // SNP ID
std::getline(ss, loc->allele2, '\t'); // Reference allele
std::getline(ss, loc->allele1, '\t'); // Alternate allele


// Skip unnecessary VCF columns (QUAL, FILTER, INFO)
for (int i = 0; i < 3; i++)
std::getline(ss, field, '\t');

// Store the order information temporarily
loc->freq = c++;

// Check if locus name (ID) is present
if (!loc->name.empty())
{
_locus.push_back(loc);
ordered.push_back(*loc);
}
else
{
delete loc;
}
}

gfun::printLOG(int2str(_locus.size()) + " markers in [ " + par::vcffile + " ]\n");

VCF.clear();
VCF.close();

if (_locus.size() == 0)
gfun::shutdown();

stable_sort(_locus.begin(), _locus.end(), less<Locus*>());
stable_sort(ordered.begin(), ordered.end());

c = 0;
for (int i = 0; i < _locus.size(); i++)
{
ordered[i].bp = static_cast<int>(ordered[i].freq);
ordered[i].chr = 1;
ordered[i].freq = c++;
}

stable_sort(ordered.begin(), ordered.end());

vector<int> include(0);
int nl_actual = _locus.size();
for (int j = 0; j < ordered.size(); j++)
include.push_back(static_cast<int>(ordered[j].freq));

readVCFHeader(par::vcffile);

// Allocate space for SNPs
for (int i = 0; i < nl_actual; i++)
{
CSNP* newlocus = new CSNP;
newlocus->one.resize(_sample.size());
newlocus->two.resize(_sample.size());
_SNP.push_back(newlocus);
}

// Read genotype data from the VCF
ifstream VCFGenotypes(par::vcffile.c_str(), ios::in);
while (getline(VCFGenotypes, line))
{
// Skip header lines
if (line[0] == '#')
continue;

stringstream ss(line);
string field;
static int s = 0; // Keep track of SNP index

// Skip the first 9 columns (VCF metadata fields)
for (int i = 0; i < 9; i++)
getline(ss, field, '\t');

// Loop through the samples/individuals
for (int indx = 0; indx < _sample.size(); indx++)
{
getline(ss, field, '\t');

if (s >= include.size() || include[s] >= _SNP.size()) {
std::cerr << "Error: s = " << s << ", include[s] = " << include[s]
<< ", _SNP.size() = " << _SNP.size() << std::endl;
exit(1);
}

if (include[s] > -1)
{
CSNP* snp = _SNP[include[s]];
if (snp == nullptr) {
std::cerr << "Error: snp is nullptr at s = " << s << std::endl;
exit(1);
}

// std::cout << "Processing sample " << indx << ", SNP " << s << std::endl;

if (field[0] == '0' || field[0] == '.')
snp->one[indx] = 0; // Reference allele (0)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we need to make this part of the logic consistent

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In particular, this is the logic we should follow:

void snpdt::writeIntToGenotype(int indi, int snp, int code)

Copy link
Author

@radioactive17 radioactive17 Feb 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I followed the logic you suggested. It's still giving me a different output in comparison to the bfile.
I also want to confirm if you run the program with a --bfile and --file flag for the same data would it give the same output?

Below is the code I wrote:
int code = -9; if (field[0] == '.' || field[2] == '.') { code = -9; } else if (field[0] == '0' && field[2] == '0') { code = 0; // Homozygous reference (0/0) } else if ((field[0] == '0' && field[2] == '1') || (field[0] == '1' && field[2] == '0')) { code = 1; // Heterozygous (0/1) } else if (field[0] == '1' && field[2] == '1') { code = 2; // Homozygous alternate (1/1) } else { std::cerr << "Error: unrecognized genotype format: " << field << std::endl; exit(1); }

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add your code as a commit to this PR?

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@radioactive17 the internal representatino neends to be changed as well. meaning this part of the code needs to be changed as well.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I haven't changed that part of the code. I'll do that and add the updated code. Thanks.

else
snp->one[indx] = 1; // Alternate allele (1)

if (field[2] == '0' || field[2] == '.')
snp->two[indx] = 0; // Reference allele (0)
else
snp->two[indx] = 1; // Alternate allele (1)
}
}

s++; // Increment SNP index only after processing all samples for this SNP
}


VCFGenotypes.clear();
VCFGenotypes.close();

gfun::printLOG("Finished reading VCF data\n");
}


void snpdt::readVCFHeader(string vcfFilename)
{
gfun::printLOG("Reading [ " + vcfFilename + " ] \n");

gfun::checkFileExists(vcfFilename);

ifstream VCF;
VCF.open(vcfFilename.c_str());
VCF.clear();

string line;
int c = 0;

// Read through the VCF file to find the header line
while (std::getline(VCF, line))
{
// Skip lines that are not the header
if (line.substr(0, 6) != "#CHROM")
continue;

std::stringstream ss(line);
string field;

// Skip the first 9 columns (standard VCF columns)
for (int i = 0; i < 9; ++i)
std::getline(ss, field, '\t');

// Read individual sample IDs from the header
while (std::getline(ss, field, '\t'))
{
Individual* person = new Individual;

// Assign the sample ID (individual ID)
person->iid = field;

// Default values for family ID, parents, sex, and phenotype
person->fid = "0"; // Assign a default family ID
person->pat = "0"; // No paternal ID available in VCF
person->mat = "0"; // No maternal ID available in VCF
person->sex = par::missing_int; // Sex unknown by default. Set to -9
person->pheno_str = "0"; // Default phenotype (can be adjusted later)
person->aff = -1; // Undefined affection status

// Set the family number (for this example, default to 0)
person->nfid = 0;

// Increase person counter
c++;

// Add individual to list
_sample.push_back(person);
}

// Break after processing the header
break;
}

VCF.clear();
VCF.close();

gfun::printLOG(int2str(c) + " individuals found in VCF file [ " + vcfFilename + " ] \n");

if (_sample.size() == 0)
gfun::shutdown();
}




bool snpdt::openBinaryFile(string s, ifstream & BIT)
{
Expand Down Expand Up @@ -619,6 +877,9 @@ vector<int> snpdt::readExtSnpF(string filename)

}




void snpdt::readPheFile(string filename)
{
//////////////////////////////
Expand Down
Loading