Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix for vcf import for large number of different indels #33

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
153 changes: 83 additions & 70 deletions python/PascalX/refpanel.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,9 @@ def _import_reference_thread_tped(self,i):
return True

def _import_reference_thread_vcf(self,i,keepfile,qualityT,SNPonly,regEx=None):
# Pre-compile genotype splitter
Gsplitter = re.compile('/|\|')

# Load filter info
keep = set([])
if keepfile is not None:
Expand Down Expand Up @@ -230,83 +233,93 @@ def _import_reference_thread_vcf(self,i,keepfile,qualityT,SNPonly,regEx=None):

# Main data import loop
for line in f:

# Data line
data = line.split("\t")

# Get GT pos
tmp = data[8].split(":")
GT = -1
for j in range(0,len(tmp)):
if tmp[j] == 'GT':
GT = j
break

# Checks
if (GT == -1) or (data[2][:2] != 'rs') or (data[6] != 'PASS' and qualityT is not None and (int(data[5]) < qualityT)):
continue

# Read genotype
genotypes = data[9:]

# Infer alternate alleles (pos 0: ref allele)
alleles = [data[3]]
alleles.extend(data[4].split(","))

if SNPonly and (len(data[3]) > 1):
continue

counter = np.zeros(len(alleles),dtype='int')

# Only read samples in sampleMap
genomap = {}
for j in range(0,len(sampleKeys)):

geno = genotypes[sampleKeys[j]].split(":")[GT]

# Ignore half-calls
if geno[0] != "." and geno[2] != ".":
counter[int(geno[0])] += 1
counter[int(geno[2])] += 1

genomap[sampleKeys[j]] = geno
try:
# Data line
data = line.split("\t")

# Get GT pos
tmp = data[8].split(":")
GT = -1
for j in range(0,len(tmp)):
if tmp[j] == 'GT':
GT = j
break

# Checks
if (GT == -1) or (data[2][:2] != 'rs') or (data[6] != 'PASS' and qualityT is not None and (int(data[5]) < qualityT)):
continue

# Infer alternate alleles (pos 0: ref allele)
alleles = [data[3]]
alleles.extend(data[4].split(","))

# Reference allele
refp = 0

SC = np.argsort(counter) # Sort alleles count
for p in SC:
if SNPonly and (len(data[3]) > 1):
continue

counter = np.zeros(len(alleles),dtype='int')

if p != refp:
if SNPonly and len(alleles[p]) > 1:
continue

minp = str(p)

gd = np.zeros(len(sampleKeys),dtype='B')
# Read genotype
genotypes = data[9:]

for j in range(0,len(sampleKeys)):
#geno = genotypes[sampleKeys[j]].split(":")[GT]
geno = genomap[sampleKeys[j]]

# Ignore half-calls
if geno[0] != '.' and geno[2] != '.':

if geno[0] == minp:
gd[j] += 1

if geno[2] == minp:
gd[j] += 1
# Multi-allelic flag
mallelic = False

# Only read samples in sampleMap
genomap = {}
for j in range(0,len(sampleKeys)):

geno = Gsplitter.split( genotypes[sampleKeys[j]].split(":")[GT] )

# Compute MAF
MAF = np.mean(gd)/2.
if (MAF > 0.5):
MAF = 1.0 - MAF;
if len(geno) > 2:
mallelic = True
break

# Ignore half-calls
if geno[0] != "." and geno[1] != ".":
counter[int(geno[0])] += 1
counter[int(geno[1])] += 1

T = [data[2],MAF,gd,alleles[p],alleles[refp]] # Stores alt and ref allele
genomap[sampleKeys[j]] = geno

db.insert({int(data[1]):T})
if not mallelic: # Skip if more than two alleles
# Reference allele
refp = 0

SC = np.argsort(counter) # Sort alleles count
for p in SC:

if p != refp:
if SNPonly and len(alleles[p]) > 1:
continue

minp = str(p)

gd = np.zeros(len(sampleKeys),dtype='B')

for j in range(0,len(sampleKeys)):
geno = genomap[sampleKeys[j]]

# Ignore half-calls
if geno[0] != '.' and geno[1] != '.':

if geno[0] == minp:
gd[j] += 1

if geno[1] == minp:
gd[j] += 1

# Compute MAF
MAF = np.mean(gd)/2.
if (MAF > 0.5):
MAF = 1.0 - MAF;

T = [data[2],MAF,gd,alleles[p],alleles[refp]] # Stores alt and ref allele

db.insert({int(data[1]):T})

except Exception as e:
print("[WARNING] Data line broken:",line)

f.close()
db.close()

Expand Down