Skip to content

Commit

Permalink
small code improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
Nikita Tikhomirov committed Oct 11, 2024
1 parent 6e42c78 commit 0625d30
Showing 1 changed file with 112 additions and 113 deletions.
225 changes: 112 additions & 113 deletions scripts/piawka
Original file line number Diff line number Diff line change
Expand Up @@ -265,127 +265,126 @@ function process_sites(region, locus,

if ( args["het"] != 1 ) {
for ( g in seenlist ) {
if ( miss[g]/(miss[g]+nalleles[g]) <= args["miss"] && ( args["mult"] == 1 || nseen[g]<=2 ) ) {

# Increment number of sites used for calculation
nUsed[g]++

# Extract allele counts of the group
delete thesealleles
delete bothalleles
for (i=1;i<=length(seenlist[g]);i++) {
xx=substr(seenlist[g],i,1)
bothalleles[xx]++
thesealleles[xx] = alleles[g,xx]
if ( miss[g]/(miss[g]+nalleles[g]) > args["miss"] || ( args["mult"] != 1 && nseen[g]>2 ) ) { continue }

# Increment number of sites used for calculation
nUsed[g]++

# Extract allele counts of the group
delete thesealleles
delete bothalleles
for (i=1;i<=length(seenlist[g]);i++) {
xx=substr(seenlist[g],i,1)
bothalleles[xx]++
thesealleles[xx] = alleles[g,xx]
}

# Add to pi: probability that two randomly picked alleles differ
# New formula below from https://pubmed.ncbi.nlm.nih.gov/36031871/
thisnum[g]=nalleles[g]^2
thisden[g]=nalleles[g]*(nalleles[g]-1)
for ( x in thesealleles ) { thisnum[g]-=thesealleles[x]^2 }

if ( TAJLIKE && nseen[g]==2 ) {
for (i=1;i<nalleles[g];i++) { a1[g]+=1/i; a2[g]+=1/i^2 }
segr[g]+=recalcS_expected( 1, nalleles[g], nalleles[g]+miss[g] )
truesegr[g]++
}

if ( args["persite"] == 1 ) {
if ( thisden[g] && args["nopi"] != 1 ) { printOutput( $1"_"$2, 1, g, ".", 1, "pi", thisnum[g], thisden[g], nalleles[g], miss[g] ) }
} else {
numerator[g]+=thisnum[g]; denominator[g]+=thisden[g]
if ( VERBOSE ) {
allmiss[g]+=miss[g]
allgeno[g]+=nalleles[g]
}
}

# Add to pi: probability that two randomly picked alleles differ
# New formula below from https://pubmed.ncbi.nlm.nih.gov/36031871/
thisnum[g]=nalleles[g]^2
thisden[g]=nalleles[g]*(nalleles[g]-1)
for ( x in thesealleles ) { thisnum[g]-=thesealleles[x]^2 }

if ( TAJLIKE && nseen[g]==2 ) {
for (i=1;i<nalleles[g];i++) { a1[g]+=1/i; a2[g]+=1/i^2 }
segr[g]+=recalcS_expected( 1, nalleles[g], nalleles[g]+miss[g] )
truesegr[g]++

# Calculate pi between this group and all other groups with <50% missing data
if ( args["nodxy"] == 1 ) { continue }
for ( g2 in seenlist ) {
if ( g2 >= g || miss[g2]/(miss[g2]+nalleles[g2]) > args["miss"] || ( args["mult"] != 1 && nseen[g2]>2 ) ) { continue }

# Is the union of allelic states from g1 and g2 bigger than nseen[g1]?
poolsize=nseen[g]

# Extract alleles of the group
delete thosealleles
for (i=1;i<=length(seenlist[g2]);i++) {
yy=substr(seenlist[g2],i,1)
bothalleles[yy]++ # so far keeps alleles from comparisons of g with previous groups
thosealleles[yy] = alleles[g2,yy]
if ( !(yy in thesealleles) ) { poolsize++ }
}

if ( args["persite"] == 1 ) {
if ( thisden[g] && args["nopi"] != 1 ) { printOutput( $1"_"$2, 1, g, ".", 1, "pi", thisnum[g], thisden[g], nalleles[g], miss[g] ) }
} else {
numerator[g]+=thisnum[g]; denominator[g]+=thisden[g]
if ( VERBOSE ) {
allmiss[g]+=miss[g]
allgeno[g]+=nalleles[g]
}
}

# Calculate pi between this group and all other groups with <50% missing data
if ( args["nodxy"] == 1 ) { continue }
for ( g2 in seenlist ) {
if ( g2 >= g || miss[g2]/(miss[g2]+nalleles[g2]) > args["miss"] || ( args["mult"] == "" && nseen[g2]>2 ) ) { continue }
# If not args["mult"] == 1, proceed only if common allele pool has <=2 alleles
if ( args["mult"] != 1 && poolsize > 2 ) { continue }

# Increment number of sites used for dxy
nUsed[g,g2]++

# Is the union of allelic states from g1 and g2 bigger than nseen[g1]?
poolsize=nseen[g]

# Extract alleles of the group
delete thosealleles
for (i=1;i<=length(seenlist[g2]);i++) {
yy=substr(seenlist[g2],i,1)
bothalleles[yy]++ # so far keeps alleles from comparisons of g with previous groups
thosealleles[yy] = alleles[g2,yy]
if ( !(yy in thesealleles) ) { poolsize++ }
# Add to dxy: probability that two alleles picked from two groups differ
# subtraction rather than addition inspired by https://pubmed.ncbi.nlm.nih.gov/36031871/
thisnum[g,g2]=nalleles[g]*nalleles[g2]
thisden[g,g2]=thisnum[g,g2]
if ( FST ) { thisfstnum=""; thisfstden="" }
for ( x in bothalleles ) {
# gawk breaks here if you don't set uninitialized array elems to 0
if ( !(x in thesealleles) ) { thesealleles[x]=0 }
else if ( !(x in thosealleles) ) { thosealleles[x]=0 }
thisnum[g,g2]-=thesealleles[x]*thosealleles[x]
if ( FST ) {
if ( FST=="HUD" ) {
incrementFstHudson( thesealleles[x], thosealleles[x], nalleles[g], nalleles[g2] )
} else if ( FST=="WC" ) {
incrementFstWeirCockerham( thesealleles[x], thosealleles[x], nalleles[g], nalleles[g2] )
}
if ( args["mult"] != 1 ) { break } # no need to increment twice for biallelic comparisons
}

# If not args["mult"] == 1, proceed only if common allele pool has <=2 alleles
if ( args["mult"] == "" && poolsize <= 2 ) { continue }

# Increment number of sites used for dxy
nUsed[g,g2]++

# Add to dxy: probability that two alleles picked from two groups differ
# subtraction rather than addition inspired by https://pubmed.ncbi.nlm.nih.gov/36031871/
thisnum[g,g2]=nalleles[g]*nalleles[g2]
thisden[g,g2]=thisnum[g,g2]
if ( FST ) { thisfstnum=""; thisfstden="" }
for ( x in bothalleles ) {
# gawk breaks here if you don't set uninitialized array elems to 0
if ( !(x in thesealleles) ) { thesealleles[x]=0 }
else if ( !(x in thosealleles) ) { thosealleles[x]=0 }
thisnum[g,g2]-=thesealleles[x]*thosealleles[x]
if ( FST ) {
if ( FST=="HUD" ) {
incrementFstHudson( thesealleles[x], thosealleles[x], nalleles[g], nalleles[g2] )
} else if ( FST=="WC" ) {
incrementFstWeirCockerham( thesealleles[x], thosealleles[x], nalleles[g], nalleles[g2] )
}
if ( args["mult"] != 1 ) { break } # no need to increment twice for biallelic comparisons
}
}
if ( args["rho"] == 1 ) {
if (!thisden[g2]) {
thisnum[g2]=nalleles[g2]^2
thisden[g2]=nalleles[g2]*(nalleles[g2]-1)
for ( x in thosealleles ) { thisnum[g2]-=thesealleles[x2]^2 }
}
if ( args["rho"] == 1 ) {
if (!thisden[g2]) {
thisnum[g2]=nalleles[g2]^2
thisden[g2]=nalleles[g2]*(nalleles[g2]-1)
for ( x in thosealleles ) { thisnum[g2]-=thesealleles[x2]^2 }
}
# Here Hs = average of pi values of two populations,
# Ht = pi of two populations pooled,
# Hsp = Hs corrected for ploidy
thisHtnum=(nalleles[g] + nalleles[g2])^2
thisHtden=(nalleles[g] + nalleles[g2]) * (nalleles[g] + nalleles[g2] - 1)
for ( x in bothalleles ) { thisHtnum-=(thesealleles[x]+thosealleles[x])^2 }
thisHsnum = ( (thisnum[g]*thisden[g2] ) + (thisnum[g2]*thisden[g]) )
thisHsden = 2 * thisden[g] * thisden[g2] # same as Hspden
thisHspnum = ( (thisnum[g]*thisden[g2]*(ploidy[g]-1)/ploidy[g] ) + (thisnum[g2]*thisden[g]*(ploidy[g2]-1)/ploidy[g2]) )
# Here Hs = average of pi values of two populations,
# Ht = pi of two populations pooled,
# Hsp = Hs corrected for ploidy
thisHtnum=(nalleles[g] + nalleles[g2])^2
thisHtden=(nalleles[g] + nalleles[g2]) * (nalleles[g] + nalleles[g2] - 1)
for ( x in bothalleles ) { thisHtnum-=(thesealleles[x]+thosealleles[x])^2 }
thisHsnum = ( (thisnum[g]*thisden[g2] ) + (thisnum[g2]*thisden[g]) )
thisHsden = 2 * thisden[g] * thisden[g2] # same as Hspden
thisHspnum = ( (thisnum[g]*thisden[g2]*(ploidy[g]-1)/ploidy[g] ) + (thisnum[g2]*thisden[g]*(ploidy[g2]-1)/ploidy[g2]) )
}
if ( args["persite"] == 1 ) {
if ( thisden[g,g2] ) { printOutput( $1"_"$2, 1, g, g2, 1, "dxy", thisnum[g,g2], thisden[g,g2], nalleles[g]+nalleles[g2], miss[g]+miss[g2] ) }
if ( FST && thisfstden ) { printOutput( $1"_"$2, 1, g, g2, 1, "Fst_"FST, thisfstnum, thisfstden, nalleles[g]+nalleles[g2], miss[g]+miss[g2] ) }
if ( args["rho"] == 1 ) {
Hs = thisHsnum/thisHsden
Ht = thisHtnum/thisHtden
Hsp = thisHspnum/thisHsden
Hpt = Hs + 2 * (Ht - Hs)
thisrhonum=Hpt-Hs
thisrhoden=Hpt-Hsp
printOutput( $1"_"$2, 1, g, g2, 1, "Rho", thisrhonum, thisrhoden, nalleles[g]+nalleles[g2], miss[g]+miss[g2] )
}
if ( args["persite"] == 1 ) {
if ( thisden[g,g2] ) { printOutput( $1"_"$2, 1, g, g2, 1, "dxy", thisnum[g,g2], thisden[g,g2], nalleles[g]+nalleles[g2], miss[g]+miss[g2] ) }
if ( FST && thisfstden ) { printOutput( $1"_"$2, 1, g, g2, 1, "Fst_"FST, thisfstnum, thisfstden, nalleles[g]+nalleles[g2], miss[g]+miss[g2] ) }
if ( args["rho"] == 1 ) {
Hs = thisHsnum/thisHsden
Ht = thisHtnum/thisHtden
Hsp = thisHspnum/thisHsden
Hpt = Hs + 2 * (Ht - Hs)
thisrhonum=Hpt-Hs
thisrhoden=Hpt-Hsp
printOutput( $1"_"$2, 1, g, g2, 1, "Rho", thisrhonum, thisrhoden, nalleles[g]+nalleles[g2], miss[g]+miss[g2] )
} else {
numerator[g,g2]+=thisnum[g,g2]
denominator[g,g2]+=thisden[g,g2]
if ( FST ) {
fst_numerator[g,g2]+=thisfstnum
fst_denominator[g,g2]+=thisfstden
}
if ( args["rho"] == 1 ) {
Htnum[g,g2] += thisHtnum
Htden[g,g2] += thisHtden
Hsnum[g,g2] += thisHsnum
Hsden[g,g2] += thisHsden
Hspnum[g,g2] += thisHspnum
}
} else {
numerator[g,g2]+=thisnum[g,g2]
denominator[g,g2]+=thisden[g,g2]
if ( FST ) {
fst_numerator[g,g2]+=thisfstnum
fst_denominator[g,g2]+=thisfstden
}
if ( args["rho"] == 1 ) {
Htnum[g,g2] += thisHtnum
Htden[g,g2] += thisHtden
Hsnum[g,g2] += thisHsnum
Hsden[g,g2] += thisHsden
Hspnum[g,g2] += thisHspnum
}
}
}
}
}
Expand Down

0 comments on commit 0625d30

Please sign in to comment.