GitHub - cdquinto/Flip-genotypes-MEGA

Branches Tags

Name		Name	Last commit message	Last commit date
Latest commit History 5 Commits
README.txt		README.txt
README_Memo.txt		README_Memo.txt
Reporte_flip_MEGA.aux		Reporte_flip_MEGA.aux
Reporte_flip_MEGA.log		Reporte_flip_MEGA.log
Reporte_flip_MEGA.out		Reporte_flip_MEGA.out
Reporte_flip_MEGA.pdf		Reporte_flip_MEGA.pdf
Reporte_flip_MEGA.synctex.gz		Reporte_flip_MEGA.synctex.gz
Reporte_flip_MEGA.tex		Reporte_flip_MEGA.tex
Reporte_flip_MEGA.tex.bak		Reporte_flip_MEGA.tex.bak
cal_concordance.pl		cal_concordance.pl
cal_concordance2.pl		cal_concordance2.pl
change_alleles_dbSNP.pl		change_alleles_dbSNP.pl
comparison_pos_1KG_MEGA.pl		comparison_pos_1KG_MEGA.pl
complementary_snps.txt		complementary_snps.txt
convert_MEGA_alleles.pl		convert_MEGA_alleles.pl
convert_MEGA_alleles_control.pl		convert_MEGA_alleles_control.pl
convert_MEGA_alleles_sentli.pl		convert_MEGA_alleles_sentli.pl
convert_MEGA_alleles_sentli_test.pl		convert_MEGA_alleles_sentli_test.pl
get_snps_to_change_alleles.pl		get_snps_to_change_alleles.pl
instrucciones.aux		instrucciones.aux
instrucciones.log		instrucciones.log
instrucciones.out		instrucciones.out
instrucciones.pdf		instrucciones.pdf
instrucciones.synctex.gz		instrucciones.synctex.gz
instrucciones.tex		instrucciones.tex
instrucciones.tex.bak		instrucciones.tex.bak
list_snps_to_change_alleles.txt		list_snps_to_change_alleles.txt
multiallelic_SNPs.txt		multiallelic_SNPs.txt
new_positions_snps.txt		new_positions_snps.txt
remove_dupl_snps_to_change_alleles.pl		remove_dupl_snps_to_change_alleles.pl
test.pl		test.pl

Repository files navigation

#####README#####
#####Convert genotypes according to the information about the strands
####  /data/users/coquinto/convert_MEGA_alleles

##This information comes from the file: maavp1v1.raw_FinalReport.txt, generated by the 
##Multi-EthnicGlobal_A1.bpm manifest. 

##Remove the first ten lines of the file: maavp1v1.raw_FinalReport.txt
##That has some information about the strands where the genotypes are and therefore we can obtain which SNPs we should flip. 

perl get_snps_to_change_alleles.pl

###This generates a list of SNP with the "old" and "new" alelles to update the ped/map generated by GenomeStudio.
##The file list_snps_to_change_alleles.txt still has the SNP Ids that come from GenomeStudio.

###Run the script convert_MEGA_alleles.pl
###This script orders the raw data (before renaming the SNPs), finds duplicates by ID and by position and removes them, changes the alleles codes.
###Also removes SNPs in chr 0 (unmapped SNPs)
###And renames the SNPs by rs# (if possible)
###And It also updates the position of 2 SNPs (based on 1KG positions) (new_positions_snps.txt)
##rs9522257	112171024
##rs9480186	150624675

#perl convert_MEGA_alleles.pl [file].ped
perl convert_MEGA_alleles.pl entrenamiento_clusterPAGE.raw.ped
#Output file ${file}.flip.tped/tfam and ${file}.flip.ped/map

##This script requires 3 files:
##list_snps_to_change_alleles.txt
##MEGA_Consortium_v2_15070954_A1_b138_rsids.txt
##new_positions_snps.txt

##And it writes to a file the list of duplicated SNPs by name and by position
##duplicate_snps_${file}.txt
##duplicate_positions_${file}.txt
##to_remove.txt

###In addition, there is a file that has the list of SNPs in the MEGA array that are not biallelic:
##multiallelic_SNPs.txt

####Important: this script has to be run after the data is obtained from GenomeStudio, BEFORE doing the renaming

#############################################################
#########EXAMPLE RUN######

##I used the file entrenamiento_clusterPAGE.raw.ped/map

###This file still contains SNPs in chromosome 0 

###Filter data sets
plink2 --file entrenamiento_clusterPAGE.raw.flip --geno 0 --not-chr X,Y,XY,MT --exclude multiallelic_SNPs.txt --recode --out entrenamiento_clusterPAGE.raw_nomissing
plink2 --file ../test_langebio/1KG_Affy6.0_training_samples --extract id_overlap_snps_sorted.txt --exclude multiallelic_SNPs.txt --make-bed --recode --out 1KG_Affy6.0_training_samples_nomulti

###Find SNPs in common
awk 'NR==FNR{a[$2]=$4;next}{if ($2 in a) print $1,a[$2],$4,$2}' OFS="\t" 1KG_Affy6.0_training_samples_nomulti.map entrenamiento_clusterPAGE.raw_nomissing.map > overlap_snps.txt
sort overlap_snps.txt | uniq -u > overlap_snps_sorted.txt
cut -f4 overlap_snps_sorted.txt > id_overlap_snps_sorted.txt 
##129,588 SNPs in common

plink2 --file 1KG_Affy6.0_training_samples_nomulti --extract id_overlap_snps_sorted.txt --make-bed --out 1KG_Affy6.0_training_samples_overlap
plink2 --file entrenamiento_clusterPAGE.raw_nomissing --extract id_overlap_snps_sorted.txt --make-bed --out entrenamiento_clusterPAGE.raw_nomissing_overlap

###Merge the datasets (in binary format)
plink2 --bfile 1KG_Affy6.0_training_samples_overlap --bmerge entrenamiento_clusterPAGE.raw_nomissing_overlap.bed entrenamiento_clusterPAGE.raw_nomissing_overlap.bim entrenamiento_clusterPAGE.raw_nomissing_overlap.fam --recode --out merged_data
###There are not any sites that need to be flipped

###Calculate the difference in the genotypes

plink2 --bfile merged_data --keep hg01938.txt --recode transpose --out hg01938 

plink2 --file hg01938 --distance square 

###Results: 96.98%

##When C/G and A/T are taken into account the corcordance is: 99.96%

###Without those sites:
plink2 --file hg01938 --exclude test.txt --recode --out hg01938_test

plink2 --file hg01938_test --distance square 

##99.96%
######################
####Sidenotes:
###To find the SNPs that need to flipped and get the information from dbSNP

###Find more SNPs whose alleles have to be flipped 
perl cal_concordance.pl > snps_diff_alleles.txt
cut -f2 -d" " snps_diff_alleles.txt > list_snps_to_find_in_dbSNP.txt

###I used this file to get the information from dbSNP (directly from the web database) mart_export.txt
###perl change_alleles_dbSNP.pl > snps_to_add_from_dbSNP.txt

cat list_snps_to_change_alleles.txt_COPY snps_to_add_from_dbSNP.txt > list_snps_to_change_alleles.txt2
perl remove_dupl_snps_to_change_alleles.pl