-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathAllelic frequencies calculation
74 lines (60 loc) · 2.04 KB
/
Allelic frequencies calculation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
set.seed(123)
# Step 1: Simulate fake SNP data matrix
num_individuals <- 100 # Number of individuals
num_snps <- 50 # Number of SNP markers
# Generate SNP data
snp_data <- data.frame(
individuals = paste0("genotype", 1:num_individuals),
matrix(
sample(c("AA", "AG", "GG"),
size = num_individuals * num_snps,
replace = TRUE,
prob = c(0.4, 0.4, 0.2)),
nrow = num_individuals,
byrow = TRUE
)
)
# Rename SNP columns
colnames(snp_data)[-1] <- paste0("SNP", 1:num_snps)
# Step 2: Calculate genotypic and allelic frequencies
allelic_freq_df <- data.frame(SNP = character(), Allele = character(), Freq = numeric()) # Initialize empty dataframe
for (snp in colnames(snp_data)[-1]) { # Loop through each SNP
print(snp)
# Count genotypes
genotype_count <- table(snp_data[[snp]])
total_alleles <- 2 * nrow(snp_data) # Total alleles = 2 * number of individuals
# Count alleles
allele_count <- c(
A = 2 * genotype_count["AA"] + genotype_count["AG"], # Count 'A' alleles
G = 2 * genotype_count["GG"] + genotype_count["AG"] # Count 'G' alleles
)
# Calculate allele frequencies
allele_freq <- allele_count / total_alleles
# Store results in the dataframe
allelic_freq_df <- rbind(
allelic_freq_df,
data.frame(
SNP = snp,
Allele = names(allele_freq),
Allele_freq = as.numeric(allele_freq)
)
)
# Print frequencies for the current SNP
cat("\n", snp, ":\n")
print(as.data.frame(allele_freq))
}
# Save allelic frequencies to CSV
write.csv(allelic_freq_df, "allelic_frequencies.csv", row.names = FALSE)
# Step 3: Add phenotypic data
snp_data$phenotype <- sample(
c("Resistance", "Susceptible"),
size = num_individuals,
replace = TRUE,
prob = c(0.6, 0.4)
)
# Step 4: Calculate phenotype frequencies
pheno_count <- table(snp_data$phenotype) # Count each phenotype
pheno_freq <- pheno_count / num_individuals # Calculate phenotype frequencies
# Print phenotype frequencies
cat("\nPhenotype Frequencies:\n")
print(as.data.frame(pheno_freq))