simulated_v_empirical.Rmd

---
title: "Fit performance on simulated vs empirical alignments"
author: "Mackenzie Johnson"
date: "`r format(Sys.time(), '%B %d, %Y')`"
output: html_document
---

```{r setup, include=FALSE}
# default chunk option for the whole document: echo = TRUE
knitr::opts_chunk$set(echo = TRUE)
```

## Analysis of fitting procedures on simulated v empirical alignments
This RMarkdown document repeats the fitting procedures for the 10 paired simulated and empirical protein alignments and produces Figure 4 and Supplementary Figures 2 and 3. It repeats code from simulated_protein_all.Rmd and empirical_protein_all.Rmd and compares the chi-squared results. It additionally examines the effect of a larger sample size in the simulated alignments.


```{r, message=FALSE}
# load packages
library(tidyverse)
library(Biostrings)
library(stringr)
library(cowplot)
library(broom)
library(ggtext)
# make theme_cowplot() the default ggplot2 theme for the figures below
theme_set(theme_cowplot())
```


## Simulated data
Read in and reformat data
```{r, message=FALSE}
# read in all .csv files in /data/simulated/
# `pattern` is a regular expression (not a shell glob), so match a literal
# ".csv" at the end of the file name
simulated_files <- dir(
  "data/simulated",
  pattern = "\\.csv$",
  full.names = TRUE
)

# tidy individual alignments
# Reshape one wide alignment (one row per sequence, one column per site) into
# long format with one row per sequence/site pair.
#   alignment: data frame whose column names are site numbers (they are passed
#              through as.numeric() below)
#   returns:   data frame with columns `sequence` (row index of the input),
#              `site` (numeric) and `aa`
# NOTE: rows come out sequence by sequence (pivot_longer) rather than site by
# site (gather); none of the downstream counting depends on row order.
tidy_alignments <- function(alignment) {

  alignment %>%
    # seq_len() also handles a zero-row alignment, where 1:nrow() is c(1, 0)
    mutate(sequence = as.numeric(seq_len(nrow(alignment)))) %>%
    pivot_longer(cols = -sequence, names_to = "site", values_to = "aa") %>%
    mutate(site = as.numeric(site))

}

# Read every simulated alignment with all columns as character, tidy it, and
# stack the results into one long table. The protein label is the part of the
# file name between "results_" and "_evolved_split".
alignments_sim <- tibble(filename = simulated_files) %>%
  extract(filename, "protein", "results_(.*)_evolved_split", remove = FALSE) %>%
  mutate(
    raw_data = map(
      filename,
      ~ read_csv(.x, col_types = cols(.default = "c"))
    ),
    alignment = map(raw_data, tidy_alignments)
  ) %>%
  select(-raw_data, -filename) %>%
  unnest(cols = alignment)

```


Sample simulated sequences to match the number of empirical sequences for each protein
```{r}
# Number of sequences to keep for each protein (the size of the matching
# empirical alignment):
# 1B4T - 160, 1CI0 - 87, 1EFV - 84, 1G58 - 211, 1GV3 - 181,
# 2A84 - 125, 2AIU - 73, 2BCG - 168, 2BR9 - 96, 2CFE - 312
empirical_n_seq <- tibble(
  protein = c(
    "1B4T_A", "1CI0_A", "1EFV_B", "1G58_B", "1GV3_A",
    "2A84_A", "2AIU_A", "2BCG_Y", "2BR9_A", "2CFE_A"
  ),
  n_seq = c(160, 87, 84, 211, 181, 125, 73, 168, 96, 312)
)

# Draw `n_seq` whole sequences from the simulated alignment of one protein.
# Nesting site/aa first leaves one row per sequence, so sample_n() keeps or
# drops complete sequences rather than individual sites.
sample_sequences <- function(protein_id, n_seq) {

  alignments_sim %>%
    filter(protein == protein_id) %>%
    nest(data = c(site, aa)) %>%
    sample_n(n_seq) %>%
    unnest(data)

}

# NOTE(review): no seed is set, so each knit draws a different subsample;
# call set.seed() before this step if the figures must be reproducible.
# The proteins have disjoint rows, so the subsamples are simply stacked.
alignments_sim_partial <- map2(
  empirical_n_seq$protein,
  empirical_n_seq$n_seq,
  sample_sequences
) %>%
  bind_rows()

rm(empirical_n_seq, sample_sequences)

```


Actual distribution - all
```{r, message=FALSE}
# determine the number of each AA at each site
#   protein_alignment: long-format alignment with `site` and `aa` columns
#   returns: one row per observed site/aa combination with its `count`
count_aa <- function(protein_alignment) {

  by_site_aa <- group_by(protein_alignment, site, aa)
  summarize(by_site_aa, count = n())

}

# observed amino-acid counts per site, computed separately for each protein
site_aa_count_sim <- alignments_sim %>%
  nest(data = c(sequence, site, aa)) %>%
  mutate(counts = map(data, count_aa)) %>%
  select(-data) %>%
  unnest(cols = counts)

# add unobserved amino acids to counts
# template: the 20 amino-acid letters, each with a count of zero
empty_counts <- tibble(
  aa = strsplit("ACDEFGHIKLMNPQRSTVWY", split = "")[[1]],
  count = rep(0, 20)
)

# 1. First write a function that can take a data frame with amino acids at one
# site, and return a data frame with counts, including zeros.
#   site_df: rows of one site, with an `aa` column
#   returns: observed counts followed by a zero-count row for every amino acid
#            in the global `empty_counts` template that was not observed
count_zero_aa_site <- function(site_df) {

  counts <- site_df %>%
    group_by(aa) %>%
    summarize(count = n())

  unobserved <- anti_join(empty_counts, counts, by = "aa")

  # the two tables share no amino acid, so appending rows is all that is
  # needed (the previous full_join() matched on both aa and count implicitly)
  bind_rows(counts, unobserved)

}

# 2. Function to call that function for each protein.
#   protein_df: long-format alignment of one protein (must contain `site`)
#   returns: counts, zeros included, for every site of the protein
count_zero_aa_protein <- function(protein_df) {

  per_site <- nest(protein_df, data = -site)
  per_site <- mutate(per_site, counts = map(data, count_zero_aa_site))
  unnest(select(per_site, -data), cols = counts)

}

# 3. Apply to all proteins
all_count_sim <- alignments_sim %>%
  nest(data = c(sequence, site, aa)) %>%
  mutate(all_counts = map(data, count_zero_aa_protein)) %>%
  select(-data) %>%
  unnest(cols = all_counts)

```


Actual distribution - partial
```{r, message=FALSE}
# determine the number of each AA at each site (subsampled alignments)
site_aa_count_sim_partial <- alignments_sim_partial %>%
  nest(data = c(sequence, site, aa)) %>%
  mutate(counts = map(data, count_aa)) %>%
  select(-data) %>%
  unnest(cols = counts)

# add unobserved amino acids to counts
all_count_sim_partial <- alignments_sim_partial %>%
  nest(data = c(sequence, site, aa)) %>%
  mutate(all_counts = map(data, count_zero_aa_protein)) %>%
  select(-data) %>%
  unnest(cols = all_counts)

```


Estimated distribution - all
```{r, message=FALSE}
# USING ONLY OBSERVED AA
# order aa by relative frequency
#   protein_df: per-site counts with `site` and `count` columns
#   returns: the same rows, grouped by site and sorted by decreasing count
#            within each site
order_aa <- function(protein_df) {

  by_site <- group_by(protein_df, site)
  arrange(by_site, desc(count), .by_group = TRUE)

}

# sort the counts of every protein site by site
ordered_count_sim <- all_count_sim %>%
  nest(data = c(site, aa, count)) %>%
  mutate(ordered_count = map(data, order_aa)) %>%
  select(-data) %>%
  unnest(cols = ordered_count)

# make the categorical variable (aa) numerical (k) by numbering aa's 0-19 in
# order of freq
#   protein_df: counts already sorted within site; every site must have
#               exactly 20 rows, otherwise mutate() fails on the length of k
#   returns: the input grouped by site with the rank column `k` added
number_aa <- function(protein_df) {

  by_site <- group_by(protein_df, site)
  mutate(by_site, k = as.numeric(seq_len(20) - 1L))

}

# add the rank k to every site of every protein
ordered_count_sim <- ordered_count_sim %>%
  nest(data = c(site, aa, count)) %>%
  mutate(add_k = map(data, number_aa)) %>%
  select(-data) %>%
  unnest(cols = add_k)

# remove aa with zero values
observed_aa_sim <- filter(ordered_count_sim, count > 0)

# count relative to most frequent - rescale to 1 for highly conserved sites
#   protein_df: per-site counts with `site` and `count` columns
#   returns: the input grouped by site with `rel_count` = count divided by
#            the largest count at that site
add_rel_count <- function(protein_df) {

  by_site <- group_by(protein_df, site)
  mutate(by_site, rel_count = count / max(count))

}

# relative counts for the observed amino acids
observed_aa_sim <- observed_aa_sim %>%
  nest(data = c(site, aa, count, k)) %>%
  mutate(rel_count = map(data, add_rel_count)) %>%
  select(-data) %>%
  unnest(cols = rel_count)

# log-transformed data
#   protein_df: table with a `rel_count` column
#   returns: the input with `ln_count`, the natural log of `rel_count`
add_ln_count <- function(protein_df) {

  mutate(protein_df, ln_count = log(rel_count))

}

# log relative counts for the observed amino acids
observed_aa_sim <- observed_aa_sim %>%
  nest(data = c(site, aa, count, k, rel_count)) %>%
  mutate(log_count = map(data, add_ln_count)) %>%
  select(-data) %>%
  unnest(cols = log_count)

# fit to a linear function, one model per site, using map
# SET INTERCEPT TO 0
#   protein_df: observed amino acids of one protein with columns
#               site, aa, count, k, rel_count, ln_count
#   returns: one row per site with `slope` (coefficient of k) and `r2`
#            (adjusted R-squared reported by summary())
fit_lm_protein <- function(protein_df) {

  per_site <- nest(protein_df, data = c(aa, count, k, rel_count, ln_count))

  per_site %>%
    mutate(
      fit = map(data, function(site_df) lm(ln_count ~ 0 + k, data = site_df)),
      slope = map_dbl(fit, function(model) model$coefficients[1]),
      fit_sum = map(fit, summary),
      r2 = map_dbl(fit_sum, function(model_sum) model_sum$adj.r.squared)
    ) %>%
    select(-data, -fit, -fit_sum)

}

# fit every site of every protein
observed_fits_sim <- observed_aa_sim %>%
  nest(data = c(site, aa, count, k, rel_count, ln_count)) %>%
  mutate(fits = map(data, fit_lm_protein)) %>%
  select(-data) %>%
  unnest(cols = fits)

# added rel_count and ln_count to all aa
ordered_count_sim <- ordered_count_sim %>%
  nest(data = c(site, aa, count, k)) %>%
  mutate(rel_count = map(data, add_rel_count)) %>%
  select(-data) %>%
  unnest(cols = rel_count)

ordered_count_sim <- ordered_count_sim %>%
  nest(data = c(site, aa, count, k, rel_count)) %>%
  mutate(ln_count = map(data, add_ln_count)) %>%
  select(-data) %>%
  unnest(cols = ln_count)

# attach the fitted slope of each site and evaluate the fitted exponential.
# Only `slope` is joined: if `r2` came along it would stay outside every later
# nest(data = c(...)) call, the table would be split per (protein, r2)
# instead of per protein, and the FDR correction in chi_square_protein()
# would be applied within those fragments rather than across a protein.
ordered_count_sim <- ordered_count_sim %>%
  left_join(
    select(observed_fits_sim, protein, site, slope),
    by = c("protein", "site")
  ) %>%
  mutate(est_dist = slope * exp(k * slope))

# rescale to compare w raw count
#   protein_df: table with `site`, `count` and `est_dist` columns
#   returns: the input grouped by site with `est_rel` (est_dist as a share of
#            its site total) and `est_count` (est_rel times the site's total
#            count)
rescale_est_dist <- function(protein_df) {

  by_site <- group_by(protein_df, site)
  mutate(
    by_site,
    est_rel = est_dist / sum(est_dist),
    est_count = sum(count) * est_rel
  )

}

# expected counts for every site of every protein
ordered_count_sim <- ordered_count_sim %>%
  nest(
    data = c(site, aa, count, k, rel_count, ln_count, slope, est_dist)
  ) %>%
  mutate(rescaled = map(data, rescale_est_dist)) %>%
  select(-data) %>%
  unnest(cols = rescaled)
  
  
```


Estimated distribution - partial
```{r, message=FALSE}
# USING ONLY OBSERVED AA
# order aa by relative frequency
ordered_count_sim_partial <- all_count_sim_partial %>%
  nest(data = c(site, aa, count)) %>%
  mutate(ordered_count = map(data, order_aa)) %>%
  select(-data) %>%
  unnest(cols = ordered_count)

# make the categorical variable (aa) numerical (k) by numbering aa's 0-19 in
# order of freq
ordered_count_sim_partial <- ordered_count_sim_partial %>%
  nest(data = c(site, aa, count)) %>%
  mutate(add_k = map(data, number_aa)) %>%
  select(-data) %>%
  unnest(cols = add_k)

# remove aa with zero values
observed_aa_sim_partial <- ordered_count_sim_partial %>%
  filter(count > 0)

# count relative to most frequent - rescale to 1 for highly conserved sites
observed_aa_sim_partial <- observed_aa_sim_partial %>%
  nest(data = c(site, aa, count, k)) %>%
  mutate(rel_count = map(data, add_rel_count)) %>%
  select(-data) %>%
  unnest(cols = rel_count)

# log-transformed data
observed_aa_sim_partial <- observed_aa_sim_partial %>%
  nest(data = c(site, aa, count, k, rel_count)) %>%
  mutate(log_count = map(data, add_ln_count)) %>%
  select(-data) %>%
  unnest(cols = log_count)

# fit to a linear function, one model per site
# SET INTERCEPT TO 0
observed_fits_sim_partial <- observed_aa_sim_partial %>%
  nest(data = c(site, aa, count, k, rel_count, ln_count)) %>%
  mutate(fits = map(data, fit_lm_protein)) %>%
  select(-data) %>%
  unnest(cols = fits)

# added rel_count and ln_count to all aa
ordered_count_sim_partial <- ordered_count_sim_partial %>%
  nest(data = c(site, aa, count, k)) %>%
  mutate(rel_count = map(data, add_rel_count)) %>%
  select(-data) %>%
  unnest(cols = rel_count)

ordered_count_sim_partial <- ordered_count_sim_partial %>%
  nest(data = c(site, aa, count, k, rel_count)) %>%
  mutate(ln_count = map(data, add_ln_count)) %>%
  select(-data) %>%
  unnest(cols = ln_count)

# attach the slopes fitted on the SUBSAMPLED alignments (previously the
# slopes of the full simulated alignments, observed_fits_sim, were joined
# here by mistake).
# Only `slope` is joined: if `r2` came along it would stay outside every later
# nest(data = c(...)) call, the table would be split per (protein, r2)
# instead of per protein, and the FDR correction in chi_square_protein()
# would be applied within those fragments rather than across a protein.
ordered_count_sim_partial <- ordered_count_sim_partial %>%
  left_join(
    select(observed_fits_sim_partial, protein, site, slope),
    by = c("protein", "site")
  ) %>%
  mutate(est_dist = slope * exp(k * slope))

# rescale to compare w raw count
ordered_count_sim_partial <- ordered_count_sim_partial %>%
  nest(
    data = c(site, aa, count, k, rel_count, ln_count, slope, est_dist)
  ) %>%
  mutate(rescaled = map(data, rescale_est_dist)) %>%
  select(-data) %>%
  unnest(cols = rescaled)
  
```


Chi-squared test to compare distributions - all
```{r, message=FALSE}
# use raw count
# Chi-squared comparison of observed and expected counts, site by site.
#   protein_df: table with site, aa, count, k, rel_count, ln_count, slope,
#               est_dist, est_rel and est_count
#   returns: one row per site with `chisq` (list column), `p_value` (upper
#            tail of a chi-squared distribution with 18 degrees of freedom)
#            and `p.adjusted` (FDR adjustment over the rows of this table)
chi_square_protein <- function(protein_df) {

  per_site <- protein_df %>%
    nest(
      data = c(
        aa, count, k, rel_count, ln_count, slope,
        est_dist, est_rel, est_count
      )
    ) %>%
    ungroup()

  per_site %>%
    mutate(
      chisq = map(
        data,
        function(d) sum((d$count - d$est_count)^2 / d$est_count)
      ),
      p_value = map_dbl(
        chisq,
        function(stat) pchisq(stat, 18, lower.tail = FALSE)
      )
    ) %>%
    select(-data) %>%
    mutate(p.adjusted = p.adjust(p_value, method = "fdr"))

}

# run the chi-squared test for every protein and label each site
chisq_results_sim <- ordered_count_sim %>%
  nest(
    data = c(
      site, aa, count, k, rel_count, ln_count, slope,
      est_dist, est_rel, est_count
    )
  ) %>%
  mutate(chisq_results = map(data, chi_square_protein)) %>%
  select(-data) %>%
  unnest(chisq_results) %>%
  # REMOVE conserved sites with a single aa: their p-value is missing.
  # is.na() covers both NA and NaN; the previous `p_value != "NaN"` compared
  # the numbers as strings.
  filter(!is.na(p_value)) %>%
  mutate(
    result = if_else(p.adjusted < 0.05, "fail", "pass"),
    # chi_square_protein() returns chisq as a list column
    chisq = as.numeric(chisq)
  )

# fraction of sites that fail the test
mean(chisq_results_sim$result == "fail")

```


$r^2$ to compare distributions - all
```{r, message=FALSE}
# keep the per-site adjusted R-squared of the full simulated alignments
regression_perform_all <- select(observed_fits_sim, protein, site, r2)

```


Chi-squared test to compare distributions - partial
```{r, message=FALSE}
# use raw count
chisq_results_sim_partial <- ordered_count_sim_partial %>%
  nest(
    data = c(
      site, aa, count, k, rel_count, ln_count, slope,
      est_dist, est_rel, est_count
    )
  ) %>%
  mutate(chisq_results = map(data, chi_square_protein)) %>%
  select(-data) %>%
  unnest(chisq_results) %>%
  # REMOVE conserved sites with a single aa: their p-value is missing.
  # is.na() covers both NA and NaN; the previous `p_value != "NaN"` compared
  # the numbers as strings.
  filter(!is.na(p_value)) %>%
  mutate(
    result = if_else(p.adjusted < 0.05, "fail", "pass"),
    # chi_square_protein() returns chisq as a list column
    chisq = as.numeric(chisq)
  )

# fraction of sites that fail the test
mean(chisq_results_sim_partial$result == "fail")

```


$r^2$ to compare distributions - partial
```{r, message=FALSE}
# keep the per-site adjusted R-squared of the subsampled alignments
regression_perform_partial <- select(
  observed_fits_sim_partial,
  protein, site, r2
)

```


```{r, message=FALSE}
# drop the intermediate tables of the simulated analysis; only the chi-squared
# and regression summaries are used below
rm(
  list = c(
    "alignments_sim", "alignments_sim_partial",
    "all_count_sim", "all_count_sim_partial",
    "empty_counts",
    "observed_aa_sim", "observed_aa_sim_partial",
    "observed_fits_sim", "observed_fits_sim_partial",
    "site_aa_count_sim", "site_aa_count_sim_partial",
    "ordered_count_sim", "ordered_count_sim_partial"
  )
)

```


## Empirical data
Read in and reformat data 
```{r, message=FALSE}
# list all *_Aligned_Sequences.fasta files in /data/real/
# `pattern` is a regular expression (not a shell glob), so the dot is escaped
# and the match is anchored at the end of the file name
empirical_files <- dir(
  "data/real",
  pattern = "_Aligned_Sequences\\.fasta$",
  full.names = TRUE
)

# import alignment
# Read one FASTA alignment into a data frame.
#   filename: path to a FASTA file of amino-acid sequences
#   returns:  data frame with `seq_num` (1, 2, ... in file order) and `aa_seq`
#             (each sequence as a single string)
import_alignment <- function(filename) {

  fasta_file <- readAAStringSet(filename, format = "fasta")
  aa_seq <- paste(fasta_file)

  # seq_along() yields an empty index for an empty file, where
  # seq(1:length(x)) would yield 1:2
  data.frame(
    seq_num = seq_along(aa_seq),
    aa_seq = aa_seq,
    stringsAsFactors = FALSE
  )

}

# change sequences from strings into character vectors
#   real_align: data frame with an `aa_seq` column of equal-length strings
#   returns (invisibly): data frame with one row per sequence and one column
#            per alignment position
aa_string_to_vect <- function(real_align) {

  residues <- strsplit(real_align$aa_seq, split = "")
  # as.data.frame() puts each sequence in a column; transpose so that each
  # sequence becomes a row
  by_column <- as.data.frame(residues, row.names = NULL)
  invisible(as.data.frame(t(by_column), row.names = NULL))

}

# tidy data frame
# Reshape the wide per-sequence table into long format.
#   int_df:  data frame with one row per sequence and columns V1, V2, ...
#   returns: one row per sequence/position with `num_seq` (row index), `site`
#            (the text after "V" in the column name, still character) and `aa`
tidy_df <- function(int_df) {

  int_df %>%
    # seq_len() also handles a zero-row table, where seq(1:nrow()) gives 1:2
    mutate(num_seq = seq_len(nrow(int_df))) %>%
    pivot_longer(
      -num_seq,
      names_to = "site_raw",
      values_to = "aa"
    ) %>%
    extract(site_raw, "site", "V(.*)", remove = TRUE)

}

#import_alignment(empirical_files[1]) %>% View()
# Import, split and tidy every empirical alignment, then stack them into one
# long table. The protein label is captured from the file path by the
# pattern "data/real/(.*)_A".
alignments_emp <- tibble(filename = empirical_files) %>%
  extract(filename, "protein", "data/real/(.*)_A", remove = FALSE) %>%
  mutate(
    raw_data = map(filename, import_alignment),
    long_data = map(raw_data, aa_string_to_vect),
    tidy_data = map(long_data, tidy_df)
  ) %>%
  select(-filename, -raw_data, -long_data) %>%
  unnest(cols = tidy_data)

# site numbers were extracted from column names as text
alignments_emp$site <- as.numeric(alignments_emp$site)

# replace undefined aas
# "X" = any aa, "B" = N or D, "Z" = Q or E
# show the symbols present before recoding; unique() works for character and
# factor columns alike, whereas levels() returns NULL for character
sort(unique(as.character(alignments_emp$aa)))

alignments_emp$aa <- recode(alignments_emp$aa, X = "-", B = "N", Z = "Q")

# ... and after recoding
sort(unique(as.character(alignments_emp$aa)))

alignments_emp$aa <- as.character(alignments_emp$aa)

```


Actual distribution
```{r, message=FALSE}
# determine the number of each AA at each site
#   protein_alignment: long-format alignment with `site` and `aa` columns
#   returns: one row per observed site/aa combination with its `count`
count_aa <- function(protein_alignment) {

  by_site_aa <- group_by(protein_alignment, site, aa)
  summarize(by_site_aa, count = n())

}

# observed counts per site, computed separately for each protein
site_aa_count_emp <- alignments_emp %>%
  nest(data = c(num_seq, site, aa)) %>%
  mutate(counts = map(data, count_aa)) %>%
  select(-data) %>%
  unnest(cols = counts)

# template: the 20 amino-acid letters, each with a count of zero
empty_counts <- tibble(
  aa = strsplit("ACDEFGHIKLMNPQRSTVWY", split = "")[[1]],
  count = rep(0, 20)
)


# 1. First write a function that can take a data frame with amino acids at one
# site, and return a data frame with counts, including zeros.
#   site_df: rows of one site, with an `aa` column
#   returns: observed counts followed by a zero-count row for every amino acid
#            in the global `empty_counts` template that was not observed
count_zero_aa_site <- function(site_df) {

  counts <- site_df %>%
    group_by(aa) %>%
    summarize(count = n())

  unobserved <- anti_join(empty_counts, counts, by = "aa")

  # the two tables share no amino acid, so appending rows is all that is
  # needed (the previous full_join() matched on both aa and count implicitly)
  bind_rows(counts, unobserved)

}

# 2. Function to call that function for each protein.
#   protein_df: long-format alignment of one protein (must contain `site`)
#   returns: counts, zeros included, for every site of the protein
count_zero_aa_protein <- function(protein_df) {

  per_site <- nest(protein_df, data = -site)
  per_site <- mutate(per_site, counts = map(data, count_zero_aa_site))
  unnest(select(per_site, -data), cols = counts)

}

# 3. Apply to all proteins
all_count_emp <- alignments_emp %>%
  nest(data = c(num_seq, site, aa)) %>%
  mutate(all_counts = map(data, count_zero_aa_protein)) %>%
  select(-data) %>%
  unnest(cols = all_counts)

```


Estimated distribution 
```{r, message=FALSE}
# USING ONLY OBSERVED AA
# order aa by relative frequency
#   protein_df: per-site counts with `site` and `count` columns
#   returns: the same rows, grouped by site and sorted by decreasing count
#            within each site
order_aa <- function(protein_df) {

  by_site <- group_by(protein_df, site)
  arrange(by_site, desc(count), .by_group = TRUE)

}

# sort the counts of every protein site by site
ordered_count_emp <- all_count_emp %>%
  nest(data = c(site, aa, count)) %>%
  mutate(ordered_count = map(data, order_aa)) %>%
  select(-data) %>%
  unnest(cols = ordered_count)

# EXCLUDE all '-' entries/gaps in alignment
no_gaps_emp <- filter(ordered_count_emp, aa != "-")

# make the categorical variable (aa) numerical (k) by numbering aa's 0-19 in
# order of freq
#   protein_df: counts already sorted within site; every site must have
#               exactly 20 rows, otherwise mutate() fails on the length of k
#   returns: the input grouped by site with the rank column `k` added
number_aa <- function(protein_df) {

  by_site <- group_by(protein_df, site)
  mutate(by_site, k = as.numeric(seq_len(20) - 1L))

}

# add the rank k to every site of every protein (gaps already removed)
ordered_count_emp <- no_gaps_emp %>%
  nest(data = c(site, aa, count)) %>%
  mutate(add_k = map(data, number_aa)) %>%
  select(-data) %>%
  unnest(cols = add_k)

# remove aa with zero values
observed_aa_emp <- filter(ordered_count_emp, count > 0)

# count relative to most frequent - rescale to 1 for highly conserved sites
#   protein_df: per-site counts with `site` and `count` columns
#   returns: the input grouped by site with `rel_count` = count divided by
#            the largest count at that site
add_rel_count <- function(protein_df) {

  by_site <- group_by(protein_df, site)
  mutate(by_site, rel_count = count / max(count))

}

# relative counts for the observed amino acids
observed_aa_emp <- observed_aa_emp %>%
  nest(data = c(site, aa, count, k)) %>%
  mutate(rel_count = map(data, add_rel_count)) %>%
  select(-data) %>%
  unnest(cols = rel_count)

# log-transformed data
#   protein_df: table with a `rel_count` column
#   returns: the input with `ln_count`, the natural log of `rel_count`
add_ln_count <- function(protein_df) {

  mutate(protein_df, ln_count = log(rel_count))

}

# log relative counts for the observed amino acids
observed_aa_emp <- observed_aa_emp %>%
  nest(data = c(site, aa, count, k, rel_count)) %>%
  mutate(log_count = map(data, add_ln_count)) %>%
  select(-data) %>%
  unnest(cols = log_count)

# fit to a linear function, one model per site, using map
# SET INTERCEPT TO 0
#   protein_df: observed amino acids of one protein with columns
#               site, aa, count, k, rel_count, ln_count
#   returns: one row per site with `slope` (coefficient of k) and `r2`
#            (adjusted R-squared reported by summary())
fit_lm_protein <- function(protein_df) {

  per_site <- nest(protein_df, data = c(aa, count, k, rel_count, ln_count))

  per_site %>%
    mutate(
      fit = map(data, function(site_df) lm(ln_count ~ 0 + k, data = site_df)),
      slope = map_dbl(fit, function(model) model$coefficients[1]),
      fit_sum = map(fit, summary),
      r2 = map_dbl(fit_sum, function(model_sum) model_sum$adj.r.squared)
    ) %>%
    select(-data, -fit, -fit_sum)

}

# fit every site of every protein
observed_fits_emp <- observed_aa_emp %>%
  nest(data = c(site, aa, count, k, rel_count, ln_count)) %>%
  mutate(fits = map(data, fit_lm_protein)) %>%
  select(-data) %>%
  unnest(cols = fits)

# added rel_count and ln_count to all aa
ordered_count_emp <- ordered_count_emp %>%
  nest(data = c(site, aa, count, k)) %>%
  mutate(rel_count = map(data, add_rel_count)) %>%
  select(-data) %>%
  unnest(cols = rel_count)

ordered_count_emp <- ordered_count_emp %>%
  nest(data = c(site, aa, count, k, rel_count)) %>%
  mutate(ln_count = map(data, add_ln_count)) %>%
  select(-data) %>%
  unnest(cols = ln_count)

# attach the fitted slope of each site and evaluate the fitted exponential.
# Only `slope` is joined: if `r2` came along it would stay outside every later
# nest(data = c(...)) call, the table would be split per (protein, r2)
# instead of per protein, and the FDR correction in chi_square_protein()
# would be applied within those fragments rather than across a protein.
ordered_count_emp <- ordered_count_emp %>%
  left_join(
    select(observed_fits_emp, protein, site, slope),
    by = c("protein", "site")
  ) %>%
  mutate(est_dist = slope * exp(k * slope))

# rescale to compare w raw count
#   protein_df: table with `site`, `count` and `est_dist` columns
#   returns: the input grouped by site with `est_rel` (est_dist as a share of
#            its site total) and `est_count` (est_rel times the site's total
#            count)
rescale_est_dist <- function(protein_df) {

  by_site <- group_by(protein_df, site)
  mutate(
    by_site,
    est_rel = est_dist / sum(est_dist),
    est_count = sum(count) * est_rel
  )

}

# expected counts for every site of every protein
ordered_count_emp <- ordered_count_emp %>%
  nest(
    data = c(site, aa, count, k, rel_count, ln_count, slope, est_dist)
  ) %>%
  mutate(rescaled = map(data, rescale_est_dist)) %>%
  select(-data) %>%
  unnest(cols = rescaled)

```


Chi-squared test to compare distributions
```{r, message=FALSE}
# use raw count
# Chi-squared comparison of observed and expected counts, site by site.
#   protein_df: table with site, aa, count, k, rel_count, ln_count, slope,
#               est_dist, est_rel and est_count
#   returns: one row per site with `chisq` (list column), `p_value` (upper
#            tail of a chi-squared distribution with 18 degrees of freedom)
#            and `p.adjusted` (FDR adjustment over the rows of this table)
chi_square_protein <- function(protein_df) {

  per_site <- protein_df %>%
    nest(
      data = c(
        aa, count, k, rel_count, ln_count, slope,
        est_dist, est_rel, est_count
      )
    ) %>%
    ungroup()

  per_site %>%
    mutate(
      chisq = map(
        data,
        function(d) sum((d$count - d$est_count)^2 / d$est_count)
      ),
      p_value = map_dbl(
        chisq,
        function(stat) pchisq(stat, 18, lower.tail = FALSE)
      )
    ) %>%
    select(-data) %>%
    mutate(p.adjusted = p.adjust(p_value, method = "fdr"))

}

# run the chi-squared test for every protein and label each site
chisq_results_emp <- ordered_count_emp %>%
  nest(
    data = c(
      site, aa, count, k, rel_count, ln_count, slope,
      est_dist, est_rel, est_count
    )
  ) %>%
  mutate(chisq_results = map(data, chi_square_protein)) %>%
  select(-data) %>%
  unnest(chisq_results) %>%
  # REMOVE conserved sites with a single aa: their p-value is missing.
  # is.na() covers both NA and NaN; the previous `p_value != "NaN"` compared
  # the numbers as strings.
  filter(!is.na(p_value)) %>%
  mutate(result = if_else(p.adjusted < 0.05, "fail", "pass"))

# fraction of sites that fail the test
mean(chisq_results_emp$result == "fail")


```


$r^2$ of regression - empirical
```{r, message=FALSE}
# keep the per-site adjusted R-squared of the empirical alignments
regression_perform_emp <- select(observed_fits_emp, protein, site, r2)

```


```{r, message=FALSE}
# drop the intermediate tables of the empirical analysis and the file lists
rm(
  list = c(
    "alignments_emp", "all_count_emp", "empty_counts",
    "empirical_files", "simulated_files", "no_gaps_emp",
    "observed_aa_emp", "observed_fits_emp",
    "ordered_count_emp", "site_aa_count_emp"
  )
)

```


## Comparison across alignments
Combine $\chi^2$ results
```{r, message=FALSE}
# keep one pass/fail column per alignment type, named after its source
chisq_results_sim <- select(
  chisq_results_sim,
  protein, site, result_sim = result
)

chisq_results_sim_partial <- select(
  chisq_results_sim_partial,
  protein, site, result_sim_partial = result
)

chisq_results_emp <- select(
  chisq_results_emp,
  protein, site, result_emp = result
)

# line the three result sets up on their shared columns, then stack the
# result columns into long format
all_results <- chisq_results_emp %>%
  full_join(chisq_results_sim) %>%
  full_join(chisq_results_sim_partial) %>%
  pivot_longer(
    c(result_sim, result_emp, result_sim_partial),
    names_to = "alignment_type",
    values_to = "result"
  )


```


Figure 4
```{r}
# based on code from: https://wilkelab.org/practicalgg/articles/bundestag_pie.html

library(ggforce)

# Count the sites of one protein by alignment type and test result.
#   protein_df: table with `alignment_type` and `result` columns
#   returns: one row per alignment_type/result combination with its `count`
summarize_results <- function(protein_df){

  by_type_result <- group_by(protein_df, alignment_type, result)
  summarize(by_type_result, count = n())

}

# number of sites per protein, alignment type and result
tidy_results <- all_results %>%
  nest(data = c(site, alignment_type, result)) %>%
  mutate(sum_result = map(data, summarize_results)) %>%
  select(-data) %>%
  unnest(sum_result)

# pie-slice geometry, computed within each protein/alignment-type pie
tidy_results <- tidy_results %>%
  group_by(protein, alignment_type) %>%
  #arrange(count) %>%
  arrange(result) %>%
  mutate(
    end_angle = 2*pi*cumsum(count)/sum(count),   # ending angle for each pie slice
    start_angle = lag(end_angle, default = 0),   # starting angle for each pie slice
    mid_angle = 0.5*(start_angle + end_angle),   # middle of each pie slice
    hjust = ifelse(mid_angle > pi, 1, 0),
    vjust = ifelse(mid_angle < pi/2 | mid_angle > 3*pi/2, 0, 1)
  ) %>%
  # the grouping is only needed for the angles; drop it so the relabelling
  # of the grouping columns below works on a plain tibble
  ungroup()

rpie <- 1
rlabel_out <- 1.05 * rpie
rlabel_in <- 0.6 * rpie

# readable facet labels: alignment type names, protein label cut at its last
# underscore, and missing results shown as the text "NA"
tidy_results <- tidy_results %>%
  mutate(
    alignment_type = recode(
      alignment_type,
      result_sim = "Simulated (all)",
      result_emp = "Empirical",
      result_sim_partial = "Simulated"
    )
  ) %>%
  extract(protein, "protein", "(.*)_", remove = TRUE) %>%
  mutate(result = replace_na(result, "NA"))

myfillcolors <- c("fail" = "#E69F00", "pass" = "#0072B2", "NA" = "#999999")

# first five proteins are plotted in Figure 4, the other five in Figure S2
tidy_results1 <- tidy_results %>%
  filter(protein %in% c("1B4T", "1CI0", "1EFV", "1G58", "1GV3"))

tidy_results2 <- tidy_results %>%
  filter(protein %in% c("2A84", "2AIU", "2BCG", "2BR9", "2CFE"))

# Draw a grid of pie charts: one column per protein, one row per alignment
# type, slices filled by test result.
#   results: table with start_angle, end_angle, result, alignment_type and
#            protein columns
#   returns: the ggplot object
# Uses the globals `rpie` (pie radius) and `myfillcolors` (fill palette).
plot_result_pies <- function(results) {

  ggplot(results) +
    geom_arc_bar(
      aes(
        x0 = 0,
        y0 = 0,
        r0 = 0,
        r = rpie,
        start = start_angle,
        end = end_angle,
        fill = result
      ),
      color = "white"
    ) +
    scale_fill_manual(values = myfillcolors) +
    coord_fixed() +
    facet_grid(
      vars(alignment_type),
      vars(protein)
    ) +
    theme_map() +
    labs(fill = "Result") +
    theme(legend.position = "bottom")

}

fig4 <- plot_result_pies(tidy_results1)

fig4

save_plot("figure4.png", fig4, ncol = 1, nrow = 1, base_height = 5,
          base_asp = 1.618, base_width = NULL)

figs2 <- plot_result_pies(tidy_results2)

#plot_grid(fig4a, fig4b, nrow = 2) -> fig4

figs2

save_plot("figures2.png", figs2, ncol = 1, nrow = 1, base_height = 5,
          base_asp = 1.618, base_width = NULL)


```


Combine $r^2$ results
```{r, message=FALSE}

# give each R-squared column a name that identifies its alignment type
regression_perform_all <- regression_perform_all %>%
  mutate(r2_sim = r2) %>%
  select(-r2)

regression_perform_partial <- regression_perform_partial %>%
  mutate(r2_partial = r2) %>%
  select(-r2)

regression_perform_emp <- regression_perform_emp %>%
  mutate(r2_emp = r2) %>%
  select(-r2)

# line the three tables up on their shared columns, then stack the R-squared
# columns into long format
all_results_r2 <- regression_perform_all %>%
  full_join(regression_perform_partial) %>%
  full_join(regression_perform_emp) %>%
  pivot_longer(
    c(r2_sim, r2_partial, r2_emp),
    names_to = "alignment_type",
    values_to = "r2"
  )

# readable labels for the plot axis
all_results_r2$alignment_type <- recode(
  all_results_r2$alignment_type,
  r2_sim = "Simulated (all)",
  r2_partial = "Simulated",
  r2_emp = "Empirical"
)

```


Supplementary figure 3
```{r, message=FALSE}
library(ggtext)

# cut the protein label at its last underscore for the facet titles
all_results_r2 <- extract(
  all_results_r2,
  protein, "protein", "(.*)_",
  remove = TRUE
)

# horizontal box plots of R-squared per alignment type, one panel per protein
figs3 <- ggplot(all_results_r2, aes(x = alignment_type, y = r2)) +
  geom_boxplot(aes(fill = alignment_type), alpha = 0.5) +
  #geom_jitter(aes(color = alignment_type)) +
  facet_wrap(vars(protein)) +
  scale_y_continuous(
    limits = c(0, 1),
    breaks = c(0, .2, .4, .6, .8, 1)
  ) +
  coord_flip() +
  labs(y = "*R*<sup>2</sup>") +
  theme(
    axis.title.y = element_blank(),
    legend.position = "none",
    # render the markdown/HTML in the axis title
    axis.title.x = element_markdown()
  ) +
  panel_border()

figs3

save_plot("figures3.png", figs3, ncol = 1, nrow = 1, base_height = 5,
          base_asp = 1.618, base_width = NULL)

```