empirical_protein_all.Rmd

---
title: "Empirical protein alignments"
author: "Mackenzie Johnson"
date: "`r format(Sys.time(), '%B %d, %Y')`"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

## Analysis of distribution fitting for multiple empirical protein alignments

This RMarkdown document repeats the fitting procedures for 10 empirical protein alignments and produces Supplementary Figure 1. It serves the same purpose as simulated_protein_all.Rmd, but for empirical protein alignments.

```{r, message=FALSE}
# load packages
library(tidyverse)
library(Biostrings)
library(stringr)
library(cowplot)
library(broom)
library(ggtext)
theme_set(theme_cowplot())
```

Read in and reformat data 
```{r, message=FALSE}
# find every aligned-sequence FASTA file in data/real/
# NOTE: `pattern` is a regular expression (not a shell glob), so the literal
# dot is escaped and the match is anchored at the end of the file name
empirical_files <- dir(
  "data/real",
  pattern = "_Aligned_Sequences\\.fasta$",
  full.names = TRUE
)

# import alignment
# Reads one protein FASTA file with readAAStringSet() and returns a data frame
# with one row per sequence:
#   seq_num - position of the sequence in the file (1, 2, ...)
#   aa_seq  - the sequence as a single character string
import_alignment <- function(filename) {

  fasta_file <- readAAStringSet(filename, format = "fasta")
  aa_seq <- paste(fasta_file)
  # seq_along() gives a zero-length index for an empty file, whereas
  # seq(1:length(x)) would produce 1:2 and make data.frame() fail
  seq_num <- seq_along(aa_seq)

  data.frame(seq_num, aa_seq, stringsAsFactors = FALSE)

}

# change sequences from strings into character vectors
# Only the `aa_seq` column of `real_align` is used. Returns a data frame with
# one row per sequence and one single-character column per alignment site,
# named V1, V2, ...
aa_string_to_vect <- function(real_align) {

  residues <- strsplit(real_align$aa_seq, split = "")

  # every sequence must cover the same number of sites; fail loudly instead of
  # letting rbind() recycle the shorter sequences
  stopifnot(length(unique(lengths(residues))) <= 1)

  # stack the per-sequence character vectors into a sequence x site matrix
  site_matrix <- do.call(rbind, residues)
  as.data.frame(site_matrix, stringsAsFactors = FALSE)

}

# tidy data frame
# Reshapes the wide sequence x site data frame (columns V1, V2, ...) into long
# format with one row per sequence/site combination and the columns
#   num_seq - row index of the sequence in `int_df`
#   site    - alignment position as text (whatever follows the "V")
#   aa      - value found in that column for that sequence
tidy_df <- function(int_df) {

  int_df %>%
    # seq_len() is safe for a zero-row input, unlike seq(1:nrow(int_df))
    mutate(num_seq = seq_len(nrow(int_df))) %>%
    pivot_longer(
      -num_seq,
      names_to = "site_raw",
      values_to = "aa"
    ) %>%
    # keep only the part of the column name after the leading "V"
    extract(site_raw, "site", "V(.*)", remove = TRUE)

}

#import_alignment(empirical_files[1]) %>% View()
# Build one long table covering all proteins: read each file, split the
# sequences into per-site residues, reshape to long format, and label the
# rows with the protein identifier taken from the file name
alignments <- tibble(filename = empirical_files) %>%
  mutate(
    tidy_data = map(
      filename,
      ~ tidy_df(aa_string_to_vect(import_alignment(.x)))
    )
  ) %>%
  extract(
    filename,
    "protein",
    "data/real/(.*)_A",
    remove = TRUE
  ) %>%
  unnest(cols = tidy_data)

# replace undefined aas
alignments$site <- as.numeric(alignments$site)

# work with plain character codes so the steps below behave the same whether
# the residues arrived as strings or as factors
alignments$aa <- as.character(alignments$aa)

# residue codes present before recoding
# "X" = any aa, "B" = N or D, "Z" = Q or E
sort(unique(alignments$aa))

# treat X as a gap, and map the ambiguous codes B and Z to N and Q
alignments$aa <- recode(alignments$aa, X = "-", B = "N", Z = "Q")

# residue codes remaining after recoding
sort(unique(alignments$aa))

```


Actual distribution
```{r, message=FALSE}
# determine the number of each AA at each site
# Returns one row per (site, aa) combination found in `protein_alignment`,
# with the number of matching rows in `count`
count_aa <- function(protein_alignment) {

  by_site_and_residue <- group_by(protein_alignment, site, aa)
  summarize(by_site_and_residue, count = n())

}

# per-protein counts of every amino acid observed at each site
site_aa_count <- alignments %>%
  nest(data = c(num_seq, site, aa)) %>%
  mutate(counts = map(data, ~ count_aa(.x))) %>%
  select(-data) %>%
  unnest(cols = counts)

# template holding a zero count for each of the 20 standard amino acids
empty_counts <- tibble(
  aa = strsplit("ACDEFGHIKLMNPQRSTVWY", split = "")[[1]],
  count = rep(0, 20)
)


# 1. Count the amino acids at a single site, including zeros.
# `site_df` holds the rows for one site; the result has one row per value of
# `aa` seen there plus a zero-count row for every entry of the global
# `empty_counts` template that was not seen.
count_zero_aa_site <- function(site_df) {

  counts <- summarize(group_by(site_df, aa), count = n())
  # template rows whose amino acid never occurs at this site
  unobserved <- anti_join(empty_counts, counts, by = 'aa')
  full_join(counts, unobserved)

}

# 2. Run count_zero_aa_site() on every site of one protein and stack the
# per-site results into a single data frame keyed by `site`.
count_zero_aa_protein <- function(protein_df) {

  per_site <- nest(protein_df, data = -site)
  with_counts <- mutate(per_site, counts = map(data, count_zero_aa_site))
  unnest(select(with_counts, -data), cols = counts)

}

# 3. Apply to all proteins: zero-filled amino acid counts per protein and site
all_count <- alignments %>%
  nest(data = c(num_seq, site, aa)) %>%
  mutate(all_counts = map(data, ~ count_zero_aa_protein(.x))) %>%
  select(-data) %>%
  unnest(cols = all_counts)

```


Estimated distribution 
```{r, message=FALSE}
# USING ONLY OBSERVED AA
# order aa by relative frequency
# Rows are sorted by site and, within each site, from the highest to the
# lowest `count`; the returned data frame is grouped by `site`
order_aa <- function(protein_df) {

  grouped_by_site <- group_by(protein_df, site)
  arrange(grouped_by_site, desc(count), .by_group = TRUE)

}

# sort the counts of every protein site by site from most to least frequent
ordered_count <- all_count %>%
  nest(data = c(site, aa, count)) %>%
  mutate(ordered_count = map(data, ~ order_aa(.x))) %>%
  select(-data) %>%
  unnest(cols = ordered_count)

# EXCLUDE all '-' entries/gaps in alignment
no_gaps <- filter(ordered_count, aa != "-")

# make the categorical variable (aa) numerical (k) by numbering the rows of
# each site 0, 1, 2, ... in their current order
# NOTE: the numbering follows row order, so `protein_df` must already be
# sorted by descending count within each site for k to rank by frequency
number_aa <- function(protein_df) {

  protein_df %>%
    group_by(site) %>%
    # number by the actual group size; a hard-coded 0:19 errors whenever a
    # site does not have exactly 20 rows
    mutate(k = as.numeric(row_number() - 1))

}

# attach the rank k to every amino acid of every protein site
ordered_count <- no_gaps %>%
  nest(data = c(site, aa, count)) %>%
  mutate(add_k = map(data, ~ number_aa(.x))) %>%
  select(-data) %>%
  unnest(cols = add_k)

# remove aa with zero values
observed_aa <- filter(ordered_count, count > 0)

# count relative to most frequent - rescale to 1 for highly conserved sites
# Adds `rel_count`, each count divided by the largest count at the same site;
# the returned data frame is grouped by `site`
add_rel_count <- function(protein_df) {

  grouped_by_site <- group_by(protein_df, site)
  mutate(grouped_by_site, rel_count = count/max(count))

}

# add the relative count to the observed amino acids, protein by protein
observed_aa <- observed_aa %>%
  nest(data = c(site, aa, count, k)) %>%
  mutate(rel_count = map(data, ~ add_rel_count(.x))) %>%
  select(-data) %>%
  unnest(cols = rel_count)

# log-transformed data
# Adds `ln_count`, the natural logarithm of `rel_count`
add_ln_count <- function(protein_df) {

  mutate(protein_df, ln_count = log(rel_count))

}

# add the log-transformed relative count to the observed amino acids
observed_aa <- observed_aa %>%
  nest(data = c(site, aa, count, k, rel_count)) %>%
  mutate(log_count = map(data, ~ add_ln_count(.x))) %>%
  select(-data) %>%
  unnest(cols = log_count)

# fit to a linear function, one fit per site
# SET INTERCEPT TO 0
# Regresses ln_count on k without an intercept for every site of one protein
# and returns a data frame with `site` and the fitted coefficient `slope`
fit_lm_protein <- function(protein_df) {

  protein_df %>%
    nest(data = c(aa, count, k, rel_count, ln_count)) %>%
    mutate(
      slope = map_dbl(data, function(site_df) {
        site_fit <- lm(ln_count ~ 0 + k, data = site_df)
        site_fit$coefficients[1]
      })
    ) %>%
    select(-data)

}

# fitted slope for every site of every protein
observed_fits <- observed_aa %>%
  nest(data = c(site, aa, count, k, rel_count, ln_count)) %>%
  mutate(fits = map(data, ~ fit_lm_protein(.x))) %>%
  select(-data) %>%
  unnest(cols = fits)

# added rel_count and ln_count to all aa, then attach the per-site slope and
# evaluate the fitted distribution slope*exp(k*slope) for every amino acid
ordered_count <- ordered_count %>%
  nest(data = c(site, aa, count, k)) %>%
  mutate(rel_count = map(data, ~ add_rel_count(.x))) %>%
  select(-data) %>%
  unnest(cols = rel_count) %>%
  nest(data = c(site, aa, count, k, rel_count)) %>%
  mutate(ln_count = map(data, ~ add_ln_count(.x))) %>%
  select(-data) %>%
  unnest(cols = ln_count) %>%
  left_join(observed_fits, by = c("protein", "site")) %>%
  mutate(est_dist = slope*exp(k*slope))

# rescale to compare w raw count
# Within each site, adds `est_rel` (est_dist normalised to sum to 1) and
# `est_count` (est_rel scaled by the total count at the site); the returned
# data frame is grouped by `site`
rescale_est_dist <- function(protein_df) {

  grouped_by_site <- group_by(protein_df, site)
  mutate(
    grouped_by_site,
    est_rel = est_dist/sum(est_dist),
    est_count = sum(count)*est_rel
  )

}

# add the rescaled estimates (est_rel, est_count) for every protein
ordered_count <- ordered_count %>%
  nest(
    data = c(site, aa, count, k, rel_count, ln_count, slope, est_dist)
  ) %>%
  mutate(rescaled = map(data, ~ rescale_est_dist(.x))) %>%
  select(-data) %>%
  unnest(cols = rescaled)


# # number of observed amino acids at each site in the alignment
# observed_aa %>% 
#   group_by(site) %>%
#   mutate(obs_count = sum(count)) %>% 
#   select(site, obs_count) %>% distinct() -> align_count_bysite
# 
# align_count_bysite %>%
#   ggplot(aes(x = site, y = obs_count)) +
#   geom_point() +
#   ylab("observed count") +
#   theme(text = element_text(size = 35),
#         axis.text = element_text(size = 35)) 
# ggsave("gaps.jpg")

```


Chi-squared test to compare distributions
```{r, message=FALSE}
# use raw count
# Chi-squared comparison of observed (`count`) and estimated (`est_count`)
# counts for every site of one protein.
#   protein_df - per-amino-acid rows for one protein, including `site`
#   dof        - degrees of freedom passed to pchisq(); defaults to 18
#                (presumably 20 amino acids minus 2 -- confirm)
# Returns one row per site with the statistic `chisq` (numeric), its upper
# tail `p_value`, and `p.adjusted`, the FDR-adjusted p-value computed across
# the sites of this protein.
chi_square_protein <- function(protein_df, dof = 18) {

  protein_df %>%
    nest(data = c(aa, count, k, rel_count, ln_count, slope,
                  est_dist, est_rel, est_count)) %>%
    ungroup() %>%
    mutate(
      # map_dbl() keeps the statistic as a plain numeric column rather than
      # a list column
      chisq = map_dbl(
        data,
        ~ sum(((.x$count - .x$est_count)^2)/.x$est_count)
      ),
      p_value = pchisq(chisq, dof, lower.tail = FALSE)
    ) %>%
    select(-data) %>%
    mutate(p.adjusted = p.adjust(p_value, method = "fdr"))

}

# run the chi-squared comparison for every protein
chisq_results <- ordered_count %>%
  nest(
    data = c(site, aa, count, k, rel_count, ln_count, slope,
             est_dist, est_rel, est_count)
  ) %>%
  mutate(chisq_results = map(data, ~ chi_square_protein(.x))) %>%
  select(-data) %>%
  unnest(cols = chisq_results)

# REMOVE conserved sites with a single aa (their p_value is not a number),
# then label each remaining site "fail" when the adjusted p-value is below
# 0.05 and "pass" otherwise
# is.na() is TRUE for both NaN and NA, so no string comparison is needed
chisq_results <- chisq_results %>%
  filter(!is.na(p_value)) %>%
  mutate(result = if_else(p.adjusted < 0.05, "fail", "pass"))

# fraction of the remaining sites that fail the test
mean(chisq_results$result == "fail")


```


Effective number of amino acids 
```{r, message=FALSE}
# eff aa for actual distribution
# Adds `frequency`, each count divided by the total count at the same site;
# the returned data frame is grouped by `site`
add_site_aa_freq <- function(protein_df) {

  grouped_by_site <- group_by(protein_df, site)
  mutate(grouped_by_site, frequency = (count)/sum(count))

}

# per-site frequencies of the observed amino acids, plus the f*ln(f) term
# that is summed into the entropy later on
site_aa_freq <- observed_aa %>%
  nest(data = c(site, aa, count, k, rel_count, ln_count)) %>%
  mutate(aa_freq = map(data, ~ add_site_aa_freq(.x))) %>%
  select(-data) %>%
  unnest(cols = aa_freq) %>%
  mutate(flnf = frequency*log(frequency))

# Summarises one protein to one row per site:
#   entropy - minus the sum of `flnf` at the site
#   eff_aa  - exp(entropy)
#   n       - number of distinct values of `aa` at the site
add_eff_aa <- function(protein_df) {

  grouped_by_site <- group_by(protein_df, site)
  summarize(
    grouped_by_site,
    entropy = -sum(flnf),
    eff_aa = exp(entropy),
    n = n_distinct(aa)
  )

}

# entropy and effective number of amino acids for every protein site
eff_aa_all <- site_aa_freq %>%
  nest(
    data = c(site, aa, count, k, rel_count, ln_count, frequency, flnf)
  ) %>%
  mutate(eff_aa = map(data, ~ add_eff_aa(.x))) %>%
  select(-data) %>%
  unnest(cols = eff_aa)


```


Relationship between $\lambda$ and effective number of amino acids
```{r, message=FALSE}

# one row per protein site: fitted slope, effective number of amino acids,
# and the pass/fail label from the chi-squared comparison
gam_v_effaa <- observed_fits %>%
  left_join(eff_aa_all) %>%
  select(protein, site, slope, eff_aa) %>%
  left_join(chisq_results) %>%
  select(protein, site, slope, eff_aa, result)

# Effective number of categories exp(entropy) of the distribution over
# k = 0, ..., 19 whose probabilities are proportional to exp(-k/lambda_inv).
#   lambda_inv - a single number, the inverse of the rate parameter
# Returns a single number; 1 for lambda_inv == 0 and NA for a missing input.
neff <- function(lambda_inv) {
  if (is.na(lambda_inv)) {
    return(NA_real_)
  }
  # limiting case: the whole probability mass sits on one category
  # (1/0 would otherwise give Inf * 0 = NaN below)
  if (lambda_inv == 0) {
    return(1)
  }
  lambda <- 1/lambda_inv
  # work with log-weights shifted by their maximum so exp() cannot overflow;
  # the constant prefactor lambda cancels in the normalisation
  log_weight <- -lambda*(0:19)
  weight <- exp(log_weight - max(log_weight))
  prob <- weight/sum(weight)
  # terms that underflowed to zero contribute nothing to the entropy
  prob <- prob[prob > 0]
  exp(-sum(prob*log(prob)))
}

# theoretical curve: neff() evaluated on a grid of lambda_inv from 0 to -10
# in steps of 0.1
df_theory <- data.frame(lambda_inv = (0:-100)/10)
df_theory$ne <- vapply(df_theory$lambda_inv, neff, numeric(1))

```


Supplementary Figure 1
```{r, message=FALSE}
# shorten the protein identifier to the text before its last underscore (used
# as facet label) and drop the sites that have no chi-squared result
gam_v_effaa <- gam_v_effaa %>%
  extract(protein, "protein", "(.*)_", remove = TRUE) %>%
  filter(!is.na(result)) # remove sites conserved for one aa

# points: fitted -1/slope against the effective number of amino acids, one
# panel per protein; black line: the curve stored in df_theory
figs1 <- ggplot(gam_v_effaa,
       aes(x = eff_aa,
           y = -1/slope)) +
  geom_point(aes(color = result)) +
  geom_line(
    data = df_theory,
    aes(ne, -1*lambda_inv),
    inherit.aes = FALSE,
    size = 1
  ) +
  facet_wrap(vars(protein)) +
  labs(
    x = "Effective number of amino acids *n*<sub>eff</sub>",
    #y = expression(~lambda^-1)
    y = "Fitting parameter &lambda;<sup>-1</sup>",
    color = "Result"
  ) +
  scale_color_manual(
    # name the colours so "pass" and "fail" keep their colour even when only
    # one of the two outcomes occurs in the data
    values = c(fail = "#E69F00", pass = "#0072B2")
  ) +
  scale_y_continuous(limits = c(-0.5, 10), expand = c(0, 0)) +
  theme_cowplot(12, rel_small = 1) +
  panel_border() +
  background_grid() +
  theme(
    axis.title.x = element_markdown(),
    axis.title.y = element_markdown(),
    strip.background = element_rect(fill = "gray90"),
    legend.position = c(.9, .1),
    legend.justification = c(1, 0),
    legend.direction = "horizontal"
  )

figs1
save_plot("figures1.png", figs1, ncol = 1, nrow = 1, base_height = 5,
          base_asp = 1.618, base_width = NULL)

```