-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsrc.py
102 lines (83 loc) · 4.7 KB
/
src.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage
import numpy as np
from matplotlib.patches import Patch
from scipy.stats import shapiro
from scipy.stats import ttest_ind, mannwhitneyu
import itertools
# Function to add significance brackets
def add_significance_bracket(ax, x1, x2, y, h, text, color='black'):
ax.plot([x1, x1, x2, x2], [y, y + h, y + h, y], lw=1.5, c=color)
ax.text((x1 + x2) * .5, y + h, text, ha='center', va='bottom', color=color)
def earth_movers_distance(u, v):
"""
If I equip the probability simplex with the ground metric of the L1 norm,
it turns the EMD into a simple L1 norm. So that's how we'll do it.
Parameters:
- vec1: A list or numpy array representing the first distribution.
- vec2: A list or numpy array representing the second distribution.
Returns:
- The Earth Mover's Distance between vec1 and vec2.
"""
# compute L1 norm between u and v
emd = np.linalg.norm(np.array(u) - np.array(v), ord=1)
return emd
def start_box_plot(raw_distances, average_distances, pretty_name):
# Creating the box plots
plt.figure(figsize=(12, 8))
# Improved color scheme
colors = ['orange', 'teal']
# Box plot positions
n_time_points = len(raw_distances['Time'])
positions_env_1_mix = [x * 2 for x in range(n_time_points)] # Positions for env_1_mix box plots
positions_env_2_mix = [x * 2 + 1 for x in range(n_time_points)] # Positions for env_2_mix box plots
# Create box plots
bp_env_1_mix = plt.boxplot(raw_distances['env_1_mix'], positions=positions_env_1_mix, widths=0.6, patch_artist=True,
boxprops=dict(facecolor=colors[0]))
bp_env_2_mix = plt.boxplot(raw_distances['env_2_mix'], positions=positions_env_2_mix, widths=0.6, patch_artist=True,
boxprops=dict(facecolor=colors[1]))
# Add lines connecting averages
plt.plot(positions_env_1_mix, average_distances['env_1_mix'], color=colors[0], marker='o', label='Average Env 1 vs '
'mix')
plt.plot(positions_env_2_mix, average_distances['env_2_mix'], color=colors[1], marker='o', label='Average Env 2 vs '
'mix')
# Adding titles and labels
plt.title(f'Distribution of {pretty_name} Distances Over Time')
plt.xlabel('Time Points')
plt.ylabel('Distance')
plt.xticks([r * 2 + 0.5 for r in range(n_time_points)],
raw_distances['Time']) # Set x-ticks to be between the pairs of box plots
# Adding a legend
legend_elements = [Patch(facecolor=colors[0], label='Env 1 vs mix'),
Patch(facecolor=colors[1], label='Env 2 vs mix')]
plt.legend(handles=legend_elements, title='Sample Type Pairs')
return positions_env_1_mix, positions_env_2_mix
def add_stats_to_plot(raw_distances, positions_env_1_mix, positions_env_2_mix):
# Add significance brackets
p_values = []
for i, time_point in enumerate(raw_distances['Time']):
dist_env_1_mix = raw_distances['env_1_mix'][i]
dist_env_2_mix = raw_distances['env_2_mix'][i]
# Perform t-test on the two independent samples
#t_stat, p_val = ttest_ind(dist_env_1_mix, dist_env_2_mix)
t_stat, p_val = mannwhitneyu(dist_env_1_mix, dist_env_2_mix)
p_values.append(p_val)
# Adjust y value for significance bracket based on your plot's max value
#y_max = np.max([np.max(raw_distances['env_1_mix']), np.max(raw_distances['env_2_mix'])])
y_max = max(itertools.chain.from_iterable(raw_distances['env_1_mix'] + raw_distances['env_2_mix']))
y_value_for_bracket = y_max * 1.1 # for example, 10% above the max value
# Plot significance on the existing boxplot
for i, p_val in enumerate(p_values):
if p_val < 0.05: # Choose your significance level here
star_text = (f'**** ({p_val:.3g})' if p_val < 0.0001 else # four stars for p < 0.0001
f'*** ({p_val:.3g})' if p_val < 0.001 else # three stars for p < 0.001
f'** ({p_val:.3g})' if p_val < 0.01 else # two stars for p < 0.01
f'* ({p_val:.3g})' if p_val < 0.05 else # single star for p < 0.05
'') # no star if p is not significant
add_significance_bracket(plt.gca(), positions_env_1_mix[i], positions_env_2_mix[i], y_value_for_bracket,
0.05 * y_max,
star_text)