henrmota
diff --git a/‎.gitignore
+5 b/‎.gitignore
+5
diff --git a/‎README.md
+67 b/‎README.md
+67
diff --git a/‎ab_testing/bayesian_bandit.py
+68 b/‎ab_testing/bayesian_bandit.py
+68
diff --git a/‎ab_testing/chisquare.py
+63 b/‎ab_testing/chisquare.py
+63
diff --git a/‎ab_testing/ci_comparison.py
+37 b/‎ab_testing/ci_comparison.py
+37
diff --git a/‎ab_testing/convergence.py
+34 b/‎ab_testing/convergence.py
+34
diff --git a/‎ab_testing/demo.py
+27 b/‎ab_testing/demo.py
+27
diff --git a/‎ab_testing/ttest.py
+23 b/‎ab_testing/ttest.py
+23
@@ -0,0 +1,5 @@
+*.DS_Store
+*.pyc
+large_files
+large_files/*
+nlp_class2/chunking/*
@@ -0,0 +1,67 @@
+machine_learning_examples
+=========================
+
+A collection of machine learning examples and tutorials.
+
+Find associated tutorials at https://lazyprogrammer.me
+
+Find associated courses at https://deeplearningcourses.com
+
+
+Direct Course Links
+===================
+
+Deep Learning Prerequisites: The Numpy Stack in Python
+https://deeplearningcourses.com/c/deep-learning-prerequisites-the-numpy-stack-in-python
+
+Deep Learning Prerequisites: Linear Regression in Python
+https://deeplearningcourses.com/c/data-science-linear-regression-in-python
+
+Deep Learning Prerequisites: Logistic Regression in Python
+https://deeplearningcourses.com/c/data-science-logistic-regression-in-python
+
+Deep Learning in Python
+https://deeplearningcourses.com/c/data-science-deep-learning-in-python
+
+Cluster Analysis and Unsupervised Machine Learning in Python
+https://deeplearningcourses.com/c/cluster-analysis-unsupervised-machine-learning-python
+
+Data Science: Supervised Machine Learning in Python
+https://deeplearningcourses.com/c/data-science-supervised-machine-learning-in-python
+
+Bayesian Machine Learning in Python: A/B Testing
+https://deeplearningcourses.com/c/bayesian-machine-learning-in-python-ab-testing
+
+Easy Natural Language Processing in Python
+https://deeplearningcourses.com/c/data-science-natural-language-processing-in-python
+
+Practical Deep Learning in Theano and TensorFlow
+https://deeplearningcourses.com/c/data-science-deep-learning-in-theano-tensorflow
+
+Ensemble Machine Learning in Python: Random Forest and AdaBoost
+https://deeplearningcourses.com/c/machine-learning-in-python-random-forest-adaboost
+
+Deep Learning: Convolutional Neural Networks in Python
+https://deeplearningcourses.com/c/deep-learning-convolutional-neural-networks-theano-tensorflow
+
+Unsupervised Deep Learning in Python
+https://deeplearningcourses.com/c/unsupervised-deep-learning-in-python
+
+Unsupervised Machine Learning: Hidden Markov Models in Python
+https://deeplearningcourses.com/c/unsupervised-machine-learning-hidden-markov-models-in-python
+
+Deep Learning: Recurrent Neural Networks in Python
+https://deeplearningcourses.com/c/deep-learning-recurrent-neural-networks-in-python
+
+Advanced Natural Language Processing: Deep Learning in Python
+https://deeplearningcourses.com/c/natural-language-processing-with-deep-learning-in-python
+
+Artificial Intelligence: Reinforcement Learning in Python
+https://deeplearningcourses.com/c/artificial-intelligence-reinforcement-learning-in-python
+
+Advanced AI: Deep Reinforcement Learning in Python
+https://deeplearningcourses.com/c/deep-reinforcement-learning-in-python
+
+Deep Learning: GANs and Variational Autoencoders
+https://deeplearningcourses.com/c/deep-learning-gans-and-variational-autoencoders
+
@@ -0,0 +1,68 @@
+# From the course: Bayesian Machine Learning in Python: A/B Testing
+# https://deeplearningcourses.com/c/bayesian-machine-learning-in-python-ab-testing
+# https://www.udemy.com/bayesian-machine-learning-in-python-ab-testing
+import matplotlib.pyplot as plt
+import numpy as np
+from scipy.stats import beta
+
+
+NUM_TRIALS = 2000
+BANDIT_PROBABILITIES = [0.2, 0.5, 0.75]
+
+
+class Bandit(object):
+  def __init__(self, p):
+    self.p = p
+    self.a = 1
+    self.b = 1
+
+  def pull(self):
+    return np.random.random() < self.p
+
+  def sample(self):
+    return np.random.beta(self.a, self.b)
+
+  def update(self, x):
+    self.a += x
+    self.b += 1 - x
+
+
+def plot(bandits, trial):
+  x = np.linspace(0, 1, 200)
+  for b in bandits:
+    y = beta.pdf(x, b.a, b.b)
+    plt.plot(x, y, label="real p: %.4f" % b.p)
+  plt.title("Bandit distributions after %s trials" % trial)
+  plt.legend()
+  plt.show()
+
+
+def experiment():
+  bandits = [Bandit(p) for p in BANDIT_PROBABILITIES]
+
+  sample_points = [5,10,20,50,100,200,500,1000,1500,1999]
+  for i in xrange(NUM_TRIALS):
+
+    # take a sample from each bandit
+    bestb = None
+    maxsample = -1
+    allsamples = [] # let's collect these just to print for debugging
+    for b in bandits:
+      sample = b.sample()
+      allsamples.append("%.4f" % sample)
+      if sample > maxsample:
+        maxsample = sample
+        bestb = b
+    if i in sample_points:
+      print "current samples: %s" % allsamples
+      plot(bandits, i)
+
+    # pull the arm for the bandit with the largest sample
+    x = bestb.pull()
+
+    # update the distribution for the bandit whose arm we just pulled
+    bestb.update(x)
+
+
+if __name__ == "__main__":
+  experiment()
@@ -0,0 +1,63 @@
+# From the course: Bayesian Machine Learning in Python: A/B Testing
+# https://deeplearningcourses.com/c/bayesian-machine-learning-in-python-ab-testing
+# https://www.udemy.com/bayesian-machine-learning-in-python-ab-testing
+import numpy as np
+import matplotlib.pyplot as plt
+from scipy.stats import chi2, chi2_contingency
+
+# contingency table
+#        click       no click
+#------------------------------
+# ad A |   a            b
+# ad B |   c            d
+#
+# chi^2 = (ad - bc)^2 (a + b + c + d) / [ (a + b)(c + d)(a + c)(b + d)]
+# degrees of freedom = (#cols - 1) x (#rows - 1) = (2 - 1)(2 - 1) = 1
+
+# short example
+
+# T = np.array([[36, 14], [30, 25]])
+# c2 = np.linalg.det(T)**2 * T.sum() / ( T[0].sum()*T[1].sum()*T[:,0].sum()*T[:,1].sum() )
+# p_value = 1 - chi2.cdf(x=c2, df=1)
+
+# equivalent:
+# (36-31.429)**2/31.429+(14-18.571)**2/18.571 + (30-34.571)**2/34.571 + (25-20.429)**2/20.429
+
+
+class DataGenerator:
+  def __init__(self, p1, p2):
+    self.p1 = p1
+    self.p2 = p2
+
+  def next(self):
+    click1 = 1 if (np.random.random() < self.p1) else 0
+    click2 = 1 if (np.random.random() < self.p2) else 0
+    return click1, click2
+
+
+def get_p_value(T):
+  # same as scipy.stats.chi2_contingency(T, correction=False)
+  det = T[0,0]*T[1,1] - T[0,1]*T[1,0]
+  c2 = float(det) / T[0].sum() * det / T[1].sum() * T.sum() / T[:,0].sum() / T[:,1].sum()
+  p = 1 - chi2.cdf(x=c2, df=1)
+  return p
+
+
+def run_experiment(p1, p2, N):
+  data = DataGenerator(p1, p2)
+  p_values = np.empty(N)
+  T = np.zeros((2, 2)).astype(np.float32)
+  for i in xrange(N):
+    c1, c2 = data.next()
+    T[0,c1] += 1
+    T[1,c2] += 1
+    # ignore the first 10 values
+    if i < 10:
+      p_values[i] = None
+    else:
+      p_values[i] = get_p_value(T)
+  plt.plot(p_values)
+  plt.plot(np.ones(N)*0.05)
+  plt.show()
+
+run_experiment(0.1, 0.11, 20000)
@@ -0,0 +1,37 @@
+# From the course: Bayesian Machine Learning in Python: A/B Testing
+# https://deeplearningcourses.com/c/bayesian-machine-learning-in-python-ab-testing
+# https://www.udemy.com/bayesian-machine-learning-in-python-ab-testing
+import matplotlib.pyplot as plt
+import numpy as np
+from scipy.stats import beta, norm
+
+T = 501 # number of coin tosses
+true_ctr = 0.5
+a, b = 1, 1 # beta priors
+plot_indices = (10, 20, 30, 50, 100, 200, 500)
+data = np.empty(T)
+for i in xrange(T):
+  x = 1 if np.random.random() < true_ctr else 0
+  data[i] = x
+
+  # update a and b
+  a += x
+  b += 1 - x
+
+  if i in plot_indices:
+    # maximum likelihood estimate of ctr
+    p = data[:i].mean()
+    n = i + 1 # number of samples collected so far
+    std = np.sqrt(p*(1-p)/n)
+    
+    # gaussian
+    x = np.linspace(0, 1, 200)
+    g = norm.pdf(x, loc=p, scale=std)
+    plt.plot(x, g, label='Gaussian Approximation')
+
+    # beta
+    posterior = beta.pdf(x, a=a, b=b)
+    plt.plot(x, posterior, label='Beta Posterior')
+    plt.legend()
+    plt.title("N = %s" % n)
+    plt.show()
@@ -0,0 +1,34 @@
+# From the course: Bayesian Machine Learning in Python: A/B Testing
+# https://deeplearningcourses.com/c/bayesian-machine-learning-in-python-ab-testing
+# https://www.udemy.com/bayesian-machine-learning-in-python-ab-testing
+import matplotlib.pyplot as plt
+import numpy as np
+from bayesian_bandit import Bandit
+
+
+def run_experiment(p1, p2, p3, N):
+  bandits = [Bandit(p1), Bandit(p2), Bandit(p3)]
+
+  data = np.empty(N)
+  
+  for i in xrange(N):
+    # thompson sampling
+    j = np.argmax([b.sample() for b in bandits])
+    x = bandits[j].pull()
+    bandits[j].update(x)
+
+    # for the plot
+    data[i] = x
+  cumulative_average_ctr = np.cumsum(data) / (np.arange(N) + 1)
+
+  # plot moving average ctr
+  plt.plot(cumulative_average_ctr)
+  plt.plot(np.ones(N)*p1)
+  plt.plot(np.ones(N)*p2)
+  plt.plot(np.ones(N)*p3)
+  plt.ylim((0,1))
+  plt.xscale('log')
+  plt.show()
+
+
+run_experiment(0.2, 0.25, 0.3, 100000)
@@ -0,0 +1,27 @@
+# From the course: Bayesian Machine Learning in Python: A/B Testing
+# https://deeplearningcourses.com/c/bayesian-machine-learning-in-python-ab-testing
+# https://www.udemy.com/bayesian-machine-learning-in-python-ab-testing
+import numpy as np
+import matplotlib.pyplot as plt
+from scipy.stats import beta
+
+def plot(a, b, trial, ctr):
+  x = np.linspace(0, 1, 200)
+  y = beta.pdf(x, a, b)
+  mean = float(a) / (a + b)
+  plt.plot(x, y)
+  plt.title("Distributions after %s trials, true rate = %.1f, mean = %.2f" % (trial, ctr, mean))
+  plt.show()
+
+true_ctr = 0.3
+a, b = 1, 1 # beta parameters
+show = [0, 5, 10, 25, 50, 100, 200, 300, 500, 700, 1000, 1500]
+for t in xrange(1501):
+  coin_toss_result = (np.random.random() < true_ctr)
+  if coin_toss_result:
+    a += 1
+  else:
+    b += 1
+
+  if t in show:
+    plot(a, b, t+1, true_ctr)
@@ -0,0 +1,23 @@
+# From the course: Bayesian Machine Learning in Python: A/B Testing
+# https://deeplearningcourses.com/c/bayesian-machine-learning-in-python-ab-testing
+# https://www.udemy.com/bayesian-machine-learning-in-python-ab-testing
+import numpy as np
+from scipy import stats
+
+# generate data
+N = 10
+a = np.random.randn(N) + 2 # mean 2, variance 1
+b = np.random.randn(N) # mean 0, variance 1
+
+# roll your own t-test:
+var_a = a.var(ddof=1) # unbiased estimator, divide by N-1 instead of N
+var_b = b.var(ddof=1)
+s = np.sqrt( (var_a + var_b) / 2 ) # balanced standard deviation
+t = (a.mean() - b.mean()) / (s * np.sqrt(2.0/N)) # t-statistic
+df = 2*N - 2 # degrees of freedom
+p = 1 - stats.t.cdf(np.abs(t), df=df) # one-sided test p-value
+print "t:\t", t, "p:\t", 2*p # two-sided test p-value
+
+# built-in t-test:
+t2, p2 = stats.ttest_ind(a, b)
+print "t2:\t", t2, "p2:\t", p2