"""
USAGE:
cd path/to/SparseSC
python example-code.py
"""
import os
import sys
#sys.path.append(os.path.join(os.getcwd(),"SparseSC"))
import time
import SparseSC as SC
from SparseSC.tests import ge_dgp
import numpy as np
import random
# def setup(N0,N1,T,K,g,gs,bs ): # controls (N0), treated units (N1) , time periods (T), Predictors (K), groups (g), g-scale, b-scale
if __name__ == "__main__":
# ------------------------------------------------------------
# ------------------------------------------------------------
# SECTION 1: GENERATE SOME TOY DATA
# ------------------------------------------------------------
# ------------------------------------------------------------
# CONTROL PARAMETERS
random.seed(12345)
np.random.seed(10101)
# Controls (per group), Treated (per group), pre-intervention time periods, post-intervention time periods
N0,N1,T0,T1 = 5,5,4,4
# Causal Covariates, Confounders , Random Covariates
K,S,R = 7,4,5
# Number of Groups, Scale of the Group Effect, (groups make the time series correlated for reasons other than the observed covariates...)
groups,group_scale = 30,2
# Scale of the causal covariates' effects and of the confounders' effects
beta_scale,confounders_scale = 4,1
X_control, X_treated, Y_pre_control, Y_pre_treated, \
Y_post_control, Y_post_treated = ge_dgp(N0,N1,T0,T1,K,S,R,groups,group_scale,beta_scale,confounders_scale,model= "full")
# JOIN THE TREAT AND CONTROL DATA
X = np.vstack( (X_control, X_treated,) )
Y_pre = np.vstack( (Y_pre_control, Y_pre_treated, ) )
Y_post = np.vstack( (Y_post_control, Y_post_treated,) )
# in the leave-one-out scenario, the pre-treatment outcomes will be part of the covariates
X_and_Y_pre = np.hstack( ( X, Y_pre,) )
X_and_Y_pre_control = np.hstack( ( X_control, Y_pre_control,) )
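# Purely informational sanity check (hedged addition, not in the original script):
# print the shapes of the stacked matrices built above. This assumes ge_dgp returns
# one row per unit (controls stacked above treated) with T0 pre-period and T1
# post-period outcome columns.
print("X:", X.shape, "Y_pre:", Y_pre.shape, "Y_post:", Y_post.shape)
print("X_and_Y_pre:", X_and_Y_pre.shape, "X_and_Y_pre_control:", X_and_Y_pre_control.shape)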
# IDENTIFIERS FOR TREAT AND CONTROL UNITS
# control_units = np.arange( N0 * groups )
# treated_units = np.arange( N1 * groups ) + N0
# ------------------------------------------------------------
# ------------------------------------------------------------
# find default penalties
# ------------------------------------------------------------
# ------------------------------------------------------------
# get starting point for the L2 penalty
w_pen_start_ct = SC.w_pen_guestimate(X_control)
w_pen_start_loo = SC.w_pen_guestimate(X_and_Y_pre_control)
# get the maximum value for the L1 Penalty parameter conditional on the guestimate for the L2 penalty
L1_max_ct = SC.get_max_v_pen(X_control,Y_pre_control,X_treat=X_treated,Y_treat=Y_pre_treated)
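# Informational only (hedged addition): report the starting w_pen guesstimates and
# the maximum v_pen for the treat/control setup computed above.
print("w_pen start (C/T): %s, w_pen start (LOO): %s" % (w_pen_start_ct, w_pen_start_loo))
print("Max v_pen (C/T): %s" % L1_max_ct)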
if False:
L1_max_loo = SC.get_max_v_pen(X_and_Y_pre_control[np.arange(100)],Y_post[np.arange(100)])
print("Max L1 loo %s " % L1_max_loo)
else:
L1_max_loo = float(147975295.9121998)
if False:
"Demonstrate relations between the L1 and L2 penalties"
# get the maximum value for the L1 Penalty parameter conditional on several L2 penalty parameter values
L2_grid = (2.** np.arange(-1,2))
L1_max_loo_grid = SC.get_max_v_pen(X_and_Y_pre,
Y_post,
w_pen = w_pen_start_loo * L2_grid)
L1_max_ct_grid = SC.get_max_v_pen(X_control,
Y_pre_control,
X_treat=X_treated,
Y_treat=Y_pre_treated,
w_pen = w_pen_start_ct * L2_grid)
assert ( L1_max_loo / L1_max_loo_grid == 1/L2_grid).all()
assert ( L1_max_ct / L1_max_ct_grid == 1/L2_grid).all()
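# The asserts above document the relation being demonstrated: when the w_pen passed
# to get_max_v_pen is multiplied by a factor (L2_grid), the resulting maximum v_pen
# is multiplied by the same factor, so the two penalties only need to be tuned up to
# that common scale. (Descriptive comment added for readability; derived from the asserts.)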
# ------------------------------------------------------------
# create a grid of penalties to try
# ------------------------------------------------------------
n_points = 10 # number of points in the grid
grid_type = "log-linear" # Usually the optimal penalties are quite Small
if grid_type == "simple":
# An equally spaced linear grid that does not include 0 or 1
grid = ( 1 + np.arange(n_points) ) / ( 1 + n_points )
elif grid_type == "linear":
# Another equally spaced linear grid
fmax = 1e-3 # highest point in the grid relative to the max penalty
fmin = 1e-5 # lowest point in the grid relative to the max penalty
grid = np.linspace(fmin,fmax,n_points)
elif grid_type == "log-linear":
# An equally spaced grid on the log scale
fmax = 1e-2 # highest point in the grid relative to the max penalty
fmin = 1e-4 # lowest point in the grid relative to the max penalty
grid = np.exp(np.linspace(np.log(fmin),np.log(fmax),n_points))
else:
raise ValueError("Unknown grid type: %s" % grid_type)
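# Informational (hedged addition, not in the original script): show the grid that was
# just constructed, expressed as fractions of the maximum penalty.
print("penalty grid (fractions of the max penalty):", grid)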
# ------------------------------------------------------------
# get the Cross Validation Error over a grid of L1 penalty
# parameters using both (a) Treat/Control and (b) the
# leave-one-out controls only methods
# ------------------------------------------------------------
if False:
print("starting grid scoring for treat / control scenario", grid*L1_max_ct)
grid_scores_ct = SC.CV_score(
X = X_control,
Y = Y_pre_control,
X_treat = X_treated,
Y_treat = Y_pre_treated,
# if v_pen is a single value, we get a single score; if it's an array of values, we get an array of scores.
v_pen = grid * L1_max_ct,
w_pen = w_pen_start_ct,
# CACHE THE V MATRIX BETWEEN v_pen PARAMETERS (generally faster, but path dependent)
cache = False, # False by Default
# Run each of the Cross-validation folds in parallel? Often slower
# for large sample sizes because numpy.linalg.solve() already runs
# in parallel for large matrices
parallel=False,
# ANNOUNCE COMPLETION OF EACH ITERATION
progress = True)
best_L1_penalty_ct = (grid * L1_max_ct)[np.argmin(grid_scores_ct)]
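# Informational (hedged addition): report the selected grid point and its
# cross-validation error; assumes grid_scores_ct behaves like an array of scores.
print("best C/T v_pen from the grid: %s (CV error: %s)" % (best_L1_penalty_ct, np.min(grid_scores_ct)))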
if False:
print("Starting grid scoring for Controls Only scenario with 5-fold gradient descent", grid*L1_max_loo)
grid_scores_loo = SC.CV_score(
X = X_and_Y_pre_control,
Y = Y_post_control,
# this is what enables the k-fold gradient descent
grad_splits = 5,
random_state = 10101, # random_state for the splitting during k-fold gradient descent
# L1 Penalty. If v_pen is a single value, we get a single score; if it's an array of values, we get an array of scores.
v_pen = grid * L1_max_loo,
# L2 Penalty (float)
w_pen = w_pen_start_loo,
# CACHE THE V MATRIX BETWEEN v_pen PARAMETERS (generally faster, but path dependent)
#cache = True, # False by Default
# Run each of the Cross-validation folds in parallel? Often slower
# for large sample sizes because numpy.linalg.solve() already runs
# in parallel for large matrices
parallel=False,
# announce each call to `numpy.linalg.solve(A,B)` (the major bottleneck)
verbose = False, # it's kind of obnoxious, but gives a sense of running time per gradient calculation
# ANNOUNCE COMPLETION OF EACH ITERATION
progress = True)
if False:
# even with smaller data, this takes a while.
print("Starting grid scoring for Controls Only scenario with leave-one-out gradient descent", grid*L1_max_loo)
grid_scores_loo = SC.CV_score(
X = X_and_Y_pre_control [np.arange(100)], # limit the amount of time...
Y = Y_post_control [np.arange(100)], # limit the amount of time...
# with `grad_splits = None` (the default behavior) we get leave-one-out gradient descent.
grad_splits = None,
# L1 Penalty. If v_pen is a single value, we get a single score; if it's an array of values, we get an array of scores.
v_pen = grid * L1_max_loo,
# L2 Penalty (float)
w_pen = w_pen_start_loo,
# CACHE THE V MATRIX BETWEEN v_pen PARAMETERS (generally faster, but path dependent)
#cache = True, # False by Default
# Run each of the Cross-validation folds in parallel? Often slower
# for large sample sizes because numpy.linalg.solve() already runs
# in parallel for large matrices
parallel=False,
# announce each call to `numpy.linalg.solve(A,B)` (the major bottleneck)
verbose = False, # it's kind of obnoxious, but gives a sense of running time per gradient calculation
# ANNOUNCE COMPLETION OF EACH ITERATION
progress = True)
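# Hedged addition: select the best LOO v_pen from the grid search above, in the same
# way best_L1_penalty_ct is selected for the treat/control case. The original script
# refers to best_L1_penalty_loo further down without defining it.
best_L1_penalty_loo = (grid * L1_max_loo)[np.argmin(grid_scores_loo)]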
# ---------------------------------------------------------------------------
# Calculate Synthetic Control weights for a fixed pair of penalty parameters
# ---------------------------------------------------------------------------
# This is a two-step process because in principle, we can estimate a
# tensor matrix (which contains the relative weights of the covariates and
# possibly pre-treatment outcomes) in one population, and apply it to
# another population.
best_L1_penalty_ct = float(1908.9329)
# -----------------------------------
# Treat/Control:
# -----------------------------------
V_ct = SC.tensor(X = X_control,
Y = Y_pre_control,
X_treat = X_treated,
Y_treat = Y_pre_treated,
v_pen = best_L1_penalty_ct,
w_pen = w_pen_start_ct)
SC_weights_ct = SC.weights(X = X_control,
X_treat = X_treated,
V = V_ct,
w_pen = w_pen_start_ct)
Y_post_treated_synthetic_controls_ct = SC_weights_ct.dot(Y_post_control)
ct_prediction_error = Y_post_treated_synthetic_controls_ct - Y_post_treated
null_model_error = Y_post_treated - np.mean(Y_pre_treated)
R_squared_post_ct = 1 - np.power(ct_prediction_error,2).sum() / np.power(null_model_error,2).sum()
print( "C/T: Out of Sample post intervention R squared: %0.2f%% " % (100*R_squared_post_ct,))
# -----------------------------------
# Leave-One-Out with Control Only:
# -----------------------------------
if False:
# this takes a while:
V_loo = SC.tensor(
X = X_and_Y_pre [np.arange(100)], # limit the amount of time...
Y = Y_post [np.arange(100)], # limit the amount of time...
v_pen = best_L1_penalty_loo,
w_pen = w_pen_start_loo)
SC_weights_loo = SC.weights(X = X_control,
V = V_loo,
w_pen = w_pen_start_loo)
# in progress...
Y_post_treated_synthetic_controls_loo = SC_weights_loo.dot(Y_post_control)
loo_prediction_error = Y_post_treated_synthetic_controls_loo - Y_post_treated
null_model_error = Y_post_treated - np.mean(Y_pre_treated)
R_squared_post_loo = 1 - np.power(loo_prediction_error,2).sum() / np.power(null_model_error,2).sum()
print( "LOO: Out of Sample post intervention R squared: %0.2f%% " % (100*R_squared_post_loo,))
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
# Optimization of the L1 and L2 parameters together (second order)
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
from scipy.optimize import differential_evolution
# -----------------------------------------------------------------
# Optimization of the L2 and L1 Penalties Simultaneously, keeping their
# product constant. Heuristically, this has been the most efficient means of
# optimizing the L2 Parameter.
# -----------------------------------------------------------------
if False:
# build the objective function to be minimized
# cache for L2_obj_func
n_calls = [0,]
temp_results =[]
SS = np.power(Y_pre_treated - np.mean(Y_pre_treated),2).sum()
best_L1_penalty_ct = float(1908.9329)
def L1_L2_const_obj_func (x):
n_calls[0] += 1
t1 = time.time();
score = SC.CV_score(X = X_control,
Y = Y_pre_control,
X_treat = X_treated,
Y_treat = Y_pre_treated,
# if v_pen is a single value, we get a single score; if it's an array of values, we get an array of scores.
v_pen = best_L1_penalty_ct * np.exp(x[0]),
w_pen = w_pen_start_ct / np.exp(x[0]),
# suppress the analysis type message
quiet = True)
t2 = time.time();
temp_results.append((n_calls[0],x,score))
print("calls: %s, time: %0.4f, x0: %0.4f, Cross Validation Error: %s, out-of-sample R-Squared: %s" % (n_calls[0], t2 - t1, x[0], score, 1 - score / SS ))
#print("calls: %s, time: %0.4f, x0: %0.4f, x1: %0.4f, Cross Validation Error: %s, R-Squared: %s" % (n_calls[0], t2 - t1, x[0], x[1], score, 1 - score / SS ))
return score
# the actual optimization
print("Starting L1-L2 Penalty (product constant) optimization")
results = differential_evolution(L1_L2_const_obj_func, bounds = ((-6,6,),) )
NEW_best_L1_penalty_ct = best_L1_penalty_ct * np.exp(results.x[0])
best_L2_penalty = w_pen_start_ct / np.exp(results.x[0])
print("DE optimized L1 penalty: %s, DE optimized L2 penalty: %s" % (NEW_best_L1_penalty_ct, best_L2_penalty,) )
# -----------------------------------------------------------------
# Optimization of the L2 Parameter alone
# -----------------------------------------------------------------
if False:
# -----------------------
# build the objective function to be minimized
# -----------------------
# OBJECTIVE FUNCTION(S) TO MINIMIZE USING DE
# cache for L2_obj_func
n_calls = [0,]
temp_results =[]
SS = np.power(Y_pre_treated - np.mean(Y_pre_treated),2).sum()
best_L1_penalty_ct = float(1908.9329)
def L2_obj_func (x):
n_calls[0] += 1
t1 = time.time();
score = SC.CV_score(X = X_control,
Y = Y_pre_control,
X_treat = X_treated,
Y_treat = Y_pre_treated,
# if v_pen is a single value, we get a single score; if it's an array of values, we get an array of scores.
v_pen = best_L1_penalty_ct,
w_pen = w_pen_start_ct * np.exp(x[0]),
# suppress the analysis type message
quiet = True)
t2 = time.time();
temp_results.append((n_calls[0],x,score))
print("calls: %s, time: %0.4f, x0: %0.4f, Cross Validation Error: %s, out-of-sample R-Squared: %s" %
(n_calls[0], t2 - t1, x[0], score, 1 - score / SS ))
return score
# the actual optimization
print("Starting L2 Penalty optimization")
results = differential_evolution(L2_obj_func, bounds = ((-6,6,),))
NEW_L2_pen_start_ct = w_pen_start_ct * np.exp(results.x[0])
print("DE optimized L2 Penalty: %s, using fixed L1 penalty: %s" % (NEW_L2_pen_start_ct, best_L1_penalty_ct,) )
# -----------------------------------------------------------------
# Optimization of the L2 and L1 Penalties Simultaneously
# -----------------------------------------------------------------
best_L1_penalty_ct = float(1908.9329)
# the actual optimization
print("Starting L1-L2 Joint Penalty optimization")
results = SC.joint_penalty_optimzation(X = X_control, Y = Y_pre_control, L1_pen_start = best_L1_penalty_ct, L2_pen_start = w_pen_start_ct, bounds = ((-6,6,),)*2, X_treat = X_treated, Y_treat = Y_pre_treated)
NEW_best_L1_penalty_ct = results.x[0]
best_L2_penalty = results.x[1]
print("DE optimized L1 penalty: %s, DE optimized L2 penalty: %s" % (NEW_best_L1_penalty_ct, best_L2_penalty,) )