From 09d3e78570abb89258c7b5c33f6b10e212890b90 Mon Sep 17 00:00:00 2001
From: Ayush Kanodia
Date: Fri, 1 Dec 2023 18:27:44 -0800
Subject: [PATCH] first cut of the gamma coefficient: working, but results are poor

---
 bemb/model/bayesian_coefficient.py | 116 ++++++++++++++++++++++++++---
 bemb/model/bemb.py                 |  30 +++++++-
 tutorials/supermarket/configs.yaml |  26 ++++---
 tutorials/supermarket/main.py      |   7 ++
 4 files changed, 158 insertions(+), 21 deletions(-)

diff --git a/bemb/model/bayesian_coefficient.py b/bemb/model/bayesian_coefficient.py
index 790c4fe..7763764 100644
--- a/bemb/model/bayesian_coefficient.py
+++ b/bemb/model/bayesian_coefficient.py
@@ -9,6 +9,7 @@
 import torch
 import torch.nn as nn
 from torch.distributions.lowrank_multivariate_normal import LowRankMultivariateNormal
+from torch.distributions.gamma import Gamma
 
 
 class BayesianCoefficient(nn.Module):
@@ -21,7 +22,8 @@ def __init__(self,
                  num_obs: Optional[int] = None,
                  dim: int = 1,
                  prior_mean: float = 0.0,
-                 prior_variance: Union[float, torch.Tensor] = 1.0
+                 prior_variance: Union[float, torch.Tensor] = 1.0,
+                 distribution: str = 'gaussian'
                  ) -> None:
         """The Bayesian coefficient object represents a learnable tensor mu_i in R^k, where i is from a family
         (e.g., user, item), so there are num_classes * dim learnable weights in total.
@@ -63,12 +65,27 @@ def __init__(self,
                 If a tensor with shape (num_classes, dim) is supplied, this amounts to specifying a different
                 prior variance for each entry in the coefficient. Defaults to 1.0.
+            distribution (str, optional): the distribution of the coefficient; currently 'gaussian' and 'gamma'
+                are supported. Gamma coefficients do not support obs2prior. Defaults to 'gaussian'.
         """
         super(BayesianCoefficient, self).__init__()
         # do we use this at all? TODO: drop self.variation.
         assert variation in ['item', 'user', 'constant', 'category']
 
         self.variation = variation
+
+        assert distribution in ['gaussian', 'gamma'], f'Unsupported distribution {distribution}'
+        if distribution == 'gamma':
+            assert not obs2prior, 'obs2prior is not supported for gamma coefficients at present.'
+            # Hard-coded prior: a gamma with mean 1 and variance 10. log_prior()
+            # converts the (mean, variance) pair to (concentration, rate)
+            # downstream, so store the mean and variance directly here rather
+            # than pre-converting them.
+            mean = 1.0
+            variance = 10.0
+            assert mean > 0, 'Gamma distribution requires mean > 0'
+            assert variance > 0, 'Gamma distribution requires variance > 0'
+            prior_mean = mean
+            prior_variance = variance
+
+        self.distribution = distribution
 
         self.obs2prior = obs2prior
         if variation == 'constant' or variation == 'category':
             if obs2prior:
@@ -95,7 +112,8 @@ def __init__(self,
                                          dim=num_obs,
                                          prior_variance=1.0,
                                          H_zero_mask=self.H_zero_mask,
-                                         is_H=True)  # this is a distribution responsible for the obs2prior H term.
+                                         is_H=True,
+                                         distribution=self.distribution)  # this is a distribution responsible for the obs2prior H term.
 
         else:
             self.register_buffer(
@@ -117,6 +135,14 @@ def __init__(self,
         self.variational_mean_flexible = nn.Parameter(
             torch.randn(num_classes, dim), requires_grad=True)
 
+        # TODO(kanodiaayush): initialize the gamma variational mean in a more principled way.
+        # Disabled for now: start gamma coefficients from a positive location
+        # by taking the absolute value of the raw variational mean.
+        # if self.distribution == 'gamma':
+        #     self.variational_mean_flexible.data = torch.abs(
+        #         self.variational_mean_flexible.data)
+
         if self.is_H and self.H_zero_mask is not None:
             assert self.H_zero_mask.shape == self.variational_mean_flexible.shape, \
                 f"The H_zero_mask should have exactly the same shape as the H variable, `H_zero_mask`.shape is {self.H_zero_mask.shape}, `H`.shape is {self.variational_mean_flexible.shape}"
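As a sanity check on the parameterization (an illustrative snippet, not part of the patch): with the stored prior mean 1 and variance 10, the conversion applied later in `log_prior()` recovers exactly those moments.

```python
import torch
from torch.distributions.gamma import Gamma

# Gamma(concentration=k, rate=r) has mean k / r and variance k / r**2,
# so k = mean**2 / variance and r = mean / variance.
mean, variance = 1.0, 10.0
concentration = mean ** 2 / variance  # 0.1
rate = mean / variance                # 0.1
prior = Gamma(concentration=torch.tensor(concentration),
              rate=torch.tensor(rate))
print(prior.mean)      # tensor(1.)
print(prior.variance)  # tensor(10.)
```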
@@ -163,6 +189,9 @@ def variational_mean(self) -> torch.Tensor:
         else:
             M = self.variational_mean_fixed + self.variational_mean_flexible
 
+        if self.distribution == 'gamma':
+            # Square the unconstrained parameter (plus a small epsilon) so the
+            # mean of a gamma coefficient is strictly positive.
+            M = torch.pow(M, 2) + 1e-6
+
         if self.is_H and (self.H_zero_mask is not None):
             # a H-variable with zero-entry restriction.
             # multiply zeros to entries with H_zero_mask[i, j] = 1.
@@ -211,9 +244,46 @@ def log_prior(self,
         else:
             mu = self.prior_zero_mean
-        out = LowRankMultivariateNormal(loc=mu,
-                                        cov_factor=self.prior_cov_factor,
-                                        cov_diag=self.prior_cov_diag).log_prob(sample)
+
+        if self.distribution == 'gaussian':
+            out = LowRankMultivariateNormal(loc=mu,
+                                            cov_factor=self.prior_cov_factor,
+                                            cov_diag=self.prior_cov_diag).log_prob(sample)
+        elif self.distribution == 'gamma':
+            # Convert the prior (mean, variance) to the (concentration, rate)
+            # parameterization: concentration = mean**2 / var, rate = mean / var.
+            concentration = torch.pow(mu, 2) / self.prior_cov_diag
+            rate = mu / self.prior_cov_diag
+            out = Gamma(concentration=concentration,
+                        rate=rate).log_prob(sample)
+            # Gamma.log_prob is element-wise over the last dim; gamma
+            # coefficients currently have dim=1, so keep that single entry.
+            out = out[:, :, 0]
         assert out.shape == (num_seeds, num_classes)
         return out
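The two transforms above can be exercised in isolation (arbitrary values, not part of the patch): squaring keeps the gamma mean positive regardless of the raw parameter, and the (mean, std) pair maps to (concentration, rate).

```python
import torch
from torch.distributions.gamma import Gamma

# The unconstrained parameter may be negative; squaring (plus a small
# epsilon) keeps the gamma mean strictly positive.
raw_mean = torch.randn(5, 1)
mean = torch.pow(raw_mean, 2) + 1e-6

# Convert (mean, std) to the (concentration, rate) parameterization.
logstd = torch.zeros(5, 1)              # std = 1, so variance = 1
variance = torch.exp(logstd) ** 2
q = Gamma(concentration=mean ** 2 / variance, rate=mean / variance)

# rsample keeps the sample differentiable with respect to `raw_mean`.
sample = q.rsample(torch.Size([3]))     # shape (3, 5, 1)
assert (sample > 0).all()
```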
""" - return LowRankMultivariateNormal(loc=self.variational_mean, - cov_factor=self.variational_cov_factor, - cov_diag=torch.exp(self.variational_logstd)) + if self.distribution == 'gaussian': + return LowRankMultivariateNormal(loc=self.variational_mean, + cov_factor=self.variational_cov_factor, + cov_diag=torch.exp(self.variational_logstd)) + elif self.distribution == 'gamma': + # concentration is mean**2 / var (std**2) + concentration = torch.pow(self.variational_mean, 2)/torch.pow(torch.exp(self.variational_logstd), 2) + # rate is mean / var (std**2) + rate = self.variational_mean/torch.pow(torch.exp(self.variational_logstd), 2) + # DEBUG_MARKER + ''' + print("self.variational_mean, self.variational_logstd") + print(self.variational_mean, self.variational_logstd) + print("concentration, rate") + print(concentration, rate) + ''' + # DEBUG_MARKER + return Gamma(concentration=concentration, rate=rate) + else: + raise NotImplementedError("Unknown variational distribution type.") @property def device(self) -> torch.device: diff --git a/bemb/model/bemb.py b/bemb/model/bemb.py index be704db..bcf0cc4 100644 --- a/bemb/model/bemb.py +++ b/bemb/model/bemb.py @@ -113,6 +113,7 @@ def __init__(self, num_items: int, pred_item: bool, num_classes: int = 2, + coef_dist_dict: Dict[str, str] = {'default' : 'gaussian'}, H_zero_mask_dict: Optional[Dict[str, torch.BoolTensor]] = None, prior_mean: Union[float, Dict[str, float]] = 0.0, prior_variance: Union[float, Dict[str, float]] = 1.0, @@ -140,6 +141,13 @@ def __init__(self, lambda_item + theta_user * alpha_item + gamma_user * beta_item * price_obs See the doc-string of parse_utility for an example. + coef_dist_dict (Dict[str, str]): a dictionary mapping coefficient name to coefficient distribution name. + The coefficient distribution name can be one of the following: + 1. 'gaussian' + 2. 'gamma' - obs2prior is not supported for gamma coefficients + If a coefficient does not appear in the dictionary, it will be assigned the distribution specified + by the 'default' key. By default, the default distribution is 'gaussian'. + obs2prior_dict (Dict[str, bool]): a dictionary maps coefficient name (e.g., 'lambda_item') to a boolean indicating if observable (e.g., item_obs) enters the prior of the coefficient. @@ -233,6 +241,7 @@ def __init__(self, self.utility_formula = utility_formula self.obs2prior_dict = obs2prior_dict self.coef_dim_dict = coef_dim_dict + self.coef_dist_dict = coef_dist_dict if H_zero_mask_dict is not None: self.H_zero_mask_dict = H_zero_mask_dict else: @@ -348,6 +357,13 @@ def __init__(self, warnings.warn(f"You provided a dictionary of prior variance, but coefficient {coef_name} is not a key in it. Supply a value for 'default' in the prior_variance dictionary to use that as default value (e.g., prior_variance['default'] = 0.3); now using variance=1.0 since this is not supplied.") self.prior_variance[coef_name] = 1.0 + if coef_name not in self.coef_dist_dict.keys(): + if 'default' in self.coef_dist_dict.keys(): + self.coef_dist_dict[coef_name] = self.coef_dist_dict['default'] + else: + warnings.warn(f"You provided a dictionary of coef_dist_dict, but coefficient {coef_name} is not a key in it. 
@@ -233,6 +241,7 @@ def __init__(self,
         self.utility_formula = utility_formula
         self.obs2prior_dict = obs2prior_dict
         self.coef_dim_dict = coef_dim_dict
+        # Copy so the fallback logic below never mutates a caller-supplied dict.
+        self.coef_dist_dict = dict(coef_dist_dict) if coef_dist_dict is not None else {'default': 'gaussian'}
         if H_zero_mask_dict is not None:
             self.H_zero_mask_dict = H_zero_mask_dict
         else:
@@ -348,6 +357,13 @@ def __init__(self,
                 warnings.warn(f"You provided a dictionary of prior variance, but coefficient {coef_name} is not a key in it. Supply a value for 'default' in the prior_variance dictionary to use that as default value (e.g., prior_variance['default'] = 0.3); now using variance=1.0 since this is not supplied.")
                 self.prior_variance[coef_name] = 1.0
 
+            if coef_name not in self.coef_dist_dict.keys():
+                if 'default' in self.coef_dist_dict.keys():
+                    self.coef_dist_dict[coef_name] = self.coef_dist_dict['default']
+                else:
+                    warnings.warn(f"You provided a coef_dist_dict, but coefficient {coef_name} is not a key in it. Supply a value for 'default' in coef_dist_dict to use that as the default (e.g., coef_dist_dict['default'] = 'gaussian'); now using distribution='gaussian' since this is not supplied.")
+                    self.coef_dist_dict[coef_name] = 'gaussian'
+
             s2 = self.prior_variance[coef_name] if isinstance(
                 self.prior_variance, dict) else self.prior_variance
 
@@ -367,7 +383,8 @@ def __init__(self,
                                                  prior_mean=mean,
                                                  prior_variance=s2,
                                                  H_zero_mask=H_zero_mask,
-                                                 is_H=False)
+                                                 is_H=False,
+                                                 distribution=self.coef_dist_dict[coef_name])
         self.coef_dict = nn.ModuleDict(coef_dict)
 
 # ==============================================================================================================
@@ -935,6 +955,8 @@ def reshape_observable(obs, name):
             assert obs.shape == (R, P, I, positive_integer)
 
             additive_term = (coef_sample * obs).sum(dim=-1)
+            # Gamma coefficients are strictly positive, so price enters the
+            # utility with an explicit negative sign (applies to every term
+            # involving price_obs).
+            if obs_name == 'price_obs':
+                additive_term *= -1.0
 
         # Type IV: factorized coefficient multiplied by observable.
         # e.g., gamma_user * beta_item * price_obs.
@@ -965,6 +987,8 @@ def reshape_observable(obs, name):
             coef = (coef_sample_0 * coef_sample_1).sum(dim=-1)
 
             additive_term = (coef * obs).sum(dim=-1)
+            if obs_name == 'price_obs':
+                additive_term *= -1.0
 
         else:
             raise ValueError(f'Undefined term type: {term}')
@@ -1167,6 +1191,8 @@ def reshape_observable(obs, name):
             assert obs.shape == (R, total_computation, positive_integer)
 
             additive_term = (coef_sample * obs).sum(dim=-1)
+            if obs_name == 'price_obs':
+                additive_term *= -1.0
 
         # Type IV: factorized coefficient multiplied by observable.
         # e.g., gamma_user * beta_item * price_obs.
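The sign flips above are easy to miss in review, so here is a tiny self-contained illustration with made-up tensors (not BEMB code). The rationale, as I read it, is that a gamma-distributed price coefficient is constrained to be positive, so a negative price sensitivity has to come from negating the term itself.

```python
import torch

# A positive posterior draw of the coefficient and an observed price.
gamma_user = torch.tensor([[0.8]])     # shape (1, 1), always > 0 under a gamma
price_obs = torch.tensor([[2.5]])      # shape (1, 1)

additive_term = (gamma_user * price_obs).sum(dim=-1)
additive_term *= -1.0                  # utility now falls as price rises
print(additive_term)                   # tensor([-2.])
```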
@@ -1196,6 +1222,8 @@ def reshape_observable(obs, name):
             coef = (coef_sample_0 * coef_sample_1).sum(dim=-1)
 
             additive_term = (coef * obs).sum(dim=-1)
+            if obs_name == 'price_obs':
+                additive_term *= -1.0
 
         else:
             raise ValueError(f'Undefined term type: {term}')
diff --git a/tutorials/supermarket/configs.yaml b/tutorials/supermarket/configs.yaml
index ce6b672..7ecc4f7 100644
--- a/tutorials/supermarket/configs.yaml
+++ b/tutorials/supermarket/configs.yaml
@@ -1,30 +1,36 @@
 device: cuda
 # data_dir: /home/tianyudu/Data/MoreSupermarket/tsv/
-data_dir: /home/tianyudu/Data/MoreSupermarket/20180101-20191231_13/tsv/
+# data_dir: /home/tianyudu/Data/MoreSupermarket/20180101-20191231_13/tsv/
+data_dir: /oak/stanford/groups/athey/MoreSupermarkets/csv/new_data/nf_runs/1904/20180101-20191231_44/tsv
 # utility: lambda_item
 # utility: lambda_item + theta_user * alpha_item
 # utility: lambda_item + theta_user * alpha_item + zeta_user * item_obs
-utility: lambda_item + theta_user * alpha_item + gamma_user * beta_item * price_obs
+# utility: lambda_item + theta_user * alpha_item + gamma_user * beta_item * price_obs
+utility: lambda_item + theta_user * alpha_item + gamma_user * price_obs
+# utility: alpha_item * gamma_user * price_obs
 out_dir: ./output/
 # model configuration.
+coef_dist_dict:
+  default: 'gaussian'
+  gamma_user: 'gamma'
 obs2prior_dict:
-  lambda_item: True
-  theta_user: True
-  alpha_item: True
+  lambda_item: False
+  theta_user: False
+  alpha_item: False
   zeta_user: True
   lota_item: True
-  gamma_user: True
+  gamma_user: False
   beta_item: True
 coef_dim_dict:
   lambda_item: 1
   theta_user: 10
   alpha_item: 10
-  gamma_user: 10
-  beta_item: 10
+  gamma_user: 1
+  beta_item: 1
#### optimization.
 trace_log_q: False
 shuffle: False
 batch_size: 100000
-num_epochs: 3
+num_epochs: 100
 learning_rate: 0.03
-num_mc_seeds: 1
+num_mc_seeds: 2
diff --git a/tutorials/supermarket/main.py b/tutorials/supermarket/main.py
index 7bbb1e4..5cd5479 100644
--- a/tutorials/supermarket/main.py
+++ b/tutorials/supermarket/main.py
@@ -197,6 +197,7 @@ def load_tsv(file_name, data_dir):
     bemb = LitBEMBFlex(
         # training args.
         pred_item=configs.pred_item,
+        coef_dist_dict=configs.coef_dist_dict,
         learning_rate=configs.learning_rate,
         num_seeds=configs.num_mc_seeds,
         # model args, will be passed to BEMB constructor.
@@ -217,6 +218,12 @@ def load_tsv(file_name, data_dir):
     bemb = bemb.to(configs.device)
     bemb = run(bemb, dataset_list, batch_size=configs.batch_size, num_epochs=configs.num_epochs)
 
+    # variational_mean already applies the positivity transform (squaring plus
+    # epsilon) for gamma coefficients, so no further squaring is needed here.
+    coeffs = bemb.model.coef_dict['gamma_user'].variational_mean.detach().cpu().numpy()
+    # summarize the distribution of the fitted gamma_user coefficients.
+    print('Coefficients statistics:')
+    print(pd.DataFrame(coeffs).describe())
+
 # ==============================================================================================
 # inference example
 # ==============================================================================================
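As a possible follow-up to the summary printed above (a sketch, assuming the fitted `bemb` from this script): the gamma posterior's moments can also be read off the variational distribution directly, instead of re-deriving them from raw parameters.

```python
import pandas as pd

# The variational posterior of gamma_user is a torch.distributions.Gamma,
# exposed by the BayesianCoefficient.variational_distribution property.
q = bemb.model.coef_dict['gamma_user'].variational_distribution
means = q.mean.detach().cpu().numpy().squeeze(-1)           # shape (num_users,)
stds = q.variance.sqrt().detach().cpu().numpy().squeeze(-1)
print(pd.DataFrame({'posterior_mean': means, 'posterior_std': stds}).describe())
```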