Merge pull request #12 from ihmeuw-msca/rename_prevalence_to_rate
Rename prevalence to rate
AHsu98 authored Jan 29, 2023
2 parents ae34d9d + b6edcf4 commit f0d612d
Showing 6 changed files with 66 additions and 60 deletions.
31 changes: 19 additions & 12 deletions examples/Basic Splitting Examples.ipynb
@@ -28,7 +28,7 @@
"outputs": [],
"source": [
"populations=np.array([20,10,5])\n",
-"global_rate=np.array([0.1,0.5,0.7])\n",
+"rate_pattern=np.array([0.1,0.5,0.7])\n",
"measured_count=31\n",
"measured_count_SE=2.5"
]
@@ -57,7 +57,7 @@
"estimate,SE,CI=split_datapoint(\n",
" measured_count,\n",
" populations,\n",
-" global_rate,\n",
+" rate_pattern,\n",
" measured_count_SE,\n",
" model=RateMultiplicativeModel()\n",
")\n",
@@ -97,7 +97,7 @@
"estimate,SE,CI=split_datapoint(\n",
" measured_count,\n",
" populations,\n",
-" global_rate,\n",
+" rate_pattern,\n",
" measured_count_SE,\n",
" model=LogOdds_model()\n",
")\n",
@@ -120,7 +120,7 @@
"metadata": {},
"source": [
"### Example where we get unreasonable estimates\n",
-"The rate-multiplicative model here estimates 14 events in a group of only 10 people, while the LogOdds model provides a more reasonable estimate, adjusting the larger group more, and increasing the estimated rate for them relatively more, as we incorporate the prior information that the prevalence cannot go above 1. "
+"The rate-multiplicative model here estimates 14 events in a group of only 10 people, while the LogOdds model provides a more reasonable estimate, adjusting the larger group more, and increasing the estimated rate for them relatively more, as we incorporate the prior information that the estimated rate cannot go above 1. "
]
},
{
@@ -155,6 +155,13 @@
"))"
]
},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"# Example below is somewhat out of date, see dataframe splitting example"
+]
+},
{
"cell_type": "markdown",
"metadata": {},
@@ -190,10 +197,10 @@
"observed_study_rate=0.7\n",
"study_se=0.1\n",
"\n",
-"#This hack of prevalence ratio works when we assume multiplicativity in rate rather than in odds\n",
-"baseline_male_prev=1.2\n",
-"baseline_female_prev=1\n",
-"sex_splitting_model=RateMultiplicativeModel(np.array([baseline_female_prev,baseline_male_prev]))\n",
+"#This hack of using the ratio of incidence rates works when we assume multiplicativity in rate rather than in odds\n",
+"baseline_male_rate=1.2\n",
+"baseline_female_rate=1\n",
+"sex_splitting_model=RateMultiplicativeModel(np.array([baseline_female_rate,baseline_male_rate]))\n",
"\n",
"#Fit with study_props, the study population proportions\n",
"sex_splitting_model.fit_beta(\n",
@@ -232,7 +239,7 @@
"population_age_pattern=np.array([100,200,300,100])\n",
"global_age_pattern=np.array([0.5,0.3,0.5,0.6])\n",
"\n",
-"within_age_sex_prevalence_ratios=np.array([1.2,1.1,1,0.8])\n",
+"within_age_sex_rate_ratios=np.array([1.2,1.1,1,0.8])\n",
"within_age_sex_proportions=np.array([\n",
" [0.4,0.6],\n",
" [0.5,0.5],\n",
@@ -264,17 +271,17 @@
"combined_split_results=np.zeros((4,3))\n",
"combined_split_results[:,0]=age_results\n",
"\n",
-"for age_incidence,sex_proportions,population_at_age,prev_ratio,age_id in zip(\n",
+"for age_incidence,sex_proportions,population_at_age,rate_ratio,age_id in zip(\n",
" age_results,\n",
" within_age_sex_proportions,\n",
" population_age_pattern,\n",
-" within_age_sex_prevalence_ratios,\n",
+" within_age_sex_rate_ratios,\n",
" range(len(age_results))\n",
" ):\n",
" combined_split_results[age_id,1:3]=split_datapoint(\n",
" age_incidence,\n",
" population_at_age*sex_proportions,\n",
-" np.array([prev_ratio,1]),\n",
+" np.array([rate_ratio,1]),\n",
" model=sex_splitting_model\n",
" )\n",
" \n",
6 changes: 3 additions & 3 deletions readme.md
@@ -5,7 +5,7 @@ The most basic functionality is to perform disaggregation under the rate multipl

The setup is as follows:

-Let $D_{1,...,k}$ be an aggregated measurement across groups ${g_1,...,g_k}$, where the population of each is $p_i,...,p_k$. Let $f_1,...,f_k$ be the baseline pattern of the prevalence across groups, which could have potentially been estimated on a larger dataset or a population in which have higher quality data on. Using this data, we generate estimates for $D_i$, the number of events in group $g_i$ and $\hat{f_{i}}$, the prevalence in each group in the population of interest by combining $D_{1,...,k}$ with $f_1,...,f_k$ to make the estimates self consistent.
+Let $D_{1,...,k}$ be an aggregated measurement across groups ${g_1,...,g_k}$, where the population of each is $p_1,...,p_k$. Let $f_1,...,f_k$ be the baseline pattern of the rates across groups, which could potentially have been estimated on a larger dataset or a population for which we have higher-quality data. Using this data, we generate estimates for $D_i$, the number of events in group $g_i$, and $\hat{f_{i}}$, the rate in each group in the population of interest, by combining $D_{1,...,k}$ with $f_1,...,f_k$ to make the estimates self-consistent.

Mathematically, in the simpler rate multiplicative model, we find $\beta$ such that
$$D_{1,...,k} = \sum_{i=1}^{k}\hat{f}_i \cdot p_i $$
@@ -15,11 +15,11 @@
$$\hat{f_i} = T^{-1}(\beta + T(f_i)) $$
This yields the estimates for the per-group event count,

$$D_i = \hat f_i \cdot p_i $$
-For the current models in use, T is just a logarithm, and this assumes that each prevalence is some constant multiple muliplied by the overall global or baseline prevalence level. Allowing a more general transformation T, such as a log-odds transformation, assumes multiplicativity in the prevalence odds, rather than the prevalence rate, and can produce better estimates statistically (potentially being a more realistic assumption in some cases) and practically, restricting the estimated prevalences to lie within a reasonable interval.
+For the current models in use, T is just a logarithm, and this assumes that each rate is some constant multiplied by the overall rate pattern level. Allowing a more general transformation T, such as a log-odds transformation, assumes multiplicativity in the associated odds, rather than the rate, and can produce better estimates statistically (potentially being a more realistic assumption in some cases) and practically, restricting the estimated rates to lie within a reasonable interval.

## Current Package Capabilities and Models
Currently, the multiplicative-in-rate model RateMultiplicativeModel with $T(x)=\log(x)$ and the Log Modified Odds model LMO_model(m) with $T(x)=\log(\frac{x}{1-x^{m}})$ are implemented. Note that LMO_model with m=1 gives a multiplicative-in-odds model.

A useful (but slightly wrong) analogy is that the multiplicative-in-rate model is to the multiplicative-in-odds model as ordinary least squares is to logistic regression, in terms of the relationship between covariates and output (not in terms of anything like the likelihood).

-Increasing in the model LMO_model(m) gives results that are more similar to the multiplicative-in-rate model currently in use, while preserving the property that prevalence estimates are bounded by 1.
+Increasing m in the model LMO_model(m) gives results that are more similar to the multiplicative-in-rate model currently in use, while preserving the property that rate estimates are bounded by 1.
9 changes: 4 additions & 5 deletions src/pydisagg/disaggregate.py
@@ -6,7 +6,7 @@
def split_datapoint(
measured_count,
bucket_populations,
-baseline_prevalence,
+rate_pattern,
measured_count_se=None,
model=LMO_model(1),
CI_method='delta-wald'
@@ -23,7 +23,7 @@ def split_datapoint(
bucket_populations,
measured_count,
measured_count_se,
-baseline_prevalence,
+rate_pattern,
CI_method=CI_method
)

@@ -52,9 +52,8 @@ def split_dataframe(
population_sizes: dataframe with location_id as the index containing the
size of each group within each population (given the location_id)
-baseline_prevalences: dataframe with pattern_id as the index, and columns
-for each of the groups_to_split where the entries represent the baseline
-prevalence in the given group to use for pydisagg.
+rate_patterns: dataframe with pattern_id as the index, and columns
+for each of the groups_to_split where the entries represent the rate pattern in the given group to use for pydisagg.
use_se: Boolean, whether or not to report standard errors along with estimates
if set to True, then observation_group_membership_df must have an obs_se column
16 changes: 8 additions & 8 deletions src/pydisagg/models.py
@@ -4,22 +4,22 @@

class RateMultiplicativeModel(SplittingModel):
'''
-Produces a AgeSplittingModel using the log(prev) transformation with the exponent m.
-This assumes that log(prev)=log(baseline_prev)+beta
+Produces an AgeSplittingModel using the log(rate) transformation.
+This assumes that log(rate)=log(rate_pattern)+beta
resulting in the current multiplicative model after exponentiating
Take exp(beta) to recover the multiplier in the model.
'''

def __init__(
self,
-baseline_prevalence=None,
+rate_pattern=None,
beta_parameter=None,
error_inflation=None,
beta_standard_error=None
):
super().__init__(
parameter_transformation=transformations.LogTransformation(),
-baseline_prevalence=baseline_prevalence,
+rate_pattern=rate_pattern,
beta_parameter=beta_parameter,
error_inflation=error_inflation,
beta_standard_error=beta_standard_error
@@ -34,14 +34,14 @@ class LMO_model(SplittingModel):
def __init__(
self,
m,
-baseline_prevalence=None,
+rate_pattern=None,
beta_parameter=None,
error_inflation=None,
beta_standard_error=None
):
super().__init__(
parameter_transformation=transformations.LogModifiedOddsTransformation(m),
-baseline_prevalence=baseline_prevalence,
+rate_pattern=rate_pattern,
beta_parameter=beta_parameter,
error_inflation=error_inflation,
beta_standard_error=beta_standard_error
@@ -55,14 +55,14 @@ class LogOdds_model(SplittingModel):

def __init__(
self,
-baseline_prevalence=None,
+rate_pattern=None,
beta_parameter=None,
error_inflation=None,
beta_standard_error=None
):
super().__init__(
parameter_transformation=transformations.LogOddsTransformation(),
-baseline_prevalence=baseline_prevalence,
+rate_pattern=rate_pattern,
beta_parameter=beta_parameter,
error_inflation=error_inflation,
beta_standard_error=beta_standard_error
60 changes: 30 additions & 30 deletions src/pydisagg/splittingmodel.py
@@ -9,18 +9,18 @@ class SplittingModel:
multiplicativity in some space, but fit an additive parameter beta throughout
We do this because it works better with the delta method calculations, and we assume that we
-have a log into the transormations so additive factors become multiplicative
+have a log in the transformations so additive factors become multiplicative
'''

def __init__(
self,
parameter_transformation=None,
-baseline_prevalence=None,
+rate_pattern=None,
beta_parameter=None,
error_inflation=None,
beta_standard_error=None
):
-self.baseline_prevalence = baseline_prevalence
+self.rate_pattern = rate_pattern
self.beta_parameter = beta_parameter
self.beta_standard_error = beta_standard_error
self.error_inflation = error_inflation
@@ -43,38 +43,38 @@ def pull_beta(self, beta):
else:
raise Exception("Not fitted, No Beta Parameter Available")

-def pull_set_baseline_prevalence(self, baseline_prevalence):
+def pull_set_rate_pattern(self, rate_pattern):
'''
-Checks whether baseline_prevalence parameter is available in input, or if it is None
-if baseline_prevalence is not none, it will return it and set it as self.baseline_prevalence
-If baseline_prevalence is none, then this will try and return
-self.baseline_prevalence.
+Checks whether the rate_pattern parameter is available in input, or if it is None
+If rate_pattern is not None, it will return it and set it as self.rate_pattern
+If rate_pattern is None, then this will try to return
+self.rate_pattern.
If neither are available, this will raise an exception
'''
-if baseline_prevalence is not None:
-self.baseline_prevalence = baseline_prevalence
-return baseline_prevalence
-elif self.baseline_prevalence is not None:
-return self.baseline_prevalence
+if rate_pattern is not None:
+self.rate_pattern = rate_pattern
+return rate_pattern
+elif self.rate_pattern is not None:
+return self.rate_pattern
else:
-raise Exception("No Baseline Prevalence Available")
+raise Exception("No Rate Pattern Available")

-def predict_rates(self, beta=None, baseline_prevalence=None):
+def predict_rates(self, beta=None, rate_pattern=None):
'''
-Generates a predicted prevalence within each bucket assuming
+Generates a predicted rate within each bucket assuming
multiplicativity in the T-transformed space with the additive parameter
'''
beta_val = self.pull_beta(beta)
-baseline_prevalence_val = self.pull_set_baseline_prevalence(baseline_prevalence)
+rate_pattern_val = self.pull_set_rate_pattern(rate_pattern)

-return self.T_inverse(beta_val + self.T(baseline_prevalence_val))
+return self.T_inverse(beta_val + self.T(rate_pattern_val))

def _predict_rates_SE(self, beta_val, SE_val):
'''
-Computes the standard error of the predicted prevalence in each bucket
+Computes the standard error of the predicted rate in each bucket
using the delta method, propagating the given standard error on beta
'''
-return self._rate_derivative(beta_val, self.baseline_prevalence)*SE_val
+return self._rate_derivative(beta_val, self.rate_pattern)*SE_val

def _H_func(self, beta, bucket_populations):
'''
@@ -83,27 +83,27 @@ def _H_func(self, beta, bucket_populations):
'''
return np.sum(bucket_populations*self.predict_rates(beta))

-def _rate_derivative(self, beta, global_rate_value):
+def _rate_derivative(self, beta, rate_pattern_value):
'''
Computes the derivative with respect to beta
-of the predicted prevalence in a bucket with a fixed global prevalence value
+of the predicted rate in a bucket with a fixed rate_pattern value
'''
-return 1/self.T_diff(self.T_inverse(beta + self.T(global_rate_value)))
+return 1/self.T_diff(self.T_inverse(beta + self.T(rate_pattern_value)))

def _H_diff(self, beta, bucket_populations):
'''
Function outputs the derivative of the above _H_func with respect to beta
'''
return np.sum(bucket_populations *
-self._rate_derivative(beta, self.baseline_prevalence)
+self._rate_derivative(beta, self.rate_pattern)
)

def fit_beta(
self,
bucket_populations,
measured_count,
measured_count_se=None,
-baseline_prevalence=None,
+rate_pattern=None,
lower_guess=-50,
upper_guess=50,
verbose=0
@@ -113,7 +113,7 @@ def fit_beta(
Will attempt to generate a standard error using the delta method if a standard error
for the age density is given.
'''
-_ = self.pull_set_baseline_prevalence(baseline_prevalence)
+_ = self.pull_set_rate_pattern(rate_pattern)

def beta_misfit(beta):
return self._H_func(beta, bucket_populations)-measured_count
@@ -153,7 +153,7 @@ def predict_rates_CI(self, alpha=0.05, method='delta-wald'):
from the standard error on beta
Method "delta-wald" propagates the standard error on beta through to the
-predicted rate using delta method of Tinv(beta + T(global_rate))
+predicted rate using delta method of Tinv(beta + T(rate_pattern))
This gives a symmetric confidence interval that will be self consistent with
any confidence interval on your original measurement
@@ -225,7 +225,7 @@ def predict_count_CI(self, bucket_populations, alpha=0.05, method='delta-wald'):
given an age density from the standard error on beta
Method "delta-wald" propagates the standard error on beta through to the
-predicted rate using delta method of Tinv(beta + T(global_rate))
+predicted rate using delta method of Tinv(beta + T(rate_pattern))
This gives a symmetric confidence interval that will be self consistent with
any confidence interval on your original measurement
@@ -251,7 +251,7 @@ def split_groups(
bucket_populations,
measured_count=None,
measured_count_se=None,
-baseline_prevalence=None,
+rate_pattern=None,
CI_method='delta-wald',
alpha=0.05
):
@@ -260,7 +260,7 @@
If a measured_count and measured_count_se argument is given,
then we refit the model to the bucket_populations and the measured_count first before predicting
'''
-_ = self.pull_set_baseline_prevalence(baseline_prevalence)
+_ = self.pull_set_rate_pattern(rate_pattern)

if measured_count is not None:
self.fit_beta(bucket_populations, measured_count, measured_count_se, verbose=0)
4 changes: 2 additions & 2 deletions tests/test_splitting_consistency.py
@@ -17,10 +17,10 @@ def test_model_consistency(model):
populations = np.array([2, 5])
measured_total = 4.8
measurement_SE = 1
-baseline_prevalence = np.array([0.2, 0.4])
+rate_pattern = np.array([0.2, 0.4])

result, SE, CI = split_datapoint(measured_total, populations,
-baseline_prevalence, measurement_SE,
+rate_pattern, measurement_SE,
model)
assert_approx_equal(measured_total, np.sum(result))
assert_approx_equal(measurement_SE, np.sum(SE))
