Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move to 2024 constituency targeting #74

Merged
merged 25 commits into from
Feb 20, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@
!incomes_projection.csv
!policyengine_uk_data/datasets/frs/local_areas/**/*.csv
**/_build
!policyengine_uk_data/storage/*.csv
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@ test:
pytest

install:
pip install policyengine-uk
pip install policyengine-uk==2.19.1
pip install policyengine>=2.4
pip install -e ".[dev]" --config-settings editable_mode=compat

install-uv:
uv pip install --system policyengine-uk
uv pip install --system policyengine-uk==2.19.1
uv pip install --system policyengine>=2.4
uv pip install --system -e ".[dev]" --config-settings editable_mode=compat

Expand Down
5,920 changes: 40 additions & 5,880 deletions docs/methodology.ipynb

Large diffs are not rendered by default.

179 changes: 77 additions & 102 deletions docs/validation/constituencies.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/validation/local_authorities.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -606,7 +606,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
"version": "3.12.9"
}
},
"nbformat": 4,
Expand Down
9 changes: 0 additions & 9 deletions policyengine_uk_data/datasets/frs/enhanced_frs.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,7 @@ def generate(self):

self.add_random_variables(data)

# Reweighting

data = self.load_dataset()
original_weights = data["household_weight"][str(self.time_period)] + 10
for year in range(self.time_period, self.end_year + 1):
loss_matrix, targets_array = create_target_matrix(self, year)
new_weights = reweight(
original_weights, loss_matrix, targets_array
)
data["household_weight"][str(year)] = new_weights

self.save_dataset(data)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,5 @@
row_sums = mapping_matrix.sum(axis=1)
if not np.allclose(row_sums, 1.0):
print("Warning: Not all rows sum to 1. Check data for consistency.")

mapping_matrix = mapping_matrix.T
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,7 @@


def calibrate(
map_to_2024_boundaries: bool = True,
epochs: int = 256,
epochs: int = 512,
):
matrix, y = create_constituency_target_matrix("enhanced_frs_2022_23", 2025)

Expand Down Expand Up @@ -61,17 +60,27 @@ def loss(w):

return mse_c + mse_n

def pct_close(w, t=0.1):
def pct_close(w, t=0.1, constituency=True, national=True):
# Return the fraction of metrics whose prediction is within a relative tolerance t of the target (t=0.1 means within 10%)
numerator = 0
denominator = 0
pred_c = (w.unsqueeze(-1) * metrics.unsqueeze(0)).sum(dim=1)
e_c = torch.sum(torch.abs((pred_c / (1 + y) - 1)) < t)
e_c = torch.sum(torch.abs((pred_c / (1 + y) - 1)) < t).item()
c_c = pred_c.shape[0] * pred_c.shape[1]

if constituency:
numerator += e_c
denominator += c_c

pred_n = (w.sum(axis=0) * matrix_national.T).sum(axis=1)
e_n = torch.sum(torch.abs((pred_n / (1 + y_national) - 1)) < t)
e_n = torch.sum(torch.abs((pred_n / (1 + y_national) - 1)) < t).item()
c_n = pred_n.shape[0]

return (e_c + e_n) / (c_c + c_n)
if national:
numerator += e_n
denominator += c_n

return numerator / denominator

def dropout_weights(weights, p):
if p == 0:
Expand All @@ -83,7 +92,7 @@ def dropout_weights(weights, p):
masked_weights[mask] = mean
return masked_weights

optimizer = torch.optim.Adam([weights], lr=0.1)
optimizer = torch.optim.Adam([weights], lr=0.15)

desc = range(32) if os.environ.get("DATA_LITE") else range(epochs)

Expand All @@ -93,28 +102,28 @@ def dropout_weights(weights, p):
l = loss(weights_)
l.backward()
optimizer.step()
close = pct_close(weights_)
c_close = pct_close(weights_, constituency=True, national=False)
n_close = pct_close(weights_, constituency=False, national=True)
if epoch % 1 == 0:
print(
f"Loss: {l.item()}, Epoch: {epoch}, Constituency<10%: {c_close:.1%}, National<10%: {n_close:.1%}"
)
if epoch % 10 == 0:
print(f"Loss: {l.item()}, Epoch: {epoch}, Within 10%: {close:.2%}")

final_weights = torch.exp(weights).detach().numpy()

if map_to_2024_boundaries:
final_weights = mapping_matrix @ final_weights

with h5py.File(
STORAGE_FOLDER / "parliamentary_constituency_weights.h5", "w"
) as f:
f.create_dataset("2025", data=final_weights)

# Override national weights in 2025 with the sum of the constituency weights

with h5py.File(
STORAGE_FOLDER / "enhanced_frs_2022_23.h5",
"r+",
) as f:
national_weights = final_weights.sum(axis=0)
f["household_weight/2025"][...] = national_weights
final_weights = torch.exp(weights).detach().numpy()

with h5py.File(
STORAGE_FOLDER / "parliamentary_constituency_weights.h5", "w"
) as f:
f.create_dataset("2025", data=final_weights)

with h5py.File(
STORAGE_FOLDER / "enhanced_frs_2022_23.h5", "r+"
) as f:
if "household_weight/2025" in f:
del f["household_weight/2025"]
f.create_dataset(
"household_weight/2025", data=final_weights.sum(axis=0)
)

return final_weights

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
from policyengine_uk_data.utils.loss import (
create_target_matrix as create_national_target_matrix,
)
from policyengine_uk_data.storage import STORAGE_FOLDER
from policyengine_uk_data.datasets.frs.local_areas.constituencies.boundary_changes.mapping_matrix import (
mapping_matrix,
)

FOLDER = Path(__file__).parent

Expand Down Expand Up @@ -39,20 +43,21 @@ def create_constituency_target_matrix(
"employment_income",
]

for income_variable in INCOME_VARIABLES:
for income_variable in INCOME_VARIABLES[:1]:
income_values = sim.calculate(income_variable).values
in_spi_frame = sim.calculate("income_tax").values > 0
matrix[f"hmrc/{income_variable}/amount"] = sim.map_result(
income_values, "person", "household"
income_values * in_spi_frame, "person", "household"
)
y[f"hmrc/{income_variable}/amount"] = incomes[
f"{income_variable}_amount"
]
].values
matrix[f"hmrc/{income_variable}/count"] = sim.map_result(
income_values != 0, "person", "household"
(income_values != 0) * in_spi_frame, "person", "household"
)
y[f"hmrc/{income_variable}/count"] = incomes[
f"{income_variable}_count"
]
].values

age = sim.calculate("age").values
for lower_age in range(0, 80, 10):
Expand All @@ -78,7 +83,7 @@ def create_constituency_target_matrix(
) + [np.inf]

for lower_bound, upper_bound in zip(bounds[:-1], bounds[1:]):
if lower_bound >= 70_000 or lower_bound < 12_570:
if lower_bound < 12_570 or upper_bound > 70_000:
continue
in_bound = (
(employment_income >= lower_bound)
Expand Down Expand Up @@ -106,25 +111,42 @@ def create_constituency_target_matrix(
if uprate:
y = uprate_targets(y, time_period)

const_2024 = pd.read_csv(STORAGE_FOLDER / "constituencies_2024.csv")
const_2010 = pd.read_csv(STORAGE_FOLDER / "constituencies_2010.csv")

y_2010 = y.copy()
y_2010["name"] = const_2010["name"].values

y_columns = list(y.columns)
y_values = mapping_matrix @ y.values # Transform to 2024 constituencies

y = pd.DataFrame(y_values, columns=y_columns)

y_2024 = y.copy()
y_2024["name"] = const_2024["name"].values

return matrix, y


def uprate_targets(y: pd.DataFrame, target_year: int = 2025) -> pd.DataFrame:
# Uprate age targets from 2020, taxable income targets from 2021, employment income targets from 2023.
# Use PolicyEngine uprating factors.
sim = Microsimulation(dataset="frs_2020_21")
from policyengine_uk_data.datasets.frs.frs import FRS_2020_21

sim = Microsimulation(dataset=FRS_2020_21)
matrix_20, y_20 = create_constituency_target_matrix(
"frs_2020_21", 2020, uprate=False
FRS_2020_21, 2020, uprate=False
)
matrix_21, y_21 = create_constituency_target_matrix(
"frs_2020_21", 2021, uprate=False
FRS_2020_21, 2021, uprate=False
)
matrix_23, y_23 = create_constituency_target_matrix(
"frs_2020_21", 2023, uprate=False
FRS_2020_21, 2023, uprate=False
)
matrix_final, y_final = create_constituency_target_matrix(
"frs_2020_21", target_year, uprate=False
FRS_2020_21, target_year, uprate=False
)

weights_20 = sim.calculate("household_weight", 2020)
weights_21 = sim.calculate("household_weight", 2021)
weights_23 = sim.calculate("household_weight", 2023)
Expand Down

This file was deleted.

Loading
Loading