Skip to content

Commit

Permalink
Add first pass training multi-cards sqft agg
Browse files Browse the repository at this point in the history
  • Loading branch information
wagnerlmichael committed Jan 22, 2025
1 parent c46937c commit c027cb0
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 20 deletions.
56 changes: 54 additions & 2 deletions pipeline/01-train.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,62 @@ message("Preparing model training data")
# NOTE: It is critical to trim "multicard" sales when training. Multicard means
# there are multiple buildings on a PIN. Since these sales include multiple
# buildings, they are typically higher than a "normal" sale and must be removed
training_data_full <- read_parquet(paths$input$training$local) %>%
filter(!ind_pin_is_multicard, !sv_is_outlier) %>%

# - - - - - -
# TESTING - multi-card training strategy
# - - - - - -

# Load the training sales from the local parquet file, dropping rows flagged by
# sv_is_outlier and ordering by ascending meta_sale_date. Unlike the previous
# filter, rows are NOT dropped on ind_pin_is_multicard, so multi-card sales
# remain in this frame
training_data_w_multi_card <- paths$input$training$local %>%
  read_parquet() %>%
  filter(!sv_is_outlier) %>%
  arrange(meta_sale_date)



# Collapse each sale (meta_pin + meta_sale_document_num) to a single training
# row. The row kept is the card with the largest char_bldg_sf, and its
# char_bldg_sf is overwritten with the summed char_bldg_sf of every card in the
# sale. Sales with only one row pass through with their square footage
# unchanged
training_data_full <- training_data_w_multi_card %>%
  group_by(meta_pin, meta_sale_document_num) %>%
  # Sum before dropping any rows so every card contributes to the total. Note
  # that sum() has no na.rm here, so one NA card makes the sale's total NA
  mutate(total_char_bldg_sf = sum(char_bldg_sf)) %>%
  # Largest card first within each sale; arrange() places NA values last
  arrange(desc(char_bldg_sf), .by_group = TRUE) %>%
  filter(row_number() == 1) %>%
  ungroup() %>%
  mutate(char_bldg_sf = total_char_bldg_sf) %>%
  select(-total_char_bldg_sf) %>%
  # arrange(.by_group = TRUE) above re-sorts the whole frame by meta_pin and
  # meta_sale_document_num, discarding the meta_sale_date ordering of the input.
  # Restore it here.
  # NOTE(review): the train/test split by time further down presumably relies
  # on rows being ordered by sale date -- confirm
  arrange(meta_sale_date)



# * * * *
# some eda stuff
# * * * *
# training_data_w_multi_card_processed %>%
# mutate(temp_price_per_sqft = meta_sale_price/char_bldg_sf) %>%
# select(meta_pin, meta_sale_document_num, ind_pin_is_multicard,
# meta_sale_price, temp_price_per_sqft,
# sv_is_outlier, char_bldg_sf,
# meta_sale_date) %>% View()
#
#
# training_data_w_multi_card_processed %>%
# mutate(temp_price_per_sqft = meta_sale_price/char_bldg_sf) %>%
# mutate(meta_sale_price_sf_decile = ntile(temp_price_per_sqft, 10)) %>%
# group_by(meta_sale_price_sf_decile, ind_pin_is_multicard) %>%
# summarise(count = n(), .groups = "drop") %>%
# group_by(meta_sale_price_sf_decile) %>%
# mutate(percentage = count / sum(count) * 100)

Check warning on line 83 in pipeline/01-train.R

View workflow job for this annotation

GitHub Actions / pre-commit

file=/home/runner/work/model-res-avm/model-res-avm/pipeline/01-train.R,line=83,col=5,[commented_code_linter] Commented code should be removed.

# training_data_full <- read_parquet(paths$input$training$local) %>%
# filter(!ind_pin_is_multicard, !sv_is_outlier) %>%
# arrange(meta_sale_date)

Check warning on line 87 in pipeline/01-train.R

View workflow job for this annotation

GitHub Actions / pre-commit

file=/home/runner/work/model-res-avm/model-res-avm/pipeline/01-train.R,line=87,col=5,[commented_code_linter] Commented code should be removed.

# Create train/test split by time, with most recent observations in the test set
# We want our best model(s) to be predictive of the future, since properties are
# assessed on the basis of past sales
Expand Down
36 changes: 18 additions & 18 deletions pipeline/02-assess.R
Original file line number Diff line number Diff line change
Expand Up @@ -79,28 +79,28 @@ assessment_card_data_mc <- assessment_card_data_pred %>%
# across multiple PINs sometimes receives different values from the model
group_by(meta_tieback_key_pin, meta_card_num, char_land_sf) %>%
mutate(
pred_card_intermediate_fmv = ifelse(
pred_pin_card_sum = ifelse(
is.na(meta_tieback_key_pin),
pred_card_initial_fmv,
mean(pred_card_initial_fmv)
)
) %>%
# Aggregate multi-cards to the PIN-level by summing the predictions
# of all cards. We use a heuristic here to limit the PIN-level total
# value, this is to prevent super-high-value back-buildings/ADUs from
# blowing up the PIN-level AV
group_by(meta_pin) %>%
mutate(
pred_pin_card_sum = ifelse(
sum(pred_card_intermediate_fmv) * meta_tieback_proration_rate <=
params$pv$multicard_yoy_cap * first(meta_1yr_pri_board_tot * 10) |
is.na(meta_1yr_pri_board_tot) |
n() != 2,
sum(pred_card_intermediate_fmv),
max(pred_card_intermediate_fmv)
)
) %>%
ungroup()
) #%>%
# # Aggregate multi-cards to the PIN-level by summing the predictions
# # of all cards. We use a heuristic here to limit the PIN-level total
# # value, this is to prevent super-high-value back-buildings/ADUs from

Check warning on line 90 in pipeline/02-assess.R

View workflow job for this annotation

GitHub Actions / pre-commit

file=/home/runner/work/model-res-avm/model-res-avm/pipeline/02-assess.R,line=90,col=3,[commented_code_linter] Commented code should be removed.
# # blowing up the PIN-level AV
# group_by(meta_pin) %>%
# mutate(
# pred_pin_card_sum = ifelse(
# sum(pred_card_intermediate_fmv) * meta_tieback_proration_rate <=
# params$pv$multicard_yoy_cap * first(meta_1yr_pri_board_tot * 10) |
# is.na(meta_1yr_pri_board_tot) |
# n() != 2,

Check warning on line 98 in pipeline/02-assess.R

View workflow job for this annotation

GitHub Actions / pre-commit

file=/home/runner/work/model-res-avm/model-res-avm/pipeline/02-assess.R,line=98,col=9,[commented_code_linter] Commented code should be removed.
# sum(pred_card_intermediate_fmv),

Check warning on line 99 in pipeline/02-assess.R

View workflow job for this annotation

GitHub Actions / pre-commit

file=/home/runner/work/model-res-avm/model-res-avm/pipeline/02-assess.R,line=99,col=7,[commented_code_linter] Commented code should be removed.
# max(pred_card_intermediate_fmv)

Check warning on line 100 in pipeline/02-assess.R

View workflow job for this annotation

GitHub Actions / pre-commit

file=/home/runner/work/model-res-avm/model-res-avm/pipeline/02-assess.R,line=100,col=7,[commented_code_linter] Commented code should be removed.
# )
# ) %>%
# ungroup()

Check warning on line 103 in pipeline/02-assess.R

View workflow job for this annotation

GitHub Actions / pre-commit

file=/home/runner/work/model-res-avm/model-res-avm/pipeline/02-assess.R,line=103,col=3,[commented_code_linter] Commented code should be removed.


## 3.2. Townhomes --------------------------------------------------------------
Expand Down

0 comments on commit c027cb0

Please sign in to comment.