Skip to content

Commit

Permalink
2022 Data Update (#23)
Browse files Browse the repository at this point in the history
* 2022 has different column name for levy_plus_loss "levy+loss"

* update agency

make sheet explicit, update across syntax, add 2022 column names

* Update cpihistory.pdf

* Switching to pdftools

Pretty sure this is the same but didn't want to use noncran tabulizer

* from press release

* remove tabulizer

* add 2022

* add excel conversions

* update 2006 to 2012 to excel versions

* add 2022 tax code

* sample 2022 bills

* update with pdftools

* lint /style
  • Loading branch information
erhla authored and dfsnow committed Jan 19, 2024
1 parent 84a46d3 commit 4c9bc27
Show file tree
Hide file tree
Showing 29 changed files with 202 additions and 145 deletions.
3 changes: 3 additions & 0 deletions data-raw/agency/Agency Rate Report 2022.xlsx
Git LFS file not shown
20 changes: 10 additions & 10 deletions data-raw/agency/agency.R
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,6 @@ file_names <- list.files(
)





# agency_fund ------------------------------------------------------------------

# Load the detail sheet from each agency file. This includes the levy and rate
Expand All @@ -64,7 +61,7 @@ agency_fund <- map_dfr(file_names, function(file) {
"loss", "loss_percent", "fund_loss"
))) %>%
rename_with(~"levy_plus_loss", any_of(c(
"levy_and_loss", "fund_levy_plus_loss"
"levy_and_loss", "fund_levy_plus_loss", "levy_loss"
))) %>%
rename_with(~"rate_ceiling", any_of(c(
"ceiling", "rate_ceiling", "fund_rate_ceiling"
Expand Down Expand Up @@ -189,7 +186,7 @@ arrow::write_parquet(
# EAV, final extension, and much more
agency <- map_dfr(file_names, function(file) {
message("Reading: ", file)
readxl::read_xlsx(file) %>%
readxl::read_xlsx(file, sheet = 1) %>%
set_names(snakecase::to_snake_case(names(.))) %>%
mutate(
across(
Expand Down Expand Up @@ -235,9 +232,12 @@ agency <- map_dfr(file_names, function(file) {
"reduction_percent", "reduction_factor", "clerk_reduction_factor"
))) %>%
rename_with(~"total_non_cap_ext", any_of(c(
"total_non_cap_ext", "final_non_cap_ext"
"total_non_cap_ext", "final_non_cap_ext", "total_non_cap_extension"
))) %>%
rename_with(~"total_ext", any_of(c(
"total_ext", "final_ext",
"grand_total_ext"
))) %>%
rename_with(~"total_ext", any_of(c("total_ext", "final_ext"))) %>%
# Select, order, and rename columns
select(
year,
Expand Down Expand Up @@ -296,20 +296,20 @@ agency <- map_dfr(file_names, function(file) {
arrange(year, agency_num) %>%
# Coerce columns to expected types
mutate(
across(c(year), as.character),
across(c(year), ~ as.character(.x)),
across(
c(
lim_numerator, lim_denominator, prior_eav:cty_total_eav,
total_levy, total_max_levy, total_reduced_levy, total_final_levy
),
as.integer64
~ as.integer64(.x)
),
across(
c(
lim_rate, pct_burden, total_prelim_rate, total_final_rate,
reduction_pct, total_non_cap_ext, total_ext
),
as.double
~ as.double(.x)
)
)

Expand Down
4 changes: 2 additions & 2 deletions data-raw/agency/tif_agency_names.csv
Git LFS file not shown
26 changes: 16 additions & 10 deletions data-raw/cpi/cpi.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
library(arrow)
library(dplyr)
library(miniUI)
library(tabulizer)
library(pdftools)
library(tidyr)
library(stringr)

Expand All @@ -14,27 +14,33 @@ row_to_names <- function(df) {
# The goal of this script is to create a data frame of the Consumer Price
# Index (CPI-U) values used by PTELL to calculate/cap property tax extensions.
# We can load the historical CPIs from a PDF provided by the State of Illinois:
# https://tax.illinois.gov/content/dam/soi/en/web/tax/localgovernments/property/documents/cpihistory.pdf

# Paths for local raw data storage and remote storage on S3
remote_bucket <- Sys.getenv("S3_REMOTE_BUCKET")
remote_path <- file.path(remote_bucket, "cpi", "part-0.parquet")

# Extract the table only (no headers), then manually assign header
cpi_ext <- extract_areas(file = "data-raw/cpi/cpihistory.pdf")[[1]]
cpi <- as_tibble(cpi_ext[, c(1, 2, 4, 5, 6)])
cpi <- setNames(cpi, c("year", "cpi", "ptell_cook", "comments", "levy_year"))
cpi <- pdftools::pdf_text(pdf = "data-raw/cpi/cpihistory.pdf") %>%
str_extract(., regex("1991.*", dotall = TRUE)) %>%
str_remove_all(., "\\(5 % for Cook\\)") %>%
str_split(., "\n") %>%
unlist() %>%
tibble(vals = `.`) %>%
mutate(vals = str_squish(vals)) %>%
separate_wider_delim(
col = vals,
names = c("year", "cpi", "pct", "ptell_cook", "levy_year", "year_paid"),
delim = " ", too_few = "align_start", too_many = "drop"
)

# Merge Cook rate into main column
cpi <- cpi %>%
mutate(
across(c(year, levy_year), as.character),
across(c(cpi), as.numeric),
across(c(ptell_cook, comments), readr::parse_number),
ptell_cook = ifelse(!is.na(comments), comments, ptell_cook),
across(c(ptell_cook), readr::parse_number),
ptell_cook = ptell_cook / 100
) %>%
select(-comments) %>%
filter(year != "1991") %>%
filter(year != "1991", year != "", year != "CPI") %>%
arrange(year)

# Write to S3
Expand Down
Binary file modified data-raw/cpi/cpihistory.pdf
Binary file not shown.
4 changes: 2 additions & 2 deletions data-raw/eq_factor/eq_factor.csv
Git LFS file not shown
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
50 changes: 37 additions & 13 deletions data-raw/sample_tax_bills/sample_tax_bills_detail.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
library(dplyr)
library(tidyr)
library(tabulizer)
library(pdftools)
library(miniUI)
library(stringr)
library(purrr)
Expand All @@ -25,19 +25,43 @@ row_to_names <- function(df) {


# Different tax bills can have different table sizes depending on the number of
# taxing district. As such, the table bottom boundary will be different for each
# bill. Here we manually specify the area of table using an interactive widget
# taxing districts.
extract_tax_bill <- function(file) {
base_file <- basename(file)

# Scan table into memory
tbl <- tabulizer::extract_areas(file = file, pages = 1)[[1]] %>%
as_tibble() %>%
row_to_names() %>%
set_names(
c("agency_name", "final_tax", "rate", "percent", "pension", "prev_tax")
tbl <- pdf_text(file)[[1]] %>%
str_extract(., regex("MISCELLANEOUS TAXES.*", dotall = TRUE)) %>%
str_split(., "\n") %>%
unlist() %>%
tibble(vals = `.`) %>%
mutate(vals = str_replace_all(vals, "[:space:]{2,}", "\t")) %>%
separate_wider_delim(
col = vals,
names = c(
"agency_name", "final_tax", "rate", "percent",
"pension", "prev_tax"
),
delim = "\t", too_few = "align_start", too_many = "drop"
) %>%
mutate(
agency_name = str_squish(agency_name),
flag = is.na(prev_tax),
prev_tax = if_else(flag,
pension,
prev_tax
),
pension = if_else(flag,
NA,
pension
)
) %>%
select(-flag) %>%
filter(
agency_name != "",
!str_detect(
agency_name,
"TAXES|Assess|Property|EAV|Local Tax|Total Tax|Do not|Equalizer|cookcountyclerk.com"
)
)

# Create a list with metadata for output
out <- list(
year = str_sub(base_file, 1, 4),
Expand Down Expand Up @@ -91,8 +115,8 @@ bills_df <- bills_df %>%
# Round numeric values to the nearest hundredth (rate to the nearest thousandth)
bills_df <- bills_df %>%
mutate(
across(c(final_tax, percent, pension, prev_tax), round, 2),
across(c(rate), round, 3),
across(c(final_tax, percent, pension, prev_tax), ~ round(.x, 2)),
across(c(rate), ~ round(.x, 3)),
)

# Write detail results to file for safekeeping
Expand Down
4 changes: 2 additions & 2 deletions data-raw/sample_tax_bills/sample_tax_bills_detail.csv
Git LFS file not shown
3 changes: 3 additions & 0 deletions data-raw/tax_code/2022 Tax Code Agency Rate.xlsx
Git LFS file not shown
9 changes: 9 additions & 0 deletions data-raw/tax_code/tax_code.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ file_names <- list.files(
# Load each file and cleanup columns, then combine into single df
tax_code <- map_dfr(file_names, function(file) {
# Extract year from file name
print(file)
year_ext <- str_extract(file, "\\d{4}")

# Load file based on extension
Expand All @@ -39,6 +40,14 @@ tax_code <- map_dfr(file_names, function(file) {
~ str_replace(.x, "taxcode", "tax_code"),
starts_with("taxcode")
) %>%
rename_with(
~ str_replace(.x, "ag_rate", "agency_rate"),
starts_with("ag_rate")
) %>%
rename_with(
~ str_replace(.x, "code_rate", "tax_code_rate"),
starts_with("code_rate")
) %>%
mutate(
year = as.character(year_ext),
agency_rate = as.numeric(agency_rate),
Expand Down
Git LFS file not shown
3 changes: 3 additions & 0 deletions data-raw/tif/main/2006 Cook TIF Summary.xlsx
Git LFS file not shown
3 changes: 3 additions & 0 deletions data-raw/tif/main/2007 Cook TIF Summary.xlsx
Git LFS file not shown
3 changes: 3 additions & 0 deletions data-raw/tif/main/2008 Cook TIF Summary.xlsx
Git LFS file not shown
3 changes: 3 additions & 0 deletions data-raw/tif/main/2009 Cook TIF Summary.xlsx
Git LFS file not shown
3 changes: 3 additions & 0 deletions data-raw/tif/main/2010 Cook TIF Summary.xlsx
Git LFS file not shown
3 changes: 3 additions & 0 deletions data-raw/tif/main/2011 Cook TIF Summary.xlsx
Git LFS file not shown
3 changes: 3 additions & 0 deletions data-raw/tif/main/2012 Cook TIF Summary.xlsx
Git LFS file not shown
3 changes: 3 additions & 0 deletions data-raw/tif/main/2022 Cook County TIF Summary.xlsx
Git LFS file not shown
Loading

0 comments on commit 4c9bc27

Please sign in to comment.