2022 Data Update (#23)

* 2022 has a different column name for levy_plus_loss: "levy+loss"
* update agency: make sheet explicit, update across syntax, add 2022 column names
* Update cpihistory.pdf
* Switching to pdftools. Pretty sure this is the same, but didn't want to use the non-CRAN tabulizer
* from press release
* remove tabulizer
* add 2022
* add excel conversions
* update 2006 to 2012 to excel versions
* add 2022 tax code
* sample 2022 bills
* update with pdftools
* lint/style
ccao-data · Jan 19, 2024 · 4c9bc27 · 4c9bc27
1 parent 84a46d3
commit 4c9bc27
Show file tree

Hide file tree

Showing 29 changed files with 202 additions and 145 deletions.
diff --git a/data-raw/agency/Agency Rate Report 2022.xlsx b/data-raw/agency/Agency Rate Report 2022.xlsx
diff --git a/data-raw/agency/agency.R b/data-raw/agency/agency.R
@@ -43,9 +43,6 @@ file_names <- list.files(
 )
 
 
-
-
-
 # agency_fund ------------------------------------------------------------------
 
 # Load the detail sheet from each agency file. This includes the levy and rate
@@ -64,7 +61,7 @@ agency_fund <- map_dfr(file_names, function(file) {
       "loss", "loss_percent", "fund_loss"
     ))) %>%
     rename_with(~"levy_plus_loss", any_of(c(
-      "levy_and_loss", "fund_levy_plus_loss"
+      "levy_and_loss", "fund_levy_plus_loss", "levy_loss"
     ))) %>%
     rename_with(~"rate_ceiling", any_of(c(
       "ceiling", "rate_ceiling", "fund_rate_ceiling"
@@ -189,7 +186,7 @@ arrow::write_parquet(
 # EAV, final extension, and much more
 agency <- map_dfr(file_names, function(file) {
   message("Reading: ", file)
-  readxl::read_xlsx(file) %>%
+  readxl::read_xlsx(file, sheet = 1) %>%
     set_names(snakecase::to_snake_case(names(.))) %>%
     mutate(
       across(
@@ -235,9 +232,12 @@ agency <- map_dfr(file_names, function(file) {
       "reduction_percent", "reduction_factor", "clerk_reduction_factor"
     ))) %>%
     rename_with(~"total_non_cap_ext", any_of(c(
-      "total_non_cap_ext", "final_non_cap_ext"
+      "total_non_cap_ext", "final_non_cap_ext", "total_non_cap_extension"
+    ))) %>%
+    rename_with(~"total_ext", any_of(c(
+      "total_ext", "final_ext",
+      "grand_total_ext"
     ))) %>%
-    rename_with(~"total_ext", any_of(c("total_ext", "final_ext"))) %>%
     # Select, order, and rename columns
     select(
       year,
@@ -296,20 +296,20 @@ agency <- map_dfr(file_names, function(file) {
   arrange(year, agency_num) %>%
   # Coerce columns to expected types
   mutate(
-    across(c(year), as.character),
+    across(c(year), ~ as.character(.x)),
     across(
       c(
         lim_numerator, lim_denominator, prior_eav:cty_total_eav,
         total_levy, total_max_levy, total_reduced_levy, total_final_levy
       ),
-      as.integer64
+      ~ as.integer64(.x)
     ),
     across(
       c(
         lim_rate, pct_burden, total_prelim_rate, total_final_rate,
         reduction_pct, total_non_cap_ext, total_ext
       ),
-      as.double
+      ~ as.double(.x)
     )
   )
 

diff --git a/data-raw/agency/tif_agency_names.csv b/data-raw/agency/tif_agency_names.csv
diff --git a/data-raw/cpi/cpi.R b/data-raw/cpi/cpi.R
@@ -1,7 +1,7 @@
 library(arrow)
 library(dplyr)
 library(miniUI)
-library(tabulizer)
+library(pdftools)
 library(tidyr)
 library(stringr)
 
@@ -14,27 +14,33 @@ row_to_names <- function(df) {
 # The goal of this script is to create a data frame of Consumer Price Indices
 # (CPI-U) used by PTELL to calculate/cap property tax extensions
 # We can load the historical CPIs from a PDF provided by the State of Illinois
+# https://tax.illinois.gov/content/dam/soi/en/web/tax/localgovernments/property/documents/cpihistory.pdf
 
 # Paths for local raw data storage and remote storage on S3
 remote_bucket <- Sys.getenv("S3_REMOTE_BUCKET")
 remote_path <- file.path(remote_bucket, "cpi", "part-0.parquet")
 
-# Extract the table only (no headers), then manually assign header
-cpi_ext <- extract_areas(file = "data-raw/cpi/cpihistory.pdf")[[1]]
-cpi <- as_tibble(cpi_ext[, c(1, 2, 4, 5, 6)])
-cpi <- setNames(cpi, c("year", "cpi", "ptell_cook", "comments", "levy_year"))
+cpi <- pdftools::pdf_text(pdf = "data-raw/cpi/cpihistory.pdf") %>%
+  str_extract(., regex("1991.*", dotall = TRUE)) %>%
+  str_remove_all(., "\\(5 % for Cook\\)") %>%
+  str_split(., "\n") %>%
+  unlist() %>%
+  tibble(vals = `.`) %>%
+  mutate(vals = str_squish(vals)) %>%
+  separate_wider_delim(
+    col = vals,
+    names = c("year", "cpi", "pct", "ptell_cook", "levy_year", "year_paid"),
+    delim = " ", too_few = "align_start", too_many = "drop"
+  )
 
-# Merge Cook rate into main column
 cpi <- cpi %>%
   mutate(
     across(c(year, levy_year), as.character),
     across(c(cpi), as.numeric),
-    across(c(ptell_cook, comments), readr::parse_number),
-    ptell_cook = ifelse(!is.na(comments), comments, ptell_cook),
+    across(c(ptell_cook), readr::parse_number),
     ptell_cook = ptell_cook / 100
   ) %>%
-  select(-comments) %>%
-  filter(year != "1991") %>%
+  filter(year != "1991", year != "", year != "CPI") %>%
   arrange(year)
 
 # Write to S3

diff --git a/data-raw/cpi/cpihistory.pdf b/data-raw/cpi/cpihistory.pdf
diff --git a/data-raw/eq_factor/eq_factor.csv b/data-raw/eq_factor/eq_factor.csv
diff --git a/data-raw/sample_tax_bills/2022_200_04261010740000.pdf b/data-raw/sample_tax_bills/2022_200_04261010740000.pdf
diff --git a/data-raw/sample_tax_bills/2022_202_28244220220000.pdf b/data-raw/sample_tax_bills/2022_202_28244220220000.pdf
diff --git a/data-raw/sample_tax_bills/2022_203_19063120380000.pdf b/data-raw/sample_tax_bills/2022_203_19063120380000.pdf
diff --git a/data-raw/sample_tax_bills/2022_204_02171060120000.pdf b/data-raw/sample_tax_bills/2022_204_02171060120000.pdf
diff --git a/data-raw/sample_tax_bills/2022_205_10252080490000.pdf b/data-raw/sample_tax_bills/2022_205_10252080490000.pdf
diff --git a/data-raw/sample_tax_bills/2022_211_14333001380000.pdf b/data-raw/sample_tax_bills/2022_211_14333001380000.pdf
diff --git a/data-raw/sample_tax_bills/2022_299_14052110241207.pdf b/data-raw/sample_tax_bills/2022_299_14052110241207.pdf
diff --git a/data-raw/sample_tax_bills/2022_299_23222000451009.pdf b/data-raw/sample_tax_bills/2022_299_23222000451009.pdf
diff --git a/data-raw/sample_tax_bills/2022_593_08261020260000.pdf b/data-raw/sample_tax_bills/2022_593_08261020260000.pdf
diff --git a/data-raw/sample_tax_bills/sample_tax_bills_detail.R b/data-raw/sample_tax_bills/sample_tax_bills_detail.R
@@ -1,6 +1,6 @@
 library(dplyr)
 library(tidyr)
-library(tabulizer)
+library(pdftools)
 library(miniUI)
 library(stringr)
 library(purrr)
@@ -25,19 +25,43 @@ row_to_names <- function(df) {
 
 
 # Different tax bills can have different table sizes depending on the number of
-# taxing district. As such, the table bottom boundary will be different for each
-# bill. Here we manually specify the area of table using an interactive widget
+# taxing districts.
 extract_tax_bill <- function(file) {
   base_file <- basename(file)
-
-  # Scan table into memory
-  tbl <- tabulizer::extract_areas(file = file, pages = 1)[[1]] %>%
-    as_tibble() %>%
-    row_to_names() %>%
-    set_names(
-      c("agency_name", "final_tax", "rate", "percent", "pension", "prev_tax")
+  tbl <- pdf_text(file)[[1]] %>%
+    str_extract(., regex("MISCELLANEOUS TAXES.*", dotall = TRUE)) %>%
+    str_split(., "\n") %>%
+    unlist() %>%
+    tibble(vals = `.`) %>%
+    mutate(vals = str_replace_all(vals, "[:space:]{2,}", "\t")) %>%
+    separate_wider_delim(
+      col = vals,
+      names = c(
+        "agency_name", "final_tax", "rate", "percent",
+        "pension", "prev_tax"
+      ),
+      delim = "\t", too_few = "align_start", too_many = "drop"
+    ) %>%
+    mutate(
+      agency_name = str_squish(agency_name),
+      flag = is.na(prev_tax),
+      prev_tax = if_else(flag,
+        pension,
+        prev_tax
+      ),
+      pension = if_else(flag,
+        NA,
+        pension
+      )
+    ) %>%
+    select(-flag) %>%
+    filter(
+      agency_name != "",
+      !str_detect(
+        agency_name,
+        "TAXES|Assess|Property|EAV|Local Tax|Total Tax|Do not|Equalizer|cookcountyclerk.com"
+      )
     )
-
   # Create a list with metadata for output
   out <- list(
     year = str_sub(base_file, 1, 4),
@@ -91,8 +115,8 @@ bills_df <- bills_df %>%
 # Round numeric values to nearest hundredth
 bills_df <- bills_df %>%
   mutate(
-    across(c(final_tax, percent, pension, prev_tax), round, 2),
-    across(c(rate), round, 3),
+    across(c(final_tax, percent, pension, prev_tax), ~ round(.x, 2)),
+    across(c(rate), ~ round(.x, 3)),
   )
 
 # Write detail results to file for safekeeping

diff --git a/data-raw/sample_tax_bills/sample_tax_bills_detail.csv b/data-raw/sample_tax_bills/sample_tax_bills_detail.csv
diff --git a/data-raw/tax_code/2022 Tax Code Agency Rate.xlsx b/data-raw/tax_code/2022 Tax Code Agency Rate.xlsx
diff --git a/data-raw/tax_code/tax_code.R b/data-raw/tax_code/tax_code.R
@@ -23,6 +23,7 @@ file_names <- list.files(
 # Load each file and cleanup columns, then combine into single df
 tax_code <- map_dfr(file_names, function(file) {
   # Extract year from file name
+  print(file)
   year_ext <- str_extract(file, "\\d{4}")
 
   # Load file based on extension
@@ -39,6 +40,14 @@ tax_code <- map_dfr(file_names, function(file) {
       ~ str_replace(.x, "taxcode", "tax_code"),
       starts_with("taxcode")
     ) %>%
+    rename_with(
+      ~ str_replace(.x, "ag_rate", "agency_rate"),
+      starts_with("ag_rate")
+    ) %>%
+    rename_with(
+      ~ str_replace(.x, "code_rate", "tax_code_rate"),
+      starts_with("code_rate")
+    ) %>%
     mutate(
       year = as.character(year_ext),
       agency_rate = as.numeric(agency_rate),

diff --git a/data-raw/tif/distribution/2022 TIF Agency Distribution Report.xlsx b/data-raw/tif/distribution/2022 TIF Agency Distribution Report.xlsx
diff --git a/data-raw/tif/main/2006 Cook TIF Summary.xlsx b/data-raw/tif/main/2006 Cook TIF Summary.xlsx
diff --git a/data-raw/tif/main/2007 Cook TIF Summary.xlsx b/data-raw/tif/main/2007 Cook TIF Summary.xlsx
diff --git a/data-raw/tif/main/2008 Cook TIF Summary.xlsx b/data-raw/tif/main/2008 Cook TIF Summary.xlsx
diff --git a/data-raw/tif/main/2009 Cook TIF Summary.xlsx b/data-raw/tif/main/2009 Cook TIF Summary.xlsx
diff --git a/data-raw/tif/main/2010 Cook TIF Summary.xlsx b/data-raw/tif/main/2010 Cook TIF Summary.xlsx
diff --git a/data-raw/tif/main/2011 Cook TIF Summary.xlsx b/data-raw/tif/main/2011 Cook TIF Summary.xlsx
diff --git a/data-raw/tif/main/2012 Cook TIF Summary.xlsx b/data-raw/tif/main/2012 Cook TIF Summary.xlsx
diff --git a/data-raw/tif/main/2022 Cook County TIF Summary.xlsx b/data-raw/tif/main/2022 Cook County TIF Summary.xlsx