WIP: finalized draft of # 03 - Handling cumulative values and estimat…

…ing missing quarters dataframe to be tested and then execute pivot_longer
gp1981 · May 6, 2024 · 7c1e3c6 · 7c1e3c6
1 parent f1863f0
commit 7c1e3c6
Showing 1 changed file with 102 additions and 36 deletions.
diff --git a/code/Functions/data_retrieval.R b/code/Functions/data_retrieval.R
@@ -702,11 +702,11 @@ CF_std <- function(df_Facts) {
     )
 
   # 03 - Handling cumulative values and estimating missing quarters ------------------------------------------------------
-     
+
   # Perform additional data processing to identify cumulative values and estimate values for missing quarters.
   # It then calculates the quarterly value by subtracting the lead value from the current value over the corresponding quarter.
 
- # Identify cumulative values in val
+  # Identify cumulative values in val
   df_std_CF <- df_std_CF %>%
     group_by(description, year_end) %>%
     arrange(desc(quarter_end), desc(quarter_start)) %>%
@@ -753,7 +753,7 @@ CF_std <- function(df_Facts) {
     ) %>% 
     ungroup() %>% 
     select(end, standardized_cashflow_label, val, Quarterly_val, description, everything())  
-
+  
 
   # 03 - Pivot df_std_CF in a dataframe format ------------------------------------------------------
   # This code transforms the data from a long format with multiple rows per observation to a wide format where each observation is represented by a single row with columns corresponding to different Concepts
@@ -884,7 +884,8 @@ IS_CF_std <- function(df_Facts) {
   df_std_IS_CF <- df_std_IS_CF %>%
     mutate(end = as.Date(end),
            start = as.Date(start),
-           filed = as.Date(filed))
+           filed = as.Date(filed),
+           val = as.numeric(val))
 
   df_std_IS_CF <- df_std_IS_CF %>%
     # Filter out rows without standardized_label
@@ -908,45 +909,36 @@ IS_CF_std <- function(df_Facts) {
     # Remove grouping
     ungroup() 
 
-  # Split the 'end' column into 'year_end/start' and 'quarter_end/start'
+  # Split the 'end' and 'start' columns into 'year_end/start' and 'quarter_end/start'
   df_std_IS_CF <- df_std_IS_CF %>%
     mutate(
       year_end = as.integer(lubridate::year(end)),
       quarter_end = as.integer(lubridate::quarter(end)),
-      year_start = as.integer(lubridate::year(start)),
-      quarter_start = as.integer(lubridate::quarter(start))
-    )
-
-  # Add  fame_start_year and frame_start_quarter and quarter_start and quarter_end
-  df_std_IS_CF <- df_std_IS_CF %>% 
-    mutate(
-      year_start = ifelse(is.na(year_start),year_end,year_start),
+      year_start = case_when(
+        Financial.Report == "BS" & is.na(as.integer(lubridate::year(start))) ~ as.integer(lubridate::year(end)), TRUE ~ as.integer(lubridate::year(start))),               
       quarter_start = case_when(
-        is.na(quarter_start) & Aggregated == "NO" ~ as.integer(quarter_end),
-        is.na(quarter_start) & Aggregated == "YES"  & fp == "FY" ~ 1,
-        TRUE ~ as.integer(quarter_start)
-      )
-    )
+        Financial.Report == "BS" & is.na(as.integer(lubridate::quarter(start))) ~ as.integer(lubridate::quarter(end)), TRUE ~ as.integer(lubridate::quarter(start)))   
+    ) 
 
   # 03 - Handling cumulative values and estimating missing quarters ------------------------------------------------------
 
   # Perform additional data processing to identify cumulative values and estimate values for missing quarters.
   # It then calculates the quarterly value by subtracting the lead value from the current value over the corresponding quarter.
 
-  # Identify cumulative values in val
+  # Estimate cumulative values in val
   df_std_IS_CF <- df_std_IS_CF %>%
     group_by(description, year_end) %>%
     arrange(desc(quarter_end), desc(quarter_start)) %>%
     mutate(
-      Is_Cumulative = ifelse(
-        year_end == year_start & quarter_end == quarter_start,
-        "NO", "YES")
-    ) %>% 
+      Cumulative_quarters = case_when(
+        year_end == year_start & quarter_end != quarter_start ~ quarter_end - quarter_start + 1,
+        TRUE ~ 1)
+    )%>% 
     ungroup()
 
   # Clean up duplicated val from multiple filings retaining the rows with the most recent "filed" date.
   df_std_IS_CF <- df_std_IS_CF %>%
-    group_by(description, end) %>% 
+    group_by(description, quarter_end, quarter_start) %>% 
     arrange(desc(filed)) %>%  # Arrange by descending "filed" date
     distinct(description, end, .keep_all = TRUE) %>% # Keep only the first occurrence of each unique combination of description and end, preserving the one with the most recent "filed" date
     ungroup()  
@@ -962,27 +954,101 @@ IS_CF_std <- function(df_Facts) {
     ) %>% 
     ungroup()
 
+  # df_std_IS_CF <- df_std_IS_CF %>% 
+  #   left_join(df_std_IS_CF_quarter_summary, by = "description")
+  # 
+  # Calculate initial Quarterly_val based on the formula within group_by description and year_end 
   df_std_IS_CF <- df_std_IS_CF %>% 
-    left_join(df_std_IS_CF_quarter_summary, by = "description")
+    group_by(description, year_end) %>%
+    arrange(desc(Cumulative_quarters), desc(quarter_end),) %>% 
+    mutate(
+      Quarterly_val = val / Cumulative_quarters,
+      Count_rows = n()
+    ) %>% 
+    ungroup()
 
-  # Calculate initial Quarterly_val based on the formula within group_by description and year_end >>>>>> QUARTERLY VALUE TO BE CALCULATED PROPERLY DEPENDING ON THE ROWS IN DF AND THEIR QUARTER_END - QUARTER_START <<<<<<<<<<<<<<<<<<
+  # Adjust Quarterly_val based on existing of cumulative values
   df_std_IS_CF <- df_std_IS_CF %>% 
-    group_by(description, year_end) %>%  
+    group_by(description, year_end) %>% 
+    arrange(desc(Cumulative_quarters), desc(quarter_end),) %>% 
     mutate(
-      Quarterly_val = as.numeric(val) / (as.numeric(quarter_end) - as.numeric(quarter_start) + 1)
-    )
+      Count_rows = n(),
+      Quarterly_val = case_when(  
+        Cumulative_quarters >= 2 & 
+          Count_rows > 1 &
+          !is.na(lead(quarter_end)) &
+          !is.na(lead(quarter_start)) &
+          quarter_end == quarter_start + 3 &
+          quarter_end == lead(quarter_end) + 1 &
+          quarter_start == 1 & 
+          lead(quarter_start) == 1 ~ val - lead(val),
+        Cumulative_quarters >= 2 & 
+          Count_rows > 1 &
+          !is.na(lead(quarter_end)) &
+          !is.na(lead(quarter_start)) &
+          quarter_end == quarter_start + 2 &
+          quarter_end == lead(quarter_end) + 1 &
+          quarter_start == 1 & 
+          lead(quarter_start) == 1 ~ val - lead(val),
+        Cumulative_quarters >= 2 & 
+          Count_rows > 1 &
+          !is.na(lead(quarter_end)) &
+          !is.na(lead(quarter_start)) &
+          quarter_end == quarter_start +1 &
+          quarter_end == lead(quarter_end) + 1 &
+          quarter_start == 1 & 
+          lead(quarter_start) == 1 ~ val - lead(val),
+        Cumulative_quarters == 1 ~ Quarterly_val,
+        TRUE ~ Quarterly_val),
+    ) %>% 
+    ungroup() 
 
-  # Adjust Quarterly_val based on esistance of cumulative values
+  # Detect Quarterly_val modified based on existing of cumulative values
   df_std_IS_CF <- df_std_IS_CF %>% 
-    group_by(description, year_end) %>%  
+    group_by(description, year_end) %>% 
+    arrange(desc(Cumulative_quarters), desc(quarter_end),) %>% 
     mutate(
-      Quarterly_val = case_when(  
-        Is_Cumulative == "YES" & Aggregated == "YES" & quarter_end != quarter_start & !is.na(dplyr::lead(val))  ~ as.numeric((val - dplyr::lead(val)) / (quarter_end - dplyr::lead(quarter_end))),  
-        TRUE ~ Quarterly_val  # For all other cases, retain the initial Quarterly_val
-      )
+      Count_rows = n(),
+      Modified_Quarterly_val = case_when(  
+        Cumulative_quarters >= 2 & 
+          Count_rows > 1 &
+          !is.na(lead(quarter_end)) &
+          !is.na(lead(quarter_start)) &
+          quarter_end == quarter_start + 3 &
+          quarter_end == lead(quarter_end) + 1 &
+          quarter_start == 1 & 
+          lead(quarter_start) == 1 ~ TRUE,
+        Cumulative_quarters >= 2 & 
+          Count_rows > 1 &
+          !is.na(lead(quarter_end)) &
+          !is.na(lead(quarter_start)) &
+          quarter_end == quarter_start + 2 &
+          quarter_end == lead(quarter_end) + 1 &
+          quarter_start == 1 & 
+          lead(quarter_start) == 1 ~ TRUE,
+        Cumulative_quarters >= 2 & 
+          Count_rows > 1 &
+          !is.na(lead(quarter_end)) &
+          !is.na(lead(quarter_start)) &
+          quarter_end == quarter_start +1 &
+          quarter_end == lead(quarter_end) + 1 &
+          quarter_start == 1 & 
+          lead(quarter_start) == 1 ~ TRUE,
+        Cumulative_quarters == 1 ~ FALSE,
+        TRUE ~ FALSE),
     ) %>% 
     ungroup() %>% 
-    select(end, standardized_label, val, Quarterly_val, description, everything())  
+    select(end, standardized_label, val, Quarterly_val, year_end, quarter_end, quarter_start, everything())  
+
+ # Filter those rows where Quarterly_val has been properly calculated from cumulative val
+  df_std_IS_CF <- df_std_IS_CF %>% 
+    group_by(description, year_end) %>% 
+    arrange(desc(Cumulative_quarters), desc(quarter_end),) %>% 
+    filter((Cumulative_quarters ==1 & !Modified_Quarterly_val) | (Cumulative_quarters !=1 & Modified_Quarterly_val))
+
+  # Prepare dataframe for pivot
+  df_std_IS_CF_pivot <- df_std_IS_CF %>%
+    distinct(description, year_end, quarter_end, Quarterly_val, .keep_all = TRUE)
 
 
   # 03 - Pivot df_std_CF in a dataframe format ------------------------------------------------------