gp1981
diff --git a/‎code/Functions/data_retrieval.R
+87-38 b/‎code/Functions/data_retrieval.R
+87-38
diff --git a/‎code/Functions/utils.R
+1-2 b/‎code/Functions/utils.R
+1-2
diff --git a/‎data/standardized_balancesheet.xlsx
-455 Bytes b/‎data/standardized_balancesheet.xlsx
-455 Bytes
diff --git a/‎data/standardized_balancesheet_original.xlsx
-15.7 KB b/‎data/standardized_balancesheet_original.xlsx
-15.7 KB
@@ -87,68 +87,117 @@ bs_std <- function(df_Facts) {
     left_join(standardized_balancesheet, by = "label") %>% 
     select(standardized_balancesheet_label, everything(), -df_Fact_Description)
 
-  # For the same "fy", "fp", we need to add to df_std_BS the following rows before the pivot_wider:
-  #   
-  #   new row: if Total Current Assets does not exist then it creates a new row whose val is Total Assets - Total Long Term Assets and standardized_balancesheet_label is Total Current Assets
-  # 
-  # new row: if Other Current Assets does not exist then it creates a new row whose val is Total Current Assets - (Cash and Cash Equivalent + Marketable Securities, Current  + Total Accounts Receivable + Total Inventory) and standardized_balancesheet_label is Other Current Assets 
-  # 
-  # new row: if Total Long Term Assets does not exist then it creates a new row whose val is  val of Total Assets - val of Total Current Assets and standardized_balancesheet_label is Total Long Term Assets
-  # 
-  # new row: if Other Non Current Assets  does not exist then it creates a new row whose val is val of  Total Long Term Assets - val of (Marketable Securities, Non Current + Property, Plant and Equipment+ Intangible Assets (excl. goodwill) + Goodwill) and standardized_balancesheet_label is Other Non Current Assets 
-  # 
-  # new row: if Total Current Liabilities  does not exist then it creates a new row whose val is  val of Liabilities - val of Liabilities, Non Current and standardized_balancesheet_label is Total Current Liabilities 
-  # 
-  # new row: if Other Current Liabilities does not exist then it creates a new row whose val is val of  Total Current Liabilities - val of  (Accounts Payable, Current + Taxes Payable, Current + Commercial Paper + Long Term Debt, Current Maturities + Operating Lease, Liability, Current + Finance Lease, Liability, Current) and standardized_balancesheet_label is  Other Current Liabilities
-  # 
-  # new row: if Total Long Term Liabilities does not exist then it creates a new row whose val is  val of Total Liabilities - val of Total Current Liabilities and standardized_balancesheet_label is  Total Long Term Liabilities
-  # 
-  # 
-  # new row: if Other Long Term Liabilities does not exist then it creates a new row whose val is val of  Total Long Term Liabilities - val of (Long Term Debts - Operating Lease, Liability, Non Current + Finance Lease, Liability, Non Current and standardized_balancesheet_label is  Other Long Term Liabilities 
-  #                                                                                                                                             
-  #                                                                                                                                             new row: if Other Stockholders Equity does not exist then it creates a new row whose val is val of   Total Company Stockholders Equity - val of  (Common Stock + Additional Paid in Capital + Preferred Stock + Retained Earnings + Accumulated other comprehensive income (loss)) and standardized_balancesheet_label is  Other Stockholders Equity 
+  # 01 - Data cleaning ------------------------------------------------------
+  # Within each (fy, fp, label) group of df_std_BS: if any row comes from an amended filing (a 'form' ending in "/A"), only the amended rows are kept; groups with no amended filing are kept unchanged. It then selects the relevant columns, arranges by descending 'end' date, and for each unique 'val' keeps only the row with the most recent 'end' date.
 
-  
-  # Filter out records not associated with standardized_balancesheet to create the mapping with df_Facts
-  df_std_BS_map <- df_std_BS %>%
-    filter(!is.na(standardized_balancesheet_label)) %>% 
-    select(standardized_balancesheet_label, label, description) %>% 
-    distinct()
-  
-  # Clean up the df_std_BS by retaining the latest amended financials with form e.g. "10K/A"
   df_std_BS <- df_std_BS %>%
+    # Filter out rows without standardized_balancesheet_label
+    filter(!is.na(standardized_balancesheet_label)) %>% 
+    # Group by fiscal year (fy), fiscal period (fp), and label
     group_by(fy, fp, label) %>%
+    # Arrange by descending end date within each group
     arrange(desc(end)) %>%
+    # Add a column indicating if any row in the group has a form ending with /A
     mutate(
       has_form_A = any(grepl("/A$", form))
     ) %>%
+    # Filter rows based on the condition:
+    # - Groups with no /A form: retain every row
+    # - Groups with at least one /A form: retain only the rows whose form ends with /A
     filter(!has_form_A | (has_form_A & grepl("/A$", form))) %>%
+    # Remove the temporary column
     select(-has_form_A) %>%
-    ungroup() %>%
-    select(-label, -description, standardized_balancesheet_label, end, val, fy, fp, form, filed, start)
+    # Select relevant columns
+    select(standardized_balancesheet_label, end, val, fy, fp, form, filed, start) %>%
+    # Arrange by descending end date
+    arrange(desc(end)) %>% 
+    # Remove grouping
+    ungroup() %>% 
+    # Remove column label
+    select(-label) %>%
+    # Group by val and arrange by descending end date within each group
+    group_by(val) %>%
+    arrange(desc(end)) %>%
+    # Retain only the first row within each group
+    slice_head(n = 1) %>%
+    # Remove grouping
+    ungroup()
+
+  # Ensures that for each standardized_balancesheet_label and end combination, only the row with the most recent filing date is retained
 
+  # Clean the dataframe, keeping only the most recently filed form for each end period
   df_std_BS <- df_std_BS %>%
+    # Filter out rows without standardized_balancesheet_label
     filter(!is.na(standardized_balancesheet_label)) %>% 
-    mutate(end = ymd(end), filed = ymd(filed)) %>%  # convert to date format
+    # Convert 'end' and 'filed' columns to date format using lubridate (ymd function)
+    mutate(end = ymd(end), filed = ymd(filed)) %>%
+    # Group by standardized_balancesheet_label and end date
     group_by(standardized_balancesheet_label, end) %>%
-    filter(filed == max(filed)) %>%  # filter rows with the most recent filing date
+    # Filter rows with the most recent filing date within each group
+    filter(filed == max(filed)) %>%
+    # Remove grouping to perform further operations
     ungroup() %>%
+    # Select relevant columns
     select(standardized_balancesheet_label, end, val, fy, fp, form, filed, start)
 
+  # 02 - Mapping with df_Facts--------------------------------------------------------------
+  # Creates a mapping (df_std_BS_map) by matching standardized_balancesheet_label from df_std_BS with the corresponding description from df_Facts.
 
-  # Build df_std_BS dataframe pivoting the standardized labels into columns
-  df_std_BS_pivoted <- df_std_BS%>%
+  # Create a map with df_Facts
+  df_std_BS_map <- df_std_BS %>%
+    # Select the standardized_balancesheet_label column
+    select(standardized_balancesheet_label) %>% 
+    # Rename the column to 'label'
+    rename(label = standardized_balancesheet_label) %>% 
+    # Perform a left join with df_Facts using the 'label' column
+    left_join(df_Facts, by = "label") %>%
+    # Rename the 'label' column back to 'standardized_balancesheet_label'
+    rename(standardized_balancesheet_label = label) %>%
+    # Select columns for mapping
+    select(standardized_balancesheet_label, description) %>% 
+    # Retain distinct combinations
+    distinct()
+  
+  # 03 - Pivot df_std_BS in a dataframe format -----------------------------------
+  # Transforms df_std_BS from a long format (one row per standardized label and period) to a wide format in which each standardized label becomes its own column holding the corresponding 'val'.
 
-  # 02 - Pivot columns of df_std_BS ---------------------------------------------------
   # Build df_std_BS dataframe pivoting the standardized labels into columns
-  df_std_BS_pivoted <- df_std_BS_cleaned %>%
-   pivot_wider(
+  df_std_BS <- df_std_BS %>%
+    # Pivot the data using standardized_balancesheet_label as column names
+    pivot_wider(
       names_from = standardized_balancesheet_label,
       values_from = val
     ) %>%
+    # Arrange the dataframe in descending order based on the 'end' column
     arrange(desc(end))
 
-  return(df_std_BS_pivoted)
+  # >>>---- CONTINUE HERE - CHECK RESULTS ----<<<<<
+  # There are instances in which a filing includes a comparison with the previous reporting period. In such instances, additional details of the previous reporting period are included. The following code merges the rows referring to the same period end where these additional details are provided.
+  
+  # 04 - Add missing columns (Facts) ----------------------------------------
+  
+  # For the same "fy", "fp", we need to add to df_std_BS the following rows before the pivot_wider:
+  #   
+  #   new row: if Total Current Assets does not exist then it creates a new row whose val is Total Assets - Total Long Term Assets and standardized_balancesheet_label is Total Current Assets
+  # 
+  # new row: if Other Current Assets does not exist then it creates a new row whose val is Total Current Assets - (Cash and Cash Equivalent + Marketable Securities, Current  + Total Accounts Receivable + Total Inventory) and standardized_balancesheet_label is Other Current Assets 
+  # 
+  # new row: if Total Long Term Assets does not exist then it creates a new row whose val is  val of Total Assets - val of Total Current Assets and standardized_balancesheet_label is Total Long Term Assets
+  # 
+  # new row: if Other Non Current Assets  does not exist then it creates a new row whose val is val of  Total Long Term Assets - val of (Marketable Securities, Non Current + Property, Plant and Equipment+ Intangible Assets (excl. goodwill) + Goodwill) and standardized_balancesheet_label is Other Non Current Assets 
+  # 
+  # new row: if Total Current Liabilities  does not exist then it creates a new row whose val is  val of Liabilities - val of Liabilities, Non Current and standardized_balancesheet_label is Total Current Liabilities 
+  # 
+  # new row: if Other Current Liabilities does not exist then it creates a new row whose val is val of  Total Current Liabilities - val of  (Accounts Payable, Current + Taxes Payable, Current + Commercial Paper + Long Term Debt, Current Maturities + Operating Lease, Liability, Current + Finance Lease, Liability, Current) and standardized_balancesheet_label is  Other Current Liabilities
+  # 
+  # new row: if Total Long Term Liabilities does not exist then it creates a new row whose val is  val of Total Liabilities - val of Total Current Liabilities and standardized_balancesheet_label is  Total Long Term Liabilities
+  # 
+  # 
+  # new row: if Other Long Term Liabilities does not exist then it creates a new row whose val is val of  Total Long Term Liabilities - val of (Long Term Debts + Operating Lease, Liability, Non Current + Finance Lease, Liability, Non Current) and standardized_balancesheet_label is  Other Long Term Liabilities
+  #
+  # new row: if Other Stockholders Equity does not exist then it creates a new row whose val is val of   Total Company Stockholders Equity - val of  (Common Stock + Additional Paid in Capital + Preferred Stock + Retained Earnings + Accumulated other comprehensive income (loss)) and standardized_balancesheet_label is  Other Stockholders Equity   
+  
+  return(df_std_BS)
 }
 
 
 
@@ -15,5 +15,4 @@ unnest_list <- function(x) {
       .x
     }
   })
-}
-
+}
Original file line number	Diff line number	Diff line change
`@@ -15,5 +15,4 @@ unnest_list <- function(x) {`
`15`	`15`	`.x`
`16`	`16`	`}`
`17`	`17`	`})`
`18`		`-}`
`19`		`-`
	`18`	`+}`