Skip to content

Commit 75f977e

Browse files
committed
bs_std: cleaning the df. New render
Cleaning dataframe. Addressing instances in which the filing includes comparison with previous reporting period. (WIP). New rendering structure.
1 parent ffb5d6a commit 75f977e

23 files changed

+1859
-629
lines changed

code/Functions/data_retrieval.R

+87-38
Original file line numberDiff line numberDiff line change
@@ -87,68 +87,117 @@ bs_std <- function(df_Facts) {
8787
left_join(standardized_balancesheet, by = "label") %>%
8888
select(standardized_balancesheet_label, everything(), -df_Fact_Description)
8989

90-
# For the same "fy", "fp", we need to add to df_std_BS the following rows before the pivot_wider:
91-
#
92-
# new row: if Total Current Assets does not exist then it creates a new row whose val is Total Assets - Total Long Term Assets and standardized_balancesheet_label is Total Current Assets
93-
#
94-
# new row: if Other Current Assets does not exist then it creates a new row whose val is Total Current Assets - (Cash and Cash Equivalent + Marketable Securities, Current + Total Accounts Receivable + Total Inventory) and standardized_balancesheet_label is Other Current Assets
95-
#
96-
# new row: if Total Long Term Assets does not exist then it creates a new row whose val is val of Total Assets - val of Total Current Assets and standardized_balancesheet_label is Total Long Term Assets
97-
#
98-
# new row: if Other Non Current Assets does not exist then it creates a new row whose val is val of Total Long Term Assets - val of (Marketable Securities, Non Current + Property, Plant and Equipment+ Intangible Assets (excl. goodwill) + Goodwill) and standardized_balancesheet_label is Other Non Current Assets
99-
#
100-
# new row: if Total Current Liabilities does not exist then it creates a new row whose val is val of Liabilities - val of Liabilities, Non Current and standardized_balancesheet_label is Total Current Liabilities
101-
#
102-
# new row: if Other Current Liabilities does not exist then it creates a new row whose val is val of Total Current Liabilities - val of (Accounts Payable, Current + Taxes Payable, Current + Commercial Paper + Long Term Debt, Current Maturities + Operating Lease, Liability, Current + Finance Lease, Liability, Current) and standardized_balancesheet_label is Other Current Liabilities
103-
#
104-
# new row: if Total Long Term Liabilities does not exist then it creates a new row whose val is val of Total Liabilities - val of Total Current Liabilities and standardized_balancesheet_label is Total Long Term Liabilities
105-
#
106-
#
107-
# new row: if Other Long Term Liabilities does not exist then it creates a new row whose val is val of Total Long Term Liabilities - val of (Long Term Debts - Operating Lease, Liability, Non Current + Finance Lease, Liability, Non Current and standardized_balancesheet_label is Other Long Term Liabilities
108-
#
109-
# new row: if Other Stockholders Equity does not exist then it creates a new row whose val is val of Total Company Stockholders Equity - val of (Common Stock + Additional Paid in Capital + Preferred Stock + Retained Earnings + Accumulated other comprehensive income (loss)) and standardized_balancesheet_label is Other Stockholders Equity
90+
# 01 - Data cleaning ------------------------------------------------------
91+
# This code filters rows in df_std_BS based on the presence of "/A" in the 'form' column, ensuring rows with "/A" are retained if any in their group have it. It then selects relevant columns, arranges by descending 'end' date, and for each unique 'val', keeps the row with the most recent 'end' date.
11092

111-
112-
# Filter out records not associated with standardized_balancesheet to create the mapping with df_Facts
113-
df_std_BS_map <- df_std_BS %>%
114-
filter(!is.na(standardized_balancesheet_label)) %>%
115-
select(standardized_balancesheet_label, label, description) %>%
116-
distinct()
117-
118-
# Clean up the df_std_BS by retaining the latest amended financials with form e.g. "10K/A"
11993
df_std_BS <- df_std_BS %>%
94+
# Filter out rows without standardized_balancesheet_label
95+
filter(!is.na(standardized_balancesheet_label)) %>%
96+
# Group by fiscal year (fy), fiscal period (fp), and label
12097
group_by(fy, fp, label) %>%
98+
# Arrange by descending end date within each group
12199
arrange(desc(end)) %>%
100+
# Add a column indicating if any row in the group has a form ending with /A
122101
mutate(
123102
has_form_A = any(grepl("/A$", form))
124103
) %>%
104+
# Filter rows based on the condition:
105+
# - Keep every row of a group in which no form ends with /A
106+
# - In a group where at least one form ends with /A, keep only the /A rows
125107
filter(!has_form_A | (has_form_A & grepl("/A$", form))) %>%
108+
# Remove the temporary column
126109
select(-has_form_A) %>%
127-
ungroup() %>%
128-
select(-label, -description, standardized_balancesheet_label, end, val, fy, fp, form, filed, start)
110+
# Select relevant columns
111+
select(standardized_balancesheet_label, end, val, fy, fp, form, filed, start) %>%
112+
# Arrange by descending end date
113+
arrange(desc(end)) %>%
114+
# Remove grouping
115+
ungroup() %>%
116+
# Remove column label
117+
select(-label) %>%
118+
# Group by val and arrange by descending end date within each group
119+
group_by(val) %>%
120+
arrange(desc(end)) %>%
121+
# Retain only the first row within each group
122+
slice_head(n = 1) %>%
123+
# Remove grouping
124+
ungroup()
125+
126+
# Ensures that for each standardized_balancesheet_label and end combination, only the row with the most recent filing date is retained
129127

128+
# Clean the dataframe by keeping only the most recently filed form for each end period
130129
df_std_BS <- df_std_BS %>%
130+
# Filter out rows without standardized_balancesheet_label
131131
filter(!is.na(standardized_balancesheet_label)) %>%
132-
mutate(end = ymd(end), filed = ymd(filed)) %>% # convert to date format
132+
# Convert 'end' and 'filed' columns to date format using lubridate (ymd function)
133+
mutate(end = ymd(end), filed = ymd(filed)) %>%
134+
# Group by standardized_balancesheet_label and end date
133135
group_by(standardized_balancesheet_label, end) %>%
134-
filter(filed == max(filed)) %>% # filter rows with the most recent filing date
136+
# Filter rows with the most recent filing date within each group
137+
filter(filed == max(filed)) %>%
138+
# Remove grouping to perform further operations
135139
ungroup() %>%
140+
# Select relevant columns
136141
select(standardized_balancesheet_label, end, val, fy, fp, form, filed, start)
137142

143+
# 02 - Mapping with df_Facts--------------------------------------------------------------
144+
# Creates a mapping (df_std_BS_map) by matching standardized_balancesheet_label from df_std_BS with the corresponding description from df_Facts.
138145

139-
# Build df_std_BS dataframe pivoting the standardized labels into columns
140-
df_std_BS_pivoted <- df_std_BS%>%
146+
# Create a map with df_Facts
147+
df_std_BS_map <- df_std_BS %>%
148+
# Select the standardized_balancesheet_label column
149+
select(standardized_balancesheet_label) %>%
150+
# Rename the column to 'label'
151+
rename(label = standardized_balancesheet_label) %>%
152+
# Perform a left join with df_Facts using the 'label' column
153+
left_join(df_Facts, by = "label") %>%
154+
# Rename the 'label' column back to 'standardized_balancesheet_label'
155+
rename(standardized_balancesheet_label = label) %>%
156+
# Select columns for mapping
157+
select(standardized_balancesheet_label, description) %>%
158+
# Retain distinct combinations
159+
distinct()
160+
161+
# 03 - Pivot df_std_BS in a dataframe format -----------------------------------
162+
# Transforms your data from a long format with multiple rows per observation to a wide format where each observation is represented by a single row with columns corresponding to different labels
141163

142-
# 02 - Pivot columns of df_std_BS ---------------------------------------------------
143164
# Build df_std_BS dataframe pivoting the standardized labels into columns
144-
df_std_BS_pivoted <- df_std_BS_cleaned %>%
145-
pivot_wider(
165+
df_std_BS <- df_std_BS %>%
166+
# Pivot the data using standardized_balancesheet_label as column names
167+
pivot_wider(
146168
names_from = standardized_balancesheet_label,
147169
values_from = val
148170
) %>%
171+
# Arrange the dataframe in descending order based on the 'end' column
149172
arrange(desc(end))
150173

151-
return(df_std_BS_pivoted)
174+
# >>>---- CONTINUE HERE - CHECK RESULTS ----<<<<<
175+
# There are instances in which the filing includes a comparison with the previous reporting period. In such instances, additional details of the previous reporting period are included. The following code merges the rows referring to the same period end where these additional details are provided.
176+
177+
# 04 - Add missing columns (Facts) ----------------------------------------
178+
179+
# For the same "fy", "fp", we need to add to df_std_BS the following rows before the pivot_wider:
180+
#
181+
# new row: if Total Current Assets does not exist then it creates a new row whose val is Total Assets - Total Long Term Assets and standardized_balancesheet_label is Total Current Assets
182+
#
183+
# new row: if Other Current Assets does not exist then it creates a new row whose val is Total Current Assets - (Cash and Cash Equivalent + Marketable Securities, Current + Total Accounts Receivable + Total Inventory) and standardized_balancesheet_label is Other Current Assets
184+
#
185+
# new row: if Total Long Term Assets does not exist then it creates a new row whose val is val of Total Assets - val of Total Current Assets and standardized_balancesheet_label is Total Long Term Assets
186+
#
187+
# new row: if Other Non Current Assets does not exist then it creates a new row whose val is val of Total Long Term Assets - val of (Marketable Securities, Non Current + Property, Plant and Equipment + Intangible Assets (excl. goodwill) + Goodwill) and standardized_balancesheet_label is Other Non Current Assets
188+
#
189+
# new row: if Total Current Liabilities does not exist then it creates a new row whose val is val of Liabilities - val of Liabilities, Non Current and standardized_balancesheet_label is Total Current Liabilities
190+
#
191+
# new row: if Other Current Liabilities does not exist then it creates a new row whose val is val of Total Current Liabilities - val of (Accounts Payable, Current + Taxes Payable, Current + Commercial Paper + Long Term Debt, Current Maturities + Operating Lease, Liability, Current + Finance Lease, Liability, Current) and standardized_balancesheet_label is Other Current Liabilities
192+
#
193+
# new row: if Total Long Term Liabilities does not exist then it creates a new row whose val is val of Total Liabilities - val of Total Current Liabilities and standardized_balancesheet_label is Total Long Term Liabilities
194+
#
195+
#
196+
# new row: if Other Long Term Liabilities does not exist then it creates a new row whose val is val of Total Long Term Liabilities - val of (Long Term Debts - Operating Lease, Liability, Non Current + Finance Lease, Liability, Non Current) and standardized_balancesheet_label is Other Long Term Liabilities
197+
#
198+
# new row: if Other Stockholders Equity does not exist then it creates a new row whose val is val of Total Company Stockholders Equity - val of (Common Stock + Additional Paid in Capital + Preferred Stock + Retained Earnings + Accumulated other comprehensive income (loss)) and standardized_balancesheet_label is Other Stockholders Equity
199+
200+
return(df_std_BS)
152201
}
153202

154203

code/Functions/utils.R

+1-2
Original file line numberDiff line numberDiff line change
@@ -15,5 +15,4 @@ unnest_list <- function(x) {
1515
.x
1616
}
1717
})
18-
}
19-
18+
}

data/standardized_balancesheet.xlsx

-455 Bytes
Binary file not shown.
-15.7 KB
Binary file not shown.

0 commit comments

Comments
 (0)