---
title: "Time Series Analysis for Forecasting"
author: "Blaine Bateman"
date: "November 28, 2018"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
options(warn = 1)
```
```{r function defs, echo=FALSE, message=FALSE, warning=FALSE}
scale_0_to_1 <- function(data, scale_cols) {
# rescale each column listed in scale_cols to the range [0, 1]
scales <- as.data.frame(matrix(0, ncol = 2))
# column 1 holds each column's range (max - min); column 2 holds its minimum
for (i in 1:length(scale_cols)) {
scales[i, 1] <-
max(data[, scale_cols[i]], na.rm = TRUE) -
min(data[, scale_cols[i]], na.rm = TRUE)
scales[i, 2] <-
min(data[, scale_cols[i]], na.rm = TRUE)
}
data[, scale_cols] <-
scale(data[, scale_cols],
center = scales[, 2],
scale = scales[, 1]) %>%
as.data.frame()
return(data)
}
```
```{r libraries, echo=FALSE, message=FALSE, warning=FALSE}
#
# load needed libraries ----
#
# dplyr provides pipes etc.
#
library(dplyr)
#
library(tidyverse)
#
# ggthemes gives us ggplot themes
#
library(ggthemes)
#
# gridExtra allows arranging multiple ggplots
#
library(gridExtra)
#
# reshape2 provides melt and cast
#
library(reshape2)
#
# scales formats dollars etc. in plots
#
library(scales)
#
# keras provides access to TensorFlow NN modeling
#
library(keras)
#
# caret gives us the trainControl method and resampling
#
library(caret)
#
# glmnet provides ridge, lasso, and elastic net regression
#
library(glmnet)
#
```
```{r Business Understanding}
```
<style type="text/css">
body{ /* Normal */
font-size: 16pt;
}
</style>
<font size=28pt> **1. Introduction**</font>
Forecasting is a historical pain point dating back at least to the time of Mesopotamia[^1], and it was even regulated during the Roman empire[^2]. Unfortunately, forecasters have been persecuted through the ages; a recent example is the charging of scientists and their managers with manslaughter in Italy for failing to warn of an earthquake (fortunately they were later exonerated).[^3] In their excellent book "Forecasting Principles and Practice", Rob Hyndman and George Athanasopoulos note the perils of being a forecaster, quoting Emperor Constantine's decree in AD 357 forbidding anyone "to consult a soothsayer, a mathematician, or a forecaster" (note the historical lumping of math in with fortune telling!).[^4]
<font size=28pt> **2. Problem Statement**</font>
In this article I want to demonstrate some methods of time series forecasting that are less often cited in the myriad data science tutorials etc.; namely the use of regression models, including artificial neural networks, to forecast. I will describe a business use case adapted from a client engagement, where the goal was to forecast sales up to a year in advance for a specialty electronics manufacturing firm in the B2B segment. Some aspects have been adjusted or simplified for confidentiality reasons, but everything shown here is based on anonymized data using the methods described.
The data used are from 2012 through 2016; the end product was used to forecast into 2017. I arranged the training data as one block of dates from 2012 through 2015, the validation data as 1/1/2016 to 6/30/2016, and the test data as 7/1/2016 to 12/31/2016. This was a pretty challenging task but made sense based on the business case, as I'll explain later.
While simple methods like ARIMA can be highly effective in short-term forecasts, they are not as well suited if, say, I want to forecast sales with daily granularity several months (or more) into the future. One reason for those limitations is that business phenomena are impacted by multiple forces including the internal business activities, the behavior of customers, the macro behavior of the vertical market, larger macro-economic forces, government regulation, and nowadays even social-media and reputation factors. Modeling the future using only the past values of a time series of figures cannot possibly take into account these external forces as they change over time. On the other hand, using a multivariate set of predictors and modeling with regression tools can incorporate nearly all business knowledge found relevant to the outcome desired to forecast. The challenges become uncovering all the causes and effects, and their timing. In this context, using these methods fits perfectly into the Business Science Problem Framework^TM^[^5] and CRISP-DM[^6].
<font size=28pt> **3. Business Understanding**</font>
The client wanted to improve their sales forecast for several reasons. Stock analysts are very negative when results miss targets, and shareholders get restless when share prices fall. In recent quarters, sales had missed forecasts--but that was only part of the story. More urgent was that inaccurate forecasts meant some orders were delayed in shipment, because some products were selling significantly above forecast while others were selling below. This was exacerbated because some manufacturing was done by a contract manufacturer, causing further delays when shipments didn't match the operations forecasts provided to them. Since earnings reports are quarterly, forecasts longer than 90 days were desired; in combination with the manufacturing issues, the goal was to forecast out to 6 months at 10 percent accuracy, and hopefully better in the short term. The aim was to generate something like Figure 1.
<img src="forecast_goal.jpg" width="800px">
<font size="3"> Figure 1. Optimistic goal of forecasting project.
The green line is the mean forecast; the dotted lines represent a confidence interval to be determined. At any point the forecast is represented as a probability distribution (possibly unknown); on the right is the interpretation of the hypothetical distribution in the middle of the time frame.</font>
To establish ROI for the forecasting project, it is necessary to estimate the impact of forecast errors. Impacts of, for example, under-forecasting include canceled orders (due to inability to meet requested delivery dates), the cost of carrying excess material as a hedge against the forecast errors, overtime to build late orders once material is available, etc. Table 1 is an example calculation for a hypothetical business under one under-forecasting scenario. I've weighted the reactions to reflect that the response isn't the same for all customers. The result suggests in this case about a 1% of sales impact for every +/- 1% error in the forecast. I could refine this with more discussion with the client; examples include quality impacts of overtime use, customer churn due to missed delivery dates, and any number of other factors.
<img src="cost_table.jpg" width="800px">
<font size="3"> Table 1. One cost scenario. Here a hypothetical $100M (sales) company with a +/- 10% forecast uncertainty takes a range of actions when under-forecasting results in inability to meet customer requested ship dates.</font>
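As a rough illustration of the arithmetic behind a table like this, the weighted impact can be computed as below. The action categories, cost rates, and weights here are made-up placeholders, not the client's figures, so the result will not reproduce Table 1 exactly.
```{r cost impact sketch, eval=FALSE}
# hypothetical under-forecasting scenario for a $100M (sales) business
annual_sales   <- 100e6
forecast_error <- 0.10                 # +/- 10% forecast uncertainty
reactions <- data.frame(
  action    = c("cancelled orders", "overtime / expediting", "carry extra material"),
  cost_rate = c(1.00, 0.15, 0.05),     # cost as a fraction of the affected sales
  weight    = c(0.20, 0.50, 0.30)      # assumed share of affected revenue seeing each reaction
)
affected_sales <- annual_sales * forecast_error
impact         <- affected_sales * sum(reactions$cost_rate * reactions$weight)
impact / annual_sales                  # cost impact as a fraction of annual sales
```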
The data available from the business were as follows:
```{r Table 2, echo=FALSE, message=FALSE, warning=FALSE, results='asis'}
table_2 <- "
| Data Type | Source | Format |
| --------------- | --------------- | --------------- |
| CRM | SFDC | csv export |
| ERP | proprietary | csv export |
| prior forecasts | Sales | Excel |
| prior results | Sales | Excel |
"
cat(table_2)
```
<font size="3"> Table 1. Data sources from the business. CRM = customer relationship management system; SFDC = SalesForce; ERP = enterprise resource planning system.</font>
In discussion with the client, they explained that they tracked their sales by subsets of sales managers; these subsets were based on a combination of geography and market. In addition, there were hundreds of item numbers (SKUs) that were organized into a set of groups. Eventually we agreed to include 9 sales areas and 10 product groups in the project. The data provided per Table 2 were intended to support this structure in the analysis.
To understand the business relationship of the data, it's important to consider a few realities. Most business people and sales professionals don't have formal education or training in forecasting, instead substituting on-the-job experience and professional judgement. These skills can produce surprisingly accurate forecasts, but they can also lead to large errors, with no way to gauge the uncertainty of the forecasts. We decided after discussion to approach the confidence interval part of the project by reconstructing past forecasts and errors in discussion with sales, then using a Monte Carlo method to estimate the forecast distributions. Although I won't go deeper into that method here due to length constraints, it's explained in an article I wrote.[^7]
CRM data in B2B companies often do not reflect all the future business expected, but only major new opportunities. Sales may include a large recurring or carryover business in the forecast that is not in the CRM. In addition, there may be significant sales that are handled by Customer Service or Inside Sales that may be viewed as "drop in". In this case, it seemed clear that the future sales depended on all of these, and I wanted to account for the CRM portion explicitly. The other sources would be reflected in the sales history. However, many companies have no explicit process in place to include CRM into the forecast--it is up to sales to decide how this is done, and it may be very ad-hoc. The issue is important in the business understanding phase--CRM reports are often used mainly to manage sales and not for forecasting. I chose to attempt to estimate the average "lag time" between opportunities in the CRM at various stages and future revenue. I'll discuss this more in the Data Understanding section.
Another point to consider is that in business, forecasting is often confounded with goal setting--in other words, if the CEO says 'We need to sell \$300M next year', then the forecast is forced to be at least \$300M for next year. This often results in one or more line items labeled TBD, stretch, or other euphemisms meaning 'we don't know where this revenue is coming from' (at the time of the forecast). The challenge for analyzing history and forecasting using a model-based approach is that past forecasts and errors may have large biases in them, affecting understanding of uncertainty and risk.
One of the things I wanted to accomplish was to incorporate business knowledge explicitly into the forecast. This can include asking what markets the business is in, and seeking external data regarding the behavior and growth of those markets. If such external data are available, they can be incorporated as a predictor in the model data. Other examples include government regulations that may negatively or positively impact the business, overall health of the economy in which the company sells, and commodity prices that may affect the business levels of customers. After several discussions with various stakeholders, I decided to look at three factors: industrial activity, manufacturing capital expenditures, and pricing of a commodity. The business motivations for these are as follows. Most sales were B2B into industrial or manufacturing customers, so a measure of industrial activity might correlate with sales. Also, since significant sales were known to go into manufacturing operations, spending to install new manufacturing capacity might be a good sales predictor. Finally, due to large sales into one industrial segment involved with the production of a commodity, it was believed that global price increases in the commodity drove increased spending for production equipment in the segment. In the data understanding discussion I'll talk about challenges of incorporating these factors.
One has to talk to a lot of stakeholders to get all the information I've discussed. One thing I like to advise those in similar situations is to not assume you have all the relevant information. Stakeholders may forget to tell you something important, or decide it's not relevant and omit it. In addition, different stakeholders may use different data as the truth and you may get divergent answers to similar questions. My approach is to just keep asking questions throughout the project, and engage as many stakeholders in the client as possible.
```{r Data Understanding}
```
<font size=28pt> **4. Data Understanding**</font>
The CRM data represented the "sales pipeline", each entry representing the future sale of one product to one customer, organized into one of six sales stages, labeled as 0%, 5%, 25%, 50%, 75%, 90%. As is typical, the percentages don't represent probabilities, so it is better to think of them as simply labels, which are usually called stages. This complicates the interpretation of the pipeline and the use of it as input to the forecast, as I'll discuss later. In addition to the "open" pipeline, historical data were available including wins and losses, the time to close from creation, and the stage history for every closed item.
I don't have enough room or time to fully discuss the analysis of the CRM data, so I'll just summarize a few points here. First, if I consider the life cycle of one item in the CRM (call it an 'opportunity'), then the opportunity is created when a new business lead is identified, it evolves from stage to stage as more sales effort is applied and (hopefully) it moves towards a business win, and eventually it is either won or lost. If I look back at history, I can find the fraction of opportunities at a given stage that are typically won (the 'closed/won' rate), and an average time from the point an opportunity enters a given stage to closing. If I am looking at, say, stage 25% opportunities, I might find 30% of them are typically won, and on average it takes 315 days to close them. I can then say that if I look at the amount of revenue represented in stage 25% at some point in the past, I can expect 30% of that to show up as sales 315 days in the future. Therefore, if I can build a time series of the total dollar value of stage 25% versus time in the past, I can use that as a predictor for some of the future sales.
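A minimal sketch of that calculation is shown below, assuming a hypothetical `closed_opps` table (one row per closed opportunity, with `stage`, `amount`, `created`, `closed`, and a logical `won`) and an `open_history` table of the open stage-25% opportunities by date; these are not the client's actual schema.
```{r pipeline predictor sketch, eval=FALSE}
library(dplyr)
# historical win rate and typical time to close for stage-25% opportunities
stage_25_stats <- closed_opps %>%
  filter(stage == "25%") %>%
  summarise(win_rate      = mean(won),
            days_to_close = round(mean(as.integer(closed - created))))
# daily total dollar value open at stage 25%, shifted forward by the typical
# time to close so it lines up with the future sales it should help predict
pipeline_predictor <- open_history %>%
  group_by(date) %>%
  summarise(stage_25_open = sum(amount)) %>%
  mutate(date             = date + stage_25_stats$days_to_close,
         expected_revenue = stage_25_stats$win_rate * stage_25_open)
```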
In this case after a lot of analysis and discussion with the sales stakeholders, I decided to use only the stage 25% opportunities. The data preparation then consisted of determining, from the stage history, what 25% opportunities were open for all dates in the past over the date range of interest. This was done according to the sales areas and product groups agreed upon with the stakeholders. The result was simply tabulated data for each area-group combination at every date in the past. To use them in the model, these tabular data were offset by the number of days determined to best represent the average time to close. Here are the (normalized) pipeline data by product group; I've added vertical lines at each year boundary to help visualize the behavior over time.
```{r Pipeline Data, echo=FALSE, message=FALSE, warning=FALSE}
pipeline <-
read.csv("pipeline_data.csv",
header = TRUE,
stringsAsFactors = FALSE) %>%
mutate(Date = as.Date(Date, format = '%m/%d/%Y')) %>%
scale_0_to_1(scale_cols = which(colnames(.) != "Date"))
melted_pipeline <- pipeline[, c(1, 2)]
melted_pipeline[, 3] <- rep(colnames(pipeline)[2],
nrow(pipeline))
colnames(melted_pipeline) <- c("date", "pipeline", "product")
for (i in 3:ncol(pipeline)) {
melted_temp <- pipeline[, c(1, i)]
melted_temp[, 3] <- rep(colnames(pipeline)[i],
nrow(pipeline))
colnames(melted_temp) <- c("date", "pipeline", "product")
melted_pipeline <-
rbind(melted_pipeline, melted_temp)
}
melted_pipeline %>%
filter(product != "Total") %>%
ggplot(aes(x = date,
y = pipeline,
group = product,
color = product)) +
geom_line() +
theme_economist() +
labs(x = "",
y = "Pipeline (normalized)\n",
title = "Open Sales Pipeline") +
theme(legend.position = "right") +
theme(plot.title = element_text(hjust = 0.5)) +
geom_vline(xintercept = as.Date(c('2013-01-01',
'2014-01-01',
'2015-01-01',
'2016-01-01')),
size = 0.25,
color = "black")
```
These data have some issues! The "notchiness" of the curves is due to a typical process that occurs in sales management wherein the pipeline is cleaned up periodically--opportunities that are known to be lost but had not been closed are closed in bulk, leading to the sudden drops. It can be seen that this happens often at the end of the year. One group, Repl, has no data until very recently, and others, GP, Spec_2, and Spec_3, have no data until mid-2014 or later. I dropped those from further analysis, leaving this:
```{r Pipeline Data (filtered), echo=FALSE, message=FALSE, warning=FALSE}
exclude_pipelines <-
c("GP", "Repl", "Spec_3", "Spec_2")
melted_pipeline %>%
filter(!(product %in% exclude_pipelines)) %>%
filter(product != "Total") %>%
ggplot(aes(x = date,
y = pipeline,
group = product,
color = product)) +
geom_line() +
theme_economist() +
labs(x = "",
y = "Pipeline (normalized)\n",
title = "Open Sales Pipeline") +
theme(legend.position = "right") +
theme(plot.title = element_text(hjust = 0.5))
```
Although less than ideal, these data contain important information not otherwise available in the data set, so I retained them. The average time to revenue was found to be 606 days. However, the distribution of time to revenue is not at all Gaussian (i.e. it is not a normal distribution).
<img src="time_to_revenue.jpg" width="800px">
<font size="3"> Figure 2. Distribution of time to revenue for '25%' opportunities.
</font>
Ideally I would sample this distribution in some way, estimate the fraction of revenue closing at each future date, and use that in the model. For the purposes here, I chose to use the time by which 50% of the revenue would be closed, which was calculated from the distribution as 357 days. (This value was actually taken as a midpoint between two histogram bins; a more precise figure could be taken from a fitted distribution.) This "lag time" represents the time from when an event happens in a predictor (in this case, an opportunity reaching a given stage at a given dollar value) to when it impacts the dependent variable--the values I am trying to predict. Determining the lag times from proposed predictors to the future sales I am predicting is a significant theme of this work.
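For reference, the 50%-of-revenue point can be computed directly from the closed opportunities rather than from histogram bins; a rough sketch follows, again using the hypothetical `closed_opps` columns from above.
```{r lag time sketch, eval=FALSE}
# won, stage-25% opportunities with their time from creation to close
won_25      <- subset(closed_opps, stage == "25%" & won)
won_25$days <- as.integer(won_25$closed - won_25$created)
won_25      <- won_25[order(won_25$days), ]
# cumulative share of closed revenue vs. days to close
cum_share <- cumsum(won_25$amount) / sum(won_25$amount)
# the lag time: the number of days by which half the eventual revenue has closed
lag_days  <- won_25$days[which(cum_share >= 0.5)[1]]
```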
The ERP data were provided as tabular data with each line representing an order of one unique item (SKU) by one customer on a given purchase order for a given ship date. Since the goal of the project was mainly to improve the Operations situation, I initially focused on ship dates. However, an analysis of the shipments vs. order dates showed that the vast majority of orders shipped very soon after they were placed; in most cases ship dates long after order dates represented problems where materials or production capacity were not available in time to meet customer needs. Ultimately, I reverted to forecasting order dates after agreeing with stakeholders that this was the visibility they needed.
As with the CRM data, I won't detail all the steps required to prepare the ERP data for use in the project, but will touch on some important points. I found some erroneous dates in the data provided, flagged because the order dates were after the ship dates. In discussions with stakeholders, this was confusing, as there was no process that allowed orders to be entered after shipment. As a working solution, it was agreed to just use the order dates as the "truth". In addition, the ERP was not coded with the areas or product groups that had been agreed as the granularity of the project. This is not unusual--sales may have their own definitions to manage sales activities, but the ERP is coded with "official" (and sometimes cryptic) categories. In this case, neither the sales areas nor the product groups were included in the ERP data. This required additional data cleaning and preparation steps, using lookup tables that matched customer IDs to sales areas and item numbers to product groups. This raised another issue in that the totals by area and some product groups did not match the historical data provided by sales. A first-round analysis was completed with the understanding that the discrepancies would be resolved later as a possible refinement.
With these caveats, a tabular data set comprising the order dates, areas, product groups, and CRM pipeline was prepared. I now look at one additional aspect of the sales data. Here is a plot of the (normalized) raw sales vs. date:
```{r raw and MA data, echo=FALSE, message=FALSE, warning=FALSE}
raw_data <-
read.csv('raw_and_MA_sales.csv',
header = TRUE,
stringsAsFactors = FALSE) %>%
mutate(uniform_date = as.Date(uniform_date,
format = '%m/%d/%Y')) %>%
mutate(raw_sales = case_when(
is.na(raw_sales) ~ 0,
TRUE ~ raw_sales
)) %>%
scale_0_to_1(which(!(colnames(.) %in%
c("uniform_date"))))
#
raw_data %>%
ggplot(aes(x = uniform_date,
y = raw_sales)) +
geom_point(color = "red", shape = 21, size = 3) +
labs(x = "Sales Date",
y = "Sales (normalized)\n",
title = "Raw Sales Data") +
theme_economist() +
theme(plot.title = element_text(hjust = 0.5, size = 12))
#
```
Although I can see a fairly clear trend, the data are very noisy, and it's not evident there are patterns that can be modeled. I could jump into some autocorrelation and ARIMA analysis, but I decided to try an old tool--smoothing--before anything else. Also evident in this chart is that there are many dates with zero sales. These are mostly weekends and holidays, when orders are not taken or entered. I knew that there were likely to be monthly and quarterly cycles due to sales goals, and I didn't want to smooth those out too much. I decided to use a 21-day moving average as a first approach, using uniform dates (i.e. filling in the missing dates); the data then appear as follows.
```{r Smoothed Sales, echo=FALSE, message=FALSE, warning=FALSE}
raw_data %>%
ggplot(aes(x = uniform_date,
y = X21_d_MA)) +
geom_line(color = "red") +
labs(x = "Sales Date",
y = "Sales (normalized)\n",
title = "Total Sales",
subtitle = "21-day moving average") +
theme_economist() +
theme(legend.position = "right") +
theme(plot.title = element_text(hjust = 0.5, size = 12)) +
theme(plot.subtitle = element_text(hjust = 0.5))
#
```
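For reference, the smoothing itself was done upstream of the CSV loaded above; a minimal sketch of the idea (using a hypothetical `raw_orders` table with one row per order line, and the zoo package for the rolling mean) is:
```{r moving average sketch, eval=FALSE}
library(dplyr)
library(zoo)   # rollmean() for the centered moving average
# total sales per order date
daily <- raw_orders %>%
  group_by(order_date) %>%
  summarise(sales = sum(amount))
# uniform daily calendar: weekends and holidays become zero-sales days
calendar <- data.frame(order_date = seq(min(daily$order_date),
                                        max(daily$order_date),
                                        by = "day"))
smoothed <- calendar %>%
  left_join(daily, by = "order_date") %>%
  mutate(sales = coalesce(sales, 0),
         MA_21 = rollmean(sales, k = 21, fill = NA, align = "center"))
```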
Now I can see something of a pattern--there are big dips at the end of each year, as well as some possibly recurring patterns within the year, including monthly, albeit noisy and inconsistent. I chose to move forward modeling the smoothed data as the target output. Note that, with some error, the forecasted moving averages can be converted back to weekly data by distributing the weekend and holiday values back into the actual sales days. However, for understanding sales by month or quarter, the forecasted values could be used as is.
Consider this series for a moment. Most "textbook" time series examples are clearly periodic, sometimes growing, and sometimes with one big shift. In comparison, this modeling challenge is really hard. Although there are some patterns, it's clearly not a given I can forecast this. That's the point of this article, to explore methods that can generate predictive power for complex problems like this. I believe every business related time series has causes, and our job is to find them and build a model that accounts for the causes and predicts the business.
From this figure I see there are clearly annual, and likely more frequent, recurring patterns. I performed a Discrete Fourier Transform (DFT)[^8] to look for the dominant recurring frequencies, and found evidence of monthly and 6-month cycles, a cycle around 102 days, as well as a less-intuitive cycle at 512 days. For the 102-day cycle I chose to use 91.3 days (roughly a quarter of a year) as more consistent with the business understanding. While I thought the model would likely pick up the key frequencies based on the business cycles, I decided to provide periodic predictors using sine and cosine series tied to the day of the year. In addition, I used a mathematical technique as follows:
| Define the sine series as:
| $\sin(2 \pi f t)$, where $f$ is the frequency in cycles per day and $t$ is the time in days,
| then add a second series:
| $\cos(2 \pi f t)$
By using the two corresponding series together in a regression, the model coefficients encode the best-fit *phase* as well as the amplitude. Predicting with only the sines/cosines and the date in a linear regression model fits a surprising amount of the variation:
```{r visualize sin/cos, echo=FALSE, message=FALSE, warning=FALSE}
DFT_results <-
raw_data %>%
select(uniform_date, X21_d_MA) %>%
mutate(day = as.integer(uniform_date)) %>%
mutate(sin_512 = sin(2 * pi * (1 / 512) * day)) %>%
mutate(cos_512 = cos(2 * pi * (1 / 512) * day)) %>%
mutate(sin_365 = sin(2 * pi * (1 / 365.25) * day)) %>%
mutate(cos_365 = cos(2 * pi * (1 / 365.25) * day)) %>%
mutate(sin_183 = sin(2 * pi * (1 / 182.6)* day)) %>%
mutate(cos_183 = cos(2 * pi * (1 / 182.6) * day)) %>%
mutate(sin_91 = sin(2 * pi * (1 / 91.3) * day)) %>%
mutate(cos_91 = cos(2 * pi * (1 / 91.3) * day)) %>%
mutate(sin_30 = sin(2 * pi * (1 / 30.44) * day)) %>%
mutate(cos_30 = cos(2 * pi * (1 / 30.44) * day))
DFT_fit <-
lm(X21_d_MA ~ . - uniform_date, data = DFT_results)
DFT_results %>%
mutate(fit = predict(DFT_fit)) %>%
ggplot(aes(x = uniform_date, y = X21_d_MA)) +
geom_line(color = "red", size = 0.5) +
geom_line(aes(x = uniform_date, y = fit),
color = "dodgerblue", size = 0.5) +
labs(x = "",
y = "Sales / fit (normalized)\n",
title = "DFT analysis",
subtitle = "Sales with DFT fit overlay") +
theme_economist() +
theme(plot.title = element_text(hjust = 0.5)) +
theme(plot.subtitle = element_text(hjust = 0.5))
#
```
A word of caution about using DFT and related methods to fit sines/cosines to a time series: these approaches can dramatically over-fit. If I did a full frequency-based reconstruction of the signal, it could perfectly match the training data, but when extended outside that region it would simply repeat a copy of the training data. Therefore, my approach is to use these predictors for only a limited number of frequencies.
With this analysis, the data set now includes a day column and 10 columns with the calculated sine and cosine series (normalized), along with the previous data. Also, the sales area was encoded as a dummy variable, and the sales amounts by date-area-group were recorded in columns for each product group. At this stage I also encoded the day of the year (1 - 366), the day of the month (1 - 31), and variables indicating weekends and non-working holidays. The reason for these latter variables is my experience, and the data understanding, that the model could use this information as predictive.
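A condensed sketch of those encoding steps is shown below; the column names (`data_date`, `sales_area`) and the holiday list are hypothetical stand-ins for the actual prepared data.
```{r calendar features sketch, eval=FALSE}
# calendar features: day of year, day of month, weekend and holiday indicators
holidays <- as.Date(c("2015-12-25", "2016-01-01"))   # example non-working holidays
prep_data <- prep_data %>%
  mutate(day_of_year  = as.integer(format(data_date, "%j")),
         day_of_month = as.integer(format(data_date, "%d")),
         weekend      = as.integer(format(data_date, "%u") %in% c("6", "7")),
         holiday      = as.integer(data_date %in% holidays))
# one 0/1 dummy column per sales area
area_dummies <- model.matrix(~ sales_area - 1, data = prep_data)
prep_data    <- cbind(prep_data, area_dummies)
```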
I now look at the autocorrelation of the sales data, to investigate if I can use lagged sales data to predict future sales.
```{r Basic Autocorrelation, echo=FALSE, message=FALSE, warning=FALSE}
acf(x = raw_data[, "X21_d_MA"],
plot = TRUE,
main = "Autocorrelation of Sales\n(moving average)")
```
This result isn't surprising for a couple of reasons. Like a lot of time series, the data are highly correlated at small lags. Unfortunately, this isn't helpful for the business case, since I need to forecast out to 6 months. In addition, I can see that the 21-day moving average increases the correlation up to that lag period. However, here is what results when extending the analysis to much longer lags:
```{r Autocorrelation Analysis, echo=FALSE, message=FALSE, warning=FALSE}
acf(x = raw_data[, "X21_d_MA"],
plot = TRUE,
lag.max = 395,
xlim = c(0, 395),
col = adjustcolor("dodgerblue", alpha = 0.5),
main = "Autocorrelation of Sales\n(moving average)")
```
Happily, there is a peak in the correlation at one year! That means I can use sales data lagged one year to predict sales, at least in part. I'll return to this later in data understanding.
One reason for using the moving average is the gaps in the sales data, as orders are not entered on weekends or holidays. However, to sanity check the previous result, I can aggregate the raw data into weekly totals without any smoothing and perform an autocorrelation analysis on that. The result is shown here; 52 weeks appears as a long-lag peak in correlation, consistent with the previous result.
```{r Investigate Long Lags, echo=FALSE, message=FALSE, warning=FALSE}
raw_sales_clean <-
raw_data %>%
mutate(week = as.integer(format(uniform_date,
format = "%V"))) %>%
mutate(year = as.integer(format(uniform_date,
format = "%Y"))) %>%
mutate(serial_week = year * 100 + week) %>%
group_by(serial_week) %>%
summarize(weekly_sales = sum(raw_sales,
na.rm = TRUE)) %>%
select(weekly_sales)
acf(x = raw_sales_clean,
plot = TRUE,
lag.max = 56,
xlim = c(0, 56),
main = "Autocorrelation of sales\n(raw data)")
```
Further, before I get fully into modeling, I will predict the Sales figures with the 365-day lagged sales to do a preliminary validation that using such a long lag is useful. On the left are the original and lagged sales data, and on the right the result of fitting a linear model with the date and lagged sales as predictors.
```{r Prelim Fit to Autocorr, echo=FALSE, message=FALSE, warning=FALSE, fig.width=9}
test_fit <-
raw_data %>%
select(uniform_date, X21_d_MA) %>%
rename(lagged_sales = X21_d_MA) %>%
mutate(serial_date = as.integer(uniform_date) + 365)
test_fit <-
raw_data %>%
select(uniform_date, X21_d_MA) %>%
rename(orig_sales = X21_d_MA) %>%
mutate(serial_date = as.integer(uniform_date)) %>%
full_join(test_fit, by = "serial_date") %>%
select(serial_date, lagged_sales, orig_sales) %>%
rename(Date = serial_date) %>%
mutate(Date = as.Date(Date,
origin = '1970-01-01')) %>%
filter(!(is.na(lagged_sales)) &
!(is.na(orig_sales)) &
!(is.na(Date)))
source_plot <-
test_fit %>%
ggplot(aes(x = Date,
y = lagged_sales,
color = "black")) +
geom_line() +
geom_line(aes(x = Date,
y = orig_sales,
color = "red")) +
theme_economist() +
labs(x = "",
y = "Sales\n",
title = "Lagged Sales",
subtitle = "overlay on Sales") +
scale_color_manual("Data source",
values = c("dodgerblue", "red"),
labels = c("lagged", "original")) +
theme(plot.title = element_text(hjust = 0.5)) +
theme(plot.subtitle = element_text(hjust = 0.5)) +
theme(legend.position = "right")
#
test_model <-
lm(orig_sales ~ lagged_sales,
data = test_fit)
test_fit <-
test_fit %>%
mutate(pred = predict(test_model)) %>%
mutate(res = pred - orig_sales)
#
pred_plot <-
test_fit %>%
ggplot(aes(x = Date)) +
geom_line(aes(y = orig_sales,
color = "original")) +
geom_line(aes(y = pred,
color = "model")) +
theme_economist() +
labs(x = "",
y = "Model / Sales\n",
title = "Prediction using Lagged Sales",
subtitle = "overlay on Sales") +
scale_color_manual("Data Source",
values = c("original" = "red",
"model" = "dodgerblue")) +
theme(plot.title = element_text(hjust = 0.5)) +
theme(plot.subtitle = element_text(hjust = 0.5)) +
theme(legend.position = "right") +
ylim(0, 1)
grid.arrange(source_plot, pred_plot, ncol = 2)
#
```
The residuals from this simple model are a good approximation of a normal distribution:
```{r Lagged Fit Residuals, echo=FALSE, message=FALSE, warning=FALSE}
mean_res <- mean(test_fit[, "res"])
sd_res <- sd(test_fit[, "res"])
norm_curve_x <-
seq(-1, 1, length.out = nrow(test_fit))
test_fit_density <-
dnorm(norm_curve_x,
mean = mean_res,
sd = sd_res)
density_scale <- max(test_fit_density)
test_fit_density <-
test_fit_density / density_scale
test_fit_hist <-
test_fit %>%
ggplot(aes(x = res)) +
geom_density(aes(y = ..scaled..),
fill = "dodgerblue",
color = "blue",
alpha = 0.5) +
theme_economist() +
xlim(-0.75, 0.75) +
labs(x = "Normalized Residual",
y = "Normalized Density\n",
title = "Distribution of Residuals",
subtitle = "linear fit to lagged Sales") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(plot.subtitle = element_text(hjust = 0.5))
#
test_fit_hist <-
test_fit_hist +
geom_line(aes(x = norm_curve_x,
y = test_fit_density),
color = "red",
size = 1)
print(test_fit_hist)
#
```
The next part of Data Understanding is to consider external factors, as mentioned at the outset. In my many discussions with stakeholders, it emerged that a large share of sales in one area was associated with a particular industry, and that industry tended to ebb and flow with the market price of the commodity it produced. I obtained the historical pricing data from a government website (via an API--easily done in R using the httr library and some JSON tools); it is plotted below together with (normalized) sales. It's notable that the downturn in the commodity price in 2014 seems to correspond roughly to a drop in sales. Here I have smoothed the sales data somewhat to aid interpretation.
```{r Commodity Data, echo=FALSE, message=FALSE, warning=FALSE}
commodity <-
read.csv('commodity_pricing.csv',
header = TRUE,
stringsAsFactors = FALSE) %>%
mutate(date = as.Date(date,
origin = '1899-12-30'))
scale_cols <-
which(colnames(commodity) != "date")
commodity <-
commodity %>%
filter(date >= '2011-01-01') %>%
scale_0_to_1(scale_cols)
#
commodity %>%
ggplot(aes(x = date, y = commodity_price,
color = "Commodity")) +
geom_line() +
labs(x = "Date",
y = "Commodity / Sales (normalized)\n",
title = "Commodity Trend",
subtitle = "overlay on Sales") +
geom_smooth(data = raw_data,
aes(x = uniform_date,
y = X21_d_MA,
color = "Sales"),
size = 0.5,
se = FALSE,
method = "loess",
span = 0.08) +
theme_economist() +
scale_color_manual("",
values = c("dodgerblue", "red"),
breaks = c("Commodity", "Sales")) +
theme(legend.position = "right") +
theme(plot.title = element_text(hjust = 0.5, size = 12)) +
theme(plot.subtitle = element_text(hjust = 0.5))
#
```
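As an aside, the API retrieval mentioned above is straightforward with httr plus jsonlite. The endpoint, query parameters, and field names in this sketch are placeholders, not the actual government API used.
```{r commodity api sketch, eval=FALSE}
library(httr)
library(jsonlite)
# placeholder endpoint and parameters -- substitute the real API's documentation
resp <- GET("https://api.example.gov/v1/commodity/prices",
            query = list(series  = "XYZ",
                         start   = "2011-01-01",
                         api_key = Sys.getenv("API_KEY")))
stop_for_status(resp)
commodity <- fromJSON(content(resp, as = "text", encoding = "UTF-8"))$data %>%
  as.data.frame() %>%
  mutate(date            = as.Date(date),
         commodity_price = as.numeric(price))
```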
This is a case where something like cross correlation analysis won't work, as there is a single large change in the data that occurs only once. As an alternative, I lagged the commodity data by 0 to 24 months and examined the residuals from fitting the sales data to a linear trend plus the lagged commodity data.
```{r Test Commodity Lags, echo=FALSE, message=FALSE, warning=FALSE}
#
# explore lags up to two years and test residuals
#
test_RMSEs <- data.frame(lag = integer(),
RMSE = numeric())
mean_lin_RMSE <- 0
i <- 0
for (com_offset in seq(0, 730, 30)) {
test_cor <-
commodity %>%
mutate(uniform_date = date + com_offset) %>%
full_join(raw_data, by = "uniform_date") %>%
select(X21_d_MA, uniform_date, commodity_price) %>%
rename(date = uniform_date)
test_cor <-
test_cor[!(is.na(test_cor[, "X21_d_MA"])) &
!(is.na(test_cor[, "commodity_price"])), ]
linear_fit <-
lm(X21_d_MA ~ date, data = test_cor)
linear_RMSE <-
sqrt(sum((predict(linear_fit) -
test_cor[, "X21_d_MA"])^2) /
nrow(test_cor))
mean_lin_RMSE <- mean_lin_RMSE + linear_RMSE
com_fit <-
lm(X21_d_MA ~ commodity_price, data = test_cor)
com_RMSE <-
sqrt(sum((predict(com_fit) -
test_cor[, "X21_d_MA"])^2) /
nrow(test_cor))
both_fit <-
lm(X21_d_MA ~ commodity_price + date, data = test_cor)
both_RMSE <-
sqrt(sum((predict(both_fit) -
test_cor[, "X21_d_MA"])^2) /
nrow(test_cor))
i <- i + 1
test_RMSEs[i, "lag"] <- com_offset
test_RMSEs[i, "RMSE"] <- both_RMSE
}
mean_lin_RMSE <- mean_lin_RMSE / i
com_offset <-
test_RMSEs[which(test_RMSEs[, "RMSE"] ==
min(test_RMSEs[, "RMSE"])), "lag"]
test_RMSEs %>%
ggplot(aes(x = lag, y = RMSE)) +
geom_line(color = "black") +
theme_economist() +
labs(x = "Days commodity price leads Sales",
y = "RMSE of fit\n",
title = "RMSE of fit vs. lag",
subtitle = "Sales vs. (time + commmodity price)") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(plot.subtitle = element_text(hjust = 0.5)) +
geom_hline(yintercept = mean_lin_RMSE, color = "red") +
annotate("text",
x = 10,
y = mean_lin_RMSE +
0.03 * (max(test_RMSEs[, "RMSE"]) -
min(test_RMSEs[, "RMSE"])),
label = "RMSE for linear fit only",
color = "red",
hjust = 0)
```
This analysis arrived at a time lag of `r com_offset` days. I can interpret this as the lag between when the price of the commodity increases significantly and when production of the commodity increases to take advantage of the higher price, with the order and fulfillment lead time included. Shifting the commodity data by this amount and re-plotting gives:
```{r lag and scale, echo=FALSE, message = FALSE}
commodity %>%
filter(date + com_offset < '2017-01-01') %>%
ggplot(aes(x = date + com_offset, y = commodity_price,
color = "Commodity")) +
geom_line() +
labs(x = "Date",
y = "Commodity / Sales (normalized)\n",
title = "Lagged Commodity Trend",
subtitle = "overlay on Sales") +
geom_smooth(data = raw_data,
aes(x = uniform_date,
y = X21_d_MA,
color = "Sales"),
size = 0.5,
se = FALSE,
method = "loess",
span = 0.08) +
theme_economist() +
scale_color_manual("",
values = c("dodgerblue", "red"),
breaks = c("Commodity", "Sales")) +
theme(legend.position = "right") +
theme(plot.title = element_text(hjust = 0.5, size = 12)) +
theme(plot.subtitle = element_text(hjust = 0.5))
#
```
Obviously this isn't the whole story, but as a sanity check, I compare fitting the sales data with a linear trend (fit vs. date only), fitting using the commodity data, lagged by `r com_offset` days, and using both. This shows that including the commodity data can significantly reduce the RMSE and generate residual errors that are more symmetric:
```{r test final offset, echo=FALSE, message=FALSE, warning=FALSE}
test_cor <-
commodity %>%
mutate(uniform_date = date + com_offset) %>%
full_join(raw_data, by = "uniform_date") %>%
select(X21_d_MA, uniform_date, commodity_price) %>%
rename(date = uniform_date)
test_cor <- test_cor[!(is.na(test_cor[, "X21_d_MA"])) &
!(is.na(test_cor[, "commodity_price"])), ]
linear_fit <-
lm(X21_d_MA ~ date, data = test_cor)
linear_RMSE <-
sqrt(sum((predict(linear_fit) -
test_cor[, "X21_d_MA"])^2) / nrow(test_cor))
com_fit <-
lm(X21_d_MA ~ commodity_price, data = test_cor)
com_RMSE <-
sqrt(sum((predict(com_fit) -
test_cor[, "X21_d_MA"])^2) / nrow(test_cor))
both_fit <-
lm(X21_d_MA ~ commodity_price + date, data = test_cor)
both_RMSE <-
sqrt(sum((predict(both_fit) -
test_cor[, "X21_d_MA"])^2) / nrow(test_cor))
test_cor <-
test_cor %>%
mutate(linear_fit = predict(linear_fit)) %>%
mutate(linear_res = linear_fit - X21_d_MA) %>%
mutate(com_fit = predict(com_fit)) %>%
mutate(com_res = com_fit - X21_d_MA) %>%
mutate(both_fit = predict(both_fit)) %>%
mutate(both_res = both_fit - X21_d_MA)
lin_res_sd <- sd(test_cor[, "linear_res"])
lin_res_mean <- mean(test_cor[, "linear_res"])
com_res_sd <- sd(test_cor[, "com_res"])
com_res_mean <- mean(test_cor[, "com_res"])
both_res_sd <- sd(test_cor[, "both_res"])
both_res_mean <- mean(test_cor[, "both_res"])
xlim <- c(-0.75, 0.75)
ylim <- c(0, 6)
linear_hist <-
test_cor %>%
ggplot(aes(x = linear_res)) +
geom_density(stat = "density",
fill = "blue",
color = "lightyellow",
alpha = 0.3) +
xlim(xlim) +
ylim(ylim) +
labs(title = "\n(linear fit)",
x = "") +
theme(plot.title = element_text(hjust = 0.5, size = 10))
centers <-
test_cor[, "linear_res"]
density <-
dnorm(centers, mean = lin_res_mean, sd = lin_res_sd)
linear_hist <-
linear_hist +
geom_line(aes(x = centers, y = density),
color = "red",
size = 1)
com_hist <-
test_cor %>%
ggplot(aes(x = com_res)) +
geom_density(stat = "density",
fill = "red",
color = "lightyellow",
alpha = 0.3) +
xlim(xlim) +
ylim(ylim) +
labs(title = "Residuals\n(fit to commodity)",
x = "",
y = "") +
theme(plot.title = element_text(hjust = 0.5, size = 10))
centers <-
test_cor[, "com_res"]
density <-
dnorm(centers, mean = com_res_mean, sd = com_res_sd)
com_hist <-
com_hist +
geom_line(aes(x = centers, y = density),
color = "red",
size = 1)
both_hist <-
test_cor %>%
ggplot(aes(x = both_res)) +
geom_density(stat = "density",
fill = "purple",
color = "lightyellow",
alpha = 0.75) +
xlim(xlim) +
ylim(ylim) +
labs(title = "\n(fit to both)",
x = "",
y = "") +
theme(plot.title = element_text(hjust = 0.5, size = 10))
centers <-
test_cor[, "both_res"]
density <-
dnorm(centers, mean = both_res_mean, sd = both_res_sd)
both_hist <-
both_hist +
geom_line(aes(x = centers, y = density),
color = "red",
size = 1)
grid.arrange(linear_hist, com_hist, both_hist, ncol = 3)
#
```
You might have noticed that if the best lag is 90 days, that is less than the desired forecast period. In this article, I'm cheating a bit, as I have historical data covering the forecast period. To use this model in production, I would need to forecast the commodity out at least another 90 days. Alternatively, for some commodities I have obtained futures pricing data via an API and used those prices, with their maturity dates, as a forecast. (You can see an example of that in action [here](https://eaf-llc.shinyapps.io/Business_Drivers/).)
As mentioned earlier, I was motivated to look for other market or external factors that might explain some of the observed cyclical behaviors. Given that the customers were mainly industrial, I looked for a proxy to represent the level of industrial activity in the economy. I was able to obtain industrial energy consumption data by various sectors, again via a government API, which were normalized into a single index.
```{r Economic Activity, echo=FALSE, message=FALSE, warning=FALSE}
econ_index <- read.csv('economic_index.csv',
header = TRUE,
stringsAsFactors = FALSE) %>%
mutate(date = as.Date(date, format = '%m/%d/%Y')) %>%
rename(Date = date) %>%
filter(Date >= '2012-01-01' &
Date <= '2016-12-31')
scale_cols <- which(colnames(econ_index) != "Date")
econ_index <- scale_0_to_1(econ_index, scale_cols)
econ_index %>%
ggplot(aes(x = Date, y = index)) +
geom_line(color = "black", size = 0.5) +
theme_economist() +
labs(x = "",
y = "index (normalized)\n",
title = "Raw econonic index") +
theme(plot.title = element_text(hjust = 0.5))
```
I used a correlation analysis to find the lag between these data and the sales data, arriving at 331 days. Here are the two series together, with the economic data shifted by 331 days. Some of the biggest peaks correlate, and there is a relationship between the end-of-year dips and the index. Note that the scaling between the two series isn't relevant, as that will be accounted for in the regression model.
```{r apply known lag, echo=FALSE, message=FALSE, warning=FALSE}
#
# compare with offset
#
econ_offset <- 331
econ_index %>%
ggplot(aes(x = Date + econ_offset, y = index)) +
geom_line(color = "black", size = 0.5) +
geom_line(data = raw_data, aes(x = uniform_date,
y = X21_d_MA),
color = "red") +
theme_economist() +
labs(x = "",
y = "index / Sales (normalized)\n",
title = "Lagged econonic index",
subtitle = "overlay on Sales") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(plot.subtitle = element_text(hjust = 0.5))
#
```
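The lag search itself isn't shown above; a rough version, interpolating the index to daily values and scanning candidate lags for the strongest correlation with the smoothed sales, could look like the sketch below (the 331 days quoted came from a similar analysis).
```{r econ lag search sketch, eval=FALSE}
# interpolate the index to daily values over the sales date range
daily_index <- approx(x    = as.numeric(econ_index$Date),
                      y    = econ_index$index,
                      xout = as.numeric(raw_data$uniform_date))$y
# shift the index forward by k days (index leads sales) and record the correlation
lags <- 0:500
cors <- sapply(lags, function(k) {
  shifted <- c(rep(NA, k), head(daily_index, length(daily_index) - k))
  cor(shifted, raw_data$X21_d_MA, use = "complete.obs")
})
lags[which.max(cors)]   # lead time (days) with the strongest correlation
```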
Because most of the sales in this case go into industrial settings, another possible external predictor is a measure related to adding industrial capacity. A commonly used measure is Manufacturing Equipment Orders; historical data are available for the US from the US Census Bureau.[^9] Here are the data plotted with the sales data, where the sales data have been aggregated to monthly to match the equipment order data and the trend has been removed for clarity.
```{r Mfg. Equip. Orders, echo=FALSE, message=FALSE, warning=FALSE}
ind_orders <-
read.csv("Manufacturing_Equipment_Orders.csv",
header = TRUE,
stringsAsFactors = FALSE) %>%
mutate(Date = as.Date(Date, format = '%m/%d/%Y'))
#
scale_cols <- which(colnames(ind_orders) != "Date")
ind_orders <- scale_0_to_1(ind_orders, scale_cols)
#
# investigate cross correlation
#
# we need to build monthly data at month end
# from the sales data to match the equipment orders data
# in order to use the ccf function
#
test_ccf <-
raw_data %>%
mutate(Month =
as.integer(substr(as.character(uniform_date), 6, 7))) %>%
mutate(Year =
as.integer(substr(as.character(uniform_date), 1, 4))) %>%
mutate(Year = case_when(
Month < 12 ~ as.integer(Year),
Month == 12 ~ as.integer(Year + 1)
)) %>%
mutate(Month = case_when(
Month < 12 ~ as.integer(Month + 1),
Month == 12 ~ as.integer(1)
)) %>%
mutate(month_date = as.Date(paste0(Year, "-", Month, "-01"))) %>%
select(month_date, X21_d_MA) %>%
mutate(Date = as.Date(month_date - 1)) %>%
group_by(Date) %>%
summarise(monthly_sales = sum(X21_d_MA)) %>%
full_join(ind_orders, by = "Date") %>%
filter(!(is.na(Date)) &
!(is.na(monthly_sales)) &
!(is.na(Orders))) %>%
mutate(monthly_sales =
monthly_sales - predict(lm(monthly_sales ~ Date,
data = (.)))) %>%
scale_0_to_1(which(colnames(.) == "monthly_sales"))
#
# visualize the two monthly series together
#
test_ccf %>%
ggplot(aes(x = Date, y = Orders + 1)) +
geom_line(color = "black", size = 0.5) +
geom_line(aes(x = Date, y = monthly_sales),
color = "red") +
theme_economist() +
labs(x = "",
y = "Mfg. Equip. Orders / Sales (normalized)\n",
title = "Manufacturing Equipment Orders",
subtitle = "overlay on Sales") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(plot.subtitle = element_text(hjust = 0.5))
```
I ran a cross correlation analysis between the monthly series, as seen below. The strongest correlation occurs at 13 months, or 393 days.
```{r perform cross correlation, echo=FALSE, message=FALSE, warning=FALSE}
#
# now get the cross correlation results
#
ccf_result <-
ccf(test_ccf[, "monthly_sales"],
test_ccf[, "Orders"],
lag.max = 24,
type = "correlation",
plot = TRUE,
main = "Time equipment orders change before Sales\n(months)")
#
max_cor <-
cbind(lag = ccf_result[["lag"]],
ccf = ccf_result[["acf"]]) %>%
as.data.frame()
max_ccf <-
which(max_cor[, "ccf"] == max(max_cor[, "ccf"]))
zero_lag <-
which(max_cor[, "lag"] == 0)
ind_orders_offset <-
as.integer(test_ccf[max_ccf, "Date"] -
test_ccf[zero_lag, "Date"])
```
In this chart I shift the equipment orders by the lag found, adjust for the difference in means, and smooth both series to aid visualization. As with the other factors, the correlation isn't perfect and it's not the whole story, but I can use this in the model to try to improve the overall predictive power.
```{r visualize lagged data, echo=FALSE, message=FALSE, warning=FALSE}
#
# now we'll compare over the date range of interest
# and align the two series to visualize the main
# relative movements
#
mean_ind_orders <-
ind_orders %>%
mutate(Date = Date + ind_orders_offset) %>%
filter(Date > '2012-01-01' &
Date < '2016-01-01')
mean_ind_orders <-
mean(mean_ind_orders[, "Orders"])
mean_sales <-
test_ccf %>%
filter(Date > '2012-01-01' &
Date < '2016-01-01') %>%
as.data.frame()
mean_sales <-
mean(mean_sales[, "monthly_sales"])
mean_offset <-
mean_ind_orders - mean_sales
ind_orders %>%
mutate(Date = Date + ind_orders_offset) %>%
filter(Date > '2012-01-01' &
Date < '2017-01-01') %>%
ggplot(aes(x = Date, y = Orders,
color = "Equip. Orders")) +
geom_smooth(size = 0.5,
se = FALSE,
span = 0.15) +
geom_smooth(data = test_ccf %>%
filter(Date > '2012-01-01' &
Date < '2017-01-01'),
aes(x = Date,
y = monthly_sales + mean_offset,
color = "Sales"),
se = FALSE,
span = 0.15) +
theme_economist() +
ylim(0, 1) +
scale_color_manual("",
values = c("black", "red"),
breaks = c("Equip. Orders", "Sales")) +
labs(x = "Date",
y = "Mfg. Equip. Orders / Sales (normalized)\n",
title = "Lagged Manufacturing Equipment Orders",
subtitle = "overlay on de-trended Monthly Sales") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(plot.subtitle = element_text(hjust = 0.5)) +
theme(legend.position = "right")
#
```
The external data will be merged with the existing data using the appropriate lag times. In the cases where the external data are monthly or weekly, I use linear interpolation to create daily values to be able to combine the data. Note that there are possibly better interpolation approaches and that could be an area of improvement in the future.
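The interpolation step is simple with `approx()`; a minimal sketch for a monthly series, again with hypothetical object names, is:
```{r daily interpolation sketch, eval=FALSE}
# expand a monthly external series to daily values by linear interpolation
daily_dates  <- seq(min(monthly_series$Date), max(monthly_series$Date), by = "day")
daily_series <- data.frame(
  Date  = daily_dates,
  value = approx(x    = as.numeric(monthly_series$Date),
                 y    = monthly_series$value,
                 xout = as.numeric(daily_dates))$y
)
```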
As mentioned earlier, the sales areas were encoded as dummy variables and the sales by product group were broken out into columns for each group. I now look at those data as the last part of data understanding. Here are the columns so far:
```{r Load Prepared Data, echo=FALSE, message=FALSE, warning=FALSE}
#
# load the prepared data
#
model_data <- read.csv('model_data.csv',
header = TRUE,
stringsAsFactors = FALSE) %>%
scale_0_to_1(scale_cols = which(colnames(.) != "date"))
print(colnames(model_data))
date_col <- which(colnames(model_data) == "date")
colnames(model_data)[date_col] <- "data_date"
#
plot_temp <-
dcast(model_data, data_date ~ "bookings",
value.var = "tot_bookings_raw",
fun.aggregate = sum) %>%
mutate(data_date = as.Date(data_date, origin = '1899-12-30')) %>%
filter(data_date < '2016-01-01')
total_sales_plot <-
plot_temp %>%
ggplot(aes(x = data_date + 365,
y = bookings)) +
geom_line() +
labs(x = "Sales Date",
y = "Sales (normalized)\n",
title = "Total Sales") +
theme_economist() +
theme(legend.position = "right") +
theme(plot.title = element_text(hjust = 0.5, size = 12)) +
theme(legend.title = element_text(hjust = 0.5, size = 10)) +
theme(legend.text = element_text(size = 10))
#
regions <- which(colnames(model_data) %in%
c("North", "NorthEast", "South",
"Mountain", "Key_Accounts",
"Other", "SouthEast", "SouthWest"))
bookings <-
which(substr(colnames(model_data), 1, 8) ==
"Bookings")
pipelines <-
which(substr(colnames(model_data), 1, 8) ==
"Pipeline")
plot_melt <-
model_data[model_data[, regions[1]] == 1,
c(date_col, bookings)] %>%
mutate(hist_bookings = rowSums((.)[, -date_col])) %>%
dcast(data_date ~ "bookings",
value.var = "hist_bookings",
fun.aggregate = sum) %>%
mutate(data_date = as.Date(data_date, origin = '1899-12-30'))
plot_melt <-
cbind(plot_melt,
rep(colnames(model_data)[regions[1]],
nrow(plot_melt)))
colnames(plot_melt) <-
c("Date", "bookings", "region")
for (i in 2:length(regions)) {
plot_temp <-
model_data[model_data[, regions[i]] == 1,
c(date_col, bookings)] %>%
mutate(hist_bookings = rowSums((.)[, -date_col])) %>%
dcast(data_date ~ "bookings",
value.var = "hist_bookings",
fun.aggregate = sum) %>%
mutate(data_date = as.Date(data_date, origin = '1899-12-30'))
plot_temp <-
cbind(plot_temp,
rep(colnames(model_data)[regions[i]],
nrow(plot_temp)))
colnames(plot_temp) <-
c("Date", "bookings", "region")
plot_melt <- rbind(plot_melt, plot_temp)
}
region_plot <- plot_melt %>%
ggplot(aes(x = Date,
y = bookings)) +
geom_line(aes(color = region,
linetype = region),
size = 1) +
labs(x = "Sales Date",
y = "Sales (normalized by Area)\n",