# Time-series regression with RNNs

```elixir
Mix.install([
  {:axon, "~> 0.6.1"},
  {:kino_vega_lite, "~> 0.1.13"},
  {:nx, "~> 0.7.3"},
  {:tucan, "~> 0.3.1"},
  {:polaris, "~> 0.1.0"},
  {:exla, "~> 0.7.3"},
  {:req, "~> 0.5.2"},
  {:vega_lite, "~> 0.1.9"},
  {:nimble_csv, "~> 1.2"},
  {:scholar, "~> 0.3.1"}
])

Nx.global_default_backend(EXLA.Backend)
Nx.Defn.global_default_options(compiler: EXLA)
```
## Introduction

In this project, we will build a predictor for a time series using a Recurrent Neural Network (RNN) regressor. The prediction will be made for Apple's stock price based on a sliding window of the previous 7 days of historical prices.

We will use an RNN architecture known as Long Short-Term Memory (LSTM).

<!-- livebook:{"break_markdown":true} -->

First, we load and preprocess the historical data. We will apply *normalization* to the series of historical prices. This mitigates numerical issues with activation functions like `tanh`, which saturate for inputs of large magnitude (whether positive or negative) and consequently produce vanishingly small derivatives.
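As an illustrative aside (not part of the original pipeline), we can check how `tanh` saturates once its input moves away from zero, which is why feeding raw, unscaled prices into the network would be problematic:

```elixir
# Illustration only: tanh maps large-magnitude inputs to values that are
# numerically indistinguishable from -1 or 1, so gradients through it vanish.
Nx.tanh(Nx.tensor([0.5, 5.0, 50.0, 500.0]))
```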

```elixir
NimbleCSV.define(CSVParser, separator: "\n", escape: "\"")

data =
  "https://raw.githubusercontent.com/santiago-imelio/datasets/main/apple_stock_prices.csv"
  |> Req.get!()
  |> Map.get(:body)
  |> CSVParser.parse_string()
  |> Enum.map(fn [price] ->
    price
    |> Float.parse()
    |> elem(0)
  end)
  |> Nx.tensor()
```

```elixir
min = Nx.reduce_min(data) |> Nx.to_number()
max = Nx.reduce_max(data) |> Nx.to_number()

{series_size} = Nx.shape(data)
days = Nx.iota({series_size})

Tucan.lineplot([Prices: data, Days: days], "Days", "Prices")
|> Tucan.set_size(600, 400)
|> Tucan.Scale.set_y_domain(min, max)
```

## Train-test splitting

Here we split the dataset into train and test sets. We will use two thirds of the data for training and the remaining third for validation. Since the dataset is an ordered time series, we shouldn't shuffle the data before splitting.

```elixir
train_test_split = 2 / 3

{train, test} = Nx.split(data, train_test_split)
```

## Normalization

We normalize the series to the range [-1, 1] using min-max scaling. It's also common to standardize instead, centering on the mean and dividing by the standard deviation.
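For reference, min-max scaling to a target range $[a, b]$ (here $a = -1$ and $b = 1$) maps each value as

$$
x' = a + \frac{(x - x_{min})(b - a)}{x_{max} - x_{min}}
$$

where $x_{min}$ and $x_{max}$ are computed on the training data only.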

```elixir
alias Scholar.Preprocessing.MinMaxScaler
```

We fit a min-max scaler on the training data, and then transform both the train and test sets using the fitted scaler. This prevents information from the test set from leaking into training.

```elixir
scaler = MinMaxScaler.fit(train, min_bound: -1, max_bound: 1)
train = MinMaxScaler.transform(scaler, train)
test = MinMaxScaler.transform(scaler, test)
```

Let's visualize the normalized data.

```elixir
norm_data = Nx.concatenate([train, test])
new_min = Nx.reduce_min(norm_data) |> Nx.to_number()
new_max = Nx.reduce_max(norm_data) |> Nx.to_number()

Tucan.lineplot([Prices: norm_data, Days: days], "Days", "Prices")
|> Tucan.set_size(600, 400)
|> Tucan.Scale.set_y_domain(new_min, new_max)
```

## Sliding window

Generally, a time series is mathematically represented as:

$$
\langle s_0, s_1, s_2, \ldots, s_P \rangle
$$

where $s_p$ is the numerical value of the series at time interval $p$, and $P$ is the total length of the series. To apply an RNN, the prediction should be treated as a regression problem. This involves using a sliding window to construct a set of input-output pairs on which regression will be applied.

For example, with a window size $T = 3$, the following pairs need to be produced:

$$
\begin{array}{c|c}
  \text{Input} & \text{Output}\\
  \hline \langle s_{0},s_{1},s_{2}\rangle & s_{3} \\
  \langle s_{1},s_{2},s_{3} \rangle & s_{4} \\
  \vdots & \vdots \\
  \langle s_{P-3},s_{P-2},s_{P-1} \rangle & s_{P}
\end{array}
$$

In each pair, the input consists of a sequence of $T$ consecutive values from the series, and the output is the value immediately following that sequence. This approach prepares the data for training the RNN model for time-series prediction.

```elixir
sliding_window = fn series, win_size ->
  {p} = Nx.shape(series)

  # Start index of every window, vectorized so the slice below runs once per index.
  x_idx =
    series
    |> Nx.shape()
    |> Nx.iota()
    |> Nx.reshape({:auto, 1})
    |> Nx.vectorize(:elements)

  # Index of the target value that follows each window.
  y_idx =
    series
    |> Nx.shape()
    |> Nx.iota()
    |> Nx.add(win_size)
    |> Nx.reshape({:auto, 1})
    |> Nx.slice([0, 0], [p - win_size, 1])

  # Slice a window of length win_size at each start index, then drop the
  # trailing windows that would run past the end of the series.
  x =
    Nx.slice(series, [x_idx[0]], [win_size])
    |> Nx.devectorize(keep_names: false)
    |> Nx.slice([0, 0], [p - win_size, win_size])

  y = Nx.gather(series, y_idx, axes: [0])

  {x, y}
end
```

Next, we can test our sliding window function with a given series. For example, we can test it using the first 10 numbers of the Fibonacci sequence, defined by the recurrence $F_{n+1} = F_{n - 1} + F_n$. Passing a window size of 2, we should expect the following 8 input-output pairs:

$$
\begin{array}{c|c}
  \text{Input} & \text{Output}\\
  \hline \langle F_1, F_2\rangle & F_3 \\
  \langle F_{2},F_{3}\rangle & F_{4} \\
  \vdots & \vdots \\
  \langle F_{8},F_{9} \rangle & F_{10} \\
\end{array}
$$

```elixir
test_series = Nx.tensor([0, 1, 1, 2, 3, 5, 8, 13, 21, 34])
sliding_window.(test_series, 2)
```

Now we apply the sliding window to our dataset with a window size of 7. For simplicity, we truncate the train and test sets to a size divisible by the window length.

```elixir
win_size = 7

{train_size} = Nx.shape(train)
{test_size} = Nx.shape(test)

p_train = train_size - rem(train_size, win_size)
p_test = test_size - rem(test_size, win_size)

train = train[0..(p_train - 1)]
test = test[0..(p_test - 1)]

{x_train, y_train} = sliding_window.(train, win_size)
{x_test, y_test} = sliding_window.(test, win_size)
```

```elixir
dataset_size = p_train + p_test
```

## Building and training our RNN regressor

We will use Axon to build a neural network with two layers, with the following specifications:

1. The first layer is an LSTM layer with 5 hidden units. Its input shape is `{batch_size, window_size, 1}`.
2. The second layer is a fully connected (dense) layer with a single unit.

For the loss function we will use mean squared error.
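For $N$ predictions $\hat{y}_i$ with targets $y_i$, the mean squared error is

$$
\text{MSE} = \frac{1}{N} \sum_{i=1}^{N} \left(y_i - \hat{y}_i\right)^2
$$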

```elixir
batch_size = 6
input_shape = {batch_size, win_size, 1}

model =
  Axon.input("input", shape: input_shape)
  |> Axon.lstm(5)
  # lstm returns {output_sequence, state}; we only need the output sequence
  |> then(fn {out, _} -> out end)
  # keep only the last time step of each sequence: {batch, win_size, 5} -> {batch, 5}
  |> Axon.nx(fn t -> t[[0..-1//1, -1]] end)
  |> Axon.dense(1)

Axon.Display.as_graph(model, Nx.template(input_shape, :f32))
```

Before training we split our train and test data into batches using the specified batch size.

```elixir
y_train_batches = Nx.to_batched(y_train, batch_size)

x_train_batches =
  x_train
  |> Nx.reshape({:auto, win_size, 1})
  |> Nx.to_batched(batch_size)

y_test_batches = Nx.to_batched(y_test, batch_size)

x_test_batches =
  x_test
  |> Nx.reshape({:auto, win_size, 1})
  |> Nx.to_batched(batch_size)

train_data = Stream.zip([x_train_batches, y_train_batches])
test_data = Stream.zip([x_test_batches, y_test_batches])
```

We are ready to create our training loop and run it. Since our network is simple, we choose SGD as the network optimizer.

```elixir
optimizer = Polaris.Optimizers.sgd(learning_rate: 0.04)
loop = Axon.Loop.trainer(model, :mean_squared_error, optimizer)

trained_model_state = Axon.Loop.run(loop, train_data, %{}, epochs: 100)
```
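If SGD turns out to converge slowly, Polaris also ships adaptive optimizers that could be swapped into the trainer. The snippet below is only a sketch of an alternative and is not used in the rest of the notebook:

```elixir
# Hypothetical alternative optimizer; passing it to Axon.Loop.trainer/3
# in place of the SGD optimizer above would be the only change needed.
Polaris.Optimizers.adam(learning_rate: 0.001)
```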

Finally, we make predictions over the train and test sets so we can evaluate our model.

```elixir
y_pred_train =
  x_train_batches
  |> Enum.map(&Axon.predict(model, trained_model_state, &1))
  |> Nx.concatenate()

y_pred_test =
  x_test_batches
  |> Enum.map(&Axon.predict(model, trained_model_state, &1))
  |> Nx.concatenate()
```

```elixir
y_train
|> Axon.Losses.mean_squared_error(y_pred_train, reduction: :mean)
|> Nx.to_number()
|> IO.inspect(label: "MSE train")

y_test
|> Axon.Losses.mean_squared_error(y_pred_test, reduction: :mean)
|> Nx.to_number()
|> IO.inspect(label: "MSE test")

:ok
```

## Visualizing predictions

Let's visualize the predictions compared to the real trend of the prices. First we build the plot data required for each layer.

```elixir
interval = Nx.iota({dataset_size})

plt_data = [
  Prices: norm_data[0..(dataset_size - 1)],
  Day: interval[0..(dataset_size - 1)],
  Eval: List.duplicate("Real prices", dataset_size)
]

{train_size} = Nx.shape(y_train)
{test_size} = Nx.shape(y_test)
plt_data_train = [
  Prices: Nx.to_flat_list(y_pred_train),
  # the first prediction targets day `win_size`, the first day with a full window behind it
  Day: interval[win_size..(train_size + win_size - 1)],
  Eval: List.duplicate("Predictions train", train_size)
]

plt_data_test = [
  Prices: Nx.to_flat_list(y_pred_test),
  Day: interval[(dataset_size - test_size)..(dataset_size - 1)],
  Eval: List.duplicate("Predictions test", test_size)
]

opts = [stroke_width: 2]
```

```elixir
Tucan.layers([
  Tucan.lineplot(plt_data, "Day", "Prices", Keyword.put(opts, :stroke_dash, [2])),
  Tucan.lineplot(plt_data_train, "Day", "Prices", opts),
  Tucan.lineplot(plt_data_test, "Day", "Prices", opts)
])
|> Tucan.color_by("Eval")
|> Tucan.set_size(600, 400)
|> Tucan.Scale.set_x_domain(0, dataset_size - 1)
|> VegaLite.encode(:color, field: "Eval", scale: [scheme: "set1"])
```