%% Using the PILCO toolbox for solving the cart pole problem
clear all;
close all;
clc;
%% 0. Settings of the learning scenario
N_episode=10; % Number of episodes
flag.printing=1; % Printing optimization information: intensity 0-3
flag.plotting=2; % Plotting information: intensity 0-3
%% 1. Defining the system
% For each scenario, we need the following set of scenario-specific files:
% - settings.m: a file that contains scenario-specific settings and initializations
% - loss.m: the cost function
% - dynamics.m: a file that implements the ODE governing the dynamics
% - (optional) a visualization script
% First, the environment and the actual system are specified.
cp_settings; % Load settings for the cart-pole system
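% For reference, cp_settings is expected to define quantities such as (a hedged,
% non-exhaustive list based on the standard PILCO cart-pole scenario; names may
% differ in this variant):
%   dt, H          - sampling time and rollout horizon (number of time steps)
%   mu0, S0        - mean and covariance of the initial state distribution
%   plant, policy  - structures describing the dynamics interface and the controller
%   cost           - structure holding the saturating immediate cost function and its parameters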
%% 2. Initialization
% Here we conduct initial trials without a controller to gather data in
% terms of states, actions, and resulting responses. Trajectories have
% length H (see settings).
% When interacting with a real setup, this corresponds to experiments run
% with random/preliminary controllers.
x = []; % used to gather initial data
y = [];
initfiles = {'initdata_1.txt', 'initdata_2.txt', 'initdata_3.txt', 'initdata_4.txt'};
for i = 1:length(initfiles)
  % Read a recorded random-policy experiment from file (the initial condition of each
  % trial is drawn from a Gaussian distribution)
  [xx, yy, realCost{i}, latent{i}] = readArdData(initfiles{i}, dt, H, cost);
  x = [x; xx]; y = [y; yy]; % gather the obtained data into x and y
  if plotting.verbosity > 0 % visualization of the trajectory
    if ~ishandle(1)
      figure(1);
    else
      set(0,'CurrentFigure',1);
    end
    clf(1);
    cp_display_rollout; % plot the experiment (stored in xx, yy)
  end
end
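% Assumed data layout (hedged; readArdData is the authoritative source): each row of
% xx holds the state (and applied input) at one time step, and the corresponding row
% of yy holds the successor state, so that (x, y) form the input/target pairs for the
% GP dynamics model trained below.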
N_initial = length(initfiles);
J = N_initial; % used by the toolbox (offsets indices of the controlled trials past the initial data)
j = 1;         % episode counter; the loop below is commented out, so Sections 3-4 are run cell-by-cell and j is advanced manually at the end of Section 4
%% 3. Policy optimization
% for j = 1:N_episode % Note that the loop variable "j" is used in the "display_rollout" script
trainDynModel; % Step 1. train GP model of the process dynamics [off line]
% The script that takes care of training the GP executes the following high-level steps
% (a hedged sketch of steps a-c is given after this list):
% a. Extract the states and control inputs to be matched from x and y
% b. Define the training inputs and targets of the GP
% c. Train the GP
% d. Display the GP hyper-parameters, the learned noise hyper-parameters, and the signal-to-noise ratios. This information is very valuable for debugging purposes.
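% A hedged sketch of steps a-c, following the standard PILCO trainDynModel script
% (index vectors such as dyni/dyno, the number of controls Du, and trainOpt come
% from the settings and may differ in this variant):
%   dynmodel.inputs  = [x(:,dyni) x(:,end-Du+1:end)];    % GP inputs: selected state dims + controls
%   dynmodel.targets = y(:,dyno);                        % GP targets: successor-state dims
%   dynmodel = dynmodel.train(dynmodel, plant, trainOpt); % optimize the GP hyper-parameters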
learnPolicy; % Step 2. optimization of the policy based on the learnt model [off line, simulation phase]
% a. Learn the policy by calling minimize. This uses the whole
% inference toolchain to compute the gradients required by the
% minimization
% b. (optional) Plot overall optimization progress. (line search etc.)
% c. Long-term prediction of a state trajectory from p(x0) using the
% learned policy by calling pred. This prediction is equivalent to
% the last predicted trajectory during policy learning, i.e.,
% the predicted state trajectory that belongs to the learned controller.
% d. The predicted state trajectory is used to compute the corresponding
% distribution over immediate costs by calling calcCost. => stored
% in "fantasy"
% e. (optional) Plot the predicted immediate cost distribution as a
% function of the time steps.
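% A hedged sketch of the core calls inside learnPolicy (standard PILCO; the exact
% argument lists depend on the toolbox version):
%   [policy.p, fX3] = minimize(policy.p, 'value', opt, mu0Sim, S0Sim, ...
%                              dynmodel, policy, plant, cost, H);            % a. policy search
%   [M{j}, Sigma{j}] = pred(policy, plant, dynmodel, mu0Sim(:,1), S0Sim, H); % c. long-term prediction
%   [fantasy.mean{j}, fantasy.std{j}] = calcCost(cost, M{j}, Sigma{j});      % d. predicted cost distribution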
%
writePolicyArd(policy) % display the necessary vectors and matrices
% for applying the controller
%% 4. Execute policy, read and plot measurement results
filename = ['ardmeas/policydata_' num2str(j) '.txt'];
[xx, yy, realCost{j+J}, latent{j}] = readArdData(filename, dt, H,cost);
x = [x; xx]; y = [y; yy]; % Gathering obtained data into x and y
filename = ['pilco_realrobot_' num2str(j)]; save(filename);
% Step 3. apply controller to the system and gather data [on line, experiment phase]
% The specific steps of this generic script are
% (a) determine start state
% (b) generate rollout
%     i.   compute control signal u_t = π(x_t) [this uses the policy object]
%     ii.  simulate dynamics (or apply control to real robot) [this will use the function handle that describes the dynamics]
%     iii. transition to state x_{t+1} to continue the interaction
% This script also plots the response using figures already set up by the
% display scripts. Furthermore, it stores the whole trajectory
% information in a saved file.
% When interacting with a real setup, Step 3 means that a full experiment
% is run with the learned control policy. The rest of the steps remain
% unchanged.
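% A hedged sketch of the interaction loop described above (simplified from the
% standard PILCO rollout script; the state indexing and the exact policy call
% differ between versions):
%   state = start;                                                  % (a) start state
%   for t = 1:H                                                     % (b) generate rollout
%     u(t,:) = policy.fcn(policy, state(:), zeros(length(state)));  % i.  control signal π(x_t)
%     state  = simulate(state, u(t,:), plant);                      % ii./iii. apply dynamics, get x_{t+1}
%   end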
% Plotting the most recent trajectory (stored in xx, yy)
disp(['controlled trial # ' num2str(j)]);
if plotting.verbosity > 0 % visualization of the trajectory
  if ~ishandle(1); figure(1); else set(0,'CurrentFigure',1); end
  clf(1);
  cp_display_rollout;
end
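% Compare the long-term prediction of the learned model with the measurement for two
% state dimensions (dimensions 1 and 4 of the state vector): M{j} is the predicted mean
% trajectory computed in learnPolicy, xx the measured trajectory of this trial.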
figure()
hold on
plot(M{j}(1,:))
plot(xx(:,1))
hold off
figure()
hold on
plot(M{j}(4,:))
plot(xx(:,4))
hold off
j = j + 1; % advance the episode counter before re-running Sections 3 and 4
%%
% end