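"""DQN_trade.py

A DQN trading agent built on the TensorFlow 1.x Session/placeholder API: a
two-layer Q-network maps a 40-dimensional state (supplied by StockEnv) to
Q-values for 3 discrete actions, trained with epsilon-greedy exploration and
experience replay over episodes of TRADE_PERIOD steps.
"""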
import tensorflow as tf
import pandas as pd
from collections import deque
import numpy as np
from stock_env import StockEnv
import random

TRADE_PERIOD = 240  # steps per episode (one trading day)
GAMMA = 0.9         # discount factor
EPSILON = 0.9       # initial exploration rate
REPLAY_SIZE = 500   # maximum size of the experience replay buffer
BATCH_SIZE = 32     # minibatch size for each training step
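
# The Q-learning target used in train_net(): for a transition (s, a, r, s', done),
#   y = r                               if the episode ended at s'
#   y = r + GAMMA * max_a' Q(s', a')    otherwise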

class DQN_Trade():
    def __init__(self):
        self.replay_buffer = deque()
        self.time_step = 0
        self.epsilon = EPSILON
        self.state_dim = 40
        self.action_dim = 3
        # build the network and the training method
        self.buildNetwork()
        self.train_method()
        # init TF session
        self.session = tf.InteractiveSession()
        self.session.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()
        check_point = tf.train.get_checkpoint_state('saved_network')
        if check_point and check_point.model_checkpoint_path:
            self.saver.restore(self.session, check_point.model_checkpoint_path)
            print('load model success')
        else:
            print('can not find old network weights')

    def buildNetwork(self):
        # weights: 40-dim input -> 100 hidden units -> 3 Q-values
        W1 = self.weight_variable([self.state_dim, 100])
        b1 = self.bias_variable([100])
        W2 = self.weight_variable([100, self.action_dim])
        b2 = self.bias_variable([self.action_dim])
        # input layer
        self.state_input = tf.placeholder('float', [None, self.state_dim])
        # hidden layer
        layer_1 = tf.nn.relu(tf.matmul(self.state_input, W1) + b1)
        # Q-value layer: one output per action
        self.Q_value = tf.matmul(layer_1, W2) + b2
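
    # The three outputs are Q-values for the three discrete actions; what the
    # actions mean (presumably something like buy / hold / sell) is defined by
    # StockEnv, which is not shown in this file.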

    def train_method(self):
        self.action_input = tf.placeholder('float', [None, self.action_dim])
        self.y_input = tf.placeholder('float', [None])
        # the one-hot action mask picks out Q(s, a) for the action actually taken
        Q_action = tf.reduce_sum(tf.multiply(self.Q_value, self.action_input), reduction_indices=1)
        self.cost = tf.reduce_mean(tf.square(self.y_input - Q_action))
        self.optimizer = tf.train.AdamOptimizer(0.0001).minimize(self.cost)
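
    # The loss is the mean squared error between the Bellman target y and the
    # predicted Q(s, a), minimized here with Adam (the original DQN paper used
    # RMSProp; either choice is common in practice).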

    def train_net(self):
        # draw a random minibatch from the replay buffer
        minibatch = random.sample(self.replay_buffer, BATCH_SIZE)
        state_batch = [data[0] for data in minibatch]
        action_batch = [data[1] for data in minibatch]
        reward_batch = [data[2] for data in minibatch]
        next_state_batch = [data[3] for data in minibatch]
        # calculate the Q-learning targets
        y_batch = []
        next_Q = self.Q_value.eval(feed_dict={self.state_input: next_state_batch})
        for i in range(0, BATCH_SIZE):
            done = minibatch[i][4]
            if done:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * np.max(next_Q[i]))
        self.optimizer.run(feed_dict={
            self.y_input: y_batch,
            self.action_input: action_batch,
            self.state_input: state_batch
        })
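
    # Note: the bootstrap values above come from the same online network that
    # is being trained. The DQN of Mnih et al. evaluates max_a' Q(s', a') with
    # a separate target network that is synchronized only periodically, which
    # stabilizes training; that refinement is not implemented here.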

    def perceive(self, state, action, reward, state_, done):
        self.time_step += 1
        one_hot_action = np.zeros(self.action_dim)
        one_hot_action[action] = 1
        self.replay_buffer.append((state, one_hot_action, reward, state_, done))
        # keep the buffer bounded: drop the oldest transition once full
        if len(self.replay_buffer) > REPLAY_SIZE:
            self.replay_buffer.popleft()
        # start training only after 100 transitions have been collected
        if len(self.replay_buffer) > 100:
            self.train_net()

    def egreedy_action(self, state):
        Q_value = self.Q_value.eval(feed_dict={
            self.state_input: [state]
        })
        Q_value = Q_value[0]
        # after 200 steps, multiply epsilon by 0.95 on every call; stop decaying
        # once it falls to about 0.1 so some exploration always remains
        if self.epsilon <= 0.1:
            epsilon_rate = 1
        else:
            epsilon_rate = 0.95
        if self.time_step > 200:
            self.epsilon = epsilon_rate * self.epsilon
        if random.random() <= self.epsilon:
            return random.randint(0, 2)
        else:
            return np.argmax(Q_value)
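
    # egreedy_action() is the behavior policy used during training; action()
    # below is the greedy policy for evaluation and always takes the
    # highest-valued action.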

    def action(self, state):
        Q_value = self.Q_value.eval(feed_dict={
            self.state_input: [state]
        })
        Q_value = Q_value[0]
        return np.argmax(Q_value)

    def weight_variable(self, shape):
        initial = tf.truncated_normal(shape)
        return tf.Variable(initial)

    def bias_variable(self, shape):
        initial = tf.constant(0.01, shape=shape)
        return tf.Variable(initial)

    def save_model(self, step):
        self.saver.save(self.session, 'saved_network/network-dqn', global_step=step)

    def getLoss(self):
        pass


def main(train=False):
    data = np.loadtxt('./data.csv', delimiter=',', skiprows=1)
    data = data[230:-1]  # drop the first day's data
    agent = DQN_Trade()
    for i in range(0, 10):
        iters = len(data) // TRADE_PERIOD
        for iter_step in range(0, iters):
            # one episode = one trading day of TRADE_PERIOD steps
            iter_data = data[iter_step * TRADE_PERIOD:(iter_step + 1) * TRADE_PERIOD]
            env = StockEnv(iter_data)
            s = env.reset()
            while True:
                action = agent.egreedy_action(s)
                s_, reward, done = env.gostep(action)
                print(action)
                agent.perceive(s, action, reward, s_, done)
                s = s_
                if done:
                    break
        agent.save_model(step=i)


if __name__ == '__main__':
    main()