# bargaining.py
import gym
from gym import Env, spaces
import numpy as np


class Bargaining(Env):
    metadata = {'render.modes': ['human']}
    reward_range = (-np.inf, np.inf)
    action_space = spaces.Box(low=0.0, high=2.0, shape=(1,), dtype=np.float16)  # Bid
    observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(3,), dtype=np.float16)  # [ Step num, Opponent bid, Own price ]
    discount = 0.99
    no_deal_rate = 0.05  # Proportion of cases where no deal exists
    max_steps = 16       # Max steps (both players together)
    time_cost = 0.0      # 0.005

    def __init__(self):
        pass

    def reset(self):
        """
        Resets the state of the environment and returns an initial observation.

        # Returns
            observation (object): The initial observation of the space. Initial reward is assumed to be 0.
        """
        self.prices = [np.random.uniform(0, 1), np.random.uniform(0, 1)]
        if self.prices[0] > self.prices[1]:  # Make sure they are in the right order
            self.prices = [self.prices[1], self.prices[0]]
        if np.random.uniform(0, 1) < self.no_deal_rate:  # Reverse if we want a no-deal condition
            self.prices = [self.prices[1], self.prices[0]]
        self.prices[0] = 1.0 - self.prices[0]
        self.bids = [0.0, 0.0]
        self.cur_player = 0  # 0 for seller, 1 for buyer
        self.steps = 0
        self.rewards = [0.0, 0.0]
        self.ended, self.done = False, False
        self.info = {}
        return [self.steps, self.bids[1 - self.cur_player], self.prices[self.cur_player]]
    def step(self, actions):
        """Run one timestep of the environment's dynamics.

        Accepts an action and returns a tuple (observation, reward, done, info).

        # Arguments
            action (object): An action provided by the agent.

        # Returns
            observation (object): Agent's observation of the current environment.
            reward (float): Amount of reward returned after the previous action.
            done (boolean): Whether the episode has ended, in which case further step() calls will return undefined results.
            info (dict): Contains auxiliary diagnostic information (helpful for debugging, and sometimes learning).
        """
        action = actions[0]
        # print("\nAction: %.4f" % action)
        # If ended, inform the other party of the result and call it done
        if self.ended or self.done:
            self.done = True
        # Stopping before a deal was reached
        elif (action > self.prices[self.cur_player] or  # Unprofitable bid
              self.steps > self.max_steps):             # Ran out of bids
            cost = self.time_cost * self.steps
            self.rewards = [-cost, -cost]  # As time was wasted by both
            self.ended = True
        # Otherwise, update the bid and check if we have a deal
        else:
            self.bids[self.cur_player] = action
            if 1.0 - self.bids[0] <= self.bids[1]:  # We have a deal
                # The current bid is just the "accept" signal, so the actual price is the one before
                sale_price = self.bids[1] if self.cur_player == 0 else 1.0 - self.bids[0]
                surplus = abs(self.prices[1] - (1.0 - self.prices[0]))
                # Discount the final value by the time spent and scale by the maximum possible score
                dmult = (self.discount ** self.steps) / surplus
                # Set rewards equal to the surplus beyond the reservation price for both parties
                self.rewards = [dmult * (sale_price - (1.0 - self.prices[0])),
                                dmult * (self.prices[1] - sale_price)]
                # print("  Step: %i, split %.0f/%.0f" % (self.steps, 100*(sale_price - self.prices[0])/surplus, 100*(self.prices[1] - sale_price)/surplus))
                self.ended = True
        self.steps += 1
        resp = [[self.steps, self.bids[self.cur_player], self.prices[1 - self.cur_player]],  # Observation
                self.rewards[self.cur_player], self.done, self.info]
        self.cur_player = 1 - self.cur_player
        return resp
    def render(self, mode='human', close=False):
        """Renders the environment.

        The set of supported modes varies per environment. (And some
        environments do not support rendering at all.)

        # Arguments
            mode (str): The mode to render with.
            close (bool): Close all open renderings.
        """
        if self.done:
            return
        if self.ended:
            if 1.0 - self.bids[0] < self.bids[1]:
                print("Deal at %.3f, rewards %.3f-%.3f" % (self.bids[self.cur_player - 1], self.rewards[0], self.rewards[1]))
            else:
                print("No deal!")
            return
        print("Step: %i Gap: %.3f" % (self.steps, max((1.0 - self.bids[0]) - self.bids[1], 0.0)))
        print("Seller: bid %.3f surplus %.3f" % (1.0 - self.bids[0], self.prices[0] - self.bids[0]))
        print("Buyer: bid %.3f surplus %.3f" % (self.bids[1], self.prices[1] - self.bids[1]))

    def close(self):
        """Override in your subclass to perform any necessary cleanup.

        Environments will automatically close() themselves when
        garbage collected or when the program exits.
        """
        pass

    def seed(self, seed=None):
        """Sets the seed for this env's random number generator(s).

        # Returns
            Returns the list of seeds used in this env's random number generators.
        """
        np.random.seed(seed)
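

def random_selfplay_demo(episodes=3):
    """Illustrative sketch: roll out a few episodes with two random bidders.

    The helper name and the uniform-random bidding policy are illustrative
    assumptions; this only demonstrates the alternating reset()/step()
    protocol, where each call returns the acting player's reward and the
    next player's observation.
    """
    env = Bargaining()
    for _ in range(episodes):
        obs = env.reset()
        totals = [0.0, 0.0]  # Accumulated reward per player (seller, buyer)
        done = False
        while not done:
            player = env.cur_player
            bid = np.random.uniform(0.0, 1.0)  # Random bid; a real agent would condition on obs
            obs, reward, done, info = env.step([bid])
            totals[player] += reward
        print("Episode rewards -- seller: %.3f, buyer: %.3f" % (totals[0], totals[1]))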


if __name__ == '__main__':
    env = Bargaining()
    obs = env.reset()
    print("Seller price: %.3f, Buyer price: %.3f" % (1.0 - env.prices[0], env.prices[1]))
    while True:
        bid = float(input('Bid for %s: ' % ('Seller' if env.cur_player == 0 else 'Buyer')))
        if env.cur_player == 0:
            bid = 1.0 - bid
        obs, reward, done, info = env.step([bid])
        if reward:
            print("Got reward %.3f" % reward)
            _, reward, _, _ = env.step([0])
            print("Other player reward %.3f" % reward)
            break
        elif env.ended:
            print("No deal!")
            break