in News Vendor/src/news_vendor_environment.py [0:0]
def step(self, action):
    """Advance the environment by one period and return the transition.

    The current state is unpacked with ``self.break_state()`` into an
    inventory vector and five scalars which are used here as: unit sale
    price, unit purchase cost, per-unit holding cost, per-unit lost-sale
    penalty and the mean of the Poisson demand.  The exact layout of
    ``self.state`` is defined by ``break_state`` and is not visible here.

    Args:
        action: Indexable container; only ``action[0]`` (the requested
            order quantity) is read.

    Returns:
        A tuple ``(new_state, reward, done, info)`` where ``new_state`` is
        the updated state array, ``reward`` is revenue minus purchase,
        holding and lost-sale costs, ``done`` is True once
        ``self.step_count`` has reached ``self.max_steps``, and ``info``
        reports the sampled demand together with sales, underage and
        overage for the period.

    Side effects:
        Increments ``self.step_count`` and replaces ``self.state`` with a
        copy of the returned state.
    """
    (stock_pipeline, price, unit_cost,
     hold_rate, lost_sale_rate, demand_mean) = self.break_state()

    # Clip the requested order to [0, capacity left after all inventory
    # currently tracked in the state].
    free_capacity = max(0, self.max_level - np.sum(stock_pipeline))
    order_qty = max(0, min(action[0], free_capacity))

    demand = np.random.poisson(demand_mean)

    # With zero lead time the order can be sold in the same period.
    zero_lead_time = self.l == 0
    available = stock_pipeline[0] + order_qty if zero_lead_time else stock_pipeline[0]

    units_sold = min(available, demand)
    leftover = max(0, available - demand)
    shortfall = max(0, demand - available)

    # The purchase cost is discounted by gamma ** lead time.
    # NOTE(review): the cost uses the order quantity *before* the second
    # capacity clamp below -- confirm that this is intended.
    reward = (
        price * units_sold
        - self.gamma ** self.l * unit_cost * order_qty
        - leftover * hold_rate
        - lost_sale_rate * shortfall
    )

    next_state = np.copy(self.state)
    # Re-clamp the order against the room left above the available stock.
    order_qty = max(0, min(self.max_level - available, order_qty))

    if self.l == 1:
        # The order arrives next period, alongside whatever was not sold.
        next_state[0] = leftover + order_qty
    elif self.l > 1:
        # Shift the pipeline one slot towards "on hand", append the new
        # order in the last lead-time slot, and carry over unsold stock.
        next_state[:self.inv_dim - 1] = self.state[1:self.inv_dim]
        next_state[self.l - 1] = order_qty
        next_state[0] += leftover
    else:
        # Zero lead time: the order was already merged into the available stock.
        next_state[0] = leftover

    self.step_count += 1
    done = bool(self.step_count >= self.max_steps)

    # Store a private copy so callers mutating the returned array cannot
    # alter the environment's internal state.
    self.state = np.copy(next_state)

    info = {
        'demand realization': demand,
        'sales': units_sold,
        'underage': shortfall,
        'overage': leftover,
    }
    return next_state, reward, done, info