I'm working on the following reinforcement learning problem: I have a bottle of fixed capacity (say 5 liters). At the bottom of the bottle there is a valve through which water is removed. The distribution of the removed water is not fixed; any amount can be drawn from the bottle, i.e. any continuous value in [0, 5].
At the top of the bottle a tap is mounted to fill the bottle. The RL agent can add [0, 1, 2, 3, 4] liters to the bottle. The initial bottle level is any value in [0, 5].
I want to train an agent in this environment to find an optimal sequence of actions such that the bottle neither runs empty nor overflows, which amounts to continuously meeting the water demand.
Action space = [0, 1, 2, 3, 4], a discrete space
Observation space = [0, capacity of bottle], i.e. [0, 5], a continuous space
Reward logic = a negative reward if an action empties the bottle; a negative reward if an action makes it overflow (a short sketch of these dynamics follows below)
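To make the intended dynamics concrete, here is a minimal sketch; the helper name `transition` is hypothetical and just for illustration:

```python
import numpy as np

def transition(level, action, capacity=5.0):
    # Illustrative helper: one step of the intended dynamics.
    demand = np.random.uniform(0.0, capacity)  # random water removal
    new_level = level - demand + action        # remove first, then fill
    if new_level <= 0.0 or new_level > capacity:
        return new_level, -1.0, True           # empty or overflow: penalty, episode ends
    return new_level, 0.5, False               # otherwise a small positive reward
```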
I have decided to use Python to create the environment.
```python
from gym import spaces
import numpy as np


class WaterEnv:
    def __init__(self, BottleCapacity=5):
        ## CONSTANTS
        self.MinLevel = 0                     # minimum water level
        self.BottleCapacity = BottleCapacity  # bottle capacity
        # action space: add 0, 1, 2, 3 or 4 liters
        self.action_space = spaces.Discrete(self.BottleCapacity)
        # observation space: current water level in [0, capacity]
        self.observation_space = spaces.Box(
            low=self.MinLevel, high=self.BottleCapacity, shape=(1,))
        # current bottle level
        self.level = self.observation_space.sample()

    def step(self, action):
        # water quantity to remove (random demand drawn each step)
        WaterRemoveQty = np.random.uniform(self.MinLevel, self.BottleCapacity, 1)
        # updated water level after removal, then after the fill action
        UpdatedWaterLevel = self.level - WaterRemoveQty + action

        if UpdatedWaterLevel <= self.MinLevel:         # bottle ran empty
            reward, done = -1, True
        elif UpdatedWaterLevel > self.BottleCapacity:  # bottle overflowed
            reward, done = -1, True
        else:
            reward, done = 0.5, False

        # store the new level so the next step starts from it
        self.level = UpdatedWaterLevel
        return UpdatedWaterLevel, reward, done, {}

    def reset(self):
        """Reset the bottle to a random initial level."""
        self.level = self.observation_space.sample()
        return self.level
```
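As a quick sanity check of the environment (this snippet is purely illustrative and not part of the training code), a short random rollout can be printed:

```python
# Roll the environment forward with random actions and watch the level.
env = WaterEnv()
state = env.reset()
for t in range(5):
    action = env.action_space.sample()
    state, reward, done, _ = env.step(action)
    print(t, state[0], reward, done)
    if done:
        break
```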
```python
import random
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD


class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # replay memory size
        self.gamma = 0.99                 # discount rate
        self.epsilon = 1.0                # exploration rate
        self.epsilon_min = 0.01           # minimum exploration rate
        self.epsilon_decay = 0.99         # exploration decay
        self.learning_rate = 0.001        # learning rate
        self.model = self._build_model()

    def _build_model(self):
        # neural net for the deep Q-learning model
        model = Sequential()
        model.add(Dense(256, input_dim=self.state_size, activation='relu'))
        model.add(Dense(256, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=SGD(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns the greedy action

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


# create the environment object
env = WaterEnv()
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
minibatch = 32

# initialize the agent
agent = DQNAgent(state_size, action_size)

lReward = []  # per-episode rewards over the whole simulation
XArray = []   # actions taken over the whole simulation
EPOCHS = 1000

for e in range(EPOCHS):
    # reset the state at the beginning of each episode
    state = np.reshape(env.reset(), [1, state_size])
    time_t = 0
    rewardAll = 0
    while True:
        # decide on an action
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        # remember the previous state, action, reward and done flag
        agent.remember(state, action, reward, next_state, done)
        XArray.append(action)  # keep the action for a performance check
        # the new state becomes the current state for the next step
        state = next_state
        rewardAll += reward
        time_t += 1
        if done:
            print("episode: {}/{}, score: {}, e: {:.2}"
                  .format(e, EPOCHS, time_t, agent.epsilon))
            break
        # experience replay once enough samples are stored
        if len(agent.memory) > minibatch:
            agent.replay(minibatch)
    lReward.append(rewardAll)  # append the episode reward
```

After running the 1000 epochs, I observed that the agent has not learned anything. I am unable to find out what is going wrong.
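For completeness, a minimal way to inspect the per-episode returns collected in lReward (assuming matplotlib is installed):

```python
import matplotlib.pyplot as plt

# Plot the return of each episode; an upward trend would indicate learning.
plt.plot(lReward)
plt.xlabel('episode')
plt.ylabel('total reward')
plt.show()
```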