I have a problem with my reinforcement learning model. I am trying to simulate an electric battery storage.
The battery charges when the electricity prices are low and discharge ONLY to the user at fixed hours during the day, every day.
Therefore, the only cost for the user is power of charge * electricity price at the hour.
The reward function is set as the opposite of the cumulative sum of the cost.
Is it a correct approach? How to properly define it so that the overall cost of the purchased electricity is at minimum at the end of the year?
The problem that I have is that the battery will always near the maximum capacity and never fully take advantage of the full range of MWh available.
1. Define a dataframe where to store fictitious electricity prices for 365 days
df=pd.DataFrame(np.random.randint(0,500,size=(24, 365)))
2. Define the main parameters
Lookback_window_size=7
Current_day=Lookback_window_size
P_charge=2 #MW
P_discharge=3 #MW
3. Define the class Battery(Env)
class Battery(Env):
metadata = {'render.modes': ['human']}
def __init__(self, df):
#Import the dataframe
self.df = df
# The action space is a 1D array of shape (24,). Since we are simulating day-ahead market, the action space returns
# the overall daily charge / no charge scenario
# action = 1 means that we charge our battery, action = 0 means that we don't charge
self.action_space= spaces.MultiBinary(24)
# The observation space is a 1D array. Given a lookback window size of 1 day, then The first 48 columns represent
# the electricity prices for the current day + all the days before included in the lookback window size.
# The last two columns store SOC (state of charge) at the end of the day and overall cost
# (how much we paid for electricity).
self.observation_shape=(int((Lookback_window_size+1)*24+2),)
self.observation_space = spaces.Box(low = 0, high=np.inf, shape=self.observation_shape, dtype=np.float64)
def _next_observation(self):
# Add the prices of the last days to the monitor matrix
prices=[]
for i in range(self.Current_day - Lookback_window_size,self.Current_day + 1):
prices=np.concatenate([prices,self.df.iloc[0:,i].values])
# Add extra values to monitor such as SOC, cost and day of the week (Monday=1,Tuesday=2,etc.)
extra = [self.SOC, self.Cost]
obs=np.concatenate([prices,extra])
return obs
def _take_action(self, action):
# Being the action space an array, the for loop will check the action at every hour (action[i]) and update the
# cost and the state of charge
self.capacity=200 #MWh
i=0
for x in action:
#When action = 1 then we charge our battery, if action = 0 then we don't charge
if x == 1:
# The cost increase based on the price of the electricity at that hour
self.Cost+=self.df[self.Current_day][i]*P_charge
# If we charge, then the state of charge (SOC) increases as well
self.SOC+=P_charge
# Everyday we discharge the battery always at the same hours
if (i in range(8,14)):
self.SOC-=P_discharge
# if the battery is depleted, then we directly buy electricity from the grid
if self.SOC<0:
self.Cost+=self.df[self.Current_day][i+1]*(-self.SOC)
self.SOC=0
#the battery cannot charge above the capacity threshold.
if self.capacity is not None:
if self.SOC > self.capacity:
# We subtract the latest cost. Since it could not have happened being the SOC above the maximum.
self.Cost-=self.df[self.Current_day][i]*P_charge
# The capacity needs to be set to the threshold
self.SOC = min(self.SOC, self.capacity)
i+=1
def step(self, action):
# Execute one time step within the environment
self._take_action(action)
self.Current_day += 1
# Maximizing the reward means to minimize the costs
reward = - self.Cost
# Stop at the end of the dataframe
done = self.Current_day >= len(self.df.columns)-1
obs = self._next_observation()
return obs, reward, done, {}
def render(self, mode='human', close=False):
print(f'Day: {self.Current_day}')
print(f'SOC: {self.SOC}')
print(f'Cost: {self.Cost}')
print(f'Actions: {action}')
def reset(self):
self.Current_day = Lookback_window_size
# Give an initial SOC value
self.SOC = 50
# Cost at day 0 is null
self.Cost = 0
return self._next_observation()