I've been interested in Reinforcement Learning for a while. I've built a simple PPO agent with Stable Baselines 3, PyTorch and Gym that trades Bitcoin using Binance data. This was relatively easy and straightforward to set up, get running, and train into a somewhat decent model. However, I've noticed that Stable Baselines 3 isn't very efficient at training (especially on GPU): it takes about 3 weeks to train a model for 500 epochs.
I've read about Ray / RLlib and it seems like a better way to set up Reinforcement Learning agents / environments. I've read the Ray documentation for PPO, watched YouTube tutorials and even asked GPT-4 for help. Yet nothing seems to work with my custom Gym environment.
Why did this just work easily in Stable Baselines 3, yet it's such a struggle to get it working with Ray? I can't find any good, simple explanations either.
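For reference, the Stable Baselines 3 side was basically just this (a simplified sketch from memory, reusing the same environment and constants defined in the full script below; my real training script has a bit more around it):

from stable_baselines3 import PPO

# Simplified sketch of my SB3 training run (details may differ slightly from my actual script)
env = TradingEnvironment(data)
model = PPO("MlpPolicy", env, learning_rate=LR, gamma=GAMMA, verbose=1)
model.learn(total_timesteps=NUM_EPOCHS * STEPS_PER_EPISODE)
model.save("ppo_btc_sb3")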
Here's my code, including my environment. Does anyone have a good, simple tutorial, or know what I've been doing wrong?
import datetime
import pandas as pd
import numpy as np
import gym
import requests
from gym import spaces
import ray
from ray import tune
from ray.rllib.algorithms.ppo import PPOConfig
from gym.envs.registration import register
# Constants
SYMBOL = "BTCUSDT"
INTERVAL = "6h"
WINDOW_SIZE = 60
BATCH_SIZE = 128
GAMMA = 0.95
LR = 0.0003
NUM_EPOCHS = 1000
STEPS_PER_EPISODE = 5000
TRADING_FEE = 0.0004
ENV_INITIAL_BALANCE = 1000
# Download Binance data. IMPORTANT: THIS ONLY WORKS OUTSIDE OF US DUE TO API LIMITS FROM BINANCE
def download_data(symbol, interval, start_date, end_date, limit=500):
    data = []
    start_time = int(start_date.timestamp() * 1000)
    end_time = int(end_date.timestamp() * 1000)
    base_url = 'https://api.binance.com/api/v3/klines'
    while start_time < end_time:
        params = {
            'symbol': symbol,
            'interval': interval,
            'startTime': start_time,
            'endTime': end_time,
            'limit': limit
        }
        response = requests.get(base_url, params=params).json()
        if not response:
            break
        for r in response:
            row = [int(r[0]), float(r[1]), float(r[2]), float(r[3]), float(r[4]), float(r[5]), int(r[6]), float(r[7]), int(r[8]), float(r[9]), float(r[10]), float(r[11])]
            data.append(row)
            if int(r[6]) > start_time:
                start_time = int(r[6]) + 1
        if len(response) < limit:
            break
    df = pd.DataFrame(data, columns=["Open time", "Open", "High", "Low", "Close", "Volume", "Close time", "Quote asset volume", "Number of trades", "Taker buy base asset volume", "Taker buy quote asset volume", "Ignore"])
    df['Open time'] = pd.to_datetime(df['Open time'], unit='ms')
    df['Close time'] = pd.to_datetime(df['Close time'], unit='ms')
    df.dropna(inplace=True)
    return df
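# (Throwaway sanity check I sometimes use for the downloader, not part of the training run --
#  fetch a short range and eyeball the output; the dates here are just an example:)
#   sample = download_data(SYMBOL, INTERVAL, datetime.datetime(2022, 1, 1), datetime.datetime(2022, 1, 7))
#   print(sample.shape)
#   print(sample.head())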
class TradingEnvironment(gym.Env):
    def __init__(self, data):
        super(TradingEnvironment, self).__init__()
        self.data = data
        self.initial_balance = ENV_INITIAL_BALANCE
        self.balance = self.initial_balance
        self.window_size = WINDOW_SIZE
        self.position = 0
        self.current_step = self.window_size
        self.done = False
        self.portfolio_value = self.balance
        self.action_space = spaces.Discrete(3)
        self.observation_space = spaces.Box(low=-float('inf'), high=float('inf'), shape=(WINDOW_SIZE, 7), dtype='float32')

    def reset(self):
        self.balance = self.initial_balance
        self.position = 0
        self.current_step = self.window_size
        self.done = False
        self.portfolio_value = self.balance
        return self._next_observation()

    def buy(self, current_price):
        amount_to_buy = (self.balance * 1.0) / current_price
        trading_fee = amount_to_buy * current_price * TRADING_FEE
        self.position += amount_to_buy
        self.balance -= amount_to_buy * current_price + trading_fee

    def sell(self, current_price):
        amount_to_sell = self.position * 1.0
        trading_fee = amount_to_sell * current_price * TRADING_FEE
        self.balance += amount_to_sell * current_price - trading_fee
        self.position -= amount_to_sell

    def calculate_portfolio_value(self, current_price):
        return self.balance + self.position * current_price

    def step(self, action):
        current_price = self.data.iloc[self.current_step]["Close"]
        if action == 0:  # Buy
            self.buy(current_price)
        elif action == 1:  # Sell
            self.sell(current_price)
        else:  # Hold
            pass
        self.portfolio_value = self.calculate_portfolio_value(current_price)
        reward = self._get_reward()
        self.current_step += 1
        self.done = self.current_step >= len(self.data) - 1
        obs = self._next_observation()
        return obs, reward, self.done, {}

    def _next_observation(self):
        raw_state = self.data.iloc[self.current_step - self.window_size: self.current_step]
        return raw_state.values

    def _get_reward(self):
        current_price = self.data.iloc[self.current_step]["Close"]
        portfolio_value_before = self.calculate_portfolio_value(current_price)
        trade_cost = self.position * current_price * TRADING_FEE
        if self.portfolio_value > portfolio_value_before - trade_cost:  # if the trade was profitable
            reward = self.portfolio_value / self.initial_balance - 1  # similar to the original reward
        else:  # if the trade was not profitable
            loss = portfolio_value_before - self.portfolio_value
            reward = -loss / self.initial_balance  # penalty proportional to the relative loss
        return reward

    def render(self, mode='human', close=False):
        pass
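One thing I suspect, but haven't been able to confirm, is that newer RLlib versions expect the Gymnasium API (constructor taking a single env_config dict, reset returning (obs, info), step returning five values), whereas my class takes a DataFrame directly and uses the old Gym signatures. This adapter is what I pieced together from the Gymnasium migration notes; I'm not sure it's what RLlib actually wants:

# Sketch of a Gymnasium-style adapter around my env (pieced together from the docs,
# untested / possibly wrong): constructor takes one config dict, reset returns (obs, info),
# step returns (obs, reward, terminated, truncated, info).
import gymnasium

class TradingEnvironmentGymnasium(gymnasium.Env):
    def __init__(self, config=None):
        super().__init__()
        config = config or {}
        self._env = TradingEnvironment(config["data"])  # reuse the env above internally
        self.action_space = gymnasium.spaces.Discrete(3)
        # mirrors the space declared in TradingEnvironment above
        self.observation_space = gymnasium.spaces.Box(
            low=-np.inf, high=np.inf, shape=(WINDOW_SIZE, 7), dtype=np.float32)

    def reset(self, *, seed=None, options=None):
        super().reset(seed=seed)
        obs = self._env.reset()
        return obs, {}  # Gymnasium reset returns (obs, info)

    def step(self, action):
        obs, reward, done, info = self._env.step(action)
        # old-style "done" becomes "terminated"; I never truncate episodes myself
        return obs, reward, done, False, info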
########################################
# Downloading the data
start_date = datetime.datetime(2015, 1, 1)
end_date = datetime.datetime(2023, 1, 1)
data = download_data(SYMBOL, INTERVAL, start_date, end_date)
env_name = "TradingEnvironment-v0"
register(
    id=env_name,
    entry_point=TradingEnvironment,
)
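# I also found ray.tune.registry.register_env in the RLlib docs; I'm not sure whether
# I need it instead of (or on top of) gym's register() above, so noting it here:
from ray.tune.registry import register_env
register_env(env_name, lambda env_config: TradingEnvironment(env_config["data"]))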
# Initialize Ray
ray.init(ignore_reinit_error=True)
# Define your config dictionary
config = {
    "env": TradingEnvironment,
    "env_config": {"data": data},
    "gamma": GAMMA,
    "lr": LR,
    "batch_mode": "complete_episodes",
    "train_batch_size": BATCH_SIZE,
    "num_workers": 1,
    "num_gpus": 0,
    "model": {
        "fcnet_hiddens": [128, 128],
    },
}
# Pass the config dictionary to the PPOConfig class
config = PPOConfig(config)
# Train the model using PPO
results = tune.run("PPO", config=config, stop={"training_iteration": NUM_EPOCHS})
# Get the trained policy
trained_agent = PPOTrainer(config=config)
best_checkpoint = results.get_best_checkpoint(trial=results.get_best_trial("episode_reward_mean"))
trained_agent.restore(best_checkpoint)
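For completeness, here is the closest I've got to the newer builder-style API from the current docs. I couldn't get this variant to run with my environment either, so treat it as a sketch of what I've been trying rather than working code:

# My best guess at the newer builder-style RLlib API (pieced together from the docs;
# untested, and probably still wrong for my env -- shown only so you can see my attempts)
ppo_config = (
    PPOConfig()
    .environment(env=TradingEnvironment, env_config={"data": data})
    .framework("torch")
    .training(gamma=GAMMA, lr=LR, train_batch_size=BATCH_SIZE,
              model={"fcnet_hiddens": [128, 128]})
    .rollouts(num_rollout_workers=1)
    .resources(num_gpus=0)
)
algo = ppo_config.build()
for i in range(NUM_EPOCHS):
    result = algo.train()
    print(i, result["episode_reward_mean"])
checkpoint_path = algo.save()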
I've tried reading the Ray documentation, watching YouTube tutorials, and even asking GPT-4 to debug this for me. Most tutorials online (and GPT-4) give old, outdated code examples. I really want to learn more about Ray / RLlib and build better, more complex models, but first I need to get it working with my Gym environment, and for some reason I can't.
Alternatively, I've also heard that using Gymnasium would be better than using Gym?
Does anyone have an idea what I'm doing wrong, or know of a good, up-to-date tutorial?