Implement PPO agent using RAY on custom Gym environment

Question

I've been interested in Reinforcement Learning for a while. I've made a simple PPO agent, using Stable Baselines 3, PyTorch and Gym to trade Bitcoin using Binance data. This was relatively easy and straight forward to setup, get it running and get a somewhat decent model. I've noticed that Stable Baselines 3 isn't very efficient in training (especially on GPU). It takes about 3 weeks to train a model on 500 epochs.

I've read about RAY / RLlib and it seems like a better way to setup Reinforcement Learning agents / enviroments. I've read the RAY documentation for PPO, watched youtube tutorials and even asked GPT-4 for help. Yet nothing seems to work with my custom gym enviroment.

Why did this just work easily in Stable Baselines3 yet it's a struggle to get this to work on RAY? Can't find any good, simple explainations either.

Here's my code, with my enviroment etc. Does someone have a good, simple tutorial or knows what i've been doing wrong?

import datetime
import pandas as pd
import numpy as np
import gym
import requests
from gym import spaces
import ray
from ray import tune
from ray.rllib.algorithms.ppo import PPOConfig

from gym.envs.registration import register

# Constants
SYMBOL = "BTCUSDT"
INTERVAL = "6h"
WINDOW_SIZE = 60
BATCH_SIZE = 128
GAMMA = 0.95
LR = 0.0003
NUM_EPOCHS = 1000
STEPS_PER_EPISODE = 5000
TRADING_FEE = 0.0004
ENV_INITIAL_BALANCE = 1000


# Download Binance data. IMPORTANT: THIS ONLY WORKS OUTSIDE OF US DUE TO API LIMITS FROM BINANCE
def download_data(symbol, interval, start_date, end_date, limit=500):
    data = []
    start_time = int(start_date.timestamp() * 1000)
    end_time = int(end_date.timestamp() * 1000)
    base_url = 'https://api.binance.com/api/v3/klines'

    while start_time < end_time:
        params = {
            'symbol': symbol,
            'interval': interval,
            'startTime': start_time,
            'endTime': end_time,
            'limit': limit
        }
        response = requests.get(base_url, params=params).json()

        if not response:
            break

        for r in response:
            row = [int(r[0]), float(r[1]), float(r[2]), float(r[3]), float(r[4]), float(r[5]), int(r[6]), float(r[7]), int(r[8]), float(r[9]), float(r[10]), float(r[11])]
            data.append(row)

            if int(r[6]) > start_time:
                start_time = int(r[6]) + 1

        if len(response) < limit:
            break

    df = pd.DataFrame(data, columns=["Open time", "Open", "High", "Low", "Close", "Volume", "Close time", "Quote asset volume", "Number of trades", "Taker buy base asset volume", "Taker buy quote asset volume", "Ignore"])
    df['Open time'] = pd.to_datetime(df['Open time'], unit='ms')
    df['Close time'] = pd.to_datetime(df['Close time'], unit='ms')
    df.dropna(inplace=True)

    return df

class TradingEnvironment(gym.Env):
    def __init__(self, data):
        super(TradingEnvironment, self).__init__()
        self.data = data
        self.initial_balance = ENV_INITIAL_BALANCE
        self.balance = self.initial_balance
        self.window_size = WINDOW_SIZE
        self.position = 0
        self.current_step = self.window_size
        self.done = False
        self.portfolio_value = self.balance
        self.action_space = spaces.Discrete(3)
        self.observation_space = spaces.Box(low=-float('inf'), high=float('inf'), shape=(WINDOW_SIZE, 7), dtype='float32')

    def reset(self):
        self.balance = self.initial_balance
        self.position = 0
        self.current_step = self.window_size
        self.done = False
        self.portfolio_value = self.balance
        return self._next_observation()

    def buy(self, current_price):
        amount_to_buy = (self.balance * 1.0) / current_price
        trading_fee = amount_to_buy * current_price * TRADING_FEE
        self.position += amount_to_buy
        self.balance -= amount_to_buy * current_price + trading_fee

    def sell(self, current_price):
        amount_to_sell = self.position * 1.0
        trading_fee = amount_to_sell * current_price * TRADING_FEE
        self.balance += amount_to_sell * current_price - trading_fee
        self.position -= amount_to_sell

    def calculate_portfolio_value(self, current_price):
        return self.balance + self.position * current_price

    def step(self, action):
        current_price = self.data.iloc[self.current_step]["Close"]

        if action == 0:  # Buy
            self.buy(current_price)
        elif action == 1:  # Sell
            self.sell(current_price)
        else:  # Hold
            pass

        self.portfolio_value = self.calculate_portfolio_value(current_price)
        reward = self._get_reward()

        self.current_step += 1
        self.done = self.current_step >= len(self.data) - 1

        obs = self._next_observation()
        return obs, reward, self.done, {}

    def _next_observation(self):
        raw_state = self.data.iloc[self.current_step - self.window_size: self.current_step]
        return raw_state.values

    def _get_reward(self):
        current_price = self.data.iloc[self.current_step]["Close"]
        portfolio_value_before = self.calculate_portfolio_value(current_price)
        trade_cost = self.position * current_price * TRADING_FEE

        if self.portfolio_value > portfolio_value_before - trade_cost:  # if the trade was profitable
            reward = self.portfolio_value / self.initial_balance - 1  # similar to the original reward
        else:  # if the trade was not profitable
            loss = portfolio_value_before - self.portfolio_value
            reward = -loss / self.initial_balance  # penalty proportional to the relative loss

        return reward

    def render(self, mode='human', close=False):
        pass


########################################
# Downloading the data
start_date = datetime.datetime(2015, 1, 1)
end_date = datetime.datetime(2023, 1, 1)
data = download_data(SYMBOL, INTERVAL, start_date, end_date)

env_name = "TradingEnvironment-v0"
register(
    id=env_name,
    entry_point=TradingEnvironment,
)

# Initialize Ray
ray.init(ignore_reinit_error=True)

# Define your config dictionary
config = {
    "env": TradingEnvironment,
    "env_config": {"data": data},
    "gamma": GAMMA,
    "lr": LR,
    "batch_mode": "complete_episodes",
    "train_batch_size": BATCH_SIZE,
    "num_workers": 1,
    "num_gpus": 0,
    "model": {
        "fcnet_hiddens": [128, 128],
    },
}

# Pass the config dictionary to the PPOConfig class
config = PPOConfig(config)

# Train the model using PPO
results = tune.run("PPO", config=config, stop={"training_iteration": NUM_EPOCHS})

# Get the trained policy
trained_agent = PPOTrainer(config=config)
best_checkpoint = results.get_best_checkpoint(trial=results.get_best_trial("episode_reward_mean"))
trained_agent.restore(best_checkpoint)

I've tried reading the RAY documentation, Youtube tutorials and even asked GPT-4 to debug for me. Most tutorials online + GPT-4 give old out-dated coding examples. I really want learn more about Ray / RLlib and build even better, more complex models but before i can do that i can't seem to get it to work with my gym enviroment for some reason.

Alternativly i also heard that using Gymnasium would be better then using Gym?

Someone any idea what i'm doing wrong? Or have good up-to-date tutorial?

Please edit the question to limit it to a specific problem with enough detail to identify an adequate answer. — Community, Aug 13 '23 at 17:47

Implement PPO agent using RAY on custom Gym environment

0 Answers0