What is the best way to store streaming data from different stock exchanges in order to minimise data weight?
Right now I'm using CCXT library on Python and in order to get current order book information and save in into Parquet type file using code below:
But everytime I save data my parquet file getting bigger and bigger and opening it everytime in order to save data using to_parquet() function is making my code speed go slower. Because of it as a time goes by I get a huge time lags in my dataframe.
Maybe you can recommend some different ways to store such type of data?
import ccxt.pro
import asyncio
import datetime
import pandas as pd
import numpy as np
import os
async def watch_book(exchange, ticker):
last = None
limits = {'Binance': 1000,
'Huobi': 150}
columns = ['bids_price', 'bids_value', 'asks_price', 'asks_value', 'time']
while True:
try:
bids, asks = {}, {}
orderbook = await exchange.watch_order_book(ticker)
now = datetime.datetime.now()
for b in orderbook['bids']:
bids[str(float(b[0]))] = float(b[1])
for a in orderbook['asks']:
asks[str(float(a[0]))] = float(a[1])
x = np.empty(limits[exchange.name])
x.fill(orderbook['timestamp'])
data_table = pd.DataFrame([bids.keys(), bids.values(), asks.keys(), asks.values(), x], index=columns).T.dropna(axis=1, how='all').replace({None: np.nan}).fillna(0).astype(float).set_index('time')
if os.path.exists(f'{exchange.name}_snap.parquet.gzip'):
data_table.to_parquet(f'{exchange.name}_snap.parquet.gzip', engine='fastparquet', append=True,
compression='gzip')
else:
data_table.to_parquet(f'{exchange.name}_snap.parquet.gzip', engine='fastparquet', compression='gzip')
except Exception as e:
print(f'{exchange.name} failed {type(e)} {e}')
async def main():
exchange_ids = ['binance']
exchanges = [getattr(ccxt.pro, exchange_id)() for exchange_id in exchange_ids]
try:
done, pending = await asyncio.wait({watch_book(exchange, 'BTC/USDT') for exchange in exchanges}, return_when=asyncio.FIRST_EXCEPTION)
for completed in done:
# trigger the exception here
completed.result()
except Exception as e:
print(f'closing all exchanges because of exception {type(e)} {e}')
await asyncio.gather(*[exchange.close() for exchange in exchanges])
asyncio.run(main())
I have tried using parquet data type file to store my data