I am trying to insert a pandas dataframe into cassandra. I am using the execute_concurrent, but I don't see any improvement. It is taking almost 5s per row insertions. There are 14k rows so at this rate it will take more than 15 hours. I have 12 GB RAM with 2 CPU cores. How fast can I run this operation? I've tried with different concurrency numbers but without any success. Following is my code-:
from flask import session
import yaml
import pandas as pd
import argparse
from get_data import read_params
import cassandra
from cassandra.concurrent import execute_concurrent_with_args, execute_concurrent
from cassandra.cluster import Cluster, ExecutionProfile
from cassandra.auth import PlainTextAuthProvider
import sys
import time
def progressbar(it, prefix="", size=60, out=sys.stdout): # Python3.3+
count = len(it)
def show(j):
x = int(size*j/count)
print("{}[{}{}] {}/{}".format(prefix, u"█"*x, "."*(size-x), j, count),
end='\r', file=out, flush=True)
show(0)
for i, item in enumerate(it):
yield item
show(i+1)
print("\n", flush=True, file=out)
def cassandraDBLoad(config_path):
try:
config = read_params(config_path)
execution_profile = ExecutionProfile(request_timeout=10)
cassandra_config = {'secure_connect_bundle': "path"}
auth_provider = PlainTextAuthProvider(
"client_id",
"client_secret"
)
cluster = Cluster(cloud=cassandra_config, auth_provider=auth_provider)
session = cluster.connect()
session.default_timeout = None
connect_db = session.execute("select release_version from system.local")
set_keyspace = session.set_keyspace("Keyspace Name")
table_ = "big_mart"
define_columns = "Item_Identifier varchar PRIMARY KEY, Item_Weight varchar, Item_Fat_Content varchar, Item_Visibility varchar, Item_Type varchar, Item_MRP varchar, Outlet_Identifier varchar, Outlet_Establishment_Year varchar, Outlet_Size varchar, Outlet_Location_type varchar, Outlet_Type varchar, Item_Outlet_Sales varchar, source varchar"
drop_table = f"DROP TABLE IF EXISTS {table_}"
drop_result = session.execute(drop_table)
create_table = f"CREATE TABLE {table_}({define_columns});"
table_result = session.execute(create_table)
train = pd.read_csv("train_source")
test = pd.read_csv("test_source")
#Combine test and train into one file
train['source']='train'
test['source']='test'
df = pd.concat([train, test],ignore_index=True)
df = df.fillna('NA')
columns = "Item_Identifier, Item_Weight, Item_Fat_Content, Item_Visibility, Item_Type, Item_MRP, Outlet_Identifier, Outlet_Establishment_Year, Outlet_Size, Outlet_Location_Type, Outlet_Type, Item_Outlet_Sales, source"
insert_qry = f"INSERT INTO {table_}({columns}) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)"
statement = session.prepare(insert_qry)
parameters = [
(str(df.iat[i,0]), str(df.iat[i,1]), str(df.iat[i,2]), str(df.iat[i,3]),
str(df.iat[i,4]), str(df.iat[i,5]), str(df.iat[i,6]), str(df.iat[i,7]),
str(df.iat[i,8]), str(df.iat[i,9]), str(df.iat[i,10]), str(df.iat[i,11]),
str(df.iat[i,12]))
for i in range(len(df))]
for i in progressbar(range(len(df)), "Computing: ", 40):
time.sleep(0.1)
execute_concurrent_with_args(
session,
statement,
parameters,
concurrency=500
)
session.execute(batch)
except Exception as e:
raise Exception("(cassandraDBLoad): Something went wrong in the CassandraDB Load operations\n" + str(e))
csv files link - https://drive.google.com/drive/folders/1O03lNTMfSwhUKG61zOs7fNxXIRe44GRp?usp=sharing