The following code ingests 10k-20k records per second, and I want to improve its performance. I am reading files in JSON format and ingesting them into a database through Kafka. I am running it on a cluster of five nodes with ZooKeeper and Kafka installed on them.
Can you give me some tips to improve it?
import os
import json
from multiprocessing import Pool
from kafka.client import KafkaClient
from kafka.producer import SimpleProducer
def process_line(line):
    # Note: a new producer is constructed for every line processed
    producer = SimpleProducer(client)
    try:
        jrec = json.loads(line.strip())
        producer.send_messages('twitter2613', json.dumps(jrec))
    except ValueError:
        pass  # skip lines that are not valid JSON
if __name__ == "__main__":
    client = KafkaClient('10.62.84.35:9092')
    myloop = True
    pool = Pool(30)
    direcToData = os.listdir("/FullData/RowData")
    for loop in direcToData:
        mydir2 = os.listdir("/FullData/RowData/" + loop)
        for i in mydir2:
            if myloop:
                with open("/FullData/RowData/" + loop + "/" + i) as source_file:
                    # hand the work to the pool in chunks of 30 lines at a time
                    results = pool.map(process_line, source_file, 30)
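
One direction I have been considering is creating the producer once per worker instead of once per line, and letting the client batch sends. Below is a minimal, untested sketch using the newer kafka-python KafkaProducer API; the worker-initializer pattern, the pool size, and the linger_ms/batch_size values are my own assumptions, not measured settings. Would something like this be the right way to go?

import os
import json
from multiprocessing import Pool
from kafka import KafkaProducer

producer = None  # one producer per worker process, set by the initializer

def init_worker():
    # Runs once in every pool worker, so each worker reuses a single
    # producer instead of constructing one per line.
    global producer
    producer = KafkaProducer(
        bootstrap_servers='10.62.84.35:9092',
        linger_ms=50,          # wait briefly so sends can be batched (assumed value)
        batch_size=64 * 1024)  # larger batches per broker request (assumed value)

def process_line(line):
    try:
        rec = json.loads(line.strip())
        producer.send('twitter2613', json.dumps(rec).encode('utf-8'))
    except ValueError:
        pass  # skip lines that are not valid JSON

if __name__ == '__main__':
    pool = Pool(8, initializer=init_worker)  # pool size is a guess
    root = '/FullData/RowData'
    for sub in os.listdir(root):
        for name in os.listdir(os.path.join(root, sub)):
            with open(os.path.join(root, sub, name)) as source_file:
                pool.map(process_line, source_file, 1000)

I realize this sketch never flushes or closes the per-worker producers on shutdown, so buffered messages could be lost; that would need handling in a real run.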