I have a Kafka topic from which I consume messages. I first write the data as-is to a JSON file, then read the JSON file back, apply transformations, and write the transformed data to a CSV file.
I tried processing in batches, but the final batch is never processed because its size never reaches the configured batch size.
For example, with a batch size of 100,000 and 251,352 messages in the topic, the script should process the first 100,000 messages, call read_raw_data() to transform them and update the consumer offset, process the next 100,000 messages and call read_raw_data() again, and finally process the remaining 51,352 messages and call read_raw_data() one last time.
How can I achieve this?
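To make the intent concrete, here is a minimal sketch of the flush-on-timeout pattern I am after (consumer, batch_size, and flush() are stand-ins for illustration, not part of my actual code):

# Sketch only: flush full batches as they fill up, and flush the leftover
# partial batch once poll() times out with messages still buffered.
batch = []
while True:
    msg = consumer.poll(10.0)          # returns None on timeout
    if msg is None:
        if batch:                      # partial batch left over: flush it
            flush(batch)               # e.g. write raw file + read_raw_data()
            batch.clear()
        continue
    if msg.error():
        continue
    batch.append(msg)
    if len(batch) >= batch_size:       # full batch: flush it
        flush(batch)
        batch.clear()

Below is my actual script: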
#!/usr/bin/env python
import os
from argparse import ArgumentParser, FileType
from configparser import ConfigParser
from confluent_kafka import Consumer, OFFSET_BEGINNING
import pandas as pd
import datetime
from datetime import timedelta
import json
import time
def parse_args():
"""
Parse command line arguments
"""
parser = ArgumentParser()
parser.add_argument("config_file", type=FileType("r"))
parser.add_argument("--reset", action="store_true")
parser.add_argument("topic_name", help="name of the Kafka topic")
return parser.parse_args()
def parse_config(args):
"""
Parse configuration file
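
    Expected file layout (section names match the code below; keys are
    standard librdkafka settings, values are placeholders):

        [default]
        bootstrap.servers = localhost:9092

        [consumer]
        group.id = my_consumer_group
        auto.offset.reset = earliest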
"""
config_parser = ConfigParser()
config_parser.read_file(args.config_file)
config = dict(config_parser["default"])
config.update(config_parser["consumer"])
return config
def create_consumer(config):
"""
Create and return Consumer instance
"""
consumer = Consumer(config)
return consumer
def reset_offset(consumer, partitions, reset):
"""
Set message offset based on the reset flag
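    Intended to be registered as the on_assign callback of Consumer.subscribe()
    (wrapped in a lambda, since on_assign only passes consumer and partitions)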
"""
if reset:
for p in partitions:
p.offset = OFFSET_BEGINNING
consumer.assign(partitions)
def get_file_name(topic):
"""
    Generate the raw-data file name from the topic and the current date
"""
folder_name = "trimet_raw_data"
if not os.path.exists(folder_name):
os.makedirs(folder_name)
return os.path.join(
folder_name, f"{topic}_{datetime.datetime.now().strftime('%Y-%m-%d')}.json"
)
def write_message_to_file(f, topic, key, value):
"""
Write message to file
"""
if key is None:
key = ""
else:
key = key.decode("utf-8")
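    # Note: the key is decoded but currently unused; only the value is written.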
if value is not None:
value = value.decode("utf-8")
f.write(f"{value}\n")
def consume_messages(consumer, topic, reset, batch_size):
"""
Consume messages from Kafka and write them to a file
"""
while True:
try:
message_count = 0
data_list = []
file_name = get_file_name(topic)
f = open(file_name, "w")
while True:
msg = consumer.poll(2.0)
if msg is None:
print("No new messages, waiting...")
time.sleep(1)
continue
elif msg.error():
print("ERROR: %s".format(msg.error()))
else:
key = msg.key()
value = msg.value()
try:
data_dict = json.loads(value)
data_list.append(data_dict)
message_count += 1
if message_count % 10000 == 0:
print(f"{message_count} messages processed")
except ValueError:
print("Error: Message is not in JSON format")
continue
write_message_to_file(f, topic, key, value)
if reset:
consumer.commit()
                    if message_count > 0 and message_count % batch_size == 0:
                        print(f"{message_count} messages processed. Reading raw data...")
                        f.flush()  # make sure the batch is on disk before reading it back
                        read_raw_data(topic, consumer)
                        f.close()
                        f = open(file_name, "w")  # truncate so the next batch is not re-read
                        message_count = 0
                        data_list = []
                    if message_count > 0:
                        print(f"{message_count} messages processed. Waiting for more messages...")
                        time.sleep(10)
                        msg = consumer.poll(10.0)
                        if msg is not None and not msg.error():
                            key = msg.key()
                            value = msg.value()
                            try:
                                data_dict = json.loads(value)
                                data_list.append(data_dict)
                                message_count += 1
                                print(f"{message_count} messages processed")
                                # the message must also reach the raw file,
                                # or read_raw_data() will never see it
                                write_message_to_file(f, topic, key, value)
                            except ValueError:
                                print("Error: Message is not in JSON format")
                                continue
                        while True:
                            msg = consumer.poll(10.0)
                            if msg is None:
                                break
                            if msg.error():
                                continue
                            key = msg.key()
                            value = msg.value()
                            try:
                                data_dict = json.loads(value)
                                data_list.append(data_dict)
                                message_count += 1
                                print(f"{message_count} messages processed")
                                write_message_to_file(f, topic, key, value)
                            except ValueError:
                                print("Error: Message is not in JSON format")
                                continue
print(f"All messages have been processed. Reading raw data...")
read_raw_data(topic, consumer)
message_count = 0
data_list = []
except KeyboardInterrupt:
print("Closing Consumer")
print("Total message_count : " + str(message_count))
consumer.close()
break
except Exception as e:
print("An error occurred:", str(e))
continue
def data_transform(df):
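    """
    Normalize column names, build a full timestamp from OPD_DATE plus the
    ACT_TIME seconds offset, derive per-trip speed from meter/timestamp
    deltas, and tag each row with a Weekday/Saturday/Sunday service key
    """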
if df["OPD_DATE"].str.contains(r"\d{2}-[A-Za-z]{3}-\d{2}").any():
filtered_df = df.copy()
filtered_df.rename(
columns={
"EVENT_NO_TRIP": "trip_id",
"OPD_DATE": "tstamp",
"VELOCITY": "longitude",
"DIRECTION": "latitude",
"RADIO_QUALITY": "gps_satellites",
"GPS_LONGITUDE": "gps_hdop",
},
inplace=True,
)
filtered_df.columns = filtered_df.columns.str.lower()
else:
filtered_df = df.copy()
filtered_df.rename(
columns={
"EVENT_NO_TRIP": "trip_id",
"OPD_DATE": "tstamp",
"GPS_LONGITUDE": "longitude",
"GPS_LATITUDE": "latitude",
},
inplace=True,
)
filtered_df.columns = filtered_df.columns.str.lower()
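    # OPD_DATE arrives in two shapes: a short date like "25-DEC-22" or a long
    # form like "25DEC2022:14:02:11"; choose the parser by string length.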
filtered_df["tstamp"] = filtered_df["tstamp"].apply(
lambda value: pd.to_datetime(value, format="%d-%b-%y", errors="coerce")
if len(value) <= 11
else pd.to_datetime(value, format="%d%b%Y:%H:%M:%S", errors="coerce")
)
filtered_df["act_time"] = pd.to_numeric(filtered_df["act_time"], errors="coerce")
filtered_df["tstamp"] = filtered_df.apply(
lambda row: row["tstamp"] + timedelta(seconds=row["act_time"])
if pd.notnull(row["tstamp"])
else "",
axis=1,
)
filtered_df = filtered_df.sort_values(["trip_id", "tstamp"])
filtered_df["dmeters"] = filtered_df.groupby(["trip_id"])["meters"].diff()
filtered_df["dtimestamp"] = filtered_df.groupby(["trip_id"])["tstamp"].diff()
# filtered_df["speed"] = filtered_df.apply(
# lambda row: round(row["dmeters"] / row["dtimestamp"].total_seconds(), 2), axis=1
# )
filtered_df["speed"] = filtered_df.apply(
lambda row: round(row["dmeters"] / row["dtimestamp"].total_seconds(), 2)
if row["dtimestamp"].total_seconds() != 0
else 0,
axis=1,
)
filtered_df["speed"] = filtered_df.groupby(["trip_id"])["speed"].fillna(
method="bfill"
)
filtered_df["service_key"] = filtered_df["tstamp"].dt.dayofweek.apply(
lambda day: "Weekday" if day < 5 else ("Saturday" if day == 5 else "Sunday")
)
return filtered_df
def read_raw_data(topic, consumer):
    """
    Read the raw JSON file back, transform it, and append the result to the CSV
    """
    print("Reading raw data...")
csv_filename = "test_csv.csv"
with open(get_file_name(topic), "r") as f:
df = pd.read_json(f, lines=True)
transformed_df = data_transform(df)
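    # Write headers only when creating the CSV; append without headers afterwards.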
if not os.path.isfile(csv_filename):
print("Creating csv file")
transformed_df.to_csv(csv_filename, index=False)
else:
print("Appending to csv file")
transformed_df.to_csv(csv_filename, mode='a', index=False, header=False)
def main():
"""
Main function
"""
args = parse_args()
config = parse_config(args)
consumer = create_consumer(config)
topic = args.topic_name
    # register reset_offset as the on_assign callback so --reset actually
    # rewinds the partitions to the beginning
    consumer.subscribe([topic], on_assign=lambda c, ps: reset_offset(c, ps, args.reset))
    print(f"Consuming messages from topic '{topic}'...")
batch_size = 100000
consume_messages(consumer, topic, args.reset, batch_size)
if __name__ == "__main__":
main()
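
For reference, I run the script like this (the script and config file names are placeholders for my actual paths):

python consumer.py getting_started.ini my_topic --reset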