
I have a Kafka topic from which I consume messages. I first write the data as-is to a JSON file, then read the JSON file back, apply transformations, and write the transformed data to a CSV file.

I tried processing in batches, but the last batch is skipped because the remaining messages don't add up to a full batch.

For example, if the batch size is 100,000 and there are 251,352 messages in the topic, it should process the first 100,000 messages, call read_raw_data() to update the consumer offset, process the next 100,000 messages, call read_raw_data() again, and finally process the remaining 51,352 messages and call read_raw_data() one last time.

How can I achieve this?
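
Conceptually, the flow I am after is the sketch below (simplified pseudocode of the intent, not my real script; flush_batch() is just a stand-in for writing the JSON file and calling read_raw_data()):

# Sketch of the intended flow: flush every full batch, and flush the
# leftover partial batch once no more messages arrive.
batch = []
while True:
    msg = consumer.poll(2.0)
    if msg is None:
        if batch:
            flush_batch(batch)   # stand-in: write JSON file + read_raw_data()
            batch = []           # e.g. the final 51,352 messages
        continue
    if msg.error():
        continue
    batch.append(msg.value())
    if len(batch) == batch_size:
        flush_batch(batch)       # a full 100,000-message batch
        batch = []

My full script is below: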

#!/usr/bin/env python

import os
from argparse import ArgumentParser, FileType
from configparser import ConfigParser
from confluent_kafka import Consumer, OFFSET_BEGINNING
import pandas as pd
import datetime
from datetime import timedelta
import json
import time


def parse_args():
    """
    Parse command line arguments
    """
    parser = ArgumentParser()
    parser.add_argument("config_file", type=FileType("r"))
    parser.add_argument("--reset", action="store_true")
    parser.add_argument("topic_name", help="name of the Kafka topic")
    return parser.parse_args()


def parse_config(args):
    """
    Parse configuration file
    """
    config_parser = ConfigParser()
    config_parser.read_file(args.config_file)
    config = dict(config_parser["default"])
    config.update(config_parser["consumer"])
    return config


def create_consumer(config):
    """
    Create and return Consumer instance
    """
    consumer = Consumer(config)
    return consumer


def reset_offset(consumer, partitions, reset):
    """
    Set message offset based on the reset flag
    """
    if reset:
        for p in partitions:
            p.offset = OFFSET_BEGINNING
        consumer.assign(partitions)


def get_file_name(topic):
    """
    Generate file name based on topic and file counter
    """
    folder_name = "trimet_raw_data"
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
    return os.path.join(
        folder_name, f"{topic}_{datetime.datetime.now().strftime('%Y-%m-%d')}.json"
    )


def write_message_to_file(f, topic, key, value):
    """
    Write message to file
    """
    if key is None:
        key = ""
    else:
        key = key.decode("utf-8")
    if value is not None:
        value = value.decode("utf-8")
    f.write(f"{value}\n")

        
def consume_messages(consumer, topic, reset, batch_size):
    """
    Consume messages from Kafka and write them to a file
    """
    while True:
        try:
            message_count = 0
            data_list = []
            file_name = get_file_name(topic)
            f = open(file_name, "w")
            while True:
                msg = consumer.poll(2.0)
                if msg is None:
                    print("No new messages, waiting...")
                    time.sleep(1)
                    continue
                elif msg.error():
                    print("ERROR: %s".format(msg.error()))
                else:
                    key = msg.key()
                    value = msg.value()
                    try:
                        data_dict = json.loads(value)
                        data_list.append(data_dict)
                        message_count += 1
                        if message_count % 10000 == 0:
                            print(f"{message_count} messages processed")
                    except ValueError:
                        print("Error: Message is not in JSON format")
                        continue
                write_message_to_file(f, topic, key, value)
                if reset:
                    consumer.commit()
                if message_count > 0 and message_count % batch_size == 0:
                    print(f"{message_count} messages processed. Reading raw data...")
                    read_raw_data(topic, consumer)
                    message_count = 0
                    data_list = []
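            # NOTE: the inner `while True` above never breaks, so everything
            # from here down is never reached; this is where I expected the
            # final partial batch (the remaining 51,352 messages) to be handled.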
            if message_count > 0:
                print(f"{message_count} messages processed. Waiting for more messages...")
                time.sleep(10)
                msg = consumer.poll(10.0)
                if msg is not None:
                    key = msg.key()
                    value = msg.value()
                    try:
                        data_dict = json.loads(value)
                        data_list.append(data_dict)
                        message_count += 1
                        print(f"{message_count} messages processed")
                    except ValueError:
                        print("Error: Message is not in JSON format")
                        continue
                    while True:
                        msg = consumer.poll(10.0)
                        if msg is None:
                            break
                        key = msg.key()
                        value = msg.value()
                        try:
                            data_dict = json.loads(value)
                            data_list.append(data_dict)
                            message_count += 1
                            print(f"{message_count} messages processed")
                        except ValueError:
                            print("Error: Message is not in JSON format")
                            continue
                    print(f"All messages have been processed. Reading raw data...")
                    read_raw_data(topic, consumer)
                    message_count = 0
                    data_list = []
        except KeyboardInterrupt:
            print("Closing Consumer")
            print("Total message_count : " + str(message_count))
            consumer.close()
            break
        except Exception as e:
            print("An error occurred:", str(e))
            continue


def data_transform(df):
    if df["OPD_DATE"].str.contains(r"\d{2}-[A-Za-z]{3}-\d{2}").any():
        filtered_df = df.copy()
        filtered_df.rename(
            columns={
                "EVENT_NO_TRIP": "trip_id",
                "OPD_DATE": "tstamp",
                "VELOCITY": "longitude",
                "DIRECTION": "latitude",
                "RADIO_QUALITY": "gps_satellites",
                "GPS_LONGITUDE": "gps_hdop",
            },
            inplace=True,
        )
        filtered_df.columns = filtered_df.columns.str.lower()
    else:
        filtered_df = df.copy()
        filtered_df.rename(
            columns={
                "EVENT_NO_TRIP": "trip_id",
                "OPD_DATE": "tstamp",
                "GPS_LONGITUDE": "longitude",
                "GPS_LATITUDE": "latitude",
            },
            inplace=True,
        )
        filtered_df.columns = filtered_df.columns.str.lower()

    filtered_df["tstamp"] = filtered_df["tstamp"].apply(
        lambda value: pd.to_datetime(value, format="%d-%b-%y", errors="coerce")
        if len(value) <= 11
        else pd.to_datetime(value, format="%d%b%Y:%H:%M:%S", errors="coerce")
    )
    filtered_df["act_time"] = pd.to_numeric(filtered_df["act_time"], errors="coerce")
    filtered_df["tstamp"] = filtered_df.apply(
        lambda row: row["tstamp"] + timedelta(seconds=row["act_time"])
        if pd.notnull(row["tstamp"])
        else "",
        axis=1,
    )
    filtered_df = filtered_df.sort_values(["trip_id", "tstamp"])
    filtered_df["dmeters"] = filtered_df.groupby(["trip_id"])["meters"].diff()
    filtered_df["dtimestamp"] = filtered_df.groupby(["trip_id"])["tstamp"].diff()
    # filtered_df["speed"] = filtered_df.apply(
    #     lambda row: round(row["dmeters"] / row["dtimestamp"].total_seconds(), 2), axis=1
    # )
    filtered_df["speed"] = filtered_df.apply(
        lambda row: round(row["dmeters"] / row["dtimestamp"].total_seconds(), 2)
        if row["dtimestamp"].total_seconds() != 0
        else 0,
        axis=1,
    )

    filtered_df["speed"] = filtered_df.groupby(["trip_id"])["speed"].fillna(
        method="bfill"
    )
    filtered_df["service_key"] = filtered_df["tstamp"].dt.dayofweek.apply(
        lambda day: "Weekday" if day < 5 else ("Saturday" if day == 5 else "Sunday")
    )
    return filtered_df



def read_raw_data(topic, consumer):
    print("read_raw_data")
    csv_filename = "test_csv.csv"
    
    with open(get_file_name(topic), "r") as f:
        df = pd.read_json(f, lines=True)
    transformed_df = data_transform(df)
    
    if not os.path.isfile(csv_filename):
        print("Creating csv file")
        transformed_df.to_csv(csv_filename, index=False)
    else:
        print("Appending to csv file")
        transformed_df.to_csv(csv_filename, mode='a', index=False, header=False)

def main():
    """
    Main function
    """
    args = parse_args()
    config = parse_config(args)
    consumer = create_consumer(config)
    topic = args.topic_name
    consumer.subscribe([topic])
    print("consume_messages")
    batch_size = 100000
    consume_messages(consumer, topic, args.reset, batch_size)

if __name__ == "__main__":
    main()
Comment: Spark Structured Streaming can naturally use dataframes from Kafka events. You don't have to use Confluent library and pandas together. In any case, I don't really understand the problem. What's wrong with your code? Do you have unit tests to verify your logic? – OneCricketeer May 14 '23 at 16:06
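
For reference, the Spark Structured Streaming route suggested in the comment would look roughly like the sketch below (assumptions: the spark-sql-kafka connector package is available, and the broker address, topic name, and output/checkpoint paths are placeholders):

# Minimal sketch of reading the Kafka topic into a streaming DataFrame
# with Spark Structured Streaming instead of confluent_kafka + pandas.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.appName("trimet-stream").getOrCreate()

raw = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")  # placeholder broker
    .option("subscribe", "my_topic")                       # placeholder topic
    .load()
)

# Kafka values arrive as bytes; cast to a string column before transforming.
values = raw.select(col("value").cast("string").alias("json_value"))

query = (
    values.writeStream
    .format("json")
    .option("path", "spark_output/")                       # placeholder path
    .option("checkpointLocation", "spark_checkpoint/")     # placeholder path
    .start()
)
query.awaitTermination()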
