
I am trying to create JSON files, each containing 100 records, using an Apache Beam pipeline run as a Google Dataflow job.

I am reading records from BigQuery and writing JSON files of 100 records each, i.e. batch_size = 100.

Since the Dataflow job reads 700 records from BigQuery, I expect 7 JSON files to be created, but more files are created and the batch sizes are not as expected.

I expected the finish_bundle method to execute once, but it executes multiple times, thereby creating JSON batch files with fewer than 100 records.

Here are the logs from the current Dataflow execution:

batch id - 1 - length of batch (process) - 100 - file name - my_bucket/29_09_2021/jbatch_20210929_093207.json
batch id - 2 - length of batch (process) - 100 - file name - my_bucket/29_09_2021/jbatch_20210929_093239.json
batch id - 3 - length of batch (finish_bundle) - 43 - file name - my_bucket/29_09_2021/jbatch_20210929_093253.json
batch id - 1 - length of batch (process) - 100 - file name - my_bucket/29_09_2021/jbatch_20210929_093329.json
batch id - 2 - length of batch (finish_bundle) - 66 - file name - my_bucket/29_09_2021/jbatch_20210929_093349.json
batch id - 1 - length of batch (process) - 100 - file name - my_bucket/29_09_2021/jbatch_20210929_093423.json
batch id - 2 - length of batch (process) - 100 - file name - my_bucket/29_09_2021/jbatch_20210929_093454.json
batch id - 3 - length of batch (finish_bundle) - 61 - file name - my_bucket/29_09_2021/jbatch_20210929_093512.json
batch id - 1 - length of batch (finish_bundle) - 30 - file name - my_bucket/29_09_2021/jbatch_20210929_093525.json

I want the JSON batch files created in the correct sequence, with 100 records each, like this:

batch id - 1 - length of batch (process) - 100 - file name - my_bucket/29_09_2021/jbatch_20210929_093207.json
batch id - 2 - length of batch (process) - 100 - file name - my_bucket/29_09_2021/jbatch_20210929_093239.json
batch id - 3 - length of batch (process) - 100 - file name - my_bucket/29_09_2021/jbatch_20210929_093253.json
batch id - 4 - length of batch (process) - 100 - file name - my_bucket/29_09_2021/jbatch_20210929_093329.json
batch id - 5 - length of batch (process) - 100 - file name - my_bucket/29_09_2021/jbatch_20210929_093349.json
batch id - 6 - length of batch (process) - 100 - file name - my_bucket/29_09_2021/jbatch_20210929_093423.json
batch id - 7 - length of batch (finish_bundle) - 100 - file name - my_bucket/29_09_2021/jbatch_20210929_093454.json

Here's my pipeline code that creates the JSON batch files and stores them in a GCS bucket.

import os
import json
import apache_beam as beam
import logging

from datetime import datetime
from apache_beam.options.pipeline_options import PipelineOptions


class CreateJSONBatch:

    def execute_pipeline(self):
        try:
            query = "SELECT id, name, region, country, language, pin-code  FROM `project.dataset.table` LIMIT 700"

            beam_options = {
                "project": "<project>",
                "region": "<region>",
                "job_name": "create_json_batch",
                "runner": "DataflowRunner",
                "temp_location": f"gs://<bucket>/temp_location/",
                "setup_file": "./setup.py"
            }

            options = PipelineOptions(**beam_options, save_main_session=True)

            with beam.Pipeline(options=options) as pipeline:
                raw_data = (
                    pipeline
                    | 'Read from BQ' >> beam.io.ReadFromBigQuery(query=query, use_standard_sql=True)
                )
                _ = (raw_data | 'Create JSON batch files ' >> beam.ParDo(CreateBatch()))

        except Exception as e:
            logging.error(f"Exception in execute_pipeline - {str(e)}")


class CreateBatch(beam.DoFn):

    def __init__(self):
        self.project = None
        self.region = None
        self.batch_size = None
        self.data_bucket = None
        self.json_folder = None
        self.batch_id = 0
        self.json_batch = []

    def get_file_name(self):
        try:
            cur_time = datetime.now()
            date_folder = f"{cur_time.strftime('%d_%m_%Y')}"
            file_name = cur_time.strftime('%Y%m%d_%H%M%S')
            file_name = os.path.join(self.data_bucket, self.json_folder, date_folder, f"jbatch_{file_name}.json")
            return file_name    # file_name -my_bucket/folder_to_store_json_files/29_09_2021/jbatch_20210929_060346.json
        except Exception as e:
            logging.error(f"Exception in CreateBatch.get_file_name - {str(e)}")

    def create_json_files(self, json_file):
        try:
            json_file = f"gs://{json_file}"

            beam_options = {
                "project": self.project,
                "region": self.region,
                "runner": "DataflowRunner",
                "temp_location": f"gs://<bucket>/temp_location/",
                "setup_file": "./setup.py"
            }

            options = PipelineOptions(**beam_options, save_main_session=True)

            with beam.Pipeline(options=options) as pipeline_for_json:
                data = (
                        pipeline_for_json
                        | 'Create pcollection' >> beam.Create(self.json_batch)
                        | 'Write Output' >> beam.io.WriteToText(json_file, shard_name_template='')
                )
        except Exception as e:
            logging.error(f"Exception in CreateBatch.create_json_files - {str(e)}")

    def prep_data(self):
        try:
            formatted_json_batch = []

            for element in self.json_batch:

                modified_element = "<logic to modify the element JSON to the needed format>"

                # sample modified element
                # {
                #     "id": "",
                #     "name": "",
                #     "address": {
                #         "region": "",
                #         "country": "",
                #         "language": "",
                #         "pin-code": ""
                #     }
                # }

                formatted_json_batch.append(json.dumps(modified_element))

            return formatted_json_batch
        except Exception as e:
            logging.error(f"Exception in CreateBatch.prep_data - {str(e)}")

    def process(self, record):
        try:
            self.project = "<project>"
            self.region = "<region>"
            self.batch_size = 100
            self.data_bucket = "my_bucket"
            self.json_folder = "folder_to_store_json_files"

            if len(self.json_batch) < self.batch_size:
                self.json_batch.append(record)
            else:
                self.batch_id = self.batch_id + 1

                file_name = self.get_file_name()

                # prepare for push
                self.json_batch = self.prep_data()

                logging.info(msg=f"batch id - {self.batch_id} - length of batch (process) - {str(len(self.json_batch))} - flie name - {file_name}")

                # write to JSON
                self.create_json_files(file_name)

                self.json_batch = []

                self.json_batch.append(record)
        except Exception as e:
            logging.error(f"Exception in CreateBatch.process - {str(e)}")

    def finish_bundle(self):
        try:
            if len(self.json_batch) > 0:
                self.batch_id = self.batch_id + 1

                file_name = self.get_file_name()

                # prepare for push
                self.json_batch = self.prep_data()

                logging.info(msg=f"batch id - {self.batch_id} - length of batch (finish_bundle) - {str(len(self.json_batch))} - flie name - {file_name}")

                # write to JSON
                self.create_json_files(file_name)
        except Exception as e:
            logging.error(f"Exception in CreateBatch.finish_bundle - {str(e)}")


if __name__ == "__main__":
    create_batch = CreateJSONBatch()
    create_batch.execute_pipeline()

I am not sure why finish_bundle is called multiple times.

What modifications to my pipeline code would make the files get created with the given batch size?

Edit: I tried executing the same program with "DirectRunner", and it created the correct number of files.


1 Answer


When processing a PCollection, its elements are split into an arbitrary number of bundles, which are then executed (generally concurrently on many workers, though a worker may also receive multiple bundles sequentially). start_bundle and finish_bundle are called once per bundle, not once per pipeline, which is why you see multiple undersized files: each bundle flushes its own leftover records. This also explains the DirectRunner observation: the direct runner will often process a small bounded input in a single bundle, so finish_bundle fires only once there. See https://beam.apache.org/documentation/runtime/model/ for more details.
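To get files of exactly 100 records regardless of how the runner bundles elements, one option is Beam's built-in GroupIntoBatches transform, which batches per key across bundle boundaries, combined with writing each batch through Beam's FileSystems API instead of launching a nested pipeline per batch. Below is a minimal sketch; the WriteBatchToGCS DoFn, the output prefix, and the uuid-based file naming are illustrative and not from the original post:

import json
import uuid

import apache_beam as beam
from apache_beam.io.filesystems import FileSystems
from apache_beam.options.pipeline_options import PipelineOptions


class WriteBatchToGCS(beam.DoFn):
    """Illustrative DoFn: writes one batch of records to a single JSON file in GCS."""

    def __init__(self, output_prefix):
        # Hypothetical prefix, e.g. "gs://my_bucket/folder_to_store_json_files"
        self.output_prefix = output_prefix

    def process(self, keyed_batch):
        _, batch = keyed_batch  # GroupIntoBatches emits (key, iterable of values)
        # A uuid in the file name keeps concurrently written files from colliding.
        path = f"{self.output_prefix}/jbatch_{uuid.uuid4().hex}.json"
        with FileSystems.create(path) as f:
            f.write("\n".join(json.dumps(record) for record in batch).encode("utf-8"))
        yield path


query = "SELECT id, name, region, country, language FROM `project.dataset.table` LIMIT 700"
options = PipelineOptions()  # add the Dataflow options from the question to run on Dataflow

with beam.Pipeline(options=options) as pipeline:
    _ = (
        pipeline
        | "Read from BQ" >> beam.io.ReadFromBigQuery(query=query, use_standard_sql=True)
        | "Key records" >> beam.Map(lambda record: ("all", record))  # single dummy key
        | "Batch into 100s" >> beam.GroupIntoBatches(100)
        | "Write batches" >> beam.ParDo(WriteBatchToGCS("gs://my_bucket/folder_to_store_json_files"))
    )

The single dummy key serializes batching onto one worker, which is what makes deterministic 100-record batches possible at all; only the final batch can be smaller. If throughput matters more than a strict global sequence, key records by something like random.randint(0, 9) and accept that batching then happens per key.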
