1

I want this code to pick up only the files newly added under the input pattern and write them to a BigQuery table; right now it writes both the existing and the new files, not only the new ones.

I want this code to pick up only the files newly added under the input pattern and write them to a BigQuery table. At the moment it writes both the existing files and the new files on every run, instead of only the new files.

# Configure logging so the progress messages emitted below are visible.
logging.basicConfig(level=logging.INFO)

# Define the pipeline options: the job is submitted to the Dataflow runner.
pipeline_options = PipelineOptions(
    project='moneyfellows-data',
    runner='DataflowRunner',
    job_name='streaming-bundle-test',
    temp_location='gs://mf-staging-area/temp',
    region='us-central1')

# Switch the job into streaming mode.
standard_options = pipeline_options.view_as(StandardOptions)
standard_options.streaming = True

# Define the pipeline
with beam.Pipeline(options=pipeline_options) as pipeline:
    # Read the contents of the .gz archive using ReadFromText
    # and parse each JSON object using json.loads
    lines = (
        pipeline
        | "ReadFromGCS" >> beam.io.ReadFromText(input_pattern, compression_type=CompressionTypes.GZIP)
        | "ParseJSON" >> beam.Map(json.loads)
    )

    # Write the data to BigQuery, appending to the table (created if missing).
    lines | "WriteToBigQuery" >> beam.io.WriteToBigQuery(
        output_table,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
    )

# NOTE(review): every pass through this loop runs the same pipeline again, and
# ReadFromText is handed the whole input_pattern each time, so files that were
# already loaded are presumably read and appended again -- this looks like the
# source of the "existing and new files" behaviour. Confirm before relying on it.
while True:
    # Log when the pipeline starts
    logging.info("Pipeline started")

    # Run the pipeline
    pipeline_result = pipeline.run()

    # Log when the pipeline finishes
    # NOTE(review): this message is emitted before wait_until_finish() returns.
    logging.info("Pipeline finished")

    # Wait for the pipeline to finish
    pipeline_result.wait_until_finish()
Mazlum Tosun
  • 5,761
  • 1
  • 9
  • 23

1 Answer

1

If you want to continuously monitor a file directory for new files, you could try this:

import logging

import apache_beam as beam
from apache_beam.io.fileio import EmptyMatchTreatment, MatchContinuously
from apache_beam.io.filesystems import FileSystems

# Only surface errors from the root logger.
logging.getLogger().setLevel(logging.ERROR)

with beam.Pipeline() as p:
    # Re-evaluate the glob every `interval` seconds for matching CSV files.
    csv_pattern = FileSystems.join("./", "*.csv")
    matched_files = p | "MatchContinuously" >> MatchContinuously(
        file_pattern=csv_pattern,
        interval=5,
        empty_match_treatment=EmptyMatchTreatment.DISALLOW,
    )

    # Pair each match result with its path, then print every pair.
    keyed_files = matched_files | "AttachKey" >> beam.Map(
        lambda match: (match.path, match)
    )
    keyed_files | beam.Map(print)

The source of `WatchFilePattern` (https://beam.apache.org/releases/pydoc/current/_modules/apache_beam/ml/inference/utils.html#WatchFilePattern) contains additional steps for loading only the latest files.

XQ Hu
  • 141
  • 4