How can I make this Dataflow streaming pipeline pick up only newly added files that match the input pattern, rather than both new and existing files?

Question

I want this code to process only the files newly added under the input pattern and write them to the BigQuery table. At the moment it loads both the existing files and the new files, not just the new ones.

# Streaming Dataflow job: read gzip-compressed, line-delimited JSON files
# matching `input_pattern` and append the parsed records to the BigQuery
# table `output_table`. Both names are defined outside this snippet.

# Configure logging
logging.basicConfig(level=logging.INFO)

# Define the pipeline options
pipeline_options = PipelineOptions(
    project='moneyfellows-data',
    runner='DataflowRunner',
    job_name='streaming-bundle-test',
    temp_location='gs://mf-staging-area/temp',
    region='us-central1',
)

# Switch the job to streaming mode
standard_options = pipeline_options.view_as(StandardOptions)
standard_options.streaming = True

# Define the pipeline
with beam.Pipeline(options=pipeline_options) as pipeline:
    # Read the contents of the .gz archive using ReadFromText
    # and parse each JSON object using json.loads
    lines = (
        pipeline
        | "ReadFromGCS" >> beam.io.ReadFromText(
            input_pattern, compression_type=CompressionTypes.GZIP)
        | "ParseJSON" >> beam.Map(json.loads)
    )

    # Write the data to BigQuery
    lines | "WriteToBigQuery" >> beam.io.WriteToBigQuery(
        output_table,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
    )

while True:
    # Log when the pipeline starts
    logging.info("Pipeline started")

    # Run the pipeline
    pipeline_result = pipeline.run()

    # Log when the pipeline finishes
    logging.info("Pipeline finished")

    # Wait for the pipeline to finish
    pipeline_result.wait_until_finish()

What are the criteria for you to consider files as new? – Mazlum Tosun, Mar 27 '23 at 15:18

score 1 · Answer 1 · answered Mar 28 '23 at 00:51

If you want to continuously monitor a file directory for new files, you could try this:

import logging

import apache_beam as beam
from apache_beam.io.fileio import EmptyMatchTreatment, MatchContinuously
from apache_beam.io.filesystems import FileSystems

# Only surface ERROR-and-above records from the root logger.
logging.root.setLevel(logging.ERROR)

with beam.Pipeline() as p:
    # Glob for every CSV file in the current directory.
    csv_glob = FileSystems.join("./", "*.csv")

    # Re-evaluate the glob repeatedly (interval=5); an empty match is
    # disallowed rather than silently tolerated.
    matched_files = p | "MatchContinuously" >> MatchContinuously(
        file_pattern=csv_glob,
        interval=5,
        empty_match_treatment=EmptyMatchTreatment.DISALLOW,
    )

    # Key each match result by its path so downstream steps get (path, metadata).
    keyed_files = matched_files | "AttachKey" >> beam.Map(
        lambda metadata: (metadata.path, metadata)
    )

    # Print every keyed element as it arrives.
    keyed_files | beam.Map(print)

The source of `WatchFilePattern` (https://beam.apache.org/releases/pydoc/current/_modules/apache_beam/ml/inference/utils.html#WatchFilePattern) contains additional steps that show how to load only the latest files.

How can I make this Dataflow streaming pipeline pick up only newly added files that match the input pattern, rather than both new and existing files?

1 Answer