I want this pipeline to pick up only files newly added under the input pattern and write them to a BigQuery table; as written, it re-reads the existing files as well as the new ones, not only the new files.
import json
import logging

import apache_beam as beam
from apache_beam.io.filesystem import CompressionTypes
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions

# Configure logging
logging.basicConfig(level=logging.INFO)
# Define the pipeline options
pipeline_options = PipelineOptions(
    project='moneyfellows-data',
    runner='DataflowRunner',
    job_name='streaming-bundle-test',
    temp_location='gs://mf-staging-area/temp',
    region='us-central1')
standard_options = pipeline_options.view_as(StandardOptions)
standard_options.streaming = True
# Define and run the pipeline. The "with" block calls run() and
# wait_until_finish() on exit, so there is no need for an explicit run
# loop around pipeline.run(); a streaming job keeps running until it is
# cancelled. input_pattern and output_table are defined elsewhere.
logging.info("Pipeline started")
with beam.Pipeline(options=pipeline_options) as pipeline:
    # Read the contents of the .gz archives using ReadFromText
    # and parse each JSON object using json.loads
    lines = (
        pipeline
        | "ReadFromGCS" >> beam.io.ReadFromText(
            input_pattern, compression_type=CompressionTypes.GZIP)
        | "ParseJSON" >> beam.Map(json.loads)
    )
    # Write the data to BigQuery
    lines | "WriteToBigQuery" >> beam.io.WriteToBigQuery(
        output_table,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
    )
logging.info("Pipeline finished")
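
To get only newly added files, the one-shot bounded ReadFromText needs to be replaced with a transform that keeps watching the pattern. One option in the Python SDK is fileio.MatchContinuously, which re-matches the pattern on an interval and, with its default deduplication, emits each file only once. Below is a minimal sketch, assuming a Beam release that ships fileio.MatchContinuously and reusing pipeline_options, input_pattern, and output_table from above; the 60-second polling interval is an arbitrary choice:

import json

import apache_beam as beam
from apache_beam.io import fileio
from apache_beam.io.filesystem import CompressionTypes

def read_json_lines(readable_file):
    # Open the matched .gz file with decompression and parse one
    # JSON object per line.
    with readable_file.open(compression_type=CompressionTypes.GZIP) as f:
        for line in f.read().splitlines():
            if line.strip():
                yield json.loads(line)

with beam.Pipeline(options=pipeline_options) as pipeline:
    records = (
        pipeline
        # Re-match the pattern every 60 s; deduplication (on by
        # default) means a file that was already matched is not
        # emitted again, so only newly added files flow downstream.
        | "MatchNewFiles" >> fileio.MatchContinuously(input_pattern, interval=60)
        | "ReadMatches" >> fileio.ReadMatches()
        | "ParseJSON" >> beam.FlatMap(read_json_lines)
    )
    records | "WriteToBigQuery" >> beam.io.WriteToBigQuery(
        output_table,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
    )

Note that the first poll will still match files already present under the pattern; deduplication only prevents the same file from being emitted twice. If pre-existing files must be skipped entirely, one option is to filter the emitted metadata by its update time against the job start time, or to drive the pipeline from GCS notifications delivered through Pub/Sub instead of polling.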