I have a table, table_a, in BigQuery. It has three columns:

- id: STRING
- source_data: STRING (can be parsed as JSON)
- timestamp: TIMESTAMP

I would like to create a view, table_b, based on table_a that flattens source_data at the first level only.
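To make that concrete, here is a made-up example (the real shape of source_data varies from row to row, so this JSON is purely illustrative). Given a row where source_data is:

{"event": "click", "user": {"id": 42, "name": "alice"}}

I would expect table_b to have event and user columns alongside id and timestamp, with the nested user object kept as a JSON string rather than flattened further.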
Here is the main.py file of my data pipeline (GCP Flex Template):
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions
from apache_beam.io.gcp.internal.clients import bigquery
class ParseJsonFn(beam.DoFn):
    def process(self, element):
        # Parse each row's source_data string into a dict
        import json
        source_data = element['source_data']
        parsed_json = json.loads(source_data)
        return [parsed_json]
class FlattenJsonFn(beam.DoFn):
    def process(self, element):
        import json

        # Flatten at the first level only: nested objects are kept as
        # JSON strings instead of being flattened into dotted keys
        # (dots would not be valid in BigQuery column names anyway).
        def flatten_dict(data):
            items = {}
            for key, value in data.items():
                if isinstance(value, dict):
                    items[key] = json.dumps(value)
                else:
                    items[key] = value
            return items

        return [flatten_dict(element)]
def run():
    # Replace with your project ID and input/output table names
    project_id = "project_id"
    input_table = "table_a"
    output_table = "table_b"

    options = PipelineOptions()
    options.view_as(StandardOptions).runner = 'DirectRunner'  # Change to DataflowRunner for production use

    with beam.Pipeline(options=options) as p:
        data = (
            p
            | "Read from BigQuery" >> beam.io.Read(beam.io.BigQuerySource(query=f"SELECT source_data FROM {input_table}"))
            | "Parse JSON" >> beam.ParDo(ParseJsonFn())
            | "Flatten JSON" >> beam.ParDo(FlattenJsonFn())
        )

        table_schema = bigquery.TableSchema()
        table_schema.fields.append(bigquery.TableFieldSchema(name="field", type="STRING"))
        table_schema.fields.append(bigquery.TableFieldSchema(name="value", type="STRING"))

        data | "Write to BigQuery" >> beam.io.WriteToBigQuery(
            output_table,
            schema=table_schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE
        )

if __name__ == "__main__":
    run()
However, I get the following error:
ValueError: Invalid GCS location: None.
Writing to BigQuery with FILE_LOADS method requires a GCS location to be provided to write files to be loaded into BigQuery. Please provide a GCS bucket through custom_gcs_temp_location in the constructor of WriteToBigQuery or the fallback option --temp_location, or pass method="STREAMING_INSERTS" to WriteToBigQuery. [while running 'Write to BigQuery/BigQueryBatchFileLoads/GenerateFilePrefix']
make: *** [test-template] Error 1
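From the error text, I gather there are three possible fixes, which I've sketched below with a placeholder bucket (gs://my-bucket stands in for a bucket I would create). Is one of these the right approach for a Flex Template, and where should it go?

# Option 1: give WriteToBigQuery its own GCS staging location
data | "Write to BigQuery" >> beam.io.WriteToBigQuery(
    output_table,
    schema=table_schema,
    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
    write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
    custom_gcs_temp_location="gs://my-bucket/bq_temp"  # placeholder bucket
)

# Option 2: pass the pipeline-wide fallback on the command line:
#   python main.py --temp_location=gs://my-bucket/temp

# Option 3: avoid GCS staging entirely with streaming inserts
# (note: I left out WRITE_TRUNCATE here, so this would append)
data | "Write to BigQuery" >> beam.io.WriteToBigQuery(
    output_table,
    schema=table_schema,
    method="STREAMING_INSERTS"
)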
Can anyone help?
Thank you so much!