I would like to join two files (expeditions- 2010s.csv and peaks.csv) on the key "peakid" with CoGroupByKey. However, the pipeline fails when I sink the joined result to BigQuery:

RuntimeError: BigQuery job beam_bq_job_LOAD_AUTOMATIC_JOB_NAME_LOAD_STEP_88_215864ba592a2e01f0c4e2157cc60c47_86e3562707f348c29b2a030cb6ed7ded failed. Error Result: <ErrorProto location: 'gs://bucket-name/input/temp/bq_load/ededcfb43cda4d16934011481e2fd774/project_name.dataset.expeditions/9fe30f70-8473-44bc-86d5-20dfdf59f502' message: 'Error while reading data, error message: JSON table encountered too many errors, giving up. Rows: 1; errors: 1. Please look into the errors[] collection for more details. File: gs://bucket-name/input/temp/bq_load/ededcfb43cda4d16934011481e2fd774/project_name.dataset.expeditions/9fe30f70-8473-44bc-86d5-20dfdf59f502' reason: 'invalid'> [while running 'Write To BigQuery/BigQueryBatchFileLoads/WaitForDestinationLoadJobs']
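From the invalid-JSON load error, I suspect the elements I hand to WriteToBigQuery are not in the shape the sink expects. As far as I understand, each element should be a plain dict whose keys match the schema fields, roughly like this (values made up for illustration):

{'peakid': 'EVER', 'bcdate': '2010-04-05', 'pkname': 'Everest', 'heightm': 8849}

but I am not sure the output of CoGroupByKey has that shape.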
Please review the code below:
import argparse
import logging

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions


def read_csv_pd_input1(readable_file):
    import pandas as pd
    import csv
    import io
    gcs_file = beam.io.filesystems.FileSystems.open(readable_file)
    csv_dict = csv.DictReader(io.TextIOWrapper(gcs_file))
    df = pd.DataFrame(csv_dict)
    df = df[['peakid', 'bcdate', 'smtdate']]
    a = df.set_index('peakid')[['bcdate', 'smtdate']].apply(tuple, 1).to_dict()
    a = tuple(a.items())
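    # My understanding: `a` is now a tuple of (peakid, (bcdate, smtdate)) pairs,
    # e.g. (('EVER', ('2010-04-05', '2010-05-23')), ...) with made-up values here,
    # so the FlatMap in the pipeline emits one (key, value) element per CSV row.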
    # result: only column name
    # a = df.agg(lambda x: (x.values)).apply(tuple)

    # result: only value but not as expected
    # a = [tuple(x) for x in df.values]
    # a = tuple(a)
    return a


def read_csv_pd_input3(readable_file):
    import pandas as pd
    import csv
    import io
    gcs_file = beam.io.filesystems.FileSystems.open(readable_file)
    csv_dict = csv.DictReader(io.TextIOWrapper(gcs_file))
    df = pd.DataFrame(csv_dict)
    df = df[['peakid', 'pkname', 'heightm']]
    a = df.set_index('peakid')[['pkname', 'heightm']].apply(tuple, 1).to_dict()
    a = tuple(a.items())
    return a


def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        required=False,
        help='Input file to read. This can be a local file or '
             'a file in a Google Storage Bucket.',
        default='gs://bucket-name/input/expeditions- 2010s.csv')
    parser.add_argument(
        '--input3',
        dest='input3',
        required=False,
        help='Input_p3 file to read. This can be a local file or '
             'a file in a Google Storage Bucket.',
        default='gs://bucket-name/input/peaks.csv')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    p = beam.Pipeline(options=pipeline_options)

    input_p1 = (
        p
        | 'Read From GCS input1' >> beam.Create([known_args.input])
        | 'Pair each expedition with key p1' >> beam.FlatMap(read_csv_pd_input1)
        # | beam.Map(print)
    )
    input_p3 = (
        p
        | 'Read From GCS input3' >> beam.Create([known_args.input3])
        | 'Pair each peak with key p3' >> beam.FlatMap(read_csv_pd_input3)
    )

    # CoGroupByKey: relational join of two or more keyed PCollections;
    # it also accepts a dict of {tag: PCollection}.
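    # Per the Beam docs, I believe each joined element looks roughly like
    # ('EVER', {'input_p1': [('2010-04-05', '2010-05-23')], 'input_p3': [('Everest', '8849')]})
    # (peakid and values made up for illustration), rather than a flat dict.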
    output = (
        {'input_p1': input_p1, 'input_p3': input_p3}
        | 'Join' >> beam.CoGroupByKey()
        | 'Write To BigQuery' >> beam.io.gcp.bigquery.WriteToBigQuery(
            table='project_name:dataset.expeditions',
            schema='peakid:STRING,bcdate:DATE,pkname:STRING,heightm:INTEGER',
            method='FILE_LOADS',
            custom_gcs_temp_location='gs://bucket-name/input/temp',
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
    )

    p.run().wait_until_finish()
    # runner = DataflowRunner()
    # runner.run_pipeline(p, options=options)


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()