0

I am trying to insert new data into a table already created in BigQuery. After processing the data and creating the data frame, I want to add the new rows to the table. I am looking for an efficient way of doing it.

Currently what I am doing is

def export_items_to_bigquery(ID_recipien, action_date_hour, monday, tuesday, wednesday, thursday, friday, saturday, sunday, click_action, read_action, converted, imedeate_response, weight, weight_mu, processing_date, client=None):
  """Stream one row into the ``internal.Processed_data`` BigQuery table.

  Parameters mirror the table's columns, in insert order.  ``client`` is an
  optional, pre-built ``bigquery.Client``; passing one avoids re-creating a
  client (and re-fetching the table) on every call, which is the main source
  of the slowness when this function is invoked once per row.

  Returns the list of insert errors reported by the API (empty on success).
  """
  # Reuse the caller's client when given; otherwise fall back to the
  # original per-call behavior.
  bigquery_client = client or bigquery.Client()
  table_ref = bigquery_client.dataset('internal').table('Processed_data')
  table = bigquery_client.get_table(table_ref)  # API call
  # BUG FIX: the original tuple ended with `processing_dat`, an undefined
  # name (NameError at runtime); the parameter is `processing_date`.
  rows_to_insert = [(ID_recipien, action_date_hour, monday, tuesday,
                     wednesday, thursday, friday, saturday, sunday,
                     click_action, read_action, converted, imedeate_response,
                     weight, weight_mu, processing_date)]
  errors = bigquery_client.insert_rows(table, rows_to_insert)  # API request
  print(errors)
  return errors

# Walk the result frame row by row and push each row to BigQuery.
# NOTE(review): this is the inefficient path the question describes —
# one client + one get_table + one streaming insert per row.  Batching
# the rows into a single insert (or a load job) is the fix.
# NOTE(review): the call below passes 21 positional arguments, but
# export_items_to_bigquery is defined with 16 parameters, and the
# argument names (hrs, last_id_send, click_count, ...) do not match the
# parameter names (ID_recipien, action_date_hour, monday, ...).  As
# written this raises TypeError — confirm the intended column mapping
# before relying on this loop.
for i in range(len(out)): 
    # Each field is pulled from the i-th row and coerced to a plain
    # Python int/str so the BigQuery client can serialize it.
    hrs=int(out.iloc[i].hours)
    ID_recipient=int(out.iloc[i].ID_recipient)
    Last_Date=str(out.iloc[i].Last_Date)
    last_id_send=int(out.iloc[i].last_id_send)
    unique_send_id_action=int(out.iloc[i].unique_send_id_action)
    click_count=int(out.iloc[i].click_count)
    read_count=int(out.iloc[i].read_count)
    Monday=int(out.iloc[i].Monday)
    Tuesday=int(out.iloc[i].Tuesday)
    Wednesday=int(out.iloc[i].Wednesday)
    Thursday=int(out.iloc[i].Thursday)
    Friday=int(out.iloc[i].Friday)
    Saturday=int(out.iloc[i].Saturday)
    Sunday=int(out.iloc[i].Sunday)
    Total_day_active=int(out.iloc[i].Total_day_active)
    click_read_mulrtiplier=int(out.iloc[i].click_read_mulrtiplier)
    click_read_weight=int(out.iloc[i].click_read_weight)
    final_weight=int(out.iloc[i].final_weight)
    last_hour=int(out.iloc[i].last_hour)
    last_email=int(out.iloc[i].last_email)
    last_log_id=int(out.iloc[i].last_log_id)
    # One network round-trip per row — see NOTE(review) above.
    export_items_to_bigquery(hrs,ID_recipient,Last_Date, last_id_send, unique_send_id_action,click_count, read_count, Monday, Tuesday, Wednesday, Thursday, Friday, Saturday, Sunday, Total_day_active, click_read_mulrtiplier, click_read_weight, final_weight, last_hour, last_email,last_log_id)

This is very inefficient

I also tried

new_data.to_gbq('internal.intermediate_table_weight_calc', 'ids-internal', chunksize=10000, if_exists='append')

but it gives the error "Please verify that the structure and data types in the DataFrame match the schema of the destination table", even though all the column names are the same and every dtype is int.

Sourav Roy
  • 347
  • 3
  • 20

1 Answers1

4
# BUG FIX: `dt` was used without being imported anywhere in the snippet
# (NameError); alias the stdlib datetime module so the timing line runs.
import datetime as dt

t1 = dt.datetime.now()  # start timestamp for a rough load-time measurement
from google.cloud import bigquery
import os, json
def format_schema(schema):
    """Turn a list of ``{'name', 'type', 'mode'}`` dicts into SchemaField objects.

    Each entry describes one column of the destination table; the returned
    list can be assigned directly to ``LoadJobConfig.schema``.
    """
    return [
        bigquery.SchemaField(entry['name'], entry['type'], entry['mode'])
        for entry in schema
    ]

### Convert the frame to newline-delimited-style JSON rows.
# orient='records' + json.loads yields a list of row dicts — the shape
# BigQuery expects for a JSON load.  head(4) loads only a 4-row sample;
# drop `.head(4)` to load the whole frame.
json_data = new_data.head(4).to_json(orient='records')
json_object = json.loads(json_data)

project_id = 'mailkit-internal'
dataset_id = 'internal'
table_id = 'intermediate_table_weight_calc'

client = bigquery.Client(project=project_id)
table = client.dataset(dataset_id).table(table_id)

job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
# To pin column types explicitly instead of relying on autodetection:
# job_config.schema = format_schema(table_schema)
# BUG FIX: the original called load_table_from_dataframe(df, ...) where
# `df` is undefined (NameError), and a dataframe load serializes to
# Parquet, silently ignoring the NEWLINE_DELIMITED_JSON source format —
# while `json_object` was built and never used.  Load the prepared JSON
# rows instead, which matches the configured source format.
job = client.load_table_from_json(json_object, table, job_config=job_config)
print(job.result())  # blocks until the load job finishes

This worked — hope it helps. Make sure to install pyarrow and fastparquet (`pip install pyarrow fastparquet`), and check that the column names and types in the DataFrame match the schema of the BigQuery table.

Sourav Roy
  • 347
  • 3
  • 20