I can run this code manually in Vertex AI and it pushes the data to BigQuery (BQ), but when I schedule the same code, the output table is not written to BQ.
No errors are reported, but the schedule does not record the code as having run either (although I can see that it is scheduled).
import json
import requests
import pandas as pd
from datetime import date
from google.cloud import bigquery
# Destination table in BigQuery.
dataset_id = 'scraper_pool'
table_id = 'price_scraper'

# Define the URLs to scrape
urls = ['https://www.twowheelsempire.com/products.json?limit=250&page=1']

# Seconds to wait for the shop to answer. Without a timeout an unattended
# (scheduled) run can block forever on a stalled connection.
REQUEST_TIMEOUT = 30


def build_schema():
    """Return the BigQuery schema of the price table (all columns nullable)."""
    return [
        bigquery.SchemaField('title', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('created', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('price', 'FLOAT', mode='NULLABLE'),
        bigquery.SchemaField('sku', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('available', 'BOOLEAN', mode='NULLABLE'),
        bigquery.SchemaField('retrieval_date', 'DATE', mode='NULLABLE'),
    ]


def build_rows(products, retrieval_date=None):
    """Flatten product records into one row per variant.

    Args:
        products: Iterable of product dicts, each with ``title``,
            ``created_at`` and a ``variants`` list whose entries carry
            ``price``, ``sku`` and ``available``.
        retrieval_date: ``datetime.date`` stamped on every row. Defaults to
            today's date, evaluated once so all rows of a run agree.

    Returns:
        A list of JSON-serialisable dicts keyed like the table schema.

    Raises:
        KeyError: If a product or variant lacks one of the keys above.
        ValueError: If a non-null price cannot be parsed as a float.
    """
    if retrieval_date is None:
        retrieval_date = date.today()
    stamp = retrieval_date.isoformat()

    rows = []
    for item in products:
        title = str(item["title"])
        created = str(item["created_at"])
        for variant in item["variants"]:
            price = variant["price"]
            sku = variant["sku"]
            rows.append({
                'title': title,
                'created': created,
                # Keep nulls as nulls: float(None) would raise, and
                # str(None) would store the literal text "None".
                'price': None if price is None else float(price),
                'sku': None if sku is None else str(sku),
                'available': bool(variant["available"]),
                'retrieval_date': stamp,
            })
    return rows


def fetch_products(url):
    """Download *url* and return the list under its ``products`` key.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within REQUEST_TIMEOUT.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an error page instead of on a confusing JSON/KeyError.
    response.raise_for_status()
    return response.json()["products"]


def main():
    """Scrape every URL in ``urls`` and append its rows to the BigQuery table."""
    # NOTE(review): the client is created without an explicit project or
    # credentials, so it uses whatever the runtime environment supplies. If
    # scheduled runs behave differently from manual runs, check which project
    # and service account the scheduler uses - TODO confirm.
    client = bigquery.Client()
    table_ref = bigquery.DatasetReference(client.project, dataset_id).table(table_id)
    job_config = bigquery.LoadJobConfig(
        schema=build_schema(),
        write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
    )

    for url in urls:
        rows = build_rows(fetch_products(url))
        job = client.load_table_from_json(rows, table_ref, job_config=job_config)
        job.result()  # Wait for the load job; raises if the job failed.
        # Leave a trace in the run output so a scheduled execution is visible.
        print(f"Loaded {len(rows)} rows from {url} into {dataset_id}.{table_id}")


if __name__ == "__main__":
    main()
I expected the code to push a new table to BQ. The code runs when I execute it manually, and the schedule doesn't record any errors, but the scheduled run doesn't appear to have actually executed.