To do this with Scio, you can create a custom output transform that writes to destinations chosen by an Apache Beam `DynamicDestinations` object. The destination table is determined dynamically from a characteristic of each input element — in this case, the hour of the element's event time.
Custom output transform for BigQuery:
import com.google.api.services.bigquery.model.TableRow
import com.google.api.services.bigquery.model.TableSchema
import com.spotify.scio.bigquery.BigQueryUtil
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition
import org.apache.beam.sdk.io.gcp.bigquery._
import org.apache.beam.sdk.transforms.PTransform
import org.apache.beam.sdk.values.{PCollection, PDone, ValueInSingleWindow}
/**
 * Builds a BigQuery output transform that routes each `TableRow` to an
 * hourly table named `<tblPrefix>_<hourSuffix>`, where the suffix is
 * derived from the element's event timestamp.
 *
 * @param tblPrefix        common prefix shared by all destination tables
 * @param tblSchema        BigQuery schema as a JSON string (parsed by
 *                         `BigQueryUtil.parseSchema`)
 * @param writeDisposition how to write to an existing table
 *                         (e.g. `WRITE_APPEND`)
 * @return a transform suitable for `SCollection#saveAsCustomOutput`
 */
def saveAsBigQuery(tblPrefix: String,
                   tblSchema: String,
                   writeDisposition: WriteDisposition):
    PTransform[PCollection[TableRow], PDone] = {
  BigQueryIO.writeTableRows()
    .to(new DynamicDestinations[TableRow, String] {
      // Full table name is "<prefix>_<hourSuffix>"; second arg (table
      // description) is intentionally left null.
      override def getTable(tblSuffix: String): TableDestination = {
        val tblName = "%s_%s".format(tblPrefix, tblSuffix)
        new TableDestination(tblName, null)
      }

      // Derive the hourly suffix from the element's event timestamp.
      // Instant#toString is ISO-8601 in UTC ("2018-01-02T13:04:05.000Z"),
      // so the first 13 characters minus the separators yield "yyyyMMddHH".
      // NOTE(review): swap in a joda-time formatter here if you need a
      // different time zone or suffix format.
      override def getDestination(tblRow: ValueInSingleWindow[TableRow]): String = {
        tblRow.getTimestamp.toString
          .substring(0, 13)
          .replace("-", "")
          .replace("T", "")
      }

      // All hourly tables share the same schema.
      override def getSchema(destination: String): TableSchema =
        BigQueryUtil.parseSchema(tblSchema)
    })
    .withWriteDisposition(writeDisposition)
    // writeTableRows() is a PTransform[..., WriteResult], but Scio's
    // saveAsCustomOutput expects PDone — hence this (unsafe but
    // conventional) cast.
    .asInstanceOf[PTransform[PCollection[TableRow], PDone]]
}
Apply the custom output transform using the function above:
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write
// Destination-table prefix and schema for the hourly tables.
val tblPrefix = "table_prefix"
// TODO: this needs to be in valid BigQuery schema format
val tblSchema = "table_schema"
val writeDisposition = Write.WriteDisposition.WRITE_APPEND

// Build the transform once, then hand it to Scio as a custom output.
val bqTransform = saveAsBigQuery(tblPrefix, tblSchema, writeDisposition)

// assuming tblRows is an SCollection[TableRow]
tblRows.saveAsCustomOutput("saveAsBigQuery", bqTransform)