I'm using the Dataproc BigQuery connector to read a partitioned table. The table holds over 300 GB of data and is partitioned by date, but all I need to read with the Spark connector is today's data. I already tried reading through a BigQuery view that filters down to the partition, but that doesn't work. Is there a way to read a single partition of a BigQuery table with Apache Spark?
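Ideally I'd point the connector straight at today's partition using BigQuery's $YYYYMMDD partition decorator, something like the sketch below, but I don't know whether the connector accepts decorators (project, dataset, and table names here are placeholders):

import java.time.LocalDate
import java.time.format.DateTimeFormatter

// Hypothetical: address only today's partition via the $YYYYMMDD decorator.
val todaySuffix = LocalDate.now.format(DateTimeFormatter.BASIC_ISO_DATE) // e.g. 20180312
val inputTableId = "myproject:mydataset.mytable$" + todaySuffix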
Update (now with code snippet):
import com.google.cloud.hadoop.io.bigquery.BigQueryConfiguration
import com.google.cloud.hadoop.io.bigquery.BigQueryFileFormat
import com.google.cloud.hadoop.io.bigquery.GsonBigQueryInputFormat
import com.google.cloud.hadoop.io.bigquery.output.BigQueryOutputConfiguration
import com.google.cloud.hadoop.io.bigquery.output.IndirectBigQueryOutputFormat
import com.google.gson.JsonObject
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.DoubleWritable
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD
@transient
val conf = sc.hadoopConfiguration
// Fully qualified table ID of the view (redacted)
val fullyQualifiedInputTableId = "XXXX"
val projectId = conf.get("fs.gs.project.id")
val bucket = conf.get("fs.gs.system.bucket")
conf.set(BigQueryConfiguration.PROJECT_ID_KEY, projectId)
conf.set(BigQueryConfiguration.GCS_BUCKET_KEY, bucket)
BigQueryConfiguration.configureBigQueryInput(conf, fullyQualifiedInputTableId)
val outputTableId = projectId + ":sparkBigQuery.classifiedQueries"
val outputGcsPath = "gs://" + bucket + "/hadoop/tmp/bigquery/wordcountoutput"
BigQueryOutputConfiguration.configure(conf, outputTableId, null, outputGcsPath,
  BigQueryFileFormat.NEWLINE_DELIMITED_JSON, classOf[TextOutputFormat[_, _]])
conf.set("mapreduce.job.outputformat.class", classOf[IndirectBigQueryOutputFormat[_, _]].getName)
conf.set(BigQueryConfiguration.OUTPUT_TABLE_WRITE_DISPOSITION_KEY, "WRITE_TRUNCATE")
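// NOTE: convertToTuple below uses `tf` and `nb`, which I define earlier in my
// session and left out of the snippet. Roughly (the model path is a placeholder):
val tf = new HashingTF()
val nb: NaiveBayesModel = NaiveBayesModel.load(sc, "gs://" + bucket + "/models/naiveBayes")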
def convertToTuple(record: JsonObject): (String, String, Double) = {
  val user = record.get("user").getAsString
  val query = record.get("query").getAsString.toLowerCase
  // Classify the query with the pre-trained model and featurizer defined above
  val classifiedQuery = nb.predict(tf.transform(query.split(" ")))
  (user, query, classifiedQuery)
}
// Load data from BigQuery.
val tableData = sc.newAPIHadoopRDD(
  conf,
  classOf[GsonBigQueryInputFormat],
  classOf[LongWritable],
  classOf[JsonObject])
// Sanity check: classify the first record
tableData.map(entry => convertToTuple(entry._2)).first()
val classifiedRDD = tableData.map(entry => convertToTuple(entry._2))
classifiedRDD.take(10).foreach(l => println(l))
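The output configuration above is there because I eventually write the classified queries back to BigQuery; I left that step out of the snippet. It follows the usual connector pattern of mapping to (null, JsonObject) pairs and calling saveAsNewAPIHadoopDataset (sketch only; the JSON field names mirror my schema):

// Sketch of the write-back step omitted above; field names are from my table.
classifiedRDD
  .map { case (user, query, label) =>
    val json = new JsonObject()
    json.addProperty("user", user)
    json.addProperty("query", query)
    json.addProperty("classifiedQuery", label)
    (null, json)
  }
  .saveAsNewAPIHadoopDataset(conf)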