
I'm using the Dataproc BigQuery connector to read a partitioned table. It contains over 300 GB of data and is partitioned by date, but all I need to read with the Spark connector is today's data. I already tried reading from a BigQuery view over the partition, but that doesn't work. Is there a way to read a single partition of a BigQuery table with Apache Spark?

Update (now with code snippet):

    import com.google.cloud.hadoop.io.bigquery.BigQueryConfiguration
    import com.google.cloud.hadoop.io.bigquery.BigQueryFileFormat
    import com.google.cloud.hadoop.io.bigquery.GsonBigQueryInputFormat
    import com.google.cloud.hadoop.io.bigquery.output.BigQueryOutputConfiguration
    import com.google.cloud.hadoop.io.bigquery.output.IndirectBigQueryOutputFormat
    import com.google.gson.JsonObject
    import org.apache.hadoop.io.LongWritable
    import org.apache.hadoop.io.DoubleWritable
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
    import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel}
    import org.apache.spark.mllib.util.MLUtils
    import org.apache.spark.mllib.feature.{HashingTF, IDF}
    import org.apache.spark.mllib.linalg.Vector
    import org.apache.spark.rdd.RDD

    @transient
    val conf = sc.hadoopConfiguration
    // Path to the view.
    val fullyQualifiedInputTableId = "XXXX"
    val projectId = conf.get("fs.gs.project.id")
    val bucket = conf.get("fs.gs.system.bucket")

    // Input configuration.
    conf.set(BigQueryConfiguration.PROJECT_ID_KEY, projectId)
    conf.set(BigQueryConfiguration.GCS_BUCKET_KEY, bucket)
    BigQueryConfiguration.configureBigQueryInput(conf, fullyQualifiedInputTableId)

    // Output configuration.
    val outputTableId = projectId + ":sparkBigQuery.classifiedQueries"
    val outputGcsPath = "gs://" + bucket + "/hadoop/tmp/bigquery/wordcountoutput"
    BigQueryOutputConfiguration.configure(conf, outputTableId, null, outputGcsPath,
      BigQueryFileFormat.NEWLINE_DELIMITED_JSON, classOf[TextOutputFormat[_, _]])
    conf.set("mapreduce.job.outputformat.class", classOf[IndirectBigQueryOutputFormat[_, _]].getName)
    conf.set(BigQueryConfiguration.OUTPUT_TABLE_WRITE_DISPOSITION_KEY, "WRITE_TRUNCATE")

    // nb (NaiveBayesModel) and tf (HashingTF) are defined earlier in the job.
    def convertToTuple(record: JsonObject): (String, String, Double) = {
      val user = record.get("user").getAsString
      val query = record.get("query").getAsString.toLowerCase
      val classifiedQuery = nb.predict(tf.transform(query.split(" ")))
      (user, query, classifiedQuery)
    }

    // Load data from BigQuery.
    val tableData = sc.newAPIHadoopRDD(
      conf,
      classOf[GsonBigQueryInputFormat],
      classOf[LongWritable],
      classOf[JsonObject])

    // convertToReadable is not shown in this snippet.
    tableData.map(entry => convertToReadable(entry._2)).first()

    val classifiedRDD = tableData.map(entry => convertToTuple(entry._2))

    classifiedRDD.take(10).foreach(l => println(l))
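
The output configuration above is never actually used in the snippet. For completeness, a hedged sketch of the write-back step, following the connector's usual (ignored key, JsonObject value) pattern, could look like this; the JSON field names simply mirror the tuple produced by convertToTuple:

    // Sketch only: turn each classified tuple back into a JsonObject so it can
    // be written through the IndirectBigQueryOutputFormat configured above.
    def convertToJson(t: (String, String, Double)): JsonObject = {
      val json = new JsonObject
      json.addProperty("user", t._1)
      json.addProperty("query", t._2)
      json.addProperty("classifiedQuery", t._3)
      json
    }

    // The key is ignored by the output format; only the JsonObject value is written.
    classifiedRDD
      .map(tuple => (null, convertToJson(tuple)))
      .saveAsNewAPIHadoopDataset(conf)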

1 Answer

Use the partition decorator ("$") documented here; the Hadoop connector does appear to support "$" in the table name string.
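
A minimal sketch of how that could be wired into the snippet from the question; the dataset, table, and date below are placeholders, not names from the original post:

    // Sketch only: "myDataset.myTable" and the date are placeholders.
    // Appending the "$" partition decorator to the table ID tells the connector
    // to read just that date's partition instead of the whole 300 GB table.
    val partitionDate = "20230101" // YYYYMMDD, i.e. today's date
    val fullyQualifiedInputTableId = projectId + ":myDataset.myTable$" + partitionDate

    BigQueryConfiguration.configureBigQueryInput(conf, fullyQualifiedInputTableId)

    val tableData = sc.newAPIHadoopRDD(
      conf,
      classOf[GsonBigQueryInputFormat],
      classOf[LongWritable],
      classOf[JsonObject])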
