I assume that you are using spark + scala +Hbase
import org.apache.spark._
import org.apache.spark.rdd.NewHadoopRDD
import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor}
import org.apache.hadoop.hbase.client.HBaseAdmin
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HColumnDescriptor
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.HTable;
object SparkWithMyTable {
def main(args: Array[String]) {
//Initiate spark context with spark master URL. You can modify the URL per your environment.
val sc = new SparkContext("spark://ip:port", "MyTableTest")
val tableName = "myTable"
val conf = HBaseConfiguration.create()
conf.set("hbase.zookeeper.quorum", "list of cluster ip's")
conf.set("hbase.zookeeper"+ ".property.clientPort","2181");
conf.set("hbase.master", "masterIP:60000");
conf.set("hadoop.security.authentication", "kerberos");
conf.set("hbase.security.authentication", "kerberos");
UserGroupInformation.setConfiguration(conf);
UserGroupInformation.loginUserFromKeytab("user@---", keyTabPath);
// Add local HBase conf
// conf.addResource(new Path("file://hbase/hbase-0.94.17/conf/hbase-site.xml"))
conf.set(TableInputFormat.INPUT_TABLE, tableName)
// create my table with column family
val admin = new HBaseAdmin(conf)
if(!admin.isTableAvailable(tableName)) {
print("Creating MyTable")
val tableDesc = new HTableDescriptor(tableName)
tableDesc.addFamily(new HColumnDescriptor("cf1".getBytes()));
admin.createTable(tableDesc)
}else{
print("Table already exists!!")
val columnDesc = new HColumnDescriptor("cf1");
admin.disableTable(Bytes.toBytes(tableName));
admin.addColumn(tableName, columnDesc);
admin.enableTable(Bytes.toBytes(tableName));
}
//first put data into table
val myTable = new HTable(conf, tableName);
for (i <- 0 to 5) {
var p = new Put();
p = new Put(new String("row" + i).getBytes());
p.add("cf1".getBytes(), "column-1".getBytes(), new String(
"value " + i).getBytes());
myTable.put(p);
}
myTable.flushCommits();
//how to create rdd
val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
classOf[org.apache.hadoop.hbase.client.Result])
//get the row count
val count = hBaseRDD.count()
print("HBase RDD count:"+count)
System.exit(0)
}
}
Maven Artifact
<dependency>
<groupId>it.nerdammer.bigdata</groupId>
<artifactId>spark-hbase-connector_2.10</artifactId>
<version>1.0.3</version> // Version can be changed as per your Spark version, I am using Spark 1.6.x
</dependency>
Can also have a look at