I am using Java 8 to connect Spark 1.6.0 with HBase 1.2.2 (ZooKeeper 3.4.6). The equivalent Scala code works fine on the Spark client. The Spark, HBase, and ZooKeeper clusters all run in the cloud. The code is below:

package com.carelinker.spark;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.mesos.protobuf.ServiceException;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.springframework.boot.autoconfigure.SpringBootApplication;

@SpringBootApplication
public class SparktestApplication {
    public static void main(String[] args) throws ServiceException {
        //HBase config
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        conf.set("hbase.zookeeper.quorum", "192.168.100.5:2181,192.168.100.3:2181,192.168.100.6:2181");
        conf.set("zookeeper.znode.parent", "/hbase/hbs-frz2bnnm");
        conf.set(TableInputFormat.INPUT_TABLE, "test");
        //Spark config
        SparkContext sc = new SparkContext("spark://skn-1w3zsyz0-spark-master:7077", "HBaseRead");
        sc.setLocalProperty("spark.executor.extraClassPath","/usr/local/hbase/lib/*");
        sc.setLocalProperty("spark.driver.extraClassPath","/usr/local/hbase/lib/*");
        JavaSparkContext javaSparkContext = new JavaSparkContext(sc);
        JavaPairRDD<ImmutableBytesWritable, Result> myRDD = javaSparkContext.newAPIHadoopRDD(conf, TableInputFormat.class, ImmutableBytesWritable.class, Result.class);
        System.out.println(myRDD.count());
    }
}
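
For completeness, here is a sketch of passing the same settings through SparkConf before the context is created, since I am not sure the setLocalProperty calls above take effect for the executor/driver classpath keys (same master URL and classpath as above):

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

SparkConf sparkConf = new SparkConf()
        .setMaster("spark://skn-1w3zsyz0-spark-master:7077")
        .setAppName("HBaseRead")
        // Classpath settings need to be known before the executors launch,
        // which is why they go into SparkConf rather than setLocalProperty.
        .set("spark.executor.extraClassPath", "/usr/local/hbase/lib/*")
        .set("spark.driver.extraClassPath", "/usr/local/hbase/lib/*");
JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);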

The log then keeps printing "15:04:46.974 [dispatcher-event-loop-1] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0", and just before this repetition starts there is an exception, shown below:

15:03:46.380 [dag-scheduler-event-loop] DEBUG org.apache.spark.rdd.NewHadoopRDD - Failed to use InputSplit#getLocationInfo.
java.lang.NullPointerException: null
    at scala.collection.mutable.ArrayOps$ofRef$.length$extension(ArrayOps.scala:114)
    at scala.collection.mutable.ArrayOps$ofRef.length(ArrayOps.scala:114)
    at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:32)
    at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:108)
    at org.apache.spark.rdd.HadoopRDD$.convertSplitLocationInfo(HadoopRDD.scala:412)
    at org.apache.spark.rdd.NewHadoopRDD.getPreferredLocations(NewHadoopRDD.scala:233)
    at org.apache.spark.rdd.RDD$$anonfun$preferredLocations$2.apply(RDD.scala:257)
    at org.apache.spark.rdd.RDD$$anonfun$preferredLocations$2.apply(RDD.scala:257)
    at scala.Option.getOrElse(Option.scala:120)
    at org.apache.spark.rdd.RDD.preferredLocations(RDD.scala:256)
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$getPreferredLocsInternal(DAGScheduler.scala:1545)
    at org.apache.spark.scheduler.DAGScheduler.getPreferredLocs(DAGScheduler.scala:1519)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$15.apply(DAGScheduler.scala:974)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$15.apply(DAGScheduler.scala:972)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
    at scala.collection.Iterator$class.foreach(Iterator.scala:727)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
    at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
    at scala.collection.AbstractIterable.foreach(Iterable.scala:54)
    at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
    at scala.collection.AbstractTraversable.map(Traversable.scala:105)
    at org.apache.spark.scheduler.DAGScheduler.submitMissingTasks(DAGScheduler.scala:972)
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$submitStage(DAGScheduler.scala:921)
    at org.apache.spark.scheduler.DAGScheduler.handleJobSubmitted(DAGScheduler.scala:861)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1607)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
15:03:46.417 [dag-scheduler-event-loop] INFO org.apache.spark.storage.MemoryStore - Block broadcast_1 stored as values in memory (estimated size 1688.0 B, free 209.3 KB)
15:03:46.420 [dag-scheduler-event-loop] DEBUG org.apache.spark.storage.BlockManager - Put block broadcast_1 locally took  2 ms
15:03:46.420 [dag-scheduler-event-loop] DEBUG org.apache.spark.storage.BlockManager - Putting block broadcast_1 without replication took  3 ms
15:03:46.424 [dag-scheduler-event-loop] INFO org.apache.spark.storage.MemoryStore - Block broadcast_1_piece0 stored as bytes in memory (estimated size 1076.0 B, free 210.3 KB)
15:03:46.424 [dispatcher-event-loop-2] INFO org.apache.spark.storage.BlockManagerInfo - Added broadcast_1_piece0 in memory on 192.168.56.1:54197 (size: 1076.0 B, free: 1800.4 MB)
15:03:46.424 [dag-scheduler-event-loop] DEBUG org.apache.spark.storage.BlockManagerMaster - Updated info of block broadcast_1_piece0
15:03:46.424 [dag-scheduler-event-loop] DEBUG org.apache.spark.storage.BlockManager - Told master about block broadcast_1_piece0
15:03:46.425 [dag-scheduler-event-loop] DEBUG org.apache.spark.storage.BlockManager - Put block broadcast_1_piece0 locally took  3 ms
15:03:46.425 [dag-scheduler-event-loop] DEBUG org.apache.spark.storage.BlockManager - Putting block broadcast_1_piece0 without replication took  4 ms
15:03:46.425 [dag-scheduler-event-loop] INFO org.apache.spark.SparkContext - Created broadcast 1 from broadcast at DAGScheduler.scala:1006
15:03:46.428 [dag-scheduler-event-loop] INFO org.apache.spark.scheduler.DAGScheduler - Submitting 1 missing tasks from ResultStage 0 (NewHadoopRDD[0] at newAPIHadoopRDD at SparktestApplication.java:28)
15:03:46.429 [dag-scheduler-event-loop] DEBUG org.apache.spark.scheduler.DAGScheduler - New pending partitions: Set(0)
15:03:46.430 [dag-scheduler-event-loop] INFO org.apache.spark.scheduler.TaskSchedulerImpl - Adding task set 0.0 with 1 tasks
15:03:46.434 [dag-scheduler-event-loop] DEBUG org.apache.spark.scheduler.TaskSetManager - Epoch for TaskSet 0.0: 0
15:03:46.445 [dag-scheduler-event-loop] DEBUG org.apache.spark.scheduler.TaskSetManager - Valid locality levels for TaskSet 0.0: ANY
15:03:46.462 [dispatcher-event-loop-3] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:46.974 [dispatcher-event-loop-0] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:47.973 [dispatcher-event-loop-1] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:48.974 [dispatcher-event-loop-2] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:49.974 [dispatcher-event-loop-3] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:50.973 [dispatcher-event-loop-0] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:51.973 [dispatcher-event-loop-1] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:52.975 [dispatcher-event-loop-2] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:53.974 [dispatcher-event-loop-3] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:54.975 [dispatcher-event-loop-0] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:55.974 [dispatcher-event-loop-1] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:56.974 [dispatcher-event-loop-2] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:57.973 [dispatcher-event-loop-3] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:58.974 [dispatcher-event-loop-0] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0

I have also found the similar question "Spark gives Null pointer exception during InputSplit for Hbase", whose answer suggests configuring hbase.master, but it did not help me.
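
Concretely, the suggestion there amounts to adding something like the following to the HBase Configuration above (the host and port here are only illustrative; 16000 is the default master port in HBase 1.x), which did not change the behaviour for me:

// Suggested by the linked answer; host/port illustrative.
conf.set("hbase.master", "192.168.100.5:16000");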

I have two questions:

  1. What does this exception mean?
  2. Why does the parentName log line keep repeating without ever producing a result?

Thanks for your kind help.

Below are the details of my Maven pom.xml:

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.carelinker.spark</groupId>
    <artifactId>sparktest</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>sparktest</name>
    <description>Demo project for Spring Boot</description>

    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>1.5.2.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <java.version>1.8</java.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.2.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>1.2.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.10</artifactId>
            <version>1.6.0</version>
            <exclusions>
                <exclusion>
                    <groupId>org.eclipse.jetty.orbit</groupId>
                    <artifactId>javax.servlet</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.10</artifactId>
            <version>1.6.0</version>
        </dependency>
        <dependency>
            <groupId>javax.servlet</groupId>
            <artifactId>javax.servlet-api</artifactId>
            <version>3.1.0</version>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-tomcat</artifactId>
            <!--<scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>


</project>
  • I have changed the master URL below to "local" and it seems to work, but it is weird: the Spark master is in the cloud, so the URL should have the form spark://IP:port. Why does it only work when I change it to local? The original line was: SparkContext sc = new SparkContext("spark://skn-1w3zsyz0-spark-master:7077", "HBaseRead"); (see the sketch after these comments). – George Yuan Apr 12 '17 at 10:54
  • Having the same issue... have you found a resolution or identified the root cause? – kensai Dec 04 '18 at 11:46
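
A minimal sketch of the change described in the first comment (only the master URL switches from the cluster address to local mode; everything else stays as in the code above):

// Local mode instead of the cluster master; with this, count() returns,
// but the job no longer runs on the cloud cluster.
SparkContext sc = new SparkContext("local", "HBaseRead");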

0 Answers