
I am trying to run Spark SQL on Hive tables, but I cannot understand the problem. Here is my code:

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.hive.*;
import org.apache.spark.sql.SQLContext;

public class queryhive {
    public static void main(String[] args) {
        //SparkSession sc = new SparkConf().setAppName("SparkSessionZipsExample").setMaster("local");
        SparkConf sparkConf = new SparkConf().setAppName("SparkSessionZipsExample").setMaster("local");
        JavaSparkContext scon = new JavaSparkContext(sparkConf);
        SQLContext sqlContext = new SQLContext(scon);
        String warehouseLocation = "file:${system:user.dir}/spark-warehouse";
        SparkSession sc = SparkSession
            .builder()
            .appName("SparkSessionZipsExample")
            .config("spark.sql.warehouse.dir", warehouseLocation)
            .enableHiveSupport()
            .getOrCreate();
        HiveContext hc = new org.apache.spark.sql.hive.HiveContext(sc);
        hc.sql("select count(*) from SparkHive.health");
        Row[] results = (Row[]) sqlContext.sql("FROM src SELECT key, value").collect();
    }
}

The exception I get is:

17/02/16 16:36:51 INFO SparkSqlParser: Parsing command: select count(*) from SparkHive.health
Exception in thread "main" java.util.ServiceConfigurationError: org.apache.spark.sql.sources.DataSourceRegister: Provider org.apache.spark.sql.hive.orc.DefaultSource could not be instantiated
    at java.util.ServiceLoader.fail(ServiceLoader.java:232)
    at java.util.ServiceLoader.access$100(ServiceLoader.java:185)
    at java.util.ServiceLoader$LazyIterator.nextService(ServiceLoader.java:384)
    at java.util.ServiceLoader$LazyIterator.next(ServiceLoader.java:404)
    at java.util.ServiceLoader$1.next(ServiceLoader.java:480)
    at scala.collection.convert.Wrappers$JIteratorWrapper.next(Wrappers.scala:43)
    at scala.collection.Iterator$class.foreach(Iterator.scala:893)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
    at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
    at scala.collection.AbstractIterable.foreach(Iterable.scala:54)
    at scala.collection.TraversableLike$class.filterImpl(TraversableLike.scala:247)
    at scala.collection.TraversableLike$class.filter(TraversableLike.scala:259)
    at scala.collection.AbstractTraversable.filter(Traversable.scala:104)
    at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:550)
    at org.apache.spark.sql.execution.datasources.DataSource.providingClass$lzycompute(DataSource.scala:86)
    at org.apache.spark.sql.execution.datasources.DataSource.providingClass(DataSource.scala:86)
    at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:325)
    at org.apache.spark.sql.execution.datasources.ResolveDataSource$$anonfun$apply$1.applyOrElse(rules.scala:58)
    at org.apache.spark.sql.execution.datasources.ResolveDataSource$$anonfun$apply$1.applyOrElse(rules.scala:41)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolveOperators$1.apply(LogicalPlan.scala:61)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolveOperators$1.apply(LogicalPlan.scala:61)
    at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperators(LogicalPlan.scala:60)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$1.apply(LogicalPlan.scala:58)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$1.apply(LogicalPlan.scala:58)
    at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$5.apply(TreeNode.scala:331)
    at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:188)
    at org.apache.spark.sql.catalyst.trees.TreeNode.transformChildren(TreeNode.scala:329)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperators(LogicalPlan.scala:58)
    at org.apache.spark.sql.execution.datasources.ResolveDataSource.apply(rules.scala:41)
    at org.apache.spark.sql.execution.datasources.ResolveDataSource.apply(rules.scala:40)
    at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:85)
    at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:82)
    at scala.collection.LinearSeqOptimized$class.foldLeft(LinearSeqOptimized.scala:124)
    at scala.collection.immutable.List.foldLeft(List.scala:84)
    at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:82)
    at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:74)
    at scala.collection.immutable.List.foreach(List.scala:381)
    at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:74)
    at org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:64)
    at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:62)
    at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:48)
    at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:63)
    at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:592)
    at org.apache.spark.sql.SQLContext.sql(SQLContext.scala:699)
    at SparkHiveSql.sparkhivesql.queryhive.main(queryhive.java:27)
Caused by: java.lang.VerifyError: Bad return type
Exception Details:
  Location:
    org/apache/spark/sql/hive/orc/DefaultSource.createRelation(Lorg/apache/spark/sql/SQLContext;[Ljava/lang/String;Lscala/Option;Lscala/Option;Lscala/collection/immutable/Map;)Lorg/apache/spark/sql/sources/HadoopFsRelation; @35: areturn
  Reason:
    Type 'org/apache/spark/sql/hive/orc/OrcRelation' (current frame, stack[0]) is not assignable to 'org/apache/spark/sql/sources/HadoopFsRelation' (from method signature)
  Current Frame:
    bci: @35
    flags: { }
    locals: { 'org/apache/spark/sql/hive/orc/DefaultSource', 'org/apache/spark/sql/SQLContext', '[Ljava/lang/String;', 'scala/Option', 'scala/Option', 'scala/collection/immutable/Map' }
    stack: { 'org/apache/spark/sql/hive/orc/OrcRelation' }
  Bytecode:
    0x0000000: b200 1c2b c100 1ebb 000e 592a b700 22b6
    0x0000010: 0026 bb00 2859 2c2d b200 2d19 0419 052b
    0x0000020: b700 30b0                              

    at java.lang.Class.getDeclaredConstructors0(Native Method)
    at java.lang.Class.privateGetDeclaredConstructors(Class.java:2671)
    at java.lang.Class.getConstructor0(Class.java:3075)
    at java.lang.Class.newInstance(Class.java:412)
    at java.util.ServiceLoader$LazyIterator.nextService(ServiceLoader.java:380)
    ... 43 more
17/02/16 16:36:55 INFO SparkContext: Invoking stop() from shutdown hook
17/02/16 16:36:55 INFO SparkUI: Stopped Spark web UI at http://10.0.0.3:4040

I do not know why this is happening. Before running this program, my Hive was working fine, but now it is not working at all.
What is the reason, and how do I make the above code run?
I am using the Eclipse IDE and my Spark version is 2.1.0.

Jaffer Wilson
  • Related: http://stackoverflow.com/questions/41516166/spark-2-0-datasourceregister-configuration-error-while-saving-dataframe-as-cvs – G_H Feb 16 '17 at 11:44
  • @G_H That is not the problem I have. Thank you for sharing. – Jaffer Wilson Feb 16 '17 at 12:12
  • If you check the stack trace on both questions, you'll find the root cause is the same. A ServiceLoader finds DefaultSource implementations on the classpath and invokes a constructor that returns a type which doesn't match the expected return type. An `OrcRelation` is returned where a `HadoopFsRelation` is expected, but OrcRelation doesn't implement HadoopFsRelation. It might be a version conflict, since I can't find HadoopFsRelation in 2.1.0, while it is there in older versions (e.g. 1.6.0). Do you have multiple Spark versions on your classpath, or mixed Spark/Hive implementations? – G_H Feb 16 '17 at 13:43
  • @G_H Maybe it is multiple Spark versions. Actually, I have used some libraries from open-source Spark in my program. I installed Hadoop using the Bitnami installer, and it came bundled with Hive and Spark. I am using that bundle for my purpose, but I want to run it from a Java program. – Jaffer Wilson Feb 16 '17 at 13:51
  • Is it possible that the Hadoop installation uses a version of Hive and Spark before 2.0.0 (like 1.6.3), while you've included 2.1.0 libraries in your code? Or the other way around: using pre-2.0.0 libraries while the install is 2.1.0. Your classpath has a jar somewhere with `META-INF/services/org.apache.spark.sql.sources.DataSourceRegister` in it, which lists `org.apache.spark.sql.hive.orc.DefaultSource` as an implementation. The DefaultSource it finds has a method called `createRelation` that returns an OrcRelation which isn't a HadoopFsRelation subclass, but the code calling it expects that. – G_H Feb 16 '17 at 14:07

1 Answer


There's most likely some version conflict in your classpath. To understand what's happening, I'll briefly explain the Java service provider mechanism.

In Java, the service provider mechanism allows an API to specify an (abstract) class or interface which implementations of the API must subclass. A ServiceLoader can then be used to find implementations of that provider class. An example of such a class is JAXBContext from the Java API: JAXB itself is the API you'd use in an application, but there are multiple JAXB implementations (the reference implementation and EclipseLink MOXy), and the abstract class (JAXBContext in this example) is the entry point to the API.

One way the service provider mechanism finds implementations is through files in a special folder on the classpath: META-INF/services. You'll normally find such folders inside jar files. The services folder can contain files named after the abstract provider class; there may be multiple such files across different jars, and each file can list any number of concrete class names that implement the abstract class.

So, for JAXB, you'd find files with filename javax.xml.bind.JAXBContext in jar files. That's the name of the abstract provider class. The files would then contain one or more lines listing implementations of JAXBContext, which can be instantiated to provide an entry to that provider.
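
To make that concrete, here is a minimal, self-contained sketch of the mechanism, with a hypothetical Codec provider type standing in for JAXBContext or Spark's DataSourceRegister:

import java.util.ServiceLoader;

// Hypothetical provider type for illustration; a real jar would ship a plain text file
// META-INF/services/Codec (the fully qualified name of this interface) listing
// implementation class names, one per line.
interface Codec {
    String name();
}

// A trivial implementation that such a services file could register.
class JsonCodec implements Codec {
    public String name() { return "json"; }
}

public class ServiceLoaderDemo {
    public static void main(String[] args) {
        // ServiceLoader reads every META-INF/services/<provider name> file on the
        // classpath and lazily instantiates the classes listed in them.
        ServiceLoader<Codec> loader = ServiceLoader.load(Codec.class);
        for (Codec codec : loader) {
            System.out.println("Found provider: " + codec.name());
        }
    }
}

Without a matching services file on the classpath the loop simply prints nothing; with one, each listed class is instantiated reflectively, and that reflective instantiation is exactly the step that blows up in your stack trace.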

Let's look at your stack trace. At some point, a class called DataSource wishes to find implementations. It happens here in the stack:

at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:550)
at org.apache.spark.sql.execution.datasources.DataSource.providingClass$lzycompute(DataSource.scala:86)
at org.apache.spark.sql.execution.datasources.DataSource.providingClass(DataSource.scala:86)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:325)

The java.util.ServiceLoader class is used to iterate through the implementations registered via the mechanism above. It walks through the list of implementations until it finds one matching the calling code's criteria. That happens here:

at java.util.ServiceLoader$LazyIterator.nextService(ServiceLoader.java:384)
at java.util.ServiceLoader$LazyIterator.next(ServiceLoader.java:404)
at java.util.ServiceLoader$1.next(ServiceLoader.java:480)
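
If you want to see what that iteration turns up on your own classpath, a small sketch like the following (assuming the Spark jars are on the compile and runtime classpath) lists every registered DataSourceRegister provider; on a broken classpath it should fail with the same ServiceConfigurationError while trying to instantiate the ORC provider:

import java.util.ServiceLoader;
import org.apache.spark.sql.sources.DataSourceRegister;

public class ListDataSources {
    public static void main(String[] args) {
        // Every provider found here was discovered through some jar's
        // META-INF/services/org.apache.spark.sql.sources.DataSourceRegister file.
        ServiceLoader<DataSourceRegister> loader =
                ServiceLoader.load(DataSourceRegister.class);
        for (DataSourceRegister provider : loader) {
            System.out.println(provider.shortName()
                    + " -> " + provider.getClass().getName());
        }
    }
}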

This is where things go wrong. The details of the issue are found in this part:

Location:
org/apache/spark/sql/hive/orc/DefaultSource.createRelation(Lorg/apache/spark/sql/SQLContext;[Ljava/lang/String;Lscala/Option;Lscala/Option;Lscala/collection/immutable/Map;)Lorg/apache/spark/sql/sources/HadoopFsRelation; @35: areturn
Reason:
Type 'org/apache/spark/sql/hive/orc/OrcRelation' (current frame, stack[0]) is not assignable to 'org/apache/spark/sql/sources/HadoopFsRelation' (from method signature)
Current Frame:
bci: @35
flags: { }
locals: { 'org/apache/spark/sql/hive/orc/DefaultSource', 'org/apache/spark/sql/SQLContext', '[Ljava/lang/String;', 'scala/Option', 'scala/Option', 'scala/collection/immutable/Map' }
stack: { 'org/apache/spark/sql/hive/orc/OrcRelation' }

It takes a bit of digging, but you can deduce from this that a class org.apache.spark.sql.hive.orc.DefaultSource is found that allegedly implements what the call is looking for. Its createRelation method is declared, at least according to the abstract class, to return an org.apache.spark.sql.sources.HadoopFsRelation. However, what it actually returns is an org.apache.spark.sql.hive.orc.OrcRelation. That's fine if OrcRelation is a subclass of HadoopFsRelation, but apparently it isn't, so the JVM rejects the class when it tries to load and instantiate it.

The most likely way for this to happen is that different versions of the same classes ended up on the classpath, and the ones actually loaded have different method return types or simply an incompatible class hierarchy.
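
To make the verifier's complaint concrete, here is a schematic with hypothetical stand-in classes (not the real Spark ones); the actual VerifyError only appears when the method is compiled against the old hierarchy but loaded alongside the new one:

// Old API: the relation class is a subclass of the declared return type.
class HadoopFsRelationLike { }
class OrcRelationLike extends HadoopFsRelationLike { }

class DefaultSourceLike {
    // The compiled bytecode of createRelation declares HadoopFsRelationLike as the
    // return type and returns an OrcRelationLike. The JVM verifier accepts this only if
    // the OrcRelationLike class it actually loads is still a subclass of
    // HadoopFsRelationLike. If a newer jar supplies a version without that superclass,
    // loading this class fails with "java.lang.VerifyError: Bad return type" before
    // createRelation is ever called.
    HadoopFsRelationLike createRelation() {
        return new OrcRelationLike();
    }
}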

I went looking for class HadoopFsRelation. It can be found in Spark up until version 1.6.3, in package org.apache.spark.sql.sources. In version 2.1.0 it's no longer there, so I suspect it's been removed from the 2.x API onwards. In version 1.6.3, you'll also find class org.apache.spark.sql.hive.orc.OrcRelation, and in that version it does indeed implement HadoopFsRelation.

Now onto Spark version 2.1.0. Class HadoopFsRelation is nowhere to be found. I have found OrcRelation, in the same package as before (and in jar spark-hive_2.11-2.1.0.jar which is part of the Spark 2.1.0 distribution). Only now the class doesn't implement HadoopFsRelation.

So what's happening is this. You try to use Spark with Hive in your code. The SQLContext class you've created is probably loaded from Spark 1.6.x (or some other version older than 2.x). It goes looking for DataSourceRegister implementations (the class listed as the service provider) and finds a META-INF/services/org.apache.spark.sql.sources.DataSourceRegister file (or, more likely, several). It decides that what it needs is the implementation org.apache.spark.sql.hive.orc.DefaultSource. The class is found and instantiated. So far so good. The 1.6.x code then calls createRelation on it and expects a HadoopFsRelation back. However, the org.apache.spark.sql.hive.orc.OrcRelation it returns was loaded from version 2.1.0 (or some other 2.x version), which doesn't implement HadoopFsRelation.

If you've installed Spark 2.1.0 and used its libraries, and then added some libraries you downloaded separately, my bet is that you've downloaded a pre-2.x version. Those end up being the entry point when your code executes, but they accidentally pick up newer classes with a different definition. You need to check what's on your classpath and remove the incorrect entries. It might be best to remove everything Spark-, Hive- and Hadoop-related and make sure you only use jars from the installation, or to download the latest version from Apache, code against its API and use only those jars. If you need anything extra, make sure it's from the same version or is a utility compatible with the version you're using.
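
As a starting point for that cleanup, one quick diagnostic (the class names are taken straight from your stack trace) is to print which jar each suspect class is actually loaded from; if the locations point at different Spark versions, you've found the conflict:

public class WhichJar {
    public static void main(String[] args) {
        String[] suspects = {
            "org.apache.spark.sql.SQLContext",
            "org.apache.spark.sql.hive.orc.DefaultSource",
            "org.apache.spark.sql.sources.HadoopFsRelation"
        };
        for (String name : suspects) {
            try {
                Class<?> cls = Class.forName(name);
                java.security.CodeSource source =
                        cls.getProtectionDomain().getCodeSource();
                System.out.println(name + " -> "
                        + (source != null ? source.getLocation() : "<unknown location>"));
            } catch (ClassNotFoundException | LinkageError e) {
                // A VerifyError (a LinkageError) here is itself a strong hint that the
                // class was compiled against a different version of its dependencies.
                System.out.println(name + " -> " + e);
            }
        }
    }
}

On a classpath that only contains Spark 2.1.0, the HadoopFsRelation lookup should report the class as missing; if it resolves to a jar, that jar is the pre-2.x leftover.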

G_H