
I want to run a simple wordcount example using Apache Spark. Using the local jar files in $SPARK_HOME/jars it runs correctly, but using Maven dependencies it errors out:

java.lang.NoSuchMethodError: org.apache.hadoop.fs.FileSystem$Statistics.getThreadStatistics()Lorg/apache/hadoop/fs/FileSystem$Statistics$StatisticsData;
at org.apache.spark.deploy.SparkHadoopUtil$$anonfun$1$$anonfun$apply$mcJ$sp$1.apply(SparkHadoopUtil.scala:149)
at org.apache.spark.deploy.SparkHadoopUtil$$anonfun$1$$anonfun$apply$mcJ$sp$1.apply(SparkHadoopUtil.scala:149)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
at scala.collection.AbstractIterable.foreach(Iterable.scala:54)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.AbstractTraversable.map(Traversable.scala:104)
at org.apache.spark.deploy.SparkHadoopUtil$$anonfun$1.apply$mcJ$sp(SparkHadoopUtil.scala:149)
at org.apache.spark.deploy.SparkHadoopUtil.getFSBytesReadOnThreadCallback(SparkHadoopUtil.scala:150)
at org.apache.spark.rdd.HadoopRDD$$anon$1.<init>(HadoopRDD.scala:224)
at org.apache.spark.rdd.HadoopRDD.compute(HadoopRDD.scala:203)
at org.apache.spark.rdd.HadoopRDD.compute(HadoopRDD.scala:94)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:108)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:748)

Here is the code:

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.util.Arrays;

public class SparkTest {
    public static void main(String[] args){
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("SparkTest");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> rdd = sc.textFile("file:///usr/local/spark/LICENSE");
        JavaPairRDD<String, Integer> counts = rdd
                .flatMap(s -> Arrays.asList(s.split(" ")).iterator())
                .mapToPair(word -> new Tuple2<>(word, 1))
                .reduceByKey((a, b) -> a + b);

        counts.coalesce(1).saveAsTextFile("file:///home/XXX/Desktop/Processing/spark");

    }
}

Here is the pom.xml file:

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>Processing</groupId>
    <artifactId>Streaming</artifactId>
    <version>1.0-SNAPSHOT</version>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
    <dependencies>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>1.3.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_2.11</artifactId>
            <version>1.3.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_2.11</artifactId>
            <version>1.3.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>0.10.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka-0.10_2.11</artifactId>
            <version>1.3.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.2.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.3</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-filesystem_2.11</artifactId>
            <version>1.3.2</version>
        </dependency>
    </dependencies>
</project>

It also includes some other Apache software like Hadoop and Flink.

Spark version installed: 2.2.0 (download link: https://www.apache.org/dyn/closer.lua/spark/spark-2.2.0/spark-2.2.0-bin-hadoop2.7.tgz)

Hadoop version installed: 2.7.3

Something is mismatched here!

– Soheil Pourbafrani

3 Answers


Using your dependencies and checking where Java loads the class from with org.apache.hadoop.fs.FileSystem.class.getResource("FileSystem.class"), it appears the class is loaded from org.apache.flink:flink-shaded-hadoop2:jar:1.3.2. Showing the dependency tree with mvn dependency:tree, we see it is a transitive dependency of flink-java and flink-streaming-java_2.11:

[INFO] +- org.apache.flink:flink-java:jar:1.3.2:compile
[INFO] |  +- ...
[INFO] |  +- org.apache.flink:flink-shaded-hadoop2:jar:1.3.2:compile
[INFO] +- org.apache.flink:flink-streaming-java_2.11:jar:1.3.2:compile
[INFO] |  +- org.apache.flink:flink-runtime_2.11:jar:1.3.2:compile
[INFO] |  |  +- org.apache.flink:flink-shaded-hadoop2:jar:1.3.2:compile

This jar contains the entire org.apache.hadoop.fs package, overriding the proper definition and causing your issue. You can try to remove the flink-java dependency or exclude flink-shaded-hadoop2, but that may cause issues with your code, as other required Flink classes may be missing. For example:

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>1.3.2</version>
            <exclusions>
                <exclusion>
                    <groupId>org.apache.flink</groupId>
                    <artifactId>flink-shaded-hadoop2</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_2.11</artifactId>
            <version>1.3.2</version>
            <exclusions>
                <exclusion>
                    <groupId>org.apache.flink</groupId>
                    <artifactId>flink-shaded-hadoop2</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
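
After adding these exclusions, re-running mvn dependency:tree should show that flink-shaded-hadoop2 no longer appears on the compile classpath.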

Otherwise you'll have to find another solution depending on your project requirements: play around with class loading to ensure your classes are loaded properly, update your dependency versions so that the Hadoop classes pulled in by Flink match the ones Spark expects, etc.
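
For reference, the class-loading check mentioned at the top of this answer can be run as a tiny standalone program (the class name below is just an illustration); whichever jar appears in the printed URL is the one winning on your classpath:

import org.apache.hadoop.fs.FileSystem;

public class WhereIsFileSystem {
    public static void main(String[] args) {
        // Prints the URL of the resource that actually provides FileSystem.class,
        // e.g. jar:file:/.../flink-shaded-hadoop2-1.3.2.jar!/org/apache/hadoop/fs/FileSystem.class
        System.out.println(FileSystem.class.getResource("FileSystem.class"));
    }
}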

– Pierre B.

Finally, creating another dedicated Maven project for Spark, with just the spark-core Maven dependency, made it work.
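
For reference, the dedicated project's dependency section presumably reduces to something like this (a sketch; the artifact and version come from the pom in the question):

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.2.0</version>
        </dependency>
    </dependencies>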

Can anyone say why?

– Soheil Pourbafrani
  • Using your dependencies and showing how Java loads your class with `org.apache.hadoop.fs.FileSystem.class.getResource("FileSystem.class")`, it appears `org.apache.flink:flink-java:jar:1.3.2` defines a transitive dependency `org.apache.flink:flink-shaded-hadoop2:jar:1.3.2` which itself contains the entire `org.apache.hadoop.fs` package... Your app loads the class from this bogus jar instead of the proper one, hence your error. Using Spark Core as the principal dependency seems like a proper workaround, but if you can exclude `flink-shaded-hadoop2` entirely without issues, that would be better – Pierre B. Nov 29 '17 at 09:02
  • @PierreB. Great! Post an answer and I will mark it as the correct answer – Soheil Pourbafrani Nov 29 '17 at 09:09
  • Glad that helped, I posted an answer – Pierre B. Nov 29 '17 at 09:40

As of Flink 1.4 (release pending), Flink can run without any Hadoop dependencies, and if you need Hadoop, having it on the classpath is sufficient. This should make your life easier.

– David Anderson