import org.apache.spark.h2o._
import org.apache.spark.examples.h2o.Airlines
val h2oContext = H2OContext.getOrCreate(spark)
import h2oContext._           // brings asRDD into scope
import h2oContext.implicits._ // DataFrame -> H2OFrame implicit conversion
val airlinesDf = spark.read.csv("input file")
val airlinesData: H2OFrame = airlinesDf
val airlinesTable: RDD[Airlines] = asRDD[Airlines](airlinesData)
val flightsToORD = airlinesTable.filter(f => f.Dest == Some("ORD"))
flightsToORD.count()
When we execute any action on flightsToORD (count, collect, etc.), we get the error below. In our environment we can only create a Spark DataFrame from the input file, which is why we first build the Spark DataFrame and then convert it to an H2OFrame.
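For reference, Airlines is the example product type shipped with the Sparkling Water examples (org.apache.spark.examples.h2o.Airlines). Below is only a simplified, illustrative sketch of its shape, not the real class: as far as we can tell the actual class has roughly 31 fields, all Option-typed, and the numeric ones are Option[Int], which is presumably why the H2O-to-RDD converter reads columns with intAt/longAt.

// Illustrative sketch only -- the real Airlines class lives in the Sparkling Water
// examples jar and has ~31 Option-typed fields. Numeric fields such as Year are
// Option[Int], so the converter extracts them as integers from the H2O chunks.
class Airlines(val Year:  Option[Int],
               val Month: Option[Int],
               val Dest:  Option[String]) extends Product with Serializable {
  override def canEqual(that: Any): Boolean = that.isInstanceOf[Airlines]
  override def productArity: Int = 3
  override def productElement(n: Int): Any = n match {
    case 0 => Year
    case 1 => Month
    case 2 => Dest
  }
}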
I am using sparkling-water-2.0.25 with Spark 2.0.2
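Since the exception complains about an operation on a string vector, one check we can run on our side is to list which H2O columns of airlinesData are string vectors. A minimal sketch, assuming the H2OFrame exposes the usual water.fvec.Frame API (names, vec, isString):

// Print each H2O column name together with whether its Vec is a string vector.
// names(), vec(i) and isString() come from water.fvec.Frame / water.fvec.Vec.
airlinesData.names().zipWithIndex.foreach { case (name, i) =>
  println(f"$name%-20s isString=${airlinesData.vec(i).isString}")
}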
airlinesDf: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [ActualElapsedTime: string, AirTime: string ... 29 more fields]
airlinesTable: org.apache.spark.h2o.RDD[org.apache.spark.examples.h2o.Airlines] = H2ORDD[1420] at RDD at H2ORDD.scala:52
flightsToORD: org.apache.spark.rdd.RDD[org.apache.spark.examples.h2o.Airlines] = MapPartitionsRDD[1421] at filter at <console>:412
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 253.0 failed 4 times, most recent failure: Lost task 0.3 in stage 253.0 (TID 2351, dn4.h2oitc.ad.phemi.com): java.lang.IllegalArgumentException: Operation not allowed on string vector.
at water.fvec.CStrChunk.at8_impl(CStrChunk.java:89)
at water.fvec.Chunk.at8(Chunk.java:266)
at org.apache.spark.h2o.backends.internal.InternalReadConverterCtx.longAt(InternalReadConverterCtx.scala:60)
at org.apache.spark.h2o.backends.internal.InternalReadConverterCtx.intAt(InternalReadConverterCtx.scala:59)
at org.apache.spark.h2o.backends.internal.InternalReadConverterCtx.intAt(InternalReadConverterCtx.scala:29)
at org.apache.spark.h2o.converters.ReadConverterCtx$$anonfun$ExtractorsTable$4.apply(ReadConverterCtx.scala:108)
at org.apache.spark.h2o.converters.ReadConverterCtx$$anonfun$ExtractorsTable$4.apply(ReadConverterCtx.scala:108)
at org.apache.spark.h2o.backends.internal.InternalReadConverterCtx$$anonfun$returnOption$2.apply(InternalReadConverterCtx.scala:47)
at org.apache.spark.h2o.backends.internal.InternalReadConverterCtx$$anonfun$returnOption$2.apply(InternalReadConverterCtx.scala:46)
at scala.Option$WithFilter.flatMap(Option.scala:208)
at org.apache.spark.h2o.backends.internal.InternalReadConverterCtx.returnOption(InternalReadConverterCtx.scala:46)
at org.apache.spark.h2o.converters.ReadConverterCtx$$anonfun$org$apache$spark$h2o$converters$ReadConverterCtx$$OptionReadersMap$1$$anonfun$apply$1.apply(ReadConverterCtx.scala:120)
at org.apache.spark.h2o.converters.ReadConverterCtx$$anonfun$org$apache$spark$h2o$converters$ReadConverterCtx$$OptionReadersMap$1$$anonfun$apply$1.apply(ReadConverterCtx.scala:120)
at org.apache.spark.h2o.converters.H2ORDD$H2ORDDIterator$$anonfun$5$$anonfun$apply$1.apply(H2ORDD.scala:129)
at org.apache.spark.h2o.converters.H2ORDD$H2ORDDIterator$$anonfun$6.apply(H2ORDD.scala:133)
at org.apache.spark.h2o.converters.H2ORDD$H2ORDDIterator$$anonfun$6.apply(H2ORDD.scala:133)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
at org.apache.spark.h2o.converters.H2ORDD$H2ORDDIterator.extractRow(H2ORDD.scala:133)
at org.apache.spark.h2o.converters.H2ORDD$H2ORDDIterator.readOneRow(H2ORDD.scala:189)
at org.apache.spark.h2o.converters.H2ORDD$H2ORDDIterator.hasNext(H2ORDD.scala:157)
at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:461)
at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1763)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1134)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1134)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1899)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1899)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)