I am trying to run a Spark job with Cassandra as the backend. I have one master and 2 slave machines.
I have a table in Cassandra which I am querying with a CassandraSQLContext. There are only 9 rows matching the query I am executing, and I am running it from the spark-shell. When I call rdd.count() it fails with java.lang.OutOfMemoryError: Java heap space. The worker machines have 2 GB of RAM each, so for such a small row count this error should not occur. Please let me know what the issue is.
scala> val cc = new CassandraSQLContext(sc)
cc: org.apache.spark.sql.cassandra.CassandraSQLContext = org.apache.spark.sql.cassandra.CassandraSQLContext@12d16ab1
scala>
scala> val rdd = cc.sql("select * from sams.events where appname = 'test1'")
15/07/11 21:45:00 INFO Cluster: New Cassandra host /172.28.0.164:9042 added
15/07/11 21:45:00 INFO CassandraConnector: Connected to Cassandra cluster: Test Cluster
rdd: org.apache.spark.sql.DataFrame = [appname: string, addedtime: timestamp, assetname: string, brandname: string, client: map<string,string>, eventname: string, eventorigin: string, eventtime: timestamp, geolocation: map<string,string>, location: map<string,string>, meta: map<string,string>, packname: string, timezone: float, userid: string]
scala> 15/07/11 21:45:01 INFO CassandraConnector: Disconnected from Cassandra cluster: Test Cluster
scala> rdd.count()
15/07/11 21:45:27 INFO CassandraStrategies$CassandraTableScans: projectList: ArrayBuffer()
15/07/11 21:45:27 INFO CassandraStrategies$CassandraTableScans: predicates: List((appname#14 = test1))
15/07/11 21:45:27 INFO CassandraStrategies$CassandraTableScans: pushdown predicates: ArrayBuffer((appname#14 = test1))
15/07/11 21:45:27 INFO CassandraStrategies$CassandraTableScans: remaining predicates: ArrayBuffer()
15/07/11 21:45:28 INFO Exchange: Using SparkSqlSerializer2.
15/07/11 21:45:28 INFO CassandraTableScan: attributes :
15/07/11 21:45:29 INFO SparkContext: Starting job: count at <console>:27
15/07/11 21:45:29 INFO Cluster: New Cassandra host /172.28.0.164:9042 added
15/07/11 21:45:29 INFO CassandraConnector: Connected to Cassandra cluster: Test Cluster
15/07/11 21:45:30 INFO CassandraConnector: Disconnected from Cassandra cluster: Test Cluster
Exception in thread "dag-scheduler-event-loop" java.lang.OutOfMemoryError: Java heap space
at scala.collection.mutable.ResizableArray$class.$init$(ResizableArray.scala:32)
at scala.collection.mutable.ArrayBuffer.<init>(ArrayBuffer.scala:47)
at scala.collection.mutable.ArrayBuffer.<init>(ArrayBuffer.scala:62)
at scala.collection.Iterator$GroupedIterator.takeDestructively(Iterator.scala:911)
at scala.collection.Iterator$GroupedIterator.go(Iterator.scala:929)
at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:969)
at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:972)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
at scala.collection.Iterator$class.toStream(Iterator.scala:1143)
at scala.collection.AbstractIterator.toStream(Iterator.scala:1157)
at scala.collection.Iterator$$anonfun$toStream$1.apply(Iterator.scala:1143)
at scala.collection.Iterator$$anonfun$toStream$1.apply(Iterator.scala:1143)
at scala.collection.immutable.Stream$Cons.tail(Stream.scala:1085)
at scala.collection.immutable.Stream$Cons.tail(Stream.scala:1077)
at scala.collection.immutable.Stream$StreamWithFilter.scala$collection$immutable$Stream$StreamWithFilter$$tailMap$1(Stream.scala:492)
at scala.collection.immutable.Stream$StreamWithFilter$$anonfun$scala$collection$immutable$Stream$StreamWithFilter$$tailMap$1$1.apply(Stream.scala:494)
at scala.collection.immutable.Stream$StreamWithFilter$$anonfun$scala$collection$immutable$Stream$StreamWithFilter$$tailMap$1$1.apply(Stream.scala:494)
at scala.collection.immutable.Stream$Cons.tail(Stream.scala:1085)
at scala.collection.immutable.Stream$Cons.tail(Stream.scala:1077)
at scala.collection.immutable.Stream$$anonfun$map$1.apply(Stream.scala:376)
at scala.collection.immutable.Stream$$anonfun$map$1.apply(Stream.scala:376)
at scala.collection.immutable.Stream$Cons.tail(Stream.scala:1085)
at scala.collection.immutable.Stream$Cons.tail(Stream.scala:1077)
at scala.collection.immutable.StreamIterator$$anonfun$next$1.apply(Stream.scala:980)
at scala.collection.immutable.StreamIterator$$anonfun$next$1.apply(Stream.scala:980)
at scala.collection.immutable.StreamIterator$LazyCell.v$lzycompute(Stream.scala:969)
at scala.collection.immutable.StreamIterator$LazyCell.v(Stream.scala:969)
at scala.collection.immutable.StreamIterator.hasNext(Stream.scala:974)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at scala.collection.parallel.mutable.ResizableParArrayCombiner$$anon$1.$plus$plus$eq(ResizableParArrayCombiner.scala:90)