I have a Spark application that runs iteratively over a dataset of 5 million elements and takes about 2 hours to complete, but I ultimately have to run it on the full dataset of over 50 million elements.
The code runs successfully, but most of the program executes on the driver and the executors play a minimal role, so the computation time of this iterative application is very long.
The application finds connected components by building a graph from an N-Triples dataset.
The executors receive almost no tasks: the first for loop runs on the driver until all 5 million elements have been processed, and it accounts for about 90% of the runtime, so that is the part I mainly need to optimize.
How can I move this work from the driver to the executors so that the code scales and the computation time drops significantly? (A sketch of the direction I have been considering follows the code.)
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext._
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import scala.collection.mutable.ListBuffer
import scala.collection.mutable.HashMap
object Wisdom {
  val componentLists = HashMap[VertexId, ListBuffer[VertexId]]()
  val prefLabelMap = HashMap[VertexId, String]()

  def main(args: Array[String]) {
    val conf = new SparkConf()
    val sc = new SparkContext(conf)

    val tripleEndingPattern = """\s*\.\s*$""".r
    val languageTagPattern = "@[\\w-]+".r

    // Placeholder first edge; it is dropped with slice() before parallelize.
    var edgeArray = Array(Edge(0L, 0L, "http://dummy/URI"))
    var literalPropsTriplesArray = new Array[(Long, Long, String)](0)
    var vertexArray = new Array[(Long, String)](0)

    val source = sc.textFile("hdfs://ec2-54-172-85-190.compute-1.amazonaws.com:54310/akshat/datas.nt")
    // collect() pulls the entire dataset onto the driver; everything below
    // runs as ordinary local Scala code, not as Spark tasks.
    val lines = source.collect()

    val vertexURIMap = new HashMap[String, Long]()
    var triple = new Array[String](3)
    var nextVertexNum = 0L
    // This loop is the bottleneck: it processes all 5 million triples
    // sequentially on the driver.
    for (i <- 0 until lines.length) {
      lines(i) = tripleEndingPattern.replaceFirstIn(lines(i), " ")
      triple = lines(i).split(">\\s+")
      val tripleSubject = triple(0).substring(1)
      val triplePredicate = triple(1).substring(1)
      if (!vertexURIMap.contains(tripleSubject)) {
        vertexURIMap(tripleSubject) = nextVertexNum
        nextVertexNum += 1
      }
      if (!vertexURIMap.contains(triplePredicate)) {
        vertexURIMap(triplePredicate) = nextVertexNum
        nextVertexNum += 1
      }
      val subjectVertexNumber = vertexURIMap(tripleSubject)
      val predicateVertexNumber = vertexURIMap(triplePredicate)
      if (triple(2)(0) == '<') {
        // Object is a URI: record an edge from subject to object.
        val tripleObject = triple(2).substring(1)
        if (!vertexURIMap.contains(tripleObject)) {
          vertexURIMap(tripleObject) = nextVertexNum
          nextVertexNum += 1
        }
        val objectVertexNumber = vertexURIMap(tripleObject)
        edgeArray = edgeArray :+
          Edge(subjectVertexNumber, objectVertexNumber, triplePredicate)
      } else {
        // Object is a literal: keep it for the prefLabel pass below.
        literalPropsTriplesArray = literalPropsTriplesArray :+
          ((subjectVertexNumber, predicateVertexNumber, triple(2)))
      }
    }
    // Invert the URI-to-number map into the vertex list.
    for ((k, v) <- vertexURIMap) vertexArray = vertexArray :+ ((v, k))
    // Pick out rdfs:label literals as preferred labels, stripping language tags.
    for (i <- 0 until literalPropsTriplesArray.length) {
      if (literalPropsTriplesArray(i)._2 ==
          vertexURIMap("http://www.w3.org/2000/01/rdf-schema#label")) {
        val prefLabel =
          languageTagPattern.replaceFirstIn(literalPropsTriplesArray(i)._3, "")
        prefLabelMap(literalPropsTriplesArray(i)._1) = prefLabel
      }
    }
    val vertexRDD: RDD[(Long, String)] = sc.parallelize(vertexArray)
    // Drop the placeholder edge added at initialization.
    val edgeRDD: RDD[Edge[String]] =
      sc.parallelize(edgeArray.slice(1, edgeArray.length))
    val literalPropsTriplesRDD: RDD[(Long, Long, String)] =
      sc.parallelize(literalPropsTriplesArray)

    val graph: Graph[String, String] = Graph(vertexRDD, edgeRDD)
    // Keep only dcterms:subject edges, then find connected components.
    val skosRelatedSubgraph =
      graph.subgraph(t => t.attr == "http://purl.org/dc/terms/subject")
    val ccGraph = skosRelatedSubgraph.connectedComponents()
    ccGraph.vertices.saveAsTextFile("hdfs://ec2-54-172-85-190.compute-1.amazonaws.com/akshat/outp")
    sc.stop()
  }
}
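
For reference, below is the direction I have been experimenting with for the parsing stage: a rough sketch (not yet validated against my real data) that keeps everything in RDD transformations, assigns vertex IDs with zipWithUniqueId instead of my driver-side vertexURIMap counter, and rebuilds the edge list with joins. I have left out the prefLabelMap bookkeeping to keep it short, and I am not sure this is the idiomatic way to do the renumbering:

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext._
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD

object WisdomDistributed {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf())

    // Parse every triple in parallel on the executors instead of in a
    // driver-side loop, using the same "<s> <p> <o> ." splitting as above.
    val triples: RDD[(String, String, String)] =
      sc.textFile("hdfs://ec2-54-172-85-190.compute-1.amazonaws.com:54310/akshat/datas.nt")
        .map(_.replaceAll("""\s*\.\s*$""", " ").split(">\\s+"))
        .filter(_.length >= 3)
        .map(t => (t(0).substring(1), t(1).substring(1), t(2)))
        .cache()

    // Give every distinct URI a numeric vertex ID in parallel, replacing the
    // driver-side vertexURIMap / nextVertexNum counter. zipWithUniqueId
    // yields non-contiguous but unique Longs, which is all GraphX needs.
    val uriIds: RDD[(String, Long)] = triples
      .flatMap { case (s, p, o) =>
        if (o.startsWith("<")) Seq(s, p, o.substring(1)) else Seq(s, p)
      }
      .distinct()
      .zipWithUniqueId()
      .cache()

    // Rebuild the edge list with two joins instead of map lookups:
    // first resolve the subject's ID, then the object's.
    val edges: RDD[Edge[String]] = triples
      .filter { case (_, _, o) => o.startsWith("<") }
      .map { case (s, p, o) => (s, (p, o.substring(1))) }
      .join(uriIds)                                       // (s, ((p, o), sId))
      .map { case (_, ((p, o), sId)) => (o, (sId, p)) }
      .join(uriIds)                                       // (o, ((sId, p), oId))
      .map { case (_, ((sId, p), oId)) => Edge(sId, oId, p) }

    val vertices: RDD[(VertexId, String)] =
      uriIds.map { case (uri, id) => (id, uri) }

    // Same subgraph and connected-components steps as my current code.
    val ccGraph = Graph(vertices, edges)
      .subgraph(t => t.attr == "http://purl.org/dc/terms/subject")
      .connectedComponents()

    ccGraph.vertices.saveAsTextFile("hdfs://ec2-54-172-85-190.compute-1.amazonaws.com/akshat/outp")
    sc.stop()
  }
}

My main doubts about this sketch are whether the two joins will shuffle too much data at the 50-million-triple scale, and whether there is a better GraphX-native way to go from URI strings to VertexIds.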