1

When I attempt to generate the connected components using GraphFrames, it takes substantially longer than I expected. I am running Spark 2.1 and GraphFrames 0.5 on AWS EMR with 3 r4.xlarge instances. Generating the connected components for a graph of about 12 million edges takes around 3 hours.

The code is below. I am fairly new to Spark, so any suggestions would be awesome.

/**
 * Entry point: reads the historical edge list from Parquet, builds a GraphFrame
 * from it, runs connected components and prints the first 100 result rows.
 *
 * @param args command-line arguments (unused)
 */
def main(args: Array[String]): Unit = {
  // NOTE(review): "yarn-cluster" is a deprecated master URL in Spark 2.x; the
  // documented form is master "yarn" with deploy mode "cluster", normally passed
  // via spark-submit. Left unchanged here to preserve how the job is launched.
  val sparkConf = new SparkConf()
    .setMaster("yarn-cluster")
    .setAppName("Connected Component")

  val sc = new SparkContext(sparkConf)

  try {
    // Install the S3 credentials before anything touches an s3a:// path.
    // setCheckpointDir accesses the target file system, so it must come after.
    AWSUtils.setS3Credentials(sc.hadoopConfiguration)
    sc.setCheckpointDir("s3a://......")

    implicit val sqlContext: SQLContext = SQLContext.getOrCreate(sc)
    import sqlContext.implicits._

    // Tuple layout: (src, dst, srcType, dstType, timestamp, companyId).
    // Note the column swap: input column 2 becomes the second element and
    // input column 1 the third.
    // Cached because this dataset feeds both the vertex and the edge DataFrame
    // (and the iterative connected-components job); without caching the S3 read
    // and the row conversion are re-executed for every use.
    val historical = sqlContext
      .read
      .option("mergeSchema", "false")
      .parquet("s3a://.....")
      .map(x => (x(0).toString, x(2).toString, x(1).toString, x(3).toString, x(4).toString.toLong, x(5).toString.toLong))
      .cache()

    // One vertex row per edge endpoint: (id, type, timestamp).
    // NOTE(review): the same id can appear many times here; GraphFrames expects
    // unique vertex ids, so consider de-duplicating — confirm which
    // type/timestamp should win before doing so.
    val vertices = historical
      .flatMap(e => List((e._1, e._3, e._5), (e._2, e._4, e._5)))
      .toDF("id", "type", "timestamp")

    val edges = historical.toDF("src", "dst", "srcType", "dstType", "timestamp", "companyId")

    // Complete graph
    val g = GraphFrame(vertices, edges)

    val connectedComponents: DataFrame = g.connectedComponents.run()

    connectedComponents.show(100, false)
  } finally {
    // Always release the context, even if the job fails part-way through.
    sc.stop()
  }
}
gth685f
  • 585
  • 2
  • 6
  • 13
  • 2
    I ended up modifying the connected components implementation here: https://github.com/kwartile/connected-component for my use case. – gth685f Aug 07 '17 at 14:24

0 Answers0