model.freqItemsets
The FPGrowth algorithm in Spark 2.4 does not show any results for the complete 16 GB dataset, but the same model and code work for a 1 GB sample dataset that is a subset of the 16 GB dataset.
Code snippet:
import org.apache.spark._
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext._
import org.apache.log4j.Level
import org.apache.spark._
import org.apache.spark.SparkContext._
import org.apache.log4j._
import scala.io.Source
import java.nio.charset.CodingErrorAction
import scala.io.Codec
import org.apache.spark.mllib.recommendation._
import org.apache.spark.ml.fpm.FPGrowth
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.mllib.fpm.FPGrowthModel
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.explode
object Full_Data_Association_4 {
/**
 * Mines frequent itemsets from a text file with FP-Growth and prints up to 300 of them.
 *
 * Each input line is split on a tab character; the second field is then split on
 * commas to form the items of one transaction.
 *
 * @param args `args(0)` is the input path passed to `read.textFile`; the optional
 *             `args(1)` overrides the minimum support (defaults to 0.1)
 * @throws IllegalArgumentException if no input path is supplied
 */
def main(args: Array[String]): Unit = {
  require(args.nonEmpty, "usage: Full_Data_Association_4 <input-path> [min-support]")
  Logger.getLogger("org").setLevel(Level.ERROR)

  // minSupport is a fraction of all transactions. An empty freqItemsets table means
  // no itemset reached that fraction, so the threshold is overridable for large,
  // more varied inputs where 0.1 may be too strict.
  val minSupport = if (args.length > 1) args(1).toDouble else 0.1

  val ss = SparkSession
    .builder
    .appName("Fpgrowth_1")
    .getOrCreate()
  import ss.implicits._

  try {
    val transactions = ss.read
      .textFile(args(0))
      .map(line => line.split("\t"))
      // Skip lines without a second tab-separated field instead of failing the job
      // with an ArrayIndexOutOfBoundsException.
      .filter(fields => fields.length > 1)
      // FP-Growth rejects transactions that contain the same item more than once.
      .map(fields => fields(1).split(",").distinct)
      .toDF("items")

    val fpgrowth = new FPGrowth()
      .setItemsCol("items")
      .setMinSupport(minSupport)
      .setMinConfidence(0.6)

    val model = fpgrowth.fit(transactions)
    model.freqItemsets.show(300)
  } finally {
    // Release the session's resources even when reading or fitting fails.
    ss.stop()
  }
}
And I get the following output:
+-----+----+
|items|freq|
+-----+----+
+-----+----+