0

When I run the code in an IDE, there are no compile errors. However, when I try to use spark-submit, I get java.lang.ClassNotFoundException every time. I've altered the source folder several times to no avail.

enter image description here

I also put the code here, since formatting code in that box is a disaster.
https://pastebin.com/Sqmmazv5

package com.sundogsoftware.spark

     import java.nio.charset.CodingErrorAction

       import org.apache.log4j.{Level, Logger}
      import org.apache.spark.SparkContext

     import scala.io.{Codec, Source}
         import scala.math._

 /*** Created by Subramanian on 8/28/2017.*/

 object MovieSimilarities {

 /** Load up a Map of movie IDs to movie names.
   *
   * Reads "../ml-100k/u.item" as UTF-8 (malformed or unmappable input is replaced),
   * splits every line on '|' and maps the first field, parsed as an Int, to the
   * second field. Lines with fewer than two fields are skipped. The file is closed
   * before this method returns, even if reading or parsing fails.
   *
   * @return map from movie ID to movie name
   */
 def loadMovieNames() : Map[Int, String] = {

   // Handle character encoding issues:
   implicit val codec: Codec = Codec("UTF-8")
   codec.onMalformedInput(CodingErrorAction.REPLACE)
   codec.onUnmappableCharacter(CodingErrorAction.REPLACE)

   val source = Source.fromFile("../ml-100k/u.item")
   try {
     // getLines() is lazy; toMap forces the whole read while the file is still open.
     source.getLines()
       .map(_.split('|'))
       .filter(_.length > 1)
       .map(fields => fields(0).toInt -> fields(1))
       .toMap
   } finally {
     // The original never closed the Source, leaking the file handle.
     source.close()
   }
 }

 // A single (movieID, rating) entry.
 type MovieRating = (Int, Double)
 // Joined record: an Int key paired with two movie ratings.
 type UserRatingPair = (Int, (MovieRating, MovieRating))

 /** Re-keys a joined record by its two movie IDs.
   *
   * @param userRatings (key, ((movie1, rating1), (movie2, rating2)))
   * @return ((movie1, movie2), (rating1, rating2)); the leading key is discarded
   */
 def makePairs(userRatings:UserRatingPair) = {
   userRatings match {
     case (_, ((firstMovie, firstRating), (secondMovie, secondRating))) =>
       ((firstMovie, secondMovie), (firstRating, secondRating))
   }
 }

 /** Decides whether a joined record should be kept.
   *
   * @param userRatings (key, ((movie1, rating1), (movie2, rating2)))
   * @return true only when movie1's ID is strictly less than movie2's, which drops
   *         self-pairs and keeps a single ordering of every mirrored pair
   */
 def filterDuplicates(userRatings:UserRatingPair):Boolean = {
   val (_, ((firstMovie, _), (secondMovie, _))) = userRatings
   firstMovie < secondMovie
 }

type RatingPair = (Double, Double)
type RatingPairs = Iterable[RatingPair]

/** Computes the cosine similarity of the two rating vectors formed by the pairs.
  *
  * @param ratingPairs (ratingX, ratingY) pairs
  * @return (score, numPairs), where score is sum(x*y) / (sqrt(sum(x*x)) * sqrt(sum(y*y)))
  *         or 0.0 when that denominator is zero, and numPairs is the number of pairs seen
  */
def computeCosineSimilarity(ratingPairs:RatingPairs): (Double, Int) = {
  // Single pass accumulating the pair count and the three running sums.
  val (pairCount, sumXX, sumYY, sumXY) =
    ratingPairs.foldLeft((0, 0.0, 0.0, 0.0)) {
      case ((count, xx, yy, xy), (x, y)) =>
        (count + 1, xx + x * x, yy + y * y, xy + x * y)
    }

  val norm = sqrt(sumXX) * sqrt(sumYY)
  val similarity = if (norm != 0) sumXY / norm else 0.0

  (similarity, pairCount)
}

/** Our main function where the action happens.
  *
  * Builds a cosine-similarity score for every pair of movies rated by the same user
  * in ../ml-100k/u.data. When a movie ID is supplied as the first argument, prints up
  * to 10 movies most similar to it that pass the score and co-occurrence thresholds.
  *
  * @param args optional; args(0), if present, is parsed as the movie ID of interest
  */
def main(args: Array[String]): Unit = {

  // Set the log level to only print errors
  Logger.getLogger("org").setLevel(Level.ERROR)

  // Create a SparkContext on the "local" master, i.e. a single worker thread.
  // (Use "local[*]" instead to run on every core of the local machine.)
  val sc = new SparkContext("local", "MovieSimilarities")

  // Always stop the context, even on failure, so the driver releases its resources.
  try {
    println("\nLoading movie names... 2 + 2 ")
    val nameDict = loadMovieNames()

    // Names come from u.item while IDs come from u.data and the command line, so a
    // lookup can miss; fall back to a placeholder instead of throwing after the job ran.
    def movieName(id: Int): String = nameDict.getOrElse(id, s"Unknown movie (ID $id)")

    val data = sc.textFile("../ml-100k/u.data")

    // Map ratings to key / value pairs: user ID => movie ID, rating
    val ratings = data
      .map(_.split("\t"))
      .map(fields => (fields(0).toInt, (fields(1).toInt, fields(2).toDouble)))

    // Emit every movie rated together by the same user.
    // Self-join to find every combination.
    val joinedRatings = ratings.join(ratings)

    // At this point our RDD consists of userID => ((movieID, rating), (movieID, rating))

    // Filter out duplicate pairs
    val uniqueJoinedRatings = joinedRatings.filter(filterDuplicates)

    // Now key by (movie1, movie2) pairs.
    val moviePairs = uniqueJoinedRatings.map(makePairs)

    // We now have (movie1, movie2) => (rating1, rating2)
    // Now collect all ratings for each movie pair and compute similarity
    val moviePairRatings = moviePairs.groupByKey()

    // We now have (movie1, movie2) => (rating1, rating2), (rating1, rating2) ...
    // Can now compute similarities.
    val moviePairSimilarities = moviePairRatings.mapValues(computeCosineSimilarity).cache()

    //Save the results if desired
    //val sorted = moviePairSimilarities.sortByKey()
    //sorted.saveAsTextFile("movie-sims")

    // Extract similarities for the movie we care about that are "good".
    if (args.length > 0) {
      val scoreThreshold = 0.97
      val coOccurenceThreshold = 50.0

      val movieID: Int = args(0).toInt

      // Filter for movies with this sim that are "good" as defined by
      // our quality thresholds above
      val filteredResults = moviePairSimilarities.filter {
        case ((movie1, movie2), (score, strength)) =>
          (movie1 == movieID || movie2 == movieID) &&
            score > scoreThreshold &&
            strength > coOccurenceThreshold
      }

      // Sort by quality score (descending) and keep the best 10.
      val results = filteredResults
        .map { case (pair, sim) => (sim, pair) }
        .sortByKey(ascending = false)
        .take(10)

      println(s"\nTop 10 similar movies for ${movieName(movieID)}")
      for (((score, strength), (movie1, movie2)) <- results) {
        // Display the similarity result that isn't the movie we're looking at
        val similarMovieID = if (movie1 == movieID) movie2 else movie1
        println(s"${movieName(similarMovieID)}\tscore: $score\tstrength: $strength")
      }
    }
  } finally {
    sc.stop()
  }
}

}
Haspemulator
  • 11,050
  • 9
  • 49
  • 76

1 Answers1

0

If you are using Maven, add the build tag below to your pom.xml; it should take care of adding the compiled class files to your jar.

<build>
<defaultGoal>clean install</defaultGoal>
<sourceDirectory>src/main/scala</sourceDirectory>
<plugins>
    <!-- Runs the Scala compiler for main and test sources (compile / testCompile goals). -->
    <plugin>
        <!-- see http://davidb.github.com/scala-maven-plugin -->
        <groupId>net.alchim31.maven</groupId>
        <artifactId>scala-maven-plugin</artifactId>
        <version>3.2.0</version>
        <executions>
            <execution>
                <goals>
                    <goal>compile</goal>
                    <goal>testCompile</goal>
                </goals>
            </execution>
        </executions>
    </plugin>
    <!-- Copies the project's dependencies into ${project.build.directory}/lib before packaging. -->
    <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-dependency-plugin</artifactId>
        <executions>
            <execution>
                <id>copy-dependencies</id>
                <phase>prepare-package</phase>
                <goals>
                    <goal>copy-dependencies</goal>
                </goals>
                <configuration>
                    <outputDirectory>${project.build.directory}/lib</outputDirectory>
                    <overWriteReleases>false</overWriteReleases>
                    <overWriteSnapshots>false</overWriteSnapshots>
                    <overWriteIfNewer>true</overWriteIfNewer>
                </configuration>
            </execution>
        </executions>
    </plugin>
    <!-- Writes the jar manifest: class path entries prefixed with lib/ plus the main class. -->
    <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-jar-plugin</artifactId>
        <configuration>
            <archive>
                <manifest>
                    <addClasspath>true</addClasspath>
                    <classpathPrefix>lib/</classpathPrefix>
                    <!-- Fully-qualified name (package + object) of the class that defines main(). -->
                    <mainClass>com.sundogsoftware.spark.MovieSimilarities</mainClass>
                </manifest>
            </archive>
        </configuration>
    </plugin>
</plugins>
</build>

rajcool111
  • 657
  • 4
  • 14
  • 36