I want to use Apache Spark to preprocess text data from an Elasticsearch index. Using Elasticsearch-Hadoop, I read the index into Spark and get an RDD of type RDD[(String, scala.collection.Map[String,AnyRef])].
The first element looks like this: document: (String, scala.collection.Map[String,AnyRef]) = (file:document_id, Map(created -> Mon Jan 20 11:50:35 CET 2014, modified -> Fri Oct 23 12:46:40 CEST 2015, indexed -> Fri Mar 25 18:05:37 CET 2016, mimetype -> application/pdf, content -> Plaintext of the document))
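Just to illustrate how I access the fields of such a tuple (this is only a lookup in the value map and mirrors what the code below does):

val (docId, fields) = document
val text: String = fields("content").toString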
The critical parts now are processing the content field above with an NLP toolkit and storing the results back in Elasticsearch. The first part works fine: I found a similar question on Stack Overflow that uses StanfordCoreNLP (unfortunately Spark does not provide lemmatization itself, and I cannot retrieve the tokens from Elasticsearch directly). As a result I get the tokens for each document as an RDD[Seq[String]], but I don't know how to get this back into Elasticsearch.
Obviously I need an output RDD that connects each document with its tokens, something like Map("document_id_1" -> "Tokens for id_1", "document_id_2" -> "Tokens for id_2"). Maybe someone can give me a hint on how to get there, or has a better idea for solving the problem. Any help is very much appreciated.
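To make the intent clearer, here is a rough, self-contained sketch of the output shape I have in mind. The ids, tokens and values are made up for illustration, the field names just mirror the ones I use below rather than my actual mapping, and an existing SparkContext sc is assumed:

import org.apache.spark.rdd.RDD
import org.elasticsearch.spark.rdd.EsSpark

// Made-up (id, tokens) pairs standing in for the lemmatized documents.
val tokensById: RDD[(String, Seq[String])] = sc.parallelize(Seq(
  "document_id_1" -> Seq("tokens", "for", "id_1"),
  "document_id_2" -> Seq("tokens", "for", "id_2")
))

// One Map per document: the document id plus the tokens joined into a single field.
val outputRDD: RDD[Map[String, String]] = tokensById.map { case (id, tokens) =>
  Map("id" -> id, "content" -> tokens.mkString(" "))
}

// With es.write.operation = upsert and es.mapping.id = "id", this should
// update the existing documents rather than create new ones.
EsSpark.saveToEs(outputRDD, "elastic/documents")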
import org.apache.spark._
import org.apache.spark.rdd.RDD
import scala.collection.mutable.ArrayBuffer
import org.elasticsearch.spark._
import org.elasticsearch.spark.rdd._
import edu.stanford.nlp.pipeline._
import edu.stanford.nlp.ling.CoreAnnotations._
import scala.collection.JavaConversions._
import java.util.Properties
object Stemming {
  def main(args: Array[String]) {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("SparkLemma")
      .set("es.nodes", "hadoop-m:9200")
      .set("es.write.operation", "upsert")
      .set("es.mapping.id", "id")
    val esIndex = "elastic/documents"
    val sc = new SparkContext(conf)
    // Read (id, fields) pairs from ES and keep only the id and the plain-text content
    val esRDD = sc.esRDD(esIndex)
    val plainText: RDD[(String, String)] = esRDD.map { case (id, fields) =>
      (id, fields("content").toString)
    }
    // Broadcast the stop-word list; its value is read inside the closures below
    val stopWords = sc.broadcast(scala.io.Source.fromFile("stopwords.txt").getLines().toSet)
    // Build a CoreNLP pipeline that tokenizes, splits sentences, tags POS and lemmatizes
    def createNLPPipeline(): StanfordCoreNLP = {
      val props = new Properties()
      props.put("annotators", "tokenize, ssplit, pos, lemma")
      new StanfordCoreNLP(props)
    }
    // Lemmatize one document; the document id is kept as the head of the result
    // so it can be written back together with the tokens later on
    def plainTextToLemmas(id: String, content: String, stopWords: Set[String], nlp: StanfordCoreNLP): Seq[String] = {
      val doc = new Annotation(content)
      nlp.annotate(doc)
      // Start with the id so the sequence is never empty and the head is always the id
      val lemmas = ArrayBuffer[String](id)
      val sentences = doc.get(classOf[SentencesAnnotation])
      for (sentence <- sentences;
           token <- sentence.get(classOf[TokensAnnotation])) {
        val lemma = token.get(classOf[LemmaAnnotation])
        if (lemma.length > 3 && !stopWords.contains(lemma)) {
          lemmas += lemma.toLowerCase
        }
      }
      lemmas
    }
    // Create the pipeline once per partition and lemmatize every (id, content) pair
    val lemmatized: RDD[Seq[String]] = plainText.mapPartitions { docs =>
      val nlp = createNLPPipeline()
      docs.map { case (id, content) => plainTextToLemmas(id, content, stopWords.value, nlp) }
    }
    // Turn each (id :: tokens) sequence into a Map that elasticsearch-hadoop can index
    def writeTokensToES(row: Seq[String]): Map[String, String] = {
      val tokens = row.drop(1).mkString(" ")
      // getDate() is a small helper of mine (not shown here) that returns the current date as a string
      Map("id" -> row.head, "content" -> tokens, "last-run" -> getDate())
    }
    val outputRDD = lemmatized.map(row => writeTokensToES(row))
    EsSpark.saveToEs(outputRDD, esIndex)
    sc.stop()
  }
}