The code below groups a List of Strings into a List[(String, List[String])]. A line consisting of five capital letters is treated as an identifier, and all lines following the identifier are grouped into a list. Each group is terminated by the first empty line encountered. So the "lines" below get converted to:
(IDENT,List(p1text, p2text))
(IDENY,List(p2text, p3text, p4text))
Is there a more idiomatic way of achieving this in Scala/Spark?
Possibly using a groupBy call with a predicate?
Ideally, the resulting data structure would be of type RDD[(String, List[String])] instead of List[(String, List[String])].
// Sample input: free-form lines, blank separators (a single space), and two
// identifier lines (" IDENT", " IDENY"), each followed by its data lines.
val lines: List[String] = List(
  "line1", " ",
  "line2", " ",
  " IDENT", "p1text", "p2text", " ",
  " IDENY", "p2text", "p3text", "p4text", " ",
  "some text"
)
/** Collects the data lines that belong to the identifier at index `i` of `lines`.
  *
  * Starting with the line after `i`, lines are gathered (trimmed) until the first
  * blank line or the end of `lines`, whichever comes first.
  *
  * @param i index into `lines` of the identifier line
  * @return the trimmed, non-blank lines that follow index `i`; `Nil` when the line
  *         at `i` is itself blank or when `i` is outside the bounds of `lines`
  */
def getItems(i: Int): List[String] =
  if (i < 0) Nil // drop() would silently treat a negative index as 0
  else
    lines.drop(i) match {
      // Only a non-blank line at `i` can open a group.
      case ident :: rest if ident.trim.nonEmpty =>
        // takeWhile stops at the end of the list as well as at a blank line, so a
        // group with no terminating blank line no longer indexes past the end.
        rest.takeWhile(_.trim.nonEmpty).map(_.trim)
      case _ => Nil
    }
// An identifier is a line that, once trimmed, consists of exactly five capital
// letters. The candidate is deliberately NOT upper-cased before matching, so
// ordinary five-character lines such as "line1" are no longer mistaken for
// identifiers.
val regex = "[A-Z]{5}"

// One entry per input line: (identifier, its data lines) for identifier lines,
// and the placeholder ("", Nil) for every other line.
val u: List[(String, List[String])] = lines.zipWithIndex.map { case (s, i) =>
  val candidate = s.trim
  if (candidate.matches(regex)) (candidate, getItems(i))
  else ("", List.empty[String])
}
// Keep only the entries whose item list is non-empty and whose first item is
// not blank; everything else in `u` is discarded.
// For the sample input this leaves:
//   (IDENT,List(p1text, p2text))
//   (IDENY,List(p2text, p3text, p4text))
val fi: List[(String, List[String])] =
  u.filter { case (_, items) => items.headOption.exists(_.trim.nonEmpty) }

// Print each remaining (identifier, items) pair on its own line.
for (group <- fi) println(group)