Here is a simple Java example based on Jacek's answer above. It reads the JSON array below from a Kafka topic, explodes it into one row per element, and pivots each micro-batch on the product column.
JSON Array:
[
  {
    "customer_id": "d6315a00",
    "product": "Super widget",
    "price": 10,
    "bought_date": "2019-01-01"
  },
  {
    "customer_id": "d6315a00",
    "product": "Super widget",
    "price": 10,
    "bought_date": "2019-01-01"
  },
  {
    "customer_id": "d6315a00",
    "product": "Super widget",
    "price": 10,
    "bought_date": "2019-01-02"
  },
  {
    "customer_id": "d6315a00",
    "product": "Food widget",
    "price": 4,
    "bought_date": "2019-08-20"
  },
  {
    "customer_id": "d6315cd0",
    "product": "Food widget",
    "price": 4,
    "bought_date": "2019-09-19"
  },
  {
    "customer_id": "d6315e2e",
    "product": "Bike widget",
    "price": 10,
    "bought_date": "2019-01-01"
  },
  {
    "customer_id": "d6315a00",
    "product": "Bike widget",
    "price": 10,
    "bought_date": "2019-03-10"
  },
  {
    "customer_id": "d631614e",
    "product": "Garage widget",
    "price": 4,
    "bought_date": "2019-02-15"
  }
]
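To try this end to end, the array has to land on the Kafka topic the streaming job subscribes to. Here is a minimal producer sketch; the class name and the orders.json file are just placeholders, while the broker address and topic name match the options used in the Spark code below:

import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Properties;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

public class PublishOrders {

    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        props.put("bootstrap.servers", "localhost:9092");
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");

        // The whole JSON array is sent as a single record; the streaming job
        // below reads it back from the "value" column as one string.
        String json = new String(Files.readAllBytes(Paths.get("orders.json")));

        try (KafkaProducer<String, String> producer = new KafkaProducer<>(props)) {
            producer.send(new ProducerRecord<>("utilization", json)).get();
        }
    }
}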
Java Code:
package io.centilliard;

import static org.apache.spark.sql.functions.explode;
import static org.apache.spark.sql.functions.from_json;
import org.apache.spark.sql.AnalysisException;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.streaming.DataStreamWriter;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;
import org.apache.spark.sql.types.ArrayType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.Function2;
import scala.runtime.BoxedUnit;
public class Pivot {

    public static void main(String[] args) throws StreamingQueryException, AnalysisException {

        // Schema of a single element of the JSON array
        StructType schema = new StructType(new StructField[] {
                new StructField("customer_id", DataTypes.StringType, false, Metadata.empty()),
                new StructField("product", DataTypes.StringType, false, Metadata.empty()),
                new StructField("price", DataTypes.IntegerType, false, Metadata.empty()),
                new StructField("bought_date", DataTypes.StringType, false, Metadata.empty())
        });

        // The Kafka value holds a whole array of such elements
        ArrayType arrayType = new ArrayType(schema, false);

        SparkSession spark = SparkSession
                .builder()
                .appName("SimpleExample")
                .getOrCreate();

        // Create a Dataset representing the stream of input lines from Kafka
        Dataset<Row> dataset = spark
                .readStream()
                .format("kafka")
                .option("kafka.bootstrap.servers", "localhost:9092")
                .option("subscribe", "utilization")
                .load()
                .selectExpr("CAST(value AS STRING) as json");

        // Parse the JSON array, explode it into one row per element
        // and flatten the resulting struct into top-level columns
        Column json = new Column("json");
        Column data = from_json(json, arrayType).as("data");
        Column exploded = explode(data);
        Dataset<Row> customers = dataset.select(exploded).select("col.*");

        // Pivot each micro-batch on the product column and print it
        DataStreamWriter<Row> dataStreamWriter = customers.writeStream();
        StreamingQuery dataStream = dataStreamWriter
                .foreachBatch(new Function2<Dataset<Row>, Object, BoxedUnit>() {
                    @Override
                    public BoxedUnit apply(Dataset<Row> batch, Object batchId) {
                        batch
                            .groupBy("customer_id", "product", "bought_date")
                            .pivot("product")
                            .sum("price")
                            .orderBy("customer_id")
                            .show();
                        return BoxedUnit.UNIT;
                    }
                })
                .start();

        dataStream.awaitTermination();
    }
}
Output:
+-----------+-------------+-----------+-----------+-----------+-------------+------------+
|customer_id| product|bought_date|Bike widget|Food widget|Garage widget|Super widget|
+-----------+-------------+-----------+-----------+-----------+-------------+------------+
| d6315a00| Bike widget| 2019-03-10| 20| null| null| null|
| d6315a00| Super widget| 2019-01-02| null| null| null| 20|
| d6315a00| Super widget| 2019-01-01| null| null| null| 40|
| d6315a00| Food widget| 2019-08-20| null| 8| null| null|
| d6315cd0| Food widget| 2019-09-19| null| 8| null| null|
| d6315e2e| Bike widget| 2019-01-01| 20| null| null| null|
| d631614e|Garage widget| 2019-02-15| null| null| 8| null|
+-----------+-------------+-----------+-----------+-----------+-------------+------------+
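As a side note, on Spark 2.4+ the same foreachBatch can also be written with the Java-friendly VoidFunction2 overload instead of the anonymous scala.Function2. A rough sketch of just that part (if the compiler complains that the call is ambiguous between the two overloads, the explicit cast resolves it):

import org.apache.spark.api.java.function.VoidFunction2;

// ...

StreamingQuery dataStream = customers
        .writeStream()
        .foreachBatch((VoidFunction2<Dataset<Row>, Long>) (batch, batchId) -> {
            batch.groupBy("customer_id", "product", "bought_date")
                 .pivot("product")
                 .sum("price")
                 .orderBy("customer_id")
                 .show();
        })
        .start();

The result is the same as the show() output above.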