I am running a simple batch job in Flink.
The dashboard says the job finished, but it shows that only about 30,000 records were processed out of about 220,000.
Otherwise there are no errors, and the output looks as expected.
How can I check why the job finished prematurely?
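In case it is relevant, this is roughly how I was planning to double-check the counts myself with an accumulator (just a sketch; the LineCounter class and the "lines-read" name are my own and not part of the job yet):

import org.apache.flink.api.common.JobExecutionResult;
import org.apache.flink.api.common.accumulators.LongCounter;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.configuration.Configuration;

// Pass-through mapper that counts every record it sees
public static final class LineCounter extends RichMapFunction<String, String> {
    private final LongCounter counter = new LongCounter();

    @Override
    public void open(Configuration parameters) {
        // Register the accumulator so its value is reported back to the client
        getRuntimeContext().addAccumulator("lines-read", counter);
    }

    @Override
    public String map(String record) {
        counter.add(1L);
        return record;
    }
}

// In main(): wrap the source, then read the count from the execution result
// DataSet<String> counted = loglines.map(new LineCounter());
// JobExecutionResult result = env.execute("Zeek conn.log Processor");
// System.out.println("lines read: " + result.getAccumulatorResult("lines-read"));

If that counter came back as the full 220,000 I would at least know the whole file is being read, but I would still like to understand what the dashboard number actually measures.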
Here is the source code:
package com.otorio.zeeklogprocessor;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.core.fs.FileSystem.WriteMode;
import org.apache.flink.api.java.DataSet;
import com.google.gson.*;
import java.lang.reflect.Type;
import com.otorio.zeeklogprocessor.RegulatedZeekConnRecord;
/**
* Skeleton for a Flink Batch Job.
*
* <p>For a tutorial how to write a Flink batch application, check the
* tutorials and examples on the <a href="https://flink.apache.org/docs/stable/">Flink Website</a>.
*
* <p>To package your application into a JAR file for execution,
* change the main class in the POM.xml file to this class (simply search for 'mainClass')
* and run 'mvn clean package' on the command line.
*/
public class BatchJob {
public static void main(String[] args) throws Exception {
// set up the batch execution environment
final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
DataSet<String> loglines = env.readTextFile("/Users/ben.reich/software/Flink/zeek/conn.log");
DataSet<RegulatedZeekConnRecord> jasonized = loglines.map(new Jsonizer());
DataSet<String> aggregated = jasonized.groupBy("key").reduce(new ReductionLogic()).map(new OutputBuilder());
aggregated.writeAsText("/Users/ben.reich/software/Flink/zeek/graphdata.log", WriteMode.OVERWRITE);
// execute program
env.execute("Zeek conn.log Processor");
}
// Deserialize each log line into a RegulatedZeekConnRecord
public static final class Jsonizer implements MapFunction<String, RegulatedZeekConnRecord> {
private static GsonBuilder gb = new GsonBuilder();
private static Gson gson;
private static RegulatedZeekConnRecord logObject;
public RegulatedZeekConnRecord map(String record) {
// Initialize gson with customized deserializer
if (gson == null) {
gb.registerTypeAdapter(RegulatedZeekConnRecord.class, new ConnLogDeserializer());
gson = gb.create();
}
logObject = gson.fromJson(record, RegulatedZeekConnRecord.class);
return logObject;
}
}
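// Per-key aggregation: sum the packet/byte counters, keep the latest values for the other fields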
public static class ReductionLogic implements ReduceFunction<RegulatedZeekConnRecord> {
@Override
public RegulatedZeekConnRecord reduce(RegulatedZeekConnRecord pre, RegulatedZeekConnRecord current) {
pre.key = current.key;
pre.ts = current.ts;
pre.id_orig_h = current.id_orig_h;
pre.id_orig_p = current.id_orig_p;
pre.id_resp_h = current.id_resp_h;
pre.id_resp_p = current.id_resp_p;
pre.proto = current.proto;
pre.conn_state = current.conn_state;
pre.history = current.history;
pre.service = current.service;
pre.orig_pkts = current.orig_pkts + pre.orig_pkts;
pre.orig_ip_bytes = current.orig_ip_bytes + pre.orig_ip_bytes;
pre.resp_pkts = current.resp_pkts + pre.resp_pkts;
pre.resp_ip_bytes = current.resp_ip_bytes + pre.resp_ip_bytes;
pre.missed_bytes = current.missed_bytes + pre.missed_bytes;
return pre;
}
}
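// Serialize the reduced record back to a JSON string for output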
public static class OutputBuilder implements MapFunction<RegulatedZeekConnRecord, String> {
private static Gson gson = new Gson();
@Override
public String map(RegulatedZeekConnRecord record) {
String jsonTarget = "";
jsonTarget = gson.toJson(record);
return jsonTarget;
}
}
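// Custom Gson deserializer that builds the record from the raw JSON object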
public static class ConnLogDeserializer implements JsonDeserializer<RegulatedZeekConnRecord> {
@Override
public RegulatedZeekConnRecord deserialize(JsonElement json, Type typeOfT, JsonDeserializationContext context) throws JsonParseException {
JsonObject jsonobj = json.getAsJsonObject();
RegulatedZeekConnRecord rec = new RegulatedZeekConnRecord(jsonobj);
return rec;
}
}
}