I am applying the union operator to two DataStreams of GenericRecord type.

package com.gslab.com.dataSets;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.generic.GenericRecord;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class FlinkBroadcast {
    public static void main(String[] args) throws Exception {

        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(2);

        List<String> controlMessageList = new ArrayList<String>();
        controlMessageList.add("controlMessage1");
        controlMessageList.add("controlMessage2");

        List<String> dataMessageList = new ArrayList<String>();
        dataMessageList.add("Person1");
        dataMessageList.add("Person2");
        dataMessageList.add("Person3");
        dataMessageList.add("Person4");

        DataStream<String> controlMessageStream  = env.fromCollection(controlMessageList);
        DataStream<String> dataMessageStream  = env.fromCollection(dataMessageList);

        DataStream<GenericRecord> controlMessageGenericRecordStream = controlMessageStream.map(new MapFunction<String, GenericRecord>() {
            @Override
            public GenericRecord map(String value) throws Exception {
                 Record gr = new GenericData.Record(new Schema.Parser().parse(new File("src/main/resources/controlMessageSchema.avsc")));
                 gr.put("TYPE", value);
                 return gr;
            }
        });

        DataStream<GenericRecord> dataMessageGenericRecordStream = dataMessageStream.map(new MapFunction<String, GenericRecord>() {
            @Override
            public GenericRecord map(String value) throws Exception {
                 Record gr = new GenericData.Record(new Schema.Parser().parse(new File("src/main/resources/dataMessageSchema.avsc")));
                 gr.put("FIRSTNAME", value);
                 gr.put("LASTNAME", value+": lastname");
                 return gr;
            }
        });

        // Print the data records before the union
        dataMessageGenericRecordStream.map(new MapFunction<GenericRecord, GenericRecord>() {
            @Override
            public GenericRecord map(GenericRecord value) throws Exception {
                System.out.println("data before union: "+ value);
                return value;
            }
        });

        controlMessageGenericRecordStream.broadcast().union(dataMessageGenericRecordStream).map(new MapFunction<GenericRecord, GenericRecord>() {
            @Override
            public GenericRecord map(GenericRecord value) throws Exception {
                System.out.println("data after union: " + value);
                return value;
            }
        });
        env.execute("stream");
    }
}

Output:

05/09/2016 13:02:13 Map(2/2) switched to FINISHED 
data after union: {"TYPE": "controlMessage1"}
data before union: {"FIRSTNAME": "Person2", "LASTNAME": "Person2: lastname"}
data after union: {"TYPE": "controlMessage1"}
data before union: {"FIRSTNAME": "Person1", "LASTNAME": "Person1: lastname"}
data after union: {"TYPE": "controlMessage2"}
data after union: {"TYPE": "controlMessage2"}
data after union: {"FIRSTNAME": "Person1", "LASTNAME": "Person1"}
data before union: {"FIRSTNAME": "Person4", "LASTNAME": "Person4: lastname"}
data before union: {"FIRSTNAME": "Person3", "LASTNAME": "Person3: lastname"}
data after union: {"FIRSTNAME": "Person2", "LASTNAME": "Person2"}
data after union: {"FIRSTNAME": "Person3", "LASTNAME": "Person3"}
05/09/2016 13:02:13 Map -> Map(2/2) switched to FINISHED 
data after union: {"FIRSTNAME": "Person4", "LASTNAME": "Person4"}
05/09/2016 13:02:13 Map -> Map(1/2) switched to FINISHED 
05/09/2016 13:02:13 Map(1/2) switched to FINISHED 
05/09/2016 13:02:13 Map(2/2) switched to FINISHED 
05/09/2016 13:02:13 Job execution switched to status FINISHED.

As you can see, the records in dataMessageGenericRecordStream are not correct after the union: all field values are replaced by the first field's value.

  • I also posted to your other question. Could you please print the `TypeInformation` for each DataStream. You can get that using `DataStream.getType()`, i.e. `System.out.println(dataMessageGenericRecordStream.getType())`. – aljoscha May 09 '16 at 13:22
  • Printing dataMessageGenericRecordStream.getType() gives GenericType, and printing controlMessageGenericRecordStream.getType() also gives GenericType. – Ranjit Shinde May 09 '16 at 13:41
  • This is reproducible only for GenericRecord; when I change it to Map it works. Can you suggest any workaround? – Ranjit Shinde May 10 '16 at 07:28

2 Answers


I have spent a couple of days investigating this for a different issue (also involving GenericRecord) and have found the root cause and the solution.

Root Cause: Within the Apache Avro Schema class, the field position is declared transient, so it is not serialized by Kryo and is therefore initialized to position 0 when the schema is deserialized within the Flink pipeline.
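
To make the failure mode concrete: both the write and the read path of GenericData.Record go through Schema.Field.pos(). The snippet below is a simplified paraphrase of the relevant Avro 1.7.6 internals, not the verbatim source:

// Simplified paraphrase of org.apache.avro.generic.GenericData.Record
// internals in Avro 1.7.6 (illustrative, null checks omitted):
public void put(String key, Object value) {
    // writes the value into the slot reported by the named field
    values[schema.getField(key).pos()] = value;
}

public Object get(String key) {
    // reads the value from the slot reported by the named field
    return values[schema.getField(key).pos()];
}

// Field.pos() is backed by a transient int in Avro <= 1.7.6, and Kryo skips
// transient fields. The records are built with a correctly parsed schema
// before shipping, but after a serialize/deserialize round-trip every field
// reports pos() == 0, so every get() (including the ones behind toString())
// reads slot 0 -- which is why each record prints its first field's value
// for all fields.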

See the JIRA issue AVRO-1476, which describes this and specifically mentions Kryo serialization.

This was fixed in Avro 1.7.7.

Solution: Flink must use Avro 1.7.7 (or later). I have verified the fix on my local machine by replacing the Avro classes within flink-dist_2.11-1.1.3.jar, and it corrected my issue.
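
If you are unsure which Avro version actually ends up on your job's classpath, a quick runtime check (plain JDK calls, nothing Flink-specific) is:

// Print the Avro version and the jar it was loaded from, to confirm that the
// 1.7.7+ classes are picked up rather than an older copy bundled in
// flink-dist. Note: getImplementationVersion() may return null if the jar's
// manifest was stripped during repackaging.
System.out.println("Avro version: "
        + org.apache.avro.Schema.class.getPackage().getImplementationVersion());
System.out.println("Loaded from: " + org.apache.avro.Schema.class
        .getProtectionDomain().getCodeSource().getLocation());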

I updated the JIRA issue for this: https://issues.apache.org/jira/browse/FLINK-5039

There is a PR for this now: https://github.com/apache/flink/pull/2953

I expect it to be included in the Flink 1.1.4 and 1.2.0 releases.

Dave Torok

I was facing a similar issue with the DataSet API. I was reading some Avro files as GenericRecords and saw this weird behavior. I used this workaround: instead of reading them as GenericRecords, I read them as specific records (e.g. MyAvroObject) and then used a map to convert/cast them to GenericRecord.

I wrote some code to test your use case using the DataSet API, and it works with the above workaround:

public static void maintest(String[] args) throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(2);

    List<String> queryList1 = new ArrayList<String>();
    queryList1.add("query1");
    queryList1.add("query2");

    List<String> queryList2 = new ArrayList<String>();
    queryList2.add("QUERY1");
    queryList2.add("QUERY2");
    queryList2.add("QUERY3");
    queryList2.add("QUERY4");

    DataSet<String> dataset1  = env.fromCollection(queryList1);
    DataSet<String> dataset2  = env.fromCollection(queryList2);

    DataSet<GenericRecord> genericDS1 = dataset1.map(new MapFunction<String, GenericRecord>() {
        @Override
        public GenericRecord map(String value) throws Exception {
            Query query = Query.newBuilder().setQuery(value).build();
            return (GenericRecord) query;
        }
    });

    DataSet<GenericRecord> genericDS2 = dataset2.map(new MapFunction<String, GenericRecord>() {
        @Override
        public GenericRecord map(String value) throws Exception {
            SearchEngineQuery searchEngineQuery = SearchEngineQuery.newBuilder().setSeQuery(value).build();
            return (GenericRecord) searchEngineQuery;
        }
    });

    genericDS2.map(new MapFunction<GenericRecord, GenericRecord>() {
        @Override
        public GenericRecord map(GenericRecord value) throws Exception {
            System.out.println("DEBUG: data before union: " + value);
            return value;
        }
    });

    genericDS1.union(genericDS2).map(new MapFunction<GenericRecord, GenericRecord>() {
        @Override
        public GenericRecord map(GenericRecord value) throws Exception {
            System.out.println("DEBUG: data after union: " + value);
            return value;
        }
    }).print();
}

Here Query and SearchEngineQuery are my Avro-generated objects (analogous to your control message list and data message list). The cast to GenericRecord works because Avro-generated specific record classes implement the GenericRecord interface as well.

Output:

{"query": "query1"}
{"se_query": "QUERY1"}
{"se_query": "QUERY3"}
{"query": "query2"}
{"se_query": "QUERY2"}
{"se_query": "QUERY4"}