
I have a streaming pipeline that reads from a Kafka topic, converts each JSON message into a class object, groups the records by id, and sends an output JSON to another Kafka topic.

The problem is that when I send more than 500 JSON messages to Kafka at the same time, the pipeline does not produce the same number of outputs, and several of the records are repeated in the output.

I applied windows and triggers, but I am new to Apache Beam and I don't know what is wrong with my code. I have tried to follow the official documentation, but I have not achieved what I want.

I am using the DirectRunner.

Here is the JSON I send to Kafka:

{
    "transactionId": "1k1",
    "products":[
    {
        "negotiationId": 2002,
        "productGtin": 329794528329,
        "state": 1,
        "observation": "0056-0055-0012-0034-0032-2056-0021-0058-0073-0020-0028-0023-0056-0055-0012-0034-0032-2056-0021-0058-0073-0020-0028-0023-0056-0055-0012-0034-0032-2056-0021-0058-0073-0020-0028-0023",
        "retailerCode": 9087120
    },
    {
        "negotiationId": 2004,
        "productGtin": 219059901669,
        "state": 2,
        "observation": "0056-0055-0012-0034-0032-2056-0021-0058-0073-0020-0028-0023-0056-0055-0012-0034-0032-2056-0021-0058-0073-0020-0028-0023-0056-0055-0012-0034-0032-2056-0021-0058-0073-0020-0028-0023",
        "retailerCode": 1312691
    },
    {
        "negotiationId": 2008,
        "productGtin": 111579181771,
        "state": 2,
        "observation": "0056-0055-0012-0034-0032-2056-0021-0058-0073-0020-0028-0023-0056-0055-0012-0034-0032-2056-0021-0058-0073-0020-0028-0023-0056-0055-0012-0034-0032-2056-0021-0058-0073-0020-0028-0023",
        "retailerCode": 4494522
    },
    {
        "negotiationId": 2009,
        "productGtin": 622670393720,
        "state": 2,
        "observation": "0056-0055-0012-0034-0032-2056-0021-0058-0073-0020-0028-0023-0056-0055-0012-0034-0032-2056-0021-0058-0073-0020-0028-0023-0056-0055-0012-0034-0032-2056-0021-0058-0073-0020-0028-0023",
        "retailerCode": 5742150
    },
    {
        "negotiationId": 2005,
        "productGtin": 397999498452,
        "state": 1,
        "observation": "0056-0055-0012-0034-0032-2056-0021-0058-0073-0020-0028-0023-0056-0055-0012-0034-0032-2056-0021-0058-0073-0020-0028-0023-0056-0055-0012-0034-0032-2056-0021-0058-0073-0020-0028-0023",
        "retailerCode": 5075384
    },
    {
        "negotiationId": 2004,
        "productGtin": 161158769517,
        "state": 2,
        "observation": "0056-0055-0012-0034-0032-2056-0021-0058-0073-0020-0028-0023-0056-0055-0012-0034-0032-2056-0021-0058-0073-0020-0028-0023-0056-0055-0012-0034-0032-2056-0021-0058-0073-0020-0028-0023",
        "retailerCode": 4256911
    },
    {
        "negotiationId": 2009,
        "productGtin": 352265510055,
        "state": 1,
        "observation": "0056-0055-0012-0034-0032-2056-0021-0058-0073-0020-0028-0023-0056-0055-0012-0034-0032-2056-0021-0058-0073-0020-0028-0023-0056-0055-0012-0034-0032-2056-0021-0058-0073-0020-0028-0023",
        "retailerCode": 2751445
    },
    {
        "negotiationId": 2003,
        "productGtin": 681014619525,
        "state": 2,
        "observation": "0056-0055-0012-0034-0032-2056-0021-0058-0073-0020-0028-0023-0056-0055-0012-0034-0032-2056-0021-0058-0073-0020-0028-0023-0056-0055-0012-0034-0032-2056-0021-0058-0073-0020-0028-0023",
        "retailerCode": 5359730
    }
     ]
}
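
For reference, the classes this JSON is deserialized into look roughly like the following (a minimal sketch inferred from the pipeline code below; the real classes may have more fields or annotations, and negotiationId is kept as a String because the pipeline builds KV<String, Product>):

import java.io.Serializable;

// Sketch only: each class would normally live in its own file
class NegotiationProduct {
    private String transactionId;
    private Product[] products;

    public String getTransactionId() { return transactionId; }
    public Product[] getProducts() { return products; }
}

class Product implements Serializable {
    private String negotiationId; // String rather than int: the pipeline keys by KV<String, Product>
    private long productGtin;
    private int state;
    private String observation;
    private long retailerCode;

    public String getNegotiationId() { return negotiationId; }
}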

Here is the code:

public PipelineResult run() {

    Pipeline pipeline = Pipeline.create(options);
    CoderRegistry coderRegistry = pipeline.getCoderRegistry();
    coderRegistry.registerCoderForType(coderPipeline.getCoder().getEncodedTypeDescriptor(), coderPipeline.getCoder());
    
    options.setProject("low-code-exito");
    options.setRegion("us-central1");
    options.setStagingLocation("gs://low-code-exito/binaries/");
    options.setGcpTempLocation("gs://low-code-exito/temp/");
    options.setNetwork("default");
    options.setSubnetwork("regions/us-central1/subnetworks/default");
    options.setStreaming(true); // isStreaming() is only a getter; setStreaming(true) actually enables streaming mode
    options.setKafkaReadTopics("test-topic");
    options.setReadBootstrapServers("localhost:9092");

    pipeline
            .apply(
                    "ReadFromKafka",
                    KafkaIO.<String, String>read()
                            .withTopic(options.getKafkaReadTopics())
                            .withBootstrapServers(options.getReadBootstrapServers())
                            .withKeyDeserializer(StringDeserializer.class)
                            .withValueDeserializer(StringDeserializer.class)
                            .withTimestampPolicyFactory((tp, previousWatermark) -> new CustomFieldTimePolicy(previousWatermark))
                            .withoutMetadata())
            .apply(Values.create())
            // 1 ms fixed windows with a repeated processing-time trigger; fired panes are discarded
            .apply(Window.<String>into(FixedWindows.of(Duration.millis(1)))
                    .triggering(Repeatedly.forever(AfterProcessingTime.pastFirstElementInPane().plusDelayOf(Duration.millis(1))))
                    .discardingFiredPanes()
                    .withAllowedLateness(Duration.ZERO))
            .apply("print", ParDo.of(new DoFn<String, KV<String, Product>>() {
                    @ProcessElement
                    public void processElement(@Element String input, ProcessContext context) {
                        Gson gson = builder.create();
                        NegotiationProduct negotiationProducts = gson.fromJson(input, NegotiationProduct.class);
                        fileName = negotiationProducts.getTransactionId();
                        Arrays.stream(negotiationProducts.getProducts())
                                .forEach(product -> context.output(KV.of(product.getNegotiationId(), product)));
                    }
                }))
            // Group products per negotiationId, then batch all groups under a single key, 8 at a time
            .apply(GroupByKey.create())
            .apply(WithKeys.of(0))
            .apply(GroupIntoBatches.ofSize(8))
            .apply(ParDo.of(new DoFn<KV<Integer, Iterable<KV<String, Iterable<Product>>>>, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                    GsonBuilder builder = new GsonBuilder();
                    builder.setPrettyPrinting();
                    Gson gson = builder.create();

                    List<Negotiation> negotiations = new ArrayList<>();
                    Objects.requireNonNull(c.element()).getValue().forEach(stringIterableKV -> {
                        Negotiation negotiation1 = new Negotiation(stringIterableKV.getKey(), IterableUtils.toList(stringIterableKV.getValue()));
                        negotiations.add(negotiation1);
                    });

                    Payload payload = new Payload(negotiations);
                    String products = gson.toJson(payload);
                    Output output = new Output(
                            new Header(
                                    "a30631d2-56a0-4bdf-b311-355676495af3",
                                    "sinco-notificacion-aprobacion-segmentacion-mq",
                                    LocalDateTime.now().atOffset(ZoneOffset.UTC).toString(),
                                    new FlexField(fileName)),
                            new DataInfo(
                                    "sinco-notificacion-aprobacion-segmentacion-mq",
                                    "application/json",
                                    products.replaceAll("\\s", ""), // \s already matches \n and \t
                                    LocalDateTime.now().atOffset(ZoneOffset.UTC).toString(),
                                    "a30631d2-56a0-4bdf-b311-355676495af3",
                                    "negotiation-load-approved-2096-1664469006272-a30631d2-56a0-4bdf-b311-355676495af3.json",
                                    "UTF-8"
                            ));
                    String results = gson.toJson(output);
                    c.output(results);
                }
            }))
            .apply("write to kafka", KafkaIO.<Void, String>write()
                    .withBootstrapServers("localhost:9092")
                    .withTopic("exito")
                    .withValueSerializer(StringSerializer.class)
                    .withProducerConfigUpdates(ImmutableMap.of(ProducerConfig.MAX_REQUEST_SIZE_CONFIG, 52428800))
                    .values());

    return pipeline.run();
}
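
CustomFieldTimePolicy is referenced in withTimestampPolicyFactory but not shown above. For context, a simplified sketch of a timestamp policy of that shape, built on KafkaIO's TimestampPolicy API (illustrative only, not the original class):

import java.util.Optional;
import org.apache.beam.sdk.io.kafka.KafkaRecord;
import org.apache.beam.sdk.io.kafka.TimestampPolicy;
import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
import org.joda.time.Instant;

// Illustrative sketch only: the real CustomFieldTimePolicy may read the timestamp
// from a field of the JSON value instead of the Kafka record metadata.
public class CustomFieldTimePolicy extends TimestampPolicy<String, String> {

    private Instant currentWatermark;

    public CustomFieldTimePolicy(Optional<Instant> previousWatermark) {
        this.currentWatermark = previousWatermark.orElse(BoundedWindow.TIMESTAMP_MIN_VALUE);
    }

    @Override
    public Instant getTimestampForRecord(PartitionContext ctx, KafkaRecord<String, String> record) {
        // Advance the watermark to the timestamp of the latest record seen
        currentWatermark = new Instant(record.getTimestamp());
        return currentWatermark;
    }

    @Override
    public Instant getWatermark(PartitionContext ctx) {
        return currentWatermark;
    }
}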

I welcome any corrections and suggestions.

  • I don't see anything that would cause duplicating elements. Since you use `discardingFiredPanes` and `FixedWindows` each element should only be output once. – Kenn Knowles Nov 28 '22 at 21:33
  • According to the scenario I am considering, what window and trigger strategy is most appropriate? Or what pipeline improvement do you think should be done? – Ricardo Ortega Nov 28 '22 at 21:50
  • Is your pipeline / ParDo failing and bubbling exceptions up at some point? That seems the only possible reason for duplications, as records will be retried/reprocessed in such a scenario. – Bruno Volpato Nov 29 '22 at 16:33
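
One way to test Bruno Volpato's hypothesis is to guard the parsing DoFn so a failure is logged instead of bubbling up and causing the bundle to be retried. A sketch of the processElement method from the "print" ParDo above, rewritten with that guard (LOG is an assumed SLF4J logger on the enclosing class):

@ProcessElement
public void processElement(@Element String input, ProcessContext context) {
    try {
        NegotiationProduct negotiationProducts = builder.create().fromJson(input, NegotiationProduct.class);
        Arrays.stream(negotiationProducts.getProducts())
                .forEach(product -> context.output(KV.of(product.getNegotiationId(), product)));
    } catch (RuntimeException e) {
        // If this ever logs, the bundle would otherwise have been retried, re-emitting
        // elements that were already produced downstream and explaining the duplicates.
        LOG.error("Failed to process record, dropping it: {}", input, e);
    }
}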
