
I'm evaluating Esper as a system for loss-less processing of billing data. The system is expected to handle ~20000 events per second and run ~400 statements with continuous aggregation (without storing events in memory). To reach the expected throughput I started sending events from multiple threads, and found that Esper often loses data.

A simple example that shows the data loss:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;

import com.espertech.esper.client.Configuration;
import com.espertech.esper.client.EPAdministrator;
import com.espertech.esper.client.EPRuntime;
import com.espertech.esper.client.EPServiceProvider;
import com.espertech.esper.client.EPServiceProviderManager;
import com.espertech.esper.client.EPStatement;

public class Example {

    public static void main(String[] args) throws Exception {
        new Example().run();
    }

    public void run() throws Exception {
        Configuration config = new Configuration();

        // use default configuration
        EPServiceProvider epService = EPServiceProviderManager.getDefaultProvider(config);

        EPAdministrator epAdministrator = epService.getEPAdministrator();
        // simple schema
        epAdministrator.getConfiguration().addEventType(LogLine.class);

        // event for terminating context partition
        createEPL(epAdministrator, "create schema TerminateEvent() ");

        // Start context partition on LogLine event and terminate on TerminateEvent.
        createEPL(epAdministrator, "create context InitCtx start LogLine end TerminateEvent");

        // statement that aggregates the count of events and sum of bytes per context partition
        EPStatement statement = createEPL(epAdministrator, "context InitCtx select context.id as partition_id, count(*), sum(bytes) from LogLine output last when terminated");

        // register a listener that prints all property values of each new event
        statement.addListener((newEvents, oldEvents) -> {
            String resultEvents = Arrays.stream(newEvents).map((event) -> {
                return Arrays.stream(event.getEventType().getPropertyNames()).map((prop) -> {
                    return prop + "=" + event.get(prop);
                }).collect(Collectors.joining(", "));
            }).collect(Collectors.joining("]; ["));
            System.out.println("=== results: [" + resultEvents + "]");

        });

        // let's use 4 threads for sending the data
        ExecutorService myexecutor = Executors.newFixedThreadPool(4);
        List<CompletableFuture<Void>> listOfTasks = new ArrayList<>();

        //get data to be processed
        List<LogLine> list = getData();
        for (int i = 1; i <= list.size(); i++) {
            // concurrently send each log line
            final LogLine logLine = list.get(i - 1);
            CompletableFuture<Void> task = CompletableFuture.runAsync(() -> {
                epService.getEPRuntime().sendEvent(logLine);
                System.out.println("== sending data " + logLine);
            }, myexecutor);
            listOfTasks.add(task);

            if (i % 4 == 0) {
                // terminate context partition after every 4 events.
                sendTerminateEvent(listOfTasks, epService.getEPRuntime());
            }
        }

        // terminate context partition at the end of the execution.
        sendTerminateEvent(listOfTasks, epService.getEPRuntime());

        // shut down all services
        myexecutor.shutdown();
        epService.destroy();
    }

    private void sendTerminateEvent(List<CompletableFuture<Void>> listOfTasks, EPRuntime epRuntime) throws Exception {
        // wait for all submitted tasks to finish
        CompletableFuture[] array = listOfTasks.toArray(new CompletableFuture[listOfTasks.size()]);
        CompletableFuture.allOf(array).get(1, TimeUnit.MINUTES);
        listOfTasks.clear();

        System.out.println("== sending terminate event.");
        // send partition termination event
        epRuntime.sendEvent(Collections.emptyMap(), "TerminateEvent");
    }

    private List<LogLine> getData() {
        List<LogLine> dataEventsList = new ArrayList<>();
        dataEventsList.add(new LogLine(0, 1));
        dataEventsList.add(new LogLine(0, 2));
        dataEventsList.add(new LogLine(0, 3));
        dataEventsList.add(new LogLine(0, 4));
        dataEventsList.add(new LogLine(0, 5));
        dataEventsList.add(new LogLine(1, 1));
        dataEventsList.add(new LogLine(1, 2));
        dataEventsList.add(new LogLine(1, 3));
        dataEventsList.add(new LogLine(1, 4));
        dataEventsList.add(new LogLine(1, 5));
        return dataEventsList;
    }

    private EPStatement createEPL(EPAdministrator admin, String statement) {
        System.out.println("creating EPL: " + statement);
        return admin.createEPL(statement);
    }

    public static class LogLine {
        int account_id;
        int bytes;

        public LogLine(int account_id, int bytes) {
            this.account_id = account_id;
            this.bytes = bytes;
        }

        public int getAccount_id() {
            return account_id;
        }

        public int getBytes() {
            return bytes;
        }

        @Override
        public String toString() {
            return "[account_id=" + account_id + ", bytes=" + bytes + "]";
        }
    }

}

Execution output:

creating EPL: create schema TerminateEvent() 
creating EPL: create context InitCtx start LogLine end TerminateEvent
creating EPL: context InitCtx select context.id as partition_id, count(*), sum(bytes) from LogLine output last when terminated
== data [account_id=0, bytes=3] was sent
== data [account_id=0, bytes=1] was sent
== data [account_id=0, bytes=4] was sent
== data [account_id=0, bytes=2] was sent
== sending terminate event.
=== results: [partition_id=0, count(*)=4, sum(bytes)=10]
== data [account_id=1, bytes=2] was sent
== data [account_id=1, bytes=3] was sent
== data [account_id=0, bytes=5] was sent
== data [account_id=1, bytes=1] was sent
== sending terminate event.
=== results: [partition_id=1, count(*)=2, sum(bytes)=6]
== data [account_id=1, bytes=5] was sent
== data [account_id=1, bytes=4] was sent
== sending terminate event.
=== results: [partition_id=2, count(*)=1, sum(bytes)=4]

The results for the first partition are correct; the next two partitions output invalid results:

// OK
actual   [partition_id=0, count(*)=4, sum(bytes)=10]
expected [partition_id=0, count(*)=4, sum(bytes)=10]

// LOSS
actual   [partition_id=1, count(*)=2, sum(bytes)=6]
expected [partition_id=1, count(*)=4, sum(bytes)=11]

// LOSS
actual   [partition_id=2, count(*)=1, sum(bytes)=4]
expected [partition_id=2, count(*)=2, sum(bytes)=9]

What's wrong with this example code?

Enabling priority-based execution order didn't help either:

creating EPL: create schema TerminateEvent() 
creating EPL: @Priority(1) create context InitCtx start LogLine end TerminateEvent
creating EPL: @Priority(0) context InitCtx select context.id as partition_id, count(*), sum(bytes) from LogLine output last when terminated
== data [account_id=0, bytes=3] was sent
== data [account_id=0, bytes=4] was sent
== data [account_id=0, bytes=1] was sent
== data [account_id=0, bytes=2] was sent
== sending terminate event.
=== results: [partition_id=0, count(*)=4, sum(bytes)=10]
== data [account_id=1, bytes=2] was sent
== data [account_id=1, bytes=3] was sent
== data [account_id=0, bytes=5] was sent
== data [account_id=1, bytes=1] was sent
== sending terminate event.
=== results: [partition_id=1, count(*)=2, sum(bytes)=6]
== data [account_id=1, bytes=5] was sent
== data [account_id=1, bytes=4] was sent
== sending terminate event.
=== results: [partition_id=2, count(*)=1, sum(bytes)=4]
1 Answer


This question is a more elaborate DUPLICATE of Esper data loss when inbound threading is enabled

When your Esper EPL requires ordered execution, you must develop your code so that it processes events in an ordered fashion. Esper cannot magically enforce an ordering: the JVM can pause any thread at any time, so you must design your code accordingly.

For example, let's assume you have two kinds of events: A events can be processed in parallel, while B events must be processed exactly in the order shown in the example below.

Let's say events come in, and you want B1 to be processed after A1 and A2 but before A3 and A4:

A1 A2 B1 A3 A4

If you just add all A and B events to a queue backed by a thread pool of, say, 5 threads, then B can be processed first, in the middle, or last. Every run can produce a different result, as the JVM does not enforce an order. Esper cannot enforce an order either, because your application drives Esper and not the other way around.

What you can do, for example, is add the first set of A events (A1, A2) to a queue. When B comes in, wait for the queue to empty, then add B to the queue and wait for B to complete. After that, add the next set of A events (A3, A4) to the queue. This achieves ordered processing with respect to A and B, with all A events still being processed in parallel, as sketched below.
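A minimal sketch of that queue-and-wait approach, assuming the question's setup (the class, the method names, and the fixed 1-minute timeout are illustrative choices of mine, not Esper API):

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import com.espertech.esper.client.EPRuntime;

public class OrderedSender {
    private final EPRuntime runtime;
    private final ExecutorService pool = Executors.newFixedThreadPool(4);
    private final List<CompletableFuture<Void>> inFlight = new ArrayList<>();

    public OrderedSender(EPRuntime runtime) {
        this.runtime = runtime;
    }

    // A events: order does not matter, submit them in parallel
    public synchronized void sendParallel(Object event) {
        inFlight.add(CompletableFuture.runAsync(() -> runtime.sendEvent(event), pool));
    }

    // B events: act as a barrier that nothing may overtake
    public synchronized void sendBarrier(Object event) throws Exception {
        // 1. drain: wait until every in-flight A event has been processed
        CompletableFuture.allOf(inFlight.toArray(new CompletableFuture[0]))
                .get(1, TimeUnit.MINUTES);
        inFlight.clear();
        // 2. send B on the current thread so no A event can overtake it
        runtime.sendEvent(event);
        // 3. callers may now resume submitting A events in parallel
    }
}

Within each window the A events run concurrently; only the barrier call serializes.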

CORRECTION:

I see now that you only have one event type and no A+B mix. In this case, make sure you are running the most recent version. Also make sure that "create context" does not get a lower priority, otherwise context partitions get created last. I have run your code around 10 times and did not see invalid output with 7.1.0. I'm on JDK 1.8.0_121 (Oracle).
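For reference, a setup along the lines of that correction might look like this (a sketch against the Esper 7.x client API, reusing the question's LogLine type; as far as I know, @Priority is only honored once prioritized execution is enabled in the engine defaults):

import com.espertech.esper.client.Configuration;
import com.espertech.esper.client.EPAdministrator;
import com.espertech.esper.client.EPServiceProvider;
import com.espertech.esper.client.EPServiceProviderManager;

public class PrioritizedSetup {
    public static void main(String[] args) {
        Configuration config = new Configuration();
        // @Priority annotations are ignored unless prioritized execution is enabled
        config.getEngineDefaults().getExecution().setPrioritized(true);

        EPServiceProvider epService = EPServiceProviderManager.getDefaultProvider(config);
        EPAdministrator admin = epService.getEPAdministrator();
        admin.getConfiguration().addEventType(Example.LogLine.class);
        admin.createEPL("create schema TerminateEvent()");

        // higher number = higher priority: the context statement must run first,
        // so the partition exists before the aggregation sees the same event
        admin.createEPL("@Priority(1) create context InitCtx start LogLine end TerminateEvent");
        admin.createEPL("@Priority(0) context InitCtx select context.id as partition_id, "
                + "count(*), sum(bytes) from LogLine output last when terminated");
    }
}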

  • My example has only 2 types of events: RowLogLine (contains the data to be processed) and TerminateEvent (a marker event that is sent every 5 min in order to terminate the context). I don't care about the order in which RowLogLine events are processed, that's why I send them concurrently. When I need to terminate the context I wait for all currently running sending tasks to finish and then submit a TerminateEvent. What is wrong with this logic? – Taras Apr 09 '18 at 19:04
  • In my use case ordering is not required for processing. SUM(), AVG(), COUNT() and other aggregation functions can produce valid results regardless of whether RowLogLine1 or RowLogLine2 was first. – Taras Apr 09 '18 at 19:10
  • The @priority seems wrong then because you want the context to get a chance to actually allocate partitions. – user650839 Apr 09 '18 at 19:27
  • I've removed priority but I still have strange results. Updated question with code sample and result... – Taras Apr 10 '18 at 11:02
  • Your update still has priority. Switch the priority giving the context the higher priority. – user650839 Apr 10 '18 at 11:07
  • :) An example class was added; it doesn't enable priority-based statement execution order. But even when I enable it and set the statement priorities, the results are still the same invalid ones – Taras Apr 10 '18 at 11:20
  • updated original question with output for prioritized execution – Taras Apr 10 '18 at 11:28
  • I have run it a few times and it works fine for me. Make sure you have the current version 7.1.0. Perhaps an older version had an issue. – user650839 Apr 10 '18 at 20:00