
I have a use case where I need to fetch all the records from Cassandra for a given time range, divide them into 30 chunks, and then aggregate each chunk. For example, suppose I fetch 60 records for a time range of 30 minutes: I then need to divide them into 30 chunks, which works out to 2 records per minute. If I fetch 600 records for a time range of 1 hour, the 30 chunks will be 20 records per 2 minutes. If I fetch 600 records for a time range of 1 week, the 30 chunks will be 20 records per 5.6 hours, and so on.
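To make the chunking concrete, here is a minimal sketch of the bucket arithmetic (the values are only illustrative, assuming a 30-minute range):

long rangeStartMillis = 1490054400000L;                          // 2017-03-21 00:00:00.000 UTC
long rangeEndMillis = rangeStartMillis + 30L * 60 * 1000;        // assume a 30-minute range
long interval = (rangeEndMillis - rangeStartMillis) / 30;        // width of one chunk (1 minute here)

// every record maps to a bucket 0..29 based on its start_time
long recordTimeMillis = rangeStartMillis + 95_000;               // example record timestamp
long bucket = (recordTimeMillis - rangeStartMillis) / interval;  // -> 1, i.e. the second chunk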

To implement this I wrote Java code, which returns the result in about 3 seconds for 100k records. I expected that implementing the same logic as a Cassandra UDF/UDA would give a performance benefit, but the UDF takes 6-7 seconds (double the time taken by the Java code), which is surprising to me. Can somebody please point out where I'm off track? Below are my table structure and the Java as well as UDF code.

Cassandra table schema

CREATE TABLE transactions_data (
    app_name text,
    api_name text,
    app_id text,
    start_time timestamp,
    duration int,
    end_time timestamp,
    node_id text,
    request_body text,
    request_parameter_name1 text,
    request_parameter_name2 text,
    request_parameter_name3 text,
    request_parameter_name4 text,
    request_parameter_name5 text,
    request_parameter_value1 text,
    request_parameter_value2 text,
    request_parameter_value3 text,
    request_parameter_value4 text,
    request_parameter_value5 text,
    response_body text,
    response_parameter_name1 text,
    response_parameter_name2 text,
    response_parameter_name3 text,
    response_parameter_name4 text,
    response_parameter_name5 text,
    response_parameter_value1 text,
    response_parameter_value2 text,
    response_parameter_value3 text,
    response_parameter_value4 text,
    response_parameter_value5 text,
    responsestatus text,
    responsestatuscode text,
    transaction_id text,
    PRIMARY KEY ((app_name, api_name, app_id), start_time)
);

Java code

public class SamplingDataJava {

private static Logger logger = LoggerFactory.getLogger(SamplingDataJava.class);

private static String startTime = "2017-03-21 00:00:00.000";
private static String endTime = "2017-04-25 00:00:00.000";

private final String SELECT_STATEMENT = "select start_time,duration from transactions_data "
        + " where app_name='app_name-abc' and api_name='api_name-1' "
        + " and app_id='app_id-xyz' " + " AND start_time>='"
        + startTime + "' AND start_time<='" + endTime + "' ";

private Cluster cluster;
private Session session;

private String host = "localhost";


public SamplingDataJava() throws IOException {

    // this.query=query;
    logger.info("Using CQL3 Writer");

    cluster = Cluster.builder().addContactPoints(host)
            .withSocketOptions(new SocketOptions().setConnectTimeoutMillis(2000000)).build();

    session = cluster.connect();

}

private class Result {
    double duration;
    int count;

    Result(double duration, int count) {
        this.duration = duration;
        this.count = count;
    }

    @Override
    public String toString() {
        return "Result [duration=" + duration + ", count=" + count + "]";
    }

}

public void hashSampling(long interval,long initTime) throws IOException {

    HashMap<Long, Result> agg = new HashMap<>();
    ResultSet rs = session.execute(SELECT_STATEMENT);

    int i = 0;
    for (com.datastax.driver.core.Row row : rs) {

        i++;
        Long hashcode = Math.abs((row.getTimestamp("start_time").getTime() - initTime) / interval);

        Result hasResult = agg.get(hashcode);

        if (hasResult == null) {
            hasResult = new Result(row.getInt("duration"), 1);
        } else {
            hasResult.duration = (hasResult.duration + row.getInt("duration"));
            hasResult.count++;
        }

        agg.put(hashcode, hasResult);

    }

    System.out.println("total number of records " + i);

    Long code=0L;
    while (code<30) {
       System.out.println(" code "+agg.get(code));
       code++;
    }

}

public void close() {
    // close the session before shutting down the cluster
    session.close();
    cluster.close();
}

public static void main(String[] args) throws IOException {

    long beginTime = System.currentTimeMillis();
    SamplingDataJava cqp = new SamplingDataJava();

    long onlyQueryTime = System.currentTimeMillis();
    DateTimeFormatter readPattern = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS");

    DateTime sTime = readPattern.parseDateTime(startTime);

    DateTime eTime = readPattern.parseDateTime(endTime);

    long interval = (eTime.getMillis() - sTime.getMillis()) / 30;

    System.out.println("start end time :" + eTime.getMillis() + " " + sTime.getMillis());

    cqp.hashSampling(interval,sTime.getMillis());
    System.out.println("total time without open close " + (System.currentTimeMillis() - onlyQueryTime));

    cqp.close();

    System.out.println("total time " + (System.currentTimeMillis() - beginTime));
}

}

UDF code

CREATE OR REPLACE FUNCTION txn_group_count_and_sum(
    txn map<bigint, frozen<tuple<int,int>>>,
    start_time bigint,
    duration int,
    sample_size bigint,
    begin_time bigint )
RETURNS NULL ON NULL INPUT
RETURNS map<bigint, frozen<tuple<int,int>>>
LANGUAGE java AS '
    Long hashcode = (start_time - begin_time) / sample_size;
    TupleValue tupleValue = txn.get(hashcode);

    if (tupleValue == null) {
        com.datastax.driver.core.TupleType tupleType = com.datastax.driver.core.TupleType.of(
                com.datastax.driver.core.ProtocolVersion.NEWEST_SUPPORTED,
                com.datastax.driver.core.CodecRegistry.DEFAULT_INSTANCE,
                com.datastax.driver.core.DataType.cint(),
                com.datastax.driver.core.DataType.cint());
        tupleValue = tupleType.newValue(1, duration);
    } else {
        tupleValue.setInt(0, tupleValue.getInt(0) + 1);
        tupleValue.setInt(1, tupleValue.getInt(1) + duration);
    }

    txn.put(hashcode, tupleValue);
    return txn;
';


CREATE OR REPLACE AGGREGATE group_count_and_sum(bigint, int, bigint, bigint)
SFUNC txn_group_count_and_sum 
STYPE map<bigint, frozen<tuple<int,int>>>
INITCOND {};

Query

select group_count_and_sum(toUnixTimestamp(start_time), duration, 100800000, 1490054400000) from transactions_data
    where app_name='app_name-abc' and api_name='api_name-1'
    and app_id='app_id-xyz'
    AND start_time>='2017-03-21 00:00:00.000' AND start_time<='2017-04-25 00:00:00.000';

Note:

 100800000 = (end_time - start_time) / 30, i.e. the sample_size argument in milliseconds
 1490054400000 = 2017-03-21 00:00:00.000 in epoch milliseconds
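For reference, a minimal sketch of how those two literals can be derived (using the same Joda-Time pattern as the Java code above; the epoch value assumes the range is interpreted in UTC):

DateTimeFormatter pattern = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS");
long begin = pattern.parseDateTime("2017-03-21 00:00:00.000").getMillis();  // 1490054400000 in UTC
long end = pattern.parseDateTime("2017-04-25 00:00:00.000").getMillis();
long sampleSize = (end - begin) / 30;                                       // 100800000 (35 days / 30)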