0

I am trying out Apache Storm for processing streams of geohash codes. I am using this library together with Apache Storm 0.9.3. Details of the Python geohash implementation can be found here.

Currently, I am facing a synchronization issue in the execute method of one bolt class. With a single bolt thread I get the correct output, but the moment I go from one bolt thread to two or more, the output gets messed up.

The code snippet for the bolt in question (only this one has the issue) is:

// Not referenced by any method visible here; presumably the geohash precision — TODO confirm.
public static int PRECISION=6;
// Collector handed to prepare(); execute() uses it to ack tuples.
private OutputCollector collector;
// Reader over /tmp/zip_code_database.csv; opened and closed inside prepare().
BufferedReader br;
// Geohash of the most recently processed tuple; "NONE" until the first tuple arrives.
String lastGeoHash="NONE";
// CSV header cell (exactly as it appears in the file, quotes included) -> column index.
HashMap<String,Integer> map;
// CSV "zip" cell -> {"state" cell, "primary_city" cell}, in that order.
HashMap<String,String[]> zcd;
// count -> zip, sorted by count so lastKey() is the highest count seen;
// seeded with the sentinel entry -1 -> "NONE" by initializeTreeMapAsPerOurRequirement().
TreeMap<Integer,String> counts=new TreeMap<Integer,String>();
/**
 * Prepares this bolt's state: stores the collector, loads the zip-code CSV
 * into {@code zcd}, and seeds {@code counts} with its sentinel entry.
 *
 * The first CSV line is treated as the header; its cells are recorded in
 * {@code map} (cell text -> column index). Every following line contributes
 * one {@code zcd} entry keyed by its "zip" cell with the value
 * {"state" cell, "primary_city" cell}. Lines are split on plain commas, so
 * cells keep their surrounding quotes and quoted cells that contain commas
 * are not supported.
 *
 * I/O failures are reported via printStackTrace and leave {@code zcd} with
 * whatever was loaded so far, as before.
 *
 * @param conf      topology configuration (unused here)
 * @param context   topology context (unused here)
 * @param collector collector stored for later use by execute()
 * @throws IllegalStateException if the header lacks the "zip", "state" or
 *         "primary_city" column
 */
public void prepare( Map conf, TopologyContext context, OutputCollector collector ) 
{
    this.collector = collector;
    map=new HashMap<String,Integer>();
    zcd=new HashMap<String,String[]>();
    try {
        br = new BufferedReader(new FileReader("/tmp/zip_code_database.csv"));
        try {
            String line = br.readLine();
            if (line != null) {
                String[] columns = line.split(",");
                for (int j = 0; j < columns.length; j++) {
                    map.put(columns[j], j);
                }

                // Resolve the three needed columns once instead of on every row,
                // and fail with a clear message rather than an unboxing NPE.
                Integer zipCol = map.get("\"zip\"");
                Integer stateCol = map.get("\"state\"");
                Integer cityCol = map.get("\"primary_city\"");
                if (zipCol == null || stateCol == null || cityCol == null) {
                    throw new IllegalStateException(
                            "zip_code_database.csv header must contain \"zip\", \"state\" and \"primary_city\": " + line);
                }
                int zipIdx = zipCol;
                int stateIdx = stateCol;
                int cityIdx = cityCol;
                int requiredLength = Math.max(zipIdx, Math.max(stateIdx, cityIdx)) + 1;

                while ((line = br.readLine()) != null) {
                    String[] split = line.split(",");
                    // split() drops trailing empty cells, so short rows are possible; skip them.
                    if (split.length < requiredLength) {
                        continue;
                    }
                    zcd.put(split[zipIdx], new String[]{split[stateIdx], split[cityIdx]});
                }
            }
        } finally {
            // Close the reader even when reading or parsing throws.
            br.close();
        }
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
    System.out.println("Initialize");
    initializeTreeMapAsPerOurRequirement(counts);
}

/**
 * Consumes one record and, whenever the geohash changes, prints the zip
 * with the highest count collected for the previous geohash.
 *
 * The tuple's "string" field is split on tabs and read as
 * geohash, count, zip. Per-geohash state lives in {@code lastGeoHash} and
 * {@code counts}, so the result is only meaningful when all records of one
 * geohash reach the same bolt instance consecutively.
 * NOTE(review): with more than one executor and a non-keyed grouping that
 * ordering is presumably not guaranteed — confirm against the topology.
 * The group of the very last geohash is never printed because nothing
 * signals end of input.
 *
 * Malformed records (fewer than three fields or a non-numeric count) are
 * reported on stderr and failed instead of throwing out of the bolt.
 *
 * @param tuple input tuple carrying the tab-separated record in field "string"
 */
public void execute( Tuple tuple ) 
{
    String record = tuple.getStringByField("string");
    String[] fields = (record == null) ? new String[0] : record.split("\t");
    if (fields.length < 3) {
        System.err.println("Malformed tuple, expected geohash<TAB>count<TAB>zip: " + record);
        collector.fail( tuple );
        return;
    }

    int count;
    try {
        count = Integer.parseInt(fields[1]);
    } catch (NumberFormatException e) {
        System.err.println("Non-numeric count in tuple: " + record);
        collector.fail( tuple );
        return;
    }
    String geohash = fields[0];
    String zip = fields[2];

    if (!geohash.equals(lastGeoHash) && !lastGeoHash.equals("NONE")) {
        // Highest count wins; counts is sorted by count.
        String best = counts.get(counts.lastKey());
        // zcd keys keep the CSV's surrounding quotes.
        String[] location = zcd.get("\"" + best + "\"");
        if (location != null) {
            // location is {state, primary_city}; the line is emitted as
            // geohash,zip,primary_city,state,US exactly as before.
            System.out.println(lastGeoHash+","+best+","+location[1]+","+location[0]+","+"US");
        } else if (!best.equals("NONE")) {
            // Zip not present in the lookup table: report the geohash alone.
            System.out.println(lastGeoHash);
        }
        // Start the next geohash from a clean slate; without this the
        // previous geohash's counts leak into the next group's maximum.
        initializeTreeMapAsPerOurRequirement(counts);
    }
    lastGeoHash = geohash;
    counts.put(count, zip);

    collector.ack( tuple );
}

/**
 * Resets the given map to its initial state: every entry is removed and a
 * single sentinel entry {@code -1 -> "NONE"} is inserted.
 *
 * @param counts map to reset in place
 */
private void initializeTreeMapAsPerOurRequirement(TreeMap<Integer,String> counts){
    final Integer sentinelCount = Integer.valueOf(-1);
    final String sentinelZip = "NONE";

    counts.clear();
    counts.put(sentinelCount, sentinelZip);
}

/**
 * Prints a trace line and declares this component's output as a single
 * field named "number".
 *
 * @param declarer declarer that receives the output field list
 */
public void declareOutputFields( OutputFieldsDeclarer declarer ) 
{
    System.out.println("here");

    Fields outputFields = new Fields( "number" );
    declarer.declare( outputFields );
}

Topology code is:

/**
 * Wires the spout -> "map" -> "reduce" topology (two executors each),
 * submits it to a LocalCluster under the name "test", waits via
 * Utils.sleep(10000), then kills the topology and shuts the cluster down.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) 
{
    TopologyBuilder topology = new TopologyBuilder();

    // Source stage.
    topology.setSpout( "spout", new SendWholeFileDataSpout(), 2 );
    // "map" reads from the spout with shuffle grouping.
    topology.setBolt( "map", new GeoHashBolt(), 2 )
            .shuffleGrouping( "spout" );
    // "reduce" reads from "map", grouped on the field named "value".
    topology.setBolt( "reduce", new GeoHashReduceBolt(), 2 )
            .fieldsGrouping( "map", new Fields( "value" ) );

    Config settings = new Config();

    LocalCluster localCluster = new LocalCluster();
    final String topologyName = "test";
    localCluster.submitTopology( topologyName, settings, topology.createTopology() );
    Utils.sleep( 10000 );
    localCluster.killTopology( topologyName );
    localCluster.shutdown();
}

Can someone look into the code and guide me a bit?

Matthias J. Sax
  • 59,682
  • 7
  • 117
  • 137
anuj pradhan
  • 2,777
  • 4
  • 26
  • 31
  • What have you set as the `parallelism_hint` for your bolt? Please also show your topology code. – Shams Feb 23 '15 at 12:54
  • @shizan added the topology code. I have not added the number of tasks for each executor, considering that it will not benefit for my application. – anuj pradhan Feb 24 '15 at 04:43

1 Answer

0

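You have set the parallelism_hint to 2 for your spout and both of your bolts. This means 2 executors will run per component, which may mess up your output: the bolt keeps its running state (`lastGeoHash` and `counts`) in instance fields, so that state is only consistent if all tuples for one geohash reach the same bolt instance in order.
By setting parallelism_hint to 1 you may achieve your desired output.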

Shams
  • 3,637
  • 5
  • 31
  • 49