0
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.RejectedExecutionHandler;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.config.SocketConfig;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class ThreadScrapResults {

    private static final int MYTHREADS = 10000000;
     HttpClient client = HttpClientBuilder.create().build();
     static Hashtable<String, String> subCodeSubName = null;
     static Hashtable<String, String> collCodeCollName = null;

    public static void main(String[] args) throws IOException, InterruptedException{
         BlockingQueue<Runnable> blockingQueue =   
                    new LinkedBlockingQueue<Runnable>(105);
    //ExecutorService executor = Executors.newFixedThreadPool(MYTHREADS);

//       RejectedExecutionHandler block = new RejectedExecutionHandler() {
//            void rejectedExecution(Runnable r, ThreadPoolExecutor executor) {
//               executor.getQueue().put( r );
//            }
//          };   

    ThreadPoolExecutor executor = new ThreadPoolExecutor(Integer.MAX_VALUE, Integer.MAX_VALUE, 20, TimeUnit.MILLISECONDS, blockingQueue);



        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        cm.setDefaultSocketConfig( 
                SocketConfig.custom().setSoKeepAlive(true)
                  .setSoReuseAddress(true)
                  .setSoTimeout(3000)
                  .build());


        Runnable worker = null;
        //Generating some register Numbers
        for(int year = 11; year <= 13; year++){
            for(int i = 1; i <= 350; i++){
                //generating 1050 URLs at one shot
                StringBuffer regNo = new StringBuffer("1111").append(year).append("111").append(String.format("%03d", i));


                String url = "magicUrl" + regNo;
                System.out.println(url);
                worker = new MyRunnable(url, regNo.toString());
                 executor.execute(worker); 
                }    
            }
            executor.shutdown();

            //I want to execute all those 1050 Urls at one shot and parse the result //web pages. But this actually gives me only one result.



        }






    }

    class MyRunnable implements Runnable{

        private final String url;
        private final String registerNumber;

        public MyRunnable(String url, String registerNumber) {
            // TODO Auto-generated constructor stub

            this.url = url;
            this.registerNumber = registerNumber;
        }


        public void run(){

            HttpClient client = HttpClientBuilder.create().build();
            HttpGet get  = new HttpGet(url); 
            boolean insertOrNot = true;
            HttpResponse response = null;
            try {
                response = client.execute(get);
            } catch (ClientProtocolException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }


            BufferedReader rd = null;
            try {
                rd = new BufferedReader(
                        new InputStreamReader(response.getEntity().getContent()));
            } catch (IllegalStateException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }

            //I get the result of each url here.
            StringBuffer result = new StringBuffer();
            String line = "";
            try {
                while ((line = rd.readLine()) != null) {
                    result.append(line);
                }
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }

            Document resultWebPage = Jsoup.parse(result.toString());

            Elements resultForm = resultWebPage.getElementsByTag("strong");     
            Elements error = resultWebPage.getElementsByTag("b");

            if(error.size() == 4){

                String inValidRegNo = error.get(3).html();



                if(inValidRegNo.startsWith("Sorry")){

                    //log here
                    insertOrNot = false;
                }

            }   
            System.out.println(resultForm);

            Iterator<Element> itr = resultForm.iterator();

            int count = 1;
            boolean set = true;


             List<List<String>>  resultDBOject = new ArrayList<List<String>>();
             String regNum = null;
             String name = null;
             String deptName = null;  
             String collName = null;   //TODO : Get collName and deptName from enum. 

             String key = "Super";
             while(itr.hasNext()){
                    // System.out.println(itr.next().html());
                  key = itr.next().html();

                     try {                  

                         if(key.equals("<font color=\"#0000cc\" size=\"3\">Subject Code</font>") || key.equals("<font color=\"#0000cc\" size=\"3\">Grade</font>")
                                 || key.equals("<font color=\"#0000cc\" size=\"3\">Result</font>")){
                             continue;
                         }
                         else if(key.isEmpty()){

                        //   System.out.println("N/A");


                         }else if(!key.isEmpty()){              
                                 if(set){ 
                                     if(count == 1){
                                         regNum = key;
                                    //   System.out.println(regNum);
                                         count++;
                                     }
                                     if(count == 2){
                                         name = itr.next().html();          
                                    //   System.out.println(name);
                                         count++;
                                     }
                                     if(count == 3){
                                         deptName = itr.next().html();
                                    //   System.out.println(deptName);
                                     }
                                } 
                                if(count == 4 || count == 1){
                                     count = 0;
                                     set = false;       
                                    // String temp = itr.next().html();
                                //   Result results = new Result();
                                    // System.out.println(temp);


                                     List<String> resultOfAStudent = new ArrayList<String>();
                                        resultOfAStudent.add( key);
                                        resultOfAStudent.add( itr.next().html());
                                        resultOfAStudent.add(itr.next().html());
                                    //    resultOfAStudent.add(results.getSubjName());
                                        resultDBOject.add(resultOfAStudent);
                                 }  
                         }
                         count++;
                        // System.out.println(count);


                     } finally{

                     }


             }  //end of while   

            //insert it in db
            if(insertOrNot){ 



             System.out.println("Successfully inserted" + registerNumber);
            }
        }
    }       

Following is what I am trying to do.

I generate 1050 Urls. - Works fine in two for loops in main method. 1) I don't see my program terminating once I run, but I get all the results. 2) How can I make this program to sleep after executing 500 Urls and sleep for 10 seconds, then resume working on next 500 Urls.

sofs1
  • 3,834
  • 11
  • 51
  • 89
  • Do you really want to have a thread pool with `Integer.MAX_VALUE` threads? You'll need about 8 Gb of ram to store just the references to them! – isnot2bad Aug 13 '14 at 21:23

1 Answers1

2

Look at your loop:

for(int year = 11; year <= 13; year++){
        for(int i = 1; i <= 350; i++){
            //generating 1050 URLs at one shot
            StringBuffer regNo = new StringBuffer("1111").append(year).append("111").append(String.format("%03d", i));


            String url = "magicUrl" + regNo;
            System.out.println(url);
            worker = new MyRunnable(url, regNo.toString());

        }    
}

You're overwriting worker each time through the loop, so by the time you get to executor.execute(worker);, worker holds the last value you assigned to it, which will be the runnable created from the last url generated.

Try replacing the line worker = new MyRunnable(url, regNo.toString()); with executor.execute(new MyRunnable(url, regNo.toString())); and see if that fixes it.

Mike B
  • 5,390
  • 2
  • 23
  • 45
  • So, May I know how can I create 1000 Urls and execute it in one shot. Because doing it one by one will take more time. – sofs1 Aug 13 '14 at 18:15
  • You could either make the change I suggested in my answer, or perhaps collect all the runnables in a list and then loop through and execute each one. I would just do it the way I suggested in my answer, though. – Mike B Aug 13 '14 at 18:17
  • That seems like working. But the program doesn't terminate. It hangs after executing certain number of urls. I would like to count how many urls out of 1000 were successfully completed. – sofs1 Aug 13 '14 at 18:24
  • May I know why this executor.execute(new MyRunnable(url, regNo.toString())); fixes it and not worker = new MyRunnable(url, regNo.toString()); – sofs1 Aug 13 '14 at 20:21
  • @user3705478 You are adding just one single worker AFTER the loop. Correct is (as stated in this answer) to add the worker INSIDE the inner loop so that you end up having 1050 workers doing their job. – isnot2bad Aug 13 '14 at 21:20
  • Where should I add thread.sleep(1000) so that the program finishes parsing for every 1000 urls? – sofs1 Aug 13 '14 at 21:45