0

I am writing a code, which is using NIO/Selector to do web scraping. It works. I do get OP_CONNECT, then I send the GET request, and get the entire html page back. But, after that, I do not get a -1 to know it is finished. I do see , which means the entire page has been sent, but SocketChannel.read does not return -1 to indicate the end of the stream. Would really appreciate any help!

Here is the entire sample code:

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.MalformedURLException;
import java.net.StandardSocketOptions;
import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.channels.SelectionKey;
import java.nio.channels.Selector;
import java.nio.channels.SocketChannel;
import java.util.Iterator;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class HttpClientTest {
    private static final Logger logger = LoggerFactory.getLogger(HttpClientTest.class);
    private static final String BASE_URL_STR = "https://www.youtube.com/channel";
    private static final String CHANNEL_ID = "UCDm6kPZFCoT7altG4WNGy-A";

    private final ByteArrayOutputStream baHtmlPage = new ByteArrayOutputStream();
    private final ByteBuffer buffer = ByteBuffer.allocate(128 * 1024);

    private String htmlPage = null;

    private void startHttpClient() throws InterruptedException {


        // open Selector and ServerSocketChannel by calling the open() method
        try (Selector selector = Selector.open();
                SocketChannel socketChannel = SocketChannel.open()) {

            // check that both of them were successfully opened
            if ((socketChannel.isOpen()) && (selector.isOpen())) {

                // configure non-blocking mode
                socketChannel.configureBlocking(false);
                socketChannel.setOption(StandardSocketOptions.SO_RCVBUF,
                        128 * 1024);
                socketChannel.setOption(StandardSocketOptions.SO_SNDBUF,
                        128 * 1024);
                socketChannel.setOption(StandardSocketOptions.SO_KEEPALIVE,
                        true);
                //socketChannel.setOption(StandardSocketOptions.TCP_NODELAY,
                //      true);

                //socketChannel.connect(new InetSocketAddress(IP, DEFAULT_PORT));
                socketChannel.connect(createSocketAddress(CHANNEL_ID));

                // register the current channel with the given selector
                socketChannel.register(selector, SelectionKey.OP_CONNECT);


                while (true) {
                    // wait for incomming events
                    int num = selector.selectNow();
                    if (num==0) {
                        //Thread.yield();
                        Thread.sleep(2000);
                        System.out.println("sleep: 2 sec");
                        continue;
                    }


                    // there is something to process on selected keys
                    Iterator<SelectionKey> keys = selector.selectedKeys().iterator();
                    while (keys.hasNext()) {
                        SelectionKey key = (SelectionKey) keys.next();

                        // prevent the same key from coming up again
                        keys.remove();

                        if (!key.isValid()) {
                            continue;
                        }

                        if (key.isConnectable() && socketChannel.finishConnect()) {
                            System.out.println("Key: OP_CONNECT");
                            // reset the byte-array
                            baHtmlPage.reset();

                            // Connected --> Send the HTTP request 
                            key.interestOps(SelectionKey.OP_WRITE);

                        } else if (key.isReadable()) {
                            System.out.println("Key: OP_READ");
                            if (readResponse(key)) {
                                logger.info("finished reading, htmlpage:{}", htmlPage);
                            } else {
                                key.interestOps(SelectionKey.OP_READ);
                            }

                            // Once read is done --> we are done
                            //key.interestOps(SelectionKey.OP_WRITE);

                        } else if (key.isWritable()) {
                            System.out.println("Key: OP_WRITE");
                            if (writeHttpRequest(key)) {                            
                                // HTTP request is sent --> Get the response
                                key.interestOps(SelectionKey.OP_READ);
                            }
                        }
                    }

                }
            } else { // if ((serverSocketChannel.isOpen()) && (selector.isOpen())) {
                System.out
                        .println("The server socket channel or selector cannot be opened!");
            }
        } catch (IOException ex) {
            System.err.println(ex);
        }
    }

    private static InetSocketAddress createSocketAddress(String channelID) throws MalformedURLException {
        //String urlStr = BASE_URL_STR + "/" + CHANNEL_ID;  
        String urlStr = "http://www.google.com";  

        URL url = new URL(urlStr);
        String host = url.getHost();  
        int port = url.getPort();  
        if (port == -1) 
            port = 80;

        return new InetSocketAddress(host, port);
    }

    private boolean readResponse(SelectionKey key) throws IOException {
        boolean done = false;
        SocketChannel socketChannel = (SocketChannel) key.channel();

        int numRead = -1;
        do {
            buffer.clear();
            numRead = socketChannel.read(buffer);

            baHtmlPage.write(buffer.array(), 0, numRead);
            System.out.println("Server sent:" + new String(buffer.array(), 0, numRead, "UTF-8") );
        } while(numRead>0);

        if (numRead == -1) {
            System.out.println("Connection closed by: " + socketChannel.getRemoteAddress());
            key.cancel();
            socketChannel.close();
            htmlPage = baHtmlPage.toString("UTF-8");
            done = true;
        }
        return done;
    }

    private boolean writeHttpRequest(SelectionKey key) throws IOException {
        boolean done = false;

        SocketChannel socketChannel = (SocketChannel) key.channel();
        String request = 
                "GET /channel/UCDm6kPZFCoT7altG4WNGy-A HTTP/1.1\r\n" + 
                "Host: www.youtube.com\r\n" +
                "Cache-Control: no-cache\r\n\r\n"; 

        // ISO-8859-1
        ByteBuffer randomBuffer = ByteBuffer.wrap(request.getBytes("UTF-8"));
        int rem = randomBuffer.remaining();
        int num = socketChannel.write(randomBuffer);

        if (rem==num) {
            done = true;
            System.out.printf("Request written:%s\n", request);
        }
        return done;
    }

//  private void doEchoJob(SelectionKey key, byte[] data) {
//
//      SocketChannel socketChannel = (SocketChannel) key.channel();
//      List<byte[]> channelData = keepDataTrack.get(socketChannel);
//      channelData.add(data);
//
//      key.interestOps(SelectionKey.OP_WRITE);
//  }

    public static void main(String[] args) throws InterruptedException {
        HttpClientTest client = new HttpClientTest();
        client.startHttpClient();
    }
}
Behzad Pirvali
  • 764
  • 3
  • 10
  • 28

1 Answers1

2

You are doing a HTTP/1.1 requests, which has an implicit keep-alive. That means, that the server will not necessary close the connection once the full response is sent, but instead will keep it open for a while in the hope that it will get more requests and thus can save the overhead of another TCP connection setup.

While this helps with performance in the normal case of a browser, it does not help in your case. I would recommend to use HTTP/1.0 instead of HTTP/1.1, so that you don't have to deal with keep-alive or other HTTP/1.1 features like chunked encoding. Apart from that it is recommended to use existing HTTP libraries which deal with all these problems already.

Steffen Ullrich
  • 114,247
  • 10
  • 131
  • 172
  • Thanks for the reply. Yes, using HTTP 1.0, I am getting indeed a -1. I was hoping that using HTTP 1.1, I would get a content-length, but instead I am getting Transfer-Encoding: chunked – Behzad Pirvali Sep 12 '14 at 05:05
  • HTTP/1.0 has either content-length or simply end of connection, while HTTP/1.1 has additionally chunked mode. – Steffen Ullrich Sep 12 '14 at 05:14
  • Yes, Thanks, I am tempted to use HTTP 1.1 because of being able to reuse a socket connection – Behzad Pirvali Sep 12 '14 at 05:47
  • You can reuse the socket by explicitly setting "Connection: keep-alive" in HTTP/1.0 too. But then you cannot expect the stream to close at end of response like you currently do, but must instead use an explicitly given Content-length to find the end of the response. – Steffen Ullrich Sep 12 '14 at 05:55
  • Thanks for the tip, but then what would be the advantage of HTTP 1.1 unless you have got big amount of data to download where chunked transfer would make sense? – Behzad Pirvali Sep 12 '14 at 14:03
  • Chunked transfer has nothing to do with lots of data. It is used with dynamically generated content, where the server does not know up-front the size and thus cannot send the content-length to the client. With HTTP/1.0 the only options were to buffer everything to get the content-length or send no content-length and close the TCP connection to signal end of response. With HTTP/1.1 chunked mode you can start sending immediately and give the content-length for the various parts of the response and after you are done you can continue with the next request on the same TCP connection. – Steffen Ullrich Sep 12 '14 at 15:13
  • And HTTP/1.1 has also also better support for caching and has incorporated some things which were often used with HTTP/1.0 but never really defined (like keep-alive and host header). – Steffen Ullrich Sep 12 '14 at 15:16
  • Thank you so much for this clarification. Yes, it does make sense. So, on the client side with HTTP 1.0, you have got less work, but it does put more load and work on the Server. – Behzad Pirvali Sep 12 '14 at 17:29
  • If you do only a single request HTTP/1.0 is ok. If you do a lot more (like in a browser) using HTTP/1.1 is better for both client and server, because you can do more requests in the same time. – Steffen Ullrich Sep 12 '14 at 18:07