Page titles from URLs dumped to file

Question

I have a text document with maybe 50 URLs in there that I want the page titles to. I've been trying to use some code that does do that, but the compiler seems to be tripping up on scanner versus string arguments. Ideally, I'd like to take in a text document of URLs and dump the corresponding page titles to another doc.

import java.io.*;
import java.util.*;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

//code from: http://www.gotoquiz.com/web-coding/programming/java-programming/how-to-extract-titles-from-web-pages-in-java/

public class TitleExtractor {
    /* the CASE_INSENSITIVE flag accounts for
     * sites that use uppercase title tags.
     * the DOTALL flag accounts for sites that have
     * line feeds in the title text */
    private static final Pattern TITLE_TAG =
        Pattern.compile("\\<title>(.*)\\</title>", Pattern.CASE_INSENSITIVE|Pattern.DOTALL);

    /**
     * @param x the HTML page
     * @return title text (null if document isn't HTML or lacks a title tag)
     * @throws IOException
     */
    public static String getPageTitle(String url) throws IOException {
        URL u = new URL(url);
        URLConnection conn = u.openConnection();

        // ContentType is an inner class defined below
        ContentType contentType = getContentTypeHeader(conn);
        if (!contentType.contentType.equals("text/html"))
            return null; // don't continue if not HTML
        else {
            // determine the charset, or use the default
            Charset charset = getCharset(contentType);
            if (charset == null)
                charset = Charset.defaultCharset();

            // read the response body, using BufferedReader for performance
            InputStream in = conn.getInputStream();
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, charset));
            int n = 0, totalRead = 0;
            char[] buf = new char[1024];
            StringBuilder content = new StringBuilder();

            // read until EOF or first 8192 characters
            while (totalRead < 8192 && (n = reader.read(buf, 0, buf.length)) != -1) {
                content.append(buf, 0, n);
                totalRead += n;
            }
            reader.close();

            // extract the title
            Matcher matcher = TITLE_TAG.matcher(content);
            if (matcher.find()) {
                /* replace any occurrences of whitespace (which may
                 * include line feeds and other uglies) as well
                 * as HTML brackets with a space */
                return matcher.group(1).replaceAll("[\\s\\<>]+", " ").trim();
            }
            else
                return null;
        }
    }

    /**
     * Loops through response headers until Content-Type is found.
     * @param conn
     * @return ContentType object representing the value of
     * the Content-Type header
     */
    private static ContentType getContentTypeHeader(URLConnection conn) {
        int i = 0;
        boolean moreHeaders = true;
        do {
            String headerName = conn.getHeaderFieldKey(i);
            String headerValue = conn.getHeaderField(i);
            if (headerName != null && headerName.equals("Content-Type"))
                return new ContentType(headerValue);

            i++;
            moreHeaders = headerName != null || headerValue != null;
        }
        while (moreHeaders);

        return null;
    }

    private static Charset getCharset(ContentType contentType) {
        if (contentType != null && contentType.charsetName != null && Charset.isSupported(contentType.charsetName))
            return Charset.forName(contentType.charsetName);
        else
            return null;
    }

    /**
     * Class holds the content type and charset (if present)
     */
    private static final class ContentType {
        private static final Pattern CHARSET_HEADER = Pattern.compile("charset=([-_a-zA-Z0-9]+)", Pattern.CASE_INSENSITIVE|Pattern.DOTALL);

        private String contentType;
        private String charsetName;
        private ContentType(String headerValue) {
            if (headerValue == null)
                throw new IllegalArgumentException("ContentType must be constructed with a not-null headerValue");
            int n = headerValue.indexOf(";");
            if (n != -1) {
                contentType = headerValue.substring(0, n);
                Matcher matcher = CHARSET_HEADER.matcher(headerValue);
                if (matcher.find())
                    charsetName = matcher.group(1);
            }
            else
                contentType = headerValue;
        }
    }


    public class readfile{

        private Scanner x;

        public void openFile(){

            try{
                x = new Scanner(new File("urls.txt"));
            }
            catch(Exception e){
                System.out.println("could not find file");
            }

        }

        public void readFile() throws IOException{

            while(x.hasNext()){
                //String title = TitleExtractor.getPageTitle(x);
                String a = x.next();
                String title = TitleExtractor.getPageTitle(a);
                System.out.printf("%s\n",title);
                }
        }

        public void closeFile(){
            x.close();
        }



    }


    public static void main(String[] args) throws IOException{


        readfile r = new readfile();
        r.openFile();
        r.readFile();
        r.closeFile();



        }


}

I get a compilation error on the readfile r = new readfile(); bit:

"No enclosing instance of type TitleExtractor is accessible. Must qualify the allocation with an enclosing instance of the type TitleExtractor"

I'm not quite sure where the code is getting confused between scanners and string variables.

How great would it be if there was a way to know what compilation error occurred? — Sotirios Delimanolis, Jun 03 '14 at 22:37

score 0 · Answer 1 · answered Jun 03 '14 at 22:45

0

readfile is an non-static inner class, so it can't be instantiated without referencing an instance of TitleExtractor. The simplest solution would be to change the class declaration to public static class readfile.

answered Jun 03 '14 at 22:45

shmosel

49,289
6
73
138

Page titles from URLs dumped to file

1 Answers1