I have a text document with maybe 50 URLs in there that I want the page titles to. I've been trying to use some code that does do that, but the compiler seems to be tripping up on scanner versus string arguments. Ideally, I'd like to take in a text document of URLs and dump the corresponding page titles to another doc.
import java.io.*;
import java.util.*;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
//code from: http://www.gotoquiz.com/web-coding/programming/java-programming/how-to-extract-titles-from-web-pages-in-java/
public class TitleExtractor {
/* the CASE_INSENSITIVE flag accounts for
* sites that use uppercase title tags.
* the DOTALL flag accounts for sites that have
* line feeds in the title text */
private static final Pattern TITLE_TAG =
Pattern.compile("\\<title>(.*)\\</title>", Pattern.CASE_INSENSITIVE|Pattern.DOTALL);
/**
* @param x the HTML page
* @return title text (null if document isn't HTML or lacks a title tag)
* @throws IOException
*/
public static String getPageTitle(String url) throws IOException {
URL u = new URL(url);
URLConnection conn = u.openConnection();
// ContentType is an inner class defined below
ContentType contentType = getContentTypeHeader(conn);
if (!contentType.contentType.equals("text/html"))
return null; // don't continue if not HTML
else {
// determine the charset, or use the default
Charset charset = getCharset(contentType);
if (charset == null)
charset = Charset.defaultCharset();
// read the response body, using BufferedReader for performance
InputStream in = conn.getInputStream();
BufferedReader reader = new BufferedReader(new InputStreamReader(in, charset));
int n = 0, totalRead = 0;
char[] buf = new char[1024];
StringBuilder content = new StringBuilder();
// read until EOF or first 8192 characters
while (totalRead < 8192 && (n = reader.read(buf, 0, buf.length)) != -1) {
content.append(buf, 0, n);
totalRead += n;
}
reader.close();
// extract the title
Matcher matcher = TITLE_TAG.matcher(content);
if (matcher.find()) {
/* replace any occurrences of whitespace (which may
* include line feeds and other uglies) as well
* as HTML brackets with a space */
return matcher.group(1).replaceAll("[\\s\\<>]+", " ").trim();
}
else
return null;
}
}
/**
* Loops through response headers until Content-Type is found.
* @param conn
* @return ContentType object representing the value of
* the Content-Type header
*/
private static ContentType getContentTypeHeader(URLConnection conn) {
int i = 0;
boolean moreHeaders = true;
do {
String headerName = conn.getHeaderFieldKey(i);
String headerValue = conn.getHeaderField(i);
if (headerName != null && headerName.equals("Content-Type"))
return new ContentType(headerValue);
i++;
moreHeaders = headerName != null || headerValue != null;
}
while (moreHeaders);
return null;
}
private static Charset getCharset(ContentType contentType) {
if (contentType != null && contentType.charsetName != null && Charset.isSupported(contentType.charsetName))
return Charset.forName(contentType.charsetName);
else
return null;
}
/**
* Class holds the content type and charset (if present)
*/
private static final class ContentType {
private static final Pattern CHARSET_HEADER = Pattern.compile("charset=([-_a-zA-Z0-9]+)", Pattern.CASE_INSENSITIVE|Pattern.DOTALL);
private String contentType;
private String charsetName;
private ContentType(String headerValue) {
if (headerValue == null)
throw new IllegalArgumentException("ContentType must be constructed with a not-null headerValue");
int n = headerValue.indexOf(";");
if (n != -1) {
contentType = headerValue.substring(0, n);
Matcher matcher = CHARSET_HEADER.matcher(headerValue);
if (matcher.find())
charsetName = matcher.group(1);
}
else
contentType = headerValue;
}
}
public class readfile{
private Scanner x;
public void openFile(){
try{
x = new Scanner(new File("urls.txt"));
}
catch(Exception e){
System.out.println("could not find file");
}
}
public void readFile() throws IOException{
while(x.hasNext()){
//String title = TitleExtractor.getPageTitle(x);
String a = x.next();
String title = TitleExtractor.getPageTitle(a);
System.out.printf("%s\n",title);
}
}
public void closeFile(){
x.close();
}
}
public static void main(String[] args) throws IOException{
readfile r = new readfile();
r.openFile();
r.readFile();
r.closeFile();
}
}
I get a compilation error on the readfile r = new readfile(); bit:
"No enclosing instance of type TitleExtractor is accessible. Must qualify the allocation with an enclosing instance of the type TitleExtractor"
I'm not quite sure where the code is getting confused between scanners and string variables.