1

Basics of this program; Runs a webcrawler based on PerentUrl and Keyword specified by the user in Controller (main). If the Keyword is found in the page text, the Url is then saved to an array list;

ArrayList UrlHits = new ArrayList();

Once the crawl is complete the program will call methods from the WriteFile class in the main to write a html file containing all the UrlHits.

    WriteFile f = new WriteFile();
    f.openfile(Search);
    f.StartHtml();
    f.addUrl(UrlHits);
    f.EndHtml();
    f.closeFile();

All but f.addUrl work correctly, creating a html file with the correct name and directory. But none of the strings from the ArrayList output to the file.

public static void main(String[] args) throws Exception {

    RobotstxtConfig robotstxtConfig2 = new RobotstxtConfig();

    String crawlStorageFolder = "/Users/Jake/Documents/sem 2/FYP/Crawler/TestData";
    int numberOfCrawlers = 1;

    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorageFolder);

    config.setMaxDepthOfCrawling(21);
    config.setMaxPagesToFetch(24);

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();

    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    Scanner perentUrl = new Scanner(System.in);
    System.out.println("Enter full perant Url... example. http://www.domain.co.uk/");
    String Url = perentUrl.nextLine();

    Scanner keyword = new Scanner(System.in);
    System.out.println("Enter search term... example. Pies");
    String Search = keyword.nextLine();

    System.out.println("Searching domain :" + Url);
    System.out.println("Keyword:" + Search);

    ArrayList<String> DomainsToInv = new ArrayList<String>();
    ArrayList<String> SearchTerms = new ArrayList<String>();
    ArrayList<String> UrlHits = new ArrayList<String>();

    DomainsToInv.add(Url);
    SearchTerms.add(Search);

    controller.addSeed(Url);

    controller.setCustomData(DomainsToInv);
    controller.setCustomData(SearchTerms);
    controller.start(Crawler.class, numberOfCrawlers);

    WriteFile f = new WriteFile();
    f.openfile(Search);
    f.StartHtml();
    f.addUrl(UrlHits);
    f.EndHtml();
    f.closeFile();
}

}

public class Crawler extends WebCrawler {

@Override
public void visit(Page page) {

    int docid = page.getWebURL().getDocid();
    String url = page.getWebURL().getURL();
    String domain = page.getWebURL().getDomain();
    String path = page.getWebURL().getPath();
    String subDomain = page.getWebURL().getSubDomain();
    String parentUrl = page.getWebURL().getParentUrl();
    String anchor = page.getWebURL().getAnchor();

    System.out.println("Docid: " + docid);
    System.out.println("URL: " + url);
    System.out.println("Domain: '" + domain + "'");
    System.out.println("Sub-domain: '" + subDomain + "'");
    System.out.println("Path: '" + path + "'");
    System.out.println("Parent page: " + parentUrl);
    System.out.println("Anchor text: " + anchor);

    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String text = htmlParseData.getText();
        String html = htmlParseData.getHtml();
        List<WebURL> links = htmlParseData.getOutgoingUrls();

        System.out.println("Text length: " + text.length());
        System.out.println("Html length: " + html.length());
        System.out.println("Number of outgoing links: " + links.size());
    }

    Header[] responseHeaders = page.getFetchResponseHeaders();
    if (responseHeaders != null) {
        System.out.println("Response headers:");
        for (Header header : responseHeaders) {
            System.out.println("\t" + header.getName() + ": " + header.getValue());
        }
    }
    System.out.println("=============");

    ArrayList<String> SearchTerms = (ArrayList<String>) this.getMyController().getCustomData();
    ArrayList<String> UrlHits = (ArrayList<String>) this.getMyController().getCustomData();

    for (String Keyword : SearchTerms) {

        System.out.println("Searching Keyword: " + Keyword);

        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();

        int KeywordCounter = 0;
        String pagetext = htmlParseData.getText();
        Pattern pattern = Pattern.compile(Keyword);
        Matcher match1 = pattern.matcher(pagetext);

        if (match1.find()) {
            while (match1.find()) {
                KeywordCounter++;
            }
            System.out.println("FOUND " + Keyword + " in page text. KeywordCount: " + KeywordCounter);

            UrlHits.add(url);
            for (int i = 0; i < UrlHits.size(); i++) {
                System.out.print(UrlHits.get(i) + "\n");

                System.out.println("=============");
            }

        } else {
            System.out.println("Keyword search was unsuccesful");

            System.out.println("=============");
        }

    }

}

}

public class WriteFile {

private Formatter x;

public void openfile(String keyword) {

    try {
        x = new Formatter(keyword + ".html");
    } catch (Exception e) {

        System.out.println("ERROR");
    }
}

public void StartHtml() {
    x.format("%s %n %s %n %s %n %s %n %s %n ", "<html>", "<head>", "</head>", "<body>", "<center>");
}

public void addUrl(ArrayList<String> UrlHits) {

    for (String list : UrlHits) {
        x.format("%s%s%s%s%s%n%s%n", "<a href=\"", list, "\" target=\"_blank\">", list, "</a>", "<br>");
    }
}

public void EndHtml() {
    x.format("%s %n %s %n %s %n", "</center>", "</body>", "</html>");
}

public void closeFile() {
    x.close();
}

}

Apologies for the class headers outside the code blocks its a little fiddly. I have tried a few different "for" statements to get the method to output the array list but it doesn't seem to be having it. The strings are being added to the array list as i can call them using a for loop in the main. But when i pass the array list to the method addUrl, it comes up with squat. is there an easier way to use arraylists using formatters and .format?

Thanks for you help

0 Answers0