
I've written code to crawl a website. Two questions:

  1. The crawl is supposed to recurse over all the links from the same domain, but it stops before retrieving all of them. I suspect something is wrong in the `loop` function.

  2. The test code mocking a function fails. Similar code mocking a `foo` function works for me, but this one doesn't.

import java.net.URL

import org.jsoup.Jsoup
import org.jsoup.nodes.Document

import scala.collection.JavaConverters._
import scala.util.Try

class Crawler {

  val mainURL = "http://www.eldiario.es"


  // Fetch a page and return the links on it that point to the same host.
  def getLinksPage(urlToCrawl: String): List[String] = {

    val connURL: Try[Document] = Try(Jsoup.connect(urlToCrawl).get())

    def links(doc: Document): Try[List[String]] = Try {
      val elements = doc.select("a[href]").asScala
      val links = elements.map(_.attr("abs:href")).toSeq
      val linksURL = links.map(new URL(_))

      // keep only links whose host matches the page being crawled
      val targetURL = new URL(urlToCrawl).getHost
      linksURL.filter(_.getHost == targetURL).map(_.toString).toList
    }

    val getListLinks: Try[List[String]] = for {
      a <- connURL
      b <- links(a)
    } yield b

    // on any failure, fall back to a list containing just the requested URL
    val pageLinks: List[String] = getListLinks.getOrElse(List(urlToCrawl))
    println(pageLinks)
    pageLinks
  }


  def loop(ls: List[String], acc: List[String]): List[String] = ls match {
    case Nil => acc
    case hd :: tl =>
      if (!acc.contains(hd)) loop(getLinksPage(hd), hd :: acc)
      else loop(tl, acc)
  }

  def getAllLinkPages(mainURL: String): List[String] = loop(getLinksPage(mainURL), List(mainURL))

}
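
One thing I noticed while debugging question 1: the recursive call `loop(getLinksPage(hd), hd :: acc)` replaces the whole work list, so the links still waiting in `tl` are dropped and the crawl stops early. If that is the cause, a sketch of a fix (everything else in `Crawler` unchanged) would keep the tail in the frontier:

  def loop(ls: List[String], acc: List[String]): List[String] = ls match {
    case Nil => acc
    case hd :: tl =>
      // prepend the newly found links instead of replacing the frontier,
      // so the links still waiting in `tl` get visited too
      if (!acc.contains(hd)) loop(getLinksPage(hd) ::: tl, hd :: acc)
      else loop(tl, acc)
  }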

import org.scalamock.scalatest.MockFactory
import org.scalatest.{Matchers, WordSpec}

class CrawlerSpec extends WordSpec with Matchers with MockFactory {

  trait LinksFixture {

    val getLinksPage = stubFunction[String, List[String]]

    lazy val crawlerMock = new Crawler() {
      override def getLinksPage(urlToCrawl: String) = LinksFixture.this.getLinksPage(urlToCrawl)
    }
  }

  "getLinksPage" should {
    "return the links" in new LinksFixture {

      getLinksPage when "http://example.com" returns List("http://example.com", "http://example.com/a", "http://example.com/b")

      crawlerMock.getLinksPage("http://example.com") shouldBe List("http://example.com", "http://example.com/a", "http://example.com/b")
    }
  }
}

    [info] CrawlerSpec:
    [info] getLinksPage
    [info] - should return the links *** FAILED ***
    [info]   scala.MatchError: null
    [info]   at rbs.Crawler.loop(Crawler.scala:43)
    [info]   at rbs.Crawler.getAllLinkPages(Crawler.scala:47)
    [info]   at rbs.Crawler.<init>(Crawler.scala:49)
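
If I'm reading the trace right, `getAllLinkPages(mainURL)` runs while the `Crawler` instance is being constructed (note the `<init>` frame), and a `stubFunction` returns null for any argument it wasn't told about, so `loop` ends up pattern matching on null. Assuming that construction-time call is the culprit, a sketch that also stubs the main URL inside the test body:

      // stub the call made with mainURL during construction as well,
      // so the stub never hands a null list to loop's pattern match
      getLinksPage when "http://www.eldiario.es" returns List("http://www.eldiario.es")
      getLinksPage when "http://example.com" returns List("http://example.com", "http://example.com/a", "http://example.com/b")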

Edit 2: Using `mockFunction` instead of `stubFunction`, the test fails with:

[info] CrawlerSpec:
[info] getLinksPage
[info] - should return the links *** FAILED ***
[info]   Unexpected call: MockFunction1-1(http://www.eldiario.es)
[info]
[info]   Expected:
[info]   inAnyOrder {
[info]     MockFunction1-1(http://example.com) once (never called - UNSATISFIED)
[info]   }
[info]
[info]   Actual:
[info]     MockFunction1-1(http://www.eldiario.es) (Option.scala:121)
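
This failure points the same way: the only call recorded is with http://www.eldiario.es, i.e. the crawl triggered during construction, and a `mockFunction` rejects any call it doesn't expect. Assuming that eager call, a sketch of expectations covering both URLs:

      val getLinksPage = mockFunction[String, List[String]]

      // expect the call triggered while the Crawler is constructed...
      getLinksPage expects "http://www.eldiario.es" returning List("http://www.eldiario.es")
      // ...as well as the call made by the test itself
      getLinksPage expects "http://example.com" returning List("http://example.com", "http://example.com/a", "http://example.com/b")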
  • To answer, the code for the Crawler would be necessary. Looks like there's a pattern match in there that is being passed a null for some reason. – Brian Smith Oct 25 '17 at 21:30
  • I've added more code and details. – rodbs Oct 25 '17 at 22:42
  • I suspect your definition of `getLinksPage` doesn't capture all possible arguments at the moment. Try using a `mockFunction` instead, which fails on invalid arguments instead of returning a null. Stubs are tolerant to inputs and will return null for all unknown / not modelled invocations. – Philipp Oct 26 '17 at 09:10
  • I've also got errors using the mockFunction. – rodbs Oct 26 '17 at 12:08
