1
package main

import (
    "fmt"
    "strings"
    "sync/atomic"
    "time"

    "github.com/gocolly/colly/v2"
    "github.com/gocolly/colly/v2/queue"
)

// main crawls the paginated listing at baseURL with a colly collector fed by
// an 8-thread in-memory queue. It follows every "a.css-rc5s2u" link, builds one
// key/value card per "ul.css-sfcl1s" element, and finally prints the number of
// matched lists followed by the number of collected cards.
func main() {
    c := colly.NewCollector()

    c.SetRequestTimeout(5 * time.Minute)

    // Named q so the variable does not shadow the imported queue package.
    q, err := queue.New(8, &queue.InMemoryQueueStorage{MaxSize: 1000})
    if err != nil {
        fmt.Println("Creating queue failed:", err)
        return
    }

    var (
        visited int64
        cards   []map[string]string
        // cardsLock is a one-slot channel used as a mutex. The queue drives
        // the collector from several consumer threads, so callbacks can run
        // concurrently and an unguarded append to cards is a data race.
        cardsLock = make(chan struct{}, 1)
    )

    c.OnHTML("a.css-rc5s2u", func(e *colly.HTMLElement) {
        href := e.Attr("href")
        // Report rejected links instead of dropping them silently; a link
        // that is never fetched is a page that is never counted.
        if err := e.Request.Visit(href); err != nil {
            fmt.Println("Skipping link", href, "-", err)
        }
    })

    c.OnError(func(r *colly.Response, err error) {
        fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
    })

    c.OnHTML("ul.css-sfcl1s", func(e *colly.HTMLElement) {
        atomic.AddInt64(&visited, 1)
        card := make(map[string]string)
        e.ForEach("p.css-b5m1rv", func(_ int, elem *colly.HTMLElement) {
            // Split on the first ":" only, so values that themselves contain
            // a colon (times, URLs, ...) are kept whole.
            parts := strings.SplitN(elem.Text, ":", 2)
            if len(parts) == 2 {
                card[parts[0]] = parts[1]
                return
            }
            // A paragraph without a "key:value" shape is stored as the type.
            card["type"] = parts[0]
        })

        cardsLock <- struct{}{}
        cards = append(cards, card)
        <-cardsLock
    })

    c.OnRequest(func(r *colly.Request) {
        fmt.Println("Visiting", r.URL)
    })

    const (
        baseURL = "some_url"
        maxPage = 5
    )

    for p := 1; p <= maxPage; p++ {
        urlPath := fmt.Sprintf("%s&page=%d", baseURL, p)
        if err := q.AddURL(urlPath); err != nil {
            fmt.Println("Queueing", urlPath, "failed:", err)
        }
    }

    if err := q.Run(c); err != nil {
        fmt.Println("Running queue failed:", err)
    }

    fmt.Println(atomic.LoadInt64(&visited))
    fmt.Println(len(cards))
}

package main

import (
    "fmt"
    "github.com/gocolly/colly/v2"
    "strings"
    "sync/atomic"
    "time"
)

// main crawls the paginated listing at baseURL with an asynchronous colly
// collector. It follows every "a.css-rc5s2u" link, builds one key/value card
// per "ul.css-sfcl1s" element, and finally prints the number of matched lists
// followed by the number of collected cards.
func main() {
    c := colly.NewCollector(
        colly.Async(true),
    )

    c.SetRequestTimeout(5 * time.Minute)

    // Cap concurrent requests; without a limit rule async mode fires every
    // request at once, which makes throttled or dropped responses more likely.
    if err := c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 8}); err != nil {
        fmt.Println("Setting limit rule failed:", err)
        return
    }

    var (
        visited int64
        cards   []map[string]string
        // cardsLock is a one-slot channel used as a mutex. In async mode the
        // callbacks run concurrently, so an unguarded append to cards is a
        // data race.
        cardsLock = make(chan struct{}, 1)
    )

    c.OnHTML("a.css-rc5s2u", func(e *colly.HTMLElement) {
        href := e.Attr("href")
        // Report rejected links instead of dropping them silently; a link
        // that is never fetched is a page that is never counted.
        if err := e.Request.Visit(href); err != nil {
            fmt.Println("Skipping link", href, "-", err)
        }
    })

    // Surface failed requests; otherwise a failed page simply never reaches
    // the OnHTML callbacks and the counters come up short with no explanation.
    c.OnError(func(r *colly.Response, err error) {
        fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
    })

    c.OnHTML("ul.css-sfcl1s", func(e *colly.HTMLElement) {
        atomic.AddInt64(&visited, 1)
        card := make(map[string]string)
        e.ForEach("li>p.css-b5m1rv", func(_ int, elem *colly.HTMLElement) {
            // Split on the first ":" only, so values that themselves contain
            // a colon (times, URLs, ...) are kept whole.
            parts := strings.SplitN(elem.Text, ":", 2)
            if len(parts) == 2 {
                card[parts[0]] = parts[1]
                return
            }
            // A paragraph without a "key:value" shape is stored as the type.
            card["type"] = parts[0]
        })

        cardsLock <- struct{}{}
        cards = append(cards, card)
        <-cardsLock
    })

    c.OnRequest(func(r *colly.Request) {
        fmt.Println("Visiting", r.URL)
    })

    const (
        baseURL = "some_url"
        maxPage = 5
    )

    for p := 1; p <= maxPage; p++ {
        urlPath := fmt.Sprintf("%s&page=%d", baseURL, p)
        if err := c.Visit(urlPath); err != nil {
            fmt.Println("Visiting", urlPath, "failed:", err)
        }
    }

    // Block until every in-flight asynchronous request has finished.
    c.Wait()

    fmt.Println(atomic.LoadInt64(&visited))
    fmt.Println(len(cards))
}

I am using gocolly for web scraping, and I don't understand why I get inconsistent results for visited when using async mode or a queue. The value is slightly smaller than expected, as if the program did not have enough time to process all URLs.

Running all URLs consecutively yields the correct result. I have tried playing around with RequestTimeout, but without any success.

Maybe there is something I am missing in the code that results in inconsistent behavior. For smaller numbers of pages, the results are most often correct.

My guess is that some URLs occasionally return an error, but the library doesn't log anything unusual.

Don Draper
  • 463
  • 7
  • 21
  • "inconsistent results for visited" please explain what you mean (sample output from your app demonstrating the issue would help). You have a [data race](https://go.dev/doc/articles/race_detector) on `cards` so I would not be surprised if the two values output differ. – Brits Aug 03 '23 at 00:53
  • For cards, the data race is there, you are right, but for `visited` the counter should be correct. I know how many items a page has and thus the total number after all pages have been processed and thus the correct counter value. However, it is most often off by a small number. For example, if the correct value is 300, the counter is 295 or 298. – Don Draper Aug 03 '23 at 06:30

0 Answers0