package main
import (
"fmt"
"strings"
"sync/atomic"
"time"
"github.com/gocolly/colly/v2"
"github.com/gocolly/colly/v2/queue"
)
func main() {
c := colly.NewCollector(
)
c.SetRequestTimeout(time.Minute * 5)
queue, _ := queue.New(8, &queue.InMemoryQueueStorage{MaxSize: 1000})
var (
visited int64
cards []map[string]string
)
c.OnHTML("a.css-rc5s2u", func(e *colly.HTMLElement) {
e.Request.Visit(e.Attr("href"))
})
c.OnError(func(r *colly.Response, err error) {
fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
})
c.OnHTML("ul.css-sfcl1s", func(e *colly.HTMLElement) {
atomic.AddInt64(&visited, 1)
card := make(map[string]string)
e.ForEach("p.css-b5m1rv", func(_ int, elem *colly.HTMLElement) {
text := strings.Split(elem.Text, ":")
if len(text) > 1 {
card[text[0]] = text[1]
} else {
card["type"] = text[0]
}
})
cards = append(cards, card)
})
c.OnRequest(func(r *colly.Request) {
fmt.Println("Visiting", r.URL)
})
const (
baseURL = "some_url"
maxPage = 5
)
for p := 1; p <= maxPage; p++ {
urlPath := fmt.Sprintf("%s&page=%d", baseURL, p)
queue.AddURL(urlPath)
}
queue.Run(c)
fmt.Println(visited)
fmt.Println(len(cards))
}
package main
import (
"fmt"
"github.com/gocolly/colly/v2"
"strings"
"sync/atomic"
"time"
)
func main() {
c := colly.NewCollector(
colly.Async(true),
)
c.SetRequestTimeout(5 * time.Minute)
var (
visited int64
cards []map[string]string
)
c.OnHTML("a.css-rc5s2u", func(e *colly.HTMLElement) {
e.Request.Visit(e.Attr("href"))
})
c.OnHTML("ul.css-sfcl1s", func(e *colly.HTMLElement) {
atomic.AddInt64(&visited, 1)
card := make(map[string]string)
e.ForEach("li>p.css-b5m1rv", func(_ int, elem *colly.HTMLElement) {
text := strings.Split(elem.Text, ":")
if len(text) > 1 {
card[text[0]] = text[1]
} else {
card["type"] = text[0]
}
})
cards = append(cards, card)
})
c.OnRequest(func(r *colly.Request) {
fmt.Println("Visiting", r.URL)
})
const (
baseURL = "some_url"
maxPage = 5
)
for p := 1; p <= maxPage; p++ {
urlPath := fmt.Sprintf("%s&page=%d", baseURL, p)
c.Visit(urlPath)
}
c.Wait()
fmt.Println(visited)
fmt.Println(len(cards))
}
I am using gocolly for web scraping and I don't understand why when using the async mode or a queue, I get inconsistent results for visited
. I get the value slightly smaller than the expected value as if it didn't have enough time to process all URLs.
Running all URLs consecutively yields the correct result. I have tried playing around with RequestTimeout, but without any success.
Maybe, there is something I am missing in the code that results in inconsistent behavior. For smaller values of pages
, the results are most often correct.
My guess is that URLs occasionally return an error, but the library doesn't log anything unusual