
I have a Go colly project that I use to crawl multiple links fetched from a database table, like below:

func main() {
    //db, err := sql.Open("postgres", "postgresql://postgres:postgres@localhost:5432/db?sslmode=disable")
    dbutil.Init()
    defer dbutil.Close()

    db := dbutil.GetDB()

    rows, err := db.Query("SELECT id, link FROM cities_table")
    if err != nil {
        log.Fatal(err)
    }
    defer rows.Close()

    var cities []City
    for rows.Next() {
        var city City
        if err := rows.Scan(&city.ID, &city.Link); err != nil {
            log.Fatal(err)
        }
        cities = append(cities, city)
    }

    for _, city := range cities {
        c := colly.NewCollector(
            colly.MaxDepth(1),
            colly.Async(true),
        )
        extensions.RandomUserAgent(c)
        c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 20})
        spider(c, db, city)
        baseURL := ThroughProxy(city)
        c.Visit(baseURL.String())
        c.Wait()
    }
}

In this example I have set Parallelism: 20, but because I am creating a new colly instance for each record, and because MaxDepth is 1, it is not actually crawling in parallel. Is there any way to run this crawler in parallel and get the results faster? Note: if I move this part out of the loop:

c := colly.NewCollector(
    colly.MaxDepth(1),
    colly.Async(true),
)

then MaxDepth is no longer 1 for each website; it is 1 across all websites combined.

Farshad

1 Answer


Looks like github.com/gocolly/colly/v2/queue is what you need.

Queue is a request queue which uses a Collector to consume requests in multiple threads

The pseudo code below is modified from the official queue demo:

package main

import (
    "fmt"

    "github.com/gocolly/colly/v2"
    "github.com/gocolly/colly/v2/queue"
)

func main() {
    c := colly.NewCollector(
        colly.MaxDepth(1),
    )

    // spider(c, db, city)
    // registers functions to be executed

    q, _ := queue.New(
        20, // number of consumer threads
        &queue.InMemoryQueueStorage{MaxSize: 10000}, // use the default in-memory queue storage
    )

    // rows, err := db.Query("SELECT id, link FROM cities_table")
    cities := []string{}

    for _, city := range cities {
        // extensions.RandomUserAgent(c)
        // call q.AddRequest to add a request with User-Agent header if you need it.

        // baseURL := ThroughProxy(city)
        baseURL := city
        if err := q.AddURL(baseURL); err != nil {
            fmt.Printf("failed to add URL: %s\n%v", baseURL, err)
        }
    }

    err := q.Run(c)
    if err != nil {
        fmt.Printf("failed to run: %s", err)
    }
}
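
The q.AddRequest comment above deserves a concrete sketch. This is my own assumption-heavy example, not part of the official demo: the helper name, URL, and User-Agent value are placeholders. It builds a *colly.Request by hand so the header travels with that one request rather than being set on the whole collector:

package main

import (
    "net/http"
    "net/url"

    "github.com/gocolly/colly/v2"
    "github.com/gocolly/colly/v2/queue"
)

// addURLWithUserAgent is a hypothetical helper: it constructs a *colly.Request
// manually and enqueues it with q.AddRequest, so the User-Agent header applies
// only to this single request.
func addURLWithUserAgent(q *queue.Queue, rawURL, userAgent string) error {
    u, err := url.Parse(rawURL)
    if err != nil {
        return err
    }
    hdr := http.Header{}
    hdr.Set("User-Agent", userAgent)
    return q.AddRequest(&colly.Request{
        URL:     u,
        Method:  "GET",
        Headers: &hdr,
    })
}

func main() {
    c := colly.NewCollector(colly.MaxDepth(1))
    q, _ := queue.New(20, &queue.InMemoryQueueStorage{MaxSize: 10000})

    // placeholder URL and User-Agent value
    _ = addURLWithUserAgent(q, "https://example.com/", "my-crawler/1.0")

    _ = q.Run(c)
}

Alternatively, since there is now a single shared collector, extensions.RandomUserAgent(c) from your original code registers an OnRequest hook on the collector, so calling it once before q.Run should still give each request a random User-Agent.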
Zeke Lu