1

I'm doing a golang scraper to get information from this site https://www.allrecipes.com/recipes/17562/dinner/

I want to get : Name, URL, Descriptions, Ingredients, Photos, Directions.

How can I use the links in the struct products URL to send the scraper to these pages to retrieve other data ?

Here is my code


package main

import (
    "encoding/json"
    "fmt"
    "net"
    "net/http"
    "os"
    "time"

    "github.com/gocolly/colly"
)

type products struct {
    recettes
    Name string `json:"name"`
    URL  string `json:"url"`
}

type recettes struct {
    Descriptions string `json:"descriptions"`
    Ingredients  string `json:"ingredients"`
    Photos       string `json:"photos"`
    Directions   string `json:"directions"`
}

var allProducts []products
var allRecettes []recettes

func main() {
    c := colly.NewCollector()
    c.WithTransport(&http.Transport{
        DialContext: (&net.Dialer{
            Timeout:   60 * time.Second,
            KeepAlive: 30 * time.Second,
            DualStack: true,
        }).DialContext,
        MaxIdleConns:          100,
        IdleConnTimeout:       90 * time.Second,
        TLSHandshakeTimeout:   10 * time.Second,
        ExpectContinueTimeout: 1 * time.Second,
    })

    c.OnRequest(func(r *colly.Request) {
        fmt.Println("Scraping:", r.URL)
    })

    c.OnResponse(func(r *colly.Response) {
        fmt.Println("Status:", r.StatusCode)
    })

    // OnHTML enregistre une fonction. La fonction sera exécutée sur chaque HTML élément correspondant au paramètre
    c.OnHTML("a.mntl-card", func(h *colly.HTMLElement) {
        products := products{
            URL:  h.Attr("href"),
            Name: h.ChildText(".card__title-text"),
        }
        c.Visit(products.URL)
        // fmt.Println(products)
        allProducts = append(allProducts, products)
    })

    c.OnHTML("article.fixed-recipe-card", func(h *colly.HTMLElement) {
        recettes := recettes{
            Descriptions: h.ChildText("p.article-subheading"),
            Photos:       h.ChildAttr("div.img-placeholder", "src"),
            Ingredients:  h.ChildText("div.mntl-structured-ingredients"),
            Directions:   h.ChildText("div.recipe__steps"),
        }
        fmt.Println(recettes)
        allRecettes = append(allRecettes, recettes)
    })

    c.OnError(func(r *colly.Response, err error) {
        fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "nError:", err)
    })

    c.Visit("https://www.allrecipes.com/recipes/17562/dinner/")

    content, err := json.Marshal(allProducts)
    if err != nil {
        fmt.Println(err.Error())
    }

    os.WriteFile("data.json", content, 0644)
    fmt.Println("Total produts: ", len(allProducts))
    fmt.Println("Total recettes: ", len(allRecettes))
}

How can I do to succeed?

maka
  • 39
  • 7
  • I'm not sure if i understand your problem correctly, but i'd say u simply need to use the Visit function and pass the URL from the objects. – Tobias Theel Apr 14 '23 at 04:58
  • Yes, that's my problem, I don't know how to get my scraper to go to the recipe pages. Where there is a c.Visit(products.URL). But then, I have a new problem that make the scraper go on ALL the links ^^ which gives an infinite loop – maka Apr 14 '23 at 06:28

0 Answers0