Why does using async mode/queue when parsing with gocolly yield inconsistent results?

134 Views
package main

import (
    "fmt"
    "strings"
    "sync/atomic"
    "time"

    "github.com/gocolly/colly/v2"
    "github.com/gocolly/colly/v2/queue"
)

func main() {
    c := colly.NewCollector(

    )

    c.SetRequestTimeout(time.Minute * 5)

    queue, _ := queue.New(8, &queue.InMemoryQueueStorage{MaxSize: 1000})

    var (
        visited int64
        cards   []map[string]string
    )

    c.OnHTML("a.css-rc5s2u", func(e *colly.HTMLElement) {
        e.Request.Visit(e.Attr("href"))
    })

    c.OnError(func(r *colly.Response, err error) {
        fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
    })

    c.OnHTML("ul.css-sfcl1s", func(e *colly.HTMLElement) {
        atomic.AddInt64(&visited, 1)
        card := make(map[string]string)
        e.ForEach("p.css-b5m1rv", func(_ int, elem *colly.HTMLElement) {
            text := strings.Split(elem.Text, ":")
            if len(text) > 1 {
                card[text[0]] = text[1]
            } else {
                card["type"] = text[0]
            }
        })
        cards = append(cards, card)

    })

    c.OnRequest(func(r *colly.Request) {
        fmt.Println("Visiting", r.URL)
    })

    const (
        baseURL = "some_url"
        maxPage = 5
    )

    for p := 1; p <= maxPage; p++ {
        urlPath := fmt.Sprintf("%s&page=%d", baseURL, p)
        queue.AddURL(urlPath)
    }

    queue.Run(c)

    fmt.Println(visited)
    fmt.Println(len(cards))
}

package main

import (
    "fmt"
    "github.com/gocolly/colly/v2"
    "strings"
    "sync/atomic"
    "time"
)

func main() {
    c := colly.NewCollector(
        colly.Async(true),

    )

    c.SetRequestTimeout(5 * time.Minute)

    var (
        visited int64
        cards   []map[string]string
    )

    c.OnHTML("a.css-rc5s2u", func(e *colly.HTMLElement) {
        e.Request.Visit(e.Attr("href"))
    })

    c.OnHTML("ul.css-sfcl1s", func(e *colly.HTMLElement) {
        atomic.AddInt64(&visited, 1)
        card := make(map[string]string)
        e.ForEach("li>p.css-b5m1rv", func(_ int, elem *colly.HTMLElement) {
            text := strings.Split(elem.Text, ":")
            if len(text) > 1 {
                card[text[0]] = text[1]
            } else {
                card["type"] = text[0]
            }
        })
        cards = append(cards, card)

    })

    c.OnRequest(func(r *colly.Request) {
        fmt.Println("Visiting", r.URL)
    })

    const (
        baseURL = "some_url"
        maxPage = 5
    )

    for p := 1; p <= maxPage; p++ {
        urlPath := fmt.Sprintf("%s&page=%d", baseURL, p)
        c.Visit(urlPath)
    }

    c.Wait()

    fmt.Println(visited)
    fmt.Println(len(cards))
}

I am using gocolly for web scraping and I don't understand why when using the async mode or a queue, I get inconsistent results for visited. I get the value slightly smaller than the expected value as if it didn't have enough time to process all URLs.

Running all URLs consecutively yields the correct result. I have tried playing around with RequestTimeout, but without any success.

Maybe, there is something I am missing in the code that results in inconsistent behavior. For smaller values of pages, the results are most often correct.

My guess is that URLs occasionally return an error, but the library doesn't log anything unusual.

0

There are 0 best solutions below