Running the crawler
At long last we can try the application out for real. We're going to add a main
file inside the cmd folder. To test the application we define a simple
producer/consumer on top of the channel-based queue, which prints every fetched
link, using a tab to indent the children found at each level.
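The main file only depends on the small queue contract built in the earlier chapters. As a refresher, here is a minimal sketch of what we assume messaging.ChannelQueue looks like: NewChannelQueue and Consume mirror the calls made below, while Produce is a hypothetical name for the method the crawler side would invoke.

package messaging

// ChannelQueue is an in-process queue backed by a Go channel, standing in
// for an external broker such as RabbitMQ or AWS SQS.
type ChannelQueue struct {
	ch chan []byte
}

// NewChannelQueue returns a ready-to-use queue value.
func NewChannelQueue() ChannelQueue {
	return ChannelQueue{ch: make(chan []byte, 64)}
}

// Produce pushes a serialized event into the queue (hypothetical name,
// called by the crawler side).
func (q *ChannelQueue) Produce(payload []byte) error {
	q.ch <- payload
	return nil
}

// Consume forwards every queued payload into events until the queue is
// drained and closed.
func (q *ChannelQueue) Consume(events chan<- []byte) error {
	for payload := range q.ch {
		events <- payload
	}
	close(events)
	return nil
}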
cmd/webcrawler/main.go
package main

import (
	"encoding/json"
	"flag"
	"log"

	"webcrawler"
	"webcrawler/messaging"
)
const (
	// Default depth to crawl for each domain
	defaultDepth int = 16
	// Default number of concurrent goroutines to crawl
	defaultConcurrency int = 8
)
// printEvents is a simple ChannelQueue consumer: it prints results received
// from the crawler to stdout, simulating a decoupled process meant to handle
// incoming events from the crawler.
func printEvents(queue *messaging.ChannelQueue) {
	events := make(chan []byte)
	go func(ch <-chan []byte) {
		var res crawler.ParsedResult
		for e := range ch {
			if err := json.Unmarshal(e, &res); err == nil {
				log.Println(res.URL)
				for _, link := range res.Links {
					log.Println("\t", link)
				}
			}
		}
	}(events)
	if err := queue.Consume(events); err != nil {
		log.Fatal(err)
	}
}
// withMaxDepth is a simple constructor option to pass into the crawler.New
// function call to set the number of levels to crawl for each page.
func withMaxDepth(depth int) crawler.CrawlerOpt {
	return func(s *crawler.CrawlerSettings) {
		s.MaxDepth = depth
	}
}

// withConcurrency is a simple constructor option to pass into the crawler.New
// function call to set the concurrency level.
func withConcurrency(concurrency int) crawler.CrawlerOpt {
	return func(s *crawler.CrawlerSettings) {
		s.Concurrency = concurrency
	}
}
func main() {
	var (
		targetURL   string
		maxDepth    int
		concurrency int
	)
	flag.StringVar(&targetURL, "target", "", "URL to crawl")
	flag.IntVar(&maxDepth, "depth", defaultDepth, "Maximum depth of crawling")
	flag.IntVar(&concurrency, "concurrency", defaultConcurrency, "Number of concurrent goroutines to run")
	flag.Parse()

	// We create a ChannelQueue instance here; ideally it could be a
	// RabbitMQ/AWS SQS task queue.
	bus := messaging.NewChannelQueue()
	userAgent := "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"

	go printEvents(&bus)

	c := crawler.New(userAgent, &bus,
		withMaxDepth(maxDepth),
		withConcurrency(concurrency),
	)
	c.Crawl(targetURL)
}
We can finally run it against a real website:

go run cmd/webcrawler/main.go -target golang.org
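The consumer prints each crawled page followed by its children indented with a tab. The lines below are purely illustrative output (timestamps and links will obviously differ on a real run):

2024/01/15 10:32:01 https://golang.org
2024/01/15 10:32:01 	 https://golang.org/doc/
2024/01/15 10:32:01 	 https://golang.org/pkg/
2024/01/15 10:32:02 https://golang.org/doc/
2024/01/15 10:32:02 	 https://golang.org/doc/install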