More than 5 years have passed since last update.

A Tour of Go の Exercise: Web Crawler を書いてみた

Last updated at 2020-03-20Posted at 2020-03-20

A Tour of Go

面白法人カヤックでGo書いてるギリ新卒のkoyoです！
次の新卒が入ってくる時期になってきましたね。新卒でGo書く人は A Tour of Go やる時期なのかなーと思います。
Goを覚えるにはとてもいい題材なのですが、Exerciseが難しい笑（なので新卒の皆さんは全部解けなくてもいいと思います。私も去年は解けなかった）
特に最後の Exercise: Web Crawler が難しいですよね。1年仕事で書いてると、理解も深まるもので、苦戦しましたが良さそうなコードを書くことができたので、記事を書いてみました！

コード

package main

import (
	"fmt"
	"sync"
)

type Fetcher interface {
	Fetch(url string) (body string, urls []string, err error)
}

type Crawler struct {
	cache *sync.Map
}

func NewCrawler() *Crawler {
	return &Crawler{
		cache: &sync.Map{},
	}
}

func (c *Crawler) Crawl(url string, depth int, fetcher Fetcher) {
	wg := &sync.WaitGroup{}
	c.crawl(url, depth, wg)
	wg.Wait()

	return
}

func (c *Crawler) crawl(url string, depth int, wg *sync.WaitGroup) {
	if depth <= 0 {
		return
	}

	if _, ok := c.cache.Load(url); ok {
		return
	}
	c.cache.Store(url, struct{}{})

	body, urls, err := fetcher.Fetch(url)
	if err != nil {
		fmt.Println(err)
		return
	}

	fmt.Printf("found: %s %q\n", url, body)
	wg.Add(len(urls))
	for _, u := range urls {
		go func(u string) {
			c.crawl(u, depth-1, wg)
			wg.Done()
		}(u)
	}
}

func main() {
	NewCrawler().Crawl("https://golang.org/", 4, fetcher)
}

type fakeFetcher map[string]*fakeResult

type fakeResult struct {
	body string
	urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	if res, ok := f[url]; ok {
		return res.body, res.urls, nil
	}
	return "", nil, fmt.Errorf("not found: %s", url)
}

var fetcher = fakeFetcher{
	"https://golang.org/": &fakeResult{
		"The Go Programming Language",
		[]string{
			"https://golang.org/pkg/",
			"https://golang.org/cmd/",
		},
	},
	"https://golang.org/pkg/": &fakeResult{
		"Packages",
		[]string{
			"https://golang.org/",
			"https://golang.org/cmd/",
			"https://golang.org/pkg/fmt/",
			"https://golang.org/pkg/os/",
		},
	},
	"https://golang.org/pkg/fmt/": &fakeResult{
		"Package fmt",
		[]string{
			"https://golang.org/",
			"https://golang.org/pkg/",
		},
	},
	"https://golang.org/pkg/os/": &fakeResult{
		"Package os",
		[]string{
			"https://golang.org/",
			"https://golang.org/pkg/",
		},
	},
}

実行結果

found: https://golang.org/ "The Go Programming Language"
not found: https://golang.org/cmd/
found: https://golang.org/pkg/ "Packages"
found: https://golang.org/pkg/os/ "Package os"
found: https://golang.org/pkg/fmt/ "Package fmt"

Program exited.

簡単な解説

sync.WaitGroup を使って Crawler を定義する方法でやってみました。
channelを使うやり方や、クロージャで済ませる方法もあるかなと思いますね。ただクロージャだと可読性と拡張性が低くなるので、struct定義しちゃう方がいいかなーと思う派ですね。

実装のポイントは sync.Map{} を使うことです。 sync.Mutex と map[string]bool を組み合わせるやり方もあると思うんですが、これを使うと一つだけで済むのでラクですね。
簡単なキャッシュを実装したい場合に使えるのでオススメです！

厳密に考えるとするとエラー返しているので sync.WaitGroup ではなく errgroup.Group 使う方がいいかもですが、実装例の一つとして参考になればと思います。

おまけ: クロージャで解いてみた

package main

import (
	"fmt"
	"sync"
)

type Fetcher interface {
	Fetch(url string) (body string, urls []string, err error)
}

func Crawl(url string, depth int, fetcher Fetcher) {
	var (
		cache = &sync.Map{}
		wg    = &sync.WaitGroup{}
		crawl func(url string, depth int)
	)

	crawl = func(url string, depth int) {
		if depth <= 0 {
			return
		}

		if _, ok := cache.Load(url); ok {
			return
		}
		cache.Store(url, struct{}{})

		body, urls, err := fetcher.Fetch(url)
		if err != nil {
			fmt.Println(err)
			return
		}

		fmt.Printf("found: %s %q\n", url, body)
		wg.Add(len(urls))
		for _, u := range urls {
			go func(u string) {
				crawl(u, depth-1)
				wg.Done()
			}(u)
		}
	}

	crawl(url, depth)
	wg.Wait()
	return
}

func main() {
	Crawl("https://golang.org/", 4, fetcher)
}

type fakeFetcher map[string]*fakeResult

type fakeResult struct {
	body string
	urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	if res, ok := f[url]; ok {
		return res.body, res.urls, nil
	}
	return "", nil, fmt.Errorf("not found: %s", url)
}

var fetcher = fakeFetcher{
	"https://golang.org/": &fakeResult{
		"The Go Programming Language",
		[]string{
			"https://golang.org/pkg/",
			"https://golang.org/cmd/",
		},
	},
	"https://golang.org/pkg/": &fakeResult{
		"Packages",
		[]string{
			"https://golang.org/",
			"https://golang.org/cmd/",
			"https://golang.org/pkg/fmt/",
			"https://golang.org/pkg/os/",
		},
	},
	"https://golang.org/pkg/fmt/": &fakeResult{
		"Package fmt",
		[]string{
			"https://golang.org/",
			"https://golang.org/pkg/",
		},
	},
	"https://golang.org/pkg/os/": &fakeResult{
		"Package os",
		[]string{
			"https://golang.org/",
			"https://golang.org/pkg/",
		},
	},
}

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up