More than 5 years have passed since last update.

【A Tour of Go】Concurrency/Exercise: Web Crawler (10/11) 途中

Posted at 2018-06-11

問題

この演習では、ウェブクローラ( web crawler )を並列化するため、Goの並行性の特徴を使います。
同じURLを2度取ってくることなく並列してURLを取ってくるように、 Crawl 関数を修正してみてください(注1)。
補足: 工夫すれば Crawl 関数のみの修正で実装できますが、無理に Crawl 関数内部に収める必要はありません。
ひとこと: mapにフェッチしたURLのキャッシュを保持できますが、mapだけでは並行実行時の安全性はありません!

第一段階(同じURLを2度取らない)

並列化は、あとにして「同じURLを2度取ってくることなく」の対応をする。
出力結果が以下のようになったので、問題ないと思われる

found: https://golang.org/ "The Go Programming Language"
found: https://golang.org/pkg/ "Packages"
not found: https://golang.org/cmd/
found: https://golang.org/pkg/fmt/ "Package fmt"
found: https://golang.org/pkg/os/ "Package os"

---------------------

* ★： 追加
* ■： 変更

```exercise-web-crawler.go
package main

import (
	"fmt"
	"sync" // ★
)

type Fetcher interface {
	// Fetch returns the body of URL and
	// a slice of URLs found on that page.
	Fetch(url string) (body string, urls []string, err error)
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
	// TODO: Fetch URLs in parallel.
	// TODO: Don't fetch the same URL twice.
	map_url.mux.Lock()      // ★
	_, ok := map_url.v[url] // ★
	map_url.v[url]++        // ★
	map_url.mux.Unlock()    // ★
	if ok {                 // ★
		return // ★
	} // ★

	// This implementation doesn't do either:
	if depth <= 0 {
		return
	}
	body, urls, err := fetcher.Fetch(url)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Printf("found: %s %q\n", url, body)
	for _, u := range urls {
		Crawl(u, depth-1, fetcher)
	}
	return
}

func main() {
	map_url.v = make(map[string]int) // ★
	Crawl("https://golang.org/", 4, fetcher)
}

//
type SafeCounter struct { // ★
	v   map[string]int // ★
	mux sync.Mutex     // ★
}                       // ★
var map_url SafeCounter // ★

// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
	body string
	urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	if res, ok := f[url]; ok {
		return res.body, res.urls, nil
	}
	return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
	"https://golang.org/": &fakeResult{
		"The Go Programming Language",
		[]string{
			"https://golang.org/pkg/",
			"https://golang.org/cmd/",
		},
	},
	"https://golang.org/pkg/": &fakeResult{
		"Packages",
		[]string{
			"https://golang.org/",
			"https://golang.org/cmd/",
			"https://golang.org/pkg/fmt/",
			"https://golang.org/pkg/os/",
		},
	},
	"https://golang.org/pkg/fmt/": &fakeResult{
		"Package fmt",
		[]string{
			"https://golang.org/",
			"https://golang.org/pkg/",
		},
	},
	"https://golang.org/pkg/os/": &fakeResult{
		"Package os",
		[]string{
			"https://golang.org/",
			"https://golang.org/pkg/",
		},
	},
}

第二段階(Channelを使って結果を出力)

main で Channel をつくる
Crawl に引数(ch chan string)を追加
結果の出力先を Channel に変更
有効な ch を出力するロジック
- 当初は次のようにかいていたが、<-chすると len(ch)が減るので、3件分しか出力できていなかった。
- len(ch)は有効な channel の数になるので、 st をチェックする必要もない

for i := 0; i < len(ch); i++ { // ★2
	v, st := <-ch // ★2
	if st {       // ★2
		fmt.Print(v) // ★2
	} // ★2
} // ★2




---------------------
* ★2： 追加
* ■2： 変更

```exercise-web-crawler.go
package main

import (
	"fmt"
	"sync" // ★
)

type Fetcher interface {
	// Fetch returns the body of URL and
	// a slice of URLs found on that page.
	Fetch(url string) (body string, urls []string, err error)
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, ch chan string) { // ■2
	// TODO: Fetch URLs in parallel.
	// TODO: Don't fetch the same URL twice.
	map_url.mux.Lock()      // ★
	_, ok := map_url.v[url] // ★
	map_url.v[url]++        // ★
	map_url.mux.Unlock()    // ★
	if ok {                 // ★
		return // ★
	} // ★

	// This implementation doesn't do either:
	if depth <= 0 {
		return
	}
	body, urls, err := fetcher.Fetch(url)
	if err != nil {
		ch <- fmt.Sprintln(err) // ■2
		return
	}
	ch <- fmt.Sprintf("found: %s %q\n", url, body) // ■2
	for _, u := range urls {
		Crawl(u, depth-1, fetcher, ch)
	}
	return
}

func main() {
	map_url.v = make(map[string]int)             // ★
	ch := make(chan string, 20)                  // ★2
	Crawl("https://golang.org/", 4, fetcher, ch) // ■2

	for len(ch) > 0 { // ★2
		v, st := <-ch // ★2
		if st {       // ★2
			fmt.Print(v) // ★2
		} // ★2
	} // ★2
}

//
type SafeCounter struct { // ★
	v   map[string]int // ★
	mux sync.Mutex     // ★
}                       // ★
var map_url SafeCounter // ★

// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
	body string
	urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	if res, ok := f[url]; ok {
		return res.body, res.urls, nil
	}
	return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
// 長くなるので略
}

最終形(並列化)

go Crawl(...) した上で、処理終了の待ち合わせをしなければならない

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up