
【A Tour of Go】Concurrency/Exercise: Web Crawler (10/11), work in progress


Problem

In this exercise you will use Go's concurrency features to parallelize a web crawler.
Modify the Crawl function to fetch URLs in parallel, without fetching the same URL twice (Note 1).
Note: with some ingenuity this can be done by modifying only the Crawl function, but there is no need to force everything into Crawl.
Hint: you can keep a cache of the fetched URLs in a map, but a map alone is not safe for concurrent use!
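The hint points at the mutex-guarded-map pattern from the earlier sync.Mutex chapter. A minimal sketch of that pattern (the type and method names here are illustrative, not taken from the solution below):

```go
package main

import (
	"fmt"
	"sync"
)

// fetchedCache records which URLs have been fetched. The mutex ensures
// concurrent goroutines never read and write the map at the same time.
type fetchedCache struct {
	mu   sync.Mutex
	seen map[string]bool
}

// visit marks url as seen and reports whether this was the first visit.
func (c *fetchedCache) visit(url string) bool {
	c.mu.Lock()
	defer c.mu.Unlock()
	if c.seen[url] {
		return false // already fetched; skip
	}
	c.seen[url] = true
	return true // first visit; caller should fetch
}

func main() {
	c := &fetchedCache{seen: make(map[string]bool)}
	fmt.Println(c.visit("https://golang.org/")) // true: first visit
	fmt.Println(c.visit("https://golang.org/")) // false: cached
}
```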

Stage 1 (do not fetch the same URL twice)

  • Put the parallelization off for later and first handle "without fetching the same URL twice".
  • The output came out as below, so the change looks correct.

found: https://golang.org/ "The Go Programming Language"
found: https://golang.org/pkg/ "Packages"
not found: https://golang.org/cmd/
found: https://golang.org/pkg/fmt/ "Package fmt"
found: https://golang.org/pkg/os/ "Package os"

---------------------

* ★: added
* ■: changed

```exercise-web-crawler.go
package main

import (
	"fmt"
	"sync" // ★
)

type Fetcher interface {
	// Fetch returns the body of URL and
	// a slice of URLs found on that page.
	Fetch(url string) (body string, urls []string, err error)
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
	// TODO: Fetch URLs in parallel.
	// TODO: Don't fetch the same URL twice.
	map_url.mux.Lock()      // ★
	_, ok := map_url.v[url] // ★
	map_url.v[url]++        // ★
	map_url.mux.Unlock()    // ★
	if ok {                 // ★
		return // ★
	} // ★

	// This implementation doesn't do either:
	if depth <= 0 {
		return
	}
	body, urls, err := fetcher.Fetch(url)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Printf("found: %s %q\n", url, body)
	for _, u := range urls {
		Crawl(u, depth-1, fetcher)
	}
	return
}

func main() {
	map_url.v = make(map[string]int) // ★
	Crawl("https://golang.org/", 4, fetcher)
}

// SafeCounter caches visited URLs; the mutex makes it safe for concurrent use.
type SafeCounter struct { // ★
	v   map[string]int // ★
	mux sync.Mutex     // ★
}                       // ★
var map_url SafeCounter // ★

// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
	body string
	urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	if res, ok := f[url]; ok {
		return res.body, res.urls, nil
	}
	return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
	"https://golang.org/": &fakeResult{
		"The Go Programming Language",
		[]string{
			"https://golang.org/pkg/",
			"https://golang.org/cmd/",
		},
	},
	"https://golang.org/pkg/": &fakeResult{
		"Packages",
		[]string{
			"https://golang.org/",
			"https://golang.org/cmd/",
			"https://golang.org/pkg/fmt/",
			"https://golang.org/pkg/os/",
		},
	},
	"https://golang.org/pkg/fmt/": &fakeResult{
		"Package fmt",
		[]string{
			"https://golang.org/",
			"https://golang.org/pkg/",
		},
	},
	"https://golang.org/pkg/os/": &fakeResult{
		"Package os",
		[]string{
			"https://golang.org/",
			"https://golang.org/pkg/",
		},
	},
}
```

Stage 2 (send the results through a channel)

  • Create a channel in main
  • Add a parameter (ch chan string) to Crawl
  • Change the output destination to the channel
  • Logic for printing the values held in the channel
    • I first wrote it as below, but each <-ch receive shrinks len(ch), so only 3 of the 5 items got printed (i counts up while len(ch) counts down, and the loop exits once they cross).
    • len(ch) is the number of values currently buffered in the channel, so there is no need to check st either
```go
for i := 0; i < len(ch); i++ { // ★2
	v, st := <-ch // ★2
	if st {       // ★2
		fmt.Print(v) // ★2
	} // ★2
} // ★2
```
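The fix, used in the listing below, is to make len(ch) > 0 itself the loop condition, so it is re-evaluated after every receive:

```go
for len(ch) > 0 { // re-checked after each receive
	v, st := <-ch
	if st {
		fmt.Print(v)
	}
}
```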



---------------------
* ★2: added
* ■2: changed

```exercise-web-crawler.go
package main

import (
	"fmt"
	"sync" // ★
)

type Fetcher interface {
	// Fetch returns the body of URL and
	// a slice of URLs found on that page.
	Fetch(url string) (body string, urls []string, err error)
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, ch chan string) { // ■2
	// TODO: Fetch URLs in parallel.
	// TODO: Don't fetch the same URL twice.
	map_url.mux.Lock()      // ★
	_, ok := map_url.v[url] // ★
	map_url.v[url]++        // ★
	map_url.mux.Unlock()    // ★
	if ok {                 // ★
		return // ★
	} // ★

	// This implementation doesn't do either:
	if depth <= 0 {
		return
	}
	body, urls, err := fetcher.Fetch(url)
	if err != nil {
		ch <- fmt.Sprintln(err) // ■2
		return
	}
	ch <- fmt.Sprintf("found: %s %q\n", url, body) // ■2
	for _, u := range urls {
		Crawl(u, depth-1, fetcher, ch)
	}
	return
}

func main() {
	map_url.v = make(map[string]int)             // ★
	ch := make(chan string, 20)                  // ★2
	Crawl("https://golang.org/", 4, fetcher, ch) // ■2

	for len(ch) > 0 { // ★2
		v, st := <-ch // ★2
		if st {       // ★2
			fmt.Print(v) // ★2
		} // ★2
	} // ★2
}

// SafeCounter caches visited URLs; the mutex makes it safe for concurrent use.
type SafeCounter struct { // ★
	v   map[string]int // ★
	mux sync.Mutex     // ★
}                       // ★
var map_url SafeCounter // ★

// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
	body string
	urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	if res, ok := f[url]; ok {
		return res.body, res.urls, nil
	}
	return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
// omitted for brevity (same as the Stage 1 listing)
}
```

Final form (parallelization)

  • After changing the calls to go Crawl(...), we have to wait for all of the crawls to finish before main exits (a sketch of one way to do this follows)
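The article stops here, so what follows is only a rough sketch of one possible final form, not the author's finished code. It uses sync.WaitGroup (not part of the code above) to wait for every goroutine; Fetcher, SafeCounter, map_url, and fetcher are assumed unchanged from the Stage 2 listing.

```go
package main

import (
	"fmt"
	"sync"
)

var wg sync.WaitGroup // counts Crawl goroutines still running

func Crawl(url string, depth int, fetcher Fetcher, ch chan string) {
	defer wg.Done() // signal completion even on early return

	map_url.mux.Lock()
	_, ok := map_url.v[url]
	map_url.v[url]++
	map_url.mux.Unlock()
	if ok {
		return
	}

	if depth <= 0 {
		return
	}
	body, urls, err := fetcher.Fetch(url)
	if err != nil {
		ch <- fmt.Sprintln(err)
		return
	}
	ch <- fmt.Sprintf("found: %s %q\n", url, body)
	for _, u := range urls {
		wg.Add(1)
		go Crawl(u, depth-1, fetcher, ch) // fetch children in parallel
	}
}

func main() {
	map_url.v = make(map[string]int)
	// The buffer must hold every message, since nothing drains ch
	// until wg.Wait() returns; 20 is plenty for this fake fetcher.
	ch := make(chan string, 20)
	wg.Add(1)
	go Crawl("https://golang.org/", 4, fetcher, ch)
	wg.Wait() // block until every Crawl has returned
	close(ch) // no senders remain, so the range below can terminate
	for v := range ch {
		fmt.Print(v)
	}
}
```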