// WebSpider holds the state for one scrape: the page to fetch, the text to
// look for, and the parsed page once it has been loaded.
type WebSpider struct {
	// url is the address of the page to fetch; keyword is the exact text an
	// element must contain to be selected.
	url, keyword string
	// document is the parsed page. It is nil until Run assigns it.
	document *goquery.Document
}
// NewWebSpider returns a WebSpider configured with the given page url and
// keyword. The document field is left nil.
func NewWebSpider(url string, keyword string) *WebSpider {
	spider := new(WebSpider)
	spider.url = url
	spider.keyword = keyword
	return spider
}
// Run fetches the page at ws.url, locates the element whose own text equals
// ws.keyword, and prints the text of every element matched by the descendant
// selector built from that element's tag path.
//
// Run has no error return, so a failure to fetch or parse the page is logged
// and ends the run without printing anything.
func (ws *WebSpider) Run() {
	document, err := ws.buildDocument()
	if err != nil {
		log.Printf("web spider: %v", err)
		return
	}
	ws.document = document

	route, ok := ws.findRoutesFromKeyWord()[ws.keyword]
	if !ok {
		// The keyword does not occur on the page, so there is nothing to print.
		return
	}
	ws.document.Find(strings.Join(route, " ")).Each(func(_ int, s *goquery.Selection) {
		fmt.Println(s.Text())
	})
}

// buildDocument downloads ws.url and parses the response body as HTML.
//
// It returns a nil document and a non-nil error when the request fails, the
// server answers with a non-2xx status, or the body cannot be parsed. The
// error is returned rather than logged so that the caller decides how to
// report it.
func (ws *WebSpider) buildDocument() (*goquery.Document, error) {
	response, err := http.Get(ws.url)
	if err != nil {
		return nil, fmt.Errorf("fetching %q: %w", ws.url, err)
	}
	// Close the body on every return path so the connection is released.
	defer response.Body.Close()

	if response.StatusCode < http.StatusOK || response.StatusCode >= http.StatusMultipleChoices {
		return nil, fmt.Errorf("fetching %q: unexpected status %s", ws.url, response.Status)
	}

	document, err := goquery.NewDocumentFromReader(response.Body)
	if err != nil {
		return nil, fmt.Errorf("parsing %q: %w", ws.url, err)
	}
	return document, nil
}
// findRoutesFromKeyWord visits every element under <body> and, for each one
// whose own text (with the text of its child elements excluded) equals
// ws.keyword, stores the tag path to that element under that text. When
// several elements match, the last one visited overwrites the earlier ones.
//
// "Routes" is plural in anticipation of the keyword becoming a regular
// expression.
func (ws *WebSpider) findRoutesFromKeyWord() map[string][]string {
	found := map[string][]string{}
	ws.document.Find("body").Find("*").Each(func(_ int, element *goquery.Selection) {
		// Strip the child elements from a clone so the document itself is
		// left intact and only the element's direct text remains.
		clone := element.Clone()
		clone.Children().Remove()
		ownText := clone.Text()
		if ownText != ws.keyword {
			return
		}
		found[ownText] = ws.findRouteFromNode(element)
	})
	return found
}
// findRouteFromNode returns the tag names on the path from the <html> element
// down to node, outermost first (for example ["html", "body", "div", "a"]).
//
// The walk also stops once there are no more ancestors, so an empty selection
// yields an empty route, and a node that is not attached below an <html>
// element yields the path from its topmost ancestor, instead of looping
// forever.
func (ws *WebSpider) findRouteFromNode(node *goquery.Selection) []string {
	var route []string
	for current := node; current.Length() > 0; current = current.Parent() {
		name := goquery.NodeName(current)
		route = append(route, name)
		if name == "html" {
			break
		}
	}
	// The names were collected leaf-first; reverse in place to get root-first.
	for i, j := 0, len(route)-1; i < j; i, j = i+1, j-1 {
		route[i], route[j] = route[j], route[i]
	}
	return route
}
// GoquerySample runs a WebSpider against a fixed ja.wikipedia.org page with
// "日本国" as the keyword.
func GoquerySample() {
	const (
		pageURL = "https://ja.wikipedia.org/wiki/%E5%9B%BD%E3%81%AE%E4%B8%80%E8%A6%A7"
		keyword = "日本国"
	)
	NewWebSpider(pageURL, keyword).Run()
}
More than 3 years have passed since last update.
Register as a new user and use Qiita more conveniently
- You get articles that match your needs
- You can efficiently read back useful information
- You can use dark theme