【Go言語】新聞記事を自動収集！Goで作る簡単なWebスクレイピングツール

Posted at 2024-12-17

はじめに

「Go言語を使って新聞記事を自動収集してみたい！」
そんなニーズに応えるべく、今回の記事では、複数の新聞サイトから本文をスクレイピングし、CSVに保存するツールを作成する方法を解説します。

以下の内容を中心に進めます：

Go言語でのWebスクレイピングの基本
複数の新聞サイトに対応した設定の作り方
記事データをCSVに保存する方法

初心者にもわかりやすいように、細かいコード解説を交えながら説明していきます！

実現したいこと

新聞サイトから記事情報をスクレイピングして、CSVに保存するツールを作ります。主な機能は以下のとおりです。

新聞サイトにアクセス
記事の本文と日付を取得
コード内の設定（configs）で複数サイトを切り替えられるようにする
-all オプションで全サイトを処理可能

プロジェクト構成

最初に、プロジェクト構成を確認しておきましょう。

newspaper_scraper/
│
├── main.go  // メインの処理
└── go.mod   // Goモジュール設定

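今回はすべて main.go に記述していきます。HTMLの解析には外部ライブラリの goquery を使うので、事前に go mod init でモジュールを作成し、go get github.com/PuerkitoBio/goquery を実行しておいてください。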

コード全体

まずは完成版のコードをご覧ください。

package main

import (
	"encoding/csv"
	"flag"
	"fmt"
	"net/http"
	"net/url"
	"os"
	"regexp"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
)

// NewspaperConfig holds the per-site settings the scraper needs: where the
// article index lives, which CSS selectors locate the links, body and date,
// how the date text is recognised and parsed, and where rows are written.
type NewspaperConfig struct {
	// BaseURL is the URL of the article index page.
	BaseURL string

	// ListSelector is the CSS selector matching the article links on the
	// index page.
	ListSelector string
	// DetailSelector is the CSS selector matching the body text elements on
	// an article page.
	DetailSelector string
	// DateSelector is the CSS selector matching the element that carries
	// the article date.
	DateSelector string

	// DateRegexp is the regular expression used to pull the date substring
	// out of the text found via DateSelector.
	DateRegexp string
	// DateFormat is the time.Parse layout applied to the substring matched
	// by DateRegexp.
	DateFormat string

	// CSVFile is the name of the CSV file rows are appended to.
	CSVFile string
}

// configs maps the newspaper name accepted by the -newspaper flag to the
// scraping settings for that site.
//
// DateFormat must be able to parse exactly what DateRegexp matches. The
// chugoku pattern matches month and day only (e.g. "12月7日"), so its layout
// must not contain a year element; a layout such as "2006年1月2日" can never
// parse that match. time.Parse reports year 0 for a layout without a year.
var configs = map[string]NewspaperConfig{
	"mainichi": {
		BaseURL:        "https://mainichi.jp/yoroku/",
		ListSelector:   "ul.articlelist li a",
		DetailSelector: "section#articledetail-body p",
		DateSelector:   "span.articletag-date",
		DateRegexp:     `(\d{4}/\d{1,2}/\d{1,2})`, // e.g. 2024/12/7
		DateFormat:     "2006/1/2",
		CSVFile:        "mainichi_articles.csv",
	},
	"chugoku": {
		BaseURL:        "https://www.chugoku-np.co.jp/feature/special/%E5%A4%A9%E9%A2%A8%E9%8C%B2",
		ListSelector:   "div.l-articles article.l-articles__item a",
		DetailSelector: "div.article-body p",
		DateSelector:   "div.m-header-info__date",
		DateRegexp:     `(\d{1,2})月(\d{1,2})日`, // e.g. 12月7日
		DateFormat:     "1月2日",
		CSVFile:        "chugoku_articles.csv",
	},
}

// main parses the command-line flags and dispatches to the scraper.
//
//	-all               process every newspaper in configs
//	-newspaper=<name>  process only the named newspaper
//
// -all takes precedence when both flags are given. With neither flag, or with
// an unknown newspaper name, a message is printed and nothing is scraped.
func main() {
	runAll := flag.Bool("all", false, "Run for all newspapers")
	target := flag.String("newspaper", "", "Specify a single newspaper to process")
	flag.Parse()

	switch {
	case *runAll:
		processAllNewspapers()
	case *target == "":
		fmt.Println("Please specify a newspaper or use -all to process all newspapers.")
	default:
		cfg, ok := configs[*target]
		if !ok {
			fmt.Printf("Invalid newspaper specified: %s\n", *target)
			return
		}
		processNewspaper(cfg)
	}
}

// processAllNewspapers runs processNewspaper once for every entry in configs,
// announcing each newspaper name before it is processed. configs is a map, so
// the order in which newspapers are visited is unspecified.
func processAllNewspapers() {
	for key := range configs {
		fmt.Printf("Processing newspaper: %s\n", key)
		processNewspaper(configs[key])
	}
}

// processNewspaper fetches the article index page at config.BaseURL, follows
// every link matched by config.ListSelector and passes each article URL to
// processArticle.
//
// Each href is resolved against the index page URL, so absolute
// ("https://…"), protocol-relative ("//host/…"), root-relative ("/path") and
// document-relative ("path") links all yield a valid absolute URL. Plain
// string concatenation with BaseURL only works for the last form.
//
// Failures are reported on standard output; an unusable link is skipped and
// the remaining links are still processed.
func processNewspaper(config NewspaperConfig) {
	base, err := url.Parse(config.BaseURL)
	if err != nil {
		fmt.Printf("Invalid base URL: %v\n", err)
		return
	}

	doc, err := fetchDocument(config.BaseURL)
	if err != nil {
		fmt.Printf("Failed to fetch URL: %v\n", err)
		return
	}

	// Walk the article links found on the index page.
	doc.Find(config.ListSelector).Each(func(_ int, link *goquery.Selection) {
		href, ok := link.Attr("href")
		if !ok {
			return
		}

		// (*url.URL).Parse resolves href relative to the receiver.
		target, err := base.Parse(href)
		if err != nil {
			fmt.Printf("Skipping invalid link %q: %v\n", href, err)
			return
		}

		processArticle(target.String(), config)
	})
}

// processArticle downloads a single article page, extracts its body text and
// date using the selectors in config, and appends one row of the form
// "year,month,day,text" to config.CSVFile.
//
// Any failure is printed to standard output. When the page cannot be fetched
// or its date cannot be extracted, nothing is written.
func processArticle(url string, config NewspaperConfig) {
	page, fetchErr := fetchDocument(url)
	if fetchErr != nil {
		fmt.Printf("Failed to fetch article URL: %v\n", fetchErr)
		return
	}

	body := extractText(page, config.DetailSelector)

	published, dateErr := extractDate(page, config)
	if dateErr != nil {
		fmt.Printf("Failed to extract date: %v\n", dateErr)
		return
	}

	// Build the row as a single comma-joined string.
	row := fmt.Sprintf(
		"%d,%d,%d,%s",
		published.Year(), int(published.Month()), published.Day(), body,
	)

	// Append the row to the newspaper's CSV file.
	if writeErr := writeToCSV(config.CSVFile, row); writeErr != nil {
		fmt.Printf("Failed to write to CSV: %v\n", writeErr)
	}
}

// httpClient is the client used for every page fetch. http.Get uses
// http.DefaultClient, which has no timeout, so one unresponsive server could
// block the scraper indefinitely; this client bounds each request.
var httpClient = &http.Client{Timeout: 30 * time.Second}

// fetchDocument issues a GET request for url and parses the response body as
// an HTML document.
//
// It returns an error when the request fails, when the server answers with
// any status other than 200 OK, or when the body cannot be parsed. The
// response body is always closed before returning.
func fetchDocument(url string) (*goquery.Document, error) {
	resp, err := httpClient.Get(url)
	if err != nil {
		// The client's error already names the method and URL.
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("fetching %s: HTTP %d", url, resp.StatusCode)
	}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("parsing %s: %w", url, err)
	}
	return doc, nil
}

// whitespaceRun matches one or more consecutive whitespace characters. It is
// compiled once at package scope instead of on every extractText call.
var whitespaceRun = regexp.MustCompile(`\s+`)

// extractText returns the text of every element in doc matched by selector.
// Each element's text is trimmed of surrounding whitespace, the pieces are
// concatenated with no separator, and every remaining run of whitespace is
// collapsed into a single space. It returns "" when nothing matches.
func extractText(doc *goquery.Document, selector string) string {
	var sb strings.Builder
	doc.Find(selector).Each(func(_ int, node *goquery.Selection) {
		sb.WriteString(strings.TrimSpace(node.Text()))
	})
	return whitespaceRun.ReplaceAllString(sb.String(), " ")
}

// extractDate reads the text of the elements matched by config.DateSelector,
// takes the first substring matching config.DateRegexp and parses it with the
// layout config.DateFormat.
//
// It returns an error when the pattern does not compile, when no date is
// found in the text, or when the matched substring does not parse. An invalid
// pattern is reported as an error rather than a panic, because the pattern
// comes from configuration.
//
// A layout with no year element makes time.Parse report year 0. In that case
// the current year is substituted; if that would put the date more than a day
// in the future, the previous year is used instead, on the assumption that a
// scraped article was not published in the future.
func extractDate(doc *goquery.Document, config NewspaperConfig) (time.Time, error) {
	re, err := regexp.Compile(config.DateRegexp)
	if err != nil {
		return time.Time{}, fmt.Errorf("compiling date pattern %q: %w", config.DateRegexp, err)
	}

	dateText := doc.Find(config.DateSelector).Text()
	matches := re.FindStringSubmatch(dateText)
	if len(matches) == 0 {
		return time.Time{}, fmt.Errorf("failed to match date in %q", dateText)
	}

	// matches[0] is the whole match, which is what the layout describes.
	parsed, err := time.Parse(config.DateFormat, matches[0])
	if err != nil {
		return time.Time{}, err
	}
	if parsed.Year() != 0 {
		return parsed, nil
	}

	// The layout carried no year: fill it in from the clock.
	now := time.Now()
	dated := time.Date(
		now.Year(), parsed.Month(), parsed.Day(),
		parsed.Hour(), parsed.Minute(), parsed.Second(), parsed.Nanosecond(),
		parsed.Location(),
	)
	// The one-day slack absorbs time-zone differences between the site and
	// the local clock.
	if dated.After(now.AddDate(0, 0, 1)) {
		dated = dated.AddDate(-1, 0, 0)
	}
	return dated, nil
}

// csvColumns is the number of columns in one output row: year, month, day
// and the article text.
const csvColumns = 4

// writeToCSV appends one record to the CSV file named filename, creating the
// file if it does not exist.
//
// text is a comma-joined row in the form "year,month,day,body", as built by
// processArticle. It is split into at most csvColumns fields, so commas
// inside the article body stay in the last column instead of spilling into
// extra columns; the CSV writer quotes that field as needed.
//
// The writer is flushed explicitly and its error is returned, because
// csv.Writer buffers output and reports write failures only through Error
// after Flush. A failure to close the file is reported as well, unless an
// earlier error already occurred.
func writeToCSV(filename, text string) (err error) {
	file, err := os.OpenFile(filename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
	if err != nil {
		return err
	}
	defer func() {
		if cerr := file.Close(); cerr != nil && err == nil {
			err = cerr
		}
	}()

	writer := csv.NewWriter(file)
	if err = writer.Write(strings.SplitN(text, ",", csvColumns)); err != nil {
		return err
	}
	writer.Flush()
	return writer.Error()
}

コード解説

設定情報の定義

// configs maps a newspaper name to the scraping settings for that site.
// Only the "mainichi" entry is shown in this excerpt.
var configs = map[string]NewspaperConfig{
	"mainichi": {
		BaseURL:        "https://mainichi.jp/yoroku/",
		ListSelector:   "ul.articlelist li a",
		DetailSelector: "section#articledetail-body p",
		DateSelector:   "span.articletag-date",
		DateRegexp:     `(\d{4}/\d{1,2}/\d{1,2})`, // e.g. 2024/12/7
		DateFormat:     "2006/1/2",
		CSVFile:        "mainichi_articles.csv",
	},
}

ここでは、複数の新聞に対応する設定情報を管理しています。それぞれのサイトの構造に合わせて、以下を設定します：

BaseURL: 記事一覧ページのURL
ListSelector: 記事リンクのCSSセレクタ
DetailSelector: 記事本文のCSSセレクタ
DateSelector: 日付を取得するCSSセレクタ
DateRegexp: 日付要素のテキストから日付部分を抜き出す正規表現
DateFormat: 抜き出した文字列を time.Parse で解釈するためのレイアウト

記事のスクレイピング処理

// processNewspaper fetches the article index page at config.BaseURL, follows
// every link matched by config.ListSelector and passes each article URL to
// processArticle.
//
// Each href is resolved against the index page URL, so absolute
// ("https://…"), protocol-relative ("//host/…"), root-relative ("/path") and
// document-relative ("path") links all yield a valid absolute URL. Plain
// string concatenation with BaseURL only works for the last form.
//
// Failures are reported on standard output; an unusable link is skipped and
// the remaining links are still processed.
func processNewspaper(config NewspaperConfig) {
	base, err := url.Parse(config.BaseURL)
	if err != nil {
		fmt.Printf("Invalid base URL: %v\n", err)
		return
	}

	doc, err := fetchDocument(config.BaseURL)
	if err != nil {
		fmt.Printf("Failed to fetch URL: %v\n", err)
		return
	}

	doc.Find(config.ListSelector).Each(func(_ int, link *goquery.Selection) {
		href, ok := link.Attr("href")
		if !ok {
			return
		}

		// (*url.URL).Parse resolves href relative to the receiver.
		target, err := base.Parse(href)
		if err != nil {
			fmt.Printf("Skipping invalid link %q: %v\n", href, err)
			return
		}

		processArticle(target.String(), config)
	})
}

fetchDocument を使って、記事一覧ページを取得。
Find メソッドで記事リンクを抽出。
各リンクに対して processArticle 関数を実行。

実行例

全ての新聞を処理:

go run main.go -all

特定の新聞を処理:

go run main.go -newspaper=mainichi

おわりに

Go言語のシンプルさを活かして、実用的なWebスクレイピングツールを作成しました。新聞のスクレイピングだけでなく、他の用途にも応用可能です。

ぜひ、この記事を参考にして、あなたのプロジェクトにも活用してください！ 🚀

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up