日々 http://manga-now.com で xml をパースしているのだけど、Python の実装を Go に変えたら速くなるのか比較してみました。なおパースの速度だけ比較したいので xml がメモリに読み込まれた状態から各要素を取得し終わるまでの時間を計測しています。

xml のダウンロード

まず Amazon Product Advertising API を使って書籍情報の xml を落としてファイルに保存しておきます。

get_books_xml.go

$ mkdir xmls
$ go run get_books_xml.go

AccessKey, SecretKey, AssociateTag を適当なものに変更して実行すると xmls ディレクトリに 145個のファイルが保存されます。1つのファイルには10冊までの情報が含まれ、合計1442冊の情報になります。

Python で実行

parse_amazon_xml.py

# -*- coding:utf-8 -*-
import time
from lxml import objectify


class ImageInfo:
    """URL and dimensions of one cover image.

    Every field starts as an empty string; the parser overwrites them when
    the corresponding image element is found.
    """

    def __init__(self):
        self.url = self.width = self.height = ''

class BookInfo:
    """Bibliographic data for one book.

    All text fields start as empty strings and ``images`` starts as an empty
    dict owned by the instance.
    """

    def __init__(self):
        textFields = ('asin', 'title', 'binding', 'author',
                      'publisher', 'publicationDate')
        for name in textFields:
            setattr(self, name, '')
        self.images = {}


def getText(dom, tag):
    """Return the text of the child element named ``tag``, or '' if absent.

    The child is looked up with ``getattr`` rather than ``tag in dom``:
    on lxml elements the ``in`` operator tests whether an *element* is a
    child, so it never matches a tag-name string and the old check always
    fell through to ''.

    An element that exists but has no text also yields ''.
    """
    child = getattr(dom, tag, None)
    if child is None:
        return ''
    return child.text or ''


def parseXmls(xmls):
    """Parse each XML document in ``xmls`` and return a list of BookInfo.

    One BookInfo is produced per ``Items/Item`` element, in document order.
    Each book gets an ImageInfo for every label in ``imageLabels``; labels
    missing from the item keep ImageInfo's default (empty) values.
    """
    # Loop-invariant, so built once instead of once per item.
    imageLabels = ['SmallImage', 'MediumImage', 'LargeImage']

    bookInfos = []
    for xml in xmls:
        dom = objectify.fromstring(xml)
        for item in dom.Items.Item:
            bookInfo = BookInfo()
            bookInfo.asin = item.ASIN.text

            attr = item.ItemAttributes
            bookInfo.title = getText(attr, 'Title')
            bookInfo.binding = getText(attr, 'Binding')
            bookInfo.author = getText(attr, 'Author')
            bookInfo.publisher = getText(attr, 'Publisher')
            bookInfo.publicationDate = getText(attr, 'PublicationDate')

            for imageLabel in imageLabels:
                image = ImageInfo()
                # Look the child up by name. `imageLabel in item` tests
                # element membership on lxml elements and is never true for
                # a string, which left every image empty.
                node = getattr(item, imageLabel, None)
                if node is not None:
                    image.url = node.URL.text
                    image.width = int(node.Width.text)
                    image.height = int(node.Height.text)
                bookInfo.images[imageLabel] = image

            bookInfos.append(bookInfo)

    return bookInfos


def getXmls():
    """Read xmls/0.xml, xmls/10.xml, ..., xmls/1440.xml into memory.

    Returns the raw file contents as a list, in numeric order.
    """
    xmls = []
    for i in range(0, 1440 + 1, 10):
        path = 'xmls/{}.xml'.format(i)
        # Read bytes and let the XML parser handle the encoding. Text mode
        # would decode with the locale's encoding, and lxml refuses unicode
        # strings that carry an XML encoding declaration.
        with open(path, 'rb') as f:
            xmls.append(f.read())
    return xmls


def main():
    """Load the XML files, time the parse step alone, and print a summary."""
    documents = getXmls()

    # The clock covers parseXmls only; the files are already in memory.
    startedAt = time.time()
    books = parseXmls(documents)
    elapsed = time.time() - startedAt

    print('xml数: {}'.format(len(documents)))
    print('book数: {}'.format(len(books)))
    print('parse時間: {}秒'.format(elapsed))


# Run the benchmark only when this file is executed as a script.
if __name__ == '__main__':
    main()

$ python parse_amazon_xml.py
xml数: 145
book数: 1442
parse時間: 0.14079904556274414秒

0.140秒でした。パースには lxml モジュールを使用しています。

Go で実行

parse_amazon_xml.go

package main

import (
    "fmt"
    "github.com/PuerkitoBio/goquery"
    "io/ioutil"
    "strconv"
    "strings"
    "time"
)

// ImageInfo describes one cover image: its URL and its pixel dimensions.
// Field order (url, width, height) matters to positional struct literals.
type ImageInfo struct {
    url           string
    width, height int
}

// BookInfo holds the fields extracted for a single book.
type BookInfo struct {
    // Text fields taken from the item and its ItemAttributes element.
    asin, title, binding, author, publisher, publicationDate string

    // images maps an image label ("SmallImage", "MediumImage",
    // "LargeImage") to that image's details.
    images map[string]ImageInfo
}

// parseXmls extracts one BookInfo per "Item" element from every document in
// xmls and returns them in document order.
//
// A document that goquery fails to parse is skipped, because the returned
// document would be nil and calling Find on it would panic. Every book gets
// an entry for each of the three image labels; a label that is absent from
// the item yields a zero-valued ImageInfo.
func parseXmls(xmls []string) []BookInfo {
    // Loop-invariant, so built once instead of once per item.
    imageLabels := []string{
        "SmallImage",
        "MediumImage",
        "LargeImage",
    }

    bookInfos := []BookInfo{}
    for _, xml := range xmls {
        dom, err := goquery.NewDocumentFromReader(strings.NewReader(xml))
        if err != nil {
            continue
        }
        dom.Find("Item").Each(func(_ int, item *goquery.Selection) {
            bookInfo := BookInfo{}
            bookInfo.asin = item.Find("ASIN").Text()

            attributes := item.Find("ItemAttributes").First()
            if attributes.Length() > 0 {
                bookInfo.title = attributes.Find("Title").Text()
                bookInfo.binding = attributes.Find("Binding").Text()
                bookInfo.author = attributes.Find("Author").Text()
                bookInfo.publisher = attributes.Find("Publisher").Text()
                bookInfo.publicationDate = attributes.Find("PublicationDate").Text()
            }

            images := make(map[string]ImageInfo, len(imageLabels))
            for _, imageLabel := range imageLabels {
                node := item.Find(imageLabel).First()
                // Atoi errors are deliberately ignored: a missing image has
                // empty text and is recorded with zero dimensions.
                // Width comes from "Width" and height from "Height"; the two
                // lookups used to be crossed.
                width, _ := strconv.Atoi(node.Find("Width").Text())
                height, _ := strconv.Atoi(node.Find("Height").Text())
                images[imageLabel] = ImageInfo{
                    url:    node.Find("URL").Text(),
                    width:  width,
                    height: height,
                }
            }
            bookInfo.images = images

            bookInfos = append(bookInfos, bookInfo)
        })
    }
    return bookInfos
}

// getXmls reads xmls/0.xml, xmls/10.xml, ..., xmls/1440.xml and returns
// their contents as strings, in numeric order.
//
// It panics if a file cannot be read. Previously the error was dropped and
// the file became an empty document, which silently skewed the benchmark.
func getXmls() []string {
    const (
        lastOffset = 1440
        step       = 10
    )
    xmls := make([]string, 0, lastOffset/step+1)
    for i := 0; i <= lastOffset; i += step {
        path := fmt.Sprintf("xmls/%d.xml", i)
        xml, err := ioutil.ReadFile(path)
        if err != nil {
            panic(fmt.Errorf("reading %s: %v", path, err))
        }
        xmls = append(xmls, string(xml))
    }
    return xmls
}

// main loads the XML files, times the parse step alone, and prints the
// document count, book count and elapsed seconds.
func main() {
    docs := getXmls()

    // The clock covers parseXmls only; the files are already in memory.
    began := time.Now()
    books := parseXmls(docs)
    elapsed := time.Since(began)

    fmt.Printf("xml数: %d\n", len(docs))
    fmt.Printf("book数: %d\n", len(books))
    fmt.Printf("parse時間: %f秒\n", elapsed.Seconds())
}

$ go run parse_amazon_xml.go
xml数: 145
book数: 1442
parse時間: 0.180461秒

0.18秒。Python より遅いですね。パースには goquery を使っています。

Go で並列実行

シングルスレッドだと Go の方が遅いけど、Go なら並列実行が簡単に行えるのでこちらも比較してみます。実行しているCPUは2コア4スレッドです。コードの変更箇所だけ書きます。

parse_amazon_xml_th.go

// parseXmls is the goroutine variant: it takes the result channel as its
// first argument and no longer has a return value.
func parseXmls(result chan []BookInfo, xmls []string) {
    ...同じなので省略
    // Send the parsed books on the channel (this replaces the return).
    result <- bookInfos
}

// divideXmls splits xmls into num consecutive chunks, preserving order.
//
// Chunk sizes differ by at most one: the first len(xmls)%num chunks each
// receive one extra element, so no single chunk carries the whole remainder.
// When num exceeds len(xmls) the trailing chunks are empty. A num below 1 is
// treated as 1, which avoids a division by zero and keeps all input.
//
// The chunks share xmls' backing array; each one's capacity is capped at its
// length so appending to a chunk cannot overwrite its neighbour.
func divideXmls(xmls []string, num int) [][]string {
    if num < 1 {
        num = 1
    }
    base, extra := len(xmls)/num, len(xmls)%num

    result := make([][]string, 0, num)
    start := 0
    for i := 0; i < num; i++ {
        end := start + base
        if i < extra {
            end++
        }
        result = append(result, xmls[start:end:end])
        start = end
    }
    return result
}

// main parses the XML files on four goroutines and prints the document
// count, book count and elapsed seconds. Loading and splitting the files
// happens before the clock starts.
func main() {
    allXmls := getXmls()
    // Split the documents into four chunks, one per goroutine.
    divXmls := divideXmls(allXmls, 4)
    start := time.Now()

    // Each chunk gets its own channel so results can be merged in chunk
    // order; with one shared channel the order of bookInfos depended on
    // which goroutine finished first. The buffer of 1 lets a worker deliver
    // its result and exit without waiting for the collector.
    results := make([]chan []BookInfo, len(divXmls))
    for i, xmls := range divXmls {
        results[i] = make(chan []BookInfo, 1)
        go parseXmls(results[i], xmls)
    }

    // Receive each chunk's result and merge them into one slice.
    bookInfos := []BookInfo{}
    for _, result := range results {
        bookInfos = append(bookInfos, <-result...)
    }

    elapsed := time.Since(start)
    fmt.Printf("xml数: %d\n", len(allXmls))
    fmt.Printf("book数: %d\n", len(bookInfos))
    fmt.Printf("parse時間: %f秒\n", elapsed.Seconds())
}

$ go run parse_amazon_xml_th.go
xml数: 145
book数: 1442
parse時間: 0.084918秒

0.084秒。1スレッドのとき（0.180秒）と比べて2倍くらい速くなった。

まとめ

実装	速度
Python (lxml)	0.140秒
Go (goquery) 1スレッド	0.180秒
Go (goquery) 4スレッド	0.084秒

並列実行してこその Go （並列実行しないと Go のメリットはない）

PythonとGoでxmlのパース速度を比較してみる

xml のダウンロード

Python で実行

Go で実行

Go で並列実行

まとめ