More than 3 years have passed since last update.

[Go]文字コードのUTF-8変換を行う - autodetect

Posted at 2020-08-13

文字コードが分からないデータを自動判別(autodetect)したい場合、これまでは自前でゴリゴリ書いていたのですが、
ICUの実装をGoに移植したsaintfish/chardetという便利なライブラリがありました。

実装

package main

import (
	"errors"
	"fmt"
	"github.com/saintfish/chardet"
	"golang.org/x/net/html/charset"
	"golang.org/x/text/transform"
)

// DecodeAutoDetect guesses the character encoding of src with chardet's HTML
// detector and returns the content decoded to UTF-8.
//
// When detection fails, the detected charset name is not known to
// charset.Lookup, or decoding fails, the raw bytes of src are returned
// unchanged (as a string) together with a non-nil error.
func DecodeAutoDetect(src []byte) (string, error) {
	best, err := chardet.NewHtmlDetector().DetectBest(src)
	if err != nil {
		return string(src), err
	}

	// Only the encoding is needed; the canonical name returned by Lookup is
	// not used. A nil encoding means the detected name was not recognised.
	enc, _ := charset.Lookup(best.Charset)
	if enc == nil {
		msg := fmt.Sprintf("invalid charset [%s]", best.Charset)
		return string(src), errors.New(msg)
	}

	utf8Bytes, _, err := transform.Bytes(enc.NewDecoder(), src)
	if err != nil {
		return string(src), err
	}
	return string(utf8Bytes), nil
}

車輪の再発明はしないに越したことはないです。

確認

とはいえ、saintfish/chardetは更新がだいぶ昔に止まっているし、本当に判別できるのか心配になったので、Shift-JIS・EUC-JP・ISO-2022-JP・UTF-8の以下のサイトを突っ込んでテストを回してみる。

http://charset.7jp.net/sjis.html
http://charset.7jp.net/jis.html
http://charset.7jp.net/euc.html
https://www.unicode.org/reports/tr51/tr51-18.html

package chardet_test

import (
	"github.com/saintfish/chardet"
	"io"
	"os"
	"path/filepath"
	"testing"
)

// TestDetector feeds each sample file under testdata to a chardet detector
// and checks the detected charset and language against the expected values.
func TestDetector(t *testing.T) {
	type testCase struct {
		File     string
		IsHtml   bool
		Charset  string
		Language string
	}
	data := []testCase{
		{"utf8.html", true, "UTF-8", ""},
		{"utf8_bom.html", true, "UTF-8", ""},
		{"8859_1_en.html", true, "ISO-8859-1", "en"},
		{"8859_1_da.html", true, "ISO-8859-1", "da"},
		{"8859_1_de.html", true, "ISO-8859-1", "de"},
		{"8859_1_es.html", true, "ISO-8859-1", "es"},
		{"8859_1_fr.html", true, "ISO-8859-1", "fr"},
		{"8859_1_pt.html", true, "ISO-8859-1", "pt"},
		{"iso-2022-jp.html", true, "ISO-2022-JP", "ja"},
		{"gb18030.html", true, "GB-18030", "zh"},
		{"euc_jp.html", true, "EUC-JP", "ja"},
		{"euc_kr.html", true, "EUC-KR", "ko"},
		{"big5.html", true, "Big5", "zh"},
	}

	textDetector := chardet.NewTextDetector()
	htmlDetector := chardet.NewHtmlDetector()
	// At most the first 32 KiB of each file is passed to the detector; the
	// buffer is reused across iterations.
	buffer := make([]byte, 32<<10)
	for _, d := range data {
		f, err := os.Open(filepath.Join("testdata", d.File))
		if err != nil {
			t.Fatal(err)
		}
		size, err := io.ReadFull(f, buffer)
		// Close immediately instead of deferring: a defer inside the loop
		// would keep every file open until the whole test returns. The file
		// is read-only, so the Close error carries no useful information.
		f.Close()
		// io.ReadFull reports io.EOF / io.ErrUnexpectedEOF when the file is
		// shorter than the buffer, which is expected here; anything else is
		// a genuine read failure.
		if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
			t.Fatal(err)
		}
		input := buffer[:size]

		detector := textDetector
		if d.IsHtml {
			detector = htmlDetector
		}
		result, err := detector.DetectBest(input)
		if err != nil {
			t.Fatal(err)
		}
		if result.Charset != d.Charset {
			t.Errorf("[%s] Expected charset %s, actual %s", d.Charset, d.Charset, result.Charset)
		}
		if result.Language != d.Language {
			t.Errorf("[%s] Expected language %s, actual %s", d.Charset, d.Language, result.Language)
		}
	}
}

=== RUN   TestDetector
--- FAIL: TestDetector (0.07s)
    detector_test.go:58: [ISO-2022-JP] Expected language ja, actual 
FAIL

あら、ISO-2022-JPの場合はlanguageの判定が出来ないようです。
ただし、charsetの判定は出来ているので問題はなさそう。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up