LoginSignup
1
1

More than 3 years have passed since last update.

[Go]文字コードのUTF-8変換を行う - autodetect

Posted at

文字コードが分からないデータを自動判別（autodetect）してUTF-8に変換する処理は、これまで自前でゴリゴリ書いていたのですが、
ICUの実装をGoで書いた saintfish/chardet という便利なライブラリがありました。

実装

package main

import (
    "errors"
    "fmt"
    "github.com/saintfish/chardet"
    "golang.org/x/net/html/charset"
    "golang.org/x/text/transform"
)

// errInvalidCharset is the base error returned by DecodeAutoDetect when the
// detected charset has no matching decoder. Callers can test for it with
// errors.Is.
var errInvalidCharset = errors.New("invalid charset")

// DecodeAutoDetect converts src to a UTF-8 string, detecting the source
// charset automatically with chardet's HTML detector.
//
// On any failure (detection error, unknown charset, or decode error) the
// original bytes are returned unconverted as a string together with the error.
func DecodeAutoDetect(src []byte) (string, error) {
    d := chardet.NewHtmlDetector()
    r, err := d.DetectBest(src)
    if err != nil {
        return string(src), err
    }

    // chardet reports GB18030 as "GB-18030", which charset.Lookup does not
    // accept as a label; translate it to the label the lookup understands.
    label := r.Charset
    if label == "GB-18030" {
        label = "gb18030"
    }

    // The second return value (canonical name) is not needed here; a nil
    // encoding means the label is unknown.
    e, _ := charset.Lookup(label)
    if e == nil {
        return string(src), fmt.Errorf("%w [%s]", errInvalidCharset, r.Charset)
    }

    // The byte count consumed from src is not needed.
    decoded, _, err := transform.Bytes(e.NewDecoder(), src)
    if err != nil {
        return string(src), err
    }
    return string(decoded), nil
}

車輪の再発明はしないに越したことはないです。

確認

とはいえ、saintfish/chardetは更新がだいぶ昔に止まっていて、本当に判別できるのか心配になったので、Shift-JIS、EUC-JP、ISO-2022-JP、UTF-8 の以下のサイトを突っ込んでテストを回してみます。

http://charset.7jp.net/sjis.html
http://charset.7jp.net/jis.html
http://charset.7jp.net/euc.html
https://www.unicode.org/reports/tr51/tr51-18.html

package chardet_test

import (
    "github.com/saintfish/chardet"
    "io"
    "os"
    "path/filepath"
    "testing"
)

// TestDetector checks that chardet reports the expected charset and language
// for each sample file under the testdata directory.
func TestDetector(t *testing.T) {
    // sample describes one fixture file and the detection result expected for it.
    type sample struct {
        File     string
        IsHTML   bool
        Charset  string
        Language string
    }
    samples := []sample{
        {"utf8.html", true, "UTF-8", ""},
        {"utf8_bom.html", true, "UTF-8", ""},
        {"8859_1_en.html", true, "ISO-8859-1", "en"},
        {"8859_1_da.html", true, "ISO-8859-1", "da"},
        {"8859_1_de.html", true, "ISO-8859-1", "de"},
        {"8859_1_es.html", true, "ISO-8859-1", "es"},
        {"8859_1_fr.html", true, "ISO-8859-1", "fr"},
        {"8859_1_pt.html", true, "ISO-8859-1", "pt"},
        {"iso-2022-jp.html", true, "ISO-2022-JP", "ja"},
        {"gb18030.html", true, "GB-18030", "zh"},
        {"euc_jp.html", true, "EUC-JP", "ja"},
        {"euc_kr.html", true, "EUC-KR", "ko"},
        {"big5.html", true, "Big5", "zh"},
    }

    textDetector := chardet.NewTextDetector()
    htmlDetector := chardet.NewHtmlDetector()
    // At most the first 32 KiB of each file are passed to the detector; the
    // buffer is reused across iterations.
    buffer := make([]byte, 32<<10)
    for _, s := range samples {
        f, err := os.Open(filepath.Join("testdata", s.File))
        if err != nil {
            t.Fatal(err)
        }
        size, err := io.ReadFull(f, buffer)
        // Close immediately rather than deferring: a defer inside the loop
        // would keep every file open until the whole test returns.
        f.Close()
        // A file shorter than the buffer ends the read with io.EOF or
        // io.ErrUnexpectedEOF, which is expected; anything else is a real
        // I/O failure.
        if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
            t.Fatal(err)
        }
        input := buffer[:size]

        detector := textDetector
        if s.IsHTML {
            detector = htmlDetector
        }
        result, err := detector.DetectBest(input)
        if err != nil {
            t.Fatal(err)
        }
        if result.Charset != s.Charset {
            t.Errorf("[%s] Expected charset %s, actual %s", s.Charset, s.Charset, result.Charset)
        }
        if result.Language != s.Language {
            t.Errorf("[%s] Expected language %s, actual %s", s.Charset, s.Language, result.Language)
        }
    }
}
=== RUN   TestDetector
--- FAIL: TestDetector (0.07s)
    detector_test.go:58: [ISO-2022-JP] Expected language ja, actual 
FAIL

あら、ISO-2022-JPの場合はlanguageの判定が出来ないようです。
ただし、charsetの判定は出来ており、UTF-8への変換で使うのはcharsetだけなので、問題はなさそうです。

1
1
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1
1