Goで文字コードを自動判別してCSVを読み込む/文字コードを指定して書き込む
文字コードとか種類多すぎてわからーん。
とりあえずUTF-8, Shift_JIS, EUC-JPくらい対応したい。
今のところは問題なく動いているけど、例外があったら直す。
CSVの文字コード自動判別/指定書き込み
package separate
import (
"bufio"
"bytes"
"encoding/csv"
"io"
"os"
"strings"
"github.com/saintfish/chardet"
"golang.org/x/text/encoding/japanese"
"golang.org/x/text/transform"
)
// EncodeType identifies the character encoding used when reading or writing
// CSV data and text.
type EncodeType int

// Supported encodings, numbered sequentially from zero.
const (
	// UTF8 is UTF-8; when writing, no byte order mark is emitted.
	UTF8 EncodeType = iota
	// UTF8BOM is UTF-8 with the EF BB BF byte order mark written first.
	UTF8BOM
	// ShiftJIS is Shift_JIS.
	ShiftJIS
	// EUCJP is EUC-JP.
	EUCJP
)
// ReadFileDetect loads the whole file at filePath into memory and parses it as
// delimiter-separated records via ReadDetect, using split as the field
// delimiter.
//
// It returns the parsed records together with the encoding that was used to
// decode them. If the file cannot be read, the records are nil, the encoding
// is UTF8, and the read error is returned.
func ReadFileDetect(filePath string, split rune) ([][]string, EncodeType, error) {
	raw, readErr := os.ReadFile(filePath)
	if readErr == nil {
		return ReadDetect(raw, split)
	}
	return nil, UTF8, readErr
}
// ReadDetect guesses the character encoding of data with chardet, decodes it
// accordingly, and parses the result as delimiter-separated records using
// split as the field delimiter.
//
// The detected charset name is upper-cased and mapped as follows:
//   - "EUC-JP" is read as EUCJP;
//   - any name containing "WINDOWS" or "JIS" is read as ShiftJIS;
//   - everything else is read as UTF8.
//
// The returned EncodeType is the encoding that was chosen. If detection
// itself fails, the records are nil, the encoding is UTF8, and the detector's
// error is returned.
func ReadDetect(data []byte, split rune) ([][]string, EncodeType, error) {
	best, err := chardet.NewTextDetector().DetectBest(data)
	if err != nil {
		return nil, UTF8, err
	}

	name := strings.ToUpper(best.Charset)
	enc := UTF8
	switch {
	case name == "EUC-JP":
		enc = EUCJP
	case strings.Contains(name, "WINDOWS"), strings.Contains(name, "JIS"):
		// NOTE(review): "WINDOWS" presumably catches Shift_JIS text that the
		// detector reports as a Windows code page — confirm with real samples.
		enc = ShiftJIS
	}

	records, err := ReadStream(bytes.NewReader(data), split, enc)
	return records, enc, err
}
// ReadStream reads every record from ioReader, decoding it from enc and
// splitting fields on split.
//
// ShiftJIS and EUCJP input is passed through the matching decoder. Any other
// enc value is treated as UTF-8, and a leading EF BB BF byte order mark is
// dropped if present. Records may have differing numbers of fields
// (FieldsPerRecord is -1).
func ReadStream(ioReader io.Reader, split rune, enc EncodeType) ([][]string, error) {
	var src io.Reader
	switch enc {
	case ShiftJIS:
		src = transform.NewReader(ioReader, japanese.ShiftJIS.NewDecoder())
	case EUCJP:
		src = transform.NewReader(ioReader, japanese.EUCJP.NewDecoder())
	default:
		buffered := bufio.NewReader(ioReader)
		// Peek fails when fewer than three bytes are available, in which case
		// there cannot be a BOM to strip.
		head, err := buffered.Peek(3)
		if err == nil && head[0] == 0xEF && head[1] == 0xBB && head[2] == 0xBF {
			// The three bytes are already buffered by Peek.
			buffered.Discard(3)
		}
		src = buffered
	}

	records := csv.NewReader(src)
	records.Comma = split
	records.FieldsPerRecord = -1
	return records.ReadAll()
}
// **** 以下は書き込み系 ****
// innerWrite encodes datas as delimiter-separated records, using split as the
// field delimiter, and writes them to file in the encoding selected by enc.
//
// ShiftJIS and EUCJP output is passed through the matching encoder; the
// encoder is closed after the records are written so that any bytes it still
// holds are converted and written. UTF8BOM writes the EF BB BF byte order mark
// first, but only when file is currently empty, so that appending to an
// existing file does not insert a BOM in the middle of it. Any other enc value
// writes plain UTF-8.
//
// It returns the first error from inspecting the file, writing the BOM,
// writing the records, or flushing the encoder. The file itself is not closed.
func innerWrite(file *os.File, split rune, enc EncodeType, datas [][]string) error {
	var (
		out     io.Writer = file
		encoded *transform.Writer
	)
	switch enc {
	case ShiftJIS:
		encoded = transform.NewWriter(file, japanese.ShiftJIS.NewEncoder())
	case EUCJP:
		encoded = transform.NewWriter(file, japanese.EUCJP.NewEncoder())
	case UTF8BOM:
		info, err := file.Stat()
		if err != nil {
			return err
		}
		if info.Size() == 0 {
			// Written straight to the file: an intermediate buffered writer
			// would have to be flushed separately after the CSV writer.
			if _, err := file.Write([]byte{0xEF, 0xBB, 0xBF}); err != nil {
				return err
			}
		}
	}
	if encoded != nil {
		out = encoded
	}

	writer := csv.NewWriter(out)
	writer.Comma = split
	// WriteAll flushes the CSV writer's own buffer into out.
	if err := writer.WriteAll(datas); err != nil {
		return err
	}
	if encoded != nil {
		// Close only flushes the transformer; it does not close file.
		return encoded.Close()
	}
	return nil
}
// WriteFile creates filePath (truncating it if it already exists) and writes
// datas to it as delimiter-separated records in the encoding enc, using split
// as the field delimiter.
//
// It returns the error from creating the file or writing the records. If
// those succeed, the error from closing the file is returned instead of being
// dropped, because a failed close can mean the data did not reach disk.
func WriteFile(filePath string, split rune, enc EncodeType, datas ...[]string) (err error) {
	file, err := os.Create(filePath)
	if err != nil {
		return err
	}
	defer func() {
		// Keep the earlier write error if there is one.
		if closeErr := file.Close(); err == nil {
			err = closeErr
		}
	}()
	return innerWrite(file, split, enc, datas)
}
// AppendFile opens filePath for appending (creating it with mode 0644 if it
// does not exist) and writes datas to the end of it as delimiter-separated
// records in the encoding enc, using split as the field delimiter.
//
// It returns the error from opening the file or writing the records. If those
// succeed, the error from closing the file is returned instead of being
// dropped, because a failed close can mean the data did not reach disk.
func AppendFile(filePath string, split rune, enc EncodeType, datas ...[]string) (err error) {
	file, err := os.OpenFile(filePath, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0644)
	if err != nil {
		return err
	}
	defer func() {
		// Keep the earlier write error if there is one.
		if closeErr := file.Close(); err == nil {
			err = closeErr
		}
	}()
	return innerWrite(file, split, enc, datas)
}
文字列の派生パターンも念の為めもめも。
// ReadDetectString guesses the character encoding of data with chardet and
// returns the decoded text as a string.
//
// The detected charset name is upper-cased and mapped as follows:
//   - "EUC-JP" is read as EUCJP;
//   - any name containing "WINDOWS" or "JIS" is read as ShiftJIS;
//   - everything else is read as UTF8.
//
// The returned EncodeType is the encoding that was chosen. If detection
// itself fails, the string is empty, the encoding is UTF8, and the detector's
// error is returned.
func ReadDetectString(data []byte) (string, EncodeType, error) {
	best, err := chardet.NewTextDetector().DetectBest(data)
	if err != nil {
		return "", UTF8, err
	}

	name := strings.ToUpper(best.Charset)
	enc := UTF8
	switch {
	case name == "EUC-JP":
		enc = EUCJP
	case strings.Contains(name, "WINDOWS"), strings.Contains(name, "JIS"):
		// NOTE(review): "WINDOWS" presumably catches Shift_JIS text that the
		// detector reports as a Windows code page — confirm with real samples.
		enc = ShiftJIS
	}

	text, err := ReadStreamString(bytes.NewReader(data), enc)
	return text, enc, err
}
// ReadStreamString reads ioReader to the end, decoding it from enc, and
// returns the content as a string.
//
// ShiftJIS and EUCJP input is passed through the matching decoder. Any other
// enc value is treated as UTF-8, and a leading EF BB BF byte order mark is
// dropped if present. Whatever was read before an error is still returned
// alongside that error.
func ReadStreamString(ioReader io.Reader, enc EncodeType) (string, error) {
	var src io.Reader
	switch enc {
	case ShiftJIS:
		src = transform.NewReader(ioReader, japanese.ShiftJIS.NewDecoder())
	case EUCJP:
		src = transform.NewReader(ioReader, japanese.EUCJP.NewDecoder())
	default:
		buffered := bufio.NewReader(ioReader)
		// Peek fails when fewer than three bytes are available, in which case
		// there cannot be a BOM to strip.
		head, err := buffered.Peek(3)
		if err == nil && head[0] == 0xEF && head[1] == 0xBB && head[2] == 0xBF {
			// The three bytes are already buffered by Peek.
			buffered.Discard(3)
		}
		src = buffered
	}

	content, err := io.ReadAll(src)
	return string(content), err
}
もうちょっと綺麗にする。