LoginSignup
1
0

More than 1 year has passed since last update.

【MEMO】Goで文字コードを自動判別してCSVを読み込む/文字コードを指定して書き込む

Last updated at Posted at 2022-09-06

Goで文字コードを自動判別してCSVを読み込む/文字コードを指定して書き込む

文字コードとか種類多すぎてわからーん。
とりあえずUTF-8, Shift_JIS, EUC-JPくらい対応したい。

今のところは問題なく動いているけど、例外があったら直す。

CSVの文字コード自動判別/指定書き込み


package separate

import (
	"bufio"
	"bytes"
	"encoding/csv"
	"io"
	"os"
	"strings"

	"github.com/saintfish/chardet"
	"golang.org/x/text/encoding/japanese"
	"golang.org/x/text/transform"
)

// EncodeType identifies the character encoding used when reading or writing
// CSV data in this package.
type EncodeType int

// Supported encodings, numbered consecutively from zero in declaration order.
const (
	// UTF8 is UTF-8; it is written without a byte order mark.
	UTF8 = EncodeType(iota)
	// UTF8BOM is UTF-8 written with a leading byte order mark (EF BB BF).
	UTF8BOM
	// ShiftJIS is Shift_JIS.
	ShiftJIS
	// EUCJP is EUC-JP.
	EUCJP
)

// ReadFileDetect loads the whole file at filePath and parses it as CSV,
// letting ReadDetect guess the character encoding from the file contents.
//
// split is the field delimiter. On success the records and the encoding
// chosen by ReadDetect are returned. If the file cannot be read, the result
// is nil, UTF8 and the error from os.ReadFile.
func ReadFileDetect(filePath string, split rune) ([][]string, EncodeType, error) {
	var (
		raw []byte
		err error
	)
	if raw, err = os.ReadFile(filePath); err != nil {
		return nil, UTF8, err
	}
	return ReadDetect(raw, split)
}

// ReadDetect parses data as CSV after guessing its character encoding with
// chardet.
//
// split is the field delimiter. The detected charset name is upper-cased and
// mapped as follows: exactly "EUC-JP" is decoded as EUC-JP, any name
// containing "WINDOWS" or "JIS" is decoded as Shift_JIS, and everything else
// is read as UTF-8. The chosen EncodeType is returned with the records and
// any parse error. If detection itself fails, the result is nil, UTF8 and
// the detector's error.
func ReadDetect(data []byte, split rune) ([][]string, EncodeType, error) {
	best, err := chardet.NewTextDetector().DetectBest(data)
	if err != nil {
		return nil, UTF8, err
	}

	// Pick the decoder first, then read once.
	name := strings.ToUpper(best.Charset)
	enc := UTF8
	switch {
	case name == "EUC-JP":
		enc = EUCJP
	case strings.Contains(name, "WINDOWS"), strings.Contains(name, "JIS"):
		enc = ShiftJIS
	}

	records, err := ReadStream(bytes.NewBuffer(data), split, enc)
	return records, enc, err
}

// ReadStream reads every CSV record from ioReader, decoding the bytes
// according to enc.
//
// split is the field delimiter. ShiftJIS and EUCJP input is transcoded through
// the matching decoder. Any other enc value is read as-is, except that a
// leading UTF-8 byte order mark (EF BB BF) is skipped when present. Records
// may have differing numbers of fields (FieldsPerRecord is -1).
func ReadStream(ioReader io.Reader, split rune, enc EncodeType) ([][]string, error) {
	var src io.Reader
	switch enc {
	case ShiftJIS:
		src = transform.NewReader(ioReader, japanese.ShiftJIS.NewDecoder())
	case EUCJP:
		src = transform.NewReader(ioReader, japanese.EUCJP.NewDecoder())
	default:
		buffered := bufio.NewReader(ioReader)
		// Peek fails when fewer than three bytes are available; in that case
		// there is no BOM to strip and the input is left untouched.
		if head, err := buffered.Peek(3); err == nil && bytes.Equal(head, []byte{0xEF, 0xBB, 0xBF}) {
			buffered.Discard(3)
		}
		src = buffered
	}

	r := csv.NewReader(src)
	r.Comma = split
	r.FieldsPerRecord = -1
	return r.ReadAll()
}

// **** 以下は書き込み系 ****

// innerWrite encodes datas as CSV and writes it to file using the encoding
// selected by enc.
//
// split is the field delimiter. ShiftJIS and EUCJP output goes through the
// matching encoder, UTF8BOM output is preceded by the UTF-8 byte order mark,
// and any other enc value is written unchanged. The returned error is the
// result of csv.Writer.WriteAll.
func innerWrite(file *os.File, split rune, enc EncodeType, datas [][]string) error {
	var dst io.Writer = file
	switch enc {
	case ShiftJIS:
		dst = japanese.ShiftJIS.NewEncoder().Writer(file)
	case EUCJP:
		dst = japanese.EUCJP.NewEncoder().Writer(file)
	case UTF8BOM:
		// The BOM is only queued in the buffer here; it reaches the file when
		// WriteAll flushes.
		// NOTE(review): this relies on csv.NewWriter adopting an existing
		// *bufio.Writer instead of wrapping it in a second buffer.
		buffered := bufio.NewWriter(file)
		buffered.Write([]byte{0xEF, 0xBB, 0xBF})
		dst = buffered
	}

	w := csv.NewWriter(dst)
	w.Comma = split
	return w.WriteAll(datas)
}

// WriteFile creates (or truncates) the file at filePath and writes datas to it
// as CSV in the encoding selected by enc.
//
// split is the field delimiter and each element of datas is one record.
// The returned error is the first failure among creating the file, writing
// the records, and closing the file. The close error is reported because a
// failed close on a freshly written file can mean the data never reached
// storage.
func WriteFile(filePath string, split rune, enc EncodeType, datas ...[]string) (err error) {
	file, err := os.Create(filePath)
	if err != nil {
		return err
	}
	defer func() {
		// Keep an earlier write error; otherwise surface the close error.
		if closeErr := file.Close(); err == nil {
			err = closeErr
		}
	}()
	return innerWrite(file, split, enc, datas)
}

// AppendFile appends datas as CSV records to the file at filePath, creating
// the file with mode 0644 if it does not exist.
//
// split is the field delimiter, enc selects the output encoding and each
// element of datas is one record. When enc is UTF8BOM the byte order mark is
// written only if the file is currently empty; appending to a file that
// already has content writes plain UTF-8 so that no BOM ends up in the middle
// of the file. The returned error is the first failure among opening the
// file, inspecting it, writing the records, and closing the file.
func AppendFile(filePath string, split rune, enc EncodeType, datas ...[]string) (err error) {
	file, err := os.OpenFile(filePath, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0644)
	if err != nil {
		return err
	}
	defer func() {
		// Keep an earlier write error; otherwise surface the close error.
		if closeErr := file.Close(); err == nil {
			err = closeErr
		}
	}()

	if enc == UTF8BOM {
		info, statErr := file.Stat()
		if statErr != nil {
			return statErr
		}
		// A BOM is only meaningful as the very first bytes of a file.
		if info.Size() > 0 {
			enc = UTF8
		}
	}
	return innerWrite(file, split, enc, datas)
}


文字列の派生パターンも念の為めもめも。


// ReadDetectString decodes data into a string after guessing its character
// encoding with chardet.
//
// The detected charset name is upper-cased and mapped as follows: exactly
// "EUC-JP" is decoded as EUC-JP, any name containing "WINDOWS" or "JIS" is
// decoded as Shift_JIS, and everything else is read as UTF-8. The chosen
// EncodeType is returned with the text and any read error. If detection
// itself fails, the result is "", UTF8 and the detector's error.
func ReadDetectString(data []byte) (string, EncodeType, error) {
	best, err := chardet.NewTextDetector().DetectBest(data)
	if err != nil {
		return "", UTF8, err
	}

	// Pick the decoder first, then read once.
	name := strings.ToUpper(best.Charset)
	enc := UTF8
	switch {
	case name == "EUC-JP":
		enc = EUCJP
	case strings.Contains(name, "WINDOWS"), strings.Contains(name, "JIS"):
		enc = ShiftJIS
	}

	text, err := ReadStreamString(bytes.NewBuffer(data), enc)
	return text, enc, err
}

// ReadStreamString reads ioReader to the end and returns its contents as a
// string, decoding the bytes according to enc.
//
// ShiftJIS and EUCJP input is transcoded through the matching decoder. Any
// other enc value is read as-is, except that a leading UTF-8 byte order mark
// (EF BB BF) is skipped when present. Whatever was read before a failure is
// still returned together with the error.
func ReadStreamString(ioReader io.Reader, enc EncodeType) (string, error) {
	var src io.Reader
	switch enc {
	case ShiftJIS:
		src = transform.NewReader(ioReader, japanese.ShiftJIS.NewDecoder())
	case EUCJP:
		src = transform.NewReader(ioReader, japanese.EUCJP.NewDecoder())
	default:
		buffered := bufio.NewReader(ioReader)
		// Peek fails when fewer than three bytes are available; in that case
		// there is no BOM to strip and the input is left untouched.
		if head, err := buffered.Peek(3); err == nil && bytes.Equal(head, []byte{0xEF, 0xBB, 0xBF}) {
			buffered.Discard(3)
		}
		src = buffered
	}

	content, err := io.ReadAll(src)
	return string(content), err
}

もうちょっと綺麗にする。

1
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1
0