0
0

More than 1 year has passed since last update.

csvのデリミタ、改行文字とかを推測

Last updated at Posted at 2022-01-07

csv定義 推測関数


def sniffer(dat:str,
    delimiters = ('\t', ';', ',', ':', '|', ' '),
    quotechars = ('"', '\'', '`'),
    escapechars = ('\\', '^'),
    ):
    """Guess csv dialect parameters (delimiter, line terminator, quoting) from text.

    The guesses are simple substring counts over the whole of ``dat``.

    Args:
        dat: The CSV text to inspect.
        delimiters: Candidate delimiter strings.
        quotechars: Candidate quote characters.
        escapechars: Candidate escape characters, tried in order.

    Returns:
        A dict with the keys ``delimiter``, ``doublequote``, ``escapechar``,
        ``lineterminator``, ``quotechar``, ``quoting`` and
        ``skipinitialspace``, usable as keyword arguments for ``csv.reader``.
        ``delimiter`` is ``None`` when no candidate occurs at least once per
        line on average.
    """

    # guess lineterminator
    LF = ""
    lines = 1
    if '\r' in dat:
        LF += '\r'
        # CR-only data: count CRs so the per-line delimiter threshold below
        # behaves the same as for LF-terminated data.
        lines = dat.count('\r')
    if '\n' in dat:
        LF += '\n'
        lines = dat.count('\n')
    # NOTE(review): this matches the literal three-character text "0x0";
    # presumably a NUL character was intended -- confirm before changing.
    if LF == "" and '0x0' in dat:
        LF = '0x0'
        lines = dat.count('0x0')

    # guess delimiter: the most frequent candidate that occurs at least once
    # per line on average (ties are broken by comparing the strings).
    delim_counts = ((dat.count(x), x) for x in delimiters if x in dat)
    mxdlm, delimiter = max(
        ((v, x) for v, x in delim_counts if v >= lines),
        default=(0, None))

    # guess quotes: only candidates with an even count can be balanced pairs.
    quote_counts = ((dat.count(x), x) for x in quotechars if x in dat)
    mxqt, quotechar = max(
        ((v, x) for v, x in quote_counts if v % 2 == 0),
        default=(0, '"'))

    # guess escapechar, etc.  The defaults are bound up front so the return
    # below never sees unbound names when quotechar is falsy.
    escapechar, doublequote = None, True
    if quotechar:
        for esc in escapechars:
            if (esc + quotechar) in dat:
                # An escaped quote was seen, so quotes are not doubled.
                escapechar, doublequote = esc, False
                break

    return dict(
        delimiter=delimiter,
        doublequote=doublequote,
        escapechar=escapechar,
        lineterminator=LF,
        quotechar=quotechar,
        # 1 and 0 are the values of csv.QUOTE_ALL and csv.QUOTE_MINIMAL.
        quoting = 1 if quotechar and mxqt > mxdlm * 2 else 0,
        skipinitialspace = bool(delimiter) and (delimiter + ' ') in dat)

使い方

>>> csvdat = """n,aa
1,1
2,あ
"""
>>> print(sniffer(csvdat))
{'delimiter': ',', 'doublequote': True, 'escapechar': None, 'lineterminator': '\n', 'quotechar': '"', 'quoting': 0, 'skipinitialspace': False}

>>> import csv
>>> from io import StringIO
>>> r = csv.reader(StringIO(csvdat), **sniffer(csvdat))
>>> print(list(r))
[['n', 'aa'], ['1', '1'], ['2', 'あ']]

以上csv.Sniffer 車輪の再発明

0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0