More than 3 years have passed since last update.

[Python] テキストファイルを指定バイト数ごとに分割する

Last updated at 2021-02-01 / Posted at 2021-02-01

バイト数の取得

Pythonで文字列のバイト数は

len('**文字列**'.encode('**文字コード**'))

で取得できる。

e.g.

len('あああ'.encode('utf-8'))

バイト数ごとに文字列を分割する面倒くささ

print(len("a".encode('utf-8'))) # => 1
print(len("Ø".encode('utf-8'))) # => 2
print(len("あ".encode('utf-8'))) # => 3
print(len("𧚓".encode('utf-8'))) # => 4
print(len("🤔".encode('utf-8'))) # => 4
print(len("🙋🏽‍♀️".encode('utf-8'))) # => 17

文字数とバイト数が一致しないので結構めんどい。

バイト数ごとに文字列を分割する関数

def split_text_by_byte_size(text, split_byte_size, charcode='utf-8'):
    """Split *text* into chunks that fit in *split_byte_size* encoded bytes.

    A chunk never ends in the middle of a character: bytes of a character
    that only partially fits are left for the next chunk.

    Args:
        text: The string to split.
        split_byte_size: Maximum number of bytes per chunk, measured on the
            text encoded with *charcode*.
        charcode: Name of the encoding used to measure byte sizes.

    Returns:
        A list of consecutive pieces of *text*. The list is empty when
        *text* is empty, when *split_byte_size* is not positive, or when the
        first character alone does not fit in *split_byte_size* bytes. If a
        later character does not fit, splitting stops there and the chunks
        collected so far are returned (the rest of the text is not included).
    """
    if split_byte_size <= 0:
        return []

    results = []
    start = 0
    # Iterate instead of recursing: one recursion level per chunk overflows
    # the interpreter's recursion limit on large inputs.
    while start < len(text):
        # Every character encodes to at least one byte, so the next chunk can
        # never hold more than split_byte_size characters. Encoding only that
        # window avoids re-encoding the whole remaining text for every chunk.
        window = text[start:start + split_byte_size]
        encoded = window.encode(charcode)
        # errors='ignore' discards the trailing bytes of a character that was
        # cut in half by the byte slice.
        head = encoded[:split_byte_size].decode(charcode, errors='ignore')
        if not head:
            # Not even one character fits in split_byte_size bytes.
            break
        results.append(head)
        start += len(head)

    return results

こんな感じになる(先頭の文字が指定バイト数に収まらない場合は空のリストが返る)

print(split_text_by_byte_size('aaa', 1)) # => ['a', 'a', 'a']
print(split_text_by_byte_size('aaa', 2)) # => ['aa', 'a']
print(split_text_by_byte_size('aaa', 3)) # => ['aaa']
print(split_text_by_byte_size('あいう', 2)) # => []
print(split_text_by_byte_size('あいう', 3)) # => ['あ', 'い', 'う']
print(split_text_by_byte_size('あいう', 4)) # => ['あ', 'い', 'う']
print(split_text_by_byte_size('あいう', 6)) # => ['あい', 'う']
print(split_text_by_byte_size('あいう', 7)) # => ['あい', 'う']
print(split_text_by_byte_size('あいう', 9)) # => ['あいう']
print(split_text_by_byte_size('🤔🤔', 3)) # => []
print(split_text_by_byte_size('🤔🤔', 4)) # => ['🤔', '🤔']
print(split_text_by_byte_size('🤔🤔', 5)) # => ['🤔', '🤔']
print(split_text_by_byte_size('🤔🤔', 8)) # => ['🤔🤔']
print(split_text_by_byte_size('aあ🤔', 4)) # => ['aあ', '🤔']
print(split_text_by_byte_size('aaあ🤔', 4)) # => ['aa', 'あ', '🤔']
print(split_text_by_byte_size('aaあaa🤔', 4)) # => ['aa', 'あa', 'a', '🤔']

decode(..., errors='ignore')

decode(..., errors='ignore') がミソ。

print("あああ".encode('utf-8')) # => b'\xe3\x81\x82\xe3\x81\x82\xe3\x81\x82'
print("あああ".encode('utf-8').decode('utf-8')) # => あああ

# Decoding a slice that ends partway through the bytes of the third "あ" raises UnicodeDecodeError
print("あああ".encode('utf-8')[0:7].decode('utf-8')) # => UnicodeDecodeError

# With errors='ignore', the incomplete trailing bytes are ignored and the text before them is returned
print("あああ".encode('utf-8')[0:7].decode('utf-8', errors='ignore')) # => ああ

ファイル書き出しの処理とかと合わせる

import glob
import os
from os import path


def split_text_by_byte_size(text, split_byte_size, charcode='utf-8'):
    """Split *text* into chunks that fit in *split_byte_size* encoded bytes.

    A chunk never ends in the middle of a character: bytes of a character
    that only partially fits are left for the next chunk.

    Args:
        text: The string to split.
        split_byte_size: Maximum number of bytes per chunk, measured on the
            text encoded with *charcode*.
        charcode: Name of the encoding used to measure byte sizes.

    Returns:
        A list of consecutive pieces of *text*. The list is empty when
        *text* is empty, when *split_byte_size* is not positive, or when the
        first character alone does not fit in *split_byte_size* bytes. If a
        later character does not fit, splitting stops there and the chunks
        collected so far are returned (the rest of the text is not included).
    """
    if split_byte_size <= 0:
        return []

    results = []
    start = 0
    # Iterate instead of recursing: one recursion level per chunk overflows
    # the interpreter's recursion limit on large inputs.
    while start < len(text):
        # Every character encodes to at least one byte, so the next chunk can
        # never hold more than split_byte_size characters. Encoding only that
        # window avoids re-encoding the whole remaining text for every chunk.
        window = text[start:start + split_byte_size]
        encoded = window.encode(charcode)
        # errors='ignore' discards the trailing bytes of a character that was
        # cut in half by the byte slice.
        head = encoded[:split_byte_size].decode(charcode, errors='ignore')
        if not head:
            # Not even one character fits in split_byte_size bytes.
            break
        results.append(head)
        start += len(head)

    return results


if __name__ == '__main__':
    # Encoding used to read the input files, to measure chunk sizes and to
    # write the output files.
    charset = 'utf-8'
    # Maximum size of each output file, in bytes.
    split_byte_size = 10000

    input_dir = '**入力フォルダ**'
    output_dir = '**出力フォルダ**'
    os.makedirs(output_dir, exist_ok=True)

    for file_path in glob.glob(input_dir + '/*'):
        # glob also returns sub-directories, which cannot be opened as text.
        if not path.isfile(file_path):
            continue

        # Pass the encoding explicitly instead of relying on the platform
        # default, and disable newline translation (newline='') so the bytes
        # written match the bytes that were measured when splitting.
        with open(file_path, 'r', encoding=charset, newline='') as fr:
            text = fr.read()

        split_texts = split_text_by_byte_size(text, split_byte_size, charset)

        # Output files are named "<name>_<index><ext>" after the input file.
        stem, ext = path.splitext(path.basename(file_path))
        for index, split_text in enumerate(split_texts):
            output_path = path.join(output_dir, stem + '_' + str(index) + ext)
            with open(output_path, 'w', encoding=charset, newline='') as fw:
                fw.write(split_text)
            print(output_path)

お気持ち

splitの過去分詞形ってsplitなんですね……。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up