LoginSignup
4
3

More than 3 years have passed since last update.

[Python] テキストファイルを指定バイト数ごとに分割する

Last updated at Posted at 2021-02-01

バイト数の取得

Pythonで文字列のバイト数は

len('**文字列**'.encode('**文字コード**'))

で取得できる。

e.g.

len('あああ'.encode('utf-8'))

バイト数ごとに文字列を分割する面倒くささ

# The UTF-8 encoded length of a single character varies from 1 to 4 bytes,
# and one visible glyph may be made of several code points (last example).
print(len("a".encode('utf-8'))) # => 1
print(len("Ø".encode('utf-8'))) # => 2
print(len("あ".encode('utf-8'))) # => 3
print(len("𧚓".encode('utf-8'))) # => 4
print(len("🤔".encode('utf-8'))) # => 4
print(len("🙋🏽‍♀️".encode('utf-8'))) # => 17

文字数とバイト数が一致しないので結構めんどい。

バイト数ごとに文字列を分割する関数

def split_text_by_byte_size(text, split_byte_size, charcode='utf-8'):
    """Split *text* into chunks whose encoded size fits a byte budget.

    Chunks are cut on character boundaries: a character whose bytes would
    straddle the limit is moved to the next chunk instead of being torn.

    Args:
        text: The string to split.
        split_byte_size: Maximum number of encoded bytes per chunk.
        charcode: Name of the encoding used to measure the byte size.

    Returns:
        A list of chunks in their original order. Splitting stops at the
        first character that does not fit into ``split_byte_size`` bytes on
        its own, so the result is ``[]`` when the very first character does
        not fit (or when ``split_byte_size`` is not positive), and is
        shorter than the input when a later character does not fit.
    """
    if split_byte_size <= 0:
        # No character can fit into an empty (or negative) budget.
        return []

    results = []
    position = 0
    # Iterating instead of recursing keeps the call depth constant, so the
    # number of chunks is not limited by the interpreter's recursion limit.
    while position < len(text):
        # A character occupies at least one byte in ordinary text encodings,
        # so a chunk never spans more than split_byte_size characters.
        # Encoding just that window avoids re-encoding the whole remainder
        # for every chunk.
        window = text[position:position + split_byte_size]
        encoded = window.encode(charcode)
        # errors='ignore' drops the incomplete character left at the cut.
        head = encoded[:split_byte_size].decode(charcode, errors='ignore')
        if not head:
            # The next character alone exceeds the budget: no progress.
            break
        results.append(head)
        position += len(head)

    return results

こんな感じになる。1文字が指定バイト数に収まらない場合はその位置で分割を打ち切るため、先頭の文字が収まらないと結果は [] になる。

# Expected results for UTF-8 input. A chunk never ends in the middle of a
# character, and an empty list means that the first character alone is
# larger than the requested byte size.
print(split_text_by_byte_size('aaa', 1)) # => ['a', 'a', 'a']
print(split_text_by_byte_size('aaa', 2)) # => ['aa', 'a']
print(split_text_by_byte_size('aaa', 3)) # => ['aaa']
print(split_text_by_byte_size('あいう', 2)) # => []
print(split_text_by_byte_size('あいう', 3)) # => ['あ', 'い', 'う']
print(split_text_by_byte_size('あいう', 4)) # => ['あ', 'い', 'う']
print(split_text_by_byte_size('あいう', 6)) # => ['あい', 'う']
print(split_text_by_byte_size('あいう', 7)) # => ['あい', 'う']
print(split_text_by_byte_size('あいう', 9)) # => ['あいう']
print(split_text_by_byte_size('🤔🤔', 3)) # => []
print(split_text_by_byte_size('🤔🤔', 4)) # => ['🤔', '🤔']
print(split_text_by_byte_size('🤔🤔', 5)) # => ['🤔', '🤔']
print(split_text_by_byte_size('🤔🤔', 8)) # => ['🤔🤔']
# Mixed 1-, 3- and 4-byte characters.
print(split_text_by_byte_size('aあ🤔', 4)) # => ['aあ', '🤔']
print(split_text_by_byte_size('aaあ🤔', 4)) # => ['aa', 'あ', '🤔']
print(split_text_by_byte_size('aaあaa🤔', 4)) # => ['aa', 'あa', 'a', '🤔']

decode(..., errors='ignore')

decode(..., errors='ignore') がミソ。

# Encoding yields the raw UTF-8 bytes; decoding all of them round-trips.
print("あああ".encode('utf-8')) # => b'\xe3\x81\x82\xe3\x81\x82\xe3\x81\x82'
print("あああ".encode('utf-8').decode('utf-8')) # => あああ

# Decoding a slice that ends partway through the bytes of the third 'あ'
# raises UnicodeDecodeError.
print("あああ".encode('utf-8')[0:7].decode('utf-8')) # => UnicodeDecodeError

# With errors='ignore' the incomplete trailing bytes are skipped and the
# text decoded before them is returned.
print("あああ".encode('utf-8')[0:7].decode('utf-8', errors='ignore')) # => ああ

ファイル書き出しの処理とかと合わせる

import glob
import os
from os import path


def split_text_by_byte_size(text, split_byte_size, charcode='utf-8'):
    """Split *text* into chunks whose encoded size fits a byte budget.

    Chunks are cut on character boundaries: a character whose bytes would
    straddle the limit is moved to the next chunk instead of being torn.

    Args:
        text: The string to split.
        split_byte_size: Maximum number of encoded bytes per chunk.
        charcode: Name of the encoding used to measure the byte size.

    Returns:
        A list of chunks in their original order. Splitting stops at the
        first character that does not fit into ``split_byte_size`` bytes on
        its own, so the result is ``[]`` when the very first character does
        not fit (or when ``split_byte_size`` is not positive), and is
        shorter than the input when a later character does not fit.
    """
    if split_byte_size <= 0:
        # No character can fit into an empty (or negative) budget.
        return []

    results = []
    position = 0
    # Iterating instead of recursing keeps the call depth constant, so a
    # large file split into thousands of chunks cannot hit RecursionError.
    while position < len(text):
        # A character occupies at least one byte in ordinary text encodings,
        # so a chunk never spans more than split_byte_size characters.
        # Encoding just that window avoids re-encoding the whole remainder
        # for every chunk.
        window = text[position:position + split_byte_size]
        encoded = window.encode(charcode)
        # errors='ignore' drops the incomplete character left at the cut.
        head = encoded[:split_byte_size].decode(charcode, errors='ignore')
        if not head:
            # The next character alone exceeds the budget: no progress.
            break
        results.append(head)
        position += len(head)

    return results


if __name__ == '__main__':
    # Character encoding used to read, measure and write the text.
    charset = 'utf-8'
    # Maximum number of encoded bytes per output file.
    split_byte_size = 10000

    input_dir = '**入力フォルダ**'
    output_dir = '**出力フォルダ**'
    os.makedirs(output_dir, exist_ok=True)

    for file_path in glob.glob(input_dir + '/*'):
        # The pattern also matches sub-directories, which cannot be opened.
        if not path.isfile(file_path):
            continue

        # Read with the same encoding the split is measured in rather than
        # the locale default. newline='' turns off newline translation so
        # the text keeps the exact bytes it has on disk.
        with open(file_path, 'r', encoding=charset, newline='') as fr:
            text = fr.read()

        split_texts = split_text_by_byte_size(text, split_byte_size, charset)

        # Output files are named "<stem>_<index><extension>".
        stem, ext = path.splitext(path.basename(file_path))
        for index, split_text in enumerate(split_texts):
            output_path = path.join(output_dir, stem + '_' + str(index) + ext)
            # Same encoding and no newline translation on the way out, so
            # each file's size on disk matches the size that was measured.
            with open(output_path, 'w', encoding=charset, newline='') as fw:
                fw.write(split_text)
            print(output_path)

お気持ち

splitの過去分詞形ってsplitなんですね……。

4
3
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
4
3