バイト数の取得
Pythonで文字列のバイト数は
len('**文字列**'.encode('**文字コード**'))
で取得できる。
e.g.
len('あああ'.encode('utf-8'))
バイト数ごとに文字列を分割する面倒くささ
print(len("a".encode('utf-8'))) # => 1
print(len("Ø".encode('utf-8'))) # => 2
print(len("あ".encode('utf-8'))) # => 3
print(len("𧚓".encode('utf-8'))) # => 4
print(len("🤔".encode('utf-8'))) # => 4
print(len("🙋🏽♀️".encode('utf-8'))) # => 17
1文字あたりのバイト数は文字によって異なり(UTF-8では1〜4バイト、複数のコードポイントからなる絵文字はさらに長い)、文字数とバイト数が一致しないので、バイト数で区切るのは結構面倒。
バイト数ごとに文字列を分割する関数
def split_text_by_byte_size(text, split_byte_size, charcode='utf-8'):
    """Split ``text`` into chunks of at most ``split_byte_size`` bytes.

    Chunks are always cut on character boundaries, so a chunk may be
    shorter than ``split_byte_size`` bytes when the next character would
    not fit.

    Args:
        text: The string to split.
        split_byte_size: Maximum size of one chunk in bytes, measured after
            encoding with ``charcode``.
        charcode: Name of the codec used to measure the byte size.

    Returns:
        The chunks in order. Splitting stops at the first character that
        does not fit into ``split_byte_size`` bytes on its own; the chunks
        collected so far are returned and the rest of the text is left out.
        The result is therefore ``[]`` when the very first character does
        not fit, when ``text`` is empty, or when ``split_byte_size`` is not
        positive.
    """
    if split_byte_size <= 0:
        return []
    results = []
    pos = 0
    text_length = len(text)
    # Iterate instead of recursing once per chunk, so that long inputs do
    # not hit the interpreter's recursion limit.
    while pos < text_length:
        # Assumes every character encodes to at least one byte, so the next
        # split_byte_size characters are enough to fill one chunk. Encoding
        # only this window avoids re-encoding the whole remainder each time.
        window = text[pos:pos + split_byte_size]
        # The byte slice may end in the middle of a character;
        # errors='ignore' drops that incomplete trailing character.
        head = window.encode(charcode)[:split_byte_size].decode(
            charcode, errors='ignore')
        if not head:
            # Not even one character fits into the limit: stop here.
            break
        results.append(head)
        pos += len(head)
    return results
こんな感じになる
# Example calls; the expected output follows "# =>".
# A chunk never ends in the middle of a character, so a chunk can be smaller
# than the requested byte size. When the byte size is smaller than the first
# character (e.g. 2 bytes for the 3-byte "あ"), the result is an empty list.
print(split_text_by_byte_size('aaa', 1)) # => ['a', 'a', 'a']
print(split_text_by_byte_size('aaa', 2)) # => ['aa', 'a']
print(split_text_by_byte_size('aaa', 3)) # => ['aaa']
print(split_text_by_byte_size('あいう', 2)) # => []
print(split_text_by_byte_size('あいう', 3)) # => ['あ', 'い', 'う']
print(split_text_by_byte_size('あいう', 4)) # => ['あ', 'い', 'う']
print(split_text_by_byte_size('あいう', 6)) # => ['あい', 'う']
print(split_text_by_byte_size('あいう', 7)) # => ['あい', 'う']
print(split_text_by_byte_size('あいう', 9)) # => ['あいう']
print(split_text_by_byte_size('🤔🤔', 3)) # => []
print(split_text_by_byte_size('🤔🤔', 4)) # => ['🤔', '🤔']
print(split_text_by_byte_size('🤔🤔', 5)) # => ['🤔', '🤔']
print(split_text_by_byte_size('🤔🤔', 8)) # => ['🤔🤔']
print(split_text_by_byte_size('aあ🤔', 4)) # => ['aあ', '🤔']
print(split_text_by_byte_size('aaあ🤔', 4)) # => ['aa', 'あ', '🤔']
print(split_text_by_byte_size('aaあaa🤔', 4)) # => ['aa', 'あa', 'a', '🤔']
decode(..., errors='ignore')
`decode(..., errors='ignore')` がミソ。バイト列を文字の途中で切ってしまっても、末尾の中途半端なバイトを無視してデコードできる。
# "あああ" encodes to 9 bytes in UTF-8 (3 bytes per character).
print("あああ".encode('utf-8')) # => b'\xe3\x81\x82\xe3\x81\x82\xe3\x81\x82'
print("あああ".encode('utf-8').decode('utf-8')) # => あああ
# Decoding a slice that ends partway through the bytes of the third "あ" raises UnicodeDecodeError
print("あああ".encode('utf-8')[0:7].decode('utf-8')) # => UnicodeDecodeError
# With errors='ignore', the incomplete trailing bytes are skipped and the text before them is returned
print("あああ".encode('utf-8')[0:7].decode('utf-8', errors='ignore')) # => ああ
ファイル書き出しの処理とかと合わせる
import glob
import os
from os import path
def split_text_by_byte_size(text, split_byte_size, charcode='utf-8'):
    """Split ``text`` into chunks of at most ``split_byte_size`` bytes.

    Chunks are always cut on character boundaries, so a chunk may be
    shorter than ``split_byte_size`` bytes when the next character would
    not fit.

    Args:
        text: The string to split.
        split_byte_size: Maximum size of one chunk in bytes, measured after
            encoding with ``charcode``.
        charcode: Name of the codec used to measure the byte size.

    Returns:
        The chunks in order. Splitting stops at the first character that
        does not fit into ``split_byte_size`` bytes on its own; the chunks
        collected so far are returned and the rest of the text is left out.
        The result is therefore ``[]`` when the very first character does
        not fit, when ``text`` is empty, or when ``split_byte_size`` is not
        positive.
    """
    if split_byte_size <= 0:
        return []
    results = []
    pos = 0
    text_length = len(text)
    # Iterate instead of recursing once per chunk, so that long inputs do
    # not hit the interpreter's recursion limit.
    while pos < text_length:
        # Assumes every character encodes to at least one byte, so the next
        # split_byte_size characters are enough to fill one chunk. Encoding
        # only this window avoids re-encoding the whole remainder each time.
        window = text[pos:pos + split_byte_size]
        # The byte slice may end in the middle of a character;
        # errors='ignore' drops that incomplete trailing character.
        head = window.encode(charcode)[:split_byte_size].decode(
            charcode, errors='ignore')
        if not head:
            # Not even one character fits into the limit: stop here.
            break
        results.append(head)
        pos += len(head)
    return results
if __name__ == '__main__':
    # Character encoding used to read the input files, to measure chunk
    # sizes, and to write the output files.
    charset = 'utf-8'
    # Maximum size of each output file, in bytes.
    split_byte_size = 10000
    input_dir = '**入力フォルダ**'
    output_dir = '**出力フォルダ**'
    os.makedirs(output_dir, exist_ok=True)
    for file_path in glob.glob(input_dir + '/*'):
        # glob also matches sub-directories, which cannot be opened as text.
        if not path.isfile(file_path):
            continue
        # Read with the same codec that is used to measure the chunks.
        # newline='' turns off newline translation so that the bytes written
        # below match the sizes that were measured.
        with open(file_path, 'r', encoding=charset, newline='') as fr:
            text = fr.read()
        split_texts = split_text_by_byte_size(
            text, split_byte_size, charset)
        # Output files are named "<name>_<index><extension>".
        stem, ext = path.splitext(path.basename(file_path))
        for index, split_text in enumerate(split_texts):
            output_path = path.join(
                output_dir, stem + '_' + str(index) + ext)
            with open(output_path, 'w', encoding=charset, newline='') as fw:
                fw.write(split_text)
            print(output_path)
お気持ち
splitの過去分詞形ってsplitなんですね……。