More than 5 years have passed since last update.

日本語抽出ツール

Python

Last updated at 2019-01-30 / Posted at 2019-01-30

ソース内の日本語をすべて英語に置き換えるような作業はよくあると思いますが、
下記の2つのツールを使えば、作業をスムーズに進められると思います。

１）日本語抽出ツールを使って、システム内の日本語を洗い出す（結果はCSV形式で出力される）。
２）CSVをローカライズしてもらう。
３）置換ツールを使って日本語を英語に変更する。

日本語抽出ツール

search.py

# -*- coding: utf-8 -*-
import os,sys
import re
import codecs
import unicodedata
import csv
import operator
reload(sys)
sys.setdefaultencoding("utf8")
# Language Translator
from googletrans import Translator
translator = Translator()

# Character classes used to recognise Japanese text: CJK unified ideographs,
# katakana and hiragana.
kanjiPattern = re.compile(u"[\u4E00-\u9FFF]+")
katakanaPattern = re.compile(u"[\u30a0-\u30ff]+")
hiraganaPattern = re.compile(u"[\u3040-\u309f]+")

# Extensions skipped by checkDIR when choice_tpye_list is empty.
except_file_type = [
    '.asta', '.csv', '.xlsx', '.json', '.h', '.sql', '.class', '.sh',
    '.txt', '.jpg', '.png', '.xls', '.svn-base',
]
# Paths matching this regex (case-insensitively) are never scanned.
except_file_like = r'test|debug|target|cocos2d|tools|Libraries'
# Extensions to scan. When this list is empty, every extension that is not
# in except_file_type is scanned instead.
# e.g. ['.html', '.js', '.css', '.java', '.cpp', '.mm', '.xml']
choice_tpye_list = ['.js']
# NOTE(review): not referenced by any code visible in this file.
quationFiles_list = ['.js', '.css', '.java', '.cpp', '.mm']
# Path of the output CSV; assigned by start().
save_path = ''

import time

# Timestamp embedded in the output file name.
timestr = time.strftime("%Y%m%d-%H%M%S")

def matchJapanese(str):
    """Return the first kanji, katakana or hiragana match in *str*.

    The three patterns are tried in that order; the result is the match
    object of the first pattern that hits, or None when none of them does.
    """
    for pattern in (kanjiPattern, katakanaPattern, hiraganaPattern):
        hit = pattern.search(str)
        if hit:
            return hit
    return None


def start():
    """Scan the paths given on the command line, then sort the report.

    ``sys.argv[1]`` is a comma-separated list of files and/or directories
    to search. Each one is handed to checkDIR(), which appends its findings
    to the '#'-delimited report at ``save_path``. Afterwards the report is
    rewritten with its rows ordered by text length (third column), longest
    first.
    """
    global save_path
    try:
        paths = sys.argv[1]
    except IndexError:
        paths = ''

    save_path = os.path.join(os.getcwd(), 'check_jp_' + timestr + '.csv')

    print("".join(choice_tpye_list))
    for path in paths.split(','):
        checkDIR(path)

    # No file was scanned, so there is no report to sort.
    if not os.path.isfile(save_path):
        return

    # Read the report once, after every path has been processed, and close
    # it before it is reopened for writing.
    with open(save_path) as report:
        # Rows without a length column (e.g. blank lines) cannot be sorted.
        rows = [row for row in csv.reader(report, delimiter='#')
                if len(row) > 2]

    # Sort by character count, longest first.
    rows.sort(key=lambda row: int(row[2]), reverse=True)

    with open(save_path, 'wt') as report:
        writer = csv.writer(report, delimiter='#', lineterminator='\n')
        writer.writerows(rows)


def sortcsvfiles(inputfilename, outputfilename):
    """Copy a CSV file with its data rows sorted numerically.

    The first row is treated as a header and written back unchanged (when
    it is non-empty). Remaining rows with fewer than two fields are
    dropped; the others are ordered by the integer values of their first
    and second columns.
    """
    with open(inputfilename, 'rt') as source:
        records = csv.reader(source)
        header_row = next(records, None)
        body = [record for record in records if len(record) > 1]

    body.sort(key=lambda record: (int(record[0]), int(record[1])))

    with open(outputfilename, 'wt') as target:
        out = csv.writer(target, lineterminator='\n')
        if header_row:
            out.writerow(header_row)
        for record in body:
            out.writerow(record)

def checkDIR(path):
    """Recursively hand every eligible file under *path* to replaceFile().

    A file is eligible when its extension is listed in choice_tpye_list;
    if that list is empty, it is eligible when its extension is not listed
    in except_file_type. Directories are descended into. Anything that is
    neither a file nor a directory is reported on stdout.
    """
    if os.path.isfile(path):
        extension = os.path.splitext(path)[1]
        if choice_tpye_list:
            wanted = extension in choice_tpye_list
        else:
            wanted = extension not in except_file_type
        if wanted:
            replaceFile(path)
        return

    if not os.path.isdir(path):
        print('---Wrong File---' + path)
        return

    children = [os.path.join(path, entry) for entry in os.listdir(path)]
    for child in children:
        checkDIR(child)

_BLOCK_COMMENT_RE = re.compile(r"/\*.*?\*/", re.DOTALL)   # /* COMMENT */
_LINE_COMMENT_RE = re.compile(r"//.*?\n")                 # // COMMENT
_HTML_COMMENT_RE = re.compile(r"(<!--.*?-->)", re.DOTALL)  # <!-- COMMENT -->

# A line whose first non-blank text starts with one of these is discarded.
# The comparison is done on the lower-cased line, so every entry must be
# lower case.
_IGNORED_PREFIXES = (
    "new", "ccassert", "qblogsv", "log", "cclog", "public", "cocos2d::log",
    "return", "void", "@", "*", "throw", ".", "console.log",
)

# A line containing one of these anywhere is discarded. Matching is
# case-sensitive so that the upper-case "LOG" entry can actually match.
_IGNORED_KEYWORDS = ("テスト", "<%", "実装", "クラス", "LOG", "★", "//", "<<")


def removeComments(string):
    """Strip comments from a piece of source text and drop ignorable lines.

    Args:
        string: the text to clean (replaceFile passes one line at a time).

    Returns:
        An empty string when the text starts with an ignored prefix or
        contains an ignored keyword; otherwise the text with ``/* */``,
        ``// ...<newline>`` and ``<!-- -->`` comments removed and leading
        whitespace stripped.
    """
    if string.lstrip().lower().startswith(_IGNORED_PREFIXES):
        return ""

    string = _BLOCK_COMMENT_RE.sub("", string)
    string = _LINE_COMMENT_RE.sub("", string)
    string = _HTML_COMMENT_RE.sub("", string)

    # "in" also catches a keyword at index 0, which "find(...) > 0" missed.
    for value in _IGNORED_KEYWORDS:
        if value in string:
            return ""

    return string.lstrip()

def getStringFromQua(string):
    """Return the first double-quoted substring that contains Japanese.

    Every ``"..."`` substring of *string* is examined in order, not just
    the first one. The returned text keeps its surrounding double quotes.

    Returns:
        The first quoted substring for which matchJapanese() succeeds, or
        an empty string when there is none (including when *string* has no
        quoted substring at all).
    """
    for value in re.findall('"[^"]*"', string):
        if matchJapanese(value):
            return value
    return ""

def getStringFromHtml(raw_html):
    """Return the text between the first '>' and the last '<' on a line.

    The match is greedy and does not cross newlines. When *raw_html* has no
    ``>...<`` span it is returned unchanged, without printing anything.
    """
    inner = re.search(r'>(.*)<', raw_html)
    if inner is None:
        return raw_html
    return inner.group(1)

def ifHasQuationGetString(strOrgin):
    """Pick a quoted substring out of *strOrgin*.

    If the text has any single-quoted span, the content of the last one is
    returned without its quotes. Otherwise, if it has any double-quoted
    span, the last one is returned including its quotes. An empty string
    is returned when there is no quoted span.
    """
    double_quoted = re.findall('"[^"]*"', strOrgin)
    single_quoted = re.findall("(?<=')[^']+(?=')", strOrgin)

    # Single-quoted spans take precedence over double-quoted ones.
    if single_quoted:
        return single_quoted[-1]
    if double_quoted:
        return double_quoted[-1]
    return ""

def checkIfKeysInString(keys, string):
    """Return True when at least one item of *keys* occurs in *string*."""
    for key in keys:
        if key in string:
            return True
    return False

def replaceFile(file):
    """Append the Japanese text found in one source file to the report.

    Each line is decoded as UTF-8, cleaned with removeComments(), reduced
    to its HTML inner text and/or quoted string, and recorded when the
    result still contains Japanese. Rows are appended to ``save_path`` as
    ``path#line number#text length#text``.

    Args:
        file: path of the source file to scan.

    Returns:
        False when the path matches ``except_file_like`` and is skipped;
        None otherwise.
    """
    if re.search(except_file_like, file, re.IGNORECASE):
        return False

    found = []
    with open(file, 'r') as f:
        for num, raw_line in enumerate(f, 1):
            try:
                line = raw_line.decode('utf-8')
            except UnicodeDecodeError:
                # Not valid UTF-8: keep working on the undecoded line.
                line = raw_line

            # '#' is the report's field separator, so only the text in
            # front of the first '#' is examined.
            candidate = removeComments(line).split('#')[0]
            if not matchJapanese(candidate):
                continue

            text = getStringFromHtml(candidate)
            quoted = ifHasQuationGetString(text)
            if quoted != "":
                text = quoted

            if matchJapanese(text):
                text = text.replace('\n', '').replace('\r', '').strip()
                found.append((file, num, len(text), text))

    # Opened in append mode even when nothing was found, so the report
    # file exists after the first scanned file.
    with codecs.open(save_path, 'a', "utf-8") as out:
        for row in found:
            out.write('%s#%s#%s#%s\n' % row)


if __name__ == '__main__':
    # Run the extraction, then report where the CSV was written.
    start()
    print('Output:%s' % save_path)

置換ツール

search.py

# -*- coding: utf-8 -*-
import csv
import fileinput
import os,sys
import re
import codecs
reload(sys)
sys.setdefaultencoding("utf8")

# Localization file: '#'-delimited rows whose first column is the text to
# find and whose second column is its replacement.
csvPath = 'C:/Users/r.cho/.m2/repository/git/magica/etc/tools/2019CPP.csv'

# Root of the source tree to rewrite. Alternative roots:
# source_dir = 'C:/Users/r.cho/.m2/repository/git/magica'
# source_dir = 'D:/00_Docs/witch/sourcet'
source_dir = 'C:/Users/r.cho/.m2/repository/git/client'
# NOTE(review): except_file_like and except_file_type are only referenced
# from commented-out code in this script.
except_file_like = r'\.git|test|target'
except_file_type = [
    '.java', '.db', '.md', '.xml', '.asta', '.vfxb', '.mtn', '.plist',
    '.ExportJson', '.vfxj', '.moc', '.css', '.csv', '.xlsx', '.json', '.h',
    '.sql', '.class', '.sh', '.txt', '.jpg', '.png', '.xls', '.svn-base',
]
# Extensions of the files that get rewritten.
# e.g. ['.html', '.js', '.css', '.java', '.cpp', '.mm', '.xml']
file_type = ['.cpp']

def start():
    """Apply the replacements to everything under ``source_dir``."""
    checkDIR(source_dir)

def checkDIR(path):
    """Recursively pass every file with a listed extension to replaceFile().

    Files whose extension is in ``file_type`` are rewritten, directories
    are descended into, and anything that is neither a file nor a
    directory is reported on stdout.
    """
    if os.path.isfile(path):
        extension = os.path.splitext(path)[1]
        if extension in file_type:
            replaceFile(path)
        return

    if not os.path.isdir(path):
        print('---Wrong File---' + path)
        return

    children = [os.path.join(path, name) for name in os.listdir(path)]
    for child in children:
        checkDIR(child)

def replaceFile(fileName):
    """Apply every replacement from the localization CSV to one file.

    The CSV at ``csvPath`` is read with '#' as delimiter; for each row with
    at least two columns, every occurrence of the first column in the file
    is replaced by the second column, in CSV order. The file is rewritten,
    and its name printed, only when at least one replacement changed it.

    Args:
        fileName: path of the source file to rewrite in place.
    """
    print("###### START ##########")
    with open(fileName, 'r') as sourceFile:
        original = sourceFile.read()

    updated = original
    with open(csvPath, 'r') as csv_file:
        for row in csv.reader(csv_file, delimiter='#'):
            # An empty search string would make str.replace insert the
            # replacement between every character of the file.
            if len(row) > 1 and row[0]:
                updated = updated.replace(row[0], row[1])

    # Leave files without any match untouched.
    if updated == original:
        return

    with open(fileName, 'w') as target:
        print("####" + fileName)
        target.write(updated)


if __name__ == '__main__':
    # Entry point when executed as a script: rewrite the configured tree.
    start()

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up