LoginSignup
2
2

More than 5 years have passed since last update.

CaboChaを活用して「IOB2タグコーパス」学習データを自動生成

Last updated at Posted at 2017-04-26

#coding: utf-8
from __future__ import print_function  # Only needed for Python 2
import MeCab
import CaboCha
import sys
import os


# CaboCha parser shared by the whole script. "-f1 -n1" are CaboCha command-line
# options; presumably -f1 selects the lattice output format and -n1 turns on
# named-entity tagging -- confirm against the CaboCha documentation.
cabocha = CaboCha.Parser("-f1 -n1")
# MeCab tagger created with the "-Ochasen" output option.
# NOTE(review): `m` is not referenced anywhere else in the visible script.
m = MeCab.Tagger ("-Ochasen")

# For reading from file
class getWords:
    """Tiny helper that loads the raw input text from disk."""

    def readText(self, filename):
        """Return the full contents of ``filename`` decoded as UTF-8.

        The file is opened in text mode and read in one go; the context
        manager closes it again before the text is handed back.
        """
        with open(filename, mode='r', encoding='utf-8') as handle:
            return handle.read()

# Usage: python training_generator <text file>
#
# The path of the text to annotate is the first command-line argument. The
# same path is reused further down as the prefix of the generated output file.
if len(sys.argv) < 2:
    # Fail with a readable message instead of a bare IndexError.
    sys.exit("Usage: python training_generator <text file>")

file_output = sys.argv[1]

# Read the input exactly once, as UTF-8. An earlier extra read with the
# locale's default encoding was discarded immediately and could raise
# UnicodeDecodeError on non-UTF-8 systems, so it has been removed.
getText = getWords()
text = getText.readText(file_output)

# Parse the whole input text with the module-level CaboCha parser.
cabocha_text = cabocha.parseToString(text)

# Long named-entity labels paired with the three-letter abbreviations that
# replace them. The pairs are applied in this order, one after another.
_TAG_ABBREVIATIONS = (
    ("B-ORGANIZATION", "B-ORG"),
    ("I-ORGANIZATION", "I-ORG"),
    ("B-ARTIFACT", "B-ART"),
    ("I-ARTIFACT", "I-ART"),
    ("B-LOCATION", "B-LOC"),
    ("I-LOCATION", "I-LOC"),
    ("B-DATE", "B-DAT"),
    ("I-DATE", "I-DAT"),
    ("B-TIME", "B-TIM"),
    ("I-TIME", "I-TIM"),
    ("B-PERSON", "B-PSN"),
    ("I-PERSON", "I-PSN"),
    ("B-MONEY", "B-MNY"),
    ("I-MONEY", "I-MNY"),
    ("B-PERCENT", "B-PNT"),
    ("I-PERCENT", "I-PNT"),
)
for _long_tag, _short_tag in _TAG_ABBREVIATIONS:
    cabocha_text = cabocha_text.replace(_long_tag, _short_tag)

# Turn the comma-separated fields into tab-separated columns. Every comma in
# the parser output is replaced, including any that occur in the token text.
cabocha_text = cabocha_text.replace(",", "\t")

# The corpus is written next to the input file, e.g. "input.txt_generated.txt".
filename = file_output + '_generated.txt'

# Build the output in memory in a single pass instead of re-opening the file
# in append mode for every line:
#   * lines starting with '*' are dropped,
#   * every other line is kept as-is,
#   * an empty line is inserted after each line that starts with '。'.
generated = []
for line in cabocha_text.splitlines():
    if not line.startswith('*'):
        generated.append(line + '\n')
    if line.startswith('。'):
        generated.append('\n')

# The final two generated lines are discarded, exactly as before --
# presumably the parser's closing "EOS" marker and the blank separator in
# front of it. TODO confirm for inputs that do not end with '。'.
lines = generated[:-1]

# Mode 'w' truncates any previous output, so no explicit delete is needed.
# UTF-8 is stated explicitly to match how the input text is read.
with open(filename, 'w', encoding='utf-8') as w:
    w.writelines(lines[:-1])

Next Step: Fix tags to suit your needs

Reference:
http://qiita.com/Hironsan/items/326b66711eb4196aa9d4
https://github.com/Hironsan/IOB2Corpus

2
2
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
2
2