はじめに
Pythonで自然言語処理ライブラリ（NLTK）を使って遊んでみました。
準備するもの
from pdfminer.high_level import extract_text
from nltk.stem import WordNetLemmatizer
import nltk
script.py
from pdfminer.high_level import extract_text
from nltk.stem import WordNetLemmatizer
import nltk
# Pipeline: PDF -> plain text -> tokens -> POS tags -> lemmas.

# Extract the text of the sample PDF as one string.
text = extract_text(r'/Users/username/Desktop/004.文字起こしOCR/Project001/RowData/サンプル.pdf')
# Remove the line breaks present in the extracted text ...
text = text.replace('\n', '')
# ... then start a new line after every Japanese full stop.
text = text.replace('。', '。\n')

# Fetch the NLTK resources used below.
nltk.download('punkt')  # tokenizer models used by word_tokenize
nltk.download('averaged_perceptron_tagger')  # model used by pos_tag
nltk.download('wordnet')  # corpus WordNetLemmatizer looks words up in
# NOTE(review): recent NLTK releases reportedly look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead -- confirm against the installed version.

# Split the text into word tokens.
morph = nltk.word_tokenize(text)
print(morph)

# Attach a part-of-speech tag to every token.
pos = nltk.pos_tag(morph)
print(pos) # [('Hi', 'NNP'), (',', ','), ('I', 'PRP'), ("'m", 'VBP'), ('Taro', 'JJ'), ('Yamada', 'NNP'), ('I', 'PRP'), ('woke', 'VBD'), ('up', 'RB'), ('at', 'IN'), ('8am', 'CD')]

lemmatizer = WordNetLemmatizer()
# lemmatize() handles a single word, so it is applied token by token;
# passing the whole document string would just hand the text back unchanged.
print([lemmatizer.lemmatize(token) for token in morph])  # default pos="n": 'getting' stays 'getting'
print([lemmatizer.lemmatize(token, pos="v") for token in morph])  # treated as verbs: 'getting' -> 'get'