
NLP: How to Collect Training Data by Web Scraping

I recently decided, rather belatedly, to build a bot, but I struggled with gathering the training data needed to build a model with machine learning. So this article focuses only on collecting training data, and shares the approach I actually used.

The approach I took this time is to extract training data from the character dialogue posted on http://ssmania.info/category/ .

The overall flow is:
1. Collect links
Gather the article URLs listed on http://ssmania.info/category/ into a single file.
2. Collect exchanges from the links
From the collected URLs, save each page's dialogue into two files, "input.txt" and "output.txt" (the two files are meant to pair up line by line; see the sketch after this list).
3. Split the exchanges into words
Use MeCab to split the text in "input.txt" and "output.txt" into words.
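The intent is that line i of input.txt holds an utterance and line i of output.txt holds the reply that follows it, so the two files can be zipped into (utterance, reply) pairs for seq2seq training. A minimal sketch of reading them back (not part of the scripts below, and note the pairing can drift on pages with an odd number of quotes):

# Pair line i of input.txt with line i of output.txt
with open("./input.txt", encoding="utf-8") as fi, open("./output.txt", encoding="utf-8") as fo:
    pairs = [(q.rstrip("\n"), a.rstrip("\n")) for q, a in zip(fi, fo)]
print(pairs[:3])  # first few (utterance, reply) pairs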

Prerequisites

OS: Windows 10
Language: Python 3.6.4

Table of contents

1. Collect links
2. Collect exchanges from the links
3. Split the exchanges into words
4. Results

1. Collect links

On http://ssmania.info/category/ , select the anime whose exchanges you want to collect.
Example 1) Choose the anime you want data for, as shown below.
(screenshot: 無題.png)
Example 2) Once you are on the category page, copy its URL and paste it into the [self.base_url = '※※※'] assignment near the top of CollectLinks.py, replacing ※※※.
(screenshot: 02.png)
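For the category used in this article, the assignment ends up looking like the line below (replace the URL with whatever category page you selected):

# Category page chosen in step 1 (765プロ here); replace with your own category URL
self.base_url = "http://ssmania.info/category/765%E3%83%97%E3%83%AD"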

CollectLinks.py
#!/Users/igaki/.pyenv/shims/python
# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup


class CharaScraping(object):
    def __init__(self):
        # Links back to the portal itself are skipped; only links out to the SS articles are kept
        self.exclusion_url = "http://ssmania.info/"
        # Paste the URL of the category page chosen in step 1 here
        self.base_url = "http://ssmania.info/category/765%E3%83%97%E3%83%AD"

    def _boardLinkSearch(self):
        self.board_list_url = self.base_url + "?page={}"
        page = 1
        while True:
            link = list()
            print(page)
            board_top_html = requests.get(self.board_list_url.format(page))
            board_top_soup = BeautifulSoup(board_top_html.content, "html.parser")
            main_contents = board_top_soup.find(id="contents")
            # Stop once the site reports there are no more articles (or the expected element is missing)
            alink = board_top_soup.find(class_="alink")
            if alink is None or alink.text == "該当記事はありませんでした":
                print("finish")
                break
            for a in main_contents.find_all('a'):
                tmp_link = a.get('href')
                # Keep only links that point away from ssmania.info itself
                if tmp_link and self.exclusion_url not in tmp_link:
                    link.append(tmp_link)
            # Append this page's links to charaLink.txt, one URL per line
            with open("./charaLink.txt", "a") as f:
                f.write("\n" + "\n".join(link))

            page += 1


def main():
    scraping = CharaScraping()
    scraping._boardLinkSearch()


if __name__ == "__main__":
    main()
    print("search Finish")

2. Collect exchanges from the links

Collect the exchanges between characters from the file of links gathered in step 1. Because each blog hosting the SS uses a different page structure, the script looks at the domain of each URL and dispatches to a site-specific handler.
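As a minimal illustration of that dispatch (the URL below is a hypothetical example, not one of the collected links):

# Index 2 of the "/"-split URL is the domain, which selects the handler in self.urlList
url = "http://blog.livedoor.jp/some_blog/archives/12345.html"  # hypothetical example URL
print(url.split("/"))     # ['http:', '', 'blog.livedoor.jp', 'some_blog', 'archives', '12345.html']
print(url.split("/")[2])  # 'blog.livedoor.jp' -> handled by _livedoor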

CollectWords.py
#!/Users/igaki/.pyenv/shims/python
# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup
import types


class WordScraping(object):
    def __init__(self):
        # Map each blog domain to the handler that knows how to extract the article body from that site
        self.urlList = {
            "blog.livedoor.jp": self._livedoor,
            "ssbiyori.blog.fc2.com": self._ssbiyori,
            "yomicom.jp": self._yomicom,
            "potittoss.blog.jp": self._potittoss,
            "ss-m.net": self._ss_m,
            "s2-log.com": self._s2_log,
            "ss-navi.com": self._ss_navi,
            "horahorazoon.blog134.fc2.com": self._horahorazoon,
            "ayame2nd.blog.jp": self._ayame2nd,
            "ssimas72.blog.jp": self._ssimas72,
            "elephant.2chblog.jp": self._elephant,
            "morikinoko.com": self._morikinoko,
            "amnesiataizen.blog.fc2.com": self._amnesiataizen,
            "ssblog614.blog.fc2.com": self._ssblog614,
            "sssokuhou.com": self._sssokuhou,
            "invariant0.blog130.fc2.com": self._invariant0,
            "darusoku.xyz": self._darusoku,
            "ss-station.2chblog.jp": self._ss_station,
            "minnanohimatubushi.2chblog.jp": self._minnanohimatubushi,
            "ssmansion.xyz": self._ssmansion,
            "142ch.blog90.fc2.com": self._142ch,
            "tangaron3.sakura.ne.jp": self._tangaron3,
            "ssmaster.blog.jp": self._ssmaster,
            "dousoku.net": self._dousoku,
            "ssflash.net": self._ssflash,
            "lclc.blog.jp": self._lclc,
            "www.lovelive-ss.com": self._www,
            "maoyuss.blog.fc2.com": self._maoyuss,
            "ssspecial578.blog135.fc2.com": self._ssspecial578,
        }

    def _fileRead(self):
        # Hand every collected URL to the handler that matches its domain
        with open("./charaLink.txt", "r") as f:
            for link in f.readlines():
                link = link.strip()
                if not link:
                    continue
                handler = self.urlList.get(link.split("/")[2])  # the domain decides the handler
                if handler is not None:
                    handler(link)

    def _takeCharaWord(self,word):
        # Pull every 「...」 / 『...』 quote out of the article text and write the quotes
        # alternately to input.txt (utterance) and output.txt (reply)
        char_num = len(word)
        flag = 0  # 0 -> next quote goes to input.txt, 1 -> next quote goes to output.txt
        # Written as UTF-8 so WordsMecab.py can read the files back with the same encoding
        with open("./input.txt", "a", encoding="utf-8") as inputfile, \
                open("./output.txt", "a", encoding="utf-8") as outputfile:
            num = 0
            while num < char_num:
                if word[num] in ("「", "『"):
                    close_bracket = "」" if word[num] == "「" else "』"
                    num += 1
                    target = inputfile if flag == 0 else outputfile
                    while num < char_num and word[num] != close_bracket:
                        target.write(word[num])
                        num += 1
                    target.write("\n")
                    flag = 1 - flag  # alternate between the two files
                num += 1

    def _livedoor(self,url):
        livedoor_html = requests.get(url)
        livedoor_html_soup = BeautifulSoup(livedoor_html.content, "html.parser")
        if livedoor_html_soup.find(id="character") == None:
            try:
                self._takeCharaWord(livedoor_html_soup.find(class_="article-body-inner").text)
            except:
                pass

    def _ssbiyori(self,url):
        pass

    def _yomicom(self,url):
        livedoor_html = requests.get(url)
        livedoor_html_soup = BeautifulSoup(livedoor_html.content, "html.parser")
        # The BeautifulSoup keyword is class_, not _class
        if livedoor_html_soup.find(class_="sh_heading_main_b") is None:
            try:
                self._takeCharaWord(livedoor_html_soup.find(class_="ently_text").text)
            except:
                pass


    def _potittoss(self,url):
        livedoor_html = requests.get(url)
        livedoor_html_soup = BeautifulSoup(livedoor_html.content, "html.parser")
        if livedoor_html_soup.find(id="character") == None:
            try:
                self._takeCharaWord(livedoor_html_soup.find(class_="article-body-inner").text)
            except:
                pass

    def _ss_m(self,url):
        pass

    def _s2_log(self,url):
        livedoor_html = requests.get(url)
        livedoor_html_soup = BeautifulSoup(livedoor_html.content, "html.parser")
        if livedoor_html_soup.find(id="character") == None:
            try:
                self._takeCharaWord(livedoor_html_soup.find(class_="article-body-inner").text)
            except:
                pass

    def _ss_navi(self,url):
        pass

    def _horahorazoon(self,url):
        pass

    def _ayame2nd(self,url):
        livedoor_html = requests.get(url)
        livedoor_html_soup = BeautifulSoup(livedoor_html.content, "html.parser")
        if livedoor_html_soup.find(id="character") == None:
            try:
                self._takeCharaWord(livedoor_html_soup.find(class_="article-body-inner").text)
            except:
                pass

    def _ssimas72(self,url):
        livedoor_html = requests.get(url)
        livedoor_html_soup = BeautifulSoup(livedoor_html.content, "html.parser")
        if livedoor_html_soup.find(id="character") == None:
            try:
                self._takeCharaWord(livedoor_html_soup.find(class_="article-body-inner").text)
            except:
                pass

    def _elephant(self,url):
        livedoor_html = requests.get(url)
        livedoor_html_soup = BeautifulSoup(livedoor_html.content, "html.parser")
        if livedoor_html_soup.find(id="character") == None:
            self._takeCharaWord(livedoor_html_soup.find(class_="article").text)

    def _morikinoko(self,url):
        livedoor_html = requests.get(url)
        livedoor_html_soup = BeautifulSoup(livedoor_html.content, "html.parser")
        if livedoor_html_soup.find(id="character") == None:
            try:
                self._takeCharaWord(livedoor_html_soup.find(class_="article-body-inner").text)
            except:
                pass

    def _amnesiataizen(self,url):
        pass

    def _ssblog614(self,url):
        pass

    def _sssokuhou(self,url):
        livedoor_html = requests.get(url)
        livedoor_html_soup = BeautifulSoup(livedoor_html.content, "html.parser")
        if livedoor_html_soup.find(id="character") == None:
            try:
                self._takeCharaWord(livedoor_html_soup.find(class_="article-body-inner").text)
            except:
                pass


    def _invariant0(self,url):
        pass

    def _darusoku(self,url):
        pass

    def _ss_station(self,url):
        livedoor_html = requests.get(url)
        livedoor_html_soup = BeautifulSoup(livedoor_html.content, "html.parser")
        if livedoor_html_soup.find(id="character") == None:
            try:
                self._takeCharaWord(livedoor_html_soup.find(class_="article-body-inner").text)
            except:
                pass

    def _minnanohimatubushi(self,url):
        livedoor_html = requests.get(url)
        livedoor_html_soup = BeautifulSoup(livedoor_html.content, "html.parser")
        if livedoor_html_soup.find(id="character") == None:
            try:
                self._takeCharaWord(livedoor_html_soup.find(class_="article-body-inner").text)
            except:
                pass

    def _ssmansion(self,url):
        pass

    def _142ch(self,url):
        pass

    def _tangaron3(self,url):
        pass

    def _ssmaster(self,url):
        livedoor_html = requests.get(url)
        livedoor_html_soup = BeautifulSoup(livedoor_html.content, "html.parser")
        if livedoor_html_soup.find(id="character") == None:
            try:
                self._takeCharaWord(livedoor_html_soup.find(class_="article-body-inner").text)
            except:
                pass

    def _dousoku(self,url):
        livedoor_html = requests.get(url)
        livedoor_html_soup = BeautifulSoup(livedoor_html.content, "html.parser")
        if livedoor_html_soup.find(id="character") == None:
            try:
                self._takeCharaWord(livedoor_html_soup.find(class_="article-body-inner").text)
            except:
                pass

    def _ssflash(self,url):
        livedoor_html = requests.get(url)
        livedoor_html_soup = BeautifulSoup(livedoor_html.content, "html.parser")
        if livedoor_html_soup.find(id="character") == None:
            try:
                self._takeCharaWord(livedoor_html_soup.find(class_="article-body-inner").text)
            except:
                pass

    def _lclc(self,url):
        livedoor_html = requests.get(url)
        livedoor_html_soup = BeautifulSoup(livedoor_html.content, "html.parser")
        if livedoor_html_soup.find(id="character") == None:
            try:
                self._takeCharaWord(livedoor_html_soup.find(class_="article-body-inner").text)
            except:
                pass

    def _www(self,url):
        pass

    def _maoyuss(self,url):
        pass

    def _ssspecial578(self,url):
        pass


def temp():
    # Helper used while building this script: prints the unique domains found in
    # charaLink.txt, which is how the keys of self.urlList were gathered
    link_list = list()
    with open("./charaLink.txt", "r") as f:
        for link in f:
            link_list.append(link)
    link_type = list()
    for link in link_list:
        if link.split("/")[2] not in link_type:
            link_type.append(link.split("/")[2])
    print(len(link_type))
    for link in link_type:
        print(link)

def main():
    scraping = WordScraping()
    scraping._fileRead()



if __name__ == "__main__":
    main()
    print("search Finish")

3. Split the exchanges into words

To train with seq2seq, the sentences have to be split into words, so this time I used MeCab, an open-source morphological analyzer.
The script reads the collected data from "input.txt" and "output.txt" and writes the tokenized text to "input_result.txt" and "output_result.txt", respectively.
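As a quick check that MeCab itself is set up (this assumes the MeCab Python bindings and a dictionary that supports the ChaSen output format, the same setup the script below relies on):

import MeCab

tagger = MeCab.Tagger("-Ochasen")
parsed = tagger.parse("今日なんで事務所来たの?")
# Each ChaSen-format line starts with the surface form, so the first tab-separated
# field is the token; the trailing "EOS" line is skipped.
tokens = [line.split("\t")[0] for line in parsed.split("\n") if line and line != "EOS"]
print(" ".join(tokens))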

WordsMecab.py
# -*- coding: utf-8 -*-
import MeCab

tagger = MeCab.Tagger("-Ochasen")


def tokenize_file(src_path, dst_path):
    # Split every line of src_path into words with MeCab and write the
    # space-separated tokens, one sentence per line, to dst_path
    with open(src_path, "r", encoding="utf-8-sig") as f, \
            open(dst_path, "a", encoding="utf-8") as result_file:
        for line in f.readlines():
            for word in tagger.parse(line).split("\n"):
                surface = word.split("\t")[0]
                if surface and surface != "EOS":
                    result_file.write(surface)
                    result_file.write(" ")
            result_file.write("\n")


tokenize_file("./input.txt", "./input_result.txt")
tokenize_file("./output.txt", "./output_result.txt")

4. Results

In the end, I was able to collect data like the following.

output_result.txt
...
うーん 、 ブラック の 缶 珈琲 って なんとなく 味気 ない 気 が し て  
で さぁ 春香  
今日 な んで 事務所 来 た の ?  
いや 、 今日 春香 オフ じゃん  
… それ 、 先週 別 の 日 に なっ たって 言っ た と 思う ん だ けど …  
...
