0
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

More than 1 year has passed since last update.

言語処理学会 論文検索支援コード

Last updated at Posted at 2021-12-11

言語処理論文を検索するときに使えます。

colabのコードです。そのままコピペで使えます。

お探しのkeywordをいれて使ってください。

import requests 
from bs4 import BeautifulSoup
import time
from tqdm.auto import tqdm
def crawler_title(year = 2021, refyear = 2015, index = 0):
  """Fetch the paper titles from one year of the ANLP annual meeting site.

  Downloads the proceedings index page for ``refyear + index`` and collects
  the text of every ``<span class="title">`` inside the session table.
  Note: ``year`` is accepted for call-site symmetry with ``search`` but is
  not used by this function.
  """
  meeting_year = refyear + index
  page = "https://www.anlp.jp/proceedings/annual_meeting/" + str(meeting_year) + "/index.html#session_list"
  ua = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
  resp = requests.get(page, headers=ua)
  # requests reports ISO-8859-1 when the server sent no charset header;
  # in that case pass None so BeautifulSoup sniffs the encoding itself.
  enc = None if resp.encoding == 'ISO-8859-1' else resp.encoding
  soup = BeautifulSoup(resp.content, 'html.parser', from_encoding=enc)
  session_area = soup.find("div", {'class': 'span9'})
  titles = []
  for cell in session_area.find_all("td"):
    span = cell.find("span", {'class': 'title'})
    if span is not None:
      titles.append(span.text)
  return titles
def search(year = 2021, refyear = 2015, notsearch = "", list_of_words = ["default"]):
  """Collect proceedings titles from refyear..year containing every word.

  Parameters
  ----------
  year, refyear : int
      Inclusive range of meeting years to crawl (refyear up to year).
  notsearch : str
      Titles containing this substring are excluded. An empty string
      disables the filter. (Bug fix: previously ``notsearch not in title``
      was evaluated with the default ``""`` — and ``"" in s`` is always
      True — so no title could ever match.)
  list_of_words : list of str
      Every word must appear in a title for it to be kept.

  Returns
  -------
  list of str, or None when ``list_of_words`` is empty.
  """
  if len(list_of_words) == 0:
    return
  res = []
  for i in tqdm(range(0, year - refyear + 1)):
    for title in crawler_title(year, refyear, i):
      # Apply the exclusion filter only when non-empty, because the empty
      # string is a substring of every title.
      if notsearch and notsearch in title:
        continue
      if all(word in title for word in list_of_words):
        res.append(title)
  return res
def search2(sch, ls):
  """Print every element of *ls* that contains *sch* as a substring."""
  matches = (entry for entry in ls if sch in entry)
  for entry in matches:
    print(entry)
# Search configuration: crawl the proceedings from refyear through year.
year = 2021
refyear = 2015

# Every word in this list must appear in a matching title.
test = ["雑談"]
# refyear = 2015, year = 2021 searches the 2015-2021 proceedings.
a = search(year=year, refyear=refyear, list_of_words=test)
print(a)
# Interactive mode: space-separated keywords, e.g. 分析 生成
# An empty line ends the loop.
while 1:
  word = input()
  if word == "":
    break
  # Bug fix: split the input into words. Passing the raw string made
  # `search` iterate over single characters (including the spaces), not
  # over the space-separated keywords the prompt promises.
  a = search(list_of_words=word.split())
print(a)
# Newline-separated input works too, e.g.:
# 分析
# 生成
"""
実行例:
分析 生成
2015
2016
2017
2018
2019
2020
2021
['崩れ表記語の生成確率を用いた表記正規化と形態素解析',  ...,']
"""
"""
実行例2:
分析
生成

2015
2016
2017
2018
2019
2020
2021
['崩れ表記語の生成確率を用いた表記正規化と形態素解析', ...,']
"""

Variant: show the results as a pandas DataFrame with clickable download links.

import requests 
from bs4 import BeautifulSoup
import time
from tqdm.auto import tqdm
def crawler_title(year = 2021, refyear = 2015, index = 0):
  """Fetch "<year>,<title>,<url>" rows for one ANLP annual meeting.

  Walks the session table row by row, remembering the most recently seen
  ``<span class="title">``; every row that carries an ``<a>`` link yields a
  comma-joined record pairing that title with the link's absolute URL.
  ``year`` is unused; the page fetched is the one for ``refyear + index``.
  """
  meeting_year = refyear + index
  base = "https://www.anlp.jp/proceedings/annual_meeting/" + str(meeting_year)
  page = "https://www.anlp.jp/proceedings/annual_meeting/" + str(meeting_year) + "/index.html#session_list"
  ua = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
  resp = requests.get(page, headers=ua)
  # ISO-8859-1 is the HTTP default requests falls back to when no charset
  # was declared; pass None so BeautifulSoup detects the real encoding.
  enc = None if resp.encoding == 'ISO-8859-1' else resp.encoding
  soup = BeautifulSoup(resp.content, 'html.parser', from_encoding=enc)
  rows = soup.find("div", {'class': 'span9'}).find_all("tr")
  result = []
  current_title = ""
  for row in rows:
    span = row.find("span", {'class': 'title'})
    if span is not None:
      current_title = span
    anchor = row.find('a')
    if anchor is not None and current_title != "":
      link = base + "/" + anchor["href"].replace("./", "")
      result.append(str(meeting_year) + "," + current_title.text + "," + link)
  return result
def search(year = 2021, refyear = 2015, notsearch = "", list_of_words = ["default"]):
  """Collect proceedings records from refyear..year containing every word.

  Parameters
  ----------
  year, refyear : int
      Inclusive range of meeting years to crawl (refyear up to year).
  notsearch : str
      Records containing this substring are excluded. An empty string
      disables the filter. (Bug fix: previously ``notsearch not in j`` was
      evaluated with the default ``""`` — and ``"" in s`` is always True —
      so no record could ever match.)
  list_of_words : list of str
      Every word must appear in a record for it to be kept.

  Returns
  -------
  list of str, or None when ``list_of_words`` is empty.
  """
  if len(list_of_words) == 0:
    return
  res = []
  for i in tqdm(range(0, year - refyear + 1)):
    for record in crawler_title(year, refyear, i):
      # Apply the exclusion filter only when non-empty, because the empty
      # string is a substring of every record.
      if notsearch and notsearch in record:
        continue
      if all(word in record for word in list_of_words):
        res.append(record)
  return res
def search2(sch, ls):
  """Print each item of *ls* in which *sch* occurs as a substring."""
  for item in ls:
    if sch not in item:
      continue
    print(item)
# Search configuration: crawl the proceedings from refyear through year.
year = 2021
refyear = 2018

# Every word in this list must appear in a matching record.
test = ["雑談"]
a = search(year=year,refyear=refyear,list_of_words = test) # Edit here; refyear = 2015, year = 2021 searches the 2015-2021 papers
# Space-separated keyword input, e.g.: 分析 生成
import pandas as pd

def make_clickable(link):
    """Render *link* as an HTML anchor that opens in a new browser tab.

    The visible text is everything before the first ``=`` in the URL
    (the whole URL when no ``=`` is present).
    """
    text, _, _ = link.partition('=')
    return f'<a target="_blank" href="{link}">{text}</a>'

# Each entry of `a` is "year,title,address". NOTE(review): a title that
# itself contains a comma would split into extra columns here — confirm
# crawled titles are comma-free before relying on this.
df = pd.DataFrame([i.split(",") for i in a],columns=["year","title","address"])
# Render the address column as clickable links; in a notebook the Styler
# returned by the last expression of the cell is what gets displayed.
df.style.format({'address': make_clickable})
法律問題がある場合は消しますので、コメントお願いします。
0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?