言語処理論文を検索するときに使えます。
colabのコードです。そのままコピペで使えます。
お探しのkeywordをいれて使ってください。
import requests
from bs4 import BeautifulSoup
import time
from tqdm.auto import tqdm
def crawler_title(year = 2021, refyear = 2015, index = 0):
    """Fetch all paper titles from one ANLP annual-meeting proceedings page.

    Args:
        year: Unused here; kept for interface compatibility with ``search``.
        refyear: Base year; the page for ``refyear + index`` is fetched.
        index: Offset from ``refyear`` selecting which year's page to crawl.

    Returns:
        list[str]: Every paper title found in the session-list table.
    """
    target_year = index + refyear
    url = f"https://www.anlp.jp/proceedings/annual_meeting/{target_year}/index.html#session_list"
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    r = requests.get(url, headers=headers)
    # requests falls back to ISO-8859-1 when the server declares no charset;
    # in that case let BeautifulSoup sniff the real encoding itself.
    content_type_encoding = r.encoding if r.encoding != 'ISO-8859-1' else None
    soup = BeautifulSoup(r.content, 'html.parser', from_encoding=content_type_encoding)
    first_div = soup.find("div", {'class': 'span9'})
    result = []
    for cell in first_div.find_all("td"):
        title_span = cell.find("span", {'class': 'title'})
        if title_span is not None:  # was `!= None`; identity test is correct for None
            result.append(title_span.text)
    return result
def search(year = 2021, refyear = 2015, notsearch = "가", list_of_words = ("default",)):
    """Collect titles from years refyear..year that contain ALL given words.

    Args:
        year: Last year (inclusive) of the range to crawl.
        refyear: First year (inclusive) of the range to crawl.
        notsearch: Titles containing this substring are excluded. The default
            is a Hangul character, which effectively never matches a Japanese
            title, i.e. "exclude nothing".
        list_of_words: Substrings that must ALL appear in a title. Default is
            a tuple (immutable) to avoid the mutable-default-argument pitfall.

    Returns:
        list[str]: Matching titles. An empty list when no words are given
        (previously returned None, inconsistent with the normal return type).
    """
    if not list_of_words:
        return []
    res = []
    for offset in tqdm(range(year - refyear + 1)):
        for title in crawler_title(year, refyear, offset):
            # Equivalent to the original counting loop: the title is kept
            # only if the exclusion string is absent and every word matches.
            if notsearch in title:
                continue
            if all(word in title for word in list_of_words):
                res.append(title)
    return res
def search2(sch, ls):
    """Print every entry of *ls* that contains the substring *sch*."""
    matches = (entry for entry in ls if sch in entry)
    for entry in matches:
        print(entry)
# --- Example query -----------------------------------------------------------
year = 2021
refyear = 2015
test = ["雑談"]
# Adjust refyear/year to set the range: refyear=2015, year=2021 searches 2015-2021.
a = search(year=year, refyear=refyear, list_of_words=test)
print(a)

# Interactive search: enter space-separated keywords (e.g. 分析 生成);
# press Enter on an empty line to stop.
while 1:
    word = input()
    if word == "":
        break
    # Fix: split the input into keywords. The original passed the raw string,
    # which iterated over individual CHARACTERS (including the space), not words.
    a = search(list_of_words=word.split())
    print(a)
"""
実行例:
分析 生成
2015
2016
2017
2018
2019
2020
2021
['崩れ表記語の生成確率を用いた表記正規化と形態素解析', ...,']
"""
"""
実行例2:
分析
生成
2015
2016
2017
2018
2019
2020
2021
['崩れ表記語の生成確率を用いた表記正規化と形態素解析', ...,']
"""
Variant below: show the results as a pandas DataFrame with clickable download links.
import requests
from bs4 import BeautifulSoup
import time
from tqdm.auto import tqdm
def crawler_title(year = 2021, refyear = 2015, index = 0):
    """Fetch "year,title,pdf-url" records from one ANLP proceedings page.

    Args:
        year: Unused here; kept for interface compatibility with ``search``.
        refyear: Base year; the page for ``refyear + index`` is fetched.
        index: Offset from ``refyear`` selecting which year's page to crawl.

    Returns:
        list[str]: One comma-joined "year,title,url" string per paper.
            NOTE(review): a title that itself contains a comma will confuse
            the downstream split into three columns — verify against the data
            or switch to a rarer delimiter if that occurs.
    """
    target_year = index + refyear
    download_url = f"https://www.anlp.jp/proceedings/annual_meeting/{target_year}"
    url = f"{download_url}/index.html#session_list"
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    r = requests.get(url, headers=headers)
    # requests falls back to ISO-8859-1 when the server declares no charset;
    # in that case let BeautifulSoup sniff the real encoding itself.
    content_type_encoding = r.encoding if r.encoding != 'ISO-8859-1' else None
    soup = BeautifulSoup(r.content, 'html.parser', from_encoding=content_type_encoding)
    first_div = soup.find("div", {'class': 'span9'})
    result = []
    current_title = ""
    for row in first_div.find_all("tr"):
        # The title and its PDF link may sit in different table rows: remember
        # the most recent title span so following link-only rows pair with it.
        span = row.find("span", {'class': 'title'})
        if span is not None:  # was `!= None`
            current_title = span
        link = row.find('a')
        if link is not None and current_title != "":
            pdf_url = download_url + "/" + link["href"].replace("./", "")
            result.append(f"{target_year},{current_title.text},{pdf_url}")
    return result
def search(year = 2021, refyear = 2015, notsearch = "가", list_of_words = ("default",)):
    """Collect records from years refyear..year that contain ALL given words.

    Args:
        year: Last year (inclusive) of the range to crawl.
        refyear: First year (inclusive) of the range to crawl.
        notsearch: Records containing this substring are excluded. The default
            is a Hangul character, which effectively never matches a Japanese
            title, i.e. "exclude nothing".
        list_of_words: Substrings that must ALL appear in a record. Default is
            a tuple (immutable) to avoid the mutable-default-argument pitfall.

    Returns:
        list[str]: Matching records. An empty list when no words are given
        (previously returned None, inconsistent with the normal return type).
    """
    if not list_of_words:
        return []
    res = []
    for offset in tqdm(range(year - refyear + 1)):
        for record in crawler_title(year, refyear, offset):
            # Equivalent to the original counting loop: keep the record only
            # if the exclusion string is absent and every word matches.
            if notsearch in record:
                continue
            if all(word in record for word in list_of_words):
                res.append(record)
    return res
def search2(sch, ls):
    """Print every entry of *ls* that contains the substring *sch*."""
    for entry in ls:
        if sch not in entry:
            continue
        print(entry)
# Example query: search 2018-2021 for titles containing "雑談" (chit-chat).
year = 2021
refyear = 2018
test = ["雑談"]
a = search(year=year,refyear=refyear,list_of_words = test) # adjust refyear/year to set the range, e.g. refyear=2015, year=2021 searches 2015-2021
# Space-separated keyword input (e.g. 分析 生成) is handled in the first script above.
import pandas as pd
def make_clickable(link):
    """Render *link* as an HTML anchor that opens in a new browser tab.

    The visible text is everything before the first '=' in the URL; these
    proceedings URLs contain no '=', so the full URL is displayed.
    """
    display_text, _, _ = link.partition('=')
    return f'<a target="_blank" href="{link}">{display_text}</a>'
# Build (year, title, address) rows. Split on the FIRST and LAST comma only,
# so a title that itself contains commas stays in one column — a plain
# split(",") would yield extra fields and break the 3-column DataFrame.
rows = []
for record in a:
    record_year, rest = record.split(",", 1)
    title, address = rest.rsplit(",", 1)
    rows.append((record_year, title, address))
df = pd.DataFrame(rows, columns=["year", "title", "address"])
# Last expression in the Colab cell: renders the styled table with clickable links.
df.style.format({'address': make_clickable})