言語処理論文を検索するときに使えます。
colabのコードです。そのままコピペで使えます。
お探しのkeywordをいれて使ってください。
import requests
from bs4 import BeautifulSoup
import time
from tqdm.auto import tqdm
def crawler_title(year = 2021, refyear = 2015, index = 0):
    """Fetch all paper titles from one ANLP annual-meeting proceedings page.

    Args:
        year: Unused here; kept for interface compatibility with ``search``.
        refyear: Base year; the page for ``refyear + index`` is fetched.
        index: Offset from ``refyear`` selecting which year's page to crawl.

    Returns:
        list[str]: Every paper title found in the session-list table.
    """
    target_year = index + refyear
    url = f"https://www.anlp.jp/proceedings/annual_meeting/{target_year}/index.html#session_list"
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    r = requests.get(url, headers=headers)
    # requests falls back to ISO-8859-1 when the server declares no charset;
    # in that case let BeautifulSoup sniff the real encoding itself.
    content_type_encoding = r.encoding if r.encoding != 'ISO-8859-1' else None
    soup = BeautifulSoup(r.content, 'html.parser', from_encoding=content_type_encoding)
    first_div = soup.find("div", {'class': 'span9'})
    result = []
    for cell in first_div.find_all("td"):
        title_span = cell.find("span", {'class': 'title'})
        if title_span is not None:  # was `!= None`; identity test is correct for None
            result.append(title_span.text)
    return result
def search(year = 2021, refyear = 2015, notsearch = "가", list_of_words = ("default",)):
    """Collect titles from years refyear..year that contain ALL given words.

    Args:
        year: Last year (inclusive) of the range to crawl.
        refyear: First year (inclusive) of the range to crawl.
        notsearch: Titles containing this substring are excluded. The default
            is a Hangul character, which effectively never matches a Japanese
            title, i.e. "exclude nothing".
        list_of_words: Substrings that must ALL appear in a title. Default is
            a tuple (immutable) to avoid the mutable-default-argument pitfall.

    Returns:
        list[str]: Matching titles. An empty list when no words are given
        (previously returned None, inconsistent with the normal return type).
    """
    if not list_of_words:
        return []
    res = []
    for offset in tqdm(range(year - refyear + 1)):
        for title in crawler_title(year, refyear, offset):
            # Equivalent to the original counting loop: the title is kept
            # only if the exclusion string is absent and every word matches.
            if notsearch in title:
                continue
            if all(word in title for word in list_of_words):
                res.append(title)
    return res
def search2(sch, ls):
    """Print every entry of *ls* that contains the substring *sch*."""
    matches = (entry for entry in ls if sch in entry)
    for entry in matches:
        print(entry)
# --- Example query -----------------------------------------------------------
year = 2021
refyear = 2015
test = ["雑談"]
# Adjust refyear/year to set the range: refyear=2015, year=2021 searches 2015-2021.
a = search(year=year, refyear=refyear, list_of_words=test)
print(a)

# Interactive search: enter space-separated keywords (e.g. 分析 生成);
# press Enter on an empty line to stop.
while 1:
    word = input()
    if word == "":
        break
    # Fix: split the input into keywords. The original passed the raw string,
    # which iterated over individual CHARACTERS (including the space), not words.
    a = search(list_of_words=word.split())
    print(a)
"""
実行例:
分析 生成
2015
2016
2017
2018
2019
2020
2021
['崩れ表記語の生成確率を用いた表記正規化と形態素解析', ...,']
"""
"""
実行例2:
分析
生成
2015
2016
2017
2018
2019
2020
2021
['崩れ表記語の生成確率を用いた表記正規化と形態素解析', ...,']
"""
Variant below: show the results as a pandas DataFrame with clickable download links.
import requests
from bs4 import BeautifulSoup
import time
from tqdm.auto import tqdm
def crawler_title(year = 2021, refyear = 2015, index = 0):
    """Fetch "year,title,pdf-url" records from one ANLP proceedings page.

    Args:
        year: Unused here; kept for interface compatibility with ``search``.
        refyear: Base year; the page for ``refyear + index`` is fetched.
        index: Offset from ``refyear`` selecting which year's page to crawl.

    Returns:
        list[str]: One comma-joined "year,title,url" string per paper.
            NOTE(review): a title that itself contains a comma will confuse
            the downstream split into three columns — verify against the data
            or switch to a rarer delimiter if that occurs.
    """
    target_year = index + refyear
    download_url = f"https://www.anlp.jp/proceedings/annual_meeting/{target_year}"
    url = f"{download_url}/index.html#session_list"
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    r = requests.get(url, headers=headers)
    # requests falls back to ISO-8859-1 when the server declares no charset;
    # in that case let BeautifulSoup sniff the real encoding itself.
    content_type_encoding = r.encoding if r.encoding != 'ISO-8859-1' else None
    soup = BeautifulSoup(r.content, 'html.parser', from_encoding=content_type_encoding)
    first_div = soup.find("div", {'class': 'span9'})
    result = []
    current_title = ""
    for row in first_div.find_all("tr"):
        # The title and its PDF link may sit in different table rows: remember
        # the most recent title span so following link-only rows pair with it.
        span = row.find("span", {'class': 'title'})
        if span is not None:  # was `!= None`
            current_title = span
        link = row.find('a')
        if link is not None and current_title != "":
            pdf_url = download_url + "/" + link["href"].replace("./", "")
            result.append(f"{target_year},{current_title.text},{pdf_url}")
    return result
def search(year = 2021, refyear = 2015, notsearch = "가", list_of_words = ("default",)):
    """Collect records from years refyear..year that contain ALL given words.

    Args:
        year: Last year (inclusive) of the range to crawl.
        refyear: First year (inclusive) of the range to crawl.
        notsearch: Records containing this substring are excluded. The default
            is a Hangul character, which effectively never matches a Japanese
            title, i.e. "exclude nothing".
        list_of_words: Substrings that must ALL appear in a record. Default is
            a tuple (immutable) to avoid the mutable-default-argument pitfall.

    Returns:
        list[str]: Matching records. An empty list when no words are given
        (previously returned None, inconsistent with the normal return type).
    """
    if not list_of_words:
        return []
    res = []
    for offset in tqdm(range(year - refyear + 1)):
        for record in crawler_title(year, refyear, offset):
            # Equivalent to the original counting loop: keep the record only
            # if the exclusion string is absent and every word matches.
            if notsearch in record:
                continue
            if all(word in record for word in list_of_words):
                res.append(record)
    return res
def search2(sch, ls):
    """Print every entry of *ls* that contains the substring *sch*."""
    for entry in ls:
        if sch not in entry:
            continue
        print(entry)
# Example query: search 2018-2021 for titles containing "雑談" (chit-chat).
year = 2021
refyear = 2018
test = ["雑談"]
a = search(year=year,refyear=refyear,list_of_words = test) # adjust refyear/year to set the range, e.g. refyear=2015, year=2021 searches 2015-2021
# Space-separated keyword input (e.g. 分析 生成) is handled in the first script above.
import pandas as pd
def make_clickable(link):
    """Render *link* as an HTML anchor that opens in a new browser tab.

    The visible text is everything before the first '=' in the URL; these
    proceedings URLs contain no '=', so the full URL is displayed.
    """
    display_text, _, _ = link.partition('=')
    return f'<a target="_blank" href="{link}">{display_text}</a>'
# Build (year, title, address) rows. Split on the FIRST and LAST comma only,
# so a title that itself contains commas stays in one column — a plain
# split(",") would yield extra fields and break the 3-column DataFrame.
rows = []
for record in a:
    record_year, rest = record.split(",", 1)
    title, address = rest.rsplit(",", 1)
    rows.append((record_year, title, address))
df = pd.DataFrame(rows, columns=["year", "title", "address"])
# Last expression in the Colab cell: renders the styled table with clickable links.
df.style.format({'address': make_clickable})