web.archive.org からの Web スクレイピング（週次スナップショットの取得）
from datetime import datetime as dt, timedelta, date
from bs4 import BeautifulSoup
import requests, time, lxml, pandas as pd, re
def date_list(dt_start, dt_end, each_date=1):
    """Return datetimes spaced *each_date* days apart, from *dt_start* (inclusive)
    up to but not including *dt_end*.

    Args:
        dt_start: start date as a 'YYYY-MM-DD' string.
        dt_end: end date as a 'YYYY-MM-DD' string (exclusive, range-style).
        each_date: step between consecutive dates, in days.

    Example: date_list('2018-09-01', '2018-09-05') -> Sep 1..4 as datetimes.
    """
    start = dt.strptime(dt_start, '%Y-%m-%d')
    end = dt.strptime(dt_end, '%Y-%m-%d')
    # integer division drops any partial step at the tail, keeping the
    # last generated date strictly before dt_end
    steps = (end - start).days // each_date
    dates = []
    for n in range(steps):
        dates.append(start + timedelta(days=n * each_date))
    return dates
def getSoup(url, timeSleep=3, maxSleep=300):
    """Fetch *url* and return it parsed as a BeautifulSoup tree, retrying on failure.

    Sleeps *timeSleep* seconds before the first request (rate limiting), then
    retries with an exponentially growing back-off (x3 per failure) until the
    wait would reach *maxSleep* seconds.

    Args:
        url: the URL to fetch.
        timeSleep: initial politeness delay and first back-off, in seconds.
        maxSleep: give up once the back-off reaches this many seconds.

    Returns:
        A BeautifulSoup object, or None if every attempt failed.
    """
    time.sleep(timeSleep)
    wait = timeSleep
    soup = None
    while wait < maxSleep:
        try:
            # plain requests.get: the original created a throwaway Session
            # per call, which gained nothing and leaked the connection pool
            res = requests.get(url)
            soup = BeautifulSoup(res.content, 'lxml')
            break  # success; original emulated this with ret_sleep = maxSleep
        except requests.RequestException:
            # narrow except: only network-level errors trigger a retry
            # (the original bare `except:` also swallowed KeyboardInterrupt)
            print('error sleep=', wait)
            wait = wait * 3
            time.sleep(wait)
    return soup
def latest_weekday(dt_date, weekday=2):
    """Return the most recent date on or before *dt_date* falling on *weekday*.

    Args:
        dt_date: a 'YYYY-MM-DD' string.
        weekday: target day of week, 0=Monday .. 6=Sunday (default 2=Wednesday).

    Example: latest_weekday('2018-10-08', weekday=2) -> 2018-10-03 (Wednesday).
    """
    anchor = dt.strptime(dt_date, '%Y-%m-%d')
    # scan backwards at most one week; every weekday occurs exactly once
    for back in range(7):
        candidate = anchor - timedelta(days=back)
        if candidate.weekday() == weekday:
            return candidate
# --- Weekly scraping driver ---
# NOTE(review): illustrative pseudocode — the soup.find(...) selector below is
# elided ("omitted"), so this section does not run as written.
dt_start='2016-07-01'
# Snap the start date back to the most recent Wednesday (latest_weekday default)
dt_start=latest_weekday(dt_start).strftime('%Y-%m-%d')
dt_end=date.today().strftime('%Y-%m-%d')
# One archive snapshot per week from start to today
date_series=date_list(dt_start,dt_end,each_date=7)
# Placeholder for the real target site URL
web_url=r'http:xxx'
# Wayback Machine URL pattern: http://web.archive.org/web/YYYYMMDD/<target>/
str_1=r'http://web.archive.org/web/'
str_2=r'/'+web_url+r'/'
# One row per weekly date; the scraped value lands in the 'count' column
df=pd.DataFrame(index=date_series, columns=['count'])
for dt_each in date_series:
    url=str_1+dt_each.strftime('%Y%m%d')+str_2
    soup=getSoup(url)
    print(url)
    try:
        ret=soup.find... # omitted
    except:
        ret='error'
    print(ret,dt_each)
    if ret != 'error':
        # keep digits only (strip commas, units, surrounding text) before storing
        df.loc[dt_each]=re.sub(r'\D', '', ret)
df.to_csv('output.csv')
Web Archive に記録されたスナップショット（データポイント）の一覧を取得する
import requests
from datetime import datetime as dt
def web_archive_stamp_list(url, from_when, to_when):
    """Query the Wayback Machine CDX API for all snapshots of *url*.

    Args:
        url: target site, e.g. 'yahoo.co.jp'.
        from_when: start of the range, e.g. 2010 (year, or finer timestamp).
        to_when: end of the range, e.g. 2011.

    Returns:
        The decoded JSON response: a list of rows, the first row being the
        column header ['urlkey', 'timestamp', 'original', ...].

    Example: web_archive_stamp_list('yahoo.co.jp', 2010, 2011)
    """
    query = ('http://web.archive.org/cdx/search/cdx?'
             'url={url}&from={f}&to={t}&output=json').format(
                 url=url, f=from_when, t=to_when)
    # timeout added: the original requests.get had none and could hang forever
    return requests.get(query, timeout=30).json()
def web_archive_stamp_simplified(url, from_when, to_when):
    """Map each snapshot's capture datetime to its archive URL.

    Args:
        url: target site, e.g. 'yahoo.co.jp'.
        from_when: start of the range, e.g. 2010.
        to_when: end of the range, e.g. 2011.

    Returns:
        dict of {datetime: 'https://web.archive.org/web/<timestamp>/<url>'}.
    """
    rows = web_archive_stamp_list(url, from_when, to_when)
    template = 'https://web.archive.org/web/{timestamp}/{url}'
    stamp_fmt = '%Y%m%d%H%M%S'
    snapshots = {}
    for row in rows:
        stamp = row[1]
        # the CDX API's first JSON row is the header; skip it
        if stamp == 'timestamp':
            continue
        snapshots[dt.strptime(stamp, stamp_fmt)] = template.format(
            timestamp=stamp, url=row[2])
    return snapshots