0
0

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?

More than 5 years have passed since the last update.

WebScraping

Last updated at Posted at 2018-10-18

web.archive.orgからのwebscraping

from datetime import datetime as dt, timedelta, date
from bs4 import BeautifulSoup
import requests, time, lxml, pandas as pd, re

def date_list(dt_start, dt_end, each_date=1):
    """Return the datetimes from dt_start up to, but not including, dt_end.

    Example: date_list('2018-09-01', '2018-09-05') yields 09-01 .. 09-04.

    Args:
        dt_start: First date as a 'YYYY-MM-DD' string; it is the first entry
            whenever the result is non-empty.
        dt_end: End date as a 'YYYY-MM-DD' string; never part of the result.
        each_date: Step between consecutive entries, in whole days.

    Returns:
        A list of datetime objects spaced each_date days apart.

    Raises:
        ValueError: If a date string does not match '%Y-%m-%d', or if
            each_date is 0.
    """
    first = dt.strptime(dt_start, '%Y-%m-%d')
    span_days = (dt.strptime(dt_end, '%Y-%m-%d') - first).days
    # Walk the span with the step itself rather than `span_days // each_date`:
    # floor division dropped the final step that still lies before dt_end
    # whenever the span was not an exact multiple of each_date.
    return [first + timedelta(days=offset)
            for offset in range(0, span_days, each_date)]

def getSoup(url, timeSleep=3, maxSleep=300):
    """Fetch url and parse the response body with BeautifulSoup ('lxml').

    Always sleeps timeSleep seconds before the first request. On failure the
    delay is tripled and the request retried for as long as the next delay
    stays below maxSleep.

    Args:
        url: Address to download.
        timeSleep: Seconds to wait before the first request; also the base
            of the tripling retry delay.
        maxSleep: Upper bound (exclusive) on the retry delay, in seconds.

    Returns:
        The parsed BeautifulSoup document, or None if every attempt failed.
        The HTTP status code is not checked, so an error page is parsed and
        returned like any other response.
    """
    time.sleep(timeSleep)
    delay = timeSleep
    while True:
        try:
            # Close the session (and its pooled connections) after each
            # attempt; the body has been fully read by the time get() returns.
            with requests.session() as session:
                res = session.get(url)
            return BeautifulSoup(res.content, 'lxml')
        except Exception as exc:
            # Best-effort retry on any ordinary error, but unlike a bare
            # `except:` this lets KeyboardInterrupt/SystemExit propagate.
            print('error sleep=', delay, repr(exc))
        # Guard against a zero delay, which would never grow by tripling.
        delay = delay * 3 if delay > 0 else 1
        if delay >= maxSleep:
            # Give up without sleeping: no further attempt would follow.
            return None
        time.sleep(delay)

def latest_weekday(dt_date, weekday=2):
    """Return the latest date on or before dt_date that falls on weekday.

    Example: latest_weekday('2018-10-08', weekday=2) -> 2018-10-03.

    Args:
        dt_date: Reference date as a 'YYYY-MM-DD' string.
        weekday: Target day of the week, 0=Monday, 1=Tuesday, ... 6=Sunday.

    Returns:
        A datetime for the matching day; dt_date itself when it already
        falls on weekday.

    Raises:
        ValueError: If dt_date does not match '%Y-%m-%d' or weekday is not
            in 0..6.
    """
    if weekday not in range(7):
        raise ValueError('weekday must be in 0..6, got %r' % (weekday,))
    day = dt.strptime(dt_date, '%Y-%m-%d')
    # Number of days to step back, 0..6, to land on the requested weekday.
    days_back = (day.weekday() - weekday) % 7
    return day - timedelta(days=days_back)
    
# Driver script: sample one Wayback Machine snapshot per week and save the
# digits extracted from each page to output.csv.

# Snap the start date back to the latest weekday-2 (Wednesday, the default
# of latest_weekday) on or before 2016-07-01.
dt_start='2016-07-01'
dt_start=latest_weekday(dt_start).strftime('%Y-%m-%d')
dt_end=date.today().strftime('%Y-%m-%d')
# Dates spaced 7 days apart between dt_start and today.
date_series=date_list(dt_start,dt_end,each_date=7)

# NOTE(review): 'http:xxx' looks like a placeholder for the page to look up;
# replace it with the real target before running.
web_url=r'http:xxx'
# Snapshot URLs are built as str_1 + YYYYMMDD + '/' + web_url + '/'.
str_1=r'http://web.archive.org/web/'
str_2=r'/'+web_url+r'/'

# One row per sampled date; 'count' stays empty for dates that fail.
df=pd.DataFrame(index=date_series, columns=['count'])

for dt_each in date_series:
    url=str_1+dt_each.strftime('%Y%m%d')+str_2
    soup=getSoup(url)
    print(url)
    try:
        # NOTE(review): the actual lookup is not shown, so this line is not
        # valid Python as written. getSoup can return None, in which case the
        # attribute access raises and the handler below records 'error'.
        ret=soup.find... #omitted in the original article
    except:
        ret='error'
    print(ret,dt_each)
    if ret != 'error':
        # Keep only the digit characters of the extracted value.
        # Assumes ret is a string -- TODO confirm against the omitted lookup.
        df.loc[dt_each]=re.sub(r'\D', '', ret)
df.to_csv('output.csv')

Web Archiveに記録されたデータポイントのリスト

import requests
from datetime import datetime as dt

def web_archive_stamp_list(url, from_when, to_when):
    '''Query the Wayback Machine CDX endpoint for captures of url.

    Example: url='yahoo.co.jp', from_when=2010, to_when=2011

    Args:
        url: Address whose captures are requested.
        from_when: Lower bound sent as the CDX `from` parameter.
        to_when: Upper bound sent as the CDX `to` parameter.

    Returns:
        The decoded JSON response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    '''
    # Let requests build the query string so that characters such as '&',
    # '#' or '?' inside url are percent-encoded instead of corrupting it.
    response = requests.get(
        'http://web.archive.org/cdx/search/cdx',
        params={'url': url, 'from': from_when, 'to': to_when,
                'output': 'json'},
    )
    response.raise_for_status()
    return response.json()

def web_archive_stamp_simplified(url, from_when, to_when):
    '''Map each capture time of url to its Wayback Machine snapshot address.

    Example: url='yahoo.co.jp', from_when=2010, to_when=2011

    Rows come from web_archive_stamp_list; a row whose second field is the
    literal 'timestamp' (the header row) is skipped. Returns a dict keyed by
    the datetime parsed from each row's second field, with the snapshot URL
    built from that field and the row's third field as the value.
    '''
    snapshot_pattern = 'https://web.archive.org/web/{timestamp}/{url}'
    stamp_format = '%Y%m%d%H%M%S'
    snapshots = {}
    for row in web_archive_stamp_list(url, from_when, to_when):
        stamp = row[1]
        if stamp == 'timestamp':
            continue
        captured_at = dt.strptime(stamp, stamp_format)
        snapshots[captured_at] = snapshot_pattern.format(timestamp=stamp,
                                                         url=row[2])
    return snapshots
0
0
4

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do by signing up
0
0

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?