I wrote this because collecting images for machine learning is tedious.
It saves every image except those in members-only (アメンバー限定) posts.
The HTML class names differ from blog to blog, so adjust them to match your target (see the inspection sketch after the script).
qiita.py
# coding:utf-8
import os

import requests
from bs4 import BeautifulSoup
def get_entry_list(html):
    """Follow the "next page" link and collect the URL of every entry-list page."""
    url_list = [html]
    while True:
        content = requests.get(html).content
        soup = BeautifulSoup(content, "lxml")
        next_page = soup.find("a", {"class": "skinSimpleBtn pagingNext"})
        if next_page is None:
            print("finish")
            return url_list
        url_list.append(next_page["href"])
        html = next_page["href"]
def get_url(entry_list, id):
    """Collect article URLs from every entry-list page, then drop the
    members-only (amember) articles so only public pages remain."""
    page_list = []
    for html in entry_list:
        content = requests.get(html).content
        soup = BeautifulSoup(content, "lxml")
        urls = soup.find_all("a", {"class": "contentTitle"})
        for url in urls:
            page_list.append(url["href"])
    amember = "https://ameblo.jp/{0}/amemberentrylist.html".format(id)
    amember_html = requests.get(amember).content
    amember_soup = BeautifulSoup(amember_html, "lxml")
    amember_urls = amember_soup.find_all("a", {"class": "contentTitle"})
    for url in amember_urls:
        href = url["href"]
        if href in page_list:  # remove() raises ValueError on a missing URL
            page_list.remove(href)
    print("finish")
    return page_list
def scraping(entry_list):
    """Extract the src of every <img> inside each article body."""
    image_list = []
    for url in entry_list:
        content = requests.get(url).content
        soup = BeautifulSoup(content, "lxml")
        body = soup.find("div", {"class": "articleText"})
        if body is None:
            continue  # page without the expected body tag
        for img in body.find_all("img"):
            src = img.get("src")
            if src:
                image_list.append(src)
    print("finish")
    return image_list
def download(download_list, BASE_DIRE):
    """Save every .jpg in the list into BASE_DIRE/image/."""
    BASE_DIRE = BASE_DIRE + 'image/'
    if not os.path.exists(BASE_DIRE):
        os.makedirs(BASE_DIRE)
    for url in download_list:
        name = url.rsplit("?", 1)[0]  # strip the query string first
        label = name.rsplit("/", 1)[1]
        extension = name.rsplit(".", 1)[1]
        if extension == 'jpg':
            try:
                data = requests.get(url).content
                with open(os.path.join(BASE_DIRE, label), 'wb') as f:
                    f.write(data)
            except Exception as e:
                print(e)
    print('finish')
if __name__ == "__main__":
blog_id = "xxx"
url = "https://ameblo.jp/{0}/entrylist.html".format(blog_id)
BASE_DIRE = "/Users/xxx/WorkSpace/{0}/".format("xxx")
all_entry_list = get_entry_list(url)
page_list = get_url(all_entry_list,blog_id)
download_list = scraping(page_list)
download(data,BASE_DIRE)
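The class names above (skinSimpleBtn pagingNext, contentTitle, articleText) are just the ones my target blog happened to use; other skins name them differently. As a minimal sketch for finding the right values on another blog, assuming it serves the same entrylist.html layout ("xxx" is a placeholder blog ID), something like the following lists the classes attached to every link on the page so you can spot the title and next-page candidates and update the selectors accordingly.
check.py
# coding:utf-8
# Minimal sketch: print the class names a blog's entry-list page actually uses,
# so get_entry_list / get_url / scraping can be pointed at the right selectors.
import requests
from bs4 import BeautifulSoup

html = requests.get("https://ameblo.jp/xxx/entrylist.html").content
soup = BeautifulSoup(html, "lxml")

for a in soup.find_all("a"):
    if a.get("class"):  # bs4 returns the class attribute as a list of names
        print(" ".join(a["class"]), "->", a.get("href"))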