Collecting Images from Ameba Blogs

Posted at 2018-02-22

I wrote this script because collecting images for machine learning is a chore.
It saves the images from every post except amember-only (followers-only) entries.
The HTML classes differ from blog to blog, so adjust the selectors accordingly.
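If you are unsure which classes a given blog uses, a quick check along these lines (a minimal sketch; the blog ID "xxx" is a placeholder) prints the class attribute of every link on the first entry-list page:

# Minimal sketch for finding the right selectors; "xxx" is a placeholder blog ID.
import requests
from bs4 import BeautifulSoup

html = requests.get("https://ameblo.jp/xxx/entrylist.html").content
soup = BeautifulSoup(html, "lxml")
for a in soup.find_all("a", class_=True):
    print(" ".join(a["class"]), "->", a.get("href"))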

qiita.py
# coding:utf-8
import os

import requests
from bs4 import BeautifulSoup

def get_entry_list(url):
    # Collect the URL of every entry-list page by following the "next page" link.
    url_list = [url]
    while True:
        html = requests.get(url).content
        soup = BeautifulSoup(html, "lxml")
        next_page = soup.find("a", {"class": "skinSimpleBtn pagingNext"})
        if next_page is None:
            print("finish")
            return url_list
        url_list.append(next_page["href"])
        url = next_page["href"]

def get_url(entry_list, blog_id):
    # Gather every article URL, then drop the amember-only entries.
    page_list = []
    for page in entry_list:
        html = requests.get(page).content
        soup = BeautifulSoup(html, "lxml")
        urls = soup.find_all("a", {"class": "contentTitle"})
        for url in urls:
            page_list.append(url["href"])
    amember = "https://ameblo.jp/{0}/amemberentrylist.html".format(blog_id)
    amember_html = requests.get(amember).content
    amember_soup = BeautifulSoup(amember_html, "lxml")
    amember_urls = amember_soup.find_all("a", {"class": "contentTitle"})
    for url in amember_urls:
        if url["href"] in page_list:
            page_list.remove(url["href"])
    print("finish")
    return page_list

def scraping(entry_list):
    # Extract the source URL of every image in each article body.
    image_list = []
    for url in entry_list:
        html = requests.get(url).content
        soup = BeautifulSoup(html, "lxml")
        body = soup.find("div", {"class": "articleText"})
        if body is None:
            continue
        for img in body.find_all("img"):
            image_list.append(img["src"])
    print("finish")
    return image_list

def download(download_list, BASE_DIRE):
    # Save every .jpg image under BASE_DIRE/image/.
    BASE_DIRE = BASE_DIRE + 'image/'
    if not os.path.exists(BASE_DIRE):
        os.makedirs(BASE_DIRE)
    for url in download_list:
        label = url.rsplit("?", 1)[0].rsplit("/", 1)[1]
        extension = url.rsplit("?", 1)[0].rsplit(".", 1)[1]
        if extension == 'jpg':
            try:
                data = requests.get(url).content
                with open(os.path.join(BASE_DIRE, label), 'wb') as f:
                    f.write(data)
            except Exception as e:
                print(e)
    print('finish')


if __name__ == "__main__":
    blog_id = "xxx"
    url = "https://ameblo.jp/{0}/entrylist.html".format(blog_id)
    BASE_DIRE = "/Users/xxx/WorkSpace/{0}/".format("xxx")
    all_entry_list = get_entry_list(url)
    page_list = get_url(all_entry_list, blog_id)
    download_list = scraping(page_list)
    download(download_list, BASE_DIRE)  # pass the scraped image URLs
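
To use the script, set blog_id to the target blog's ID and BASE_DIRE to an existing working directory; the images are written under BASE_DIRE/image/. Note that only .jpg files are saved; extend the extension check in download() if you also want .png or .gif images.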