0
0

More than 3 years have passed since last update.

ウェブスクレイピング

Posted at

概要

CSVに記入したURL一覧からページの情報をスクレイピングします。

ソース

package.json

{
  "name": "scraping",
  "version": "1.0.0",
  "description": "scraping",
  "author": "",
  "devDependencies": {
    "cheerio": "^1.0.0-rc.3",
    "csv-parser": "^2.3.2",
    "fs": "0.0.1-security",
    "json2csv": "^4.5.4",
    "request": "^2.88.0",
    "request-promise": "^4.2.5"
  }
}
  • cheerio ・・・ 読み込んだページをjQuery形式でDOM操作できる
  • csv-parser ・・・ CSV形式をJSONに変換
  • fs ・・・ ファイル操作（Node.js の標準モジュールのため、npm からのインストールは本来不要）
  • json2csv ・・・ JSON形式をCSVに変換
  • request ・・・ HTTP通信を行う
  • request-promise ・・・ request を Promise で扱えるようにするラッパー

url.csv

Yahoo!ニュースの記事情報をスクレイピングします。

url
https://news.yahoo.co.jp/topics/top-picks
https://news.yahoo.co.jp/topics/domestic
https://news.yahoo.co.jp/categories/world

scraping.js

const fs = require('fs');
const request= require('request');
const rp = require('request-promise');
const csv = require('csv-parser');
const cheerio = require('cheerio');
const { Parser } = require('json2csv');

// Rows parsed from the URL list CSV; each row is later read through its `url` property.
const results = [];
// Article records accumulated across every crawled page.
const list = [];

// Index into `results` of the page currently being crawled.
let count = 0;

// Output CSV path and input URL-list path.
const filename = 'dist/data.csv';
const loadfile = 'url.csv';

// Load the complete URL list first, then start crawling the pages one at a time.
fs.createReadStream(loadfile)
    .on('error', function(error){
        // Without a listener, a missing or unreadable URL list would crash the
        // process with an unhandled 'error' event.
        console.error(error);
    })
    .pipe(csv())
    .on('data', function(data){
        results.push(data);
    })
    .on('end', function(){
        // An empty list would otherwise call crawl(undefined) and throw.
        if (results.length === 0) {
            console.error('No URLs found in ' + loadfile);
            return;
        }
        crawl(results[count]);
    });


/**
 * Fetch one page from the URL list and pass the parsed document to `scrp`.
 *
 * When the request fails, the error is logged and the crawl moves on to the
 * next entry (or writes the output if this was the last one), so a single bad
 * URL no longer stops the run and discards the records collected so far.
 *
 * @param {{url: string}} result - One row of the URL list.
 */
function crawl(result){
    const url = result.url;

    // request-promise `transform` hook: bundle the raw response, the body and a
    // cheerio document built from the body into a single resolved value.
    const toParsedPage = function(body, response, resolveWithFullResponse) {
        return {
            response : response,
            $ : cheerio.load(body),
            body : body
        };
    };

    // `json: true` is intentionally not set: the target pages are HTML, and
    // that option asks for JSON and JSON-parses the response body.
    const options = {
        method: 'GET',
        uri: url,
        transform: toParsedPage,
    };

    rp(options)
        .then(
            (data) => {
                scrp(data.$, url, data.response, data.body);
            },
            (error) => {
                // Request failed: report it, skip this URL and keep the chain going.
                console.log(error);
                count++;
                if (count < results.length) {
                    crawl(results[count]);
                } else {
                    write();
                }
            }
        )
        .catch((error) => {
            // Errors raised while scraping or advancing end up here.
            console.log(error);
        });
}

/**
 * Collect the article entries of one listing page into `list`, then advance
 * the crawl: fetch the next URL, or write the output once every URL is done.
 *
 * @param {Function} $ - cheerio document for the fetched page.
 * @param {string} url - Address of the page; only used in the progress log.
 * @param {*} response - Not used here.
 * @param {*} body - Not used here.
 */
function scrp($,url,response,body) {
    const currentCategory = $('.newsFeedTab_item-current').text();

    $('.newsFeed_item').each((index, element) => {
        const item = $(element);
        const link = item.find('.newsFeed_item_link').attr('href');

        // Entries without a link are not recorded.
        if (link === undefined) {
            return;
        }

        list.push({
            category : currentCategory,
            title : item.find('.newsFeed_item_title').text(),
            date : item.find('.newsFeed_item_date').text(),
            href : link,
            thumbnail : item.find('.thumbnail img').attr('src'),
        });
    });

    count += 1;

    console.log(`Complete:${url}`);

    // Last page done: flush everything to disk; otherwise continue with the next URL.
    if (count >= results.length) {
        write();
        return;
    }
    crawl(results[count]);
}

/**
 * Serialize the collected article records in `list` to CSV and save them to
 * `filename`.
 *
 * Failures (CSV conversion or file write) are reported with console.error;
 * nothing is thrown to the caller.
 */
function write(){
    const fields = ['category', 'title', 'date', 'href', 'thumbnail'];
    const opts = { fields };

    try {
        const parser = new Parser(opts);
        // Named csvText so it does not shadow the module-level csv-parser import.
        const csvText = parser.parse(list);
        fs.writeFile(filename , csvText, function(err) {
            // This callback runs after the try block has already exited, so a
            // throw here could never be caught and would crash the process.
            if (err) console.error(err);
        });
    } catch (err) {
        console.error(err);
    }

}

Github

0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0