'use strict'
/*
* Modules
*/
// Public
require('date-utils');
const fetch = require('node-fetch');
const { JSDOM } = require('jsdom');
const fs = require("fs");
const csv = require("csv");
const log4js = require('log4js');
const chardet = require('chardet');
const iconv = require('iconv-lite');
/*
* Config
*/
// Crawl entry point; links that do not start with this prefix are skipped.
const BASE_URL = 'https://www.example.com/';
// Query parameters removed from links before they are stored or compared.
const IGNORE_QUERY_PARAMS = [
  'utm_campaign',
  'utm_source',
  'utm_keyword',
  'utm_content',
  'utm_medium',
];
// Directory that receives the timestamped log file of each run.
const LOG_DIR = 'logs';
// Path of the CSV report written once the crawl has finished.
const RESULT_FILE = 'result.csv';
/*
* Global Object(s)
*/
// Crawl results keyed by page URL. crawlWebPage adds one record per visited
// URL (page title, de-duplicated outgoing links, success flag) and the main
// routine reads the records back when it builds the CSV report.
// The binding itself is never reassigned, only mutated, hence `const`.
const urls = {};
/*
* Logger
*/
// One log file per run, named after the start time, receiving every message
// from level "debug" upwards.
log4js.configure({
  appenders: {
    system: {
      type: 'file',
      // Only the timestamp is passed through date-utils' toFormat(): it
      // substitutes its tokens (YYYY, MM, DD, HH24, MI, SS, ...) wherever they
      // occur in the pattern, so formatting the whole path would also rewrite
      // a LOG_DIR that happens to contain such letters.
      filename: `${LOG_DIR}/${new Date().toFormat('YYYYMMDD_HH24MISS')}.log`,
    },
  },
  categories: {
    default: { appenders: ['system'], level: 'debug' },
  },
});
// NOTE(review): no "system" category is configured above, so this logger is
// expected to use the "default" category settings - confirm against log4js.
const logger = log4js.getLogger('system');
/*
* Functions
*/
/**
 * Download the raw body of a page.
 *
 * This function does not reject on fetch problems: a failed request, a
 * response whose `ok` flag is false, or a failure while reading the body is
 * logged as a warning and reported to the caller as `undefined`.
 *
 * @param {string} target_url - URL to download.
 * @param {string} [source_url] - Page on which the link was found; only used
 *   to enrich the warning message.
 * @returns {Promise<ArrayBuffer|undefined>} The body bytes, or `undefined`
 *   when the page could not be retrieved.
 */
const getHtmlByUrl = async (target_url, source_url) => {
  logger.info(`Fetch HTML from "${target_url}".`);
  // Single place that builds the warning so every failure path reads the same.
  const describeFailure = (reason) =>
    `Fetch failed: "${target_url}"${source_url ? ` from "${source_url}"` : ''} ${reason}`;
  try {
    const res = await fetch(target_url);
    if (!res.ok || typeof res.arrayBuffer === 'undefined') {
      logger.warn(describeFailure(res.statusText));
      return undefined;
    }
    // `await` is required: returning the bare promise would let a body-read
    // failure escape this try/catch and reject to the caller instead.
    return await res.arrayBuffer();
  } catch (e) {
    logger.warn(describeFailure(e));
    return undefined;
  }
};
/**
 * Decide whether a link should be crawled.
 *
 * A link is eligible when it is a string that starts with BASE_URL and does
 * not point at a static asset (css, jpg, png, gif, pdf, js).
 *
 * @param {string} url - Link target, normally an anchor's resolved href.
 * @returns {boolean} `true` when the link should be followed.
 */
const isUrlEligible = (url) => {
  // Reject non-string input (e.g. a missing href) up front so the checks
  // below never run on `undefined`.
  if (typeof url !== 'string' || !url.startsWith(BASE_URL)) return false;
  // The extension may be followed by a query string or a fragment.
  return !/\.(css|jpg|png|gif|pdf|js)($|[?#])/.test(url);
};
/**
 * Normalise a link so that different spellings of the same page compare equal.
 *
 * - the `#fragment` is dropped;
 * - query parameters listed in IGNORE_QUERY_PARAMS are removed (whole
 *   `name=value` pairs, matched by exact name), as are empty pairs;
 * - a trailing `/index.(html|htm|php|asp|cgi|jsp)` collapses to `/`;
 * - a path that ends in neither `/` nor a page extension gets a trailing `/`.
 *
 * Path and query are processed separately, so the remaining query string is
 * always re-attached after the normalised path.
 *
 * @param {string} url - Link to normalise.
 * @returns {string} The normalised URL.
 */
const formatUrl = (url) => {
  const without_fragment = url.replace(/#.*$/, '');
  const query_start = without_fragment.indexOf('?');
  const raw_path = query_start === -1 ? without_fragment : without_fragment.slice(0, query_start);
  const raw_query = query_start === -1 ? '' : without_fragment.slice(query_start + 1);

  // Filtering whole pairs (instead of cutting text out of the string) leaves
  // no stray "&" behind and cannot hit a parameter whose name merely contains
  // an ignored name.
  const query = raw_query
    .split('&')
    .filter((pair) => pair !== '' && !IGNORE_QUERY_PARAMS.includes(pair.split('=')[0]))
    .join('&');

  let path = raw_path
    // Anchored to the end of the path so only a real index document matches.
    .replace(/\/index\.(html?|php|asp|cgi|jsp)$/, '/')
    .replace(/\/\/$/, '/');
  // "asp" is listed here as well, matching the index-document list above.
  if (!/(\.(php|html?|asp|jsp|cgi)|\/)$/.test(path)) {
    path += '/';
  }

  return query === '' ? path : `${path}?${query}`;
};
/**
 * Collect the crawlable links of a parsed page.
 *
 * Every anchor whose href passes isUrlEligible is normalised with formatUrl;
 * each normalised URL is returned once, in order of first appearance.
 *
 * @param {object} dom - Parsed page exposing `window.document`.
 * @returns {string[]} Unique, normalised link URLs.
 */
const getLinkUrlsFromDom = (dom) => {
  const unique_urls = new Set();
  for (const anchor of dom.window.document.querySelectorAll('a')) {
    if (isUrlEligible(anchor.href)) {
      unique_urls.add(formatUrl(anchor.href));
    }
  }
  return Array.from(unique_urls);
};
/**
 * Crawl one page and, recursively, every eligible page it links to.
 *
 * The outcome is recorded in the module-level `urls` object: one record per
 * visited URL holding the page title, its de-duplicated links and a success
 * flag. A URL that already has a record is skipped, which also terminates
 * link cycles. Linked pages are crawled one after another.
 *
 * @param {string} target_url - URL of the page to crawl.
 * @param {string} [source_url] - Page that linked to `target_url`; only
 *   forwarded to getHtmlByUrl for its log message.
 * @returns {Promise<void>} Settles once this page and all pages reachable
 *   from it have been processed. May reject if parsing this page throws.
 */
const crawlWebPage = async (target_url, source_url) => {
  // Register the page before doing any work so it is visited only once and
  // still appears in the report (as unsuccessful) when something fails below.
  if (typeof urls[target_url] !== 'undefined') return;
  const page = { title: '', links: [], is_success: false };
  urls[target_url] = page;

  // Fetch the HTTP response body as raw bytes.
  const res_array_buffer = await getHtmlByUrl(target_url, source_url);
  if (!res_array_buffer) return;
  const buffer = Buffer.from(res_array_buffer);

  // Detect the character encoding; without one the bytes cannot be decoded.
  const encoding = chardet.detect(buffer);
  if (!encoding) {
    logger.warn(`Encoding detection failed: "${target_url}"`);
    return;
  }

  // Decode the bytes into a string.
  const html = iconv.decode(buffer, encoding);
  if (!html) return;

  // Parse the HTML; the page URL is supplied so relative hrefs resolve.
  const dom = new JSDOM(html, { url: target_url });
  try {
    // Both the URL and its HTML are usable.
    page.is_success = true;
    // A document without a <title> element keeps the empty title.
    const title_element = dom.window.document.querySelector('title');
    page.title = title_element ? title_element.textContent : '';
    // Links contained in the page (duplicates removed).
    page.links = getLinkUrlsFromDom(dom);
  } finally {
    // Shut the window down (jsdom's documented teardown) before descending,
    // rather than keeping one live window per recursion level.
    dom.window.close();
  }

  for (const link of page.links) {
    // A failing child must not abort its siblings: log it and carry on.
    await crawlWebPage(link, target_url).catch((error) => {
      logger.warn(`Crawl failed for "${link}": ` + error);
    });
  }
};
/*
* main
*/
// Entry point: crawl the site starting at BASE_URL, then write one CSV row
// per visited URL to RESULT_FILE.
(async () => {
  // Start of processing.
  logger.info("Start Processing.");

  // Crawl from the base URL; results accumulate in the global `urls` object.
  // A failure of the root page is logged and the (possibly empty) report is
  // still written.
  await crawlWebPage(BASE_URL).catch((error) => {
    logger.warn(`Crawl failed for "${BASE_URL}": ` + error);
  });

  // Header row followed by one row per URL.
  // "有効" means "valid" and "無効" means "invalid".
  const csv_output = [
    ['url', 'valid', 'title'],
    ...Object.keys(urls).map((url) => [
      url,
      urls[url].is_success ? "有効" : "無効",
      urls[url].title,
    ]),
  ];

  // Serialise the rows, then write the file.
  csv.stringify(csv_output, (error, output) => {
    if (error) {
      logger.error("Faild converting CSV data: " + error);
      // Rethrow the original error so its stack and properties survive.
      throw error;
    }
    fs.writeFile(RESULT_FILE, output, (write_error) => {
      if (write_error) {
        logger.error("Faild creating CSV file: " + write_error);
        // Same as above: keep the original error (including its `code`).
        throw write_error;
      }
      // Report completion of the CSV output.
      logger.info(`CSV file "${RESULT_FILE}" has been created.`);
    });
  });
})().catch((error) => {
  // Anything thrown directly in the async body ends up here instead of
  // becoming an unhandled rejection.
  logger.error("Unexpected failure: " + error);
  process.exitCode = 1;
});