More than 5 years have passed since last update.

自サイトのリンク切れを見つけるスクリプト(SPA対応版)

Last updated at 2018-10-29 / Posted at 2018-10-26

自サイトのリンク切れを網羅的に見つける

ユーザコンテンツや外部リンクを直接参照している場合などに問題となるのがリンク切れです。
外部リンク切れ(404エラー)はサイトの評価を下げる直接の原因にはならないといわれていますが、
【SEO】外部サイトへのリンク切れは検索エンジンからの評価を下げない
リンク切れがあるとユーザ体験が損なわれ、離脱に繋がります。

簡易的に見つけるためのツールは、次のようにW3Cでも用意されています。
W3C Link Checker
が、サイト全体から見つけるにはサイト全体のリンクを総チェックする必要があります。

書いてみた

スクレイピングライブラリとして有名なpuppeteerを使用して自サイトをスクレイピングしてみます。
aタグとimgタグを対象とします。
puppeteerでレンダリングが完了してからURLをチェックするため、SPAなサイトにも対応可能です。
sitemap.xmlに記載されているURLを起点に、リンクをたどってチェックします。
なお、formタグのアクションのurlはチェックしません。
またパラメータ付きのパスに関してもチェックしません。（パラメータなしのみをチェックする）
パラメータ付きでページ内のリンクが変わるような場合は次の行をコメントアウトしてください

// パラメータ付きの場合はパラメータを削除する(パラメータなしのみをチェックする)
//if (!isAnotherSite) url = url.split('?')[0]

以下全スクリプトです。なお、負荷が高いため、深夜などにバッチ処理で回すことをおすすめします。

deadlink_checker.js

const axios = require('axios')
const cheerio = require('cheerio')
const puppeteer = require('puppeteer')
const fs = require('fs')
const xml2js = require('xml2js')
const webOrigin = '自サイトドメイン（トップページURL）'

/**
 * Pauses the caller for a while.
 *
 * @param {number} [duration=500] - Delay in milliseconds.
 * @returns {Promise<void>} Promise that fulfills once the delay has elapsed.
 */
function sleep(duration = 500) {
  return new Promise((wake) => {
    setTimeout(wake, duration)
  })
}

/**
 * Reads an XML file from disk and parses it with xml2js.
 *
 * @param {string} filename - Path of the XML file to read.
 * @returns {Promise<Object>} Fulfills with the parsed document. Rejects with
 *   a generic "failed to read" error when the file cannot be read, or with
 *   the xml2js error when parsing fails.
 */
function loadXml(filename) {
  let contents
  try {
    contents = fs.readFileSync(filename, 'utf-8')
  } catch (e) {
    return Promise.reject(new Error(`${filename}の読み込みに失敗しました`))
  }

  return new Promise((resolve, reject) => {
    xml2js.parseString(contents, (parseError, parsed) => {
      if (parseError) {
        reject(parseError)
        return
      }
      resolve(parsed)
    })
  })
}

// Human-readable explanation stored with each recorded failure,
// keyed by the failure category.
const reason = {
  400: 'bad request',
  404: 'not found',
  '5xx': 'response error',
  timeout: 'request timeout',
}

// ANSI escape sequences used to colorize console output.
const yellow = '\u001b[33m' // dead (404) links
const blue = '\u001b[34m' // timeouts
const red = '\u001b[31m' // other errors
const reset = '\u001b[0m' // restores the default color

// URLs that have already been requested during this run.
const checkedUrls = new Set()
// Failures other than 404 (bad request, 5xx, timeout, unknown).
const errorLinks = new Set()
// Links that answered 404.
const deadLinks = new Set()
// Matches URLs that belong to the crawled site. webOrigin is escaped so that
// characters such as "." are taken literally, and the pattern is anchored so
// that an external URL which merely contains the origin (for example a share
// link carrying it in a query parameter) is not mistaken for an own-site URL.
const regex = new RegExp('^' + webOrigin.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))

/**
 * Classifies a response status, logs failures to the console and records
 * them in `deadLinks` (404) or `errorLinks` (everything else).
 *
 * @param {string} url - URL that was requested.
 * @param {number|string} status - HTTP status code, or the sentinel strings
 *   'timeout' / 'unknown'.
 * @param {string} target - Label used in the log line (e.g. 'page', 'image').
 * @param {Error} [error] - Underlying error; only read for status 'unknown'.
 * @returns {string|null} 'error', 'not found', 'timeout' or 'unknown' when
 *   the status is a recorded failure, otherwise null.
 */
function errorResponseStatus(url, status, target, error) {
  if (status === 400) {
    console.error(red + `error ${target}:${url}` + reset)
    errorLinks.add({url, code: status, reason: reason['400']})
    return 'error'
  }
  if (status === 404) {
    console.error(yellow + `dead ${target}:${url}` + reset)
    deadLinks.add({url, code: status, reason: reason['404']})
    return 'not found'
  }
  if (status >= 500 && status < 600) {
    console.error(red + `error ${target}:${url}` + reset)
    errorLinks.add({url, code: status, reason: reason['5xx']})
    return 'error'
  }
  if (status === 'timeout') {
    console.error(blue + `timeout error ${target}:${url}` + reset)
    errorLinks.add({url, reason: reason.timeout})
    return 'timeout'
  }
  if (status === 'unknown') {
    console.error(red + `error ${target}:${url}` + reset)
    // Some callers only know that the request failed and pass no error
    // object; fall back to a generic text instead of throwing a TypeError.
    const detail = error && error.message ? error.message : 'unknown error'
    errorLinks.add({url, reason: detail})
    return 'unknown'
  }
  return null
}

/**
 * Thin wrapper around a headless puppeteer browser that keeps a single
 * reusable page.
 */
class Browser {
  /**
   * Starts a fresh headless browser, closing any previous one first, and
   * opens the working page inside an incognito context.
   *
   * @returns {Promise<void>}
   */
  async launch() {
    await this.close()
    this.browser = await puppeteer.launch({headless: true})
    // The page has to be created from the incognito context itself;
    // browser.newPage() would open it in the default context.
    const context = await this.browser.createIncognitoBrowserContext()
    this.page = await context.newPage()
  }

  /**
   * Navigates the page to `url`.
   *
   * @param {string} url - Address to open.
   * @returns {Promise<Object>} The value `page.goto()` resolved with, or a
   *   fallback object `{headers: {status: 'timeout'}}` /
   *   `{headers: {status: 'unknown'}, error}` when navigation failed.
   * @throws {Error} When launch() has not been called yet.
   */
  async goto(url) {
    if (!this.page) throw new Error('初期化されていません')
    try {
      return await this.page.goto(url, {waitUntil: 'load', timeout: 10000})
    } catch (error) {
      // The wording of the timeout message differs between puppeteer
      // versions ("Navigation Timeout Exceeded" / "Navigation timeout of"),
      // so match case-insensitively and also accept the error name.
      if (error.name === 'TimeoutError' || /timeout/i.test(error.message)) {
        return {headers: {status: 'timeout'}}
      }
      // Any other failure: restart the browser before the next navigation
      await this.launch()
      return {headers: {status: 'unknown'}, error}
    }
  }

  /**
   * @returns {Promise<string|null>} HTML of the current page, or null when
   *   no page is open.
   */
  async getHtml() {
    if (!this.page) return null
    return await this.page.content()
  }

  /**
   * Closes the browser if one is running and forgets the page.
   *
   * @returns {Promise<void>}
   */
  async close() {
    if (this.browser) {
      await this.browser.close()
      this.browser = null
    }
    // The page belonged to the closed browser and must not be reused
    this.page = null
  }
}

/**
 * Extracts the status from whatever `browser.goto()` resolved with.
 * Handles a puppeteer response (where `status` is a method), the fallback
 * object `{headers: {status}}` produced on navigation failure, and a
 * null/undefined response.
 */
function getResponseStatus(response) {
  if (!response) return undefined
  if (typeof response.status === 'function') return response.status()
  return response.headers ? response.headers.status : undefined
}

/**
 * Resolves a link found on `base` into an absolute http(s) URL.
 * Returns null for links that cannot be requested over HTTP
 * (mailto:, tel:, javascript:, data:, malformed values).
 */
function resolveLink(link, base) {
  try {
    const resolved = new URL(link, base)
    return /^https?:$/.test(resolved.protocol) ? resolved.href : null
  } catch (e) {
    return null
  }
}

/**
 * Opens every URL in `urls`, records failures through errorResponseStatus,
 * then checks the images and recursively follows the links found on the
 * pages that belong to the crawled site.
 *
 * @param {Iterable<string>} urls - Page URLs to visit.
 * @param {Object} browser - Object exposing async `goto(url)` and `getHtml()`.
 * @returns {Promise<void>}
 */
async function checkPage(urls, browser) {

  const imgUrls = new Set()
  const aLinks = new Set()

  for (let url of Array.from(urls)) {
    // Only a URL that *starts* with a scheme is absolute; a relative path
    // that merely contains "http" is not.
    const isAnotherSite = /^https?:\/\//i.test(url) && !regex.test(url)
    // Own-site URLs are checked without their query string
    if (!isAnotherSite) url = url.split('?')[0]
    // Already checked
    if (checkedUrls.has(url)) continue

    const response = await browser.goto(url)
    checkedUrls.add(url)

    const status = getResponseStatus(response)
    // A failed navigation may carry no error object; supply one so that the
    // failure can always be recorded with a message.
    const failure = status === 'unknown'
      ? (response.error || new Error('navigation failed'))
      : undefined
    if (errorResponseStatus(url, status, 'page', failure)) {
      continue
    }
    // Pages of other sites are only checked for reachability, not parsed
    if (isAnotherSite) {
      continue
    }
    const html = await browser.getHtml()
    if (!html) continue

    const $ = cheerio.load(html)
    // Select from <body> directly so that elements which are direct children
    // of <body> are included as well.
    $('body img').each((index, element) => {
      const src = $(element).attr('src')
      const resolved = src ? resolveLink(src, url) : null
      if (resolved) imgUrls.add(resolved)
    })
    $('body a').each((index, element) => {
      const href = $(element).attr('href')
      const resolved = href ? resolveLink(href, url) : null
      if (resolved) aLinks.add(resolved)
    })
  }

  for (const imgUrl of imgUrls) {
    // Already checked
    if (checkedUrls.has(imgUrl)) continue
    // Throttle image requests to keep the load on the server low
    await sleep(300)
    await axios.get(imgUrl, {timeout: 3000})
      .catch(error => errorResponseStatus(imgUrl, error.response ? error.response.status : error.request ? 'timeout' : 'unknown', 'image', error))
    checkedUrls.add(imgUrl)
  }

  for (const link of aLinks) {
    // Already checked
    if (checkedUrls.has(link)) continue
    await checkPage([link], browser)
  }
}

/**
 * Entry point: reads the sitemap index (`sitemap.xml`) and the sitemap files
 * it lists from the current directory, crawls every listed URL and prints a
 * summary of the dead and failing links that were recorded.
 *
 * @returns {Promise<void>}
 */
const main = async () => {

  // Collect the page URLs listed in the sitemaps
  const sitemaps = new Set()
  const sitemap = await loadXml('sitemap.xml')
  if (!sitemap || !sitemap.sitemapindex || !sitemap.sitemapindex.sitemap) return
  for (const entry of sitemap.sitemapindex.sitemap) {
    if (!entry.loc || entry.loc.length === 0) continue
    // Each child sitemap is read from a local file named like the last
    // path segment of its <loc>
    const segments = entry.loc[0].split('/')
    const data = await loadXml(segments[segments.length - 1])
    if (data && data.urlset && data.urlset.url) {
      data.urlset.url.forEach(page => sitemaps.add(page.loc[0]))
    }
  }

  const browser = new Browser()
  try {
    await browser.launch()
    await checkPage(sitemaps, browser)
  } finally {
    // Always shut the headless browser down, even when crawling fails
    await browser.close()
  }
  const errors = Array.from(errorLinks)
  const deads = Array.from(deadLinks)
  const message = `【deadリンクチェッカー】:\nNotFoundリンク：${JSON.stringify(deads)}\nエラーリンク：${JSON.stringify(errors)}`
  // Print the report so the result is not lost; send it to Slack or a
  // similar channel here if desired.
  console.log(message)
}

// Report any failure and exit non-zero so that a batch runner notices it,
// instead of leaving the rejection unhandled.
main().catch((error) => {
  console.error(error)
  process.exitCode = 1
})

sitemap.xmlは次のような構造になっています。
これはご自身のサーバによって構造が違うと思うのでsitemap.xmlの読み込み部分に関しては適宜処理を変えてください。

sitemap.xml

<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap>
<loc>http://localhost:3000/sitemap-main.xml</loc>
</sitemap>
<sitemap>
<loc>http://localhost:3000/sitemap-profile.xml</loc>
</sitemap>
<sitemap>
<loc>http://localhost:3000/sitemap-category.xml</loc>
</sitemap>
</sitemapindex>

今回使っているsitemap-main.xmlは以下の様な構造になっています。

sitemap-main.xml

<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:news="http://www.google.com/schemas/sitemap-news/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml" xmlns:mobile="http://www.google.com/schemas/sitemap-mobile/1.0" xmlns:image="http://www.google.com/schemas/sitemap-image/1.1" xmlns:video="http://www.google.com/schemas/sitemap-video/1.1">
<url> <loc>http://localhost:3000/</loc> </url>
<url> <loc>http://localhost:3000/login</loc> </url>
<url> <loc>http://localhost:3000/company</loc> </url>
<url> <loc>http://localhost:3000/policies/terms</loc> </url>
<url> <loc>http://localhost:3000/policies/privacy</loc> </url>

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up