search
LoginSignup
278

posted at

updated at

puppeteerでの要素の取得方法

#概要
puppeteerでの要素の取得のための関数は

page.evaluate + (querySelector|querySelectorAll)
page.$
page.$$
page.$eval
page.$$eval

とありますが、実際にそれらを使うためにどう書くかをまとめています。速度等は検証できていませんが、いずれのケースでも $eval / $$eval を使うとコードがシンプルになるので良いかと思います。

APIの動作の詳細はpuppeteerのAPIを参照ください。
https://pptr.dev/api/

#前提
以下のようなliの中にアンカータグが入っているページを取得することを想定しています。

<ul>
  <li><a href="some link">some HTML</a></li>
  <li><a href="some link">some HTML</a></li>
  <li><a href="some link">some HTML</a></li>
  ...
</ul>

以下のようにページ読み込みを準備します。

  // Chromium flags passed at launch; both switch off the sandbox.
  const launchOptions = { args: ['--no-sandbox', '--disable-setuid-sandbox'] };
  const browser = await puppeteer.launch(launchOptions);
  const page = await browser.newPage();
  await page.goto('url you want to check');

  // Placeholder selectors: the first targets the anchor in the first <li>,
  // the second targets the anchor in every <li>.
  const itemSelector = "some selecter > ul > li:nth-child(1) > a";
  const listSelector = "some selecter > ul > li > a";

#一つの要素の一つの属性をとる

##page.evaluate + querySelector

  // The callback runs inside the page; the selector is passed in as an argument.
  const data = await page.evaluate(
    (selector) => document.querySelector(selector).textContent,
    itemSelector
  );

##page.$

  // page.$ yields an element handle; read the property handle, then unwrap it
  // into a plain value.
  const item = await page.$(itemSelector);
  const textHandle = await item.getProperty('textContent');
  const data = await textHandle.jsonValue();

##page.$eval

  // $eval passes the matched element to the callback, which runs in the page.
  const data = await page.$eval(itemSelector, (anchor) => anchor.textContent);

#一つの要素の複数の属性をとる

##page.evaluate + querySelector

  // Resolve the selector once inside the page and read every property from
  // that single element instead of querying the DOM again for each field.
  const data = await page.evaluate((selector) => {
    const anchor = document.querySelector(selector);
    return {
      href: anchor.href,
      textContent: anchor.textContent,
      innerHTML: anchor.innerHTML,
    };
  }, itemSelector);

##page.$

  const item = await page.$(itemSelector);
  // Unwrap one property of the element handle into a plain value.
  const readProp = async (name) => (await item.getProperty(name)).jsonValue();
  const data = {
    href: await readProp('href'),
    textContent: await readProp('textContent'),
    innerHTML: await readProp('innerHTML'),
  };

##page.$eval

  // Pick the three properties off the matched element and return them as a
  // plain object.
  const data = await page.$eval(
    itemSelector,
    ({ href, textContent, innerHTML }) => ({ href, textContent, innerHTML })
  );

#複数の要素の一つの属性をとる

##page.evaluate + querySelectorAll

  // Array.from converts the NodeList and maps each node to its text in one step.
  const datas = await page.evaluate(
    (selector) =>
      Array.from(document.querySelectorAll(selector), (node) => node.textContent),
    listSelector
  );

##page.$$

  const list = await page.$$(listSelector);
  const datas = [];
  // Handles are read one after another, in document order.
  for (const handle of list) {
    const textHandle = await handle.getProperty('textContent');
    datas.push(await textHandle.jsonValue());
  }

##page.$$eval

  // $$eval passes the matched elements to the callback as an array.
  const datas = await page.$$eval(listSelector, (anchors) =>
    anchors.map((anchor) => anchor.textContent)
  );

#複数の要素の複数の属性をとる

##page.evaluate + querySelectorAll

  // Build one plain object per matched anchor inside the page and return the
  // resulting array.
  const datas = await page.evaluate((selector) => {
    const anchors = document.querySelectorAll(selector);
    return Array.from(anchors, (anchor) => ({
      href: anchor.href,
      textContent: anchor.textContent,
      innerHTML: anchor.innerHTML,
    }));
  }, listSelector);

##page.$$

  const list = await page.$$(listSelector);
  // Unwrap one property of an element handle into a plain value.
  const readProp = async (handle, name) =>
    (await handle.getProperty(name)).jsonValue();
  const datas = [];
  for (const handle of list) {
    datas.push({
      href: await readProp(handle, 'href'),
      textContent: await readProp(handle, 'textContent'),
      innerHTML: await readProp(handle, 'innerHTML'),
    });
  }

##page.$$eval

  // Map every matched anchor to a plain object holding the three properties.
  const datas = await page.$$eval(listSelector, (anchors) =>
    anchors.map((anchor) => ({
      href: anchor.href,
      textContent: anchor.textContent,
      innerHTML: anchor.innerHTML,
    }))
  );

#確認用コード

const fs = require('fs');
const puppeteer = require('puppeteer');

/**
 * Verification script: runs each element-retrieval technique
 * (page.evaluate, page.$, page.$$, page.$eval, page.$$eval) against the same
 * page and prints a label followed by the value each technique produced.
 *
 * The browser is closed in a `finally` block so a failing selector does not
 * leave the browser process running, and a rejection of the whole run is
 * logged and reflected in the process exit code.
 */
(async () => {
  /** Print a case label and, on the next line, the value extracted for it. */
  const report = (label, value) => {
    console.log(label);
    console.log(value);
  };

  /** Unwrap one property of an element handle into a plain value. */
  const readProperty = async (handle, name) => {
    const propertyHandle = await handle.getProperty(name);
    return propertyHandle.jsonValue();
  };

  /** Read href / textContent / innerHTML from an element handle, in that order. */
  const readLinkProperties = async (handle) => ({
    href: await readProperty(handle, 'href'),
    textContent: await readProperty(handle, 'textContent'),
    innerHTML: await readProperty(handle, 'innerHTML'),
  });

  const browser = await puppeteer.launch({ args: ['--no-sandbox', '--disable-setuid-sandbox'] });

  try {
    const page = await browser.newPage();
    await page.goto('url you want to check');

    const itemSelector = "some selecter > ul > li:nth-child(1) > a";
    const listSelector = "some selecter > ul > li > a";

    ////////////////////////////////////////////////////////
    // One element, one attribute.
    // Callbacks given to evaluate/$eval/$$eval run inside the page, so they
    // are written inline and receive everything they need as arguments.

    const textByEvaluate = await page.evaluate(
      (selector) => document.querySelector(selector).textContent,
      itemSelector
    );
    report("one item one attribute", textByEvaluate);

    const itemForText = await page.$(itemSelector);
    const textByHandle = await readProperty(itemForText, 'textContent');
    report("one item one attribute using $", textByHandle);

    const textByEval = await page.$eval(itemSelector, (anchor) => anchor.textContent);
    report("one item one attribute using $eval", textByEval);

    ////////////////////////////////////////////////////////
    // One element, several attributes.

    const linkByEvaluate = await page.evaluate((selector) => {
      // Resolve the selector once and read all fields from the same element.
      const anchor = document.querySelector(selector);
      return {
        href: anchor.href,
        textContent: anchor.textContent,
        innerHTML: anchor.innerHTML,
      };
    }, itemSelector);
    report("one item some attributes", linkByEvaluate);

    const itemForLink = await page.$(itemSelector);
    const linkByHandle = await readLinkProperties(itemForLink);
    report("one item some attributes using $", linkByHandle);

    const linkByEval = await page.$eval(itemSelector, (anchor) => ({
      href: anchor.href,
      textContent: anchor.textContent,
      innerHTML: anchor.innerHTML,
    }));
    report("one item some attributes using $eval", linkByEval);

    ////////////////////////////////////////////////////////
    // Several elements, one attribute each.

    const textsByEvaluate = await page.evaluate(
      (selector) =>
        Array.from(document.querySelectorAll(selector), (anchor) => anchor.textContent),
      listSelector
    );
    report("some items one attribute", textsByEvaluate);

    const handlesForText = await page.$$(listSelector);
    const textsByHandle = [];
    for (const handle of handlesForText) {
      textsByHandle.push(await readProperty(handle, 'textContent'));
    }
    report("some items one attribute using $$", textsByHandle);

    const textsByEval = await page.$$eval(listSelector, (anchors) =>
      anchors.map((anchor) => anchor.textContent)
    );
    report("some items one attribute using $$eval", textsByEval);

    ////////////////////////////////////////////////////////
    // Several elements, several attributes each.

    const linksByEvaluate = await page.evaluate(
      (selector) =>
        Array.from(document.querySelectorAll(selector), (anchor) => ({
          href: anchor.href,
          textContent: anchor.textContent,
          innerHTML: anchor.innerHTML,
        })),
      listSelector
    );
    report("some items some attributes", linksByEvaluate);

    const handlesForLink = await page.$$(listSelector);
    const linksByHandle = [];
    for (const handle of handlesForLink) {
      linksByHandle.push(await readLinkProperties(handle));
    }
    report("some items some attributes using $$", linksByHandle);

    const linksByEval = await page.$$eval(listSelector, (anchors) =>
      anchors.map((anchor) => ({
        href: anchor.href,
        textContent: anchor.textContent,
        innerHTML: anchor.innerHTML,
      }))
    );
    report("some items some attributes using $$eval", linksByEval);

    ////////////////////////////////////////////////////////
  } finally {
    // Always shut the browser down, even when one of the lookups above throws.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
What you can do with signing up
278