puppeteerでの要素の取得方法


概要

puppeteerでの要素の取得のための関数は

page.evaluate + (querySelector|querySelectorAll)

page.$
page.$$
page.$eval
page.$$eval

とありますが、実際にそれらを使うためにどう書くかをまとめています。速度等は検証できていませんが、$eval / $$eval を使うとそれぞれコードがシンプルになるので良いかと思います。

APIの動作の詳細はpuppeteerのAPIドキュメントを参照してください。

https://github.com/GoogleChrome/puppeteer/blob/master/docs/api.md


前提

以下のようなliの中にアンカータグが入っているページを取得することを想定しています。

<ul>

<li><a href="some link">some HTML</a></li>
<li><a href="some link">some HTML</a></li>
<li><a href="some link">some HTML</a></li>
...
</ul>

以下のようにページ読み込みを準備します。

  // Launch the browser with the Chromium sandbox flags disabled and open the target page.
  const browser = await puppeteer.launch({ args: ['--no-sandbox', '--disable-setuid-sandbox'] });
  const page = await browser.newPage();
  await page.goto('url you want to check');

  // itemSelector targets the anchor in the first <li>; listSelector targets the anchor in every <li>.
  const itemSelector = "some selecter > ul > li:nth-child(1) > a";
  const listSelector = "some selecter > ul > li > a";


一つの要素から一つのプロパティをとる


page.evaluate + querySelector

  // Run querySelector inside the page and send back the matched element's text.
  const data = await page.evaluate(
    (selector) => document.querySelector(selector).textContent,
    itemSelector
  );


page.$

  // page.$ resolves to a handle for the first matching element.
  const item = await page.$(itemSelector);
  // getProperty resolves to a handle for the property; jsonValue serializes it back to Node.
  const textHandle = await item.getProperty('textContent');
  const data = await textHandle.jsonValue();


page.$eval

  // $eval passes the first element matching the selector to the callback, which runs in the page.
  const data = await page.$eval(itemSelector, (item) => item.textContent);


一つの要素から複数のプロパティをとる


page.evaluate + querySelector

  // Look the element up once in the page context, then return several of its properties.
  const data = await page.evaluate((selector) => {
    const item = document.querySelector(selector);
    return {
      href: item.href,
      textContent: item.textContent,
      innerHTML: item.innerHTML
    };
  }, itemSelector);


page.$

  const item = await page.$(itemSelector);
  // Fetch one property of the element handle and serialize its value back to Node.
  const readProperty = async (name) => (await item.getProperty(name)).jsonValue();
  const data = {
    href: await readProperty('href'),
    textContent: await readProperty('textContent'),
    innerHTML: await readProperty('innerHTML')
  };


page.$eval

  // The callback runs in the page; the returned plain object is serialized back to Node.
  const data = await page.$eval(itemSelector, (item) => ({
    href: item.href,
    textContent: item.textContent,
    innerHTML: item.innerHTML
  }));


複数の要素からそれぞれ一つのプロパティをとる


page.evaluate + querySelectorAll

  // Array.from converts the NodeList and maps each element to its text in one step.
  const datas = await page.evaluate(
    (selector) => Array.from(document.querySelectorAll(selector), (item) => item.textContent),
    listSelector
  );


page.$$

  // page.$$ resolves to an array of element handles, one per match.
  const list = await page.$$(listSelector);
  const datas = [];
  // Each property read is a round trip to the page, so the handles are processed one at a time.
  for (const item of list) {
    datas.push(await (await item.getProperty('textContent')).jsonValue());
  }


page.$$eval

  // $$eval passes an array of all matching elements to the callback, which runs in the page.
  const datas = await page.$$eval(listSelector, (list) => list.map((item) => item.textContent));


複数の要素からそれぞれ複数のプロパティをとる


page.evaluate + querySelectorAll

  // Map every matched element to a plain object holding the properties of interest.
  const datas = await page.evaluate(
    (selector) => Array.from(document.querySelectorAll(selector), (item) => ({
      href: item.href,
      textContent: item.textContent,
      innerHTML: item.innerHTML
    })),
    listSelector
  );


page.$$

  const list = await page.$$(listSelector);
  const datas = [];
  // Read the three properties of each element handle in turn and collect them as one record.
  for (const item of list) {
    datas.push({
      href: await (await item.getProperty('href')).jsonValue(),
      textContent: await (await item.getProperty('textContent')).jsonValue(),
      innerHTML: await (await item.getProperty('innerHTML')).jsonValue()
    });
  }


page.$$eval

  // The callback receives all matching elements and returns one plain record per element.
  const datas = await page.$$eval(listSelector, (list) => list.map((item) => ({
    href: item.href,
    textContent: item.textContent,
    innerHTML: item.innerHTML
  })));


確認用コード

const fs = require('fs');

const puppeteer = require('puppeteer');

// Verification script: runs every retrieval pattern from the article against one page
// and prints a label followed by the value each pattern produced.
(async () => {
  const browser = await puppeteer.launch({ args: ['--no-sandbox', '--disable-setuid-sandbox'] });

  // Print a label line and then the value obtained for that case.
  const show = (label, value) => {
    console.log(label);
    console.log(value);
  };

  try {
    const page = await browser.newPage();
    await page.goto('url you want to check');

    // itemSelector targets the anchor in the first <li>; listSelector targets the anchor in every <li>.
    const itemSelector = "some selecter > ul > li:nth-child(1) > a";
    const listSelector = "some selecter > ul > li > a";

    // Each case lives in its own block so the result names can be reused with const.

    ////////////////////////////////////////////////////////
    // One element, one property.

    {
      const data = await page.evaluate(
        (selector) => document.querySelector(selector).textContent,
        itemSelector
      );
      show("one item one attribute", data);
    }

    {
      const item = await page.$(itemSelector);
      const data = await (await item.getProperty('textContent')).jsonValue();
      show("one item one attribute using $", data);
    }

    {
      const data = await page.$eval(itemSelector, (item) => item.textContent);
      show("one item one attribute using $eval", data);
    }

    ////////////////////////////////////////////////////////
    // One element, several properties.

    {
      const data = await page.evaluate((selector) => {
        // Look the element up once instead of once per property.
        const item = document.querySelector(selector);
        return {
          href: item.href,
          textContent: item.textContent,
          innerHTML: item.innerHTML
        };
      }, itemSelector);
      show("one item some attributes", data);
    }

    {
      const item = await page.$(itemSelector);
      const data = {
        href: await (await item.getProperty('href')).jsonValue(),
        textContent: await (await item.getProperty('textContent')).jsonValue(),
        innerHTML: await (await item.getProperty('innerHTML')).jsonValue()
      };
      show("one item some attributes using $", data);
    }

    {
      const data = await page.$eval(itemSelector, (item) => ({
        href: item.href,
        textContent: item.textContent,
        innerHTML: item.innerHTML
      }));
      show("one item some attributes using $eval", data);
    }

    ////////////////////////////////////////////////////////
    // Several elements, one property each.

    {
      const datas = await page.evaluate(
        (selector) => Array.from(document.querySelectorAll(selector), (item) => item.textContent),
        listSelector
      );
      show("some items one attribute", datas);
    }

    {
      const list = await page.$$(listSelector);
      const datas = [];
      for (const item of list) {
        datas.push(await (await item.getProperty('textContent')).jsonValue());
      }
      show("some items one attribute using $$", datas);
    }

    {
      const datas = await page.$$eval(listSelector, (list) => list.map((item) => item.textContent));
      show("some items one attribute using $$eval", datas);
    }

    ////////////////////////////////////////////////////////
    // Several elements, several properties each.

    {
      const datas = await page.evaluate(
        (selector) => Array.from(document.querySelectorAll(selector), (item) => ({
          href: item.href,
          textContent: item.textContent,
          innerHTML: item.innerHTML
        })),
        listSelector
      );
      show("some items some attributes", datas);
    }

    {
      const list = await page.$$(listSelector);
      const datas = [];
      for (const item of list) {
        datas.push({
          href: await (await item.getProperty('href')).jsonValue(),
          textContent: await (await item.getProperty('textContent')).jsonValue(),
          innerHTML: await (await item.getProperty('innerHTML')).jsonValue()
        });
      }
      show("some items some attributes using $$", datas);
    }

    {
      const datas = await page.$$eval(listSelector, (list) => list.map((item) => ({
        href: item.href,
        textContent: item.textContent,
        innerHTML: item.innerHTML
      })));
      show("some items some attributes using $$eval", datas);
    }

    ////////////////////////////////////////////////////////
  } finally {
    // Always shut the browser down, even when a step above throws, and wait for it to finish.
    await browser.close();
  }
})().catch((err) => {
  // Report the failure instead of leaving the promise rejection unhandled.
  console.error(err);
  process.exitCode = 1;
});