#概要
puppeteerでの要素の取得のための関数は
page.evaluate + (querySelector|querySelectorAll)
page.$
page.$$
page.$eval
page.$$eval
とありますが、実際にそれらを使うためにどう書くかをまとめています。速度等は検証できていませんが、どのケースでも $eval / $$eval を使うのが最もコードがシンプルになるので良いかと思います。
各APIの動作の詳細はpuppeteerのAPIドキュメントを参照してください。
https://pptr.dev/api/
#前提
以下のようなliの中にアンカータグが入っているページを取得することを想定しています。
<ul>
<li><a href="some link">some HTML</a></li>
<li><a href="some link">some HTML</a></li>
<li><a href="some link">some HTML</a></li>
...
</ul>
以下のようにページ読み込みを準備します。
// Launch a browser, open a new page and navigate to the page to inspect.
const browser = await puppeteer.launch({
  args: ['--no-sandbox', '--disable-setuid-sandbox'],
});
const page = await browser.newPage();
await page.goto('url you want to check');

// Selector for a single anchor: the one inside the first <li>.
const itemSelector = "some selecter > ul > li:nth-child(1) > a";
// Selector for every anchor inside the list.
const listSelector = "some selecter > ul > li > a";
#一つの要素から一つのプロパティをとる
##page.evaluate + querySelector
// Run querySelector inside the page and return the matched element's text.
// The selector is passed as an argument because the callback runs in the page.
var data = await page.evaluate(
  (selector) => document.querySelector(selector).textContent,
  itemSelector
);
##page.$
// Get a handle to the matched element, then read one property through it.
var item = await page.$(itemSelector);
const textContentHandle = await item.getProperty('textContent');
// jsonValue() turns the property handle into a plain value.
var data = await textContentHandle.jsonValue();
##page.$eval
// $eval hands the matched element to the callback; return its text.
var data = await page.$eval(itemSelector, (item) => item.textContent);
#一つの要素から複数のプロパティをとる
##page.evaluate + querySelector
// Read several properties of one element inside the page.
var data = await page.evaluate((selector) => {
  // Query the DOM once and take every property from that same node,
  // instead of repeating querySelector for each property.
  const { href, textContent, innerHTML } = document.querySelector(selector);
  return { href, textContent, innerHTML };
}, itemSelector);
##page.$
// Get a handle to the matched element, then read each property in turn.
var item = await page.$(itemSelector);
var data = {};
for (const name of ['href', 'textContent', 'innerHTML']) {
  const propertyHandle = await item.getProperty(name);
  data[name] = await propertyHandle.jsonValue();
}
##page.$eval
// $eval hands the matched element to the callback; pick the wanted properties.
var data = await page.$eval(
  itemSelector,
  ({ href, textContent, innerHTML }) => ({ href, textContent, innerHTML })
);
#複数の要素から一つずつプロパティをとる
##page.evaluate + querySelectorAll
// Collect the text of every matching element inside the page.
var datas = await page.evaluate(
  // Array.from converts the NodeList and maps each node in one step.
  (selector) => Array.from(document.querySelectorAll(selector), (node) => node.textContent),
  listSelector
);
##page.$$
// Get handles for every matching element and read one property from each.
var list = await page.$$(listSelector);
var datas = [];
for (const handle of list) {
  const propertyHandle = await handle.getProperty('textContent');
  datas.push(await propertyHandle.jsonValue());
}
##page.$$eval
// $$eval hands the array of matched elements to the callback.
var datas = await page.$$eval(listSelector, (list) => list.map((node) => node.textContent));
#複数の要素から複数のプロパティをとる
##page.evaluate + querySelectorAll
// Collect several properties from every matching element inside the page.
var datas = await page.evaluate((selector) => {
  // Builds a plain object from the properties of one element.
  const toRecord = ({ href, textContent, innerHTML }) => ({ href, textContent, innerHTML });
  return Array.from(document.querySelectorAll(selector), toRecord);
}, listSelector);
##page.$$
// Get handles for every matching element and read several properties from each.
var list = await page.$$(listSelector);
var datas = [];
for (const handle of list) {
  var data = {};
  for (const name of ['href', 'textContent', 'innerHTML']) {
    const propertyHandle = await handle.getProperty(name);
    data[name] = await propertyHandle.jsonValue();
  }
  datas.push(data);
}
##page.$$eval
// $$eval hands the array of matched elements to the callback;
// map each element to a plain object with the wanted properties.
var datas = await page.$$eval(listSelector, (list) =>
  list.map(({ href, textContent, innerHTML }) => ({ href, textContent, innerHTML }))
);
#確認用コード
const fs = require('fs');
const puppeteer = require('puppeteer');
/**
 * Verification script: runs every retrieval pattern against the same page
 * and prints each result with a label so the outputs can be compared.
 *
 * Patterns covered, for one property and for several properties:
 *   - page.evaluate + querySelector / querySelectorAll
 *   - page.$ / page.$$ (element handles + getProperty + jsonValue)
 *   - page.$eval / page.$$eval
 *
 * The browser is always closed, even when a step throws (for example when
 * a selector matches nothing), and any failure is reported with a non-zero
 * exit code instead of being left as an unhandled rejection.
 */
(async () => {
  const browser = await puppeteer.launch({ args: ['--no-sandbox', '--disable-setuid-sandbox'] });
  try {
    const page = await browser.newPage();
    await page.goto('url you want to check');
    const itemSelector = "some selecter > ul > li:nth-child(1) > a";
    const listSelector = "some selecter > ul > li > a";

    // Properties read in the "several properties" variants.
    const propertyNames = ['href', 'textContent', 'innerHTML'];

    // Prints a label followed by the value it describes.
    const report = (label, value) => {
      console.log(label);
      console.log(value);
    };

    // Reads one property of an element handle as a plain value.
    const readProperty = async (handle, name) => {
      const propertyHandle = await handle.getProperty(name);
      return propertyHandle.jsonValue();
    };

    // Reads every property in propertyNames from an element handle, in order.
    const readProperties = async (handle) => {
      const record = {};
      for (const name of propertyNames) {
        record[name] = await readProperty(handle, name);
      }
      return record;
    };

    ////////////////////////////////////////////////////////
    // One element, one property.
    report(
      "one item one attribute",
      await page.evaluate((selector) => document.querySelector(selector).textContent, itemSelector)
    );

    const singleHandle = await page.$(itemSelector);
    report("one item one attribute using $", await readProperty(singleHandle, 'textContent'));

    report(
      "one item one attribute using $eval",
      await page.$eval(itemSelector, (item) => item.textContent)
    );

    ////////////////////////////////////////////////////////
    // One element, several properties.
    report(
      "one item some attributes",
      await page.evaluate((selector) => {
        // Query once and take every property from the same node.
        const { href, textContent, innerHTML } = document.querySelector(selector);
        return { href, textContent, innerHTML };
      }, itemSelector)
    );

    const itemHandle = await page.$(itemSelector);
    report("one item some attributes using $", await readProperties(itemHandle));

    report(
      "one item some attributes using $eval",
      await page.$eval(itemSelector, ({ href, textContent, innerHTML }) => ({
        href,
        textContent,
        innerHTML,
      }))
    );

    ////////////////////////////////////////////////////////
    // Several elements, one property each.
    report(
      "some items one attribute",
      await page.evaluate(
        (selector) => Array.from(document.querySelectorAll(selector), (node) => node.textContent),
        listSelector
      )
    );

    const textHandles = await page.$$(listSelector);
    const texts = [];
    // Sequential on purpose: keeps the same read order as the list.
    for (const handle of textHandles) {
      texts.push(await readProperty(handle, 'textContent'));
    }
    report("some items one attribute using $$", texts);

    report(
      "some items one attribute using $$eval",
      await page.$$eval(listSelector, (list) => list.map((node) => node.textContent))
    );

    ////////////////////////////////////////////////////////
    // Several elements, several properties each.
    report(
      "some items some attributes",
      await page.evaluate(
        (selector) =>
          Array.from(document.querySelectorAll(selector), ({ href, textContent, innerHTML }) => ({
            href,
            textContent,
            innerHTML,
          })),
        listSelector
      )
    );

    const recordHandles = await page.$$(listSelector);
    const records = [];
    for (const handle of recordHandles) {
      records.push(await readProperties(handle));
    }
    report("some items some attributes using $$", records);

    report(
      "some items some attributes using $$eval",
      await page.$$eval(listSelector, (list) =>
        list.map(({ href, textContent, innerHTML }) => ({ href, textContent, innerHTML }))
      )
    );
    ////////////////////////////////////////////////////////
  } finally {
    // Always release the browser process, even after a failure above.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});