環境・ツール
- macOS High Sierra 10.13.2
- PHP 7.1.7 -> PHP 5.6.35
- Composer version 1.6.3
環境設定
- PHPバージョン確認
$ php -v
PHP 7.1.7 (cli) (built: Jul 15 2017 18:08:09) ( NTS )
Copyright (c) 1997-2017 The PHP Group
Zend Engine v3.1.0, Copyright (c) 1998-2017 Zend Technologies
- PHP5.6インストール
$ brew install php56
- PHP7.1からPHP5.6に切り替え
$ vim ~/.bash_profile
~/.bash_profile
# php5.6 #
export PATH=/usr/local/Cellar/php\@5.6/5.6.35/bin:$PATH
$ source ~/.bash_profile
$ php -v
PHP 5.6.35 (cli) (built: Mar 31 2018 20:21:31)
Copyright (c) 1997-2016 The PHP Group
Zend Engine v2.6.0, Copyright (c) 1998-2016 Zend Technologies
with Zend OPcache v7.0.6-dev, Copyright (c) 1999-2016, by Zend Technologies
- Composerインストール
$ brew install homebrew/php/composer
$ composer -v
______
/ ____/___ ____ ___ ____ ____ ________ _____
/ / / __ \/ __ `__ \/ __ \/ __ \/ ___/ _ \/ ___/
/ /___/ /_/ / / / / / / /_/ / /_/ (__ ) __/ /
\____/\____/_/ /_/ /_/ .___/\____/____/\___/_/
/_/
Composer version 1.6.3 2018-01-31 16:28:17
$ composer init
↑ composer.json を自分で用意する場合、composer init は不要だったかもしれない
- php-phantomjsインストール
composer.json
{
"config": {
"bin-dir": "bin"
},
"scripts": {
"post-install-cmd": [
"PhantomInstaller\\Installer::installPhantomJS"
],
"post-update-cmd": [
"PhantomInstaller\\Installer::installPhantomJS"
]
}
}
$ composer require "jonnyw/php-phantomjs:4.*"
コーディング
scrape-php-phantomjs.php
<?php
// Fetches one page through the php-phantomjs client, parses the returned
// markup with DOMDocument, and dumps a list of title/price pairs extracted
// via XPath.
require 'vendor/autoload.php';
use JonnyW\PhantomJs\Client;
use JonnyW\PhantomJs\DependencyInjection\ServiceContainer;

$client = Client::getInstance();
$messageFactory = $client->getMessageFactory();
$request = $messageFactory->createRequest();
$response = $messageFactory->createResponse();

$url = 'URL';
$request->setUrl($url);
$client->send($request, $response);

$htmlstr = (string) $response->getContent();
$entries = [];

if ($htmlstr === '') {
    // Nothing to parse: report it instead of silently feeding an empty
    // string to the HTML parser. The (empty) result is still dumped below.
    error_log(sprintf('No content received from %s (status: %s)', $url, $response->getStatus()));
} else {
    // Collect libxml's markup warnings internally rather than silencing
    // every error with "@"; real-world HTML is rarely well-formed.
    $previousSetting = libxml_use_internal_errors(true);
    $dom = new DOMDocument;
    $dom->loadHTML($htmlstr);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    $xpath = new DOMXPath($dom);
    $q_product = '//li[@class="CLASS NAME"]';
    foreach ($xpath->query($q_product) as $node) {
        // Both expressions are evaluated relative to the current <li> node.
        $entries[] = [
            'title' => $xpath->evaluate('string(.//h2[@class="CLASS NAME"]/a)', $node),
            'price' => $xpath->evaluate('string(.//span[@class="CLASS NAME"][1])', $node),
        ];
    }
}

var_dump($entries);
?>
- スクリプト実行時Warning発生
Declaration of JonnyW\..\ServiceContainer::load() should be compatible with Symfony\..\Container::load($file)
Update ServiceContainer.php #217でServiceContainer.phpを最新化して解決
おまけ(php-phantomjsでWebスクレイピングしてExcelで入出力)
- PhpSpreadsheetインストール
$ composer require phpoffice/phpspreadsheet
- PhpSpreadsheetでのファイル読み込み、ファイル書き込み
scrape-php-phantomjs-spreadsheet.php
<?php
// For rows 2-3 of example.xlsx: reads a code from column A of the first
// sheet, fetches the matching page through the php-phantomjs client,
// extracts three values via XPath, writes them to columns B/C of the first
// sheet and column D of the second sheet, then saves the workbook in place.
require 'vendor/autoload.php';
use JonnyW\PhantomJs\Client;
use JonnyW\PhantomJs\DependencyInjection\ServiceContainer;
use PhpOffice\PhpSpreadsheet\Writer\Xlsx as Writer;
use PhpOffice\PhpSpreadsheet\Reader\Xlsx as Reader;

$reader = new Reader();
$spreadsheet = $reader->load('example.xlsx');

// Loop-invariant lookups are done once up front. Resolving both sheets here
// also means a missing second sheet is detected before any page is fetched.
$sheet0 = $spreadsheet->getSheet(0);
$sheet1 = $spreadsheet->getSheet(1);
$client = Client::getInstance();
$messageFactory = $client->getMessageFactory();

// Collect libxml's markup warnings internally rather than silencing every
// error with "@"; real-world HTML is rarely well-formed.
$previousSetting = libxml_use_internal_errors(true);

for ($i = 2; $i <= 3; $i++) {
    $code = $sheet0->getCell('A' . $i)->getValue();
    // Encode the cell value so spaces, "&", "#" etc. cannot break the query string.
    $url = 'http://www.example.com?code=' . urlencode((string) $code);

    $request = $messageFactory->createRequest();
    $response = $messageFactory->createResponse();
    $request->setUrl($url);
    $client->send($request, $response);

    $htmlstr = (string) $response->getContent();
    if ($htmlstr === '') {
        // Leave this row untouched rather than parsing an empty document.
        error_log(sprintf('Row %d: no content received from %s (status: %s)', $i, $url, $response->getStatus()));
        continue;
    }

    $dom = new DOMDocument;
    $dom->loadHTML($htmlstr);
    libxml_clear_errors();
    $xpath = new DOMXPath($dom);

    // Stays null when the container element is not found. If several nodes
    // match, the last one wins (same as the original behaviour).
    $entries = null;
    $q_1 = '//div[@id="ID NAME"]';
    foreach ($xpath->query($q_1) as $node) {
        $entries = [
            1 => $xpath->evaluate('string(.//div[@class="CLASS NAME"]/table/tbody/tr[XXX]/td[XXX])', $node),
            2 => $xpath->evaluate('string(.//div[@class="CLASS NAME"]/table/tbody/tr[XXX]/td[XXX])', $node),
            3 => $xpath->evaluate('string(.//table[XXX]/tbody/tr[XXX]/td[XXX])', $node),
        ];
    }

    if ($entries === null) {
        // Without this guard the writes below would hit undefined indexes
        // and overwrite the cells with null.
        error_log(sprintf('Row %d: no element matched %s at %s', $i, $q_1, $url));
        continue;
    }

    $sheet0->setCellValue('B' . $i, $entries[1]);
    $sheet0->setCellValue('C' . $i, $entries[2]);
    $sheet1->setCellValue('D' . $i, $entries[3]);
}

libxml_use_internal_errors($previousSetting);

// Overwrites the input workbook with the updated cells.
$writer = new Writer($spreadsheet);
$writer->save('example.xlsx');
?>
参考にしたサイト
PHPネイティブのDOMによるスクレイピング入門
Macにhomebrewでcomposerをインストール
PHP開発でComposerを使わないなんてありえない!基礎編
PHP PhantomJS を使ってPHPでヘッドレスブラウジング
GitHub - jonnnnyw/php-phantomjs: Execute PhantomJS commands through PHP
PHP PhantomJS
Amazonの検索結果をスクレイピング
PHPExcelが非推奨になったので後継のPhpSpreadsheetを使ってみる
[GitHub - PHPOffice/PhpSpreadsheet: A pure PHP library for reading and writing spreadsheet files](https://github.com/PHPOffice/PhpSpreadsheet)
PhpSpreadsheet Documentation