Google Apps Scriptを使用したWebスクレイピングのサンプルです
今回作成したソースはcode2019-02-16-1.gsです.
内容
- 参考サイト
- 出力プレビュー
- code2019-02-16-1.gs
1.参考サイト
以下のサイトを参考に作成しました
Google Apps Scriptでスクレイピングする方法 - QiitaGoogle Apps Scriptで正規表現を使って必要な情報を抽出する最も簡単なスクリプト
Google apps scriptでスプレッドシートを取得する | GASでスプレッドシートを操作する
並べ替え|Google Apps Script入門
JAPAN-REIT.COM - 全ての投資家のための不動産投信情報ポータル REIT一覧(リートデータ)
☆毎月、高配当金・分配金を貰うための投資方法について | 幸せ配当計画
2.出力プレビュー
3.code2019-02-16-1.gs
function scraping() {
const preURL = 'http://www.japan-reit.com/meigara/';
const URL2 = 'http://www.japan-reit.com/list/rimawari/'
var sclaypingText = "";
var response = UrlFetchApp.fetch(URL2);
var html = response.getContentText('UTF-8');
var lines = html.split('\r\n');
var myRegexp1 = "\<table.*?>";
var myRegexp2 = "\</table>";
var myRegexp3 = "class=\".*\""
var find_table_class = '"meigara-table x-x-hidden"';
var find_table_class2 = '"simple list-rimawari"';
var line;
var classList = [];
var value;
var tableSize = [];
var meigaraTable;
var meigaraList = [];
const haitouOtiKankaku = 3;
var meigaraConstractor = function(){
this.meigaraId = '';
this.meigaraNm = '';
this.rimawari = '';
this.kessan = '';
this.haitouOti = [];
};
//html上の各テーブルの情報を取得
var tableTagList = getTableTagString(lines, classList);
for(var i=0; i<tableTagList.length; i++){
if(classList[i] == find_table_class2){
//Logger.log("class["+i+"]"+classList[i]);
//Logger.log("table["+i+"]"+tableTagList[i]);
meigaraTable = tableTagList[i];
//tableSize = getTableSize(tableTagList[i]);
//value = getTableTagValue(1,0,tableTagList[i]);
//Logger.log("size:"+tableSize);
//Logger.log("value:"+value);
}
}
//テーブルの情報から中身をセット
tableSize = getTableSize(meigaraTable);
for(var j=1; j<tableSize[0]; j++){
var mId = getTableTagValue(j,0,meigaraTable);
var mNm = getTableTagValue(j,1,meigaraTable);
mNm = (getTagValueList(mNm, 'a'))[0];
mNm = deleteCommentTag(mNm);
var mrimawari = getTableTagValue(j,4,meigaraTable);
var mkessan = getTableTagValue(j,9,meigaraTable);
var mObj = new meigaraConstractor();
mObj.meigaraId = mId;
mObj.meigaraNm = mNm;
mObj.rimawari = mrimawari;
mObj.kessan = mkessan;
mObj.haitouOti = mObj.kessan.split('/');
for(var j2=0; j2<mObj.haitouOti.length; j2++){
mObj.haitouOti[j2] = mObj.haitouOti[j2].replace(' ','');
mObj.haitouOti[j2] = parseInt(mObj.haitouOti[j2])+haitouOtiKankaku;
if(parseInt(mObj.haitouOti[j2]) > 12){
mObj.haitouOti[j2] = parseInt(mObj.haitouOti[j2])-12;
}
}
if(mObj.haitouOti.length >= 2){
if(mObj.haitouOti[0] > mObj.haitouOti[1]){
var temp = mObj.haitouOti[0];
mObj.haitouOti[0] = mObj.haitouOti[1];
mObj.haitouOti[1] = temp;
}
}
meigaraList.push(mObj);
}
makeHeader();
var spreadsheet = SpreadsheetApp.getActiveSpreadsheet();
var cellAnote;
var cell;
for(var k=0; k<meigaraList.length; k++){
cellAnote = getColAnote(1)+(k+2);
cell = spreadsheet.getRange(cellAnote);
cell.setValue(meigaraList[k].meigaraId);
cellAnote = getColAnote(2)+(k+2);
cell = spreadsheet.getRange(cellAnote);
cell.setValue(meigaraList[k].meigaraNm);
cellAnote = getColAnote(3)+(k+2);
cell = spreadsheet.getRange(cellAnote);
cell.setValue(meigaraList[k].kessan);
cellAnote = getColAnote(4)+(k+2);
cell = spreadsheet.getRange(cellAnote);
cell.setValue(meigaraList[k].rimawari);
for(var k2=0; k2<meigaraList[k].haitouOti.length; k2++){
cellAnote = getColAnote(5+k2)+(k+2)
cell = spreadsheet.getRange(cellAnote);
var t = meigaraList[k].haitouOti[k2];
cell.setValue(meigaraList[k].haitouOti[k2]);
}
}
SortSheet({column: 4, ascending: false});
SortSheet({column: 5, ascending: true});
Logger.log("meigaraList:"+meigaraList);
}
function makeHeader(){
var spreadsheet = SpreadsheetApp.getActiveSpreadsheet();
var cellAnote;
var cell;
cellAnote = getColAnote(1)+'1';
cell = spreadsheet.getRange(cellAnote);
cell.setValue("銘柄ID");
cellAnote = getColAnote(2)+'1';
cell = spreadsheet.getRange(cellAnote);
cell.setValue("銘柄名");
cellAnote = getColAnote(3)+'1';
cell = spreadsheet.getRange(cellAnote);
cell.setValue("決算月");
cellAnote = getColAnote(4)+'1';
cell = spreadsheet.getRange(cellAnote);
cell.setValue("利回り");
cellAnote = getColAnote(5)+'1';
cell = spreadsheet.getRange(cellAnote);
cell.setValue("配当落ち予想月1");
cellAnote = getColAnote(6)+'1';
cell = spreadsheet.getRange(cellAnote);
cell.setValue("配当落ち予想月2");
}
function SortSheet(row) {
var sheet = SpreadsheetApp.getActiveSheet();
var lastRow = sheet.getLastRow();
var lastCol = sheet.getLastColumn();
sheet.getRange(2, 1, lastRow, lastCol).sort(row);
}
function getColAnote(row){
var result;
if(row == 1){
return 'A';
}else if(row == 2){
return 'B';
}else if(row == 3){
return 'C';
}else if(row == 4){
return 'D';
}else if(row == 5){
return 'E';
}else if(row == 6){
return 'F';
}else if(row == 7){
return 'G';
}else if(row == 8){
return 'H';
}else if(row == 9){
return 'I';
}else if(row == 10){
return 'J';
}
return 'Z';
}
/*Spreadsheetタイトル台帳を作成*/
function newsheet(){
//createメソッドでスプレッドシートを作成
var ssNew = SpreadsheetApp.create("spreadsheetTest1");
return ssNew.getUrl();
}
function fetchData(str, pre, suf) {
var reg = new RegExp(pre + '.*' + suf);
if(str.match(reg) == null){
return null;
}
var data = str.match(reg)[0]
.replace(pre, '')
.replace(suf, '');
return data;
}
function getTableSize(tableTagStr){
var result = [0,0];
var myRegExp1 = /<tr.*?>.*?\<\/tr>/g;
var myRegExp2 = /<td.*?>.*?\<\/td>/g;
var myRegExpTHead1 = /<thead.*?>.*?\<\/thead>/g;
var myRegExpTHead2 = /<th.*?>.*?\<\/th>/g;
var HeaderList = tableTagStr.match(myRegExpTHead1);
var rowList = tableTagStr.match(myRegExp1);
if(HeaderList != null){
if(rowList != null){
result[0] = rowList.length;
var colHeaderList = rowList[0].match(myRegExpTHead2);
if(colHeaderList != null){
result[1] = colHeaderList.length;
}else{
result[1] = 0;
}
return result;
}else{
result[0] = 0;
result[1] = 0;
return result;
}
}
if(rowList != null){
result[0] = rowList.length;
var colList = rowList[rowIdx].match(myRegExp2);
if(colList != null){
result[1] = colList.length;
}else{
result[1] = 0
}
return result;
}else{
result[0] = 0;
result[1] = 0;
return result;
}
}
function getTableTagValue(rowIdx,colIdx,tableTagStr){
var myRegExp1 = /<tr.*?>.*?\<\/tr>/g;
var myRegExp2 = /<td.*?>.*?\<\/td>/g;
var myRegExpTHead1 = /<thead.*?>.*?\<\/thead>/g;
var myRegExpTHead2 = /<th.*?>.*?\<\/th>/g;
var HeaderList = tableTagStr.match(myRegExpTHead1);
var rowList = tableTagStr.match(myRegExp1);
if(HeaderList != null && rowIdx == 0){
if(rowList != null && rowList.length >= rowIdx){
var colHeaderList = rowList[rowIdx].match(myRegExpTHead2);
if(colHeaderList != null && colHeaderList.length >= colIdx){
return getTagValue(colHeaderList[colIdx], 'th');
}
}
}
if(rowList != null && rowList.length >= rowIdx){
var colList = rowList[rowIdx].match(myRegExp2);
if(colList != null && colList.length >= colIdx){
return getTagValue(colList[colIdx], 'td');
}
}
return null;
}
function getTagValue(str, tagName){
var startTagExp = new RegExp('<'+tagName+'.*?>','g');//'<'+tagName+'.*?>/g;
var matchedTag = str.match(startTagExp);
var result;
if(matchedTag != null){
result = str.replace(matchedTag[0],'');
}
return result.replace('\</'+tagName+'>','');
}
function deleteCommentTag(str){
var tagExp = new RegExp('<!--.*?-->','g');
var matchedTagList = str.match(tagExp);
var result = str;
for(var i=0; i<matchedTagList.length; i++){
result = result.replace(matchedTagList[i],'');
}
return result;
}
function getTagValueList(str, tagName){
var tagExp = new RegExp('<'+tagName+'.*?>'+'.*?'+'\</'+tagName+'>','g');//'<'+tagName+'.*?>/g;
var tagExp2 = new RegExp('<'+tagName+'.*?>','g');
var matchedTagList = str.match(tagExp);
var result = [];
var tagHead;
for(var i=0; i<matchedTagList.length; i++){
tagHead = matchedTagList[i].match(tagExp2);
result.push(matchedTagList[i].replace(tagHead[0],'').replace('\</'+tagName+'>',''));
}
return result;
}
function getTableTagString(lines, classList){
var resultList = [];
var myRegExp1 = "\<table.*>";
var myRegExp2 = "\</table>";
var myRegExp3 = "class=\".*\""
var line1, line2;
var j;
var resultStr;
for(var i=0; i<lines.length; i++){
line1 = lines[i];
if(line1.match(myRegExp1) != null){
if(line1.match(myRegExp3) != null){
var matchedStrList = line1.match(myRegExp3);
classList.push(matchedStrList[0].replace('class=',''));
}else{
classList.push("");
}
resultStr = "";
for(j=i+1; j<lines.length; j++){
line2 = lines[j];
if(line2.match(myRegExp2) == null){
resultStr += line2.replace('\r\n','');
}else{
resultList.push(resultStr);
resultStr = "";
i = j;
break;
}
}
}
}
return resultList;
}