2
2

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

More than 5 years have passed since last update.

GASスクレイピングサンプル

Posted at

Google Apps Scriptを使用したWebスクレイピングのサンプルです
今回作成したソースはcode2019-02-16-1.gsです.

内容

  1. 参考サイト
  2. 出力プレビュー
  3. code2019-02-16-1.gs

1.参考サイト

以下のサイトを参考に作成しました

Google Apps Scriptでスクレイピングする方法 - Qiita
Google Apps Scriptで正規表現を使って必要な情報を抽出する最も簡単なスクリプト
Google apps scriptでスプレッドシートを取得する | GASでスプレッドシートを操作する
並べ替え|Google Apps Script入門
JAPAN-REIT.COM - 全ての投資家のための不動産投信情報ポータル REIT一覧(リートデータ)
☆毎月、高配当金・分配金を貰うための投資方法について | 幸せ配当計画

2.出力プレビュー


3.code2019-02-16-1.gs



function scraping() {
 
  const preURL = 'http://www.japan-reit.com/meigara/';
  const URL2 = 'http://www.japan-reit.com/list/rimawari/'
  
  var sclaypingText = "";
  
  var response = UrlFetchApp.fetch(URL2);
  var html = response.getContentText('UTF-8');
  var lines = html.split('\r\n');
  
  var myRegexp1 = "\<table.*?>";
  var myRegexp2 = "\</table>";
  var myRegexp3 = "class=\".*\""
  var find_table_class = '"meigara-table x-x-hidden"';
  var find_table_class2 = '"simple list-rimawari"';
  var line;
  var classList = [];
  var value;
  var tableSize = [];
  var meigaraTable;
  var meigaraList = [];
  const haitouOtiKankaku = 3;
  
  var meigaraConstractor = function(){
    this.meigaraId = '';
    this.meigaraNm = '';
    this.rimawari = '';
    this.kessan = '';
    this.haitouOti = [];
  };
  
  //html上の各テーブルの情報を取得
  var tableTagList = getTableTagString(lines, classList);
  for(var i=0; i<tableTagList.length; i++){
    if(classList[i] == find_table_class2){
    //Logger.log("class["+i+"]"+classList[i]);
    //Logger.log("table["+i+"]"+tableTagList[i]);
     meigaraTable = tableTagList[i];
     //tableSize = getTableSize(tableTagList[i]);
     //value = getTableTagValue(1,0,tableTagList[i]);
     //Logger.log("size:"+tableSize);
     //Logger.log("value:"+value);
    }       
  }
  
  
  //テーブルの情報から中身をセット
  tableSize = getTableSize(meigaraTable);
  for(var j=1; j<tableSize[0]; j++){
    var mId = getTableTagValue(j,0,meigaraTable);
    var mNm = getTableTagValue(j,1,meigaraTable);
        mNm = (getTagValueList(mNm, 'a'))[0];
        mNm = deleteCommentTag(mNm);  
    var mrimawari = getTableTagValue(j,4,meigaraTable);
    var mkessan = getTableTagValue(j,9,meigaraTable);
    var mObj = new meigaraConstractor();
    mObj.meigaraId = mId;
    mObj.meigaraNm = mNm;
    mObj.rimawari = mrimawari;
    mObj.kessan = mkessan;
    
    mObj.haitouOti = mObj.kessan.split('/');
    for(var j2=0; j2<mObj.haitouOti.length; j2++){
      mObj.haitouOti[j2] = mObj.haitouOti[j2].replace(' ','');
      mObj.haitouOti[j2] = parseInt(mObj.haitouOti[j2])+haitouOtiKankaku;
      
      if(parseInt(mObj.haitouOti[j2]) > 12){
        mObj.haitouOti[j2] = parseInt(mObj.haitouOti[j2])-12;
      }
    }
    if(mObj.haitouOti.length >= 2){
      if(mObj.haitouOti[0] > mObj.haitouOti[1]){
        var temp = mObj.haitouOti[0];
        mObj.haitouOti[0] = mObj.haitouOti[1];
        mObj.haitouOti[1] = temp;
      }
    }
    
    meigaraList.push(mObj);    
  }

  
  makeHeader();
  var spreadsheet = SpreadsheetApp.getActiveSpreadsheet();  
  var cellAnote;
  var cell;
  for(var k=0; k<meigaraList.length; k++){
    
    cellAnote = getColAnote(1)+(k+2);
    cell = spreadsheet.getRange(cellAnote);
    cell.setValue(meigaraList[k].meigaraId);
    
    cellAnote = getColAnote(2)+(k+2);
    cell = spreadsheet.getRange(cellAnote);
    cell.setValue(meigaraList[k].meigaraNm);

    cellAnote = getColAnote(3)+(k+2);
    cell = spreadsheet.getRange(cellAnote);
    cell.setValue(meigaraList[k].kessan);

    cellAnote = getColAnote(4)+(k+2);
    cell = spreadsheet.getRange(cellAnote);
    cell.setValue(meigaraList[k].rimawari); 
    
    for(var k2=0; k2<meigaraList[k].haitouOti.length; k2++){
      cellAnote = getColAnote(5+k2)+(k+2)
      cell = spreadsheet.getRange(cellAnote);
      var t = meigaraList[k].haitouOti[k2];
      cell.setValue(meigaraList[k].haitouOti[k2]);
    }
    
  }
  SortSheet({column: 4, ascending: false});
  SortSheet({column: 5, ascending: true});

  Logger.log("meigaraList:"+meigaraList);
  
  
}

function makeHeader(){
  var spreadsheet = SpreadsheetApp.getActiveSpreadsheet();  
  var cellAnote;
  var cell;
  
  cellAnote = getColAnote(1)+'1';
  cell = spreadsheet.getRange(cellAnote);
  cell.setValue("銘柄ID");
  
  cellAnote = getColAnote(2)+'1';
  cell = spreadsheet.getRange(cellAnote);
  cell.setValue("銘柄名");
  
  cellAnote = getColAnote(3)+'1';
  cell = spreadsheet.getRange(cellAnote);
  cell.setValue("決算月");
  
  cellAnote = getColAnote(4)+'1';
  cell = spreadsheet.getRange(cellAnote);
  cell.setValue("利回り");
  
  cellAnote = getColAnote(5)+'1';
  cell = spreadsheet.getRange(cellAnote);
  cell.setValue("配当落ち予想月1");
  
  cellAnote = getColAnote(6)+'1';
  cell = spreadsheet.getRange(cellAnote);
  cell.setValue("配当落ち予想月2");
}

function SortSheet(row) {
  var sheet = SpreadsheetApp.getActiveSheet();
  var lastRow = sheet.getLastRow();
  var lastCol = sheet.getLastColumn();
  sheet.getRange(2, 1, lastRow, lastCol).sort(row);
}
function getColAnote(row){
  var result;
  
  if(row == 1){
    return 'A';
  }else if(row == 2){
    return 'B';
  }else if(row == 3){
    return 'C';
  }else if(row == 4){
    return 'D';
  }else if(row == 5){
    return 'E';
  }else if(row == 6){
    return 'F';
  }else if(row == 7){
    return 'G';
  }else if(row == 8){
    return 'H';
  }else if(row == 9){
    return 'I';
  }else if(row == 10){
    return 'J';
  }
  
  return 'Z';
}
/*Spreadsheetタイトル台帳を作成*/
function newsheet(){
 
//createメソッドでスプレッドシートを作成
 var ssNew = SpreadsheetApp.create("spreadsheetTest1");
 
 return ssNew.getUrl();
 
}

function fetchData(str, pre, suf) {
  var reg = new RegExp(pre + '.*' + suf);
  if(str.match(reg) == null){
    return null;
  }
  
  var data = str.match(reg)[0]
    .replace(pre, '')
    .replace(suf, '');
  return data;  
}

function getTableSize(tableTagStr){
  var result = [0,0];
  var myRegExp1 = /<tr.*?>.*?\<\/tr>/g;
  var myRegExp2 = /<td.*?>.*?\<\/td>/g;
  var myRegExpTHead1 = /<thead.*?>.*?\<\/thead>/g;
  var myRegExpTHead2 =  /<th.*?>.*?\<\/th>/g;
 

  var HeaderList = tableTagStr.match(myRegExpTHead1);
  var rowList = tableTagStr.match(myRegExp1);
  
  
  if(HeaderList != null){
    if(rowList != null){
      result[0] =  rowList.length;
      var colHeaderList = rowList[0].match(myRegExpTHead2);
      if(colHeaderList != null){
        result[1] = colHeaderList.length; 
      }else{
        result[1] = 0;
      }
      return result;
    }else{
      result[0] = 0;
      result[1] = 0;
      return result;
    }
  }
  
  if(rowList != null){
    result[0] = rowList.length;
    var colList = rowList[rowIdx].match(myRegExp2);
    if(colList != null){
      result[1] = colList.length;
    }else{
      result[1] = 0
    }
    return result;
  }else{
    result[0] = 0;
    result[1] = 0;
    return result;
  }
}
function getTableTagValue(rowIdx,colIdx,tableTagStr){
  var myRegExp1 = /<tr.*?>.*?\<\/tr>/g;
  var myRegExp2 = /<td.*?>.*?\<\/td>/g;
  var myRegExpTHead1 = /<thead.*?>.*?\<\/thead>/g;
  var myRegExpTHead2 =  /<th.*?>.*?\<\/th>/g;
 

  var HeaderList = tableTagStr.match(myRegExpTHead1);
  var rowList = tableTagStr.match(myRegExp1);
  
  
  if(HeaderList != null &&  rowIdx == 0){
    if(rowList != null && rowList.length >= rowIdx){
      var colHeaderList = rowList[rowIdx].match(myRegExpTHead2);
      if(colHeaderList != null && colHeaderList.length >= colIdx){
        return getTagValue(colHeaderList[colIdx], 'th'); 
      }
    }
  }
  
  if(rowList != null && rowList.length >= rowIdx){
    var colList = rowList[rowIdx].match(myRegExp2);
    if(colList != null && colList.length >= colIdx){
      return getTagValue(colList[colIdx], 'td');
    }
  }
  return null;
}

function getTagValue(str, tagName){
  var startTagExp = new RegExp('<'+tagName+'.*?>','g');//'<'+tagName+'.*?>/g;
  var matchedTag = str.match(startTagExp);
  var result;
  
  if(matchedTag != null){
    result = str.replace(matchedTag[0],'');
  }
  return result.replace('\</'+tagName+'>','');
}

function deleteCommentTag(str){
  var tagExp = new RegExp('<!--.*?-->','g');
  var matchedTagList = str.match(tagExp);
  var result = str;
  
  for(var i=0; i<matchedTagList.length; i++){
    result = result.replace(matchedTagList[i],'');
  }
  return result;
}
function getTagValueList(str, tagName){
  var tagExp = new RegExp('<'+tagName+'.*?>'+'.*?'+'\</'+tagName+'>','g');//'<'+tagName+'.*?>/g;
  var tagExp2 = new RegExp('<'+tagName+'.*?>','g');
  var matchedTagList = str.match(tagExp);
  var result = [];
  var tagHead;
  
  for(var i=0; i<matchedTagList.length; i++){
     tagHead = matchedTagList[i].match(tagExp2);
     result.push(matchedTagList[i].replace(tagHead[0],'').replace('\</'+tagName+'>',''));
  }
  return result;
}
function getTableTagString(lines, classList){
  var resultList = [];  
  var myRegExp1 = "\<table.*>";
  var myRegExp2 = "\</table>";
  var myRegExp3 = "class=\".*\""
  var line1, line2;
  var j;
  var resultStr;
  
  for(var i=0; i<lines.length; i++){
    line1 = lines[i];
    if(line1.match(myRegExp1) != null){
      
      if(line1.match(myRegExp3) != null){
        var matchedStrList = line1.match(myRegExp3);
        classList.push(matchedStrList[0].replace('class=',''));
      }else{
        classList.push("");
      }
      
      resultStr = "";
      for(j=i+1; j<lines.length; j++){
        line2 = lines[j];
        if(line2.match(myRegExp2) == null){
          resultStr += line2.replace('\r\n','');
        }else{
          resultList.push(resultStr);
          resultStr = "";
          i = j;
          break;
        }
      }
    }
  }
  
  return resultList;
}

2
2
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
2
2

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?