JavaScript
GoogleAppsScript
gas

GASでスクレイピング:SUUMOから賃貸物件情報を取得

GASでSUUMOの賃貸物件情報をスクレイピングして取得する

GAS実装

スクレイピング手法

まずはSUUMOのHTMLを読み込んで,スクレイピングのKeyとなるHTML文をピックアップした.
その結果,3つほど見つかった.
それらを var tag = "..." で指定し,UrlFetchAppで取得したHTMLを split("\n") で行ごとに分割して,該当行を検索する.
そこから「該当行のn行後(またはn行前)」といった形で相対的に行を指定して,情報を取得する.

ソースコード

/**
 * Manual entry point: runs the scraper against one fixed SUUMO listing page.
 * The result returned by main() is not used here.
 */
function doCurl() {
  // Sample SUUMO rental listing to scrape
  var listingUrl = "https://suumo.jp/chintai/bc_100106732165/?suit=STfr20160902000";
  main(listingUrl);
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
//  main: scrape one SUUMO rental listing page
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/**
 * Fetches the page at `url`, splits it into lines, finds three marker lines
 * and reads the property details from lines at fixed offsets relative to
 * each marker. Every extracted value is also written to the Apps Script log.
 *
 * If a marker occurs more than once, the values from the last occurrence win.
 * A field whose marker is not found, or whose target line lies outside the
 * page, is left `undefined` instead of raising an error.
 *
 * NOTE(review): the line offsets mirror the page layout the script was
 * written against; they have to be re-checked whenever the markup changes.
 *
 * @param {string} url URL of the listing page to fetch.
 * @return {Object} Object with the keys yachin, kanrihi, shikikin, reikin,
 *     madori, hirosa, muki, chikunen, madori_detail, kaisu, kenchikubi,
 *     moyori, moyori_toho, kozo and url.
 */
function main(url){
  var html = UrlFetchApp.fetch(url).getContentText();
  var lines = html.split("\n");
  var i;

  // Declared locally so they neither leak as globals nor raise a
  // ReferenceError below when a marker is missing from the page.
  var yachin, kanrihi, shikikin, reikin, madori, hirosa, muki, chikunen;
  var moyori, moyori_toho;
  var madori_detail, kozo, kaisu, kenchikubi;

  // Marker 1: the "add to favourites" link that precedes the summary table.
  var summaryTag = '<span class="jj-fr_detail-icon jj-fr_detail-icon--note"></span><span>お気に入りに登録する</span></a>';
  for (i = 0; i < lines.length; i++) {
    if (lines[i].indexOf(summaryTag) === -1) {
      continue;
    }
    Logger.log("================================");

    // Rent
    yachin = suumoToYen_(suumoFieldAt_(lines, i + 17, "<span>", "</span>"), "万円", 10000);
    Logger.log("家賃: " + yachin);

    // Management fee
    kanrihi = suumoToYen_(suumoFieldAt_(lines, i + 21, "<span>", "</span>"), "円", 1);
    Logger.log("管理費: " + kanrihi);

    // Deposit
    shikikin = suumoToYen_(suumoFieldAt_(lines, i + 29, "<span>", "</span>"), "万円", 10000);
    Logger.log("敷金: " + shikikin);

    // Key money
    reikin = suumoToYen_(suumoFieldAt_(lines, i + 33, "<span>", "</span>"), "万円", 10000);
    Logger.log("礼金: " + reikin);

    // Floor plan
    madori = suumoFieldAt_(lines, i + 44, "<div>", "</div>");
    Logger.log("間取り: " + madori);

    // Floor area, rounded to a whole number
    hirosa = suumoFieldAt_(lines, i + 46, "", "m<sup>");
    if (hirosa !== undefined) {
      hirosa = Math.round(hirosa);
    }
    Logger.log("広さ: " + hirosa);

    // Facing direction
    muki = suumoFieldAt_(lines, i + 48, "<div>", "</div>");
    Logger.log("向き: " + muki);

    // Building age: a newly built property ("新築") is recorded as 1;
    // otherwise the text between "築" and "年" is kept as returned by
    // cut_string (not converted to a number).
    chikunen = suumoFieldAt_(lines, i + 56, "<div>", "</div>");
    if (chikunen === "新築") {
      chikunen = 1;
    } else if (chikunen !== undefined) {
      chikunen = cut_string(chikunen, "築", "年");
    }
    Logger.log("築年数: " + chikunen);
  }

  // Marker 2: the route-search popup link; the station text is on the line
  // immediately before it.
  var stationTag = '[<a href="javascript:norikaePop(';
  for (i = 0; i < lines.length; i++) {
    if (lines[i].indexOf(stationTag) === -1) {
      continue;
    }
    // Nearest station
    moyori = suumoFieldAt_(lines, i - 1, "/", "駅");
    Logger.log("最寄り駅: " + moyori);

    // Walking minutes to that station
    moyori_toho = suumoFieldAt_(lines, i - 1, "歩", "分");
    Logger.log("駅徒歩: " + moyori_toho);
  }

  // Marker 3: heading of the property overview table.
  var overviewTag = '<h2><span>物件概要</span></h2>';
  for (i = 0; i < lines.length; i++) {
    if (lines[i].indexOf(overviewTag) === -1) {
      continue;
    }
    // Detailed floor plan
    madori_detail = suumoFieldAt_(lines, i + 11, "<td>", "</td>");
    Logger.log("間取り詳細: " + madori_detail);

    // Structure
    kozo = suumoFieldAt_(lines, i + 14, "<td>", "</td>");
    Logger.log("構造: " + kozo);

    // Floor
    kaisu = suumoFieldAt_(lines, i + 18, "<td>", "</td>");
    Logger.log("階: " + kaisu);

    // Construction date
    kenchikubi = suumoFieldAt_(lines, i + 20, "<td>", "</td>");
    Logger.log("建築日: " + kenchikubi);
  }

  var res = {
    "yachin": yachin,
    "kanrihi": kanrihi,
    "shikikin": shikikin,
    "reikin": reikin,
    "madori": madori,
    "hirosa": hirosa,
    "muki": muki,
    "chikunen": chikunen,
    "madori_detail": madori_detail,
    "kaisu": kaisu,
    "kenchikubi": kenchikubi,
    "moyori": moyori,
    "moyori_toho": moyori_toho,
    "kozo": kozo,
    "url": url
  };
  Logger.log("================================");
  Logger.log(res);
  return res;
}

/**
 * Returns the text between two markers on line `index`, or `undefined` when
 * that line does not exist (index before the first or after the last line).
 *
 * @param {string[]} lines Page content split into lines.
 * @param {number} index Index of the line to read.
 * @param {string} startMark Start marker passed to cut_string.
 * @param {string} endMark End marker passed to cut_string.
 * @return {*} Whatever cut_string returns for the trimmed line, or undefined.
 */
function suumoFieldAt_(lines, index, startMark, endMark) {
  var line = lines[index];
  if (typeof line !== "string") {
    return undefined;
  }
  return cut_string(line.trim(), startMark, endMark);
}

/**
 * Converts an amount such as "7.5万円" or "5000円" to a number of yen.
 *
 * @param {string|undefined} text Extracted amount text; undefined passes through.
 * @param {string} unit Unit suffix that ends the numeric part ("万円" or "円").
 * @param {number} multiplier Yen per unit (10000 for "万円", 1 for "円").
 * @return {number|undefined} Amount in yen (NaN if the text is not numeric).
 */
function suumoToYen_(text, unit, multiplier) {
  if (text === undefined) {
    return undefined;
  }
  // Rounded because multiplying a decimal such as 7.5 by 10000 in floating
  // point is not guaranteed to give an exact integer.
  return Math.round(Number(cut_string(text, "", unit)) * multiplier);
}

実行結果

ログ出力はこんな感じです

スクリーンショット 2018-01-09 21.12.00.png