More than 3 years have passed since last update.

SUUMOから情報を得る

Python3

Last updated at 2021-05-26 / Posted at 2021-05-26

SUUMOから情報を得る

メモ

pandas で DataFrame を作成する際に「8 columns passed, passed data had 9 columns」というエラーが出ることがある。
列名（項目）の数よりも 1 行あたりのデータ数が多い物件があり、列数の不整合が起こるのが原因。

トラブル

VS Code で Anaconda を使うと、numpy のインポートでエラーになる。

参考

CODE

# 必要なライブラリをインポート
import time
import re
import urllib.request, json
import urllib.parse

from bs4 import BeautifulSoup
from selenium import webdriver
import requests
from tqdm import tqdm

# pandas is imported defensively so that a broken installation is reported
# on the console instead of aborting the script at import time.
# Only ImportError is caught: a bare ``except`` would also swallow
# KeyboardInterrupt and SystemExit.
try:
    import pandas as pd
except ImportError:
    # NOTE: ``pd`` stays undefined in this case; any later use of it fails
    # with NameError.
    print("panda")

# 東京都の中古マンション一覧
def init(url, max_pages=4, csv_path='./bukken.csv'):
    """Scrape a SUUMO listing search and export the listings to a CSV file.

    The first result page is downloaded to read the column labels (the
    ``<dt>`` texts of the first listing card) and the number of result
    pages. Each selected page is then parsed and one row per listing is
    collected: the listing's link followed by the texts of its ``<dd>``
    elements.

    Args:
        url: Search-result URL of the first page. Further pages are
            addressed by appending ``&pn=<page number>`` to it.
        max_pages: Maximum number of result pages to read, counting the
            first one. The first page is always read.
        csv_path: Destination of the exported CSV file.

    Returns:
        None. The result is written to ``csv_path``; a failure while
        building or writing the table is printed, not raised.

    Raises:
        requests.RequestException: If a page cannot be downloaded or the
            server answers with an HTTP error status.
        AttributeError: If the page does not contain the expected listing
            markup (``div#js-bukkenList``, ``div.ui-media``, the title link).
    """
    first_page = _fetch_soup(url)

    # Column names: the <dt> labels of the first listing card, preceded by
    # a 'url' column that holds the link to each listing's detail page.
    summary = first_page.find("div", id='js-bukkenList')
    labels = summary.find("div", class_='ui-media').find_all('dt')
    cols = ['url'] + [label.text for label in labels]
    print(cols)
    print('--------------------------------------------')

    # Page 1 is the given URL; pages 2..N use the '&pn=' query parameter.
    last_page = min(_count_pages(first_page), max_pages)
    urls = [url]
    urls.extend(url + '&pn=' + str(page_no)
                for page_no in range(2, last_page + 1))

    bukkens = []
    for index, page_url in enumerate(tqdm(urls)):
        # The first page has already been downloaded above; reuse it
        # rather than requesting the same URL twice.
        soup = first_page if index == 0 else _fetch_soup(page_url)
        summary = soup.find("div", id='js-bukkenList')

        # Every 'property_unit-content' block is one listing.
        for item in summary.find_all("div", class_='property_unit-content'):
            title = item.find("h2", class_='property_unit-title')
            href = title.find("a").get('href')
            dd_texts = [dd.text for dd in item.find_all('dd')]
            bukkens.append(_build_row(href, dd_texts, len(cols)))

        # Short pause between pages.
        time.sleep(0.01)

    try:
        # Every row has exactly len(cols) entries, so the column count
        # always matches the header.
        df = pd.DataFrame(bukkens, columns=cols)
        df.to_csv(csv_path, encoding='utf_8_sig')
    except Exception as e:
        # Best effort: report the problem instead of aborting the script.
        print('---------------------')
        print(e)


def _fetch_soup(url, timeout=30):
    """Download ``url`` and return its content parsed as a BeautifulSoup tree."""
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    return BeautifulSoup(response.content, "html.parser")


def _count_pages(soup):
    """Return the number of result pages announced by the pagination bar."""
    pagination = soup.find("div", class_='pagination pagination_set-nav')
    if pagination is None:
        # No pagination bar: treat the search as a single page.
        return 1
    entries = pagination.find_all('li')
    if not entries:
        return 1
    # The last <li> is expected to hold the highest page number.
    return int(entries[-1].text)


def _build_row(href, dd_texts, width):
    """Build one table row of exactly ``width`` cells for a listing.

    The row is ``href`` followed by the listing's ``<dd>`` texts. When the
    number of ``<dd>`` elements differs from the number of data columns,
    the first ``<dd>`` is skipped (presumably an extra, unlabeled element;
    not verifiable from this file). Surplus values are dropped and missing
    ones are filled with ``None``.
    """
    n_fields = width - 1
    if len(dd_texts) != n_fields:
        dd_texts = dd_texts[1:]
    row = [href] + dd_texts[:n_fields]
    row.extend([None] * (width - len(row)))
    return row

if __name__ == "__main__":
    # Script entry point: choose one SUUMO search URL and scrape it.
    #
    # Disabled calls kept from the original script:
    #regix()
    #read()
    #
    # Alternative search URLs, kept for reference:
    #search_url = 'https://suumo.jp/jj/bukken/ichiran/JJ010FJ001/?ar=030&bs=021&ta=13&jspIdFlg=patternShikugun&sc=13105&kb=1&kt=9999999&tb=0&tt=9999999&hb=0&ht=9999999&ekTjCd=&ekTjNm=&tj=0&cnb=0&cn=9999999&srch_navi=1'
    #
    # Arakawa ward, new arrivals
    #search_url = 'https://suumo.jp/jj/bukken/ichiran/JJ012FC001/?ar=030&bs=021&cn=9999999&cnb=0&ekTjCd=&ekTjNm=&hb=0&ht=9999999&kb=1&kt=9999999&sc=13118&ta=13&tb=0&tj=0&tt=9999999&pc=30&po=1&pj=2'
    #
    #search_url = 'https://suumo.jp/jj/bukken/ichiran/JJ012FC001/?ar=030&bs=021&cn=9999999&cnb=0&hb=0&ht=9999999&kb=1&kt=9999999&pc=30&pj=2&po=1&sc=13118&ta=13&tb=0&tj=0&tt=9999999&nf=021001'
    #
    #search_url = 'https://suumo.jp/jj/bukken/ichiran/JJ012FC001/?ar=030&bs=030&et=9999999&jk=2&kb=1&kj=9&km=1&kr=A&kt=9999999&pc=30&sc=13118&sc=13117&scTemp=13118&scTemp=13117&ta=13&tb=0&tj=0&tt=9999999&nf=030001'

    # Active search: Arakawa ward, within 10 minutes, ownership (freehold).
    # The literal is split over several lines for readability only; the
    # pieces concatenate to the single URL string.
    search_url = (
        'https://suumo.jp/jj/bukken/ichiran/JJ012FC001/'
        '?ar=030&bs=021&cn=9999999&cnb=0&et=10&fw2=&hb=0&ht=9999999'
        '&kb=1&kr=A&kt=9999999&sc=13118&scTemp=13118&ta=13&tb=0&tj=0'
        '&tt=9999999&pc=30&po=5&pj=1'
    )

    init(search_url)

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up