
Fetching real estate information

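This script scrapes SUUMO's list of used condominiums in Tokyo with BeautifulSoup, writes the results to bukken.csv, and then parses the building name and price back out of the CSV.
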
# Import the required libraries
from bs4 import BeautifulSoup
import requests
import time
import datetime
import numpy as np
import scipy.stats
from sklearn import preprocessing
from tqdm import tqdm
import re
import pandas as pd
from pandas import Series, DataFrame

def init():
    # Listing of used condominiums in Tokyo
    url = 'https://suumo.jp/jj/bukken/ichiran/JJ012FC001/?ar=030&bs=011&cn=9999999&cnb=0&ekTjCd=&ekTjNm=&et=9999999&fw2=&kb=1&kt=2000&mb=0&mt=9999999&sc=13102&scTemp=13102&ta=13&tj=0&pc=30&po=5&pj=1'
    # Fetch the page
    result = requests.get(url)
    c = result.content

    # Build a BeautifulSoup object from the HTML
    soup = BeautifulSoup(c, 'html.parser')

    # Cut out the property-list section
    summary = soup.find("div", id='js-bukkenList')  # one page of listings
    dt = summary.find("div", class_='ui-media').find_all('dt')  # all dt elements (the column labels)
    cols = [t.text for t in dt]
    cols.insert(0, 'url')  # add a url column so each listing's detail page can be checked later

    # Get the number of pages
    body = soup.find("body")
    page = body.find("div",class_='pagination pagination_set-nav')
    li = page.find_all('li')
    pg_length = int(li[-1].text)

    # List to hold the page URLs
    urls = []
    # Store page 1
    urls.append(url)

    # Store pages 2 through the last page
    for pg in range(2, pg_length + 1):
        url_page = url + '&pn=' + str(pg)  # URL for this page number
        urls.append(url_page)


    # Loop over every page
    bukkens = []
    for url in tqdm(urls):
        # Cut out the property list
        result = requests.get(url)
        c = result.content
        soup = BeautifulSoup(c, 'html.parser')
        summary = soup.find("div",id='js-bukkenList')

        # Extract every property_unit block, which holds the building name, address, location (nearest station / walking minutes), age, and building height
        property_units = summary.find_all("div",class_='property_unit-content')

        # Loop over each property unit
        for item in property_units:
            l = []
            # Get the link to the listing
            h2 = item.find("h2",class_='property_unit-title')
            href = h2.find("a").get('href')
            l.append(href)
            for youso in item.find_all('dd'):
                l.append(youso.text)
            bukkens.append(l)
        time.sleep(0.5)

    # Set the column names and export to CSV for now
    df = pd.DataFrame(bukkens,columns=cols)

    df.to_csv('bukken.csv', encoding='cp932')   # cp932 on Windows
    #df.to_csv('bukken.csv', encoding = "utf-8")

    # Re-import the CSV and check its contents
    #df = pd.read_csv('bukken.csv', encoding="cp932")

    print(df[0:5])  # quick look at the first rows


def getPrice(content):
    # Return the first run of digits that follows a literal "\n" in the
    # DataFrame's string form; 0 if nothing matches.
    ret = re.findall(r'\\n(\d+)', content)
    if len(ret) > 0:
        return int(ret[0])
    else:
        return 0

def getBukkenName(content):
    # Return the last non-whitespace token at the end of the string,
    # i.e. the cell value in the DataFrame's string form; "NONE" if nothing matches.
    ret = re.findall(r'(\S+)\Z', content)
    if len(ret) > 0:
        return ret[0]
    else:
        return "NONE"

def conv_encode():
    df = pd.read_csv('bukken.csv',encoding = "cp932")  # DataFrame
    #print( df.iloc[0:0])
    print("--------------conv_encode-------------------------------------")
    val = 0
    for i in range(1, 9):  # look at the first few rows only
        obj = df.iloc[i:i+1, 2:3]  # iloc indexes by position; column 2 = building name
        #print(str(obj))
        name = getBukkenName(str(obj))

        obj = df.iloc[i:i+1, 3:4]  # column 3 = price text
        price = getPrice(str(obj))
        print("{},{}".format(name,price))


        
    return val
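
A vectorized variant is sketched below: instead of converting each iloc slice to a string, it works on whole columns with pandas string methods. It is only a sketch, and it assumes the same layout as conv_encode above (column 2 holds the building name, column 3 the price text); conv_encode_vectorized is a hypothetical helper, not part of the original script.

def conv_encode_vectorized(path='bukken.csv'):
    # Sketch only: assumes column 2 = building name, column 3 = price text.
    df = pd.read_csv(path, encoding="cp932")
    names = df.iloc[:, 2].astype(str).str.strip()
    # First run of digits in the price text; rows without a number fall back
    # to 0, mirroring getPrice's behaviour.
    prices = (df.iloc[:, 3].astype(str)
              .str.extract(r'(\d+)', expand=False)
              .fillna(0).astype(int))
    for name, price in zip(names, prices):
        print("{},{}".format(name, price))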

if __name__ == "__main__":
    content = r'1  \n123万円\n'

    # r makes this a raw string -> https://docs.python.org/ja/3/library/re.html
    # \n would otherwise be an escape sequence, so the raw string keeps it as literal text
    print(re.findall(r'\\n(\d+)', content))

    #init()

    conv_encode()
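
To run the pipeline end to end, uncomment init() first so the SUUMO pages are scraped and bukken.csv is written; conv_encode() then reads that CSV back and prints a building-name/price pair per row.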

