pythonでseleniumを用いaタグを取得したい
Q&A
Closed
解決したいこと
pythonのseleniumを用いたwebスクレイピングについて
GUのStyleHintから、各コーディネートのURL、imgurl、投稿日時、お気に入り登録数と
コーディネートに使用されている商品のURL、imgurl、color、titleを取得し
json(現時点ではdict)で保存したいです。
書いたコード
main_table = {}
time.sleep(1)
driver.get(url_list) #StyleHintの各コーディネートURL(例.https://www.gu-global.com/jp/ja/stylingbook/stylehint/2910562)に遷移
main_table["main_url"] = url_list #main_tableにURLを保存
main_elem = driver.find_element(By.CSS_SELECTOR,".swiper-slide.h-100.w-100.swiper-slide-active")
main_img = main_elem.find_element(By.TAG_NAME, 'img')
main_imgurl = main_img.get_attribute('src')
main_likes = main_elem.find_element(By.CLASS_NAME, "like-counter__number-section").text
main_date = driver.find_element(By.CLASS_NAME,"font-sp-12.text-gray.font-weight-light.styling__style-info__date-time").text
main_table["main_imgurl"] = main_imgurl #main_tableにimageURLを保存
main_table["main_likes"] = main_likes #main_tableにlikes数を保存
main_table["main_date"] = main_date #main_tableに投稿日時を保存
component_elem_list =driver.find_elements(By.CSS_SELECTOR,".d-flex.product-cart.text-reset.text-decoration-none.sp-l3-product") #各商品の画像全て(色違いの画像や生地詳細の画像も全て)を取得したいので、各商品ページのURLを取得したい
components_list = []
components_urls_list = list()
for component_elem in component_elem_list:
print(driver.current_url)
print(component_elem)
component_table = {}
component_url = component_elem.find_element(By.TAG_NAME, 'a')
component_url = component_url.get_attribute('href') #ここで各アイテム画像のURL取得
component_title = component_elem.find_element(By.CSS_SELECTOR,".product-cart__cart-name.font-sp-11.text-reset.text-decoration-none.font-weight-600.font-weight-sp-400.d-block").text
component_color = component_elem.find_element(By.CSS_SELECTOR,".text-truncate").text
component_table["imgurl"]= list() #各アイテムのimageURLを格納するところ
component_table["title"]= component_title #商品名
component_table["color"]= component_color #色
component_table["url"]= component_url #商品ページURL
driver.get(component_url) #各アイテムのページに遷移
items_variation_elems = driver.find_elements(By.CSS_SELECTOR, ".sc-1dphr7g-0.jsAknO")
for items_variation_elem in items_variation_elems:
item_variation_img = items_variation_elem.find_element(By.TAG_NAME, 'img')
item_variation_img_url = item_variation_img.get_attribute('src')
item_variation_img_url_original_size = item_variation_img_url.split('.jpg')[0]+".jpg" #imageurlのサイズを大きくし、画質をよくする為の処理
if item_variation_img_url_original_size.startswith("https://image"): #https://imageで始まる画像のみ欲しい
if item_variation_img_url_original_size not in component_table["imgurl"]: #取得するimgURLが被らないように
component_table["imgurl"].append(item_variation_img_url_original_size)
components_list.append(component_table) #component_tabelをcomponent_listに追加する
print(main_table)
driver.get(url_list) #コーデページに遷移
main_table["components"] = components_list
実際にこのコードを実行すると、こうなります。
main_tableが出力されています。各コーディネートのURL、imgurl、投稿日時、お気に入り登録数と
コーディネートに使用されている商品のうち、一つ目の商品のみURL、imgurl、color、titleを取得できています。
二つ目以降の商品のaタグを取得する際に、エラーが出ています。
(エラーコードの部分に関係ないコメントアウトがありますが、見逃してください)
どうすれば二つ目以降の商品の情報も取得できるでしょうか?
{
"main_url": "https://www.gu-global.com/jp/ja/stylingbook/stylehint/2910562",
"main_imgurl": "https://api.fastretailing.com/ugc/v1/gu/jp/SR_IMAGES/ugc_stylehint_gu_jp_photo_220927_956227",
"main_likes": "0",
"main_date": "2022/09/27 更新",
"components": [
{
"imgurl": [
"https://image.uniqlo.com/GU/ST3/jp/imagesgoods/342532/item/jpgoods_09_342532.jpg",
"https://image.uniqlo.com/GU/ST3/jp/imagesgoods/342532/sub/jpgoods_342532_sub3.jpg",
"https://image.uniqlo.com/GU/ST3/jp/imagesgoods/342532/sub/jpgoods_342532_sub5.jpg",
"https://image.uniqlo.com/GU/ST3/jp/imagesgoods/342532/sub/jpgoods_342532_sub10.jpg",
"https://image.uniqlo.com/GU/ST3/jp/imagesgoods/342532/sub/jpgoods_342532_sub18.jpg",
"https://image.uniqlo.com/GU/ST3/jp/imagesgoods/342532/sub/jpgoods_342532_sub19.jpg",
"https://image.uniqlo.com/GU/ST3/jp/imagesgoods/342532/sub/jpgoods_342532_sub20.jpg",
"https://image.uniqlo.com/GU/ST3/jp/imagesgoods/342532/sub/jpgoods_342532_sub21.jpg",
"https://image.uniqlo.com/GU/ST3/jp/imagesgoods/342532/sub/jpgoods_342532_sub22.jpg",
"https://image.uniqlo.com/GU/ST3/jp/imagesgoods/342532/sub/jpgoods_342532_sub51.jpg",
"https://image.uniqlo.com/GU/ST3/jp/imagesgoods/342532/sub/jpgoods_342532_sub52.jpg",
"https://image.uniqlo.com/GU/ST3/jp/imagesgoods/342532/sub/jpgoods_342532_sub53.jpg",
"https://image.uniqlo.com/GU/ST3/jp/imagesgoods/342532/sub/jpgoods_342532_sub81.jpg",
"https://image.uniqlo.com/GU/ST3/jp/imagesgoods/342532/sub/jpgoods_342532_sub82.jpg",
"https://image.uniqlo.com/GU/ST3/jp/imagesgoods/342532/sub/jpgoods_342532_sub83.jpg"
],
"title": "ウォッシャブルリラックスフィットニットベスト",
"color": "09 BLACK",
"url": "https://www.gu-global.com/jp/ja/products/E342532-000/00?colorDisplayCode=09"
}
]
}
--------------------------------------------------------------------------
StaleElementReferenceException Traceback (most recent call last)
<ipython-input-11-abe754cb3c10> in <module>
110 # component_img = component_elem.find_element(By.TAG_NAME, 'img')
111 # component_imgurl = component_img.get_attribute('src')
--> 112 component_url = component_elem.find_element(By.TAG_NAME, 'a')
113 component_url = component_url.get_attribute('href')
114 #各アイテム画像のURL取得
3 frames
/usr/local/lib/python3.7/dist-packages/selenium/webdriver/remote/errorhandler.py in check_response(self, response)
241 alert_text = value['alert'].get('text')
242 raise exception_class(message, screen, stacktrace, alert_text) # type: ignore[call-arg] # mypy is not smart enough here
--> 243 raise exception_class(message, screen, stacktrace)
StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
一つのコーディネートに対しこのようなmain_tableを取得したい
例:https://www.gu-global.com/jp/ja/stylingbook/stylehint/2910562
[
{
"main_url": "https://www.gu-global.com/jp/ja/stylingbook/stylehint/2910562",
"main_imgurl": "https://api.fastretailing.com/ugc/v1/gu/jp/SR_IMAGES/ugc_stylehint_gu_jp_photo_220927_956227",
"main_likes": "0",
"main_date": "2022/09/27 更新",
"components": [
{
"imgurl": [
"https://image.uniqlo.com/GU/ST3/jp/imagesgoods/342532/item/jpgoods_09_342532.jpg",
"https://image.uniqlo.com/GU/ST3/jp/imagesgoods/342532/sub/jpgoods_342532_sub3.jpg",
"https://image.uniqlo.com/GU/ST3/jp/imagesgoods/342532/sub/jpgoods_342532_sub5.jpg",
"https://image.uniqlo.com/GU/ST3/jp/imagesgoods/342532/sub/jpgoods_342532_sub10.jpg",
"https://image.uniqlo.com/GU/ST3/jp/imagesgoods/342532/sub/jpgoods_342532_sub18.jpg",
"https://image.uniqlo.com/GU/ST3/jp/imagesgoods/342532/sub/jpgoods_342532_sub19.jpg",
"https://image.uniqlo.com/GU/ST3/jp/imagesgoods/342532/sub/jpgoods_342532_sub20.jpg",
"https://image.uniqlo.com/GU/ST3/jp/imagesgoods/342532/sub/jpgoods_342532_sub21.jpg",
"https://image.uniqlo.com/GU/ST3/jp/imagesgoods/342532/sub/jpgoods_342532_sub22.jpg",
"https://image.uniqlo.com/GU/ST3/jp/imagesgoods/342532/sub/jpgoods_342532_sub51.jpg",
"https://image.uniqlo.com/GU/ST3/jp/imagesgoods/342532/sub/jpgoods_342532_sub52.jpg",
"https://image.uniqlo.com/GU/ST3/jp/imagesgoods/342532/sub/jpgoods_342532_sub53.jpg",
"https://image.uniqlo.com/GU/ST3/jp/imagesgoods/342532/sub/jpgoods_342532_sub81.jpg",
"https://image.uniqlo.com/GU/ST3/jp/imagesgoods/342532/sub/jpgoods_342532_sub82.jpg",
"https://image.uniqlo.com/GU/ST3/jp/imagesgoods/342532/sub/jpgoods_342532_sub83.jpg"
],
"title": "ウォッシャブルリラックスフィットニットベスト",
"color": "09 BLACK",
"url": "https://www.gu-global.com/jp/ja/products/E342532-000/00?colorDisplayCode=09"
},
{
"imgurl": [記載を省略させていただきます],
"title": "ドライラウンドヘムビッグT(長袖)",
"color": "00 WHITE",
"url": "https://www.gu-global.com/jp/ja/products/E344297-000/00?colorDisplayCode=00"
},
{
"imgurl": [記載を省略させていただきます],
"title": "カットソーストレートトラックパンツ",
"color": "09 BLACK",
"url": "https://www.gu-global.com/jp/ja/products/E343738-000/00?colorDisplayCode=09"
},
{
"imgurl": [記載を省略させていただきます],
"title": "ビーニー",
"color": "09 BLACK",
"url": "https://www.gu-global.com/jp/ja/products/E341968-000/00?colorDisplayCode=09"
}
]
}
]