@tanakamaru2022 (丸田中)posted at 2022-08-20

Python csv出力が上手くいきません。

Q&A

概要

python初心者です。
下記のコードをサイトを見ながら見よう見まねでトライしたのですが、
添付している画像のようにcsvに上手く出力されません。

エラーは出ておりませんが、どこがおかしいのかをご教示いただけますと幸いです。

該当するソースコード

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from urllib import request
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import datetime
import time
import requests
import csv
import pandas as pd

# Oldest publication date to collect (YYYY-MM-DD); parsed into start_dt below.
START_DT_STR = '2021-12-01'

# Keyword typed into the site's search input.
SEARCH_WORD = 'python'

# NOTE(review): defined but not used by this script; target_url below holds
# the same URL and is the name the rest of the code refers to.
PRTIMES_URL = 'https://prtimes.jp/'

# Naive datetime (midnight of START_DT_STR) used as the lower bound for articles.
start_dt = datetime.datetime.strptime(START_DT_STR, '%Y-%m-%d')

# Start Chrome in headless mode.
options=Options()
options.add_argument("--headless")
# NOTE(review): this first driver opens google.com and is then replaced by the
# second webdriver.Chrome(...) assignment below without quit() being called on it.
driver=webdriver.Chrome("/Users/seiya.shibata/Desktop/Python/chromedriver",options=options)
driver.get("https://www.google.com/")

driver = webdriver.Chrome('chromedriver',options=options)

# Open the PR TIMES top page
target_url = 'https://prtimes.jp/'   
driver.get(target_url)

# Click the header input (presumably the search box), look it up again,
# type the keyword and press Enter to submit the search.
driver.find_element("xpath",'/html/body/header/div/div[2]/div/input').click()

kensaku = driver.find_element("xpath",'/html/body/header/div/div[2]/div/input')
kensaku.send_keys(SEARCH_WORD)
kensaku.send_keys(Keys.ENTER)





# Main loop: click a link on the result page, re-parse the page, visit every
# listed article and write its fields to a CSV file. Stops when the click
# fails or when an article older than start_dt is found.
cnt = 0
while True:
    # NOTE(review): the setup code uses driver.find_element("xpath", ...), but
    # this line uses find_element_by_xpath, which was removed in Selenium 4.3.
    # The bare except turns *any* error here (including that AttributeError)
    # into a silent exit from the loop.
    # NOTE(review): the XPath presumably targets the "show more" link of the
    # result list -- confirm against the page.
    try:
        driver.find_element_by_xpath("/html/body/main/section/section/div/div/a").click()
    except: 
        break
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    # Collect the article links, skipping the first cnt*40 (handled 40 at a time)
    articles = soup.find_all(class_='list-article__link')[cnt*40:]
    
    # List that holds the article information
    
    
    
    # Fetch the information of each article
    for article in articles:
        article_time = article.find(class_='list-article__time')
        
        
        # CSV handling
        # NOTE(review): everything from here to the header row runs once per
        # article. Opening the file with mode 'w' truncates it each time, so
        # rows written for earlier articles are lost and the header is
        # rewritten; the previously opened file objects are never closed.
        # eof_flag is also reset to False for every article.
        eof_flag = False
        csv_date=datetime.datetime.today().strftime("%Y%m%d")
        csv_file_name = 'prtimes_' + csv_date + '.csv'
        f = open(csv_file_name, 'w', encoding='cp932',errors="ignore")
        writer=csv.writer(f, lineterminator='\n')
        # NOTE(review): the header says "pubulished" while the record key
        # used further down is 'published'.
        csv_header=["title","sub_title","company","pubulished","category1"]
        writer.writerow(csv_header)

        # Parse the publication time: first the datetime attribute as-is, then
        # with "+09:00" rewritten to "+0900", and finally the element's text in
        # the Japanese date format. Bare excepts mean any failure falls through.
        try:
            str_to_dt = datetime.datetime.strptime(article_time.get('datetime'), '%Y-%m-%dT%H:%M:%S%z')
        except:
            try:
                article_time_cvt = article_time.get('datetime').replace('+09:00', '+0900')
                str_to_dt = datetime.datetime.strptime(article_time_cvt, '%Y-%m-%dT%H:%M:%S%z')
            except:
                str_to_dt = datetime.datetime.strptime(article_time.text, '%Y年%m月%d日 %H時%M分')

        # Rebuild as a naive datetime truncated to the minute so that it can be
        # compared with the naive start_dt.
        article_time_dt = datetime.datetime(str_to_dt.year, str_to_dt.month, str_to_dt.day, str_to_dt.hour, str_to_dt.minute)
        
        # An article older than start_dt ends the whole scrape.
        if article_time_dt < start_dt:
            eof_flag = True 
            break

        # Turn the link's href into an absolute URL and download the article page.
        relative_href = article["href"]
        url = urljoin(target_url, relative_href)

        r = requests.get(url)
        html = r.text
        # NOTE(review): rebinds soup, which until now held the result-list page.
        soup = BeautifulSoup(html, "html.parser")
        
        # NOTE(review): records is re-created for every article, so it never
        # holds more than one entry.
        records = []

        # Article title
        title = soup.select_one("#main > div.content > article > div > header > h1").text
        


        sub_title_elem = soup.select_one("#main > div.content > article > div > header > h2")

        
        # Subtitle (empty string when the page has no h2)

        if sub_title_elem:
            sub_title = sub_title_elem.text
        else:
            sub_title = ""
            

        company = soup.select_one('#main > div.content > article > div > header > div.release--info_wrapper > div.information-release > div').text
        

        published = soup.select_one('#main > div.content > article > div > header > div.release--info_wrapper > div.information-release > time').text
        

        category1= soup.select_one('#main > div.content > article > dl > dd:nth-child(4) > a:nth-child(1)').text
        
        
        records.append({'title':title,'sub_title':sub_title,'company':company,'published':published,'category1':category1})
        
        # NOTE(review): records is a list containing a single dict, so
        # csv.writer emits one row with one cell holding the dict's string
        # form rather than five columns. This is the malformed output the
        # question is about.
        writer.writerow(records)


    # No-op check; nothing is done with records here.
    # NOTE(review): records and eof_flag are only bound inside the for loop,
    # so both checks raise NameError if the first page yields no articles.
    if records:
        pass

    if eof_flag:
        break

    time.sleep(2)  
    cnt += 1

    # NOTE(review): missing parentheses -- this only references the method and
    # does not close the file. It is also skipped whenever the loop breaks.
    f.close

2Answer

@Rasch posted at 2022-08-20

どこがおかしいかと聞かれると、

writer=csv.writer(f, lineterminator='\n')
records.append({'title':title,'sub_title':sub_title,'company':company,'published':published,'category1':category1})
writer.writerow(records)

がおかしいと思います。

辞書を与えて出力するのであれば、writerではなく、DictWriterになると思います。

他にもおかしいところがあるので、勉強が目的ではなく、CSV出力ができればよいのであれば、writerとDictWriterの違いを確認するよりも、import csvをやめて、,区切りの文字列にしてそのままファイルに出力したほうが簡単だと思います。あと、strip()をしたほうが良いと思います。

期待している出力イメージがあると、回答者が増える気がします。

1Like

@kawagoe6884 posted at 2022-08-26

pandas の DataFrame を使って書き出すのが合っているかと
質問のコードから変更箇所がいくつかありますが、
必要ならばファイルの比較をして変更箇所を確認してください

━━━━━━━━━━━━━━━━━
ソースコードを表示（折りたたみ）
━━━━━━━━━━━━━━━━━━

refactoring.py

# coding:utf-8
import datetime
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys

# --- Settings ---------------------------------------------------------------
START_DT_STR = "2021-12-01"  # oldest publication date to collect (YYYY-MM-DD)
SEARCH_WORD = "python"  # keyword typed into the search input
PRTIMES_URL = "https://prtimes.jp/"

# XPath of the header input element the keyword is typed into.
SEARCH_INPUT_XPATH = "/html/body/header/div/div[2]/div/input"

# Lower bound for article dates, as a naive datetime at midnight.
start_dt = datetime.datetime.strptime(START_DT_STR, "%Y-%m-%d")

# Rows of the CSV; one dict per article is appended further down the script.
records = []

# --- Browser ----------------------------------------------------------------
# Launch headless Chrome and load the PR TIMES top page.
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)
driver.get(PRTIMES_URL)

# --- Search -----------------------------------------------------------------
# Click the input first, then look it up again before typing the keyword and
# submitting with Enter.
driver.find_element("xpath", SEARCH_INPUT_XPATH).click()
search_box = driver.find_element("xpath", SEARCH_INPUT_XPATH)
search_box.send_keys(SEARCH_WORD)
search_box.send_keys(Keys.ENTER)

# Scrape the search results batch by batch until one of the following happens:
#   * the "more results" link can no longer be clicked, or
#   * an article published before start_dt is reached (eof_flag is set).
#
# Changes compared with the first version of this loop:
#   * driver.find_element("xpath", ...) replaces find_element_by_xpath, which
#     was removed in Selenium 4.3; combined with a bare ``except`` the old call
#     ended the loop immediately and produced an empty CSV.
#   * The visible results are processed *before* asking for more, so searches
#     without a "more results" link are no longer skipped, and the number of
#     links already handled is tracked explicitly instead of assuming that
#     exactly 40 new links appear per click.
#   * Only the expected exception types are caught.
#   * Text is stripped and a missing element yields "" instead of a crash.
#   * The browser is always shut down, even when scraping fails.
from selenium.common.exceptions import WebDriverException

# NOTE(review): assumed to be the "more results" link of the search page --
# confirm against the live markup.
MORE_LINK_XPATH = "/html/body/main/section/section/div/div/a"

# CSS selectors on an article page.
_HEADER = "#main > div.content > article > div > header"
_RELEASE_INFO = _HEADER + " > div.release--info_wrapper > div.information-release"
FIELD_SELECTORS = {
    "title": _HEADER + " > h1",
    "sub_title": _HEADER + " > h2",
    "company": _RELEASE_INFO + " > div",
    "published": _RELEASE_INFO + " > time",
    "category1": "#main > div.content > article > dl > dd:nth-child(4) > a:nth-child(1)",
}


def _parse_article_time(time_elem):
    """Return the publication time of a result-list entry.

    The ``datetime`` attribute is tried first, as-is and then with the
    ``+09:00`` offset rewritten to ``+0900`` (``%z`` only accepts the colon
    from Python 3.7 on). The element's visible text is the last resort.

    The result is naive and truncated to the minute so that it can be
    compared with ``start_dt``.

    Raises:
        ValueError: if none of the formats match.
    """
    raw = time_elem.get("datetime") or ""
    for candidate in (raw, raw.replace("+09:00", "+0900")):
        try:
            parsed = datetime.datetime.strptime(candidate, "%Y-%m-%dT%H:%M:%S%z")
            break
        except ValueError:
            continue
    else:
        parsed = datetime.datetime.strptime(
            time_elem.text.strip(), "%Y年%m月%d日 %H時%M分"
        )
    return parsed.replace(second=0, microsecond=0, tzinfo=None)


def _select_text(page, selector):
    """Return the stripped text of the first match of *selector*, or "" if none."""
    elem = page.select_one(selector)
    return elem.get_text(strip=True) if elem is not None else ""


processed = 0  # number of result links that have already been handled
eof_flag = False
try:
    while True:
        # Give the result list (initial search or the batch just requested)
        # time to render before reading the page source.
        time.sleep(2)

        list_page = BeautifulSoup(driver.page_source, "html.parser")
        links = list_page.find_all(class_="list-article__link")
        new_links = links[processed:]
        processed = len(links)

        for article in new_links:
            time_elem = article.find(class_="list-article__time")
            if time_elem is None:
                # Without a timestamp the entry cannot be checked against start_dt.
                continue

            # Stop at the first article published before start_dt.
            if _parse_article_time(time_elem) < start_dt:
                eof_flag = True
                break

            # The href is relative; resolve it against the site root.
            url = urljoin(PRTIMES_URL, article["href"])
            try:
                response = requests.get(url, timeout=30)
                response.raise_for_status()
            except requests.RequestException as exc:
                # One unreachable article should not discard everything
                # collected so far.
                print(f"skipped {url}: {exc}")
                continue

            # A separate name keeps the result-list page intact.
            article_page = BeautifulSoup(response.text, "html.parser")
            records.append(
                {
                    field: _select_text(article_page, selector)
                    for field, selector in FIELD_SELECTORS.items()
                }
            )

        if eof_flag:
            break

        # Ask for the next batch; stop when the link is missing or cannot be clicked.
        try:
            driver.find_element("xpath", MORE_LINK_XPATH).click()
        except WebDriverException:
            break
finally:
    # Always shut the browser down so no headless Chrome process is left behind.
    driver.quit()


# Output file name: prtimes_<YYYYMMDD>.csv, dated with the day the script runs.
today = datetime.datetime.today().strftime("%Y%m%d")
FILE_NAME = f"prtimes_{today}.csv"

# Naming the columns explicitly fixes their order and still writes the header
# row when no article was collected (an empty DataFrame has no columns otherwise).
CSV_COLUMNS = ["title", "sub_title", "company", "published", "category1"]
df = pd.DataFrame(records, columns=CSV_COLUMNS)

# cp932 cannot encode every Unicode character. With the default
# errors="strict" a single such character raises UnicodeEncodeError and the
# whole scrape is lost, so unencodable characters are dropped instead -- the
# same policy as the errors="ignore" used by the question's open() call.
df.to_csv(FILE_NAME, index=False, encoding="cp932", errors="ignore")

上の回答者様が言っている通り、stripした方がいいと思うので
#記事タイトルから下にある .text を
.get_text(strip=True) にすればstripされます。

0Like

Are you sure you want to delete the question?

Python csv出力が上手くいきません。

概要

該当するソースコード

2Answer

Your answer might help someone💌