LoginSignup
1
0

Pythonで頻出語句ランキング

Posted at

Pythonで頻出語句ランキング

ツイートを全取得する。

ツイートのアーカイブから取得

ツイートの内容のみ抽出

今回はテキトーにRTやメンションなどを除外する

tweets.jsにツイートの内容は全部入ってるので、そのファイル名をtweets.json（コードが読み込むファイル名）に変更

import json
import re
import datetime

# Inclusive year range of tweets to keep ("end": 3000 effectively means "no upper bound").
DATE_RANGE = {"start": 2017, "end": 3000}
# Tweets containing any of these substrings are meant to be skipped entirely.
NG_WORDS = ["YouTube", "youtube", "YOUTUBE", "https://t.co/"]


def remove_retweets(data):
    """Return only the entries whose tweet is not a retweet."""
    kept = []
    for entry in data:
        if entry["tweet"]["retweeted"]:
            continue
        kept.append(entry)
    return kept


# テキストの抽出関数
# Text-extraction helper
def extract_text(text: str) -> str:
    """Strip RT bodies, @mentions, URLs and emoji from a tweet's text.

    Args:
        text: Raw tweet text (``full_text`` field).

    Returns:
        The cleaned text; may be an empty string if everything was stripped.
    """
    # Drop everything from the first "RT" marker onward (quoted retweet body).
    rt_index = text.find("RT")
    if rt_index != -1:
        text = text[:rt_index]
    # Remove @mentions. BUG FIX: the original pattern required a trailing
    # whitespace character (`\s`), so a mention at the very end of the text
    # was never removed; `\s?` also matches the end-of-string case.
    text = re.sub(r"@[a-zA-Z0-9_]+\s?", "", text)
    # Remove URLs.
    text = re.sub(r"https?://[\w/:%#\$&\?\(\)~\.=\+\-]+", "", text)
    # Remove emoji (all code points outside the Basic Multilingual Plane).
    text = re.sub(r"[\U00010000-\U0010ffff]", "", text)
    return text


def remove_empty_lines(text):
    """Drop blank (whitespace-only) lines from *text* and rejoin with newlines."""
    kept_lines = [ln for ln in text.split("\n") if ln.strip()]
    return "\n".join(kept_lines)


# 特定のテキストが入ってるかを判定
# Membership check: does *text* include *target* as a substring?
def is_contain_text(text: str, target: str) -> bool:
    """Return True when *target* occurs anywhere inside *text*."""
    return text.find(target) != -1


# Read the tweet archive JSON.
with open("data/tweets.json") as f:
    data = json.load(f)

data = remove_retweets(data)
out_text: str = ""
count: int = 0
for d in data:
    tweet = d["tweet"]
    # `created_at` ends with the 4-digit year, e.g. "... 2020".
    tweet_year = int(tweet["created_at"][-4:])
    if tweet_year < DATE_RANGE["start"] or tweet_year > DATE_RANGE["end"]:
        continue
    tweet_text = extract_text(tweet["full_text"])
    if tweet_text == "":
        continue
    # BUG FIX: the original inner `for ng_word ... continue` only advanced
    # the inner loop, so NG-word filtering never skipped a tweet. Use a
    # single containment check and `continue` the OUTER loop instead.
    if any(is_contain_text(tweet_text, ng_word) for ng_word in NG_WORDS):
        continue
    print(tweet_text)
    print(tweet_year)
    count += 1
    out_text += tweet_text + "\n"

# Remove blank lines from the accumulated output.
out_text = remove_empty_lines(out_text)

# Write the cleaned corpus for the ranking step.
with open("data/out/tweets.txt", "w") as f:
    f.write(out_text)

print(count)

ランキング

import MeCab
from collections import Counter


def load_text_file(file_path):
    """Return the full contents of the UTF-8 text file at *file_path*."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents


def extract_words(text):
    """Collect every noun (part of speech 名詞) found in *text* via MeCab."""
    tagger = MeCab.Tagger()
    nouns = []
    node = tagger.parseToNode(text)
    while node is not None:
        part_of_speech = node.feature.split(",")[0]
        if part_of_speech == "名詞":
            nouns.append(node.surface)
        node = node.next
    return nouns


def create_word_ranking(words, top_n=100):
    """Return the *top_n* most frequent words as (word, count) pairs."""
    return Counter(words).most_common(top_n)


def main():
    """Load the cleaned tweet corpus, rank its nouns, and print the result."""
    corpus = load_text_file("data/out/tweets.txt")
    ranking = create_word_ranking(extract_words(corpus))

    print("名詞の頻出語句ランキング:")
    for rank, (word, count) in enumerate(ranking, start=1):
        print(f"{rank}. {word}: {count}")


if __name__ == "__main__":
    main()


1文字の語は、処理の段階で一旦除外したほうがいいかもね。

1
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1
0