LoginSignup
1
0

Pythonで頻出語句ランキング

Posted at

Pythonで頻出語句ランキング

ツイートを全取得する。

ツイートのアーカイブから取得

ツイートの内容のみ抽出

今回はテキトーにRTやメンションなどを除外する

tweets.jsにツイートの内容は全部入ってるので、そのファイル名をtweets.json（コードが読み込むファイル名）に変更

import json
import re
import datetime

# Inclusive year range of tweets to keep ("end": 3000 effectively means "no upper bound").
DATE_RANGE = {"start": 2017, "end": 3000}
# Tweets containing any of these substrings are meant to be skipped entirely.
NG_WORDS = ["YouTube", "youtube", "YOUTUBE", "https://t.co/"]


def remove_retweets(data):
    """Return only the entries whose tweet is not a retweet."""
    kept = []
    for entry in data:
        if entry["tweet"]["retweeted"]:
            continue
        kept.append(entry)
    return kept


# テキストの抽出関数
# Text-extraction helper
def extract_text(text: str) -> str:
    """Strip RT bodies, @mentions, URLs and emoji from a tweet's text.

    Args:
        text: Raw tweet text (``full_text`` field).

    Returns:
        The cleaned text; may be an empty string if everything was stripped.
    """
    # Drop everything from the first "RT" marker onward (quoted retweet body).
    rt_index = text.find("RT")
    if rt_index != -1:
        text = text[:rt_index]
    # Remove @mentions. BUG FIX: the original pattern required a trailing
    # whitespace character (`\s`), so a mention at the very end of the text
    # was never removed; `\s?` also matches the end-of-string case.
    text = re.sub(r"@[a-zA-Z0-9_]+\s?", "", text)
    # Remove URLs.
    text = re.sub(r"https?://[\w/:%#\$&\?\(\)~\.=\+\-]+", "", text)
    # Remove emoji (all code points outside the Basic Multilingual Plane).
    text = re.sub(r"[\U00010000-\U0010ffff]", "", text)
    return text


def remove_empty_lines(text):
    """Drop blank (whitespace-only) lines from *text* and rejoin with newlines."""
    kept_lines = [ln for ln in text.split("\n") if ln.strip()]
    return "\n".join(kept_lines)


# 特定のテキストが入ってるかを判定
# Membership check: does *text* include *target* as a substring?
def is_contain_text(text: str, target: str) -> bool:
    """Return True when *target* occurs anywhere inside *text*."""
    return text.find(target) != -1


# Read the tweet archive JSON.
with open("data/tweets.json") as f:
    data = json.load(f)

data = remove_retweets(data)
out_text: str = ""
count: int = 0
for d in data:
    tweet = d["tweet"]
    # `created_at` ends with the 4-digit year, e.g. "... 2020".
    tweet_year = int(tweet["created_at"][-4:])
    if tweet_year < DATE_RANGE["start"] or tweet_year > DATE_RANGE["end"]:
        continue
    tweet_text = extract_text(tweet["full_text"])
    if tweet_text == "":
        continue
    # BUG FIX: the original inner `for ng_word ... continue` only advanced
    # the inner loop, so NG-word filtering never skipped a tweet. Use a
    # single containment check and `continue` the OUTER loop instead.
    if any(is_contain_text(tweet_text, ng_word) for ng_word in NG_WORDS):
        continue
    print(tweet_text)
    print(tweet_year)
    count += 1
    out_text += tweet_text + "\n"

# Remove blank lines from the accumulated output.
out_text = remove_empty_lines(out_text)

# Write the cleaned corpus for the ranking step.
with open("data/out/tweets.txt", "w") as f:
    f.write(out_text)

print(count)

ランキング

import MeCab
from collections import Counter


def load_text_file(file_path):
    """Return the full contents of the UTF-8 text file at *file_path*."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents


def extract_words(text):
    """Collect every noun (part of speech 名詞) found in *text* via MeCab."""
    tagger = MeCab.Tagger()
    nouns = []
    node = tagger.parseToNode(text)
    while node is not None:
        part_of_speech = node.feature.split(",")[0]
        if part_of_speech == "名詞":
            nouns.append(node.surface)
        node = node.next
    return nouns


def create_word_ranking(words, top_n=100):
    """Return the *top_n* most frequent words as (word, count) pairs."""
    return Counter(words).most_common(top_n)


def main():
    """Load the cleaned tweet corpus, rank its nouns, and print the result."""
    corpus = load_text_file("data/out/tweets.txt")
    ranking = create_word_ranking(extract_words(corpus))

    print("名詞の頻出語句ランキング:")
    for rank, (word, count) in enumerate(ranking, start=1):
        print(f"{rank}. {word}: {count}")


if __name__ == "__main__":
    main()


1文字の語は、処理の段階で一旦除外したほうがいいかもね。

1
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1
0