More than 1 year has passed since last update.

Python3: Wikipedia, WiktionaryのID取得、AmazonのASIN取得、YouTubeのチャット取得、そうでない場合はパーセントエンコード

Last updated at 2024-03-11 / Posted at 2024-03-04

はじめの前のおねがい

できれば「いいね♡」をお願いします。励みになります。

はじめに

今まで私のマイページ¹では「Python, Pythonista3 & Automator: AmazonのURLからASINのみが記載されたURLを生成する²」などで、1つのコードであれもやりたい、これもやりたいと、色々な機能を含めてきたわけですが、実際に自分で運用していく中で、まあこのくらいのことができれば良いだろうという段階まで出来上がりました。つまり

WikipediaのページID取得
WiktionaryのページID取得
AmazonのASIN取得
YouTube Liveのチャット取得
パーセントエンコード³

をそれぞれ、URL（https://以降）の文字列を参考にして判別し、〒で始まる住所の場合は「乗換案内」の住所検索用に郵便番号の部分を取り除いたものを出力し、それ以外の場合はパーセントエンコードをするといったものです。

YouTubeチャットの取得に関する重要注意事項・免責

YouTubeチャットに関しましては、以下にはくれぐれもご注意ください。当方ではコードの提供はいたしますが、私はあくまで自分のYouTubeLive配信のデータ取得を目的として作成しましたので、それ以外の用途を意図していません。

注意：著作権上の問題もありますので、利用には著作権者への同意をご自身でお願いします。

免責：私個人は取得されたデータによるいかなる損害についても責任を負いません。

手順

URLをクリップボードにコピー
Python 3あるいはPython 3コードが実行できるIDEを起動
本コードに必要なモジュール⁴をインストール
本コードをコピペ
本コードを実行
変換後のものをクリップボードに再コピー
コンソールに変換後のものや付属データを吐き出す

モジュールの準備

URL処理モジュール

（urllib.parse はPythonの標準ライブラリに含まれるため、インストールは不要です）

APIリクエスト処理モジュール

pip install requests

クリップボードモジュール

pip install pyperclip

JSON処理モジュール

（json はPythonの標準ライブラリに含まれるため、インストールは不要です）

YouTube処理モジュール

pip install yt-dlp

他に記載を忘れているモジュールがありましたら、コメントでご連絡をいただければ幸いです。

ソースコード

ClipboardConverter.py

import re
import sys
import urllib.parse
import requests
import pyperclip  #Python3用クリップボード処理
from yt_dlp import YoutubeDL
import json
import os
from datetime import timedelta

# クリップボード前処理
class ClipboardURLFetcher:
    """Read the text to convert from the system clipboard."""

    def fetch_url(self):
        """Return the clipboard text with percent-escapes decoded.

        Exits the program with a message when the clipboard is empty.
        """
        clipboard_text = pyperclip.paste()
        if clipboard_text:
            return urllib.parse.unquote(clipboard_text)
        sys.exit("URLをクリップボードに入れてください")

# URL前処理
class BaseURLProcessor:
    """Interface for processors that convert one kind of URL."""

    def process_url(self, url):
        """Convert *url*; concrete subclasses must override this method."""
        raise NotImplementedError()

# Amazon処理
class AmazonURLProcessor(BaseURLProcessor):
    """Shorten an Amazon product URL to ``https://www.amazon.<tld>/dp/<ASIN>``."""

    # An ASIN is ten ASCII letters/digits; ``\w`` would also accept
    # underscores and non-ASCII letters from a decoded product title.
    _ASIN_RE = re.compile(r'[0-9A-Za-z]{10}')
    # Path markers that are immediately followed by the ASIN in product
    # URLs that do not use the ``/dp/`` form.
    _GP_MARKERS = ('/gp/product/', '/gp/aw/d/')

    def process_url(self, url):
        """Copy the shortened URL to the clipboard and exit the program.

        Returns:
            None when *url* is not a recognisable Amazon product URL, so the
            caller can decide what to do with it.  On success this method
            does not return: it calls ``sys.exit(0)`` after copying.
        """
        if 'https://www.amazon' not in url:
            return None

        short_url = self.build_short_url(url)
        if short_url is None:
            return None

        self.copy_to_clipboard(short_url)
        sys.exit(0)

    def build_short_url(self, url):
        """Return the short ``/dp/<ASIN>`` URL for *url*, or None.

        This has no side effects (no clipboard access, no exit).
        """
        domain = self.extract_domain(url)
        if domain is None:
            # Without this guard the result would be "amazon.None".
            return None

        asin = self.extract_asin(url)
        if asin:
            return self.generate_short_url(asin, domain)

        for marker in self._GP_MARKERS:
            if marker not in url:
                continue
            start = url.find(marker) + len(marker)
            # Only accept the slice when it really looks like an ASIN.
            if self._ASIN_RE.fullmatch(url[start:start + 10]):
                return self.asin_url(url, marker, domain)
        return None

    def extract_domain(self, url):
        """Return the part after ``www.amazon.`` (e.g. ``co.jp``), or None."""
        match = re.search(r'https://www\.amazon\.([a-z.]+)/', url)
        if match:
            return match.group(1)
        return None

    def extract_asin(self, url):
        """Return the ASIN that follows ``/dp/`` in *url*, or None."""
        match = re.search(r'/dp/([0-9A-Za-z]{10})', url)
        if match:
            return match.group(1)
        return None

    def asin_url(self, url, pattern, domain):
        """Build the short URL from the ten characters following *pattern*."""
        pos = url.find(pattern) + len(pattern)
        asin = url[pos:pos + 10]
        return self.generate_short_url(asin, domain)

    def generate_short_url(self, asin, domain):
        """Return the canonical product URL for *asin* on *domain*."""
        return f'https://www.amazon.{domain}/dp/{asin}'

    def copy_to_clipboard(self, url):
        """Copy *url* to the clipboard and report it on the console."""
        pyperclip.copy(url)
        print(f'クリップボードにコピー完了（Amazon）：{url}')

# WikipediaとWiktionaryの処理
class WikiURLProcessor:
    """Convert a Wikipedia/Wiktionary article URL into its page-ID URL."""

    # Characters left unescaped when percent-encoding the printed URLs.
    _QUOTE_SAFE = '/:=@,.!?"\''
    # Seconds to wait for the MediaWiki API so a stalled request cannot hang
    # the script forever.
    _API_TIMEOUT = 10

    def __init__(self, site):
        """Store the project name used in host names, e.g. ``'wikipedia'``."""
        self.site = site

    def process_url(self, url):
        """Copy the ``curid`` URL for the article at *url* to the clipboard.

        Returns:
            The new URL, or None when *url* is not an https article URL of
            this site.

        Exits the program with a message when *url* already contains
        ``?curid=``, when it cannot be parsed, or when the API reports no
        page ID for the title.
        """
        if '?curid=' in url:
            sys.exit(f'既に変換済みです：{url}')

        # The mobile marker '.m.<site>.org/wiki/' contains the desktop marker
        # as a substring, so one membership test covers both forms.
        if 'https://' not in url or f'.{self.site}.org/wiki/' not in url:
            return None

        url = url.replace(f'.m.{self.site}.org/wiki/', f'.{self.site}.org/wiki/')

        parsed = self._split_article_url(url)
        if parsed is None:
            sys.exit(f'無効なURLです：{url}')
        lang, text = parsed

        page_id = self.get_page_id(lang, text)
        if not page_id:
            sys.exit(f'変換不可能なリンクです：{url}')

        new_url = self._id_url(lang, page_id)
        pyperclip.copy(new_url)

        # Pass a canonical title URL so a '#fragment' is not percent-encoded
        # into the printed links.
        self.print_info(f'https://{lang}.{self.site}.org/wiki/{text}', lang, text, page_id)

        return new_url

    def _split_article_url(self, url):
        """Return ``(lang, title)`` for a desktop article URL, or None."""
        match = re.search(r'https://([a-z\-]+)\.' + re.escape(self.site) + r'\.org/wiki/', url)
        if not match:
            return None
        # '#' cannot be part of a page title, so anything after it is a
        # section anchor that must not be sent to the API as the title.
        title = url[match.end():].split('#', 1)[0]
        return match.group(1), title

    def _id_url(self, lang, page_id, mobile=False):
        """Return the ``curid`` URL for *page_id* on the desktop or mobile host."""
        host = f'{lang}.m.{self.site}.org' if mobile else f'{lang}.{self.site}.org'
        path = '/?curid=' if self.site == 'wikipedia' else '/w/index.php?curid='
        return f'https://{host}{path}{page_id}'

    def _mobile_title_url(self, url):
        """Return the mobile-host variant of a desktop title URL."""
        return url.replace(f'.{self.site}.org/wiki/', f'.m.{self.site}.org/wiki/', 1)

    def get_page_id(self, lang, title):
        """Ask the site's ``api.php`` for the page ID of *title*.

        Returns:
            The ``pageid`` value of the first page in the response, or None
            when the response has no page or the page has no ID.

        Raises:
            requests.RequestException: on network failure, timeout or an
                HTTP error status.
        """
        api_url = f'https://{lang}.{self.site}.org/w/api.php'
        params = {
            'action': 'query',
            'titles': title,
            'format': 'json',
        }
        response = requests.get(api_url, params=params, timeout=self._API_TIMEOUT)
        response.raise_for_status()
        # An error payload has no 'query'/'pages'; treat it as "no page".
        pages = response.json().get('query', {}).get('pages', {})
        page = next(iter(pages.values()), {})
        return page.get('pageid')

    def print_info(self, url, lang, text, page_id):
        """Print the title, page ID and the desktop/mobile URL variants."""
        encoded_url = urllib.parse.quote(url, safe=self._QUOTE_SAFE)
        mobile_encoded_url = urllib.parse.quote(self._mobile_title_url(url), safe=self._QUOTE_SAFE)
        site_label = self.site.capitalize()
        print(f'{site_label}タイトル：{text}')
        print(f'Page(s)ID：{page_id}')
        print(f'タイトル込みURL：{url}')
        print(f'PC用エンコード済みURL：{encoded_url}')
        print(f'Mobile用エンコード済みURL：{mobile_encoded_url}')
        print('')
        print(f'{site_label}タイトル：{text}')
        print(f'PC用ID込みURL：{self._id_url(lang, page_id)}')
        print(f'Mobile用ID込みURL：{self._id_url(lang, page_id, mobile=True)}')

# URLエンコード（パーセントエンコード）
class TextURLEncoder:
    """Percent-encode arbitrary text and put the result on the clipboard."""

    def process_text(self, text):
        """Encode *text*, copy it to the clipboard and echo it to the console."""
        quoted = urllib.parse.quote(text, safe='/:=@,.!?"\'')
        pyperclip.copy(quoted)
        print(f'URLエンコードされたテキストをクリップボードにコピーしました：{quoted}')

# YouTubeチャット取得処理
class YouTubeChatDownloader:
    """Download a YouTube live-chat replay with yt-dlp and save it as text."""

    # Renderer keys that are exported, in the order they are checked.
    _TEXT = "liveChatTextMessageRenderer"
    _PAID_MESSAGE = "liveChatPaidMessageRenderer"
    _PAID_STICKER = "liveChatPaidStickerRenderer"
    _RENDERER_KEYS = (_TEXT, _PAID_MESSAGE, _PAID_STICKER)

    def __init__(self):
        # Timestamp (microseconds) of the first exported chat item; every
        # printed time is relative to it.
        self.video_start_usec = None

    def process_message(self, renderer):
        """Return the message text of *renderer*, emoji replaced by their labels."""
        if "message" not in renderer:
            return ""
        parts = []
        for element in renderer["message"].get("runs", []):
            if "text" in element:
                parts.append(element["text"])
            elif "emoji" in element:
                # An emoji without an accessibility label is skipped instead
                # of aborting the whole export with a KeyError.
                label = (
                    element["emoji"]
                    .get("image", {})
                    .get("accessibility", {})
                    .get("accessibilityData", {})
                    .get("label")
                )
                if label:
                    parts.append(label)
        return "".join(parts)

    def process_username(self, renderer):
        """Return the author's display name, or "Unknown User" when absent."""
        if "authorName" in renderer:
            return renderer["authorName"]["simpleText"]
        return "Unknown User"

    def process_superchat(self, renderer):
        """Return ``(message, amount)`` for a paid message or paid sticker."""
        amount = renderer.get("purchaseAmountText", {}).get("simpleText", "")
        message = self.process_message(renderer)
        return message, amount

    def process_timestamp(self, renderer):
        """Return the item's time as ``HH:MM:SS`` relative to ``video_start_usec``."""
        timestamp_usec = int(renderer["timestampUsec"])
        timestamp_delta = timedelta(microseconds=(timestamp_usec - self.video_start_usec))
        hours, remainder = divmod(timestamp_delta.total_seconds(), 3600)
        minutes, seconds = divmod(remainder, 60)
        return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}"

    @staticmethod
    def extract_video_id(url):
        """Return the video ID contained in *url*, or None.

        Handles ``watch?v=ID`` (with ``v`` in any query position) as well as
        path-style URLs such as ``youtu.be/ID`` and ``/live/ID``.
        """
        parsed = urllib.parse.urlparse(url)
        from_query = urllib.parse.parse_qs(parsed.query).get('v')
        if from_query:
            return from_query[0]
        segments = [segment for segment in parsed.path.split('/') if segment]
        # A bare ".../watch" without a v= parameter carries no ID.
        if segments and segments[-1] != 'watch':
            return segments[-1]
        return None

    def format_chat_item(self, item):
        """Return the output line for one chat *item*, or None if not exported."""
        for key in self._RENDERER_KEYS:
            if key in item:
                break
        else:
            return None

        renderer = item[key]
        if self.video_start_usec is None:
            self.video_start_usec = int(renderer["timestampUsec"])

        username = self.process_username(renderer)
        if key == self._TEXT:
            message = self.process_message(renderer)
            timestamp = self.process_timestamp(renderer)
            return f"{timestamp} [{username}]: {message}"
        if key == self._PAID_MESSAGE:
            message, amount = self.process_superchat(renderer)
            timestamp = self.process_timestamp(renderer)
            return f"{timestamp} [{username}] (Super Chat {amount}): {message}"
        # Paid stickers carry an amount but no message text.
        _, amount = self.process_superchat(renderer)
        timestamp = self.process_timestamp(renderer)
        return f"{timestamp} [{username}] (Super Sticker {amount})"

    def convert_chat_line(self, line):
        """Parse one JSON line of the chat file; return its output line or None.

        Only the first action of a line is inspected.  Blank lines and lines
        that are not "add chat item" replay actions yield None.
        """
        line = line.strip()
        if not line:
            return None
        data = json.loads(line)
        actions = data.get("replayChatItemAction", {}).get("actions") or []
        if not actions:
            return None
        item = actions[0].get("addChatItemAction", {}).get("item")
        if item is None:
            return None
        return self.format_chat_item(item)

    def convert_chat_file(self, input_file_name, output_file_name):
        """Convert the JSON-lines chat file into a plain-text transcript."""
        # Start from a clean reference time so a reused instance does not
        # carry over the previous video's first timestamp.
        self.video_start_usec = None
        with open(input_file_name, 'r', encoding='utf-8') as infile, \
             open(output_file_name, 'w', encoding='utf-8') as outfile:
            for line in infile:
                formatted = self.convert_chat_line(line)
                if formatted is not None:
                    outfile.write(formatted + "\n")

    def download_chat(self, url):
        """Fetch the live chat of *url* with yt-dlp and write ``<id>_.live_chat.txt``.

        Prints a message and returns without raising when *url* is not a
        usable YouTube URL or when yt-dlp produced no chat file.
        """
        youtube_pattern = r'(https?://)?(www\.)?(youtube\.com|youtu\.?be)/[\w-]+'
        if not re.match(youtube_pattern, url):
            print("Please provide a valid YouTube URL.")
            return

        video_id = self.extract_video_id(url)
        if not video_id:
            print("Please provide a valid YouTube URL.")
            return

        ydl_video_opts = {
            'outtmpl': f'{video_id}_.mp4',
            'format': 'best',
            'writesubtitles': True,
            # Request the chat track explicitly; without a language list
            # yt-dlp may pick a regular subtitle track instead.
            'subtitleslangs': ['live_chat'],
            'skip_download': True,
        }

        with YoutubeDL(ydl_video_opts) as ydl:
            ydl.download([url])

        input_file_name = f'{video_id}_.live_chat.json'
        output_file_name = os.path.splitext(input_file_name)[0] + '.txt'

        if not os.path.exists(input_file_name):
            print(f"Live chat file was not created: {input_file_name}")
            return

        self.convert_chat_file(input_file_name, output_file_name)

# YouTubeURL処理
class YouTubeURLProcessor:
    """Hand a YouTube URL over to the live-chat downloader."""

    def process_url(self, url):
        """Download the chat for *url*, or print a notice if it is not YouTube."""
        looks_like_youtube = re.match(
            r'(https?://)?(www\.)?(youtube\.com|youtu\.?be)/[\w-]+', url
        )
        if not looks_like_youtube:
            print("YouTubeのURLではありません。")
            return
        YouTubeChatDownloader().download_chat(url)

# GoogleMap → 乗換案内
class AddressProcessor:
    """Split a "〒..." address line into its leading part and the address."""

    def process_address(self, in_text):
        """Copy the address part of *in_text* to the clipboard and print both parts.

        The first nine characters are treated as the postal-code part.  The
        rest has full-width ASCII variants converted to ASCII, the minus sign
        replaced by a hyphen and leading whitespace removed before it is
        copied.  Text without "〒" in its first two characters only produces
        a notice.
        """
        if '〒' not in in_text[:2]:
            print("住所ではありません。")
            return

        postal_part = in_text[:9]
        remainder = in_text.replace(postal_part, '')
        fullwidth_to_ascii = str.maketrans(
            {chr(0xFF01 + offset): chr(0x21 + offset) for offset in range(94)}
        )
        address = remainder.translate(fullwidth_to_ascii).replace('−', '-').lstrip()

        # The clipboard receives the single-line form; the console shows one
        # space-separated component per line.
        pyperclip.copy(address)
        print(postal_part.replace('〒', ''))
        print(address.replace(' ', '\n'))

def main():
    """Convert the clipboard content according to what it looks like.

    Addresses starting with "〒", Amazon product URLs, Wikipedia/Wiktionary
    article URLs and YouTube URLs are handled by their processors.  Anything
    else is percent-encoded, including text that merely mentions one of those
    sites and is therefore rejected (``None``) by the site's processor.
    """
    url = ClipboardURLFetcher().fetch_url()

    if '〒' in url[:2]:
        AddressProcessor().process_address(url)
        return

    if 'amazon' in url:
        converted = AmazonURLProcessor().process_url(url)
    elif 'wikipedia.org/wiki/' in url:
        converted = WikiURLProcessor('wikipedia').process_url(url)
    elif 'wiktionary.org/wiki/' in url:
        converted = WikiURLProcessor('wiktionary').process_url(url)
    elif re.match(r'(https?://)?(www\.)?(youtube\.com|youtu\.?be)/[\w-]+', url):
        YouTubeURLProcessor().process_url(url)
        return
    else:
        converted = None

    # A processor returning None means "not mine"; fall back to plain
    # percent-encoding instead of silently doing nothing.
    if converted is None:
        TextURLEncoder().process_text(url)

# Run the converter only when this file is executed as a script.
if __name__ == "__main__":
    main()

終わりに

極力、classとdefを使用して、他のコードに転用できるように整理しましたが、まだ効率が十分ではありません。

「頭痛が痛い」みたいな表現になってしまいますよね。 ↩
https://qiita.com/ekemtt/items/3d7372bb6b17ddcd12f3 ↩
a.k.a. URLエンコード ↩
a.k.a. ライブラリ ↩

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up