More than 1 year has passed since last update.

英文をOCRで手軽に翻訳するツールにChatGPT APIを足し算したら、何でも聞ける万能アシスタントになった。【Python】

Last updated at 2023-03-03 / Posted at 2023-03-02

はじめに

以前、OCR ライブラリと DeepL API を使用して、英語論文などをコピーの可否に関わらずその場で手軽に翻訳するツールを作成しました。

DeepL APIを使って英語論文をサクッと翻訳するツールを作った。【Python】

それだけでも修論執筆時の論文読みに非常に重宝したのですが、先日公開された ChatGPT API を乗っけたら便利そうなので試してみました。

OCR → 翻訳 → 要約の様子

デモに使用した論文：https://arxiv.org/abs/2106.09685

何ができるか

日英翻訳にとどまらず、様々な言語への翻訳・要約・プログラムの作成・会話・ロールプレイなど（ChatGPT でできること全般）
OCR 機能により、入力テキストとして画像上などのコピー不可能な文字も使用可能
画面上にオーバーレイ表示
ブラウザから飛び出して、PC 上のどこでも利用可能

デメリット

安いとは言え有料
翻訳用途のみの場合、DeepL API の方が速い

実装

ちょっと長いので折りたたみ

from threading import Thread
import win32api
from PIL import ImageGrab
import pyocr
import ctypes
import time
import tkinter as tk
import openai
import pyperclip as ppc


class ClipboardOCR:
    """Run OCR on the image currently held in the system clipboard.

    The first OCR tool reported by ``pyocr`` is used together with a
    ``TextBuilder`` created with ``tesseract_layout=6``.
    """

    def __init__(self):
        """Pick the OCR tool and prepare the text builder.

        Raises:
            RuntimeError: If ``pyocr`` cannot find any OCR tool.
        """
        tools = pyocr.get_available_tools()
        if not tools:
            # Fail with an actionable message instead of a bare IndexError.
            raise RuntimeError(
                "No OCR tool available to pyocr; install Tesseract OCR first."
            )
        self.tool = tools[0]
        self.builder = pyocr.builders.TextBuilder(tesseract_layout=6)
        # Most recent OCR result; returned again when the clipboard holds
        # no image on a later call.
        self.text = ""

    def run(self, lang="jpn"):
        """Recognise the text of the clipboard image.

        Args:
            lang: Language code handed to the OCR tool (e.g. ``"eng"``,
                ``"jpn"``).

        Returns:
            The recognised text. When the clipboard does not contain an
            image, the previously recognised text is returned unchanged.
        """
        img = ImageGrab.grabclipboard()
        # grabclipboard() yields None when nothing usable is on the clipboard
        # and a list of file names when files were copied; neither of those
        # can be passed to the OCR tool.
        if img is not None and not isinstance(img, list):
            self.text = self.tool.image_to_string(
                img, lang=lang, builder=self.builder
            )
        return self.text


class GPT:
    """Small conversational wrapper around the OpenAI chat completion API.

    The message history is kept on the instance so that follow-up questions
    are answered in the context of the earlier exchange.
    """

    def __init__(self, api_key, settings_text):
        """Configure the API key and start with an empty conversation.

        Args:
            api_key: OpenAI secret key; stored on the ``openai`` module.
            settings_text: System prompt. When non-empty it is sent as the
                first message of every fresh conversation.
        """
        openai.api_key = api_key
        self.settings_text = settings_text
        self.past_messages = []

    def _completion(
        self, new_message_text: str, settings_text: str = "", past_messages: list = None
    ):
        """Send one user message and return the assistant's reply.

        Args:
            new_message_text: The user's message.
            settings_text: System prompt, used only when the history is empty.
            past_messages: Conversation so far. The list is extended in place
                with the new user message and the assistant's answer. A fresh
                list is created when omitted.

        Returns:
            A ``(reply_text, past_messages)`` tuple.
        """
        # A literal [] default would be shared between calls and leak one
        # conversation into the next.
        if past_messages is None:
            past_messages = []
        if not past_messages and settings_text:
            past_messages.append({"role": "system", "content": settings_text})
        past_messages.append({"role": "user", "content": new_message_text})

        result = openai.ChatCompletion.create(
            model="gpt-3.5-turbo", messages=past_messages
        )
        response_message_text = result.choices[0].message.content
        past_messages.append(
            {"role": "assistant", "content": response_message_text}
        )
        return response_message_text, past_messages

    def completion(self, tk, text):
        """Ask the model and show the answer in the chat window.

        Started on a worker thread by :meth:`run`.

        Args:
            tk: The window object; its ``label``, ``sub_win``, ``coor`` and
                ``btn`` attributes are used to display the answer.
            text: The user's message.
        """
        # NOTE(review): the widgets are updated from the worker thread, as in
        # the original design - confirm this is safe with the Tk build in use.
        print(f"user: {text}")
        try:
            resp, self.past_messages = self._completion(
                text, self.settings_text, self.past_messages
            )
            resp = resp.strip()
        except Exception as exc:
            # Top of the worker thread: surface the failure in the window
            # instead of dying silently with the button stuck on "...".
            resp = f"Error: {exc}"
        print(f"assistant: {resp}")
        try:
            tk.label["text"] = resp
            x1, x2, y1, y2 = tk.coor
            # abs()/min() keep the geometry valid even if the rectangle was
            # dragged right-to-left or bottom-to-top.
            width = abs(x2 - x1)
            height = max(tk.label.winfo_reqheight() + 40, abs(y2 - y1))
            tk.sub_win.geometry(
                f"{width}x{height}+{min(x1, x2)}+{min(y1, y2)}"
            )
        finally:
            # Always re-enable the button label, even if the chat window was
            # closed while the request was in flight.
            tk.btn["text"] = "Send"

    def run(self, tk, text):
        """Start :meth:`completion` on a daemon thread.

        Args:
            tk: The window object forwarded to :meth:`completion`.
            text: The user's message; surrounding whitespace is stripped.
        """
        task = Thread(
            target=self.completion,
            args=(tk, text.strip()),
            daemon=True,
        )
        task.start()

    def reset(self):
        """Forget the conversation so the next message starts a new one."""
        self.past_messages = []


class CommandWindow(tk.Tk):
    """Main control window: prompt input, OCR options and the chat overlay.

    The window polls the keyboard for the screenshot shortcut. When it is
    pressed, the dragged screen rectangle becomes the position of an overlay
    window that shows the OCR text and then the assistant's answer.
    """

    def __init__(self, api_key):
        """Build the UI and start polling for the screenshot shortcut.

        Args:
            api_key: OpenAI secret key passed on to :class:`GPT`.
        """
        super().__init__()
        settings = ""  # system prompt handed to GPT
        self.c = ClipboardOCR()
        self.gpt = GPT(api_key, settings)
        self.setup_ui()
        self.watchdog()

    def setup_ui(self):
        """Create the option row, the prompt box and the two buttons."""
        self.title("Command Window")
        self.geometry("300x200")
        self.attributes("-alpha", 0.95)
        self.attributes("-topmost", True)

        # Option row: OCR language (0 = EN, 1 = JA) and newline removal.
        self.option_frame = tk.Frame(self)
        self.lang = tk.IntVar()
        self.is_remove_return = tk.BooleanVar()
        self.lang_radio_en = tk.Radiobutton(
            self.option_frame, value=0, variable=self.lang, text="EN"
        )
        self.lang_radio_ja = tk.Radiobutton(
            self.option_frame, value=1, variable=self.lang, text="JA"
        )
        self.remove_return = tk.Checkbutton(
            self.option_frame, variable=self.is_remove_return, text="Remove ⏎"
        )
        self.lang_radio_en.grid(column=0, row=0, sticky="NEW")
        self.lang_radio_ja.grid(column=1, row=0, sticky="NEW")
        self.remove_return.grid(column=2, row=0, sticky="NEW")
        self.option_frame.grid(column=0, row=0, sticky="NEW")
        for col in range(3):
            self.option_frame.columnconfigure(col, weight=1)

        # Prompt box, pre-filled with a default instruction.
        self.command = tk.Text(self, font=("游ゴシック", "12"), undo=True)
        self.command.insert("1.0", "日本語に翻訳して。")
        self.command.grid(column=0, row=1, sticky="NEWS")
        self.btn = tk.Button(self, text="Send", command=self.send)
        self.btn.grid(column=0, row=2, sticky="EW")
        self.copy_btn = tk.Button(
            self, text="Copy result", command=self._copy_result
        )
        self.copy_btn.grid(column=0, row=3, sticky="EWS")

        # Only the prompt box row stretches with the window.
        self.columnconfigure(0, weight=1)
        self.rowconfigure(0, weight=0)
        self.rowconfigure(1, weight=1)
        self.rowconfigure(2, weight=0)
        self.rowconfigure(3, weight=0)

        self.text = ""
        self.sub_win = None
        # Created together with the chat window; None until the first one
        # exists so that callbacks can test for it.
        self.label = None

    def _copy_result(self):
        """Copy the chat window's text to the clipboard, if there is one."""
        if self.label is not None and self.label.winfo_exists():
            ppc.copy(self.label["text"])

    @staticmethod
    def _normalize_rect(x1, x2, y1, y2):
        """Return the rectangle with ``x1 <= x2`` and ``y1 <= y2``."""
        return min(x1, x2), max(x1, x2), min(y1, y2), max(y1, y2)

    def _make_label(self):
        """Create the chat window's text label showing ``self.text``."""
        label = tk.Label(
            self.sub_win,
            font=("游ゴシック", "12"),
            anchor="e",
            justify="left",
            text=self.text,
        )
        label.grid(column=0, row=0)
        return label

    def runClipboardOCR(self, lang="eng"):
        """Return the OCR text of the clipboard image for language ``lang``."""
        return self.c.run(lang)

    def send(self):
        """Send the prompt; open a chat window first when none is open."""
        self.btn["text"] = "..."
        if self.sub_win is None or not self.sub_win.winfo_exists():
            # geometry() is "WxH+X+Y"; geo becomes [W, H, X, Y].
            geo = [int(i) for i in self.geometry().replace("x", "+").split("+")]
            # Place a 500x200 chat window to the right of this window.
            self.sub_window(
                350 + geo[2], 850 + geo[2], 15 + geo[3], 215 + geo[3], False
            )
        else:
            self.ask_gpt(self.command.get("1.0", "end -1c"))

    def sub_window(self, x1, x2, y1, y2, ocr=True):
        """Open a chat window over the given rectangle and ask the model.

        Args:
            x1, x2: Horizontal screen coordinates of the rectangle.
            y1, y2: Vertical screen coordinates of the rectangle.
            ocr: When true, the clipboard image is OCR'd and appended to the
                prompt; otherwise only the prompt text is sent.
        """
        # A drag towards the upper left yields x2 < x1 / y2 < y1, which would
        # produce a negative size in the geometry string.
        x1, x2, y1, y2 = self._normalize_rect(x1, x2, y1, y2)
        self.coor = x1, x2, y1, y2
        self.sub_win = tk.Toplevel()
        self.sub_win.title("Chat window")
        self.sub_win.attributes("-alpha", 0.95)
        self.sub_win.attributes("-topmost", True)
        self.sub_win.bind("<Configure>", self.sized)
        self.sub_win.bind("<Shift-ButtonPress-1>", self.toggleOverrideRedirect)
        # NOTE(review): presumably gives the screenshot tool time to put the
        # capture on the clipboard - confirm; it blocks the event loop.
        time.sleep(0.5)
        close_btn = tk.Button(
            self.sub_win, text="x", command=self.sub_win.destroy
        )
        close_btn.place(relx=1, rely=0, anchor="ne")
        if ocr:
            self.ocr()
        else:
            self.text = ""
            self.label = self._make_label()
        self.ask_gpt(f"{self.command.get('1.0', 'end -1c')}\n\n{self.text}", init=True)
        self.sub_win.geometry(
            f"{x2-x1}x{max(self.label.winfo_reqheight()+40, y2-y1)}+{x1}+{y1}"
        )
        # Borderless overlay; Shift+click toggles the window frame back on.
        self.sub_win.overrideredirect(1)

    def ocr(self):
        """OCR the clipboard image and show the text in the chat window."""
        self.text = self.runClipboardOCR(["eng", "jpn"][self.lang.get()])
        if self.is_remove_return.get():
            # Re-join hyphenated line breaks, then flatten remaining newlines.
            self.text = self.text.replace("-\n", "").replace("\n", " ")
        self.label = self._make_label()

    def ask_gpt(self, text, init=False):
        """Send ``text`` to the model; ``init`` starts a new conversation."""
        if init:
            self.gpt.reset()
        self.gpt.run(self, text)

    def sized(self, *args):
        """Keep the label's wrap width in step with the chat window width."""
        if self.label is not None and self.label.winfo_exists():
            self.label["wraplength"] = self.sub_win.winfo_width() - 40

    def watchdog(self):
        """Poll every 10 ms for the shortcut and open a chat window on it."""
        # Virtual-key codes: 91 = left Windows key, 16 = Shift, 83 = "S".
        if get_key_state(91) and get_key_state(16) and get_key_state(83):
            self.sub_window(*get_rectcoordinate())
        self.after(10, self.watchdog)

    def toggleOverrideRedirect(self, ev):
        """Toggle the chat window's frame and bring it back to the front."""
        self.sub_win.overrideredirect(not self.sub_win.overrideredirect())
        # Withdraw and re-show the window after flipping the flag.
        self.sub_win.withdraw()
        self.sub_win.deiconify()
        self.sub_win.focus_force()
        return


def get_key_state(keycode):
    """Tell whether ``GetAsyncKeyState`` reports the given key as down.

    Args:
        keycode: Windows virtual-key code to query.

    Returns:
        True when the returned state is negative, i.e. its sign bit is set.
    """
    state = win32api.GetAsyncKeyState(keycode)
    # An arithmetic right shift keeps the sign, so testing the shifted value
    # for negativity is the same as testing the value itself.
    return state < 0


def get_rectcoordinate():
    """Wait for one left-button drag and return its start and end points.

    Blocks the calling thread until the left mouse button has been pressed
    and released again.

    Returns:
        ``(x1, x2, y1, y2)``: the cursor position at press time is
        ``(x1, y1)``, the position after release is ``(x2, y2)``. The end
        point may lie to the left of or above the start point.
    """

    class _pointer(ctypes.Structure):
        _fields_ = [
            ("x", ctypes.c_long),
            ("y", ctypes.c_long),
        ]

    user32 = ctypes.windll.user32
    point = _pointer()
    vk_leftbutton = 0x01
    down_bit = 0x8000  # most significant bit of the 16-bit key state
    poll_interval = 0.005  # seconds; avoids spinning a CPU core while waiting

    def left_button_down():
        # Test only the "down" bit: the low bit ("pressed since last call")
        # may be set as well, so an equality test against 0x8000 can miss.
        return bool(user32.GetAsyncKeyState(vk_leftbutton) & down_bit)

    while not left_button_down():
        time.sleep(poll_interval)
    user32.GetCursorPos(ctypes.byref(point))
    x1, y1 = point.x, point.y

    while left_button_down():
        time.sleep(poll_interval)
    user32.GetCursorPos(ctypes.byref(point))
    x2, y2 = point.x, point.y
    return x1, x2, y1, y2

if __name__ == "__main__":
    import os

    API_KEY = ""  # enter your OpenAI secret key here
    # When no key is hard-coded above, fall back to the OPENAI_API_KEY
    # environment variable so the secret need not live in the source file.
    api_key = API_KEY or os.environ.get("OPENAI_API_KEY", "")
    command_window = CommandWindow(api_key)
    command_window.mainloop()

スパゲッティですまん。

使い方

Python 及び必要なライブラリをインストールする（pyocr, openai, pyperclip, pywin32, Pillow あたり）。
Tesseract OCR をインストールする。
OpenAI のアカウントを作成し、支払い用のカードを登録することで、 ChatGPT API を使用可能な状態にする。https://platform.openai.com/
プログラム下部の API_KEY = "" の "" の中に、API keys のページで作成した Secret Key を入力する。
プログラムを実行する。
表示されたウインドウの入力欄に、行って欲しいタスクを入力する。
Win + Shift + S でスクショする。
ChatGPT が答えてくれる。

手順 7 でスクショを撮らずに「Send」ボタンを押すと、普通に ChatGPT とお話ができます。
ウィンドウ上部のボタンたちは、左2つは OCR の言語、右のは改行を削除してからChatGPTに渡すかどうかです。

注）回答側のウィンドウを閉じると記憶を失います。

使用例

翻訳・要約
リファクタリング
お話

さいごに

オーバーレイ表示の必然性のない用途では、わざわざ使用する必要はないと言えばそれまでなんですが、
常駐させておけばサッと翻訳したり、息抜きに会話を楽しんだりできて良いかも。
良かったら使ってみてください。

参考

API叩いてる部分のプログラム

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up