LoginSignup
23
24

More than 3 years have passed since last update.

続【Python】英文PDF(に限らないけど)をDeepLやGoogle翻訳で自動で翻訳させてテキストファイル、いやHTMLにしてしまおう。

Last updated at Posted at 2020-08-06

前回の内容と今回やったこと

前回の記事【Python】英文PDF(に限らないけど)をDeepLやGoogle翻訳で自動で翻訳させてテキストファイルにしてしまおう。
では翻訳結果をテキストファイルに出力しましたが、翻訳前の文章と横に並べて見比べられたら便利だと思いませんか。
まだ改善の余地はありますがHTMLで実現しました。
image.png
例に使った論文

ごちゃごちゃと盛り付けたのでコードが汚いですがご容赦ください(もとからか)。

8/7追記

HTML表示時における
対応する英文or和文への、ハイライト機能・ジャンプ機能
ダークモード
を追加しました。また、
非ダークモード時の色合いも優しくしました。

逆翻訳(日本語 → 英語)に対応しました。


例に使った文

コード

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import pyperclip as ppc

DRIVER_PATH = 'chromedriver.exe'

options = Options()
options.add_argument('--disable-gpu')
options.add_argument('--disable-extensions')
options.add_argument('--proxy-server="direct://"')
options.add_argument('--proxy-bypass-list=*')
options.add_argument('--start-maximized')


def parse_merge(text, n=4900, m=1, inv=False):
    sentences = []
    sentence = ""
    for j, i in enumerate(" ".join(
            text.splitlines()).split(". " if inv == False else "。")):
        if i in ("", " ", "."): continue
        if (len(sentence) + len(i) > n) or (j % m == 0):
            sentences.append(sentence)
            sentence = ""
        sentence += i + ("." if inv == False else "。")
    sentences.append(sentence)
    return sentences


def TranslateFromClipboard(tool, write, filename, isPrint, html, title,
                           sentence_cnt, inv):
    driver = webdriver.Chrome(executable_path=DRIVER_PATH,
                              chrome_options=options)
    url = 'https://www.deepl.com/ja/translator' if tool == "DeepL" else f'https://translate.google.co.jp/?hl=ja&tab=TT&authuser=0#view=home&op=translate&sl=auto&tl={"en" if inv else "ja"}'
    driver.get(url)
    if tool == "DeepL":
        textarea = driver.find_element_by_css_selector(
            '.lmt__textarea.lmt__source_textarea.lmt__textarea_base_style')
    elif tool == "GT":
        textarea = driver.find_element_by_id('source')
    en = parse_merge(ppc.paste(), m=sentence_cnt, inv=inv)
    ja = []
    for sentence in en:
        if sentence == "":
            ja.append("")
            continue
        cbText = ppc.paste()
        ppc.copy(sentence)
        textarea.send_keys(Keys.CONTROL, "v")
        ppc.copy(cbText)
        transtext = ""
        while transtext == "":
            time.sleep(1)
            if tool == "DeepL":
                transtext = driver.find_element_by_css_selector(
                    '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style'
                ).get_property("value")
            elif tool == "GT":
                try:
                    time.sleep(1)
                    transtext = driver.find_element_by_css_selector(
                        '.tlid-translation.translation').text
                except:
                    pass
        if isPrint: print(transtext)
        ja.append(transtext)
        textarea.send_keys(Keys.CONTROL, "a")
        textarea.send_keys(Keys.BACKSPACE)
    driver.quit()
    if write:
        with open(filename + ".txt", "w", encoding='UTF-8') as f:
            f.write("\n".join(ja))
    if html:
        eng = ""
        jpn = ""
        for i, ej in enumerate(zip(en, ja)):
            eng += f'<br><a id="e{i}" href="#j{i}" onmouseover="over(' + f"'j{i}'" + ')" onmouseout="out(' + f"'j{i}'" + f')">{ej[0]}</a><br>'
            jpn += f'<br><a id="j{i}" href="#e{i}" onmouseover="over(' + f"'e{i}'" + ')" onmouseout="out(' + f"'e{i}'" + f')">{ej[1]}</a><br>'
        with open(filename + ".html", "w", encoding='UTF-8') as f:
            f.write(
                f'<h1 align="center">{title}</h1>\n<input id="btn-mode" type="checkbox">\n<hr>\n<body>\n<div class="parent">\n<div id="en">\n{eng}\n</div>\n<div id="ja">\n{jpn}\n</div>\n</div>'
                +
                '<style>\n:root {\n--main-text: #452b15;\n--main-bg: #f8f1e2;\n--highlight-text: #db8e3c;\n}\n:root[theme="dark"] {\n--main-text: #b0b0b0;\n--main-bg: #121212;\n--highlight-text: #fd8787;\n}\nh1 {\ncolor: var(--main-text);\n}\ninput {\nposition: absolute;\ntop: 1%;\nright: 1%;\n}\n#en {\nwidth: 43%;\nheight: 90%;\npadding: 0 2%;\nfloat: left;\nborder-right:1px solid #ccc;\nmargin: 1%;\noverflow: auto;\n}\n#ja {\nwidth: 43%;\nheight: 90%;\npadding: 0 2%;\nfloat: right;\nmargin: 1%;\noverflow: auto;\n}\na,\na:hover,\na:visited,\na:link,\na:active {\ncolor: var(--main-text);\ntext-decoration: none;\n}\nbody {\nbackground-color: var(--main-bg);\n}\n</style>\n<script>\nvar a = document.getElementsByTagName("a");\nfunction over(e) {\ndocument.getElementById(e).style.color = getComputedStyle(document.getElementById(e)).getPropertyValue("--highlight-text");\n}\nfunction out(e) {\ndocument.getElementById(e).style.color = getComputedStyle(document.getElementById(e)).getPropertyValue("--main-text");\n}\nconst btn = document.querySelector("#btn-mode");\nbtn.addEventListener("change", () => {\nif (btn.checked == true) {\ndocument.documentElement.setAttribute("theme", "dark");\n} else {\ndocument.documentElement.setAttribute("theme", "light");\n}\nfor (var i = 0; i < a.length; i++) {\na[i].style.color = getComputedStyle(a[i]).getPropertyValue("--main-text");\n}\n});\n</script>\n</body>'
            )


if __name__ == "__main__":
    args = [
        "DeepL", False, "translated_text.txt", True, False, "EN ↔ JP", 1, False
    ]
    if input("1. 英語 → 日本語    2. 日本語 → 英語   ") == "2": args[7] = True
    if input("1. DeepL 2.GoogleTranslate  ") == "2": args[0] = "GT"
    if input("翻訳結果を書き出しますか? y/n  ") == "y":
        case = input("1. txt 2. HTML 3. both    ")
        if case == "1":
            args[1] = True
            format_ = ".txt"
        elif case == "2":
            args[4] = True
            format_ = ".html"
        elif case == "3":
            args[1], args[4] = True, True
            format_ = ".txt/.html"
        filename = input(
            f"出力ファイルにつける名前を入力してください(デフォルトは'translated_text{format_}')  ")
        if filename:
            args[2] = filename.replace(" ", "_")
        if case == "2" or case == "3":
            title = input("(論文の)タイトルを入力してください   ")
            if title:
                args[5] = title
    try:
        args[6] = int(
            input("何文ずつ翻訳しますか?(デフォルトは1文ずつ。小さいほど出力がきれいで、大きいほど早くなります。)   "))
    except:
        pass
    if input("翻訳経過をここに表示しますか? y/n  ") == "n":
        args[3] = False
    input("準備ができたらEnterを押してください")
    TranslateFromClipboard(*args)

読みやすさはかなり上がりましたが、1文ずつ翻訳しているので全文翻訳するのにかなり時間がかかります(実行時の選択でまとめて翻訳することも出来ます)。

8/9追記

さらにひと手間かかりますが、より見やすく翻訳できる方法がありました。
PDFをWordで開いてからコピーして、以下の文章を分割する関数部分を少し変えたスクリプトで翻訳しましょう。
さすがWordと言うべきか改行をまたぐ文章等もきれいに整形してくれます。

Wordごし用微調整コード
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import pyperclip as ppc

DRIVER_PATH = 'chromedriver.exe'

options = Options()
options.add_argument('--disable-gpu')
options.add_argument('--disable-extensions')
options.add_argument('--proxy-server="direct://"')
options.add_argument('--proxy-bypass-list=*')
options.add_argument('--start-maximized')


def parse_merge(text, n=4900):
    sentences = []
    sentence = ""
    for i in text.splitlines():
        if i in ("", " ", "."): continue
        sentences.append(i)
    return sentences


def TranslateFromClipboard(tool, write, filename, isPrint, html, title,inv):
    driver = webdriver.Chrome(executable_path=DRIVER_PATH,
                              chrome_options=options)
    url = 'https://www.deepl.com/ja/translator' if tool == "DeepL" else f'https://translate.google.co.jp/?hl=ja&tab=TT&authuser=0#view=home&op=translate&sl=auto&tl={"en" if inv else "ja"}'
    driver.get(url)
    if tool == "DeepL":
        textarea = driver.find_element_by_css_selector(
            '.lmt__textarea.lmt__source_textarea.lmt__textarea_base_style')
    elif tool == "GT":
        textarea = driver.find_element_by_id('source')
    en = parse_merge(ppc.paste())
    ja = []
    for sentence in en:
        if sentence == "":
            ja.append("")
            continue
        cbText = ppc.paste()
        ppc.copy(sentence)
        textarea.send_keys(Keys.CONTROL, "v")
        ppc.copy(cbText)
        transtext = ""
        while transtext == "":
            time.sleep(1)
            if tool == "DeepL":
                transtext = driver.find_element_by_css_selector(
                    '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style'
                ).get_property("value")
            elif tool == "GT":
                try:
                    time.sleep(1)
                    transtext = driver.find_element_by_css_selector(
                        '.tlid-translation.translation').text
                except:
                    pass
        if isPrint: print(transtext)
        ja.append(transtext)
        textarea.send_keys(Keys.CONTROL, "a")
        textarea.send_keys(Keys.BACKSPACE)
    driver.quit()
    if write:
        with open(filename + ".txt", "w", encoding='UTF-8') as f:
            f.write("\n".join(ja))
    if html:
        eng = ""
        jpn = ""
        for i, ej in enumerate(zip(en, ja)):
            eng += f'<br><a id="e{i}" href="#j{i}" onmouseover="over(' + f"'j{i}'" + ')" onmouseout="out(' + f"'j{i}'" + f')">{ej[0]}</a><br>'
            jpn += f'<br><a id="j{i}" href="#e{i}" onmouseover="over(' + f"'e{i}'" + ')" onmouseout="out(' + f"'e{i}'" + f')">{ej[1]}</a><br>'
        with open(filename + ".html", "w", encoding='UTF-8') as f:
            f.write(
                f'<h1 align="center">{title}</h1>\n<input id="btn-mode" type="checkbox">\n<hr>\n<body>\n<div class="parent">\n<div id="en">\n{eng}\n</div>\n<div id="ja">\n{jpn}\n</div>\n</div>'
                +
                '<style>\n:root {\n--main-text: #452b15;\n--main-bg: #f8f1e2;\n--highlight-text: #db8e3c;\n}\n:root[theme="dark"] {\n--main-text: #b0b0b0;\n--main-bg: #121212;\n--highlight-text: #fd8787;\n}\nh1 {\ncolor: var(--main-text);\n}\ninput {\nposition: absolute;\ntop: 1%;\nright: 1%;\n}\n#en {\nwidth: 43%;\nheight: 90%;\npadding: 0 2%;\nfloat: left;\nborder-right:1px solid #ccc;\nmargin: 1%;\noverflow: auto;\n}\n#ja {\nwidth: 43%;\nheight: 90%;\npadding: 0 2%;\nfloat: right;\nmargin: 1%;\noverflow: auto;\n}\na,\na:hover,\na:visited,\na:link,\na:active {\ncolor: var(--main-text);\ntext-decoration: none;\n}\nbody {\nbackground-color: var(--main-bg);\n}\n</style>\n<script>\nvar a = document.getElementsByTagName("a");\nfunction over(e) {\ndocument.getElementById(e).style.color = getComputedStyle(document.getElementById(e)).getPropertyValue("--highlight-text");\n}\nfunction out(e) {\ndocument.getElementById(e).style.color = getComputedStyle(document.getElementById(e)).getPropertyValue("--main-text");\n}\nconst btn = document.querySelector("#btn-mode");\nbtn.addEventListener("change", () => {\nif (btn.checked == true) {\ndocument.documentElement.setAttribute("theme", "dark");\n} else {\ndocument.documentElement.setAttribute("theme", "light");\n}\nfor (var i = 0; i < a.length; i++) {\na[i].style.color = getComputedStyle(a[i]).getPropertyValue("--main-text");\n}\n});\n</script>\n</body>'
            )


if __name__ == "__main__":
    args = ["DeepL", False, "translated_text.txt", True, False, "EN ↔ JP",False]
    if input("1. 英語 → 日本語    2. 日本語 → 英語   ") == "2": args[6] = True
    if input("1. DeepL 2.GoogleTranslate  ") == "2": args[0] = "GT"
    if input("翻訳結果を書き出しますか? y/n  ") == "y":
        case = input("1. txt 2. HTML 3. both    ")
        if case == "1":
            args[1] = True
            format_ = ".txt"
        elif case == "2":
            args[4] = True
            format_ = ".html"
        elif case == "3":
            args[1], args[4] = True, True
            format_ = ".txt/.html"
        filename = input(
            f"出力ファイルにつける名前を入力してください(デフォルトは'translated_text{format_}')  ")
        if filename:
            args[2] = filename.replace(" ", "_")
        if case == "2" or case == "3":
            title = input("(論文の)タイトルを入力してください   ")
            if title:
                args[5] = title
    if input("翻訳経過をここに表示しますか? y/n  ") == "n":
        args[3] = False
    input("準備ができたらEnterを押してください")
    TranslateFromClipboard(*args)

8/11追記

Word通さずともある程度段落を分解できるようにしました。
(だいたい)段落ごとに翻訳するため、1文ずつに比べ翻訳速度もだいぶマシになりました。

文章分解強化版コード
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import pyperclip as ppc
import re
import unicodedata

DRIVER_PATH = 'chromedriver.exe'

options = Options()
options.add_argument('--disable-gpu')
options.add_argument('--disable-extensions')
options.add_argument('--proxy-server="direct://"')
options.add_argument('--proxy-bypass-list=*')
options.add_argument('--start-maximized')


def len_(text):
    cnt = 0
    for t in text:
        if unicodedata.east_asian_width(t) in "FWA":
            cnt += 2
        else:
            cnt += 1
    return cnt


def textParser(text, n=30, bracketDetect=True):
    text = text.splitlines()
    sentences = []
    t = ""
    bra_cnt = ket_cnt = bra_cnt_jp = ket_cnt_jp = 0
    for i in range(len(text)):
        if not bool(re.search("\S", text[i])): continue
        if bracketDetect:
            bra_cnt += len(re.findall("[\((]", text[i]))
            ket_cnt += len(re.findall("[\))]", text[i]))
            bra_cnt_jp += len(re.findall("[「「『]", text[i]))
            ket_cnt_jp += len(re.findall("[」」』]", text[i]))
        if i != len(text) - 1:
            if bool(re.fullmatch(r"[A-Z\s]+", text[i])):
                if t != "": sentences.append(t)
                t = ""
                sentences.append(text[i])
            elif bool(
                    re.match(
                        "(\d{1,2}[\.,、.]\s?(\d{1,2}[\.,、.]*)*\s?|I{1,3}V{0,1}X{0,1}[\.,、.]|V{0,1}X{0,1}I{1,3}[\.,、.]|[・•●])+\s",
                        text[i])) or re.match("\d{1,2}.\w", text[i]) or (
                            bool(re.match("[A-Z]", text[i][0]))
                            and abs(len_(text[i]) - len_(text[i + 1])) > n
                            and len_(text[i]) < n):
                if t != "": sentences.append(t)
                t = ""
                sentences.append(text[i])
            elif (
                    text[i][-1] not in ("。", ".", ".") and
                (abs(len_(text[i]) - len_(text[i + 1])) < n or
                 (len_(t + text[i]) > len_(text[i + 1]) and bool(
                     re.search("[。\..]\s\d|..[。\..]|.[。\..]", text[i + 1][-3:])
                     or bool(re.match("[A-Z]", text[i + 1][:1]))))
                 or bool(re.match("\s?[a-z,\)]", text[i + 1]))
                 or bra_cnt > ket_cnt or bra_cnt_jp > ket_cnt_jp)):
                t += text[i]
            else:
                sentences.append(t + text[i])
                t = ""
        else:
            sentences.append(t + text[i])
    return len(sentences), sentences


def TranslateFromClipboard(tool, write, filename, isPrint, html, title, inv):
    driver = webdriver.Chrome(executable_path=DRIVER_PATH,
                              chrome_options=options)
    url = 'https://www.deepl.com/ja/translator' if tool == "DeepL" else f'https://translate.google.co.jp/?hl=ja&tab=TT&authuser=0#view=home&op=translate&sl=auto&tl={"en" if inv else "ja"}'
    driver.get(url)
    if tool == "DeepL":
        textarea = driver.find_element_by_css_selector(
            '.lmt__textarea.lmt__source_textarea.lmt__textarea_base_style')
    elif tool == "GT":
        textarea = driver.find_element_by_id('source')
    length, en = textParser(ppc.paste())
    ja = []
    preText = ""
    for i, sentence in enumerate(en):
        if sentence == "":
            ja.append("")
            continue
        cbText = ppc.paste()
        ppc.copy(sentence)
        textarea.send_keys(Keys.CONTROL, "v")
        ppc.copy(cbText)
        transtext = ""
        cnt = 0
        while transtext in ("", preText):
            time.sleep(1)
            if tool == "DeepL":
                transtext = driver.find_element_by_css_selector(
                    '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style'
                ).get_property("value")
            elif tool == "GT":
                try:
                    transtext = driver.find_element_by_css_selector(
                        '.tlid-translation.translation').text
                except:
                    pass
            cnt += 1
            if cnt % 10 == 0: textarea.send_keys(".")
        if isPrint:
            print(sentence)
            print(transtext)
        print(f"\n{i+1}/{length}  {int(100*(i+1)/length)}% done\n")
        ja.append(transtext)
        textarea.send_keys(Keys.CONTROL, "a")
        textarea.send_keys(Keys.BACKSPACE)
        preText = transtext
    driver.quit()
    if write:
        with open(filename + ".txt", "w", encoding='UTF-8') as f:
            f.write("\n".join(ja))
    if html:
        eng = ""
        jpn = ""
        for i, ej in enumerate(zip(en, ja)):
            eng += f'<br><a id="e{i}" href="#j{i}" onmouseover="over(' + f"'j{i}'" + ')" onmouseout="out(' + f"'j{i}'" + f')">{ej[0]}</a><br>'
            jpn += f'<br><a id="j{i}" href="#e{i}" onmouseover="over(' + f"'e{i}'" + ')" onmouseout="out(' + f"'e{i}'" + f')">{ej[1]}</a><br>'
        with open(filename + ".html", "w", encoding='UTF-8') as f:
            f.write(
                f'<h1 align="center">{title}</h1>\n<input id="btn-mode" type="checkbox">\n<hr>\n<body>\n<div class="parent">\n<div id="en">\n{eng}\n</div>\n<div id="ja">\n{jpn}\n</div>\n</div>'
                +
                '<style>\n:root {\n--main-text: #452b15;\n--main-bg: #f8f1e2;\n--highlight-text: #db8e3c;\n}\n:root[theme="dark"] {\n--main-text: #b0b0b0;\n--main-bg: #121212;\n--highlight-text: #fd8787;\n}\nh1 {\ncolor: var(--main-text);\n}\ninput {\nposition: absolute;\ntop: 1%;\nright: 1%;\n}\n#en {\nwidth: 43%;\nheight: 90%;\npadding: 0 2%;\nfloat: left;\nborder-right:1px solid #ccc;\nmargin: 1%;\noverflow: auto;\n}\n#ja {\nwidth: 43%;\nheight: 90%;\npadding: 0 2%;\nfloat: right;\nmargin: 1%;\noverflow: auto;\n}\na,\na:hover,\na:visited,\na:link,\na:active {\ncolor: var(--main-text);\ntext-decoration: none;\n}\nbody {\nbackground-color: var(--main-bg);\n}\n</style>\n<script>\nvar a = document.getElementsByTagName("a");\nfunction over(e) {\ndocument.getElementById(e).style.color = getComputedStyle(document.getElementById(e)).getPropertyValue("--highlight-text");\n}\nfunction out(e) {\ndocument.getElementById(e).style.color = getComputedStyle(document.getElementById(e)).getPropertyValue("--main-text");\n}\nconst btn = document.querySelector("#btn-mode");\nbtn.addEventListener("change", () => {\nif (btn.checked == true) {\ndocument.documentElement.setAttribute("theme", "dark");\n} else {\ndocument.documentElement.setAttribute("theme", "light");\n}\nfor (var i = 0; i < a.length; i++) {\na[i].style.color = getComputedStyle(a[i]).getPropertyValue("--main-text");\n}\n});\n</script>\n</body>'
            )


if __name__ == "__main__":
    args = [
        "DeepL", False, "translated_text.txt", True, False,
        "ORIGINAL ↔ TRANSLATED", False
    ]
    if input("1. 英語 → 日本語    2. 日本語 → 英語   ") == "2": args[6] = True
    if input("1. DeepL 2.GoogleTranslate  ") == "2": args[0] = "GT"
    if input("翻訳結果を書き出しますか? y/n  ") == "y":
        case = input("1. txt 2. HTML 3. both    ")
        if case == "1":
            args[1] = True
            format_ = ".txt"
        elif case == "2":
            args[4] = True
            format_ = ".html"
        elif case == "3":
            args[1], args[4] = True, True
            format_ = ".txt/.html"
        filename = input(
            f"出力ファイルにつける名前を入力してください(デフォルトは'translated_text{format_}')  ")
        if filename:
            args[2] = filename.replace(" ", "_")
        if case == "2" or case == "3":
            title = input("(論文の)タイトルを入力してください   ")
            if title:
                args[5] = title
    if input("翻訳経過をここに表示しますか? y/n  ") == "n":
        args[3] = False
    input("準備ができたらEnterを押してください")
    TranslateFromClipboard(*args)

8/16追記

マルチスレッドで大量にChromeを開く力技高速化を施しました。
こちらは書き出しはHTMLのみとなっています。
またDeepLの場合、多く開きすぎると制限がかかって翻訳止まりますのでご注意ください。
(8/22追 対応する文章へのジャンプ機能を改良しました。)

マルチスレッド版コード
from selenium import webdriver
from selenium.webdriver import Chrome
import concurrent.futures
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import pyperclip as ppc
import re
import unicodedata
from math import ceil
from threading import Thread, Lock
DRIVER_PATH = 'chromedriver.exe'

options = Options()
options.add_argument('--disable-gpu')
options.add_argument('--disable-extensions')
options.add_argument('--proxy-server="direct://"')
options.add_argument('--proxy-bypass-list=*')
options.add_argument('--start-maximized')


def html_gen(title, filename):
    global tr
    source = ""
    result = ""
    tr = sorted(tr, key=lambda x: x["order"])
    for item in tr:
        i, s, r = item.values()
        source += f'<br><a id="s{i}" onclick="scroller(' + f"'s{i}', 'r{i}'" + ')" onmouseover="over(' + f"'r{i}'" + ')" onmouseout="out(' + f"'r{i}'" + f')">{s}</a><br>'
        result += f'<br><a id="r{i}" onclick="scroller(' + f"'r{i}', 's{i}'" + ')" onmouseover="over(' + f"'s{i}'" + ')" onmouseout="out(' + f"'s{i}'" + f')">{r}</a><br>'
    with open(filename + ".html", "w", encoding='UTF-8') as f:
        f.write(
            f'<h1 align="center">{title}</h1>\n<input id="btn-mode" type="checkbox">\n<hr>\n<body>\n<div class="parent">\n<div id="source">\n{source}\n</div>\n<div id="result">\n{result}\n</div>\n</div>'
            +
            '<style>\n:root {\n--main-text: #452b15;\n--main-bg: #f8f1e2;\n--highlight-text: #db8e3c;\n}\n:root[theme="dark"] {\n--main-text: #b0b0b0;\n--main-bg: #121212;\n--highlight-text: #fd8787;\n}\nh1 {\ncolor: var(--main-text);\n}\ninput {\nposition: absolute;\ntop: 1%;\nright: 1%;\n}\n#source {\nwidth: 43%;\nheight: 90%;\npadding: 0 2%;\nfloat: left;\nborder-right:1px solid #ccc;\nmargin: 1%;\noverflow: auto;\n}\n#result {\nwidth: 43%;\nheight: 90%;\npadding: 0 2%;\nfloat: right;\nmargin: 1%;\noverflow: auto;\n}\na,\na:hover,\na:visited,\na:link,\na:active {\ncolor: var(--main-text);\ntext-decoration: none;\n}\nbody {\nbackground-color: var(--main-bg);\n}\n</style>\n<script>\nvar a = document.getElementsByTagName("a");\nfunction over(e) {\ndocument.getElementById(e).style.color = getComputedStyle(document.getElementById(e)).getPropertyValue("--highlight-text");\n}\nfunction out(e) {\ndocument.getElementById(e).style.color = getComputedStyle(document.getElementById(e)).getPropertyValue("--main-text");\n}\nfunction scroller(s, o) {\nvar elements = document.getElementById(s);\nvar elemento = document.getElementById(o);\nvar rects = elements.getBoundingClientRect();\nvar recto = elemento.getBoundingClientRect();\nvar x = recto.left;\nvar y = recto.top - rects.top;\nelemento.parentNode.scrollBy(x, y);\n}\nconst btn = document.querySelector("#btn-mode");\nbtn.addEventListener("change", () => {\nif (btn.checked == true) {\ndocument.documentElement.setAttribute("theme", "dark");\n} else {\ndocument.documentElement.setAttribute("theme", "light");\n}\nfor (var i = 0; i < a.length; i++) {\na[i].style.color = getComputedStyle(a[i]).getPropertyValue("--main-text");\n}\n});\n</script>\n</body>'
        )


def len_(text):
    cnt = 0
    for t in text:
        if unicodedata.east_asian_width(t) in "FWA":
            cnt += 2
        else:
            cnt += 1
    return cnt


def textParser(text, n=30, bracketDetect=True):
    text = text.splitlines()
    sentences = []
    t = ""
    bra_cnt = ket_cnt = bra_cnt_jp = ket_cnt_jp = 0
    for i in range(len(text)):
        if not bool(re.search("\S", text[i])): continue
        if bracketDetect:
            bra_cnt += len(re.findall("[\((]", text[i]))
            ket_cnt += len(re.findall("[\))]", text[i]))
            bra_cnt_jp += len(re.findall("[「「『]", text[i]))
            ket_cnt_jp += len(re.findall("[」」』]", text[i]))
        if i != len(text) - 1:
            if bool(re.fullmatch(r"[A-Z\s]+", text[i])):
                if t != "": sentences.append(t)
                t = ""
                sentences.append(text[i])
            elif bool(
                    re.match(
                        "(\d{1,2}[\.,、.]\s?(\d{1,2}[\.,、.]*)*\s?|I{1,3}V{0,1}X{0,1}[\.,、.]|V{0,1}X{0,1}I{1,3}[\.,、.]|[・•●])+\s",
                        text[i])) or re.match("\d{1,2}.\w", text[i]) or (
                            bool(re.match("[A-Z]", text[i][0]))
                            and abs(len_(text[i]) - len_(text[i + 1])) > n
                            and len_(text[i]) < n):
                if t != "": sentences.append(t)
                t = ""
                sentences.append(text[i])
            elif (
                    text[i][-1] not in ("。", ".", ".") and
                (abs(len_(text[i]) - len_(text[i + 1])) < n or
                 (len_(t + text[i]) > len_(text[i + 1]) and bool(
                     re.search("[。\..]\s\d|..[。\..]|.[。\..]", text[i + 1][-3:])
                     or bool(re.match("[A-Z]", text[i + 1][:1]))))
                 or bool(re.match("\s?[a-z,\)]", text[i + 1]))
                 or bra_cnt > ket_cnt or bra_cnt_jp > ket_cnt_jp)):
                t += text[i]
            else:
                sentences.append(t + text[i])
                t = ""
        else:
            sentences.append(t + text[i])
    return len(sentences), sentences


def translate(driver, i, texts):
    global tr
    preText = ""
    if tool == "DeepL":
        textarea = driver.find_element_by_css_selector(
            '.lmt__textarea.lmt__source_textarea.lmt__textarea_base_style')
    elif tool == "GT":
        textarea = driver.find_element_by_id('source')
    for j, t in enumerate(texts):
        etr = {"order": i * unit + j, "source": t}
        lock.acquire()
        ppc.copy(t)
        textarea.send_keys(Keys.CONTROL, "v")
        lock.release()
        transtext = ""
        cnt = 0
        while transtext in ("", preText):
            time.sleep(1)
            if tool == "DeepL":
                transtext = driver.find_element_by_css_selector(
                    '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style'
                ).get_property("value")
            elif tool == "GT":
                try:
                    transtext = driver.find_element_by_css_selector(
                        '.tlid-translation.translation').text
                except:
                    pass
            cnt += 1
            if cnt % 10 == 0: textarea.send_keys(".")
        etr["result"] = transtext
        tr.append(etr)
        textarea.send_keys(Keys.CONTROL, "a")
        textarea.send_keys(Keys.BACKSPACE)

        preText = transtext
    if len(tr) == length: html_gen(title, filename)


def runDriver(order):
    driver = webdriver.Chrome(DRIVER_PATH)
    url = 'https://www.deepl.com/ja/translator' if tool == "DeepL" else f'https://translate.google.co.jp/?hl=ja&tab=TT&authuser=0#view=home&op=translate&sl=auto&tl={"en" if inv else "ja"}'
    driver.get(url)
    translate(driver, order,
              source[order * unit:min(len(source), (order + 1) * unit)])
    driver.close()


def multiThreadTranslate(n):
    global tr, threads, source, length, unit, lock
    length, source = textParser(ppc.paste())
    unit = ceil(length / n)
    lock = Lock()
    clipboard = ppc.paste()
    for t in range(n):
        thread = Thread(target=runDriver, args=(t, ))
        thread.setDaemon(True)
        thread.start()
        threads.append(thread)

    for thread in threads:
        thread.join()

    ppc.copy(clipboard)


if __name__ == "__main__":
    tr = []
    threads = []
    n, inv, tool, filename, title = 10, False, "DeepL", "translated_text", "ORIGINAL ↔ TRANSLATED"
    n_ = input("いくつのChromeで並行翻訳しますか?    ")
    if n_.isdigit(): n = int(n_)
    if input("1. 英語 → 日本語    2. 日本語 → 英語   ") == "2": inv = True
    if input("1. DeepL 2.GoogleTranslate  ") == "2": tool = "GT"
    filename_ = input("出力ファイルにつける名前を入力してください(デフォルトは'translated_text.html')  ")
    if filename_:
        filename = filename_.replace(" ", "_")
    title_ = input("(論文の)タイトルを入力してください   ")
    if title_:
        title = title_
    input("準備ができたらEnterを押してください")
    multiThreadTranslate(n)

マウスオーバーで対応する文章へと高さを揃えるver.
from selenium import webdriver
from selenium.webdriver import Chrome
import concurrent.futures
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import pyperclip as ppc
import re
import unicodedata
from math import ceil
from threading import Thread, Lock
DRIVER_PATH = 'chromedriver.exe'

options = Options()
options.add_argument('--disable-gpu')
options.add_argument('--disable-extensions')
options.add_argument('--proxy-server="direct://"')
options.add_argument('--proxy-bypass-list=*')
options.add_argument('--start-maximized')


def html_gen(title, filename):
    global tr
    source = ""
    result = ""
    tr = sorted(tr, key=lambda x: x["order"])
    for item in tr:
        i, s, r = item.values()
        source += f'<br><a id="s{i}" onmouseover="over(' + f"'s{i}', 'r{i}'" + ')" onmouseout="out(' + f"'r{i}'" + f')">{s}</a><br>'
        result += f'<br><a id="r{i}" onmouseover="over(' + f"'r{i}', 's{i}'" + ')" onmouseout="out(' + f"'s{i}'" + f')">{r}</a><br>'
    with open(filename + ".html", "w", encoding='UTF-8') as f:
        f.write(
            f'<h1 align="center">{title}</h1>\n<input id="btn-mode" type="checkbox">\n<hr>\n<body>\n<div class="parent">\n<div id="source">\n{source}\n</div>\n<div id="result">\n{result}\n</div>\n</div>'
            +
            '<style>\n:root {\n--main-text: #452b15;\n--main-bg: #f8f1e2;\n--highlight-text: #db8e3c;\n}\n:root[theme="dark"] {\n--main-text: #b0b0b0;\n--main-bg: #121212;\n--highlight-text: #fd8787;\n}\nh1 {\ncolor: var(--main-text);\n}\ninput {\nposition: absolute;\ntop: 1%;\nright: 1%;\n}\n#source {\nwidth: 43%;\nheight: 90%;\npadding: 0 2%;\nfloat: left;\nborder-right:1px solid #ccc;\nmargin: 1%;\noverflow: auto;\n}\n#result {\nwidth: 43%;\nheight: 90%;\npadding: 0 2%;\nfloat: right;\nmargin: 1%;\noverflow: auto;\n}\na,\na:hover,\na:visited,\na:link,\na:active {\ncolor: var(--main-text);\ntext-decoration: none;\n}\nbody {\nbackground-color: var(--main-bg);\n}\n</style>\n<script>\nvar a = document.getElementsByTagName("a");\nfunction over(s,o) {\nvar elements = document.getElementById(s);\nvar elemento = document.getElementById(o);\nvar rects = elements.getBoundingClientRect();\nvar recto = elemento.getBoundingClientRect();\nvar x = recto.left;\nvar y = recto.top - rects.top;\nelemento.parentNode.scrollBy(x, y);\nelemento.style.color = getComputedStyle(elemento).getPropertyValue("--highlight-text");\n}\nfunction out(e) {\ndocument.getElementById(e).style.color = getComputedStyle(document.getElementById(e)).getPropertyValue("--main-text");\n}\nconst btn = document.querySelector("#btn-mode");\nbtn.addEventListener("change", () => {\nif (btn.checked == true) {\ndocument.documentElement.setAttribute("theme", "dark");\n} else {\ndocument.documentElement.setAttribute("theme", "light");\n}\nfor (var i = 0; i < a.length; i++) {\na[i].style.color = getComputedStyle(a[i]).getPropertyValue("--main-text");\n}\n});\n</script>\n</body>'
        )


def len_(text):
    cnt = 0
    for t in text:
        if unicodedata.east_asian_width(t) in "FWA":
            cnt += 2
        else:
            cnt += 1
    return cnt


def textParser(text, n=30, bracketDetect=True):
    text = text.splitlines()
    sentences = []
    t = ""
    bra_cnt = ket_cnt = bra_cnt_jp = ket_cnt_jp = 0
    for i in range(len(text)):
        if not bool(re.search("\S", text[i])): continue
        if bracketDetect:
            bra_cnt += len(re.findall("[\((]", text[i]))
            ket_cnt += len(re.findall("[\))]", text[i]))
            bra_cnt_jp += len(re.findall("[「「『]", text[i]))
            ket_cnt_jp += len(re.findall("[」」』]", text[i]))
        if i != len(text) - 1:
            if bool(re.fullmatch(r"[A-Z\s]+", text[i])):
                if t != "": sentences.append(t)
                t = ""
                sentences.append(text[i])
            elif bool(
                    re.match(
                        "(\d{1,2}[\.,、.]\s?(\d{1,2}[\.,、.]*)*\s?|I{1,3}V{0,1}X{0,1}[\.,、.]|V{0,1}X{0,1}I{1,3}[\.,、.]|[・•●])+\s",
                        text[i])) or re.match("\d{1,2}.\w", text[i]) or (
                            bool(re.match("[A-Z]", text[i][0]))
                            and abs(len_(text[i]) - len_(text[i + 1])) > n
                            and len_(text[i]) < n):
                if t != "": sentences.append(t)
                t = ""
                sentences.append(text[i])
            elif (
                    text[i][-1] not in ("。", ".", ".") and
                (abs(len_(text[i]) - len_(text[i + 1])) < n or
                 (len_(t + text[i]) > len_(text[i + 1]) and bool(
                     re.search("[。\..]\s\d|..[。\..]|.[。\..]", text[i + 1][-3:])
                     or bool(re.match("[A-Z]", text[i + 1][:1]))))
                 or bool(re.match("\s?[a-z,\)]", text[i + 1]))
                 or bra_cnt > ket_cnt or bra_cnt_jp > ket_cnt_jp)):
                t += text[i]
            else:
                sentences.append(t + text[i])
                t = ""
        else:
            sentences.append(t + text[i])
    return len(sentences), sentences


def translate(driver, i, texts):
    global tr
    preText = ""
    if tool == "DeepL":
        textarea = driver.find_element_by_css_selector(
            '.lmt__textarea.lmt__source_textarea.lmt__textarea_base_style')
    elif tool == "GT":
        textarea = driver.find_element_by_id('source')
    for j, t in enumerate(texts):
        etr = {"order": i * unit + j, "source": t}
        lock.acquire()
        ppc.copy(t)
        textarea.send_keys(Keys.CONTROL, "v")
        lock.release()
        transtext = ""
        cnt = 0
        while transtext in ("", preText):
            time.sleep(1)
            if tool == "DeepL":
                transtext = driver.find_element_by_css_selector(
                    '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style'
                ).get_property("value")
            elif tool == "GT":
                try:
                    transtext = driver.find_element_by_css_selector(
                        '.tlid-translation.translation').text
                except:
                    pass
            cnt += 1
            if cnt % 10 == 0: textarea.send_keys(".")
        etr["result"] = transtext
        tr.append(etr)
        textarea.send_keys(Keys.CONTROL, "a")
        textarea.send_keys(Keys.BACKSPACE)

        preText = transtext
    if len(tr) == length: html_gen(title, filename)


def runDriver(order):
    driver = webdriver.Chrome(DRIVER_PATH)
    url = 'https://www.deepl.com/ja/translator' if tool == "DeepL" else f'https://translate.google.co.jp/?hl=ja&tab=TT&authuser=0#view=home&op=translate&sl=auto&tl={"en" if inv else "ja"}'
    driver.get(url)
    translate(driver, order,
              source[order * unit:min(len(source), (order + 1) * unit)])
    driver.close()


def multiThreadTranslate(n):
    global tr, threads, source, length, unit, lock
    length, source = textParser(ppc.paste())
    unit = ceil(length / n)
    lock = Lock()
    clipboard = ppc.paste()
    for t in range(n):
        thread = Thread(target=runDriver, args=(t, ))
        thread.setDaemon(True)
        thread.start()
        threads.append(thread)

    for thread in threads:
        thread.join()

    ppc.copy(clipboard)


if __name__ == "__main__":
    tr = []
    threads = []
    n, inv, tool, filename, title = 10, False, "DeepL", "translated_text", "ORIGINAL ↔ TRANSLATED"
    n_ = input("いくつのChromeで並行翻訳しますか?    ")
    if n_.isdigit(): n = int(n_)
    if input("1. 英語 → 日本語    2. 日本語 → 英語   ") == "2": inv = True
    if input("1. DeepL 2.GoogleTranslate  ") == "2": tool = "GT"
    filename_ = input("出力ファイルにつける名前を入力してください(デフォルトは'translated_text.html')  ")
    if filename_:
        filename = filename_.replace(" ", "_")
    title_ = input("(論文の)タイトルを入力してください   ")
    if title_:
        title = title_
    input("準備ができたらEnterを押してください")
    multiThreadTranslate(n)

まとめ

HTMLやCSSはズブの素人なのでこうしたらもっと良くなる!という点があればご教示いただけると嬉しいです。

23
24
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
23
24