OCRでスクリーンショット画像から数値を抽出

Last updated at 2025-01-17 / Posted at 2025-01-17

概要

Pythonでスクリーンショットの画像を加工し、OCRによって文字認識をした。
また、処理が動いていることがわかるようにプログレスバーで表現した。

はじめに

業務でスクリーンショットした画像データの数値をExcelに転記する作業が多々ある
頻度が高く、入力する桁が多いため、「面倒だなぁぁ」と感じていた
この作業をPythonで効率化できないか挑戦してみた

OCR

調べていく中でOCRという技術があることを知った。
OCR（Optical Character Recognition）とは、日本語にすると光学的文字認識で、
画像データからテキスト部分を認識し、文字データに変換する技術のこと。
例えば、スキャナーで読み込んだ書類などの文字情報や写真に写っている文字を認識して、テキストデータとして出力することができる。
Pythonでは以下の表にあるライブラリがよく使われているらしい。

とりあえず、簡単そうなTesseract OCRとEasyOCRを試してみることにした

お試し

PYOCRは以下のリンクを参考にした

EasyOCRはこれだけ

pip install easyocr

その他ライブラリのインストール

pip install opencv-python
pip install pillow

OCRでどんな結果が得られるか試してみた

sample_EasyOCR_PYOCR.py

import os
import sys
import cv2
import easyocr
import pyocr
from PIL import Image


def OCR_engine():
    '''Return the first OCR tool that pyocr reports as available.

    The Tesseract install directory is appended to the ``PATH`` environment
    variable first, unless it already appears in it. When pyocr finds no
    tool at all, an error message is printed and the process exits with
    status 1.
    '''
    # Tesseract install directory (user name masked; edit for your machine)
    tesseract_dir = r'C:\Users\****\AppData\Local\Programs\Tesseract-OCR'
    search_path = os.environ['PATH']
    if tesseract_dir not in search_path:
        os.environ['PATH'] = search_path + os.pathsep + tesseract_dir

    # Ask pyocr which OCR back ends it can use
    available = pyocr.get_available_tools()
    if not available:
        print("Error: No OCR tool found.")
        sys.exit(1)
    return available[0]

def read_image(image):
    '''Load an image file and return it as grayscale in two forms.

    Args:
        image: Path of the image file, passed to ``cv2.imread``.

    Returns:
        A tuple ``(imgCV, imgPIL)``: the grayscale array produced by
        ``cv2.cvtColor`` and a PIL image built from that array.

    Raises:
        OSError: If ``cv2.imread`` could not load the file.

    Side effects:
        Displays the grayscale image via ``Image.show``.
    '''
    img = cv2.imread(image, cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imread reports failure by returning None rather than raising;
        # fail here with a clear message instead of inside cvtColor.
        raise OSError(f"Could not read image file: {image!r}")
    # Convert to grayscale
    imgCV = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    imgPIL = Image.fromarray(imgCV)
    imgPIL.show()
    return imgCV, imgPIL

# Image to run both OCR engines on
file_path = r"sample_Image.JPG"
image = file_path
# Load once; both engines consume the grayscale result
imgCV, imgPIL = read_image(image)

# --- Tesseract via pyocr ---
tool = OCR_engine()
# Prefer English; otherwise fall back to the first language reported
langs = tool.get_available_languages()
if "eng" in langs:
    lang = "eng"
else:
    lang = langs[0]
# Run text extraction on the PIL image
PYOCR_result = tool.image_to_string(imgPIL, lang=lang, builder=pyocr.builders.TextBuilder())
print(f"Tesseract OCR 結果\n{PYOCR_result}")

# --- EasyOCR ---
reader = easyocr.Reader(['en'], gpu=True)
# Run text extraction on the OpenCV grayscale array
EasyOCR_result = reader.readtext(imgCV, detail=0)
print(f"Easy OCR 結果\n{EasyOCR_result}")

まずは、スクリーンショットの画像を自分でトリミングしてOCRを実行
読み込んだ画像がこれ。

Tesseract OCR 結果
ee 54.706 A       
Nace 54,704 A     
Uni 122.556 V     
Wi) 261.409 A 

Easy OCR 結果
['54.704', '54.704', '122.556', '261.409']

どちらも数値は正しく読めていそう。正規表現とか使えば、ほしい結果は得られそうだ！！

別の画像でも試してみた

Tesseract OCR 結果

Easy OCR 結果
['96% F8416km', '20', '80', "'00", '40', 'SE', '60', '120', 'READY', 'e-Pedal OFF', '40', '140', 'D', '20', '160', 'PoweR %', 'kmh', 'SIANDARD', '100', '180', "24'c", '1269km', 'OFF', 'AutO']

あれ？Tesseract OCRは空文字になってしまった。
optionとか使えばうまく読めるのだろうけど、それはまた別の機会に
EasyOCRはデフォルトで高精度で出力してくれるし、リストで返してくれるので扱いやすそう
今回は、簡単に使えるEasyOCRで実装することにした

実装

main.pyを実行するとスレッド処理でEasyOCR.pyが実行されるようにした。
特定のフォルダを監視して、スクリーンショットが保存されたら、OCRが実行される。
フォルダ監視→画像を読込→加工→OCR→正規表現→必要な値抽出→Excel転記→フォルダ監視
ユーザにはプログレスバーで処理が行われていることを視覚的にわかるようにして、処理をやめる場合は、×ボタンを押せば、すべての処理が終了する仕組みにした。

実装にあたり、新たにインストールしたライブラリ

pip install pywin32

main.py

import threading
import EasyOCR
import tkinter as tk
from tkinter import messagebox
from tkinter import ttk


def GUI(stop_event):
    '''Show the small progress window and block until it is closed.

    Args:
        stop_event: Object whose ``set()`` method is called once the user
            confirms that processing should end.
    '''
    window = tk.Tk()

    def confirm_and_close():
        '''Ask for confirmation; on "yes" raise the stop flag and tear down the window.'''
        if not messagebox.askyesno("確認", "OCR処理を終了しますか?"):
            return
        stop_event.set()
        window.quit()
        window.destroy()

    window.geometry("350x70")
    window.title("OCR")
    window.resizable(False, False)  # fixed-size window

    # Caption above the progress bar
    caption_text = tk.StringVar(value="ScreenShot folder monitoring")
    caption = ttk.Label(window, textvariable=caption_text)
    caption.place(x=10, y=10)

    # Indeterminate progress bar, only there to show the program is alive
    bar = ttk.Progressbar(window, maximum=100, mode="indeterminate", length=330)
    bar.pack(side="bottom", pady=10)
    bar.start(interval=100)

    # Route the window's close button through the confirmation dialog
    window.protocol("WM_DELETE_WINDOW", confirm_and_close)
    window.mainloop()

if __name__ == "__main__":
    # Flag shared between the GUI and the OCR worker thread
    stop_event = threading.Event()

    # Run the OCR loop in a background thread so the GUI stays responsive
    thread_EasyOCR = threading.Thread(target=EasyOCR.run_process, args=(stop_event,))
    thread_EasyOCR.start()

    try:
        # Blocks until the window is closed
        GUI(stop_event)
    finally:
        # Always signal and wait for the worker, even if the GUI raised;
        # otherwise the non-daemon thread would keep the process alive.
        stop_event.set()
        thread_EasyOCR.join()

EasyOCR.py

import os
from datetime import datetime
import re
import cv2
import pythoncom
import win32com.client as win32
import easyocr
import time

def crop_image(image):
    '''Load a screenshot and crop it to the sub-region used for OCR.

    The crop is done in two steps:
      1. keep the top half of the rows and the right half of the columns;
      2. from that result, keep the rows from 10 px above its vertical
         midpoint down to the bottom, and the left half of the columns.

    Args:
        image: Path of the image file, passed to ``cv2.imread``.

    Returns:
        The cropped colour image array.

    Raises:
        OSError: If ``cv2.imread`` could not load the file.
    '''
    img = cv2.imread(image, cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imread reports failure by returning None rather than raising
        raise OSError(f"Could not read image file: {image!r}")
    # Step 1: top half, right half
    height, width, _ = img.shape
    img = img[0:height // 2, width // 2:width]
    # Step 2: lower part (with a 10 px margin above the midpoint), left half.
    # Clamp at 0 so a very small image cannot produce a negative start
    # index, which would silently wrap around to the end of the array.
    height, width, _ = img.shape
    row_start = max(height // 2 - 10, 0)
    img = img[row_start:height, 0:width // 2]
    return img

def correct_item(item):
    '''Map an OCR token to its canonical label when it matches a known pattern.

    The patterns are tried in order, case-insensitively; the first match
    decides the label. Tokens matching no pattern are returned unchanged.

    Args:
        item: One text token produced by OCR.

    Returns:
        ``'F/A'``, ``'Ave.'`` or ``'R/A'`` for a matching token, otherwise
        ``item`` itself.
    '''
    rules = (
        (r'^F.*A$', 'F/A'),    # starts with F, ends with A
        (r'^Av.*$', 'Ave.'),   # starts with Av
        (r'^R.*A$', 'R/A'),    # starts with R, ends with A
    )
    return next(
        (label for pattern, label in rules
         if re.match(pattern, item, flags=re.IGNORECASE)),
        item,
    )

def remove_whitespace(lst):
    '''Return a new list with every whitespace run removed from each string.

    Args:
        lst: Iterable of strings.

    Returns:
        A list of the same length, each element stripped of all whitespace
        (leading, trailing and internal).
    '''
    whitespace_run = re.compile(r'\s+')
    return [whitespace_run.sub('', text) for text in lst]

def extract_distance_value(lst):
    '''Return the numeric token that directly follows an ``'Ave.'`` token.

    Each ``'Ave.'`` entry is examined in order; the first one whose next
    element is a plain non-negative decimal number wins.

    Args:
        lst: List of OCR tokens (strings).

    Returns:
        The matching token as a string, or ``None`` when no ``'Ave.'`` entry
        is followed by a number.
    '''
    plain_number = re.compile(r'^\d+(\.\d+)?$')
    # Walk over adjacent pairs: (token, the token right after it)
    for label, candidate in zip(lst, lst[1:]):
        if label == 'Ave.' and plain_number.match(candidate):
            return candidate
    return None

def extract_wh_strings(lst):
    '''Return the elements that contain "wh" in any letter case.

    Args:
        lst: Iterable of strings.

    Returns:
        A new list with the matching elements, in their original order.
    '''
    contains_wh = re.compile(r'wh', flags=re.IGNORECASE).search
    return list(filter(contains_wh, lst))

def extract_numbers(lst):
    '''Convert the first number found in each string to a float.

    A number is a run of digits, optionally followed by a dot and more
    digits. Strings without any digit are skipped; a sign in front of the
    digits is not part of the match.

    Args:
        lst: Iterable of strings.

    Returns:
        A list of floats, one per string that contained a number.
    '''
    first_number = re.compile(r'\d+\.?\d*')
    values = []
    for text in lst:
        found = first_number.search(text)
        if found:
            values.append(float(found.group()))
    return values

def get_distance_value(img,reader):
    '''Run OCR on the distance area of the cropped image and return the value.

    Args:
        img: Colour image array (as returned by ``crop_image``).
        reader: OCR reader exposing ``readtext(image, detail=0)``.

    Returns:
        Whatever ``extract_distance_value`` yields for the corrected OCR
        tokens: the numeric token following ``'Ave.'``, or ``None``.

    Side effects:
        Shows the grayscale region in an OpenCV window titled
        "distance Image".
    '''
    rows, cols, _ = img.shape
    # Fixed pixel offsets bound the region that holds the distance readout
    row_end = rows // 3 - 50
    col_start = cols // 4 + 20
    col_end = cols - 220
    region = cv2.cvtColor(img[0:row_end, col_start:col_end], cv2.COLOR_BGR2GRAY)
    # Show what is being fed to OCR
    cv2.imshow("distance Image", region)
    cv2.waitKey(1)
    # OCR, then normalise label tokens before looking for the value
    texts = reader.readtext(region, detail=0)
    normalised = list(map(correct_item, texts))
    return extract_distance_value(normalised)

def _energy_text_to_kwh(text):
    '''Convert one OCR token such as "123Wh" or "1.5kWh" to a float in kWh.

    The first number in the token is used. Tokens containing "kWh" (in any
    letter case) are taken as-is; every other token is treated as Wh and
    multiplied by 10**-3. Returns ``None`` when the token holds no number.
    '''
    number = re.search(r'\d+\.?\d*', text)
    if number is None:
        return None
    value = float(number.group())
    if re.search(r'kwh', text, flags=re.IGNORECASE):
        return value
    return value * 10**-3


def get_energy_value(img,reader):
    '''Run OCR on the energy area of the cropped image and return kWh values.

    Args:
        img: Colour image array (as returned by ``crop_image``).
        reader: OCR reader exposing ``readtext(image, detail=0)``.

    Returns:
        A list of floats in OCR order, one per token that contains "wh"
        (any letter case) and a number. Wh readings are scaled by 10**-3.

    Side effects:
        Shows the grayscale region in an OpenCV window.
    '''
    height, width, _ = img.shape
    # Energy readouts: lower two thirds of the rows, left quarter of the columns
    power_img = img[height // 3:height, 0:width // 4]
    power_img = cv2.cvtColor(power_img, cv2.COLOR_BGR2GRAY)
    # NOTE(review): the window title matches the one in get_distance_value,
    # so both previews share a single window - confirm this is intended.
    cv2.imshow("distance Image",power_img)
    cv2.waitKey(1)
    OCR_result = reader.readtext(power_img, detail=0)
    # Drop whitespace inside tokens, then keep only the energy readings
    wh_strings = extract_wh_strings(remove_whitespace(OCR_result))
    # Parse value and unit directly. Going through str(float) would turn
    # small values into scientific notation ('5e-05') and corrupt them, and
    # float() on a token with stray characters would raise.
    converted = (_energy_text_to_kwh(item) for item in wh_strings)
    return [value for value in converted if value is not None]

def set_excel():
    '''Initialise COM and return a configured Excel application object.

    Returns:
        The object obtained from ``win32.Dispatch("Excel.Application")``,
        made visible and with alert dialogs turned off.
    '''
    pythoncom.CoInitialize()  # initialise the COM library
    app = win32.Dispatch("Excel.Application")
    app.DisplayAlerts = False   # suppress alert dialogs
    app.Visible = True          # show the Excel window
    return app

def open_excel():
    '''Create a timestamped copy of the base workbook and return its path.

    The base workbook is opened in Excel and saved under
    ``<result folder>/<YYYY-mm-dd-HH-MM-SS>_result.xlsx``. The result folder
    is created when it does not exist yet.

    Returns:
        The path of the newly saved workbook.
    '''
    base_exl_path = r"Base_result.xlsx"  # base workbook; adjust as needed
    result_folder_path = r"C:\result"  # folder for results; adjust as needed
    date_now = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    output_file_path = os.path.join(result_folder_path, f'{date_now}_result.xlsx')
    # SaveAs fails when the target folder is missing
    os.makedirs(result_folder_path, exist_ok=True)
    excel = set_excel()
    # Excel runs as a separate process and does not share this script's
    # working directory, so hand it an absolute path.
    wb = excel.Workbooks.Open(os.path.abspath(base_exl_path))
    # Save under the new name
    wb.SaveAs(output_file_path)
    return output_file_path

def write_to_excel(result, output_file_path):
    '''Write one result record into the next free column of the workbook.

    Starting at column 5, the first column whose row-3 cell is empty is
    selected; the record's fields are written to rows 3-8 of that column
    and the workbook is saved.

    Args:
        result: Mapping with the keys ``file_path``, ``distance``,
            ``HVBatt_energy_minus``, ``HVBatt_energy_plus``,
            ``LowVolt_energy`` and ``HVBatt_energy``.
        output_file_path: Path of the workbook to write to.
    '''
    app = set_excel()
    book = app.Workbooks.Open(output_file_path)
    sheet = book.Worksheets(1)
    # Advance until row 3 of the column is empty
    column = 5
    while sheet.Cells(3, column).Value is not None:
        column += 1
    # Fields in the order they occupy rows 3, 4, ... of the chosen column
    row_keys = (
        'file_path',
        'distance',
        'HVBatt_energy_minus',
        'HVBatt_energy_plus',
        'LowVolt_energy',
        'HVBatt_energy',
    )
    for row, key in enumerate(row_keys, start=3):
        sheet.Cells(row, column).Value = result[key]
    book.Save()

def close_excel(output_file_path):
    '''Close the workbook at ``output_file_path``.

    The workbook is obtained through ``Workbooks.Open`` and then closed
    with ``Close(True)`` (presumably "save changes" - confirm against the
    Excel object model).

    Args:
        output_file_path: Path of the workbook to close.
    '''
    set_excel().Workbooks.Open(output_file_path).Close(True)

def process_file(file_path,reader):
    """Run the OCR pipeline on one screenshot and return the extracted values.

    Args:
        file_path: Path of the screenshot image.
        reader: OCR reader passed on to the extraction helpers.

    Returns:
        A dict with the keys ``file_path``, ``distance``,
        ``HVBatt_energy_minus``, ``HVBatt_energy_plus``, ``LowVolt_energy``
        and ``HVBatt_energy``. Of the energy values found,
        ``HVBatt_energy_minus`` is the largest and ``HVBatt_energy`` the
        second largest; ``HVBatt_energy_plus`` is the largest of the rest
        and ``LowVolt_energy`` the last of the rest in OCR order. Any value
        that OCR did not yield is ``None``.
    """
    print(f"Processing file: {file_path}")
    # Crop to the region of interest, then read both areas
    img = crop_image(file_path)
    distance_value = get_distance_value(img,reader)
    remaining = list(get_energy_value(img,reader))

    def take_largest():
        """Remove and return the largest remaining value, or None if empty."""
        if not remaining:
            return None
        largest = max(remaining)
        remaining.remove(largest)
        return largest

    # Pull off the two largest readings; a missing reading becomes None
    # instead of raising from list.remove(None).
    HVBatt_energy_minus_value = take_largest()
    HVBatt_energy_value = take_largest()
    HVBatt_energy_plus_value = max(remaining) if remaining else None
    LowVolt_energy_value = remaining[-1] if remaining else None

    result = {
        "file_path": file_path,
        # float(None) would raise when no distance was recognised
        "distance": float(distance_value) if distance_value is not None else None,
        "HVBatt_energy_minus": HVBatt_energy_minus_value,
        "HVBatt_energy_plus": HVBatt_energy_plus_value,
        "LowVolt_energy": LowVolt_energy_value,
        "HVBatt_energy": HVBatt_energy_value,
    }
    print(result)
    return result

def get_latest_file(folder_path, processed_files):
    """Return the most recently modified unprocessed ``.jpg`` file in a folder.

    Args:
        folder_path: Directory to scan (not recursive).
        processed_files: Collection of full paths to ignore.

    Returns:
        The full path of the newest remaining file whose name ends with
        ``.jpg``, or ``None`` when there is none.
    """
    candidates = []
    for name in os.listdir(folder_path):
        if not name.endswith('.jpg'):
            continue
        full_path = os.path.join(folder_path, name)
        if full_path not in processed_files:
            candidates.append(full_path)
    if not candidates:
        return None
    # Newest by modification time
    return max(candidates, key=os.path.getmtime)

def main(folder_path, stop_event):
    """Watch a folder and run OCR on each new ``.jpg`` until asked to stop.

    Files already present at start-up are ignored. Each newly found file is
    processed once and its result written to the result workbook. A file
    whose processing fails is reported and skipped so that monitoring
    continues.

    Args:
        folder_path: Directory to monitor for screenshots.
        stop_event: Event object; the loop ends once it is set.
    """
    # Everything already in the folder counts as processed
    processed_files = {
        os.path.join(folder_path, f)
        for f in os.listdir(folder_path)
        if f.endswith('.jpg')
    }
    # Create the result workbook
    output_file_path = open_excel()
    try:
        # Create the OCR reader once and reuse it for every file
        reader = easyocr.Reader(['en'], gpu=True)
        while not stop_event.is_set():
            print("Monitoring folder")
            latest_file = get_latest_file(folder_path, processed_files)
            if latest_file:
                try:
                    result = process_file(latest_file, reader)
                    write_to_excel(result, output_file_path)
                except Exception as exc:
                    # Worker-thread boundary: report and keep monitoring
                    # rather than letting one bad image end the loop.
                    print(f"Failed to process {latest_file}: {exc!r}")
                finally:
                    # Mark as handled either way so a failing file is not
                    # retried on every poll.
                    processed_files.add(latest_file)
            # Poll interval; unlike time.sleep this returns as soon as the
            # stop event is set.
            stop_event.wait(0.5)
    finally:
        # Close the workbook even when the loop ends with an exception
        close_excel(output_file_path)
    print("Stop event detected. Closing Excel.")

# if __name__ == "__main__":
def run_process(stop_event, folder_path=r"C:\screenShot"):
    '''Thread entry point: monitor a screenshot folder until ``stop_event`` is set.

    Args:
        stop_event: Event object forwarded to ``main``.
        folder_path: Folder to monitor. Defaults to ``C:\\screenShot``;
            pass another path to watch a different folder.
    '''
    main(folder_path, stop_event)

終わり

OCRでスクリーンショット画像から数値を抽出することができた！
少しは業務が楽になるはず！
この技術を応用させれば、レシートを写真で撮って家計簿に転記するAppを作れたり、
OpenCVのmatchTemplate関数と組み合わせれば、動画や画像の文字認識&図形認識ができそうだ！！

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up