More than 1 year has passed since last update.

Azure Form Recognizer layout API ver3.0をlocalのpythonから叩いてみた

Last updated at 2023-04-27 / Posted at 2023-04-27

全体概要

表題はこちらのサービスになります。
現状、様々なベンダーから様々なOCRサービスが提供されています。その多くは文字画像をどれだけ正しく読めるかに重点を置いていますが、このAPIが画期的なのは、表の構造やヘッダーまで認識して読み取ってくれる点です。
2023/04/27時点でのpublicなAPIだとAzure Form Recognizerが最強なのではというのが個人的感想です。

やったこと

現在API仕様書がきちんと公開されているのがver2です。(以下stable版)
しかしGUIサイトで動いているのはver3です。(以下preview版)
こちらのサイトではver3のAPI仕様の説明がありますが、ググってもpreview版の実装例があまりありませんでした。
ないなら書こうということで、以下がpythonでの実装例です。

preview版のリクエスト

formrecognizerver3.py

import cv2
import numpy as np
import requests
import json
import time
import numpy as np
import glob
import os

# Subscription key sent in the 'Ocp-Apim-Subscription-Key' request header.
# The value here is a placeholder; replace it with the key of your own resource.
SUBSCRIPTIONKEY = "hogehogehogehogehogehogehogehoge"
# Base URL of the Form Recognizer resource (placeholder). Request paths are
# appended to it; note that it already ends with "/".
ENDPOINT = "https://hogehogehoge.cognitiveservices.azure.com/"

def analyze_layout(ipath):
    """Submit an image to the prebuilt-layout model and return the polling URL.

    The image at ``ipath`` is loaded with OpenCV, re-encoded as PNG and POSTed
    to the ``prebuilt-layout:analyze`` operation (api-version 2022-08-31).

    Args:
        ipath: Path of the image file to analyze.

    Returns:
        The value of the ``Operation-Location`` response header, i.e. the URL
        that must be polled to obtain the analysis result.

    Raises:
        ValueError: If OpenCV cannot read ``ipath`` as an image.
        RuntimeError: If the image cannot be PNG-encoded, the HTTP request
            fails, or the service does not answer with HTTP 202.
    """
    # ENDPOINT may end with "/"; strip it so the URL has no "//formrecognizer".
    post_url = (
        ENDPOINT.rstrip("/")
        + "/formrecognizer/documentModels/prebuilt-layout:analyze?api-version=2022-08-31"
    )
    params = {"includeTextDetails": True}
    headers = {
        'Content-Type': 'image/png',
        'Ocp-Apim-Subscription-Key': SUBSCRIPTIONKEY,
    }

    # cv2.imread signals failure by returning None instead of raising.
    img = cv2.imread(ipath, 1)
    if img is None:
        raise ValueError("Could not read image: %s" % ipath)

    encoded, num_bytes = cv2.imencode('.png', img)
    if not encoded:
        raise RuntimeError("Could not encode image as PNG: %s" % ipath)
    img_bytes = num_bytes.tobytes()

    try:
        table_analysis = requests.post(
            url=post_url, data=img_bytes, headers=headers, params=params, timeout=60
        )
    except requests.RequestException as e:
        raise RuntimeError("POST analyze failed:\n%s" % str(e)) from e

    if table_analysis.status_code != 202:
        # Use the raw body: an error response is not guaranteed to be JSON.
        raise RuntimeError("POST analyze failed:\n%s" % table_analysis.text)
    return table_analysis.headers["Operation-Location"]

def get_analyze_layout_result(resulturl):
    """Poll the analyze operation until it finishes and return its tables.

    The result URL is requested up to 10 times. The wait between attempts
    starts at 1 second and doubles after every attempt, capped at 60 seconds.
    Transport errors, non-JSON bodies and non-200 responses are reported and
    retried; every attempt counts against the retry budget.

    Args:
        resulturl: The ``Operation-Location`` URL returned by the POST request.

    Returns:
        The list stored under ``analyzeResult.tables`` of the response, or an
        empty list when the succeeded response contains no such key.

    Raises:
        RuntimeError: If the service reports the status ``failed``.
        TimeoutError: If no ``succeeded`` status is seen within the retry
            budget.
    """
    n_tries = 10
    wait_sec = 1
    max_wait_sec = 60
    headers = {"Ocp-Apim-Subscription-Key": SUBSCRIPTIONKEY}

    for n_try in range(n_tries):
        try:
            resp = requests.get(url=resulturl, headers=headers, timeout=30)
            resp_json = resp.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a response body that is not valid JSON.
            print("GET analyze results failed:\n%s" % str(e))
        else:
            if resp.status_code != 200:
                print("GET analyze results failed:\n%s" % json.dumps(resp_json))
            else:
                status = resp_json.get("status")
                print("status:", status)
                if status == "succeeded":
                    return resp_json.get('analyzeResult', {}).get('tables', [])
                if status == "failed":
                    # A failed operation never recovers, so stop polling.
                    raise RuntimeError("Analysis failed:\n%s" % json.dumps(resp_json))

        # Still running (or a retryable error): back off before the next try.
        if n_try < n_tries - 1:
            time.sleep(wait_sec)
            wait_sec = min(2 * wait_sec, max_wait_sec)

    raise TimeoutError("Analysis did not finish after %d tries" % n_tries)

def format_header_result(tables_info, image_path="image_org.png", output_path="image_drawn.png"):
    """Build one header list per table and optionally draw the cell boxes.

    For every table, the first line of each non-empty ``columnHeader`` cell is
    assigned to every column the cell spans. Texts that land in the same
    column are joined with "_" in cell order and spaces are removed.

    When the image can be loaded, each cell is outlined on it: non-empty
    header cells with a thick line, all other cells with a thin line. The
    annotated image is written to ``output_path``.

    Args:
        tables_info: List of table dicts, each with ``cells`` and
            ``columnCount``. Every cell needs ``content`` and ``columnIndex``;
            ``kind`` and ``columnSpan`` are optional. ``boundingRegions`` is
            only read when drawing.
        image_path: Image to draw on. ``None`` disables drawing; an unreadable
            file also just skips drawing.
        output_path: Where the annotated image is saved. ``None`` disables
            saving.

    Returns:
        A dict with ``n_table`` (number of tables), ``headers`` (one list of
        column header strings per table) and ``drawn_image`` (the annotated
        image, or ``None`` when nothing was drawn).
    """
    n_table = len(tables_info)
    header_texts = []
    img_d = None
    if n_table == 0:
        return {'n_table': n_table, 'headers': header_texts, 'drawn_image': img_d}

    # cv2.imread returns None on failure; drawing is then skipped entirely.
    img = cv2.imread(image_path, 1) if image_path is not None else None

    for table_info in tables_info:
        n_col = table_info['columnCount']
        # column_texts[c] collects the header texts covering column c.
        column_texts = [[] for _ in range(n_col)]

        for cell in table_info['cells']:
            yomi = cell['content']
            column_index = cell['columnIndex']
            column_span = cell.get('columnSpan', 1)
            is_named_header = cell.get('kind') == 'columnHeader' and yomi != ""

            if is_named_header:
                first_line = yomi.split("\n", 1)[0]
                # A spanning header labels every column it covers.
                for col in range(column_index, column_index + max(column_span, 1)):
                    if 0 <= col < n_col:
                        column_texts[col].append(first_line)

            if img is not None:
                points = cell['boundingRegions'][0]['polygon']
                # The code reads the polygon as a flat [x, y, x, y, ...] list;
                # cv2.rectangle needs integer pixel coordinates.
                x1, y1 = int(round(points[0])), int(round(points[1]))
                x2, y2 = int(round(points[2])), int(round(points[5]))
                if is_named_header:
                    color, thickness = (0, 124, 254), 3
                else:
                    color, thickness = (0, 254, 124), 1
                img_d = cv2.rectangle(img, (x1, y1), (x2, y2), color=color,
                                      thickness=thickness, lineType=cv2.LINE_4, shift=0)

        header_texts.append(["_".join(texts).replace(" ", "") for texts in column_texts])

    # Save once, and only when at least one rectangle was actually drawn.
    if img_d is not None and output_path is not None:
        cv2.imwrite(output_path, img_d)

    return {'n_table': n_table, 'headers': header_texts, 'drawn_image': img_d}

if __name__ == "__main__":
    # Script entry point: submit the image, poll for the detected tables,
    # then format their headers. The final result is kept in ``dst``.
    ipath = "image_org.png"
    dst = format_header_result(
        get_analyze_layout_result(
            analyze_layout(ipath)
        )
    )

結果として、見つかったテーブルの数だけheaderのリストが出力されるようにしました。

stable版のリクエスト

stable版のリクエストも載せておきます。

formrecognizerver2.py

import cv2
import numpy as np
import requests
import json
import time
import numpy as np
import glob
import os

# Subscription key sent in the 'Ocp-Apim-Subscription-Key' request header.
# The value here is a placeholder; replace it with the key of your own resource.
SUBSCRIPTIONKEY = "hogehogehogehogehogehogehogehoge"
# Base URL of the Form Recognizer resource (placeholder). Request paths are
# appended to it; note that it already ends with "/".
ENDPOINT = "https://hogehogehoge.cognitiveservices.azure.com/"

def analyze_layout(ipath):
    """Submit an image to the v2.1 layout API and return the polling URL.

    The image at ``ipath`` is loaded with OpenCV, re-encoded as PNG and POSTed
    to ``/formrecognizer/v2.1/layout/analyze``.

    Args:
        ipath: Path of the image file to analyze.

    Returns:
        The value of the ``Operation-Location`` response header, i.e. the URL
        that must be polled to obtain the analysis result.

    Raises:
        ValueError: If OpenCV cannot read ``ipath`` as an image.
        RuntimeError: If the image cannot be PNG-encoded, the HTTP request
            fails, or the service does not answer with HTTP 202.
    """
    # ENDPOINT may end with "/"; strip it so the URL has no "//formrecognizer".
    post_url = ENDPOINT.rstrip("/") + "/formrecognizer/v2.1/layout/analyze"
    params = {"includeTextDetails": True}
    headers = {
        # Request headers
        'Content-Type': 'image/png',
        'Ocp-Apim-Subscription-Key': SUBSCRIPTIONKEY,
    }

    # cv2.imread signals failure by returning None instead of raising.
    img = cv2.imread(ipath, 1)
    if img is None:
        raise ValueError("Could not read image: %s" % ipath)

    encoded, num_bytes = cv2.imencode('.png', img)
    if not encoded:
        raise RuntimeError("Could not encode image as PNG: %s" % ipath)
    img_bytes = num_bytes.tobytes()

    try:
        table_analysis = requests.post(
            url=post_url, data=img_bytes, headers=headers, params=params, timeout=60
        )
    except requests.RequestException as e:
        raise RuntimeError("POST analyze failed:\n%s" % str(e)) from e

    if table_analysis.status_code != 202:
        # Use the raw body: an error response is not guaranteed to be JSON.
        raise RuntimeError("POST analyze failed:\n%s" % table_analysis.text)
    return table_analysis.headers["Operation-Location"]

def get_analyze_layout_result(resulturl):
    """Poll the v2.1 layout operation until it finishes and return its tables.

    The result URL is requested up to 10 times. The wait between attempts
    starts at 1 second and doubles after every attempt, capped at 60 seconds.
    Transport errors, non-JSON bodies and non-200 responses are reported and
    retried; every attempt counts against the retry budget.

    Args:
        resulturl: The ``Operation-Location`` URL returned by the POST request.

    Returns:
        The ``tables`` list of the first entry in ``analyzeResult.pageResults``
        (only the first page is read), or an empty list when the succeeded
        response has no page results or no tables.

    Raises:
        RuntimeError: If the service reports the status ``failed``.
        TimeoutError: If no ``succeeded`` status is seen within the retry
            budget.
    """
    n_tries = 10
    wait_sec = 1
    max_wait_sec = 60
    headers = {"Ocp-Apim-Subscription-Key": SUBSCRIPTIONKEY}

    for n_try in range(n_tries):
        try:
            resp = requests.get(url=resulturl, headers=headers, timeout=30)
            resp_json = resp.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a response body that is not valid JSON.
            print("GET analyze results failed:\n%s" % str(e))
        else:
            if resp.status_code != 200:
                print("GET analyze results failed:\n%s" % json.dumps(resp_json))
            else:
                status = resp_json.get("status")
                print("status:", status)
                if status == "succeeded":
                    page_results = resp_json.get('analyzeResult', {}).get('pageResults') or []
                    if not page_results:
                        return []
                    return page_results[0].get('tables', [])
                if status == "failed":
                    # A failed operation never recovers, so stop polling.
                    raise RuntimeError("Analysis failed:\n%s" % json.dumps(resp_json))

        # Analysis still running (or a retryable error). Wait and retry.
        if n_try < n_tries - 1:
            time.sleep(wait_sec)
            wait_sec = min(2 * wait_sec, max_wait_sec)

    raise TimeoutError("Analysis did not finish after %d tries" % n_tries)

def format_header_result(tables_info, image_path="2send.png"):
    """Build one header list per table and optionally draw the cell boxes.

    For every table, the text of each header cell (``isHeader`` true) is
    assigned to every column the cell spans. Texts that land in the same
    column are joined with "_" in cell order and spaces are removed.

    When the image can be loaded, each cell is outlined on it: header cells
    with a thick line, all other cells with a thin line. The annotated image
    is only returned, not written to disk.

    Args:
        tables_info: List of table dicts, each with ``cells`` and ``columns``
            (used as the number of columns). Every cell needs ``text`` and
            ``columnIndex``; ``isHeader`` and ``columnSpan`` are optional.
            ``boundingBox`` is only read when drawing.
        image_path: Image to draw on. ``None`` disables drawing; an unreadable
            file also just skips drawing.
            NOTE(review): the default differs from the "image_org.png" that
            the script's entry point uploads; confirm which file is intended.

    Returns:
        A dict with ``n_table`` (number of tables), ``headers`` (one list of
        column header strings per table) and ``drawn_image`` (the annotated
        image, or ``None`` when nothing was drawn).
    """
    n_table = len(tables_info)
    header_texts = []
    img_d = None
    if n_table == 0:
        return {'n_table': n_table, 'headers': header_texts, 'drawn_image': img_d}

    # cv2.imread returns None on failure; drawing is then skipped entirely.
    img = cv2.imread(image_path, 1) if image_path is not None else None

    for table_info in tables_info:
        n_col = table_info['columns']
        # column_texts[c] collects the header texts covering column c.
        column_texts = [[] for _ in range(n_col)]

        for cell in table_info['cells']:
            is_header = bool(cell.get('isHeader', False))
            yomi = cell['text']
            column_index = cell['columnIndex']
            column_span = cell.get('columnSpan', 1)

            if is_header:
                # A spanning header labels every column it covers.
                for col in range(column_index, column_index + max(column_span, 1)):
                    if 0 <= col < n_col:
                        column_texts[col].append(yomi)

            if img is not None:
                points = cell['boundingBox']
                # The code reads the box as a flat [x, y, x, y, ...] list;
                # cv2.rectangle needs integer pixel coordinates.
                x1, y1 = int(round(points[0])), int(round(points[1]))
                x2, y2 = int(round(points[2])), int(round(points[5]))
                if is_header:
                    color, thickness = (0, 124, 254), 3
                else:
                    color, thickness = (0, 254, 124), 1
                img_d = cv2.rectangle(img, (x1, y1), (x2, y2), color=color,
                                      thickness=thickness, lineType=cv2.LINE_4, shift=0)

        header_texts.append(["_".join(texts).replace(" ", "") for texts in column_texts])

    return {'n_table': n_table, 'headers': header_texts, 'drawn_image': img_d}

if __name__ == "__main__":
    # Script entry point: submit the image, poll for the detected tables,
    # then format their headers. The final result is kept in ``dst``.
    ipath = "image_org.png"
    dst = format_header_result(
        get_analyze_layout_result(
            analyze_layout(ipath)
        )
    )

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up