全体概要
表題はこちらのサービスになります。
現状、様々なベンダーから様々なOCRサービスが提供されています。しかし、その多くは文字画像をどれだけ正しく読むかに重点を置いています。このAPIが画期的なのは、表の構造やヘッダーまで認識して読み取ってくれる点です。
2023/04/27時点でのpublicなAPIだとAzure Form Recognizerが最強なのではというのが個人的感想です。
やったこと
現在API仕様書がきちんと公開されているのがver2です。(以下stable版)
しかしGUIサイトで動いているのはver3です。(以下preview版)
こちらのサイトではver3のAPI仕様の説明がありますが、ググってもpreview版の実装例があまりありませんでした。
ないなら書こうということで、以下がPythonでの実装例です。
preview版のリクエスト
formrecognizerver3.py
import cv2
import numpy as np
import requests
import json
import time
import numpy as np
import glob
import os
# Placeholder credentials: replace both values with the key and endpoint of
# your own Form Recognizer resource before running.
SUBSCRIPTIONKEY = "hogehogehogehogehogehogehogehoge"
# Base URL of the resource; analyze_layout appends the request path to it.
ENDPOINT = "https://hogehogehoge.cognitiveservices.azure.com/"
def analyze_layout(ipath):
    """Submit an image to the prebuilt-layout model and return the result URL.

    The image at ``ipath`` is loaded with OpenCV, re-encoded as PNG and POSTed
    to the ``prebuilt-layout:analyze`` endpoint (api-version 2022-08-31).

    Args:
        ipath: Path of the image file to analyze.

    Returns:
        The value of the ``Operation-Location`` response header, which is the
        URL that ``get_analyze_layout_result`` polls.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``ipath``.
        RuntimeError: If PNG encoding fails, the HTTP request fails, or the
            service answers with anything other than 202.
    """
    # ENDPOINT may or may not end with "/"; normalise to avoid "//formrecognizer".
    post_url = (
        ENDPOINT.rstrip("/")
        + "/formrecognizer/documentModels/prebuilt-layout:analyze?api-version=2022-08-31"
    )
    params = {"includeTextDetails": True}
    headers = {
        'Content-Type': 'image/png',
        'Ocp-Apim-Subscription-Key': SUBSCRIPTIONKEY,
    }

    # The image must be loaded here; cv2.imread returns None instead of raising.
    img = cv2.imread(ipath, 1)
    if img is None:
        raise FileNotFoundError("Could not read image: %s" % ipath)
    encoded_ok, num_bytes = cv2.imencode('.png', img)
    if not encoded_ok:
        raise RuntimeError("Could not encode image as PNG: %s" % ipath)
    img_bytes = num_bytes.tobytes()

    try:
        table_analysis = requests.post(
            url=post_url, data=img_bytes, headers=headers, params=params, timeout=60
        )
    except requests.RequestException as e:
        raise RuntimeError("POST analyze failed:\n%s" % str(e)) from e

    if table_analysis.status_code != 202:
        # Use the raw body: an error response is not guaranteed to be JSON.
        raise RuntimeError("POST analyze failed:\n%s" % table_analysis.text)
    return table_analysis.headers["Operation-Location"]
def get_analyze_layout_result(resulturl):
    """Poll the analyze operation until it finishes and return its tables.

    The operation URL is requested up to 10 times. The wait between polls
    starts at 1 second and doubles each time, capped at 60 seconds.

    Args:
        resulturl: Operation URL returned by ``analyze_layout``.

    Returns:
        The ``analyzeResult.tables`` list of the finished operation, or an
        empty list when the result contains no ``tables`` entry.

    Raises:
        RuntimeError: If the service reports status ``failed``.
        TimeoutError: If the operation has not succeeded after all polls.
    """
    n_tries = 10
    wait_sec = 1
    max_wait_sec = 60
    headers = {"Ocp-Apim-Subscription-Key": SUBSCRIPTIONKEY}

    # A bounded for-loop guarantees that every attempt, including one that
    # raises, counts against the retry budget and is followed by a wait.
    for _ in range(n_tries):
        try:
            resp = requests.get(url=resulturl, headers=headers, timeout=30)
            resp_json = resp.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a response body that is not valid JSON.
            print("GET analyze results failed:\n%s" % str(e))
        else:
            if resp.status_code != 200:
                print("GET analyze results failed:\n%s" % json.dumps(resp_json))
            else:
                status = resp_json.get("status")
                print("status:", status)
                if status == "succeeded":
                    return resp_json.get('analyzeResult', {}).get('tables', [])
                if status == "failed":
                    # Terminal state: polling again cannot change the outcome.
                    raise RuntimeError("Analysis failed:\n%s" % json.dumps(resp_json))
        # Analysis still running (or a transient error occurred): back off.
        time.sleep(wait_sec)
        wait_sec = min(2 * wait_sec, max_wait_sec)

    raise TimeoutError("Analysis did not finish after %d polls" % n_tries)
def _column_header_texts(header_cells, n_col):
    """Join, per column, the texts of every header cell covering that column.

    ``header_cells`` is a list of ``(text, column_index, column_span)`` tuples
    in cell order. A cell spanning several columns contributes its text to
    each of them. Texts within one column are joined with "_" and spaces are
    removed.
    """
    per_column = [[] for _ in range(n_col)]
    for text, column_index, column_span in header_cells:
        first = max(column_index, 0)
        last = min(column_index + max(column_span, 1), n_col)
        for col in range(first, last):
            per_column[col].append(text)
    return ["_".join(texts).replace(" ", "") for texts in per_column]


def _draw_cell_box(img, polygon, is_header):
    """Draw one cell rectangle on ``img`` in place and return the image.

    ``polygon`` is assumed to be a flat [x1, y1, x2, y2, x3, y3, x4, y4] list;
    the box is drawn from (x1, y1) to (x2, y3). Header cells get a thick box,
    all other cells a thin box in a different colour.
    """
    # cv2.rectangle needs integer pixel coordinates.
    x1, y1 = int(round(polygon[0])), int(round(polygon[1]))
    x2, y2 = int(round(polygon[2])), int(round(polygon[5]))
    if is_header:
        color, thickness = (0, 124, 254), 3
    else:
        color, thickness = (0, 254, 124), 1
    return cv2.rectangle(img, (x1, y1), (x2, y2), color=color,
                         thickness=thickness, lineType=cv2.LINE_4, shift=0)


def format_header_result(tables_info, image_path="image_org.png",
                         output_path="image_drawn.png"):
    """Extract column-header texts per table and optionally draw cell boxes.

    A cell counts as a header when its ``kind`` is ``columnHeader`` and its
    ``content`` is not empty. Only the first line of the content is used.

    Args:
        tables_info: List of table dicts as returned by
            ``get_analyze_layout_result`` (keys used: ``cells``,
            ``columnCount``).
        image_path: Image to draw the cell boxes on. Pass ``None`` to skip
            drawing entirely.
        output_path: Where the annotated image is written. Pass ``None`` to
            skip writing.

    Returns:
        A dict with ``n_table`` (number of tables), ``headers`` (one list of
        per-column header strings per table) and ``drawn_image`` (the
        annotated image, or ``None`` when nothing was drawn).
    """
    n_table = len(tables_info)
    header_texts = []
    img_d = None

    img = None
    if n_table and image_path is not None:
        img = cv2.imread(image_path, 1)
        if img is None:
            # Headers can still be extracted without the picture.
            print("Could not read image, skipping drawing: %s" % image_path)

    for table_info in tables_info:
        header_cells = []
        for cell in table_info['cells']:
            text = cell['content']
            is_header = cell.get('kind') == 'columnHeader' and text != ""
            if is_header:
                header_cells.append(
                    (text.split("\n", 1)[0], cell['columnIndex'], cell.get('columnSpan', 1))
                )
            if img is not None:
                img_d = _draw_cell_box(img, cell['boundingRegions'][0]['polygon'], is_header)
        header_texts.append(_column_header_texts(header_cells, table_info['columnCount']))

    # Only write when something was actually drawn; imwrite rejects None.
    if img_d is not None and output_path is not None:
        cv2.imwrite(output_path, img_d)
    return {'n_table': n_table, 'headers': header_texts, 'drawn_image': img_d}
if __name__ == "__main__":
    # Full pipeline: submit the image, wait for the analysis to finish,
    # then pull the column headers out of the detected tables.
    ipath = "image_org.png"
    dst = format_header_result(get_analyze_layout_result(analyze_layout(ipath)))
結果として、検出したテーブルの数だけheaderのリストが出力されるようにしました。
stable版のリクエスト
stable版のリクエストも載せておきます。
formrecognizerver2.py
import cv2
import numpy as np
import requests
import json
import time
import numpy as np
import glob
import os
# Placeholder credentials: replace both values with the key and endpoint of
# your own Form Recognizer resource before running.
SUBSCRIPTIONKEY = "hogehogehogehogehogehogehogehoge"
# Base URL of the resource; analyze_layout appends the request path to it.
ENDPOINT = "https://hogehogehoge.cognitiveservices.azure.com/"
def analyze_layout(ipath):
    """Submit an image to the v2.1 Layout API and return the result URL.

    The image at ``ipath`` is loaded with OpenCV, re-encoded as PNG and POSTed
    to the ``/formrecognizer/v2.1/layout/analyze`` endpoint.

    Args:
        ipath: Path of the image file to analyze.

    Returns:
        The value of the ``Operation-Location`` response header, which is the
        URL that ``get_analyze_layout_result`` polls.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``ipath``.
        RuntimeError: If PNG encoding fails, the HTTP request fails, or the
            service answers with anything other than 202.
    """
    # ENDPOINT may or may not end with "/"; normalise to avoid "//formrecognizer".
    post_url = ENDPOINT.rstrip("/") + "/formrecognizer/v2.1/layout/analyze"
    params = {"includeTextDetails": True}
    headers = {
        # Request headers
        'Content-Type': 'image/png',
        'Ocp-Apim-Subscription-Key': SUBSCRIPTIONKEY,
    }

    # cv2.imread returns None instead of raising when the file is unreadable.
    img = cv2.imread(ipath, 1)
    if img is None:
        raise FileNotFoundError("Could not read image: %s" % ipath)
    encoded_ok, num_bytes = cv2.imencode('.png', img)
    if not encoded_ok:
        raise RuntimeError("Could not encode image as PNG: %s" % ipath)
    img_bytes = num_bytes.tobytes()

    try:
        table_analysis = requests.post(
            url=post_url, data=img_bytes, headers=headers, params=params, timeout=60
        )
    except requests.RequestException as e:
        raise RuntimeError("POST analyze failed:\n%s" % str(e)) from e

    if table_analysis.status_code != 202:
        # Use the raw body: an error response is not guaranteed to be JSON.
        raise RuntimeError("POST analyze failed:\n%s" % table_analysis.text)
    return table_analysis.headers["Operation-Location"]
def get_analyze_layout_result(resulturl):
    """Poll the v2.1 analyze operation until it finishes and return its tables.

    The operation URL is requested up to 10 times. The wait between polls
    starts at 1 second and doubles each time, capped at 60 seconds.

    Args:
        resulturl: Operation URL returned by ``analyze_layout``.

    Returns:
        The ``tables`` list of the first entry in
        ``analyzeResult.pageResults``, or an empty list when the result has
        no page results or no ``tables`` entry.

    Raises:
        RuntimeError: If the service reports status ``failed``.
        TimeoutError: If the operation has not succeeded after all polls.
    """
    n_tries = 10
    wait_sec = 1
    max_wait_sec = 60
    headers = {"Ocp-Apim-Subscription-Key": SUBSCRIPTIONKEY}

    # A bounded for-loop guarantees that every attempt, including one that
    # raises, counts against the retry budget and is followed by a wait.
    for _ in range(n_tries):
        try:
            resp = requests.get(url=resulturl, headers=headers, timeout=30)
            resp_json = resp.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a response body that is not valid JSON.
            print("GET analyze results failed:\n%s" % str(e))
        else:
            if resp.status_code != 200:
                print("GET analyze results failed:\n%s" % json.dumps(resp_json))
            else:
                status = resp_json.get("status")
                print("status:", status)
                if status == "succeeded":
                    page_results = resp_json.get('analyzeResult', {}).get('pageResults', [])
                    # Only the first page is read, as in the original script.
                    return page_results[0].get('tables', []) if page_results else []
                if status == "failed":
                    # Terminal state: polling again cannot change the outcome.
                    raise RuntimeError("Analysis failed:\n%s" % json.dumps(resp_json))
        # Analysis still running (or a transient error occurred): back off.
        time.sleep(wait_sec)
        wait_sec = min(2 * wait_sec, max_wait_sec)

    raise TimeoutError("Analysis did not finish after %d polls" % n_tries)
def _column_header_texts(header_cells, n_col):
    """Join, per column, the texts of every header cell covering that column.

    ``header_cells`` is a list of ``(text, column_index, column_span)`` tuples
    in cell order. A cell spanning several columns contributes its text to
    each of them. Texts within one column are joined with "_" and spaces are
    removed.
    """
    per_column = [[] for _ in range(n_col)]
    for text, column_index, column_span in header_cells:
        first = max(column_index, 0)
        last = min(column_index + max(column_span, 1), n_col)
        for col in range(first, last):
            per_column[col].append(text)
    return ["_".join(texts).replace(" ", "") for texts in per_column]


def _draw_cell_box(img, bounding_box, is_header):
    """Draw one cell rectangle on ``img`` in place and return the image.

    ``bounding_box`` is assumed to be a flat [x1, y1, x2, y2, x3, y3, x4, y4]
    list; the box is drawn from (x1, y1) to (x2, y3). Header cells get a thick
    box, all other cells a thin box in a different colour.
    """
    # cv2.rectangle needs integer pixel coordinates.
    x1, y1 = int(round(bounding_box[0])), int(round(bounding_box[1]))
    x2, y2 = int(round(bounding_box[2])), int(round(bounding_box[5]))
    if is_header:
        color, thickness = (0, 124, 254), 3
    else:
        color, thickness = (0, 254, 124), 1
    return cv2.rectangle(img, (x1, y1), (x2, y2), color=color,
                         thickness=thickness, lineType=cv2.LINE_4, shift=0)


def format_header_result(tables_info, image_path="image_org.png"):
    """Extract column-header texts per table and optionally draw cell boxes.

    A cell counts as a header when its ``isHeader`` flag is truthy; a cell
    without the flag is treated as a non-header cell.

    Args:
        tables_info: List of table dicts as returned by
            ``get_analyze_layout_result`` (keys used: ``cells``, ``columns``).
        image_path: Image to draw the cell boxes on. The default matches the
            file analyzed by this script's ``__main__`` block (the earlier
            hard-coded value was "2send.png"). Pass ``None`` to skip drawing.

    Returns:
        A dict with ``n_table`` (number of tables), ``headers`` (one list of
        per-column header strings per table) and ``drawn_image`` (the
        annotated image, or ``None`` when nothing was drawn).
    """
    n_table = len(tables_info)
    header_texts = []
    img_d = None

    img = None
    if n_table and image_path is not None:
        img = cv2.imread(image_path, 1)
        if img is None:
            # Headers can still be extracted without the picture.
            print("Could not read image, skipping drawing: %s" % image_path)

    for table_info in tables_info:
        # 'columns' holds the number of columns in the table.
        n_col = table_info['columns']
        header_cells = []
        for cell in table_info['cells']:
            is_header = bool(cell.get('isHeader', False))
            if is_header:
                header_cells.append(
                    (cell['text'], cell['columnIndex'], cell.get('columnSpan', 1))
                )
            if img is not None:
                img_d = _draw_cell_box(img, cell['boundingBox'], is_header)
        header_texts.append(_column_header_texts(header_cells, n_col))

    return {'n_table': n_table, 'headers': header_texts, 'drawn_image': img_d}
if __name__ == "__main__":
    # Full pipeline: submit the image, wait for the analysis to finish,
    # then pull the column headers out of the detected tables.
    ipath = "image_org.png"
    dst = format_header_result(get_analyze_layout_result(analyze_layout(ipath)))