はじめに
Azure AI Document IntelligenceでPDFからMarkdownを生成していきます
開発環境
- Windows 10 PC
- Python 3.11
- Azure
導入
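Azure SDK for Python の公式サンプル(sample_analyze_documents_output_in_markdown.py)を参考にします。以下のコードは azure-ai-documentintelligence と python-dotenv のパッケージを使用するので、事前にインストールしておいてください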
URL指定
PDFファイルをURLで指定して、Document Intelligenceに投げて、結果のMarkdownを保存します
sample_analyze_documents_output_in_markdown.py
import os
def analyze_documents_output_in_markdown():
    """Analyze a PDF referenced by URL and save the result as Markdown.

    Reads the service endpoint and API key from the
    ``DOCUMENTINTELLIGENCE_ENDPOINT`` and ``DOCUMENTINTELLIGENCE_API_KEY``
    environment variables, submits the sample invoice URL to the
    ``prebuilt-layout`` model with Markdown requested as the content format,
    writes the returned content to ``Invoice_1.md`` and echoes it to stdout.

    Raises:
        KeyError: If either environment variable is not set.
    """
    # [START analyze_documents_output_in_markdown]
    from azure.core.credentials import AzureKeyCredential
    from azure.ai.documentintelligence import DocumentIntelligenceClient
    from azure.ai.documentintelligence.models import (
        AnalyzeDocumentRequest,
        DocumentContentFormat,
        AnalyzeResult,
    )

    service_endpoint = os.environ["DOCUMENTINTELLIGENCE_ENDPOINT"]
    api_key = os.environ["DOCUMENTINTELLIGENCE_API_KEY"]
    source_url = "https://raw.githubusercontent.com/Azure/azure-sdk-for-python/main/sdk/documentintelligence/azure-ai-documentintelligence/samples/sample_forms/forms/Invoice_1.pdf"

    client = DocumentIntelligenceClient(
        endpoint=service_endpoint,
        credential=AzureKeyCredential(api_key),
    )

    # The service fetches the document itself from the given URL.
    request = AnalyzeDocumentRequest(url_source=source_url)
    analysis = client.begin_analyze_document(
        "prebuilt-layout",
        request,
        output_content_format=DocumentContentFormat.MARKDOWN,
    )
    # Block until the long-running analysis has finished.
    result: AnalyzeResult = analysis.result()

    # Persist the Markdown before printing it.
    output_file = "Invoice_1.md"
    with open(output_file, "w", encoding="utf-8") as markdown_file:
        markdown_file.write(result.content)
    print(f"Content has been saved to {output_file}")

    print(f"Here's the full content in format {result.content_format}:\n")
    print(result.content)
    # [END analyze_documents_output_in_markdown]
if __name__ == "__main__":
    from azure.core.exceptions import HttpResponseError
    from dotenv import find_dotenv, load_dotenv

    try:
        # Load the .env file so the endpoint and key are available in os.environ.
        load_dotenv(find_dotenv())
        analyze_documents_output_in_markdown()
    except HttpResponseError as error:
        # Examples of how to inspect an HttpResponseError before re-raising it.
        if error.error is None:
            # No structured inner error: fall back to checking the message text.
            if "Invalid request".casefold() in error.message.casefold():
                print(f"Uh-oh! Seems there was an invalid request: {error}")
        elif error.error.code == "InvalidImage":
            print(f"Received an invalid image error: {error.error}")
        elif error.error.code == "InvalidRequest":
            print(f"Received an invalid request error: {error.error}")
        # Whatever was printed, the error is always propagated to the caller.
        raise
スクリプトは環境変数からエンドポイントとAPIキーを読み込むので、.envを作成して、Document Intelligenceリソースのエンドポイントとキーを設定してください
.env
DOCUMENTINTELLIGENCE_ENDPOINT=<INSERT-YOUR-ENDPOINT>
DOCUMENTINTELLIGENCE_API_KEY=<INSERT-YOUR-API-KEY>
実行結果は下記のようになります
Contoso
Address:
1 Redmond way Suite
6000 Redmond, WA
99243
Invoice For: Microsoft
1020 Enterprise Way
Sunnayvale, CA 87659
<table>
<tr>
<th>Invoice Number</th>
<th>Invoice Date</th>
<th>Invoice Due Date</th>
<th>Charges</th>
<th>VAT ID</th>
</tr>
<tr>
<td>34278587</td>
<td>6/18/2017</td>
<td>6/24/2017</td>
<td>$56,651.49</td>
<td>PT</td>
</tr>
</table>
ファイル指定
ローカルのPDFファイルをDocument Intelligenceに投げて、結果をMarkdownファイルで保存します
sample_analyze_documents_output_in_markdown_from_bytes.py
import os
def analyze_documents_output_in_markdown():
    """Analyze a local PDF file and save the result as Markdown.

    Reads the service endpoint and API key from the
    ``DOCUMENTINTELLIGENCE_ENDPOINT`` and ``DOCUMENTINTELLIGENCE_API_KEY``
    environment variables, uploads the bytes of ``Invoice_1.pdf`` to the
    ``prebuilt-layout`` model with Markdown requested as the content format,
    writes the returned content next to the input with a ``.md`` extension
    and echoes it to stdout.

    Raises:
        KeyError: If either environment variable is not set.
        OSError: If the input PDF cannot be read or the output file cannot
            be written.
    """
    # [START analyze_documents_output_in_markdown]
    from azure.core.credentials import AzureKeyCredential
    from azure.ai.documentintelligence import DocumentIntelligenceClient
    from azure.ai.documentintelligence.models import (
        AnalyzeDocumentRequest,
        DocumentContentFormat,
        AnalyzeResult,
    )

    endpoint = os.environ["DOCUMENTINTELLIGENCE_ENDPOINT"]
    key = os.environ["DOCUMENTINTELLIGENCE_API_KEY"]
    filename = "Invoice_1.pdf"

    document_intelligence_client = DocumentIntelligenceClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    # Read the document inside a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(filename, "rb") as pdf_file:
        document_bytes = pdf_file.read()

    poller = document_intelligence_client.begin_analyze_document(
        "prebuilt-layout",
        AnalyzeDocumentRequest(bytes_source=document_bytes),
        output_content_format=DocumentContentFormat.MARKDOWN,
    )
    # Block until the long-running analysis has finished.
    result: AnalyzeResult = poller.result()

    # Save markdown content to a file named after the input (Invoice_1.pdf -> Invoice_1.md)
    output_file = os.path.splitext(filename)[0] + ".md"
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(result.content)
    print(f"Content has been saved to {output_file}")

    print(f"Here's the full content in format {result.content_format}:\n")
    print(result.content)
    # [END analyze_documents_output_in_markdown]
if __name__ == "__main__":
    from azure.core.exceptions import HttpResponseError
    from dotenv import find_dotenv, load_dotenv

    try:
        # Load the .env file so the endpoint and key are available in os.environ.
        load_dotenv(find_dotenv())
        analyze_documents_output_in_markdown()
    except HttpResponseError as error:
        # Examples of how to inspect an HttpResponseError before re-raising it.
        if error.error is None:
            # No structured inner error: fall back to checking the message text.
            if "Invalid request".casefold() in error.message.casefold():
                print(f"Uh-oh! Seems there was an invalid request: {error}")
        elif error.error.code == "InvalidImage":
            print(f"Received an invalid image error: {error.error}")
        elif error.error.code == "InvalidRequest":
            print(f"Received an invalid request error: {error.error}")
        # Whatever was printed, the error is always propagated to the caller.
        raise
結果はこのようになります
Contoso
Address:
1 Redmond way Suite
6000 Redmond, WA
99243
Invoice For: Microsoft
1020 Enterprise Way
Sunnayvale, CA 87659
<table>
<tr>
<th>Invoice Number</th>
<th>Invoice Date</th>
<th>Invoice Due Date</th>
<th>Charges</th>
<th>VAT ID</th>
</tr>
<tr>
<td>34278587</td>
<td>6/18/2017</td>
<td>6/24/2017</td>
<td>$56,651.49</td>
<td>PT</td>
</tr>
</table>
お疲れ様でした