背景
Azure AI Search で Index に何が格納されているか?を確認する方法が欲しかったので用意した記録
サンプルコード
AZURE_SEARCH_SERVICE
の対象とする index_name
内のドキュメントを一覧表示します
JSON化してファイルへ出力するようにしてみた。
- filter: OData $filter で指定。eq, ne などはいけるが、startswith() は未対応
- top: 取得数。None で全部
preview documents in index
import os
import json
from azure.identity import DefaultAzureCredential
from azure.search.documents import SearchClient
from dotenv import load_dotenv
from icecream import ic
# Load configuration from the azd-managed .env file into the environment.
def load_environment(env_name: str = "azd env name") -> None:
    """Load ``.azure/<env_name>/.env`` into the process environment.

    Args:
        env_name: Name of the azd environment directory under ``.azure``.
            Defaults to the original hard-coded placeholder so existing
            callers keep working.
    """
    dotenv_path = os.path.join(".azure", env_name, ".env")
    load_dotenv(dotenv_path)
def get_search_client(search_service, index_name):
    """Build a SearchClient for *index_name* on the given search service.

    Authentication uses DefaultAzureCredential (managed identity, CLI
    login, etc.), not an admin key.
    """
    endpoint = f"https://{search_service}.search.windows.net"
    return SearchClient(
        endpoint=endpoint,
        index_name=index_name,
        credential=DefaultAzureCredential(),
    )
def get_documents(search_client, top=None, filter=None):
    """Run a match-all query and return the hits as a list.

    Args:
        search_client: Client exposing ``search(search_text=, top=, filter=)``.
        top: Maximum number of documents to fetch; ``None`` fetches all.
        filter: OData ``$filter`` expression, or ``None`` for no filtering.
            (The name shadows the builtin, but it is part of the keyword
            interface and is kept for callers.)
    """
    return list(search_client.search(search_text="*", top=top, filter=filter))
def process_documents(documents):
    """Clean documents for human-readable output.

    Drops ``None``-valued fields and truncates any ``embedding`` vector to
    its first element so the JSON dump stays small. Each cleaned document
    is echoed via ``ic`` for debugging.
    """
    processed = []
    for document in documents:
        cleaned = {key: value for key, value in document.items() if value is not None}
        if "embedding" in cleaned:
            # Keep only the first element as a sample of the vector.
            cleaned["embedding"] = cleaned["embedding"][:1]
        processed.append(cleaned)
        ic(cleaned)
    return processed
def write_to_json(file_path, data):
    """Serialize *data* to *file_path* as pretty-printed UTF-8 JSON."""
    with open(file_path, mode="w", encoding="utf-8") as fp:
        json.dump(data, fp, ensure_ascii=False, indent=4)
def main():
    """Fetch documents from the target index and dump them to a JSON file."""
    load_environment()
    search_service = os.getenv("AZURE_SEARCH_SERVICE")
    index_name = "index_name"  # target index name
    client = get_search_client(search_service, index_name)

    odata_filter = "sourcefile eq 'filename.pdf'"
    number_of_documents = 10  # set to None to fetch every document
    documents = get_documents(client, number_of_documents, filter=odata_filter)

    output_file_path = f"indexDocuments_{index_name}.json"  # index name in the file name
    if not documents:
        print("No documents found.")
    else:
        processed_docs = process_documents(documents)
        write_to_json(output_file_path, processed_docs)
        print(f"Retrieved {len(documents)} documents and saved to {output_file_path}.")


if __name__ == "__main__":
    main()
あとがき
新しい機能がバンバン入ってくるわけで……skillset とか試したいところ
skillset を試した記録
# Split skill: chunk each document's text into overlapping pages.
# NOTE: the pasted snippet contained a stray `uuid = ...` line (a leaked
# notebook cell id) inside this call, which is a syntax error; removed.
split_skill = SplitSkill(
    name=f"{index_name}-split-skill",
    description="Split skill to chunk documents",
    text_split_mode="pages",
    context="/document",
    default_language_code="ja",
    maximum_page_length=2048,
    page_overlap_length=20,
    inputs=[
        InputFieldMappingEntry(name="text", source="/document/metadata_storage_name"),
    ],
    outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
)
# Conditional skill: pick the storage name for blobs larger than 180 bytes,
# otherwise fall back to the storage path.
conditional_inputs = [
    InputFieldMappingEntry(name="condition", source="= $(/document/metadata_storage_size)>180"),
    InputFieldMappingEntry(name="whenTrue", source="/document/metadata_storage_name"),
    InputFieldMappingEntry(name="whenFalse", source="/document/metadata_storage_path"),
]
conditional_skill = ConditionalSkill(
    name=f"{index_name}-conditional-skill",
    description="Conditional skill to extract file extension from file name",
    context="/document",
    inputs=conditional_inputs,
    outputs=[OutputFieldMappingEntry(name="output", target_name="conditionResult")],
)
# Merge skill: insert the split pages into the base text.
merge_skill = MergeSkill(
    name=f"{index_name}-merge-skill",
    description="Merge text from OCR and text from document",
    context="/document",
    inputs=[
        InputFieldMappingEntry(name="itemsToInsert", source="/document/pages/*"),
        InputFieldMappingEntry(name="text", source="/document/metadata_storage_name"),
        # Alternative sources can be wired in here instead, e.g.
        # text <- /document/content, itemsToInsert/offsets <- the OCR output
        # under /document/normalized_images/*.
    ],
    outputs=[OutputFieldMappingEntry(name="mergedText", target_name="mergedContent")],
)
# Assemble the skillset from the individual skills defined above.
skillset_skills = [split_skill, conditional_skill, merge_skill, document_extraction_skill]
skillset = SearchIndexerSkillset(
    name=skillset_name,
    description="Skillset to chunk documents and generate embeddings",
    skills=skillset_skills,
    # index_projection=index_projection,
)
# Indexer: map each skill output under /document onto an index field.
output_mappings = [
    FieldMapping(source_field_name="/document/pages/*", target_field_name="pages"),
    FieldMapping(source_field_name="/document/conditionResult", target_field_name="conditionResult"),
    FieldMapping(source_field_name="/document/mergedContent", target_field_name="mergedContent"),
]
indexer = SearchIndexer(
    skillset_name=skillset_name,
    output_field_mappings=output_mappings,
)