Azure: A sample for checking the documents inside an Azure AI Search index

Posted at 2024-12-11

Background

A record of the script I put together because I wanted a way to check what is actually stored in an Azure AI Search index.

Sample code

Lists the documents in the index specified by index_name on the AZURE_SEARCH_SERVICE service.

The results are also written out to a file as JSON.

  • filter: specified as an OData $filter expression. Comparison operators such as eq and ne work, but startswith() is not supported (see the sketch after this list)
  • top: the number of documents to retrieve. None retrieves all of them
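
For reference, a minimal sketch of filter strings, using the sourcefile field from the sample below (search.ismatch is a full-text alternative and assumes the field is searchable):

# Comparison operators work in OData $filter
filter_eq = "sourcefile eq 'filename.pdf'"
filter_ne = "sourcefile ne 'filename.pdf'"
# startswith() is not supported; a prefix match can instead be written
# with search.ismatch, assuming sourcefile is a searchable field
filter_prefix = "search.ismatch('filename*', 'sourcefile')"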
preview documents in index

import os
import json
from azure.identity import DefaultAzureCredential
from azure.search.documents import SearchClient
from dotenv import load_dotenv
from icecream import ic

# Load settings from environment variables
def load_environment():
    dotenv_path = os.path.join(".azure", "azd env name", ".env")  # "azd env name" is a placeholder for your azd environment name
    load_dotenv(dotenv_path)

def get_search_client(search_service, index_name):
    service_endpoint = f"https://{search_service}.search.windows.net"
    credential = DefaultAzureCredential()
    return SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)

def get_documents(search_client, top=None, filter=None):
    search_text = "*"
    results = search_client.search(search_text=search_text, top=top, filter=filter)
    return [result for result in results]

def process_documents(documents):
    processed_docs = []
    
    for doc in documents:
        doc_copy = {k: v for k, v in doc.items() if v is not None}  # drop None-valued fields
        
        if "embedding" in doc_copy:
            doc_copy["embedding"] = doc_copy["embedding"][:1]  # embeddingの最初の要素だけ残す
        
        processed_docs.append(doc_copy)
        ic(doc_copy)
    
    return processed_docs

def write_to_json(file_path, data):
    with open(file_path, "w", encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

def main():
    load_environment()
    AZURE_SEARCH_SERVICE = os.getenv("AZURE_SEARCH_SERVICE")
    index_name = "index_name"  # Index nameを指定
    search_client = get_search_client(AZURE_SEARCH_SERVICE, index_name)

    filter = "sourcefile eq 'filename.pdf'"
    number_of_documents = 10  # set to None to retrieve all documents
    documents = get_documents(search_client, number_of_documents, filter=filter)

    output_file_path = f"indexDocuments_{index_name}.json"  # use index_name in the output file name
    
    if len(documents) == 0:
        print("No documents found.")
    else:
        processed_docs = process_documents(documents)
        write_to_json(output_file_path, processed_docs)
        print(f"Retrieved {len(documents)} documents and saved to {output_file_path}.")

if __name__ == "__main__":
    main()
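
For reference, a minimal sketch of the prerequisites this script assumes: DefaultAzureCredential needs a signed-in identity (for example az login or a managed identity), and the azd environment's .env file supplies the service name. The environment name dev and the service name below are placeholders:

import os
from dotenv import load_dotenv

# .azure/dev/.env is assumed to contain a line like:
#   AZURE_SEARCH_SERVICE=my-search-service
load_dotenv(os.path.join(".azure", "dev", ".env"))
print(os.getenv("AZURE_SEARCH_SERVICE"))  # should print the service name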

Afterword

New features keep landing one after another... skillsets are the kind of thing I'd like to try next :sweat:

A record of trying out skillsets

from azure.search.documents.indexes.models import (
    ConditionalSkill,
    FieldMapping,
    InputFieldMappingEntry,
    MergeSkill,
    OutputFieldMappingEntry,
    SearchIndexer,
    SearchIndexerSkillset,
    SplitSkill,
)

split_skill = SplitSkill(
    name=f"{index_name}-split-skill",
    description="Split skill to chunk documents",
    text_split_mode="pages",
    context="/document",
    default_language_code="ja",
    maximum_page_length=2048,
    page_overlap_length=20,
    inputs=[
        InputFieldMappingEntry(name="text", source="/document/metadata_storage_name"),
    ],
    outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
)
conditional_skill = ConditionalSkill(
    name=f"{index_name}-conditional-skill",
    description="Conditional skill to extract file extension from file name",
    context="/document",
    inputs=[
        InputFieldMappingEntry(name="condition", source="= $(/document/metadata_storage_size)>180"),
        InputFieldMappingEntry(name="whenTrue", source="/document/metadata_storage_name"),
        InputFieldMappingEntry(name="whenFalse", source="/document/metadata_storage_path"),
    ],
    outputs=[OutputFieldMappingEntry(name="output", target_name="conditionResult")],
)
merge_skill = MergeSkill(
    name=f"{index_name}-merge-skill",
    description="Merge text from OCR and text from document",
    context="/document",
    inputs=[
        InputFieldMappingEntry(name="itemsToInsert", source="/document/pages/*"),
        InputFieldMappingEntry(name="text", source="/document/metadata_storage_name"),
        # InputFieldMappingEntry(name="text", source="/document/content"),
        # InputFieldMappingEntry(
        #     name="itemsToInsert", source="/document/normalized_images/*/text"
        # ),
        # InputFieldMappingEntry(
        #     name="offsets", source="/document/normalized_images/*/contentOffset"
        # ),
    ],
    outputs=[OutputFieldMappingEntry(name="mergedText", target_name="mergedContent")],
)
skillset = SearchIndexerSkillset(
    name=skillset_name,  # skillset_name is defined elsewhere, e.g. f"{index_name}-skillset"
    description="Skillset to chunk documents and generate embeddings",
    # document_extraction_skill is a DocumentExtractionSkill defined elsewhere in these notes
    skills=[split_skill, conditional_skill, merge_skill, document_extraction_skill],
    # index_projection=index_projection,
)
indexer = SearchIndexer(
    name=f"{index_name}-indexer",  # assumed name; SearchIndexer also requires a data source and target index
    data_source_name=data_source_name,  # data source created elsewhere
    target_index_name=index_name,
    skillset_name=skillset_name,
    output_field_mappings=[
        FieldMapping(source_field_name="/document/pages/*", target_field_name="pages"),
        FieldMapping(source_field_name="/document/conditionResult", target_field_name="conditionResult"),
        FieldMapping(source_field_name="/document/mergedContent", target_field_name="mergedContent"),
    ],
)
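
The snippet above only constructs the objects. A minimal sketch of registering them with SearchIndexerClient, assuming the same endpoint and credential conventions as the preview script and a data source that already exists:

from azure.identity import DefaultAzureCredential
from azure.search.documents.indexes import SearchIndexerClient

indexer_client = SearchIndexerClient(endpoint=service_endpoint, credential=DefaultAzureCredential())
indexer_client.create_or_update_skillset(skillset)
indexer_client.create_or_update_indexer(indexer)  # a newly created indexer runs once by default
indexer_client.run_indexer(indexer.name)  # re-run on demand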
