背景
Azure AI Search で Index に何が格納されているか?を確認する方法が欲しかったので用意した記録
サンプルコード
AZURE_SEARCH_SERVICE
の対象とする index_name
内のドキュメントを一覧表示します
JSON化してファイルへ出力するようにしてみた。
- filter: OData $filter で指定。eq, ne などはいけるが、startswith() は未対応
- top: 取得数。None で全部
preview documents in index
import os
import json
from azure.identity import DefaultAzureCredential
from azure.search.documents import SearchClient
from dotenv import load_dotenv
from icecream import ic
# Load configuration from the azd-managed .env file into the environment.
def load_environment(env_name: str = "azd env name") -> None:
    """Load ``.azure/<env_name>/.env`` into the process environment.

    Args:
        env_name: Name of the azd environment directory under ``.azure``.
            Defaults to the original hard-coded placeholder so existing
            callers keep working.
    """
    dotenv_path = os.path.join(".azure", env_name, ".env")
    load_dotenv(dotenv_path)
def get_search_client(search_service, index_name):
    """Build a SearchClient for *index_name* on the given search service.

    Authentication uses DefaultAzureCredential (managed identity, CLI
    login, etc.), not an admin key.
    """
    endpoint = f"https://{search_service}.search.windows.net"
    return SearchClient(
        endpoint=endpoint,
        index_name=index_name,
        credential=DefaultAzureCredential(),
    )
def get_documents(search_client, top=None, filter=None):
    """Run a match-all query and return the hits as a list.

    Args:
        search_client: Client exposing ``search(search_text=, top=, filter=)``.
        top: Maximum number of documents to fetch; ``None`` fetches all.
        filter: OData ``$filter`` expression, or ``None`` for no filtering.
            (The name shadows the builtin, but it is part of the keyword
            interface and is kept for callers.)
    """
    return list(search_client.search(search_text="*", top=top, filter=filter))
def process_documents(documents):
    """Clean documents for human-readable output.

    Drops ``None``-valued fields and truncates any ``embedding`` vector to
    its first element so the JSON dump stays small. Each cleaned document
    is echoed via ``ic`` for debugging.
    """
    processed = []
    for document in documents:
        cleaned = {key: value for key, value in document.items() if value is not None}
        if "embedding" in cleaned:
            # Keep only the first element as a sample of the vector.
            cleaned["embedding"] = cleaned["embedding"][:1]
        processed.append(cleaned)
        ic(cleaned)
    return processed
def write_to_json(file_path, data):
    """Serialize *data* to *file_path* as pretty-printed UTF-8 JSON."""
    with open(file_path, mode="w", encoding="utf-8") as fp:
        json.dump(data, fp, ensure_ascii=False, indent=4)
def main():
    """Fetch documents from the target index and dump them to a JSON file."""
    load_environment()
    search_service = os.getenv("AZURE_SEARCH_SERVICE")
    index_name = "index_name"  # target index name
    client = get_search_client(search_service, index_name)

    odata_filter = "sourcefile eq 'filename.pdf'"
    number_of_documents = 10  # set to None to fetch every document
    documents = get_documents(client, number_of_documents, filter=odata_filter)

    output_file_path = f"indexDocuments_{index_name}.json"  # index name in the file name
    if not documents:
        print("No documents found.")
    else:
        processed_docs = process_documents(documents)
        write_to_json(output_file_path, processed_docs)
        print(f"Retrieved {len(documents)} documents and saved to {output_file_path}.")


if __name__ == "__main__":
    main()
あとがき
新しい機能がバンバン入ってくるわけで……skillset とか試したいところ
skillset を試した記録
# Split skill: chunk each document's text into overlapping pages.
# NOTE: the pasted snippet contained a stray `uuid = ...` line (a leaked
# notebook cell id) inside this call, which is a syntax error; removed.
split_skill = SplitSkill(
    name=f"{index_name}-split-skill",
    description="Split skill to chunk documents",
    text_split_mode="pages",
    context="/document",
    default_language_code="ja",
    maximum_page_length=2048,
    page_overlap_length=20,
    inputs=[
        InputFieldMappingEntry(name="text", source="/document/metadata_storage_name"),
    ],
    outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
)
# Conditional skill: pick the storage name for blobs larger than 180 bytes,
# otherwise fall back to the storage path.
conditional_inputs = [
    InputFieldMappingEntry(name="condition", source="= $(/document/metadata_storage_size)>180"),
    InputFieldMappingEntry(name="whenTrue", source="/document/metadata_storage_name"),
    InputFieldMappingEntry(name="whenFalse", source="/document/metadata_storage_path"),
]
conditional_skill = ConditionalSkill(
    name=f"{index_name}-conditional-skill",
    description="Conditional skill to extract file extension from file name",
    context="/document",
    inputs=conditional_inputs,
    outputs=[OutputFieldMappingEntry(name="output", target_name="conditionResult")],
)
# Merge skill: insert the split pages into the base text.
merge_skill = MergeSkill(
    name=f"{index_name}-merge-skill",
    description="Merge text from OCR and text from document",
    context="/document",
    inputs=[
        InputFieldMappingEntry(name="itemsToInsert", source="/document/pages/*"),
        InputFieldMappingEntry(name="text", source="/document/metadata_storage_name"),
        # Alternative sources can be wired in here instead, e.g.
        # text <- /document/content, itemsToInsert/offsets <- the OCR output
        # under /document/normalized_images/*.
    ],
    outputs=[OutputFieldMappingEntry(name="mergedText", target_name="mergedContent")],
)
# Assemble the skillset from the individual skills defined above.
skillset_skills = [split_skill, conditional_skill, merge_skill, document_extraction_skill]
skillset = SearchIndexerSkillset(
    name=skillset_name,
    description="Skillset to chunk documents and generate embeddings",
    skills=skillset_skills,
    # index_projection=index_projection,
)
# Indexer: map each skill output under /document onto an index field.
output_mappings = [
    FieldMapping(source_field_name="/document/pages/*", target_field_name="pages"),
    FieldMapping(source_field_name="/document/conditionResult", target_field_name="conditionResult"),
    FieldMapping(source_field_name="/document/mergedContent", target_field_name="mergedContent"),
]
indexer = SearchIndexer(
    skillset_name=skillset_name,
    output_field_mappings=output_mappings,
)