More than 1 year has passed since last update.

SharePointオンラインをCognitiveSearchのデータソースとして使用する

Posted at 2024-01-27

はじめに

Cognitive SearchのSharePointインデクサーがパブリックプレビューとして提供されており、その手順がMS Learn上で公開されていました！

対象者

この記事は下記のような人を対象にしています。

駆け出しエンジニア
プログラミング初学者

前準備

以下大まかな手順となるコードをまとめました。

変数の定義

# Required settings used by the snippets that follow.

# Cognitive Search service information
search_service_name = "your_searchservice_name"
index_name = "your_index_name"
api_version = "2023-07-01-Preview"
api_key = "＊＊＊＊＊＊＊＊"

# Name of the indexer to create
indexer_name = "test_indexer"

# Name of the data source to create
data_source_name = "test_datasource"

# Information needed to build the SharePoint site connection string.
# This is a plain string (not an f-string), so {tenantname} and {sitename}
# are literal placeholders to be replaced by hand.
sharepoint_endpoint = "https://{tenantname}.sharepoint.com/sites/{sitename}"
application_id = "*****"
# The following two are only required for multi-tenant applications
application_secret = "＊＊＊＊＊＊＊＊＊＊＊＊"
tenant_id = "＊＊＊＊＊＊＊"

データソースの作成

Cognitive SearchがSharePointにアクセスするために必要な接続文字列などを含むデータソースを作成します。

以下のセルを実行するとCognitiveSearch上にデータソースが作成されます。

import requests
# Create the data source on the search service. It carries the SharePoint
# connection string that the indexer will use to reach the site.
url = f"https://{search_service_name}.search.windows.net/datasources?api-version={api_version}"

jsondata = {
    "name": data_source_name,
    "type": "sharepoint",
    "credentials": {
        # Adjacent f-strings are concatenated into one ';'-separated string.
        "connectionString": (
            f"SharePointOnlineEndpoint={sharepoint_endpoint};"
            f"ApplicationId={application_id};"
            f"ApplicationSecret={application_secret};"
            f"TenantId={tenant_id}"
        )
    },
    "container": {"name": "defaultSiteLibrary", "query": None},
}

# requests waits forever by default; set a timeout so an unreachable
# service fails the cell instead of hanging it.
response = requests.post(
    url,
    headers={"Content-Type": "application/json", "api-key": api_key},
    json=jsondata,
    timeout=30,
)
print(response.status_code, response.text)

空のIndexを作成

インデクサーがドキュメントを取り込む際のターゲットとなる、空のインデックスを作成する。

import requests
import json


# Create the (empty) target index that the indexer will later populate.
url = f"https://{search_service_name}.search.windows.net/indexes?api-version={api_version}"

headers = {
    "Content-Type": "application/json",
    "api-key": api_key,
}
data = {
    "name": index_name,
    "fields": [
        # "id" is the document key; the indexer snippet maps a SharePoint
        # item id into this field.
        {"name": "id", "type": "Edm.String", "key": True, "searchable": False},
        {"name": "metadata_spo_item_name", "type": "Edm.String", "key": False, "searchable": True, "filterable": False, "sortable": False, "facetable": False},
        {"name": "metadata_spo_item_path", "type": "Edm.String", "key": False, "searchable": False, "filterable": False, "sortable": False, "facetable": False},
        {"name": "metadata_spo_item_content_type", "type": "Edm.String", "key": False, "searchable": False, "filterable": True, "sortable": False, "facetable": True},
        {"name": "metadata_spo_item_last_modified", "type": "Edm.DateTimeOffset", "key": False, "searchable": False, "filterable": False, "sortable": True, "facetable": False},
        {"name": "metadata_spo_item_size", "type": "Edm.Int64", "key": False, "searchable": False, "filterable": False, "sortable": False, "facetable": False},
        {"name": "content", "type": "Edm.String", "searchable": True, "filterable": False, "sortable": False, "facetable": False},
    ],
}
# requests waits forever by default; set a timeout so an unreachable
# service fails the cell instead of hanging it.
response = requests.post(url, headers=headers, json=data, timeout=30)

print(response.status_code)  # show the HTTP status code of the response
try:
    print(response.json())
except ValueError:
    # The body was empty or not JSON; show the raw text rather than masking
    # the real outcome with a decode error.
    print(response.text)

インデクサーの作成

インデクサーとはインデックスを作成するためのクローラーの様な物。

先程作成したデータソースを用いることで、対象のSharePointサイト内のドキュメントがインデックスの作成対象となる。

インデクサーのスケジュール設定なども行える。作成したインデクサーはAzure Portalから手動で実行することも可能。

import requests
import json
# Create the indexer that connects the data source to the target index.

# Request URL
url = f"https://{search_service_name}.search.windows.net/indexers?api-version={api_version}"

# Request headers
headers = {
    "Content-Type": "application/json",
    "api-key": api_key,
}
# Request body
data = {
    "name": indexer_name,
    "dataSourceName": data_source_name,
    "targetIndexName": index_name,
    "parameters": {
        "batchSize": None,
        "maxFailedItems": None,
        "maxFailedItemsPerBatch": None,
        "base64EncodeKeys": None,
        "configuration": {
            "indexedFileNameExtensions": ".CSV, .EML, .JSON, .KML, .DOCX, .DOC, .DOCM, .XLSX, .XLS, .XLSM, .PPTX, .PPT, .PPTM, .MSG, .XML, .ODT, .ODS, .ODP, .PDF, .TXT, .XML, .ZIP",
            "excludedFileNameExtensions": ".png, .jpg, .EPUB, .GZ, .HTML, .RTF",
            "dataToExtract": "contentAndMetadata",
        },
    },
    "schedule": {},
    "fieldMappings": [
        {
            # Fill the index key field "id" from the SharePoint item id,
            # passed through the base64Encode mapping function.
            "sourceFieldName": "metadata_spo_site_library_item_id",
            "targetFieldName": "id",
            "mappingFunction": {"name": "base64Encode"},
        }
    ],
}
# Send the POST request. `json=` serializes the body the same way the other
# snippets do; the timeout keeps an unreachable service from hanging the
# cell, since requests waits forever by default.
response = requests.post(url, headers=headers, json=data, timeout=30)
# Show the response
print(response.text)

LangChainでテスト

最後に作成したindexに対して検索を実行し、indexが作成されているか確認してみます。今回はLangChainを使って検索をかけてみました。

# Install LangChain, which provides the retriever imported below.
pip install langchain

import os
from langchain.retrievers import AzureCognitiveSearchRetriever

# Set the three AZURE_COGNITIVE_SEARCH_* environment variables from the
# values defined earlier, before constructing the retriever.
os.environ.update(
    {
        "AZURE_COGNITIVE_SEARCH_SERVICE_NAME": search_service_name,
        "AZURE_COGNITIVE_SEARCH_INDEX_NAME": index_name,
        "AZURE_COGNITIVE_SEARCH_API_KEY": api_key,
    }
)

# Use the index's "content" field as the document text; top_k is set to 10.
retriever = AzureCognitiveSearchRetriever(
    content_key="content",
    top_k=10,
)

# Run a sample query to confirm the index has been populated.
retriever.get_relevant_documents("検索ワード")

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up