DatabricksでNebulaGraphを用いた知識グラフベースRAGを動かしてみる(日本語編)

Posted at 2024-01-23

こちらの続きです。

日本語の文書を使って知識グラフベースのRAGを動かします。

前回と同様にDatabricksクラスターにNebulaGraphがインストールされるようにします。

ライブラリのインストール

%pip install -U llama-index
%pip install ipython-ngql nebula3-python pyvis
%pip install "databricks-sql-connector[sqlalchemy]"
dbutils.library.restartPython()

ライブラリのインポート

import os

# OpenAI APIキー
os.environ["OPENAI_API_KEY"] = dbutils.secrets.get("demo-token-takaaki.yayoi", "openai_api_key")

import logging
import sys

logging.basicConfig(
    stream=sys.stdout, level=logging.DEBUG #logging.INFO
)  # より詳細なログを出力するには logging.DEBUG

from llama_index import (
    KnowledgeGraphIndex,
    ServiceContext,
    SimpleDirectoryReader,
)
from llama_index.storage.storage_context import StorageContext
from llama_index.graph_stores import NebulaGraphStore
from llama_index.llms import OpenAI

from IPython.display import Markdown, display

# LLMを定義
llm = OpenAI(temperature=0, model="gpt-3.5-turbo-instruct")
service_context = ServiceContext.from_defaults(llm=llm, chunk_size_limit=512)

NeburaGraphとの接続

os.environ["NEBULA_USER"] = "root"
os.environ["NEBULA_PASSWORD"] = "nebula"  # デフォルトは "nebula"
os.environ[
    "NEBULA_ADDRESS"
] = "127.0.0.1:9669"  # ローカルにNeburaGraphがインストールされている前提

space_name = "llamaindex"
edge_types, rel_prop_names = ["relationship"], [
    "relationship"
]  # デフォルト値、空の知識グラフから作成する際には省略可能
tags = ["entity"]  # デフォルト値、空の知識グラフから作成する際には省略可能

スペースの作成

%load_ext ngql
%ngql --address 127.0.0.1 --port 9669 --user root --password nebula

%ngql add hosts 127.0.0.1:9779

すぐに実行するとエラーになるのでスリープ時間を挿入します。

import time
time.sleep(10)

%ngql CREATE SPACE llamaindex(vid_type=FIXED_STRING(256), partition_num=1, replica_factor=1);

time.sleep(10)

タグとエッジ、インデックスの作成

%ngql USE llamaindex;
%ngql CREATE TAG entity(name string);
%ngql CREATE EDGE relationship(relationship string);

time.sleep(10)

%ngql CREATE TAG INDEX entity_index ON entity(name(256));

グラフストアの作成

graph_store = NebulaGraphStore(
    space_name=space_name,
    edge_types=edge_types,
    rel_prop_names=rel_prop_names,
    tags=tags,
)
storage_context = StorageContext.from_defaults(graph_store=graph_store)

Unity Catalogとの接続

from llama_index import download_loader
from llama_index.utilities.sql_wrapper import SQLDatabase

DatabaseReader = download_loader('DatabaseReader')

# パーソナルアクセストークン
access_token    = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().getOrElse(None)
# Databricksワークスペースホスト名
server_hostname = "<ワークスペースのホスト名>"
# クラスターのHTTPパス
http_path       = "sql/protocolv1/o/2556758628403379/1118-004519-nu0u899x"
# Unity Catalogカタログ
catalog         = "takaakiyayoi_catalog"
# スキーマ(データベース)
schema          = "qiita_2023"

reader = DatabaseReader(
    sql_database = SQLDatabase.from_uri(f"databricks://token:{access_token}@{server_hostname}?" +
        f"http_path={http_path}&catalog={catalog}&schema={schema}")
)

テキストデータのロード

# テキストを格納しているカラムに応じてSQLを変更します
query = f"""
SELECT
    body
FROM takaakiyayoi_catalog.qiita_2023.taka_qiita_2023 LIMIT 100
"""

documents = reader.load_data(query=query)

知識グラフインデックスの作成

それなりに時間を要するので、最初はドキュメントの数を減らした方がいいかもしれません。

kg_index = KnowledgeGraphIndex.from_documents(
    documents,
    storage_context=storage_context,
    max_triplets_per_chunk=10,
    service_context=service_context,
    space_name=space_name,
    edge_types=edge_types,
    rel_prop_names=rel_prop_names,
    tags=tags,
    include_embeddings=True,
)

知識グラフへのクエリー

%ngql USE llamaindex;
%ngql MATCH ()-[e]->() RETURN e LIMIT 10

# draw the result
%ng_draw

前回はノートブックにレンダリングできませんでしたが、以下の方法でレンダリングできました。

html_file_content = open("/Workspace/Users/takaaki.yayoi@databricks.com/20240122_nebula_llama/nebulagraph.html", 'r').read()
displayHTML(html_file_content)

RAGへの問い合わせ

from llama_index.query_engine import KnowledgeGraphQueryEngine

from llama_index.storage.storage_context import StorageContext
from llama_index.graph_stores import NebulaGraphStore

query_engine = KnowledgeGraphQueryEngine(
    storage_context=storage_context,
    service_context=service_context,
    llm=llm,
    verbose=True,
)

response = query_engine.query(
    "Databricksとは",
)
display(Markdown(f"<b>{response}</b>"))

response = query_engine.query(
    "Apache Sparkとは",
)
display(Markdown(f"<b>{response}</b>"))

動いてますね。

もう少し挙動を理解してチューニングできるところあればしたいところです。

はじめてのDatabricks

Databricks無料トライアル

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up