Implementing RAG with a Knowledge Graph in LangChain [Part 2]


This is only a small experiment, but I tried out the content of Part 1 in a more practical setting.
I loaded about 30,000 characters of the Wikipedia article on 寄生獣 (Parasyte) into Neo4j and ran RAG over it.

Summary

  • It is better to do name resolution (entity deduplication) before loading data into Neo4j
  • Fuzzy search against the graph DB can reduce accuracy for Japanese in some cases

Storing the data in Neo4j

Program

This is almost the same as the program in Part 1, so I will skip the explanation. The major difference is that the source text is loaded from Wikipedia.

from dotenv import load_dotenv
from langchain_community.document_loaders import WikipediaLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import pandas as pd

load_dotenv()

docs = WikipediaLoader(query="寄生獣", 
                       lang='ja',
                       doc_content_chars_max=30_000,
                       load_max_docs=1).load()

text_splitter = RecursiveCharacterTextSplitter(
    separators=[
        "\n\n",
        "\n",
        " ",
        ".",
        ",",
        "\u200b",  # Zero-width space
        "\uff0c",  # Fullwidth comma
        "\u3001",  # Ideographic comma
        "\uff0e",  # Fullwidth full stop
        "\u3002",  # Ideographic full stop
        "",
    ],
    # chunk_size and chunk_overlap are left at their defaults here
)

texts = text_splitter.create_documents([docs[0].page_content])

graph = Neo4jGraph()
graph.query("MATCH (n)  DETACH DELETE n;") 

llm = ChatOpenAI(temperature=0, model_name="gpt-4o")
llm_transformer = LLMGraphTransformer(llm=llm)

# Convert the text chunks into a graph with the LLM
graph_documents = llm_transformer.convert_to_graph_documents(texts)
graph.add_graph_documents(
    graph_documents,
    baseEntityLabel=True,  # add a common __Entity__ label to every extracted node
    include_source=True    # link entities to their source chunks via MENTIONS
)

vector_index = Neo4jVector.from_existing_graph(
    OpenAIEmbeddings(model='text-embedding-3-small'),
    search_type="hybrid",  # combine vector similarity with keyword search
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding"
)

# Create a full-text index over entity ids
graph.query(
    "CREATE FULLTEXT INDEX entity IF NOT EXISTS FOR (e:__Entity__) ON EACH [e.id]")

Loading results

Run the following Cypher from the Neo4j Console. The !MENTIONS relationship-type expression matches every relationship type except MENTIONS, so the links to the source Document chunks are excluded:

MATCH (s)-[r:!MENTIONS]->(t) RETURN s,r,t LIMIT 50

This is a rough view of the graph. It is impressive how well it is built from plain prose, but the same character has been split into three nodes: 「新一」, 「泉 新一」, and 「泉新一」.
Also, depending on the purpose, nodes such as 「日本テレビ」 (Nippon TV) and 「月刊アフタヌーン」 (Monthly Afternoon) are often unnecessary. This can likely be adjusted through the prompt, or by deleting nodes with certain labels before insertion.
[Figure: the extracted graph]
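
One way to do the name resolution mentioned in the summary is to merge the duplicate nodes after loading. A minimal sketch of my own, assuming the APOC plugin is installed on the Neo4j server:

# Merge the three 新一 nodes into one (name resolution).
# Assumption: APOC is available on the Neo4j instance.
graph.query("""
MATCH (n:__Entity__)
WHERE n.id IN ['新一', '泉 新一', '泉新一']
WITH collect(n) AS nodes
CALL apoc.refactor.mergeNodes(nodes, {properties: 'discard', mergeRels: true})
YIELD node
RETURN node.id
""")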

Running the LLM with RAG

Program

This is also almost the same as Part 1, so I will skip the explanation.

import os
from typing import List

from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

graph = Neo4jGraph()

vector_index = Neo4jVector.from_existing_index(
    OpenAIEmbeddings(model='text-embedding-3-small'),
    url=os.environ["NEO4J_URI"],
    username=os.environ["NEO4J_USERNAME"],
    password=os.environ["NEO4J_PASSWORD"],
    index_name="vector",
)

class Entities(BaseModel):
    """Identifying information about entities."""

    names: List[str] = Field(
        ...,
        description="All the person, organization, or business entities that "
        "appear in the text",
    )

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are extracting organization and person entities from the text.",
        ),
        (
            "human",
            "Use the given format to extract information from the following "
            "input: {question}",
        ),
    ]
)

# Chain for entity extraction
llm_ner = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")
entity_chain = prompt | llm_ner.with_structured_output(Entities)
entity_chain.invoke({"question": "花田一路はどんな人間?"}).names
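# Illustrative output (my note, not from an actual run): a list of the
# extracted names, e.g. ["花田一路"]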

def generate_full_text_query(input: str) -> str:
    """
    Generate a full-text search query for a given input string.

    This function constructs a query string suitable for a full-text search.
    It processes the input string by splitting it into words and appending a
    similarity threshold (~2 changed characters) to each word, then combines
    them using the AND operator. Useful for mapping entities from user questions
    to database values, and allows for some misspellings.
    """
    full_text_query = ""

    # remove_lucene_chars strips Lucene special characters such as |
    # The split is on whitespace, so this does not segment Japanese text.
    # The trailing `if el` drops empty strings; with no-argument split()
    # it is actually redundant, but harmless.
    words = [el for el in remove_lucene_chars(input).split() if el]
    if not words:
        return ""

    # Append "~2 AND" to every word except the last (fuzzy match allowing
    # up to 2 edits). The non-fuzzy variant used later in this article
    # simply drops the "~2" suffixes:
    #     full_text_query += f" {word} AND"
    #     full_text_query += f" {words[-1]}"
    for word in words[:-1]:
        full_text_query += f" {word}~2 AND"
    full_text_query += f" {words[-1]}~2"
    return full_text_query.strip()
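
# Illustration (my addition): whitespace-separated words are AND-ed with
# ~2 fuzziness, while Japanese input without spaces stays one fuzzy term.
print(generate_full_text_query("泉 新一"))  # -> 泉~2 AND 新一~2
print(generate_full_text_query("新一"))     # -> 新一~2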

# Fulltext index query
def structured_retriever(question: str) -> str:
    """
    Collects the neighborhood of entities mentioned
    in the question
    """
    result = ""

    # Entity Extraction
    entities = entity_chain.invoke({"question": question})

    for entity in entities.names:
        print(generate_full_text_query(entity))
        # UNION the outgoing and the incoming relationships
        response = graph.query(
            """CALL db.index.fulltext.queryNodes('entity', $query, {limit:2})
            YIELD node,score
            CALL {
              WITH node
              MATCH (node)-[r:!MENTIONS]->(neighbor)
              RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
              UNION ALL
              WITH node
              MATCH (node)<-[r:!MENTIONS]-(neighbor)
              RETURN neighbor.id + ' - ' + type(r) + ' -> ' +  node.id AS output
            }
            RETURN output LIMIT 50
            """,
            {"query": generate_full_text_query(entity)},
        )
        result += "\n".join([el['output'] for el in response])
    return result
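
# Quick check (my addition): inspect what the graph alone contributes
# for a question before wiring it into the full chain.
# print(structured_retriever("新一の敵は?"))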

def retriever(question: str):
    print(f"Search query: {question}")
    structured_data = structured_retriever(question)
    unstructured_data = [el.page_content for el in vector_index.similarity_search(question)]
    final_data = f"""Structured data:
{structured_data}
Unstructured data:
{"#Document ".join(unstructured_data)}
    """
    return final_data

template = """Answer the question based only on the following context:
{context}

Question: {question}
Use natural language and be concise.
Answer:"""
prompt = ChatPromptTemplate.from_template(template)

chain = (
    RunnableParallel(
        {
            "context": retriever,
            "question": RunnablePassthrough(),
        }
    )
    | prompt
    | ChatOpenAI(temperature=0, model_name="gpt-4o")
    | StrOutputParser()
)

chain.invoke("新一の敵は?")

Here is the result for the question 「新一の敵は?」 ("Who are Shinichi's enemies?"). The answer is correct, but very rough...

Result
新一の敵は、パラサイト(寄生生物)です。 ("Shinichi's enemies are the parasites.")

Checking the details

Looking at the trace in LangSmith, the data retrieved from Neo4j (the Structured data) has nothing to do with 新一.

[Screenshot: LangSmith trace of the retrieved structured data]

Entity extraction itself is fine:

[Screenshot: entity extraction result]

The problem is the 新一~2 term in this Cypher: the fuzziness is too large.

CALL db.index.fulltext.queryNodes('entity', "新一~2", {limit:2})
 YIELD node,score
 CALL {
   WITH node
   MATCH (node)-[r:!MENTIONS]->(neighbor)
   RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
   UNION ALL
   WITH node
   MATCH (node)<-[r:!MENTIONS]-(neighbor)
   RETURN neighbor.id + ' - ' + type(r) + ' -> ' +  node.id AS output
   }
 RETURN output LIMIT 50

Dropping the ~2 as a test makes it work:

Without fuzzy search
CALL db.index.fulltext.queryNodes('entity', "新一", {limit:2})
 YIELD node,score
 CALL {
   WITH node
   MATCH (node)-[r:!MENTIONS]->(neighbor)
   RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
   UNION ALL
   WITH node
   MATCH (node)<-[r:!MENTIONS]-(neighbor)
   RETURN neighbor.id + ' - ' + type(r) + ' -> ' +  node.id AS output
   }
 RETURN output LIMIT 50

[Screenshot: query result]

I changed the generate_full_text_query function in the Python code accordingly (a sketch of the change follows the result below) and reran it. This time, clear and specific enemies come out.

新一の敵は、島田秀雄、後藤、浦上です。 ("Shinichi's enemies are Hideo Shimada, Goto, and Uragami.")
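
The final function isn't shown in the article, but the commented-out lines in generate_full_text_query above indicate the change. A minimal sketch of the non-fuzzy version:

def generate_full_text_query(input: str) -> str:
    # Same as before, but without the ~2 fuzziness suffix on each word
    words = [el for el in remove_lucene_chars(input).split() if el]
    return " AND ".join(words)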

Example 1

Some more questions. These are answered well too, although notably the graph results are not actually used here.
[Screenshots: Example 1 question and answer]

Example 2

This one also works well.
[Screenshot: Example 2 result]
