More than 1 year has passed since last update.

LangChainにオープンな言語モデルを組み込んでアレコレしてみる (3) ~チャットボット~

Last updated at 2023-05-10 / Posted at 2023-05-09

完成品

動作イメージ

※モデルのロードに少し時間がかかっています。
しばらくするとチャット風景に移ります

ソースコード

変数名やクラス化・関数化は大目にみてください、、殴り書きです、、

chatbot/chatbot_langchain.py

from langchain import HuggingFacePipeline, PromptTemplate, LLMChain
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import StoppingCriteria
from transformers import StoppingCriteriaList
from langchain.memory import ConversationBufferMemory
import torch

def model_setup(model_id: str):
    """Load the causal language model and its tokenizer for ``model_id``.

    Prints a banner naming the model, then calls ``from_pretrained`` on
    ``AutoModelForCausalLM`` and on ``AutoTokenizer`` (in that order).

    Args:
        model_id: Identifier passed unchanged to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    print(f"!!! Downloading Model from {model_id} !!!")

    # Tuple elements are evaluated left to right, so the model is still
    # loaded before the tokenizer.
    return (
        AutoModelForCausalLM.from_pretrained(model_id),
        AutoTokenizer.from_pretrained(model_id),
    )


def pipeline_setup(model, tokenizer, isGPU: bool, **kwargs) -> HuggingFacePipeline:
    """Build a text-generation pipeline and wrap it for LangChain.

    Args:
        model: Model object handed to ``transformers.pipeline``; moved with
            ``model.to(...)`` when ``isGPU`` is true.
        tokenizer: Tokenizer handed to ``transformers.pipeline``.
        isGPU: Request GPU execution. When CUDA is not available the function
            falls back to the CPU instead of asking the pipeline for GPU 0.
        **kwargs: Extra keyword arguments forwarded to ``pipeline``
            (e.g. ``temperature``, ``max_length``, ``stopping_criteria``).
            ``pad_token_id`` defaults to 0 but may be overridden here.

    Returns:
        A ``HuggingFacePipeline`` wrapping the created pipeline.
    """
    # Only use CUDA when it was requested AND is actually present; otherwise
    # the model would sit on the CPU while the pipeline is told to use GPU 0.
    use_cuda = bool(isGPU) and torch.cuda.is_available()

    if isGPU:
        device = torch.device("cuda" if use_cuda else "cpu")
        print(f"\n!!! current device is {device} !!!")
        model = model.to(device)

    # pipeline() takes a device ordinal: 0 = first GPU, -1 = CPU.
    if use_cuda:
        pipeline_device = 0
        framework = 'pt'
    else:
        pipeline_device = -1
        framework = None

    # Keep the original default, but let the caller override it without
    # triggering a "multiple values for keyword argument" TypeError.
    kwargs.setdefault("pad_token_id", 0)

    # Create the pipeline.
    task = "text-generation"
    pipe = pipeline(
        task,
        model=model,
        tokenizer=tokenizer,
        device=pipeline_device,
        framework=framework,
        **kwargs
    )

    # LLMs: convert into a form LangChain can use.
    llm = HuggingFacePipeline(pipeline=pipe)

    print("!!! Pipeline Setup Completed !!!\n\n")

    return llm



# Stopの条件を設定するクラスを作成 (StoppingCriteriaを継承する)
class MyStoppingCriteria(StoppingCriteria):
    """Stop ``generate()`` once the stop string has been emitted repeatedly.

    Generation stops when the last ``num_iter`` tokens all equal the stop
    token, or when the newest token is the single token that encodes the stop
    string written twice (for example two consecutive newlines).

    Only the first sequence of the batch (``input_ids[0]``) is inspected, and
    only the first token id of each encoded stop string is used.
    """

    def __init__(self, stop_str, num_iter, tokenizer, isGPU):
        """Encode the stop strings and remember the settings.

        Args:
            stop_str: Text whose repetition ends generation.
            num_iter: Number of consecutive stop tokens that ends generation.
            tokenizer: Callable that returns a mapping with an ``"input_ids"``
                tensor when called with ``return_tensors='pt'``.
            isGPU: When true and CUDA is available, the stored id tensors are
                moved to CUDA.
        """
        # Both devices must encode the doubled stop string here; encoding the
        # single string on CPU made one stop token enough to end generation.
        self.stop_token_ids = tokenizer(stop_str, return_tensors='pt')["input_ids"]
        self.stop_token_ids_iter = tokenizer(stop_str * 2, return_tensors='pt')["input_ids"]
        if isGPU and torch.cuda.is_available():
            self.stop_token_ids = self.stop_token_ids.to('cuda')
            self.stop_token_ids_iter = self.stop_token_ids_iter.to('cuda')

        self.num_iter = num_iter
        self.tokenizer = tokenizer

        # Plain ints can be compared with ids that live on any device.
        self._stop_id = int(self.stop_token_ids[0][0])
        doubled = self.stop_token_ids_iter[0]
        # The doubled string is a usable shortcut only if it is one token;
        # otherwise the repetition check below already covers it.
        self._stop_iter_id = int(doubled[0]) if doubled.shape[0] == 1 else None

    def _should_stop(self, sequence):
        """Return True when the 1-D id tensor ``sequence`` ends with a stop pattern."""
        length = sequence.shape[0]
        if length == 0:
            return False

        # Check the newest token, not the one ``num_iter`` positions back.
        if self._stop_iter_id is not None and int(sequence[-1]) == self._stop_iter_id:
            return True

        # Too short to contain ``num_iter`` stop tokens (avoids IndexError).
        if length < self.num_iter:
            return False

        tail = sequence[length - self.num_iter:]
        return bool((tail == self._stop_id).all())

    def __call__(self, input_ids: torch.LongTensor, score: torch.FloatTensor, **kwargs):
        """Return True to stop generation, False to continue."""
        return self._should_stop(input_ids[0])
    
    
def chat_chain_setup(template, llm) -> LLMChain:
    """Assemble an ``LLMChain`` from a prompt template, an LLM and a chat memory.

    Args:
        template: Prompt text containing the ``{chat_history}`` and
            ``{input}`` placeholders.
        llm: Language model object registered on the chain.

    Returns:
        The ``LLMChain`` wired with the prompt, the model and a
        ``ConversationBufferMemory`` keyed by ``"chat_history"``.
    """
    # Memory: conversation buffer with an empty AI prefix.
    history_key = "chat_history"
    conversation_memory = ConversationBufferMemory(memory_key=history_key, ai_prefix="")

    # Prompt: the history and the new user input are the two variables.
    chat_prompt = PromptTemplate(
        template=template,
        input_variables=["chat_history", "input"],
    )

    # Chain: register prompt, model and memory together.
    return LLMChain(llm=llm, prompt=chat_prompt, memory=conversation_memory)


def main(isGPU=False):
    """Run an interactive console chatbot backed by a local HuggingFace model.

    Reads lines from standard input, sends each one through the chat chain
    and prints the reply. Typing ``exit`` ends the loop.

    Args:
        isGPU: Passed both to the stopping criteria and to ``pipeline_setup``
            so that they agree on the device.
    """
    # Download the model and tokenizer.
    model_id = "andreaskoepf/pythia-1.4b-gpt4all-pretrain"
    model, tokenizer = model_setup(model_id)

    # Stop string, and how many consecutive occurrences of it end generation.
    stop_str = "\n"
    num_iter = 2

    # Forward the caller's isGPU (it used to be hard-coded to True, which
    # moved the stop-token tensors to CUDA even on a CPU run).
    stopcriteria_list = StoppingCriteriaList([MyStoppingCriteria(stop_str, num_iter, tokenizer, isGPU=isGPU)])
    print(stopcriteria_list)

    # Create the HuggingFacePipeline.
    # NOTE(review): max_length presumably bounds prompt + generated tokens
    # together, so a growing chat history may leave no room for a reply —
    # confirm against the transformers generation documentation.
    model_args = {"temperature":0.1, "max_length": 256, "stopping_criteria": stopcriteria_list}
    llm = pipeline_setup(model=model, tokenizer=tokenizer, isGPU=isGPU, **model_args)

    # Prompt template.
    template = """
You are an AI who responds to user Input.
Please provide an answer to the human's question.
Additonaly, you are having a conversation with a human based on past interactions.

### Answer Sample
Human: Hi!
AI: Hi, nice to meet you.

### Past Interactions
{chat_history}

### 
Human:{input}
"""

    # Build the chat chain.
    llm_chain = chat_chain_setup(template, llm)

    # Chat loop.
    while True:
        user_input = input("User: ")
        if user_input == "exit":
            break
        response = llm_chain.predict(input=user_input)
        print(response)


if __name__ == "__main__":
    import sys

    # bool() of any non-empty string is True, so "False" or "0" used to turn
    # the GPU on. Interpret the text instead; any value that is not an
    # explicit negative still selects the GPU, as before.
    negative_flags = {"", "0", "false", "f", "no", "n", "off", "cpu"}
    if len(sys.argv) > 1:
        isGPU = sys.argv[1].strip().lower() not in negative_flags
    else:
        isGPU = False

    if not isGPU:
        print("You are using CPU")

    main(isGPU)

背景

当初の目的はReActの実装でした。今回はその前段階としてチャット機能を作ってみます。
LangChain×オープン言語モデルなので無料です。アツい。
今回使用している言語モデルはGPT4ALLです。

手順①：モデルの読み込み~入力に対する文章生成

まずは、ユーザがなにか入力したらモデルに出力させるところまでを作ります。

手順①に関するソースコード

from langchain import HuggingFacePipeline, PromptTemplate, LLMChain
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

# Check whether a GPU is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"\n!!! current device is {device} !!!\n")

# Download the model and tokenizer.
model_id = "andreaskoepf/pythia-1.4b-gpt4all-pretrain"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)

# Create the pipeline.
task = "text-generation"
pipe = pipeline(
    task,
    model=model,
    tokenizer=tokenizer,
    # Follow the device detected above: ordinal 0 on CUDA, -1 (CPU) otherwise.
    # A hard-coded 0 fails on machines without a GPU.
    device=0 if device.type == "cuda" else -1,
    framework='pt',
    temperature=0.1,
    max_new_tokens=128
)

# LLMs: convert into a form LangChain can use.
llm = HuggingFacePipeline(pipeline=pipe)

# Prompts: build the prompt.
template = """You are an assistant who responds to user Input. \nPlease provide an answer to the user's question, as shown in the following example.\n\nExample:\nQuestion: What is the height of Tokyo Tower in meters?\nAnswer: The height of Tokyo Tower is 333 meters.\n\n###\n\nInput:\n{question}"""
prompt = PromptTemplate(template=template, input_variables=["question"])

# Chains: register the prompt and the model on a chain.
llm_chain = LLMChain(prompt=prompt, llm=llm, verbose=True)

# Ask a question.
# Example: "How can I get end of the list in Python?Take an example of Python Code."
question = input("Enter your question")
generated_text = llm_chain.run(question)
print(generated_text)

1. モデルの用意

pythia-1.4b-gpt4all-pretrain
前回に引き続き、よさげなモデルをHuggingFace Hubから探します。
今回はandreaskoepf/pythia-1.4b-gpt4all-pretrainを使用します。

モデルの読み込み

前回のように、最終的にはHuggingFace HubのモデルをLangChainで使用します。
ここでは、前処理としてPythonパッケージtransformersのAutoModelForCausalLMクラスでモデルを読み込んでいます
- ローカルGPUに転送しているので、不要な方は.to(device)を削除してください。

HuggingFace Hubのモデルを読み込み

from langchain import HuggingFacePipeline, PromptTemplate, LLMChain
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

# Check whether a GPU is available; fall back to the CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"\n!!! current device is {device} !!!\n")

# Download and load the tokenizer and model, then move the model to `device`.
model_id = "andreaskoepf/pythia-1.4b-gpt4all-pretrain"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)

2. パイプラインの作成とLangChainへの統合

transformersのpipeline()を使って、パイプラインを作ります。
ここでは以下を設定しています。
- task: モデルに解かせるタスク
- model: 利用するモデル
- tokenizer: 利用するトークナイザー
- device, framework: モデル入出力のGPU利用有無
- **kwargs: モデルのパラメータ(temperature, max_lengthなど)

パイプラインの作成&LangChainに統合

# Create the pipeline.
task = "text-generation"
pipe = pipeline(
    task,
    model=model,
    tokenizer=tokenizer,
    # Follow the device the model was loaded on: ordinal 0 on CUDA, -1 (CPU)
    # otherwise. A hard-coded 0 fails on machines without a GPU.
    device=0 if device.type == "cuda" else -1,
    framework='pt',
    # Everything below is forwarded as **kwargs.
    temperature=0.1,
    max_new_tokens=256
)

# LLMs: convert into a form LangChain can use.
llm = HuggingFacePipeline(pipeline=pipe)

3. プロンプトと言語モデルをChainsに登録して推論実行

推論に使用する言語モデルを作成できたので、あとはプロンプトを作ってモデルに投げる部分を作ります。
プロンプトを作成後、プロンプト&言語モデルをChainsに登録します。
あとはChainsでrun()を実行すると、LangChainが色々してくれて出力を得られます。

# Prompts: build a one-shot prompt; {question} is filled with the user's input.
template = """You are an assistant who responds to user Input. \nPlease provide an answer to the user's question, as shown in the following example.\n\nExample:\nQuestion: What is the height of Tokyo Tower in meters?\nAnswer: The height of Tokyo Tower is 333 meters.\n\n###\n\nInput:\n{question}"""
prompt = PromptTemplate(template=template, input_variables=["question"])

# Chains: register the prompt and the model on a chain.
llm_chain = LLMChain(prompt=prompt, llm=llm, verbose=True)

# Ask a question and print the generated text.
question = "How can I get end of the list in Python?Take an example of Python Code."
generated_text = llm_chain.run(question)
print(generated_text)

### OutPut (長いので割愛)###
ちゃんとリストのインデックスに-1を指定する方法を出力していました。
しかも解説付きでした。すごい。
##########################

プロンプトの補足

日本語に直すと以下のような感じです
- One-Shotにしています。
- question=ユーザの入力としてます

あなたはユーザーの入力に応答するアシスタントです。
以下の例のように、ユーザーの質問に回答してください。

例:
質問：東京タワーの高さは何メートルですか？
答え：東京タワーの高さは333メートルです。

###
入力:
{question}

手順②: チャットのように対話させる

会話を記憶させ、過去のやり取りに基づいた回答をできるようにします。
- ChatGPTのこの機能便利ですよね、人間並みに過去の文脈を理解してくれる

手順②に関するソースコード

from langchain.memory import ConversationBufferMemory

# Memory: keep the conversation in memory under the "chat_history" key.
# ai_prefix="" replaces the default AI prefix with an empty string.
memory_key = "chat_history"
memory = ConversationBufferMemory(memory_key=memory_key, ai_prefix="")

# Prompts: build the prompt; the chat history is an input variable too.
template = """
You are an AI who responds to user Input.
Please provide an answer to the human's question.
Additonaly, you are having a conversation with a human based on past interactions.

### Answer Sample
Human: Hi!
AI: Hi, nice to meet you.

### Past Interactions
{chat_history}

### 
Human:{input}
"""
prompt = PromptTemplate(template=template, input_variables=["chat_history", "input"])

# Chains: register the prompt, the model and the memory on a chain.
llm_chain = LLMChain(
    llm=llm,
    prompt=prompt,
    memory=memory,
    verbose=True,
)

# Run (1)
user_input = "What is the Japanese word for mountain？"
response = llm_chain.predict(input=user_input)
print(response)

# Show the stored history.
memory.load_memory_variables({})

会話を記憶させる

会話の記録 (修正前)

まずは、素直にドキュメントに従って会話の記憶を実装してみます。
手順①と違うポイントはざっくり以下の通りです
- 会話履歴をメモリ上に記録する
- プロンプトに会話履歴を追加する
- プロンプトを会話用に編集する

以下のコードを実行すると、会話の記録には成功します。
しかし、モデルの出力が繰り返されてしまっていて、無駄な記憶をしているのが分かります。
- max_lengthまで文字列を埋めるために繰り返ししているっぽいです

会話の記憶 (修正前)

# Prompts: build the prompt; the chat history is passed in as an input too.
template = """You are an assistant who responds to user Input. \nPlease provide an answer to the human's question.\nAdditonaly, you are having a conversation with a human.\n\n{chat_history}\n\n{question}"""
prompt = PromptTemplate(template=template, input_variables=["chat_history", "question"]) # the chat history is an input variable as well

# Memory: keep the conversation in memory under the "chat_history" key.
memory_key = "chat_history"
memory = ConversationBufferMemory(memory_key=memory_key)

# Chains: register the prompt, the model and the memory on a chain.
llm_chain = LLMChain(
    llm=llm,
    prompt=prompt,
    memory=memory,
    verbose=True,
)

# Run
question = "What is the Japanese word for mountain？"
response = llm_chain.predict(question=question)
print(response)

モデルの出力

### Output ###
The Japanese word for mountain is 山.

The Japanese word for mountain is 山.
.
.
.

記憶の確認

# Show the stored history (returns the memory variables as a dict).
memory.load_memory_variables({})

### Output ###
{'chat_history': 'Human: What is the Japanese word for mountain？\nAI: \n\nThe Japanese word for mountain is 山.\n\nThe Japanese word for mountain is 山.\n\nThe Japanese word for mountain is 山.\n\nThe Japanese word for mountain is 山.\n\nThe Japanese word for mountain is 山.\n\nThe Japanese word for mountain is 山.\n\nThe Japanese word for mountain is '}

会話の記録 (修正後)

繰り返しを避けないことには無駄な記憶のせいで色々と不都合なことが起きそうなので対策します。
対策方法はパッと思いつくので2通りですかね、、（いい方法あれば教えてください）
- ①記憶=memoryのデータを直接編集
- ②出力をStopする条件を作り、モデルに繰り返しさせない
本質的なのは、②だと思うのでこちらを実装していきます。
- OpenAI APIにもありますよね。stop=[\n\n]みたいなやつ。
- ↑これと同じような機能を実装してみます。

例のごとく色々と調査開始です。

LangChainのソースコード・Transformersのドキュメント↑を読んでいる中で、
model.generate()で指定可能な引数一覧にたどり着き、以下のようなものを見つけました。

stopping_criteria (StoppingCriteriaList, optional) — 
Custom stopping criteria that complement the default stopping criteria built from arguments and a model’s config.
If a stopping criteria is passed that is already created with the arguments or a model’s config an error is thrown. 
This feature is intended for advanced users.

stopping_criteriaという引数があり、StoppingCriteriaListクラスのインスタンスを渡せるそうです。
StoppingCriteriaListってなんだ？と思いドキュメントを見たところ、使い方がわかりませんでした、、

色々調べていると、以下サイトを発見しました。

内容をまとめると、以下のような感じですね。

StoppingCriteriaクラスを継承したクラスを自作
自作クラスにて、モデルの出力がどのような場合にStopするかを記述
StoppingCriteriaListに自作クラスを指定

自分で作る分、細かく制御できそうですね。
上記サイトに載っていた例と、今回作成した例を載せておきます。

サイトに載っていた例

class CustomStoppingCriteria(StoppingCriteria):
    """Skeleton of a user-defined stopping criterion (illustrative only)."""

    def __init__(self):
        pass
    
    def __call__(self, input_ids: torch.LongTensor, score: torch.FloatTensor, **kwargs) -> bool:
        # `your_condition` is a placeholder that is not defined in this
        # snippet; replace it with an expression that is True to stop.
        return your_condition # True or False

# Wrap the custom criterion in a StoppingCriteriaList.
stopping_criteria = StoppingCriteriaList([CustomStoppingCriteria()])

\n\nが出力に現れたら出力をストップする例

from transformers import StoppingCriteria
from transformers import StoppingCriteriaList

# Stop string, and how many consecutive occurrences of it end generation.
# generate() stops when "\n" is repeated twice or when "\n\n" appears.
stop_str, num_iter = "\n", 2


# Stopの条件を設定するクラスを作成 (StoppingCriteriaを継承する)
class MyStoppingCriteria(StoppingCriteria):
    """Stop ``generate()`` once the stop string has been emitted repeatedly.

    Generation stops when the last ``num_iter`` tokens all equal the stop
    token, or when the newest token is the single token that encodes the stop
    string written twice (for example two consecutive newlines). The decoded
    sequence is printed whenever generation is stopped.

    Only the first sequence of the batch (``input_ids[0]``) is inspected, and
    only the first token id of each encoded stop string is used.
    """

    def __init__(self, stop_str, num_iter, tokenizer, isGPU):
        """Encode the stop strings and remember the settings.

        Args:
            stop_str: Text whose repetition ends generation.
            num_iter: Number of consecutive stop tokens that ends generation.
            tokenizer: Callable that returns a mapping with an ``"input_ids"``
                tensor when called with ``return_tensors='pt'``; its
                ``decode`` method is used for the debug print.
            isGPU: When true and CUDA is available, the stored id tensors are
                moved to CUDA.
        """
        # Both devices must encode the doubled stop string here; encoding the
        # single string on CPU made one stop token enough to end generation.
        self.stop_token_ids = tokenizer(stop_str, return_tensors='pt')["input_ids"]
        self.stop_token_ids_iter = tokenizer(stop_str * 2, return_tensors='pt')["input_ids"]
        if isGPU and torch.cuda.is_available():
            self.stop_token_ids = self.stop_token_ids.to('cuda')
            self.stop_token_ids_iter = self.stop_token_ids_iter.to('cuda')

        self.num_iter = num_iter
        self.tokenizer = tokenizer

        # Plain ints can be compared with ids that live on any device.
        self._stop_id = int(self.stop_token_ids[0][0])
        doubled = self.stop_token_ids_iter[0]
        # The doubled string is a usable shortcut only if it is one token;
        # otherwise the repetition check below already covers it.
        self._stop_iter_id = int(doubled[0]) if doubled.shape[0] == 1 else None

    def _should_stop(self, sequence):
        """Return True when the 1-D id tensor ``sequence`` ends with a stop pattern."""
        length = sequence.shape[0]
        if length == 0:
            return False

        # Check the newest token, not the one ``num_iter`` positions back.
        if self._stop_iter_id is not None and int(sequence[-1]) == self._stop_iter_id:
            return True

        # Too short to contain ``num_iter`` stop tokens (avoids IndexError).
        if length < self.num_iter:
            return False

        tail = sequence[length - self.num_iter:]
        return bool((tail == self._stop_id).all())

    def __call__(self, input_ids:torch.LongTensor, score:torch.FloatTensor, **kwargs):
        """Return True to stop generation, False to continue."""
        isStop = self._should_stop(input_ids[0])
        if isStop:
            print(f"!!! Generate() Stopped !!!\n!!!!!!!!!\n{self.tokenizer.decode(input_ids[0])} \n!!!!!!!!!")
        return isStop


# Create the StoppingCriteriaList holding the custom criterion.
# NOTE(review): isGPU=True is hard-coded here and below, so this snippet
# assumes a CUDA device is present.
stopcriteria_list = StoppingCriteriaList([MyStoppingCriteria(stop_str, num_iter, tokenizer, isGPU=True)])
print(stopcriteria_list)

# Create the HuggingFacePipeline; the criteria list is forwarded via **kwargs.
model_args = {"temperature":0.1, "max_length": 256, "stopping_criteria": stopcriteria_list}
llm = pipeline_setup(model=model, tokenizer=tokenizer, isGPU=True, **model_args)

殴り書きしたのでめちゃくちゃ読みにくくなりました、、リファクタリングします　（そのうち）
プロンプトを若干修正し、上記コードで再度llm_chainを実行すると、ちゃんと必要な情報だけ記憶されていました。
しかし、記憶されたデータをよく見てみると、AI:AI:xxxxのようにプレフィックスが重複しています、、

記憶の確認

# Prompts: build the prompt; the chat history is an input variable too.
template = """
You are an AI who responds to user Input.
Please provide an answer to the human's question.
Additonaly, you are having a conversation with a human based on past interactions.

### Answer Sample
Human: Hi!
AI: Hi, nice to meet you.

### Past Interactions
{chat_history}

### 
Human:{input}
"""
prompt = PromptTemplate(template=template, input_variables=["chat_history", "input"])

# Chains: register the prompt, the model and the memory on a chain.
llm_chain = LLMChain(
    llm=llm,
    prompt=prompt,
    memory=memory,
    verbose=True,
)

# Run (1)
# Named user_input rather than input: rebinding the builtin input() would
# break the later chat loop, which calls input() to read from the console.
user_input = "What is the Japanese word for mountain？"
response = llm_chain.predict(input=user_input)
print(response)

# Show the stored history.
memory.load_memory_variables({})


### Output ###
{'chat_history': 
    'Human: What is the Japanese word for mountain
     AI: AI: Japanese word for mountain is 山.\n\n'
}

memoryの保存形式について

memoryは以下のような辞書型で保存されます。(ソースコードに記載されていました)
- Key: memory_keyで指定した文字列
- Value: 「人間の入力」 + 「AIの回答」が改行でつながれた文字列
  - 「人間の入力」のプレフィックスはhuman_prefixで指定。(デフォルト=Human)
  - 「AIの回答」のプレフィックスはai_prefixで指定。(デフォルト=AI)

なので、ai_prefix=""とすればいけそうですよね。

from langchain.memory import ConversationBufferMemory

# Memory: keep the conversation in memory under the "chat_history" key.
# ai_prefix="" replaces the default AI prefix with an empty string.
memory_key = "chat_history"
memory = ConversationBufferMemory(memory_key=memory_key, ai_prefix="")

このようにすると、AIの回答のプレフィックスの繰り返しはなくなりました。
しかし、ソースコード上、ai_prefix + ":" + 回答となる仕様っぽいので、先頭に:がついてしまっています。
(replace("\n:", "")とすればよさそうですが今回は無視します、、)

会話記憶を出力してみたところ、以下のようになりました。よさげです。

### Before ###
{'chat_history': 
    'Human: What is the Japanese word for mountain
     AI: AI: Japanese word for mountain is 山.\n\n'
}

### After ###
{'chat_history': 
    'Human: What is the Japanese word for mountain？
    : AI: Japanese word for mountain is 山.\n\n'
}

手順③: 入力→回答部分をループさせてチャットボットっぽくする

ここまで以下を達成できています。
- ①入力に応じて回答する
- ②過去の会話を記憶する
上記の①②を含んだ処理をループさせることでチャットボットっぽくします。
- といっても特別なことをするわけではないです。ただのWhile文です。

# Chat loop: read a line, stop on "exit", otherwise print the chain's reply.
while True:
    user_input = input("\n> ")
    if user_input == "exit":
        break

    response = llm_chain.predict(input=user_input)
    print(response)

まとめ

OpenAI APIはお金かかる分、抽象化してくれているのでユーザ側の開発負荷を下げてくれるなと思いました。
あと、めちゃくちゃ大規模なモデルはローカルでは動かせませんしね、、
次はAgent機能を扱います！やっとReActに近づいてきた

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up