Introduction
I previously ran a Llama 2 model with llama.cpp. This post is a log of looking into how to run it as a chat-style GUI, using the Python library Gradio.
My notes from the earlier CLI-based run are here↓
Environment
Everything runs in a local environment on Power Systems.
・HW : IBM Power S1022 (0.75 CPU, 128GB Memory)
(I wanted to assign more CPU, but resource constraints capped it at 0.75.)
・OS : RHEL 9.2 ppc64le
・Python virtual environment
Python 3.11 (# python -m venv ./test)
(# source ./test/bin/activate)
Install gradio into the virtual environment (its dependencies are installed along with it) ((test)# pip install gradio)
・llama.cpp setup (the same configuration as in the previous article); a consolidated sketch of the setup commands follows this list
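For reference, a minimal consolidated sketch of the Python-side setup, assuming the venv lives at ./test as above. The llama-cpp-python binding imported by the script below also has to be present; installing it with pip is my assumption here, not something covered in the previous article.
# python -m venv ./test
# source ./test/bin/activate
(test)# pip install gradio
(test)# pip install llama-cpp-python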
The code
I tried writing the Gradio code myself but could not get it working well, so I used code published by someone else. Many thanks to the author.
Referenced code: https://gist.github.com/kohya-ss/e23fa9a321dba07fabd1ef61eab6863c
(Apache License 2.0)
The code (used as-is, without edits)
Apache License 2.0
# See the comments on the gist for usage
import argparse
from typing import List, Optional, Union, Iterator

from llama_cpp.llama_chat_format import _convert_completion_to_chat, register_chat_completion_handler
import llama_cpp.llama_types as llama_types
from llama_cpp.llama import LogitsProcessorList, LlamaGrammar
from llama_cpp import Llama, llama_chat_format

import gradio as gr

debug_flag = False


@register_chat_completion_handler("command-r")
def command_r_chat_handler(
    llama: Llama,
    messages: List[llama_types.ChatCompletionRequestMessage],
    functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
    function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
    tools: Optional[List[llama_types.ChatCompletionTool]] = None,
    tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None,
    temperature: float = 0.2,
    top_p: float = 0.95,
    top_k: int = 40,
    min_p: float = 0.05,
    typical_p: float = 1.0,
    stream: bool = False,
    stop: Optional[Union[str, List[str]]] = [],
    response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None,
    max_tokens: Optional[int] = None,
    presence_penalty: float = 0.0,
    frequency_penalty: float = 0.0,
    repeat_penalty: float = 1.1,
    tfs_z: float = 1.0,
    mirostat_mode: int = 0,
    mirostat_tau: float = 5.0,
    mirostat_eta: float = 0.1,
    model: Optional[str] = None,
    logits_processor: Optional[LogitsProcessorList] = None,
    grammar: Optional[LlamaGrammar] = None,
    **kwargs,  # type: ignore
) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]:
    bos_token = "<BOS_TOKEN>"
    start_turn_token = "<|START_OF_TURN_TOKEN|>"
    end_turn_token = "<|END_OF_TURN_TOKEN|>"
    user_token = "<|USER_TOKEN|>"
    chatbot_token = "<|CHATBOT_TOKEN|>"
    system_token = "<|SYSTEM_TOKEN|>"

    prompt = bos_token
    if len(messages) > 0 and messages[0]["role"] == "system":
        prompt += start_turn_token + system_token + messages[0]["content"] + end_turn_token
        messages = messages[1:]

    for message in messages:
        if message["role"] == "user":
            prompt += start_turn_token + user_token + message["content"] + end_turn_token
        elif message["role"] == "assistant":
            prompt += start_turn_token + chatbot_token + message["content"] + end_turn_token

    prompt += start_turn_token + chatbot_token

    if debug_flag:
        print(f"Prompt: {prompt}")

    stop_tokens = [end_turn_token]  # , bos_token]

    return _convert_completion_to_chat(
        llama.create_completion(
            prompt=prompt,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            min_p=min_p,
            typical_p=typical_p,
            stream=stream,
            stop=stop_tokens,
            max_tokens=max_tokens,
            presence_penalty=presence_penalty,
            frequency_penalty=frequency_penalty,
            repeat_penalty=repeat_penalty,
            tfs_z=tfs_z,
            mirostat_mode=mirostat_mode,
            mirostat_tau=mirostat_tau,
            mirostat_eta=mirostat_eta,
            model=model,
            logits_processor=logits_processor,
            grammar=grammar,
        ),
        stream=stream,
    )


def generate_completion(llama, prompt, max_tokens, temperature, top_p, repeat_penalty, top_k):
    global stop_generating
    stop_generating = False

    output = prompt
    if debug_flag:
        print(
            f"temperature: {temperature}, top_p: {top_p}, top_k: {top_k}, repeat_penalty: {repeat_penalty}, max_tokens: {max_tokens}"
        )

    for chunk in llama(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        repeat_penalty=repeat_penalty,
        top_k=top_k,
        stream=True,
    ):
        if debug_flag:
            print(chunk)
        if stop_generating:
            break

        if "choices" in chunk and len(chunk["choices"]) > 0:
            if "text" in chunk["choices"][0]:
                text = chunk["choices"][0]["text"]

                # check EOS_TOKEN
                if text.endswith("<EOS_TOKEN>"):  # llama.tokenizer.EOS_TOKEN):
                    output += text[: -len("<EOS_TOKEN>")]
                    yield output[len(prompt) :]
                    break

                output += text
                yield output[len(prompt) :]  # remove prompt


def launch_completion(llama, listen=False):
    # css = """
    # .prompt textarea {font-size:1.0em !important}
    # """
    # with gr.Blocks(css=css) as demo:
    with gr.Blocks() as demo:
        with gr.Row():
            # change font size
            io_textbox = gr.Textbox(
                label="Input/Output: Text may not be scrolled automatically. Shift+Enter to newline. テキストは自動スクロールしないことがあります。Shift+Enterで改行。",
                placeholder="Enter your prompt here...",
                interactive=True,
                elem_classes=["prompt"],
                autoscroll=True,
            )

        with gr.Row():
            generate_button = gr.Button("Generate")
            stop_button = gr.Button("Stop", visible=False)

        with gr.Row():
            max_tokens = gr.Slider(minimum=1, maximum=2048, value=128, step=1, label="Max Tokens")
            temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.3, step=0.01, label="Temperature")
            top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.9, step=0.05, label="Top P")
            repeat_penalty = gr.Slider(minimum=0.0, maximum=2.0, value=1.1, step=0.1, label="Repeat Penalty")
            top_k = gr.Slider(minimum=1, maximum=200, value=40, step=1, label="Top K")

        def generate_and_display(prompt, max_tokens, temperature, top_p, repeat_penalty, top_k):
            output_generator = generate_completion(llama, prompt, max_tokens, temperature, top_p, repeat_penalty, top_k)
            for output in output_generator:
                yield gr.update(value=prompt + output, autoscroll=True), gr.update(visible=False), gr.update(visible=True)
            yield gr.update(value=prompt + output, autoscroll=True), gr.update(visible=True), gr.update(visible=False)

        def stop_generation():
            globals().update(stop_generating=True)
            return gr.update(visible=True), gr.update(visible=False)

        generate_button.click(
            generate_and_display,
            inputs=[io_textbox, max_tokens, temperature, top_p, repeat_penalty, top_k],
            outputs=[io_textbox, generate_button, stop_button],
            show_progress=True,
        )

        stop_button.click(
            stop_generation,
            outputs=[generate_button, stop_button],
        )

        # add event to textbox to add new line on enter
        io_textbox.submit(
            lambda x: x + "\n",
            inputs=[io_textbox],
            outputs=[io_textbox],
        )

    demo.launch(server_name="0.0.0.0" if listen else None)


def launch_chat(llama, handler_name, listen=False):
    def chat(message, history, system_prompt, max_tokens, temperature, top_p, repeat_penalty, top_k):
        user_input = message
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        for message in history:
            messages.append({"role": "user", "content": message[0]})
            messages.append({"role": "assistant", "content": message[1]})
        messages.append({"role": "user", "content": user_input})

        if debug_flag:
            print(f"Messages: {messages}")
            print(
                f"System prompt: {system_prompt}, temperature: {temperature}, top_p: {top_p}, top_k: {top_k}, repeat_penalty: {repeat_penalty}, max_tokens: {max_tokens}"
            )

        handler = llama_chat_format.get_chat_completion_handler(handler_name)
        chat_completion_chunks = handler(
            llama=llama,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            repeat_penalty=repeat_penalty,
            top_k=int(top_k),
            stream=True,
        )

        response = ""
        for chunk in chat_completion_chunks:
            if debug_flag:
                print(chunk)
            if "choices" in chunk and len(chunk["choices"]) > 0:
                if "delta" in chunk["choices"][0]:
                    if "content" in chunk["choices"][0]["delta"]:
                        response += chunk["choices"][0]["delta"]["content"]
                        yield response

    system_prompt = gr.Textbox(label="System Prompt", placeholder="Enter system prompt here...")
    max_tokens = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max Tokens")
    temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.3, step=0.01, label="Temperature")
    top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.9, step=0.05, label="Top P")
    repeat_penalty = gr.Slider(minimum=0.0, maximum=2.0, value=1.1, step=0.1, label="Repeat Penalty")
    top_k = gr.Slider(minimum=1, maximum=200, value=40, step=1, label="Top K")
    additional_inputs = [system_prompt, max_tokens, temperature, top_p, repeat_penalty, top_k]

    chatbot = gr.ChatInterface(chat, additional_inputs=additional_inputs)
    chatbot.launch(server_name="0.0.0.0" if listen else None)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--model", type=str, default=None, help="Model file path")
    parser.add_argument("-ngl", "--n_gpu_layers", type=int, default=0, help="Number of GPU layers")
    parser.add_argument("-c", "--n_ctx", type=int, default=2048, help="Context length")
    parser.add_argument(
        "-ch",
        "--chat_handler",
        type=str,
        default="command-r",
        help="Chat handler, e.g. command-r, mistral-instruct, alpaca, llama-3 etc. default: command-r",
    )
    parser.add_argument("--chat", action="store_true", help="Chat mode")
    parser.add_argument("--listen", action="store_true", help="Listen mode")
    parser.add_argument(
        "-ts", "--tensor_split", type=str, default=None, help="Tensor split, float values separated by comma for each gpu"
    )
    parser.add_argument("--debug", action="store_true", help="Debug mode")
    args = parser.parse_args()

    # tokenizer initialization doesn't seem to be needed
    # print("Initializing tokenizer")
    # tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    print(f"Initializing Llama. Model ID: {args.model}, N_GPU_LAYERS: {args.n_gpu_layers}, N_CTX: {args.n_ctx}")
    # llama_tokenizer = LlamaHFTokenizer(tokenizer)
    tensor_split = None if args.tensor_split is None else [float(x) for x in args.tensor_split.split(",")]
    llama = Llama(
        model_path=args.model,
        n_gpu_layers=args.n_gpu_layers,
        tensor_split=tensor_split,
        n_ctx=args.n_ctx,
        # tokenizer=llama_tokenizer,
        # n_threads=n_threads,
    )

    debug_flag = args.debug

    if args.chat:
        print(f"Launching chat with handler: {args.chat_handler}")
        launch_chat(llama, args.chat_handler, args.listen)
    else:
        print("Launching completion")
        launch_completion(llama, args.listen)
Execution log
The command was run with the following options.
I downloaded "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf" (a quantized Llama 3 model) and used it.
# python gradio_cmdrp.py -m /work/llama.cpp/models/Meta-Llama-3-8B-Instruct.Q4_K_M.gguf -c 2048 --chat
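For reference, the script's argparse options allow a few other invocations. These are sketches based on the options defined in the code above, not commands from this run.
Completion UI instead of chat (omit --chat):
# python gradio_cmdrp.py -m /work/llama.cpp/models/Meta-Llama-3-8B-Instruct.Q4_K_M.gguf -c 2048
Bind to 0.0.0.0 so the UI is reachable from another machine:
# python gradio_cmdrp.py -m /work/llama.cpp/models/Meta-Llama-3-8B-Instruct.Q4_K_M.gguf -c 2048 --chat --listen
Switch the chat handler (the script's help text also lists llama-3, mistral-instruct, alpaca; availability depends on the installed llama-cpp-python version):
# python gradio_cmdrp.py -m /work/llama.cpp/models/Meta-Llama-3-8B-Instruct.Q4_K_M.gguf -c 2048 --chat -ch llama-3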
Log output:
(test) [root@test0506gradio script]# python gradio_cmdrp.py -m /work/llama.cpp/models/Meta-Llama-3-8B-Instruct.Q4_K_M.gguf -c 2048 --chat
Initializing Llama. Model ID: /work/llama.cpp/models/Meta-Llama-3-8B-Instruct.Q4_K_M.gguf, N_GPU_LAYERS: 0, N_CTX: 2048
llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /work/llama.cpp/models/Meta-Llama-3-8B-Instruct.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = llama
llama_model_loader: - kv 1: general.name str = .
llama_model_loader: - kv 2: llama.vocab_size u32 = 128256
llama_model_loader: - kv 3: llama.context_length u32 = 8192
llama_model_loader: - kv 4: llama.embedding_length u32 = 4096
llama_model_loader: - kv 5: llama.block_count u32 = 32
llama_model_loader: - kv 6: llama.feed_forward_length u32 = 14336
llama_model_loader: - kv 7: llama.rope.dimension_count u32 = 128
llama_model_loader: - kv 8: llama.attention.head_count u32 = 32
llama_model_loader: - kv 9: llama.attention.head_count_kv u32 = 8
llama_model_loader: - kv 10: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 11: llama.rope.freq_base f32 = 500000.000000
llama_model_loader: - kv 12: general.file_type u32 = 15
llama_model_loader: - kv 13: tokenizer.ggml.model str = gpt2
llama_model_loader: - kv 14: tokenizer.ggml.tokens arr[str,128256] = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv 15: tokenizer.ggml.scores arr[f32,128256] = [0.000000, 0.000000, 0.000000, 0.0000...
llama_model_loader: - kv 16: tokenizer.ggml.token_type arr[i32,128256] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv 17: tokenizer.ggml.merges arr[str,280147] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
llama_model_loader: - kv 18: tokenizer.ggml.bos_token_id u32 = 128000
llama_model_loader: - kv 19: tokenizer.ggml.eos_token_id u32 = 128009
llama_model_loader: - kv 20: tokenizer.chat_template str = {% set loop_messages = messages %}{% ...
llama_model_loader: - kv 21: general.quantization_version u32 = 2
llama_model_loader: - type f32: 65 tensors
llama_model_loader: - type q4_K: 193 tensors
llama_model_loader: - type q6_K: 33 tensors
llm_load_vocab: missing pre-tokenizer type, using: 'default'
llm_load_vocab:
llm_load_vocab: ************************************
llm_load_vocab: GENERATION QUALITY WILL BE DEGRADED!
llm_load_vocab: CONSIDER REGENERATING THE MODEL
llm_load_vocab: ************************************
llm_load_vocab:
llm_load_vocab: special tokens definition check successful ( 256/128256 ).
llm_load_print_meta: format = GGUF V3 (latest)
llm_load_print_meta: arch = llama
llm_load_print_meta: vocab type = BPE
llm_load_print_meta: n_vocab = 128256
llm_load_print_meta: n_merges = 280147
llm_load_print_meta: n_ctx_train = 8192
llm_load_print_meta: n_embd = 4096
llm_load_print_meta: n_head = 32
llm_load_print_meta: n_head_kv = 8
llm_load_print_meta: n_layer = 32
llm_load_print_meta: n_rot = 128
llm_load_print_meta: n_embd_head_k = 128
llm_load_print_meta: n_embd_head_v = 128
llm_load_print_meta: n_gqa = 4
llm_load_print_meta: n_embd_k_gqa = 1024
llm_load_print_meta: n_embd_v_gqa = 1024
llm_load_print_meta: f_norm_eps = 0.0e+00
llm_load_print_meta: f_norm_rms_eps = 1.0e-05
llm_load_print_meta: f_clamp_kqv = 0.0e+00
llm_load_print_meta: f_max_alibi_bias = 0.0e+00
llm_load_print_meta: f_logit_scale = 0.0e+00
llm_load_print_meta: n_ff = 14336
llm_load_print_meta: n_expert = 0
llm_load_print_meta: n_expert_used = 0
llm_load_print_meta: causal attn = 1
llm_load_print_meta: pooling type = 0
llm_load_print_meta: rope type = 0
llm_load_print_meta: rope scaling = linear
llm_load_print_meta: freq_base_train = 500000.0
llm_load_print_meta: freq_scale_train = 1
llm_load_print_meta: n_yarn_orig_ctx = 8192
llm_load_print_meta: rope_finetuned = unknown
llm_load_print_meta: ssm_d_conv = 0
llm_load_print_meta: ssm_d_inner = 0
llm_load_print_meta: ssm_d_state = 0
llm_load_print_meta: ssm_dt_rank = 0
llm_load_print_meta: model type = 8B
llm_load_print_meta: model ftype = Q4_K - Medium
llm_load_print_meta: model params = 8.03 B
llm_load_print_meta: model size = 4.58 GiB (4.89 BPW)
llm_load_print_meta: general.name = .
llm_load_print_meta: BOS token = 128000 '<|begin_of_text|>'
llm_load_print_meta: EOS token = 128009 '<|eot_id|>'
llm_load_print_meta: LF token = 128 'Ä'
llm_load_print_meta: EOT token = 128009 '<|eot_id|>'
llm_load_tensors: ggml ctx size = 0.15 MiB
llm_load_tensors: CPU buffer size = 4685.30 MiB
........................................................................................
llama_new_context_with_model: n_ctx = 2048
llama_new_context_with_model: n_batch = 512
llama_new_context_with_model: n_ubatch = 512
llama_new_context_with_model: flash_attn = 0
llama_new_context_with_model: freq_base = 500000.0
llama_new_context_with_model: freq_scale = 1
llama_kv_cache_init: CPU KV buffer size = 256.00 MiB
llama_new_context_with_model: KV self size = 256.00 MiB, K (f16): 128.00 MiB, V (f16): 128.00 MiB
llama_new_context_with_model: CPU output buffer size = 0.49 MiB
llama_new_context_with_model: CPU compute buffer size = 258.50 MiB
llama_new_context_with_model: graph nodes = 1030
llama_new_context_with_model: graph splits = 1
AVX = 0 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 0 | SSSE3 = 0 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 |
Model metadata: {'tokenizer.chat_template': "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}", 'tokenizer.ggml.eos_token_id': '128009', 'general.quantization_version': '2', 'tokenizer.ggml.model': 'gpt2', 'general.architecture': 'llama', 'llama.rope.freq_base': '500000.000000', 'llama.context_length': '8192', 'general.name': '.', 'llama.vocab_size': '128256', 'general.file_type': '15', 'llama.embedding_length': '4096', 'llama.feed_forward_length': '14336', 'llama.attention.layer_norm_rms_epsilon': '0.000010', 'llama.rope.dimension_count': '128', 'tokenizer.ggml.bos_token_id': '128000', 'llama.attention.head_count': '32', 'llama.block_count': '32', 'llama.attention.head_count_kv': '8'}
Using gguf chat template: {% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>
'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>
' }}
Using chat eos_token: <|eot_id|>
Using chat bos_token: <|begin_of_text|>
Launching chat with handler: command-r
Running on local URL: http://127.0.0.1:7860
To create a public link, set `share=True` in `launch()`.
llama_print_timings: load time = 79984.16 ms
llama_print_timings: sample time = 10.23 ms / 19 runs ( 0.54 ms per token, 1856.92 tokens per second)
llama_print_timings: prompt eval time = 79984.08 ms / 53 tokens ( 1509.13 ms per token, 0.66 tokens per second)
llama_print_timings: eval time = 35759.61 ms / 18 runs ( 1986.64 ms per token, 0.50 tokens per second)
llama_print_timings: total time = 115928.92 ms / 71 tokens
Llama.generate: prefix-match hit
llama_print_timings: load time = 79984.16 ms
llama_print_timings: sample time = 13.25 ms / 25 runs ( 0.53 ms per token, 1886.22 tokens per second)
llama_print_timings: prompt eval time = 80605.80 ms / 52 tokens ( 1550.11 ms per token, 0.65 tokens per second)
llama_print_timings: eval time = 47783.11 ms / 24 runs ( 1990.96 ms per token, 0.50 tokens per second)
llama_print_timings: total time = 128630.88 ms / 76 tokens
Llama.generate: prefix-match hit
llama_print_timings: load time = 79984.16 ms
llama_print_timings: sample time = 9.21 ms / 17 runs ( 0.54 ms per token, 1845.02 tokens per second)
llama_print_timings: prompt eval time = 42448.80 ms / 28 tokens ( 1516.03 ms per token, 0.66 tokens per second)
llama_print_timings: eval time = 31776.28 ms / 16 runs ( 1986.02 ms per token, 0.50 tokens per second)
llama_print_timings: total time = 74386.80 ms / 44 tokens
^CKeyboard interruption in main thread... closing server.
(test) [root@test0506gradio script]#
What it looks like
In reality it takes a bit more time than it might appear here, but the chat runs successfully in the Gradio-based GUI.
Things I'd like to improve
For now the goal was simply to get this running on Power, but I would like to get closer to the higher-quality generative AI services that are out there.
(Although I sometimes think that getting this far with open source alone is already not bad.)
・Making responses faster
Whether more resources or tuning would speed things up (see the sketch after this list for a couple of knobs to try).
・I've heard that it supposedly runs snappily on the E1080 (the high-end Power10 model), so I'd like to verify that on an E1080.
・Trying other foundation models
・Improving the look of the GUI
・RAG, multimodal, and so on...
(These are spare-time, interest-driven experiments, so only as far as time allows.)
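On the response-time point, here is a minimal sketch of CPU-side knobs I have not benchmarked yet. n_threads, n_threads_batch, and n_batch are standard arguments of llama-cpp-python's Llama(); the values below are placeholders, not measured recommendations.
from llama_cpp import Llama

# Sketch only: thread/batch settings to experiment with on Power (placeholder values).
llama = Llama(
    model_path="/work/llama.cpp/models/Meta-Llama-3-8B-Instruct.Q4_K_M.gguf",
    n_ctx=2048,
    n_threads=8,         # threads used for token generation
    n_threads_batch=8,   # threads used for prompt (batch) processing
    n_batch=512,         # prompt tokens evaluated per batch
)
The gist's own Llama(...) call already has a commented-out n_threads argument, so the same setting could be wired into gradio_cmdrp.py directly.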
That's all.