はじめに
前回「CyberAgentLM3-22B-Chat」や「Llama-3-ELYZA-JP-8B」や「gemma-2-9b-it」で同じことをしました。
touch-sp.hatenablog.com
touch-sp.hatenablog.com
touch-sp.hatenablog.com
今回は「llama-3-youko-8b-instruct」です。
モデルの量子化
今回は量子化を行いませんでした。float16でモデルを読み込みました。
Gradioで実行
from collections.abc import Iterator
from threading import Thread

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

system_prompt_text = "あなたは誠実で優秀なアシスタントです。どうか、簡潔かつ正直に答えてください。"

# System message placed at the start of every conversation sent to the model.
init = {
    "role": "system",
    "content": system_prompt_text,
}

# model was downloaded from https://huggingface.co/rinna/llama-3-youko-8b-instruct
# Loaded from a local directory in float16 (no quantization).
model = AutoModelForCausalLM.from_pretrained(
    "llama-3-youko-8b-instruct",
    device_map="auto",
    torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained("llama-3-youko-8b-instruct")


def call_llm(
    message: str,
    history: list[tuple[str, str]],
    max_tokens: int,
    temperature: float,
    top_p: float,
    repetition_penalty: float,
) -> Iterator[str]:
    """Stream the model's reply to ``message`` given the prior chat turns.

    Args:
        message: The new user message.
        history: Earlier turns; each entry is unpacked as a
            ``(user_text, assistant_text)`` pair.
        max_tokens: Passed to ``generate`` as ``max_new_tokens``.
        temperature: Sampling temperature. A value of 0 (allowed by the UI
            slider) selects greedy decoding instead of being forwarded.
        top_p: Nucleus-sampling threshold; only forwarded when sampling.
        repetition_penalty: Forwarded only when strictly positive.

    Yields:
        The reply text accumulated so far, once per streamed chunk.
    """
    # The system message always comes first; an empty history simply
    # contributes no turns, so no special case is needed.
    messages = [init]
    for human, assistant in history:
        messages.append({"role": "user", "content": human})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    terminators = [
        tokenizer.convert_tokens_to_ids("<|end_of_text|>"),
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    # One streamer per call: a shared module-level streamer lets text from an
    # earlier generation that is still running (e.g. after "Stop Generation")
    # or from a concurrent request leak into this response.
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    generation_kwargs = dict(
        inputs=input_ids,
        streamer=streamer,
        eos_token_id=terminators,
        pad_token_id=tokenizer.eos_token_id,
        max_new_tokens=max_tokens,
    )

    # Slider values may arrive as ints (e.g. 1 instead of 1.0) and may be 0.
    # transformers' logits processors reject non-positive values (and, in some
    # versions, non-float values); such an error inside the worker thread
    # would leave the streamer without an end signal and hang the UI.
    temperature = float(temperature)
    repetition_penalty = float(repetition_penalty)
    if temperature > 0:
        generation_kwargs.update(temperature=temperature, top_p=float(top_p))
    else:
        generation_kwargs["do_sample"] = False
    if repetition_penalty > 0:
        generation_kwargs["repetition_penalty"] = repetition_penalty

    # generate() runs in a worker thread while this generator drains the
    # streamer and yields partial text to the UI.
    # NOTE(review): the worker is not signalled when the consumer stops
    # iterating early, so generation presumably continues in the background.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    generated_text = ""
    for new_text in streamer:
        generated_text += new_text
        yield generated_text


def run():
    """Build the Gradio chat UI around ``call_llm`` and launch it."""
    chatbot = gr.Chatbot(
        elem_id="chatbot",
        scale=1,
        show_copy_button=True,
        height="70%",
        layout="panel",
    )
    with gr.Blocks(fill_height=True) as demo:
        gr.Markdown("# llama-3-youko-8b-instruct")
        gr.ChatInterface(
            fn=call_llm,
            stop_btn="Stop Generation",
            cache_examples=False,
            multimodal=False,
            chatbot=chatbot,
            additional_inputs_accordion=gr.Accordion(
                label="Parameters", open=False, render=False
            ),
            # Order must match call_llm's parameters after (message, history).
            additional_inputs=[
                gr.Slider(
                    minimum=1,
                    maximum=4096,
                    step=1,
                    value=512,
                    label="Max tokens",
                    visible=True,
                    render=False,
                ),
                gr.Slider(
                    minimum=0,
                    maximum=1,
                    step=0.1,
                    value=0.6,
                    label="Temperature",
                    visible=True,
                    render=False,
                ),
                gr.Slider(
                    minimum=0,
                    maximum=1,
                    step=0.1,
                    value=0.9,
                    label="Top-p",
                    visible=True,
                    render=False,
                ),
                gr.Slider(
                    minimum=0,
                    maximum=2,
                    step=0.1,
                    value=1.1,
                    label="repetition_penalty",
                    visible=True,
                    render=False,
                ),
            ],
        )
    demo.launch(share=False)


if __name__ == "__main__":
    run()