はじめに
前回「CyberAgentLM3-22B-Chat」や「Llama-3-ELYZA-JP-8B」で同じことをしました。
touch-sp.hatenablog.com
touch-sp.hatenablog.com
今回は「gemma-2-9b-it」です。
小規模でかつ日本語特化モデルでないにもかかわらず日本語性能は高い印象です。
モデルの量子化
今回は量子化を行いませんでした。
Gradioで実行
"""Gradio chat UI that streams replies from a locally stored gemma-2-9b-it model."""

import gradio as gr
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer,
    BitsAndBytesConfig,  # not used in this script (the model is loaded without quantization)
)
from threading import Thread

# model was downloaded from https://huggingface.co/google/gemma-2-9b-it
MODEL_PATH = "gemma-2-9b-it"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="auto",
    torch_dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)


def _build_messages(message: str, history: list) -> list[dict]:
    """Convert the Gradio chat history plus the new message to role/content dicts.

    Each history entry may be either a ``(user, assistant)`` pair or an
    already-formatted ``{"role": ..., "content": ...}`` dict; both are
    normalised to the list-of-dicts shape expected by ``apply_chat_template``.
    """
    messages = []
    for turn in history:
        if isinstance(turn, dict):
            # Keep only the keys the chat template needs.
            messages.append({"role": turn["role"], "content": turn["content"]})
        else:
            human, assistant = turn
            messages.append({"role": "user", "content": human})
            messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})
    return messages


def call_llm(
    message: str,
    history: list,
    max_tokens: int,
    temperature: float,
    top_p: float,
):
    """Stream the model's reply to ``message``.

    Args:
        message: The new user message.
        history: Previous turns, as ``(user, assistant)`` pairs or as
            role/content dicts.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature. A value of 0 (or less) switches to
            greedy decoding, because sampling requires a strictly positive
            temperature.
        top_p: Nucleus-sampling threshold; only used when sampling.

    Yields:
        The reply accumulated so far, growing with every streamed chunk.
    """
    input_ids = tokenizer.apply_chat_template(
        _build_messages(message, history),
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    # One streamer per request: a shared streamer would let tokens (and the
    # end-of-stream marker) of an earlier, cancelled generation leak into the
    # next reply.
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    generation_kwargs = dict(
        inputs=input_ids,
        streamer=streamer,
        max_new_tokens=int(max_tokens),
    )
    if temperature > 0:
        generation_kwargs.update(
            do_sample=True,
            temperature=float(temperature),
            top_p=float(top_p),
        )
    else:
        # The UI slider allows 0; sampling with temperature 0 is rejected by
        # generate(), which would fail inside the worker thread and leave the
        # streamer waiting forever. Decode greedily instead.
        generation_kwargs["do_sample"] = False

    # generate() blocks until finished, so run it in a worker thread and
    # consume the streamer from this generator.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    generated_text = ""
    for new_text in streamer:
        generated_text += new_text
        yield generated_text


def run():
    """Build the chat interface and launch the local Gradio server."""
    chatbot = gr.Chatbot(
        elem_id="chatbot",
        scale=1,
        show_copy_button=True,
        height="70%",
        layout="panel",
    )
    with gr.Blocks(fill_height=True) as demo:
        gr.Markdown("# gemma-2-9b-it")
        gr.ChatInterface(
            fn=call_llm,
            stop_btn="Stop Generation",
            cache_examples=False,
            multimodal=False,
            chatbot=chatbot,
            additional_inputs_accordion=gr.Accordion(
                label="Parameters", open=False, render=False
            ),
            # The order of these inputs must match call_llm's extra
            # parameters: max_tokens, temperature, top_p.
            additional_inputs=[
                gr.Slider(
                    minimum=1,
                    maximum=4096,
                    step=1,
                    value=1024,
                    label="Max tokens",
                    visible=True,
                    render=False,
                ),
                gr.Slider(
                    minimum=0,
                    maximum=1,
                    step=0.1,
                    value=0.3,
                    label="Temperature",
                    visible=True,
                    render=False,
                ),
                gr.Slider(
                    minimum=0,
                    maximum=1,
                    step=0.1,
                    value=0.9,
                    label="Top-p",
                    visible=True,
                    render=False,
                ),
            ],
        )
    demo.launch(share=False)


if __name__ == "__main__":
    run()