diff --git a/llama_stack/models/llama/llama4/tokenizer.py b/llama_stack/models/llama/llama4/tokenizer.py
index 4d271e5fd..8eabc3205 100644
--- a/llama_stack/models/llama/llama4/tokenizer.py
+++ b/llama_stack/models/llama/llama4/tokenizer.py
@@ -56,9 +56,11 @@ LLAMA4_TEXT_POST_TRAIN_SPECIAL_TOKENS = [
     "<|text_post_train_reserved_special_token_3|>",
     "<|text_post_train_reserved_special_token_4|>",
     "<|text_post_train_reserved_special_token_5|>",
+    "<|text_post_train_reserved_special_token_6|>",
+    "<|text_post_train_reserved_special_token_7|>",
     "<|finetune_right_pad|>",
 ] + get_reserved_special_tokens(
-    "text_post_train", 61, 6
+    "text_post_train", 61, 8
 )  # <|text_post_train_reserved_special_token_6|>, ..., <|text_post_train_reserved_special_token_66|>
 
 # 200080, ..., 201133
diff --git a/llama_stack/providers/inline/inference/meta_reference/generators.py b/llama_stack/providers/inline/inference/meta_reference/generators.py
index 65bed4d8c..34dd58a9a 100644
--- a/llama_stack/providers/inline/inference/meta_reference/generators.py
+++ b/llama_stack/providers/inline/inference/meta_reference/generators.py
@@ -259,7 +259,7 @@ class Llama3Generator:
         temperature, top_p = _infer_sampling_params(sampling_params)
 
         for result in self.inner_generator.generate(
-            llm_inputs=[self.formatter.encode_content(request.content)],
+            model_inputs=[self.formatter.encode_content(request.content)],
             max_gen_len=max_gen_len,
             temperature=temperature,
             top_p=top_p,
@@ -284,7 +284,7 @@ class Llama3Generator:
         temperature, top_p = _infer_sampling_params(sampling_params)
 
         for result in self.inner_generator.generate(
-            llm_inputs=[self.formatter.encode_dialog_prompt(request.messages, _infer_tool_prompt_format(request))],
+            model_inputs=[self.formatter.encode_dialog_prompt(request.messages, _infer_tool_prompt_format(request))],
             max_gen_len=max_gen_len,
             temperature=temperature,
             top_p=top_p,
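
Context for the tokenizer hunk: judging by the trailing comment in the diff, get_reserved_special_tokens(name, count, start) expands to `count` placeholder tokens named <|{name}_reserved_special_token_{i}|> for i in [start, start + count). Under that reading, the old call generated tokens 6 through 66, so adding tokens 6 and 7 to the explicit list requires bumping the start index to 8 to avoid duplicate token strings. A minimal sketch of that presumed helper follows; the real implementation lives elsewhere in the repo and may differ:

    def get_reserved_special_tokens(name: str, count: int, start_index: int = 0) -> list[str]:
        # Presumed behavior, inferred from the diff's trailing comment:
        # emit `count` placeholders <|{name}_reserved_special_token_{i}|>
        # for i in [start_index, start_index + count).
        return [f"<|{name}_reserved_special_token_{i}|>" for i in range(start_index, start_index + count)]

If the helper does work that way, the unchanged trailing comment still describes the old range (tokens 6..66); the new call would produce tokens 8..68. The generators.py hunks are a pure keyword-argument rename at the two call sites, llm_inputs -> model_inputs, presumably tracking a renamed parameter on inner_generator.generate.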