diff --git a/docs/my-website/docs/providers/bedrock.md b/docs/my-website/docs/providers/bedrock.md index ad2124676f..744be74c09 100644 --- a/docs/my-website/docs/providers/bedrock.md +++ b/docs/my-website/docs/providers/bedrock.md @@ -7,7 +7,7 @@ ALL Bedrock models (Anthropic, Meta, Deepseek, Mistral, Amazon, etc.) are Suppor | Property | Details | |-------|-------| | Description | Amazon Bedrock is a fully managed service that offers a choice of high-performing foundation models (FMs). | -| Provider Route on LiteLLM | `bedrock/`, [`bedrock/converse/`](#set-converse--invoke-route), [`bedrock/invoke/`](#set-invoke-route), [`bedrock/converse_like/`](#calling-via-internal-proxy), [`bedrock/llama/`](#bedrock-imported-models-deepseek) | +| Provider Route on LiteLLM | `bedrock/`, [`bedrock/converse/`](#set-converse--invoke-route), [`bedrock/invoke/`](#set-invoke-route), [`bedrock/converse_like/`](#calling-via-internal-proxy), [`bedrock/llama/`](#deepseek-not-r1), [`bedrock/deepseek_r1/`](#deepseek-r1) | | Provider Doc | [Amazon Bedrock ↗](https://docs.aws.amazon.com/bedrock/latest/userguide/what-is-bedrock.html) | | Supported OpenAI Endpoints | `/chat/completions`, `/completions`, `/embeddings`, `/images/generations` | | Pass-through Endpoint | [Supported](../pass_through/bedrock.md) | @@ -1277,13 +1277,83 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \ https://some-api-url/models ``` -## Bedrock Imported Models (Deepseek) +## Bedrock Imported Models (Deepseek, Deepseek R1) + +### Deepseek R1 + +This is a separate route, as the chat template is different. + +| Property | Details | +|----------|---------| +| Provider Route | `bedrock/deepseek_r1/{model_arn}` | +| Provider Documentation | [Bedrock Imported Models](https://docs.aws.amazon.com/bedrock/latest/userguide/model-customization-import-model.html), [Deepseek Bedrock Imported Model](https://aws.amazon.com/blogs/machine-learning/deploy-deepseek-r1-distilled-llama-models-with-amazon-bedrock-custom-model-import/) | + + + + +```python +from litellm import completion +import os + +response = completion( + model="bedrock/deepseek_r1/arn:aws:bedrock:us-east-1:086734376398:imported-model/r4c4kewx2s0n", # bedrock/deepseek_r1/{your-model-arn} + messages=[{"role": "user", "content": "Tell me a joke"}], +) +``` + + + + + + +**1. Add to config** + +```yaml +model_list: + - model_name: DeepSeek-R1-Distill-Llama-70B + litellm_params: + model: bedrock/deepseek_r1/arn:aws:bedrock:us-east-1:086734376398:imported-model/r4c4kewx2s0n + +``` + +**2. Start proxy** + +```bash +litellm --config /path/to/config.yaml + +# RUNNING at http://0.0.0.0:4000 +``` + +**3. 
Test it!** + +```bash +curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Authorization: Bearer sk-1234' \ + --header 'Content-Type: application/json' \ + --data '{ + "model": "DeepSeek-R1-Distill-Llama-70B", # 👈 the 'model_name' in config + "messages": [ + { + "role": "user", + "content": "what llm are you" + } + ], + }' +``` + + + + + +### Deepseek (not R1) | Property | Details | |----------|---------| | Provider Route | `bedrock/llama/{model_arn}` | | Provider Documentation | [Bedrock Imported Models](https://docs.aws.amazon.com/bedrock/latest/userguide/model-customization-import-model.html), [Deepseek Bedrock Imported Model](https://aws.amazon.com/blogs/machine-learning/deploy-deepseek-r1-distilled-llama-models-with-amazon-bedrock-custom-model-import/) | + + Use this route to call Bedrock Imported Models that follow the `llama` Invoke Request / Response spec diff --git a/litellm/__init__.py b/litellm/__init__.py index b8de8a4298..91457f9b04 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -52,6 +52,7 @@ from litellm.constants import ( open_ai_embedding_models, cohere_embedding_models, bedrock_embedding_models, + known_tokenizer_config, ) from litellm.types.guardrails import GuardrailItem from litellm.proxy._types import ( @@ -360,7 +361,15 @@ BEDROCK_CONVERSE_MODELS = [ "meta.llama3-2-90b-instruct-v1:0", ] BEDROCK_INVOKE_PROVIDERS_LITERAL = Literal[ - "cohere", "anthropic", "mistral", "amazon", "meta", "llama", "ai21", "nova" + "cohere", + "anthropic", + "mistral", + "amazon", + "meta", + "llama", + "ai21", + "nova", + "deepseek_r1", ] ####### COMPLETION MODELS ################### open_ai_chat_completion_models: List = [] diff --git a/litellm/constants.py b/litellm/constants.py index 997b664f50..8d5cc2361a 100644 --- a/litellm/constants.py +++ b/litellm/constants.py @@ -335,6 +335,63 @@ bedrock_embedding_models: List = [ "cohere.embed-multilingual-v3", ] +known_tokenizer_config = { + "mistralai/Mistral-7B-Instruct-v0.1": { + "tokenizer": { + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", + "bos_token": "", + "eos_token": "", + }, + "status": "success", + }, + "meta-llama/Meta-Llama-3-8B-Instruct": { + "tokenizer": { + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}", + "bos_token": "<|begin_of_text|>", + "eos_token": "", + }, + "status": "success", + }, + "deepseek-r1/deepseek-r1-7b-instruct": { + "tokenizer": { + "add_bos_token": True, + "add_eos_token": False, + "bos_token": { + "__type": "AddedToken", + "content": "<|begin▁of▁sentence|>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False, + }, + "clean_up_tokenization_spaces": False, + "eos_token": { + "__type": "AddedToken", + "content": "<|end▁of▁sentence|>", + "lstrip": False, + "normalized": True, + 
"rstrip": False, + "single_word": False, + }, + "legacy": True, + "model_max_length": 16384, + "pad_token": { + "__type": "AddedToken", + "content": "<|end▁of▁sentence|>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False, + }, + "sp_model_kwargs": {}, + "unk_token": None, + "tokenizer_class": "LlamaTokenizerFast", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", + }, + "status": "success", + }, +} + OPENAI_FINISH_REASONS = ["stop", "length", "function_call", "content_filter", "null"] HUMANLOOP_PROMPT_CACHE_TTL_SECONDS = 60 # 1 minute diff --git a/litellm/litellm_core_utils/exception_mapping_utils.py b/litellm/litellm_core_utils/exception_mapping_utils.py index 648330241e..9ac20de4c0 100644 --- a/litellm/litellm_core_utils/exception_mapping_utils.py +++ b/litellm/litellm_core_utils/exception_mapping_utils.py @@ -223,6 +223,7 @@ def exception_type( # type: ignore # noqa: PLR0915 "Request Timeout Error" in error_str or "Request timed out" in error_str or "Timed out generating response" in error_str + or "The read operation timed out" in error_str ): exception_mapping_worked = True diff --git a/litellm/litellm_core_utils/prompt_templates/factory.py b/litellm/litellm_core_utils/prompt_templates/factory.py index 1ed072e086..bf2153a878 100644 --- a/litellm/litellm_core_utils/prompt_templates/factory.py +++ b/litellm/litellm_core_utils/prompt_templates/factory.py @@ -325,26 +325,6 @@ def phind_codellama_pt(messages): return prompt -known_tokenizer_config 
= { - "mistralai/Mistral-7B-Instruct-v0.1": { - "tokenizer": { - "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", - "bos_token": "", - "eos_token": "", - }, - "status": "success", - }, - "meta-llama/Meta-Llama-3-8B-Instruct": { - "tokenizer": { - "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}", - "bos_token": "<|begin_of_text|>", - "eos_token": "", - }, - "status": "success", - }, -} - - def hf_chat_template( # noqa: PLR0915 model: str, messages: list, chat_template: Optional[Any] = None ): @@ -378,11 +358,11 @@ def hf_chat_template( # noqa: PLR0915 else: return {"status": "failure"} - if model in known_tokenizer_config: - tokenizer_config = known_tokenizer_config[model] + if model in litellm.known_tokenizer_config: + tokenizer_config = litellm.known_tokenizer_config[model] else: tokenizer_config = _get_tokenizer_config(model) - known_tokenizer_config.update({model: tokenizer_config}) + litellm.known_tokenizer_config.update({model: tokenizer_config}) if ( tokenizer_config["status"] == "failure" @@ -475,6 +455,12 @@ def hf_chat_template( # noqa: PLR0915 ) # don't use verbose_logger.exception, if exception is raised +def deepseek_r1_pt(messages): + return hf_chat_template( + model="deepseek-r1/deepseek-r1-7b-instruct", messages=messages + ) + + # Anthropic template def claude_2_1_pt( messages: list, diff --git a/litellm/llms/azure/chat/gpt_transformation.py b/litellm/llms/azure/chat/gpt_transformation.py index b117583bd0..7aa4fffab5 100644 --- a/litellm/llms/azure/chat/gpt_transformation.py +++ b/litellm/llms/azure/chat/gpt_transformation.py @@ -98,6 +98,7 @@ class AzureOpenAIConfig(BaseConfig): "seed", "extra_headers", "parallel_tool_calls", + "prediction", ] def _is_response_format_supported_model(self, model: str) -> bool: diff --git a/litellm/llms/bedrock/chat/invoke_handler.py b/litellm/llms/bedrock/chat/invoke_handler.py index 43fdc061e7..acee2f8ac5 100644 --- a/litellm/llms/bedrock/chat/invoke_handler.py +++ b/litellm/llms/bedrock/chat/invoke_handler.py @@ -1,5 +1,5 @@ """ -Manages calling Bedrock's `/converse` API + `/invoke` API +TODO: DELETE FILE. Bedrock LLM is no longer used. 
Goto `litellm/llms/bedrock/chat/invoke_transformations/base_invoke_transformation.py` """ import copy diff --git a/litellm/llms/bedrock/chat/invoke_transformations/base_invoke_transformation.py b/litellm/llms/bedrock/chat/invoke_transformations/base_invoke_transformation.py index 5eb006f6ca..a080e55bb3 100644 --- a/litellm/llms/bedrock/chat/invoke_transformations/base_invoke_transformation.py +++ b/litellm/llms/bedrock/chat/invoke_transformations/base_invoke_transformation.py @@ -14,6 +14,7 @@ from litellm.litellm_core_utils.logging_utils import track_llm_api_timing from litellm.litellm_core_utils.prompt_templates.factory import ( cohere_message_pt, custom_prompt, + deepseek_r1_pt, prompt_factory, ) from litellm.llms.base_llm.chat.transformation import BaseConfig, BaseLLMException @@ -178,11 +179,15 @@ class AmazonInvokeConfig(BaseConfig, BaseAWSLLM): ## SETUP ## stream = optional_params.pop("stream", None) custom_prompt_dict: dict = litellm_params.pop("custom_prompt_dict", None) or {} + hf_model_name = litellm_params.get("hf_model_name", None) provider = self.get_bedrock_invoke_provider(model) prompt, chat_history = self.convert_messages_to_prompt( - model, messages, provider, custom_prompt_dict + model=hf_model_name or model, + messages=messages, + provider=provider, + custom_prompt_dict=custom_prompt_dict, ) inference_params = copy.deepcopy(optional_params) inference_params = { @@ -266,7 +271,7 @@ class AmazonInvokeConfig(BaseConfig, BaseAWSLLM): "inputText": prompt, "textGenerationConfig": inference_params, } - elif provider == "meta" or provider == "llama": + elif provider == "meta" or provider == "llama" or provider == "deepseek_r1": ## LOAD CONFIG config = litellm.AmazonLlamaConfig.get_config() for k, v in config.items(): @@ -351,7 +356,7 @@ class AmazonInvokeConfig(BaseConfig, BaseAWSLLM): outputText = ( completion_response.get("completions")[0].get("data").get("text") ) - elif provider == "meta" or provider == "llama": + elif provider == "meta" or provider == "llama" or provider == "deepseek_r1": outputText = completion_response["generation"] elif provider == "mistral": outputText = completion_response["outputs"][0]["text"] @@ -664,6 +669,8 @@ class AmazonInvokeConfig(BaseConfig, BaseAWSLLM): ) elif provider == "cohere": prompt, chat_history = cohere_message_pt(messages=messages) + elif provider == "deepseek_r1": + prompt = deepseek_r1_pt(messages=messages) else: prompt = "" for message in messages: diff --git a/litellm/main.py b/litellm/main.py index 8326140fab..14e9f45d1e 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -215,7 +215,6 @@ azure_audio_transcriptions = AzureAudioTranscription() huggingface = Huggingface() predibase_chat_completions = PredibaseChatCompletion() codestral_text_completions = CodestralTextCompletion() -bedrock_chat_completion = BedrockLLM() bedrock_converse_chat_completion = BedrockConverseLLM() bedrock_embedding = BedrockEmbedding() bedrock_image_generation = BedrockImageGeneration() @@ -3947,7 +3946,7 @@ async def atext_completion( ), model=model, custom_llm_provider=custom_llm_provider, - stream_options=kwargs.get('stream_options'), + stream_options=kwargs.get("stream_options"), ) else: ## OpenAI / Azure Text Completion Returns here diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 30912186e5..f6fa0c5b9d 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -3658,6 +3658,42 @@ "source": 
"https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models", "supports_tool_choice": true }, + "gemini-2.0-pro-exp-02-05": { + "max_tokens": 8192, + "max_input_tokens": 2097152, + "max_output_tokens": 8192, + "max_images_per_prompt": 3000, + "max_videos_per_prompt": 10, + "max_video_length": 1, + "max_audio_length_hours": 8.4, + "max_audio_per_prompt": 1, + "max_pdf_size_mb": 30, + "input_cost_per_image": 0, + "input_cost_per_video_per_second": 0, + "input_cost_per_audio_per_second": 0, + "input_cost_per_token": 0, + "input_cost_per_character": 0, + "input_cost_per_token_above_128k_tokens": 0, + "input_cost_per_character_above_128k_tokens": 0, + "input_cost_per_image_above_128k_tokens": 0, + "input_cost_per_video_per_second_above_128k_tokens": 0, + "input_cost_per_audio_per_second_above_128k_tokens": 0, + "output_cost_per_token": 0, + "output_cost_per_character": 0, + "output_cost_per_token_above_128k_tokens": 0, + "output_cost_per_character_above_128k_tokens": 0, + "litellm_provider": "vertex_ai-language-models", + "mode": "chat", + "supports_system_messages": true, + "supports_function_calling": true, + "supports_vision": true, + "supports_audio_input": true, + "supports_video_input": true, + "supports_pdf_input": true, + "supports_response_schema": true, + "supports_tool_choice": true, + "source": "https://cloud.google.com/vertex-ai/generative-ai/pricing" + }, "gemini-2.0-flash-exp": { "max_tokens": 8192, "max_input_tokens": 1048576, diff --git a/litellm/utils.py b/litellm/utils.py index 34a5dc596c..58ce9dcaf2 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -5194,9 +5194,10 @@ def _calculate_retry_after( # custom prompt helper function def register_prompt_template( model: str, - roles: dict, + roles: dict = {}, initial_prompt_value: str = "", final_prompt_value: str = "", + tokenizer_config: dict = {}, ): """ Register a prompt template to follow your custom format for a given model @@ -5233,12 +5234,27 @@ def register_prompt_template( ) ``` """ - model = get_llm_provider(model=model)[0] - litellm.custom_prompt_dict[model] = { - "roles": roles, - "initial_prompt_value": initial_prompt_value, - "final_prompt_value": final_prompt_value, - } + complete_model = model + potential_models = [complete_model] + try: + model = get_llm_provider(model=model)[0] + potential_models.append(model) + except Exception: + pass + if tokenizer_config: + for m in potential_models: + litellm.known_tokenizer_config[m] = { + "tokenizer": tokenizer_config, + "status": "success", + } + else: + for m in potential_models: + litellm.custom_prompt_dict[m] = { + "roles": roles, + "initial_prompt_value": initial_prompt_value, + "final_prompt_value": final_prompt_value, + } + return litellm.custom_prompt_dict diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index 30912186e5..f6fa0c5b9d 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -3658,6 +3658,42 @@ "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models", "supports_tool_choice": true }, + "gemini-2.0-pro-exp-02-05": { + "max_tokens": 8192, + "max_input_tokens": 2097152, + "max_output_tokens": 8192, + "max_images_per_prompt": 3000, + "max_videos_per_prompt": 10, + "max_video_length": 1, + "max_audio_length_hours": 8.4, + "max_audio_per_prompt": 1, + "max_pdf_size_mb": 30, + "input_cost_per_image": 0, + "input_cost_per_video_per_second": 0, + "input_cost_per_audio_per_second": 0, + "input_cost_per_token": 0, 
+ "input_cost_per_character": 0, + "input_cost_per_token_above_128k_tokens": 0, + "input_cost_per_character_above_128k_tokens": 0, + "input_cost_per_image_above_128k_tokens": 0, + "input_cost_per_video_per_second_above_128k_tokens": 0, + "input_cost_per_audio_per_second_above_128k_tokens": 0, + "output_cost_per_token": 0, + "output_cost_per_character": 0, + "output_cost_per_token_above_128k_tokens": 0, + "output_cost_per_character_above_128k_tokens": 0, + "litellm_provider": "vertex_ai-language-models", + "mode": "chat", + "supports_system_messages": true, + "supports_function_calling": true, + "supports_vision": true, + "supports_audio_input": true, + "supports_video_input": true, + "supports_pdf_input": true, + "supports_response_schema": true, + "supports_tool_choice": true, + "source": "https://cloud.google.com/vertex-ai/generative-ai/pricing" + }, "gemini-2.0-flash-exp": { "max_tokens": 8192, "max_input_tokens": 1048576, diff --git a/tests/llm_translation/test_optional_params.py b/tests/llm_translation/test_optional_params.py index e7f2f8ac28..01c751e146 100644 --- a/tests/llm_translation/test_optional_params.py +++ b/tests/llm_translation/test_optional_params.py @@ -1069,6 +1069,21 @@ def test_gemini_frequency_penalty(): assert optional_params["frequency_penalty"] == 0.5 + +def test_azure_prediction_param(): + optional_params = get_optional_params( + model="chatgpt-v2", + custom_llm_provider="azure", + prediction={ + "type": "content", + "content": "LiteLLM is a very useful way to connect to a variety of LLMs.", + }, + ) + assert optional_params["prediction"] == { + "type": "content", + "content": "LiteLLM is a very useful way to connect to a variety of LLMs.", + } + def test_vertex_ai_ft_llama(): optional_params = get_optional_params( model="1984786713414729728", diff --git a/tests/llm_translation/test_prompt_factory.py b/tests/llm_translation/test_prompt_factory.py index e6fc69d7d7..3a3675442f 100644 --- a/tests/llm_translation/test_prompt_factory.py +++ b/tests/llm_translation/test_prompt_factory.py @@ -708,3 +708,60 @@ def test_convert_generic_image_chunk_to_openai_image_obj(): url_str = convert_generic_image_chunk_to_openai_image_obj(image_obj) image_obj = convert_to_anthropic_image_obj(url_str) print(image_obj) + + +def test_hf_chat_template(): + from litellm.litellm_core_utils.prompt_templates.factory import ( + hf_chat_template, + ) + + model = "llama/arn:aws:bedrock:us-east-1:1234:imported-model/45d34re" + litellm.register_prompt_template( + model=model, + tokenizer_config={ + "add_bos_token": True, + "add_eos_token": False, + "bos_token": { + "__type": "AddedToken", + "content": "<|begin▁of▁sentence|>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False, + }, + "clean_up_tokenization_spaces": False, + "eos_token": { + "__type": "AddedToken", + "content": "<|end▁of▁sentence|>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False, + }, + "legacy": True, + "model_max_length": 16384, + "pad_token": { + "__type": "AddedToken", + "content": "<|end▁of▁sentence|>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False, + }, + "sp_model_kwargs": {}, + "unk_token": None, + "tokenizer_class": "LlamaTokenizerFast", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set 
ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", + }, + ) + + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the weather in Copenhagen?"}, + ] + chat_template = hf_chat_template(model=model, messages=messages) + print(chat_template) + assert ( + chat_template.rstrip() + == """<|begin▁of▁sentence|>You are a helpful assistant.<|User|>What is the weather in Copenhagen?<|Assistant|>""" + ) diff --git a/tests/local_testing/test_completion.py b/tests/local_testing/test_completion.py index f367f8dc03..819dea8f93 100644 --- a/tests/local_testing/test_completion.py +++ b/tests/local_testing/test_completion.py @@ -3242,6 +3242,121 @@ def test_replicate_custom_prompt_dict(): litellm.custom_prompt_dict = {} # reset +def test_bedrock_deepseek_custom_prompt_dict(): + model = "llama/arn:aws:bedrock:us-east-1:1234:imported-model/45d34re" + litellm.register_prompt_template( + model=model, + tokenizer_config={ + "add_bos_token": True, + "add_eos_token": False, + "bos_token": { + "__type": "AddedToken", + "content": "<|begin▁of▁sentence|>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False, + }, + "clean_up_tokenization_spaces": False, + "eos_token": { + "__type": "AddedToken", + "content": "<|end▁of▁sentence|>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False, + }, + "legacy": True, + "model_max_length": 16384, + "pad_token": { + "__type": "AddedToken", + "content": "<|end▁of▁sentence|>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False, + }, + "sp_model_kwargs": {}, + "unk_token": None, + "tokenizer_class": "LlamaTokenizerFast", + "chat_template": "{% if not add_generation_prompt is defined %}{% set 
add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", + }, + ) + assert model in litellm.known_tokenizer_config + from litellm.llms.custom_httpx.http_handler import HTTPHandler + + client = HTTPHandler() + + messages = [ + {"role": "system", "content": "You are a good assistant"}, + {"role": "user", "content": "What is the weather in Copenhagen?"}, + ] + + with patch.object(client, "post") as mock_post: + try: + completion( + model="bedrock/" + model, + messages=messages, + client=client, + ) + except Exception as e: + pass + + mock_post.assert_called_once() + print(mock_post.call_args.kwargs) + json_data = json.loads(mock_post.call_args.kwargs["data"]) + assert ( + json_data["prompt"].rstrip() + == """<|begin▁of▁sentence|>You are a good assistant<|User|>What is the weather in Copenhagen?<|Assistant|>""" + ) + + +def test_bedrock_deepseek_known_tokenizer_config(): + model = "deepseek_r1/arn:aws:bedrock:us-east-1:1234:imported-model/45d34re" + from litellm.llms.custom_httpx.http_handler import HTTPHandler + from unittest.mock import Mock + import httpx + + mock_response = Mock(spec=httpx.Response) + mock_response.status_code = 200 + mock_response.headers = { + "x-amzn-bedrock-input-token-count": "20", + "x-amzn-bedrock-output-token-count": "30", + } + + # The response format for deepseek_r1 + response_data = { + "generation": "The weather in Copenhagen is currently sunny with a temperature of 20°C (68°F). 
The forecast shows clear skies throughout the day with a gentle breeze from the northwest.", + "stop_reason": "stop", + "stop_sequence": None, + } + + mock_response.json.return_value = response_data + mock_response.text = json.dumps(response_data) + + client = HTTPHandler() + + messages = [ + {"role": "system", "content": "You are a good assistant"}, + {"role": "user", "content": "What is the weather in Copenhagen?"}, + ] + + with patch.object(client, "post", return_value=mock_response) as mock_post: + completion( + model="bedrock/" + model, + messages=messages, + client=client, + ) + + mock_post.assert_called_once() + print(mock_post.call_args.kwargs) + json_data = json.loads(mock_post.call_args.kwargs["data"]) + assert ( + json_data["prompt"].rstrip() + == """<|begin▁of▁sentence|>You are a good assistant<|User|>What is the weather in Copenhagen?<|Assistant|>""" + ) + + # test_replicate_custom_prompt_dict() # commenthing this out since we won't be always testing a custom, replicate deployment
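
Usage sketch: a minimal end-to-end illustration of the two pieces this patch adds — the `bedrock/deepseek_r1/` invoke route and the new `tokenizer_config` argument on `litellm.register_prompt_template` (which populates `litellm.known_tokenizer_config`). The model ARNs below are placeholders, AWS credentials are assumed to already be configured in the environment, and the chat template in step 2 is a simplified stand-in for the full R1 template registered in the tests above.

```python
import litellm
from litellm import completion

# 1) Deepseek R1 imported model: the new `bedrock/deepseek_r1/` route renders the
#    prompt with the built-in R1 chat template from litellm.known_tokenizer_config.
#    (placeholder ARN - substitute your own imported-model ARN)
response = completion(
    model="bedrock/deepseek_r1/arn:aws:bedrock:us-east-1:123456789012:imported-model/your-model-id",
    messages=[{"role": "user", "content": "Tell me a joke"}],
)
print(response.choices[0].message.content)

# 2) Other imported models on the `bedrock/llama/` route: registering a Hugging Face
#    style tokenizer_config makes the Invoke prompt render from its chat_template,
#    mirroring test_bedrock_deepseek_custom_prompt_dict above. The template here is
#    deliberately simplified for illustration.
model = "llama/arn:aws:bedrock:us-east-1:123456789012:imported-model/your-model-id"  # placeholder ARN
litellm.register_prompt_template(
    model=model,
    tokenizer_config={
        "bos_token": "<|begin▁of▁sentence|>",
        "eos_token": "<|end▁of▁sentence|>",
        "chat_template": (
            "{{ bos_token }}"
            "{% for message in messages %}"
            "{% if message['role'] == 'user' %}{{ '<|User|>' + message['content'] }}"
            "{% else %}{{ '<|Assistant|>' + message['content'] }}{% endif %}"
            "{% endfor %}{{ '<|Assistant|>' }}"
        ),
    },
)
response = completion(
    model="bedrock/" + model,
    messages=[{"role": "user", "content": "What is the weather in Copenhagen?"}],
)
```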