feat(groq/): add response_format support for groq
Closes https://github.com/BerriAI/litellm/issues/6845
This commit is contained in:
parent d7f9999cef
commit 42beb618ae

10 changed files with 275 additions and 82 deletions
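For context, this change lets callers request structured JSON output from Groq models through litellm's standard response_format parameter. A minimal sketch of the intended usage is below; the model name, schema, and prompt are illustrative (and a GROQ_API_KEY would be needed), not taken from the commit:

    import litellm

    # Groq has no native schema-constrained response_format, so (per this commit)
    # litellm translates the schema into a forced "json_tool_call" tool call and
    # converts the returned tool arguments back into message.content.
    resp = litellm.completion(
        model="groq/llama3-70b-8192",
        messages=[{"role": "user", "content": "Give me the capital of France."}],
        response_format={
            "type": "json_schema",
            "json_schema": {
                "schema": {
                    "type": "object",
                    "properties": {"capital": {"type": "string"}},
                    "required": ["capital"],
                }
            },
        },
    )
    print(resp.choices[0].message.content)  # expected: a JSON string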
@@ -2,6 +2,7 @@
 Translate from OpenAI's `/v1/chat/completions` to Groq's `/v1/chat/completions`
 """
 
+import json
 import types
 from typing import List, Optional, Tuple, Union
 
@@ -9,7 +10,12 @@ from pydantic import BaseModel
 
 import litellm
 from litellm.secret_managers.main import get_secret_str
-from litellm.types.llms.openai import AllMessageValues, ChatCompletionAssistantMessage
+from litellm.types.llms.openai import (
+    AllMessageValues,
+    ChatCompletionAssistantMessage,
+    ChatCompletionToolParam,
+    ChatCompletionToolParamFunctionChunk,
+)
 
 from ...OpenAI.chat.gpt_transformation import OpenAIGPTConfig
 
@@ -108,3 +114,60 @@ class GroqChatConfig(OpenAIGPTConfig):
             return True
 
         return False
+
+    def _create_json_tool_call_for_response_format(
+        self,
+        json_schema: dict,
+    ):
+        """
+        Handles creating a tool call for getting responses in JSON format.
+
+        Args:
+            json_schema (Optional[dict]): The JSON schema the response should be in
+
+        Returns:
+            ChatCompletionToolParam: The tool call to send to Groq to get responses in JSON format
+        """
+        return ChatCompletionToolParam(
+            type="function",
+            function=ChatCompletionToolParamFunctionChunk(
+                name="json_tool_call",
+                parameters=json_schema,
+            ),
+        )
+
+    def map_openai_params(
+        self,
+        non_default_params: dict,
+        optional_params: dict,
+        model: str,
+        drop_params: bool = False,
+    ) -> dict:
+        _response_format = non_default_params.get("response_format")
+        if _response_format is not None and isinstance(_response_format, dict):
+            json_schema: Optional[dict] = None
+            if "response_schema" in _response_format:
+                json_schema = _response_format["response_schema"]
+            elif "json_schema" in _response_format:
+                json_schema = _response_format["json_schema"]["schema"]
+            """
+            When using tools in this way: - https://docs.anthropic.com/en/docs/build-with-claude/tool-use#json-mode
+            - You usually want to provide a single tool
+            - You should set tool_choice (see Forcing tool use) to instruct the model to explicitly use that tool
+            - Remember that the model will pass the input to the tool, so the name of the tool and description should be from the model's perspective.
+            """
+            if json_schema is not None:
+                _tool_choice = {
+                    "type": "function",
+                    "function": {"name": "json_tool_call"},
+                }
+                _tool = self._create_json_tool_call_for_response_format(
+                    json_schema=json_schema,
+                )
+                optional_params["tools"] = [_tool]
+                optional_params["tool_choice"] = _tool_choice
+                optional_params["json_mode"] = True
+            non_default_params.pop("response_format", None)
+        return super().map_openai_params(
+            non_default_params, optional_params, model, drop_params
+        )
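To make the translation concrete, here is a small illustrative sketch (not part of the commit) of what the new map_openai_params is expected to do with an OpenAI-style response_format: the schema is lifted out and re-expressed as a single forced tool call plus a json_mode flag:

    # Hypothetical input, as a caller would pass it through litellm:
    non_default_params = {
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "schema": {"type": "object", "properties": {"capital": {"type": "string"}}}
            },
        }
    }

    # Roughly what GroqChatConfig.map_openai_params should produce:
    optional_params = {
        "tools": [
            {
                "type": "function",
                "function": {
                    "name": "json_tool_call",
                    "parameters": {"type": "object", "properties": {"capital": {"type": "string"}}},
                },
            }
        ],
        "tool_choice": {"type": "function", "function": {"name": "json_tool_call"}},
        "json_mode": True,
    }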
@@ -39,6 +39,7 @@ from litellm.utils import (
 )
 
 from ..common_utils import OpenAILikeBase, OpenAILikeError
+from .transformation import OpenAILikeChatConfig
 
 
 async def make_call(
@@ -190,6 +191,7 @@ class OpenAILikeChatHandler(OpenAILikeBase):
         logger_fn=None,
         headers={},
         timeout: Optional[Union[float, httpx.Timeout]] = None,
+        json_mode: bool = False,
     ) -> ModelResponse:
         if timeout is None:
             timeout = httpx.Timeout(timeout=600.0, connect=5.0)
@@ -202,8 +204,6 @@ class OpenAILikeChatHandler(OpenAILikeBase):
                 api_base, headers=headers, data=json.dumps(data), timeout=timeout
             )
             response.raise_for_status()
-
-            response_json = response.json()
         except httpx.HTTPStatusError as e:
             raise OpenAILikeError(
                 status_code=e.response.status_code,
@@ -214,19 +214,22 @@ class OpenAILikeChatHandler(OpenAILikeBase):
         except Exception as e:
             raise OpenAILikeError(status_code=500, message=str(e))
 
-        logging_obj.post_call(
-            input=messages,
-            api_key="",
-            original_response=response_json,
-            additional_args={"complete_input_dict": data},
+        return OpenAILikeChatConfig._transform_response(
+            model=model,
+            response=response,
+            model_response=model_response,
+            stream=stream,
+            logging_obj=logging_obj,
+            optional_params=optional_params,
+            api_key=api_key,
+            data=data,
+            messages=messages,
+            print_verbose=print_verbose,
+            encoding=encoding,
+            json_mode=json_mode,
+            custom_llm_provider=custom_llm_provider,
+            base_model=base_model,
         )
-        response = ModelResponse(**response_json)
-
-        response.model = custom_llm_provider + "/" + (response.model or "")
-
-        if base_model is not None:
-            response._hidden_params["model"] = base_model
-        return response
 
     def completion(
         self,
@@ -268,6 +271,7 @@ class OpenAILikeChatHandler(OpenAILikeBase):
 
         stream: bool = optional_params.pop("stream", None) or False
         extra_body = optional_params.pop("extra_body", {})
+        json_mode = optional_params.pop("json_mode", None)
         if not fake_stream:
             optional_params["stream"] = stream
 
@@ -390,17 +394,19 @@ class OpenAILikeChatHandler(OpenAILikeBase):
             )
         except Exception as e:
             raise OpenAILikeError(status_code=500, message=str(e))
-        logging_obj.post_call(
-            input=messages,
-            api_key="",
-            original_response=response_json,
-            additional_args={"complete_input_dict": data},
+        return OpenAILikeChatConfig._transform_response(
+            model=model,
+            response=response,
+            model_response=model_response,
+            stream=stream,
+            logging_obj=logging_obj,
+            optional_params=optional_params,
+            api_key=api_key,
+            data=data,
+            messages=messages,
+            print_verbose=print_verbose,
+            encoding=encoding,
+            json_mode=json_mode,
+            custom_llm_provider=custom_llm_provider,
+            base_model=base_model,
         )
-        response = ModelResponse(**response_json)
-
-        response.model = custom_llm_provider + "/" + (response.model or "")
-
-        if base_model is not None:
-            response._hidden_params["model"] = base_model
-
-        return response
litellm/llms/openai_like/chat/transformation.py (new file, 98 lines)

@@ -0,0 +1,98 @@
+"""
+OpenAI-like chat completion transformation
+"""
+
+import types
+from typing import List, Optional, Tuple, Union
+
+import httpx
+from pydantic import BaseModel
+
+import litellm
+from litellm.secret_managers.main import get_secret_str
+from litellm.types.llms.openai import AllMessageValues, ChatCompletionAssistantMessage
+from litellm.types.utils import ModelResponse
+
+from ....utils import _remove_additional_properties, _remove_strict_from_schema
+from ...OpenAI.chat.gpt_transformation import OpenAIGPTConfig
+
+
+class OpenAILikeChatConfig(OpenAIGPTConfig):
+    def _get_openai_compatible_provider_info(
+        self, api_base: Optional[str], api_key: Optional[str]
+    ) -> Tuple[Optional[str], Optional[str]]:
+        api_base = api_base or get_secret_str("OPENAI_LIKE_API_BASE")  # type: ignore
+        dynamic_api_key = (
+            api_key or get_secret_str("OPENAI_LIKE_API_KEY") or ""
+        )  # vllm does not require an api key
+        return api_base, dynamic_api_key
+
+    @staticmethod
+    def _convert_tool_response_to_message(
+        message: ChatCompletionAssistantMessage, json_mode: bool
+    ) -> ChatCompletionAssistantMessage:
+        """
+        if json_mode is true, convert the returned tool call response to a content with json str
+
+        e.g. input:
+
+        {"role": "assistant", "tool_calls": [{"id": "call_5ms4", "type": "function", "function": {"name": "json_tool_call", "arguments": "{\"key\": \"question\", \"value\": \"What is the capital of France?\"}"}}]}
+
+        output:
+
+        {"role": "assistant", "content": "{\"key\": \"question\", \"value\": \"What is the capital of France?\"}"}
+        """
+        if not json_mode:
+            return message
+
+        _tool_calls = message.get("tool_calls")
+
+        if _tool_calls is None or len(_tool_calls) != 1:
+            return message
+
+        message["content"] = _tool_calls[0]["function"].get("arguments") or ""
+        message["tool_calls"] = None
+
+        return message
+
+    @staticmethod
+    def _transform_response(
+        model: str,
+        response: httpx.Response,
+        model_response: ModelResponse,
+        stream: bool,
+        logging_obj: litellm.litellm_core_utils.litellm_logging.Logging,  # type: ignore
+        optional_params: dict,
+        api_key: Optional[str],
+        data: Union[dict, str],
+        messages: List,
+        print_verbose,
+        encoding,
+        json_mode: bool,
+        custom_llm_provider: str,
+        base_model: Optional[str],
+    ) -> ModelResponse:
+        response_json = response.json()
+        logging_obj.post_call(
+            input=messages,
+            api_key="",
+            original_response=response_json,
+            additional_args={"complete_input_dict": data},
+        )
+
+        if json_mode:
+            for choice in response_json["choices"]:
+                message = OpenAILikeChatConfig._convert_tool_response_to_message(
+                    choice.get("message"), json_mode
+                )
+                choice["message"] = message
+
+        returned_response = ModelResponse(**response_json)
+
+        returned_response.model = (
+            custom_llm_provider + "/" + (returned_response.model or "")
+        )
+
+        if base_model is not None:
+            returned_response._hidden_params["model"] = base_model
+        return returned_response
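As a quick illustration of the json_mode post-processing above (not part of the commit), the conversion collapses a single "json_tool_call" tool call back into plain message content, so callers see a JSON string instead of a tool call. A minimal standalone sketch of the same transformation applied to a plain dict:

    # Illustrative only: mirrors the shape of
    # OpenAILikeChatConfig._convert_tool_response_to_message on a plain dict.
    message = {
        "role": "assistant",
        "content": None,
        "tool_calls": [
            {
                "id": "call_5ms4",
                "type": "function",
                "function": {
                    "name": "json_tool_call",
                    "arguments": '{"capital": "Paris"}',
                },
            }
        ],
    }

    tool_calls = message.get("tool_calls")
    if tool_calls is not None and len(tool_calls) == 1:
        # Promote the tool arguments to content and drop the tool call.
        message["content"] = tool_calls[0]["function"].get("arguments") or ""
        message["tool_calls"] = None

    print(message["content"])  # -> {"capital": "Paris"}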
@@ -1495,7 +1495,6 @@ def completion(  # type: ignore # noqa: PLR0915
             timeout=timeout,  # type: ignore
             custom_prompt_dict=custom_prompt_dict,
             client=client,  # pass AsyncOpenAI, OpenAI client
-            organization=organization,
             custom_llm_provider=custom_llm_provider,
             encoding=encoding,
         )
@@ -1745,7 +1745,8 @@
         "output_cost_per_token": 0.00000080,
         "litellm_provider": "groq",
         "mode": "chat",
-        "supports_function_calling": true
+        "supports_function_calling": true,
+        "supports_response_schema": true
     },
     "groq/llama3-8b-8192": {
         "max_tokens": 8192,
@@ -1755,7 +1756,8 @@
         "output_cost_per_token": 0.00000008,
         "litellm_provider": "groq",
         "mode": "chat",
-        "supports_function_calling": true
+        "supports_function_calling": true,
+        "supports_response_schema": true
     },
     "groq/llama3-70b-8192": {
         "max_tokens": 8192,
@@ -1765,7 +1767,8 @@
         "output_cost_per_token": 0.00000079,
         "litellm_provider": "groq",
         "mode": "chat",
-        "supports_function_calling": true
+        "supports_function_calling": true,
+        "supports_response_schema": true
     },
     "groq/llama-3.1-8b-instant": {
         "max_tokens": 8192,
@@ -1775,7 +1778,8 @@
         "output_cost_per_token": 0.00000008,
         "litellm_provider": "groq",
        "mode": "chat",
-        "supports_function_calling": true
+        "supports_function_calling": true,
+        "supports_response_schema": true
     },
     "groq/llama-3.1-70b-versatile": {
         "max_tokens": 8192,
@@ -1785,7 +1789,8 @@
         "output_cost_per_token": 0.00000079,
         "litellm_provider": "groq",
         "mode": "chat",
-        "supports_function_calling": true
+        "supports_function_calling": true,
+        "supports_response_schema": true
     },
     "groq/llama-3.1-405b-reasoning": {
         "max_tokens": 8192,
@@ -1795,7 +1800,8 @@
         "output_cost_per_token": 0.00000079,
         "litellm_provider": "groq",
         "mode": "chat",
-        "supports_function_calling": true
+        "supports_function_calling": true,
+        "supports_response_schema": true
     },
     "groq/mixtral-8x7b-32768": {
         "max_tokens": 32768,
@@ -1805,7 +1811,8 @@
         "output_cost_per_token": 0.00000024,
         "litellm_provider": "groq",
         "mode": "chat",
-        "supports_function_calling": true
+        "supports_function_calling": true,
+        "supports_response_schema": true
     },
     "groq/gemma-7b-it": {
         "max_tokens": 8192,
@@ -1815,7 +1822,8 @@
         "output_cost_per_token": 0.00000007,
         "litellm_provider": "groq",
         "mode": "chat",
-        "supports_function_calling": true
+        "supports_function_calling": true,
+        "supports_response_schema": true
     },
     "groq/gemma2-9b-it": {
         "max_tokens": 8192,
@@ -1825,7 +1833,8 @@
         "output_cost_per_token": 0.00000020,
         "litellm_provider": "groq",
         "mode": "chat",
-        "supports_function_calling": true
+        "supports_function_calling": true,
+        "supports_response_schema": true
     },
     "groq/llama3-groq-70b-8192-tool-use-preview": {
         "max_tokens": 8192,
@@ -1835,7 +1844,8 @@
         "output_cost_per_token": 0.00000089,
         "litellm_provider": "groq",
         "mode": "chat",
-        "supports_function_calling": true
+        "supports_function_calling": true,
+        "supports_response_schema": true
     },
     "groq/llama3-groq-8b-8192-tool-use-preview": {
         "max_tokens": 8192,
@@ -1845,7 +1855,8 @@
         "output_cost_per_token": 0.00000019,
         "litellm_provider": "groq",
         "mode": "chat",
-        "supports_function_calling": true
+        "supports_function_calling": true,
+        "supports_response_schema": true
     },
     "cerebras/llama3.1-8b": {
         "max_tokens": 128000,
@@ -1739,15 +1739,15 @@ def supports_response_schema(model: str, custom_llm_provider: Optional[str]) ->
 
     Does not raise error. Defaults to 'False'. Outputs logging.error.
     """
+    ## GET LLM PROVIDER ##
+    model, custom_llm_provider, _, _ = get_llm_provider(
+        model=model, custom_llm_provider=custom_llm_provider
+    )
+
+    if custom_llm_provider == "predibase":  # predibase supports this globally
+        return True
+
     try:
-        ## GET LLM PROVIDER ##
-        model, custom_llm_provider, _, _ = get_llm_provider(
-            model=model, custom_llm_provider=custom_llm_provider
-        )
-
-        if custom_llm_provider == "predibase":  # predibase supports this globally
-            return True
-
         ## GET MODEL INFO
         model_info = litellm.get_model_info(
             model=model, custom_llm_provider=custom_llm_provider
@@ -1755,12 +1755,17 @@ def supports_response_schema(model: str, custom_llm_provider: Optional[str]) ->
 
         if model_info.get("supports_response_schema", False) is True:
             return True
-        return False
     except Exception:
-        verbose_logger.error(
-            f"Model not supports response_schema. You passed model={model}, custom_llm_provider={custom_llm_provider}."
-        )
-        return False
+        ## check if provider supports response schema globally
+        supported_params = get_supported_openai_params(
+            model=model,
+            custom_llm_provider=custom_llm_provider,
+            request_type="chat_completion",
+        )
+        if supported_params is not None and "response_schema" in supported_params:
+            return True
+
+    return False
 
 
 def supports_function_calling(
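With the Groq model-map entries above now carrying "supports_response_schema": true, the capability check should report structured-output support for those models. A rough sketch of the lookup (an illustrative call, assuming the helper keeps the signature shown in the hunk header):

    from litellm.utils import supports_response_schema

    # Matches the new parametrized test case ("groq/llama3-70b-8192", True).
    print(supports_response_schema(model="groq/llama3-70b-8192", custom_llm_provider=None))
    # expected: True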
@@ -2710,6 +2715,7 @@ def get_optional_params(  # noqa: PLR0915
         non_default_params["response_format"] = type_to_response_format_param(
             response_format=non_default_params["response_format"]
         )
+
     if "tools" in non_default_params and isinstance(
         non_default_params, list
     ):  # fixes https://github.com/BerriAI/litellm/issues/4933
@@ -3494,24 +3500,16 @@ def get_optional_params(  # noqa: PLR0915
         )
         _check_valid_arg(supported_params=supported_params)
 
-        if temperature is not None:
-            optional_params["temperature"] = temperature
-        if max_tokens is not None:
-            optional_params["max_tokens"] = max_tokens
-        if top_p is not None:
-            optional_params["top_p"] = top_p
-        if stream is not None:
-            optional_params["stream"] = stream
-        if stop is not None:
-            optional_params["stop"] = stop
-        if tools is not None:
-            optional_params["tools"] = tools
-        if tool_choice is not None:
-            optional_params["tool_choice"] = tool_choice
-        if response_format is not None:
-            optional_params["response_format"] = response_format
-        if seed is not None:
-            optional_params["seed"] = seed
+        optional_params = litellm.GroqChatConfig().map_openai_params(
+            non_default_params=non_default_params,
+            optional_params=optional_params,
+            model=model,
+            drop_params=(
+                drop_params
+                if drop_params is not None and isinstance(drop_params, bool)
+                else False
+            ),
+        )
     elif custom_llm_provider == "deepseek":
         supported_params = get_supported_openai_params(
             model=model, custom_llm_provider=custom_llm_provider
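For illustration (not from the commit), the net effect of delegating the groq branch to GroqChatConfig.map_openai_params can be sketched through the public helper, assuming get_optional_params keeps accepting OpenAI-style keyword arguments as it does today:

    from litellm.utils import get_optional_params

    # Hypothetical call: a response_format request for a groq model should now
    # come back as a forced json_tool_call plus json_mode=True.
    params = get_optional_params(
        model="llama3-70b-8192",
        custom_llm_provider="groq",
        response_format={"type": "json_schema", "json_schema": {"schema": {"type": "object"}}},
    )
    print(params.get("tool_choice"), params.get("json_mode"))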
@@ -1745,7 +1745,8 @@
         "output_cost_per_token": 0.00000080,
         "litellm_provider": "groq",
         "mode": "chat",
-        "supports_function_calling": true
+        "supports_function_calling": true,
+        "supports_response_schema": true
     },
     "groq/llama3-8b-8192": {
         "max_tokens": 8192,
@@ -1755,7 +1756,8 @@
         "output_cost_per_token": 0.00000008,
         "litellm_provider": "groq",
         "mode": "chat",
-        "supports_function_calling": true
+        "supports_function_calling": true,
+        "supports_response_schema": true
     },
     "groq/llama3-70b-8192": {
         "max_tokens": 8192,
@@ -1765,7 +1767,8 @@
         "output_cost_per_token": 0.00000079,
         "litellm_provider": "groq",
         "mode": "chat",
-        "supports_function_calling": true
+        "supports_function_calling": true,
+        "supports_response_schema": true
     },
     "groq/llama-3.1-8b-instant": {
         "max_tokens": 8192,
@@ -1775,7 +1778,8 @@
         "output_cost_per_token": 0.00000008,
         "litellm_provider": "groq",
         "mode": "chat",
-        "supports_function_calling": true
+        "supports_function_calling": true,
+        "supports_response_schema": true
     },
     "groq/llama-3.1-70b-versatile": {
         "max_tokens": 8192,
@@ -1785,7 +1789,8 @@
         "output_cost_per_token": 0.00000079,
         "litellm_provider": "groq",
         "mode": "chat",
-        "supports_function_calling": true
+        "supports_function_calling": true,
+        "supports_response_schema": true
     },
     "groq/llama-3.1-405b-reasoning": {
         "max_tokens": 8192,
@@ -1795,7 +1800,8 @@
         "output_cost_per_token": 0.00000079,
         "litellm_provider": "groq",
         "mode": "chat",
-        "supports_function_calling": true
+        "supports_function_calling": true,
+        "supports_response_schema": true
     },
     "groq/mixtral-8x7b-32768": {
         "max_tokens": 32768,
@@ -1805,7 +1811,8 @@
         "output_cost_per_token": 0.00000024,
         "litellm_provider": "groq",
         "mode": "chat",
-        "supports_function_calling": true
+        "supports_function_calling": true,
+        "supports_response_schema": true
     },
     "groq/gemma-7b-it": {
         "max_tokens": 8192,
@@ -1815,7 +1822,8 @@
         "output_cost_per_token": 0.00000007,
         "litellm_provider": "groq",
         "mode": "chat",
-        "supports_function_calling": true
+        "supports_function_calling": true,
+        "supports_response_schema": true
     },
     "groq/gemma2-9b-it": {
         "max_tokens": 8192,
@@ -1825,7 +1833,8 @@
         "output_cost_per_token": 0.00000020,
         "litellm_provider": "groq",
         "mode": "chat",
-        "supports_function_calling": true
+        "supports_function_calling": true,
+        "supports_response_schema": true
     },
     "groq/llama3-groq-70b-8192-tool-use-preview": {
         "max_tokens": 8192,
@@ -1835,7 +1844,8 @@
         "output_cost_per_token": 0.00000089,
         "litellm_provider": "groq",
         "mode": "chat",
-        "supports_function_calling": true
+        "supports_function_calling": true,
+        "supports_response_schema": true
     },
     "groq/llama3-groq-8b-8192-tool-use-preview": {
         "max_tokens": 8192,
@@ -1845,7 +1855,8 @@
         "output_cost_per_token": 0.00000019,
         "litellm_provider": "groq",
         "mode": "chat",
-        "supports_function_calling": true
+        "supports_function_calling": true,
+        "supports_response_schema": true
     },
     "cerebras/llama3.1-8b": {
         "max_tokens": 128000,
@@ -93,6 +93,7 @@ class BaseLLMChatTest(ABC):
         assert response.choices[0].message.content is not None
 
     def test_json_response_pydantic_obj(self):
+        litellm.set_verbose = True
         from pydantic import BaseModel
 
         from litellm.utils import supports_response_schema
@@ -119,6 +120,11 @@ class BaseLLMChatTest(ABC):
                 response_format=TestModel,
             )
             assert res is not None
+
+            print(res.choices[0].message)
+
+            assert res.choices[0].message.content is not None
+            assert res.choices[0].message.tool_calls is None
         except litellm.InternalServerError:
             pytest.skip("Model is overloaded")
 
@@ -4,7 +4,7 @@ from base_llm_unit_tests import BaseLLMChatTest
 class TestGroq(BaseLLMChatTest):
     def get_base_completion_call_args(self) -> dict:
         return {
-            "model": "groq/llama3-70b-8192",
+            "model": "groq/llama-3.1-70b-versatile",
         }
 
     def test_tool_call_no_arguments(self, tool_call_no_arguments):
@@ -749,6 +749,7 @@ def test_convert_model_response_object():
         ("gemini/gemini-1.5-pro", True),
         ("predibase/llama3-8b-instruct", True),
         ("gpt-3.5-turbo", False),
+        ("groq/llama3-70b-8192", True),
     ],
 )
 def test_supports_response_schema(model, expected_bool):
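Taken together, the test changes mirror the intended end-to-end behavior: a pydantic response_format against a Groq model should yield JSON content and no tool_calls. A standalone sketch of that flow (illustrative: the field name and prompt are assumptions, and a live GROQ_API_KEY is required):

    from pydantic import BaseModel

    import litellm


    class CapitalAnswer(BaseModel):
        first_response: str


    # Mirrors the shape of test_json_response_pydantic_obj for a Groq model.
    res = litellm.completion(
        model="groq/llama-3.1-70b-versatile",
        messages=[{"role": "user", "content": "What is the capital of France?"}],
        response_format=CapitalAnswer,
    )
    assert res.choices[0].message.content is not None
    assert res.choices[0].message.tool_calls is None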