Merge pull request #4918 from BerriAI/litellm_ollama_tool_calling

feat(ollama_chat.py): support ollama tool calling
2024-07-26 22:16:58 -07:00 · 2024-07-26 22:16:58 -07:00 · f9c2fec1a6
commit f9c2fec1a6
parent 1506e74332 77fe8f57cf
8 changed files with 194 additions and 25 deletions
--- a/docs/my-website/docs/providers/ollama.md
+++ b/docs/my-website/docs/providers/ollama.md
@ -1,3 +1,6 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
 # Ollama 
 LiteLLM supports all models from [Ollama](https://github.com/ollama/ollama)

@ -84,6 +87,120 @@ response = completion(
 )
 ```

+## Example Usage - Tool Calling 
+
+To use ollama tool calling, pass `tools=[{..}]` to `litellm.completion()` 
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm import completion
+import litellm 
+
+## [OPTIONAL] REGISTER MODEL - not all ollama models support function calling, litellm defaults to json mode tool calls if native tool calling not supported.
+
+# litellm.register_model(model_cost={
+#                 "ollama_chat/llama3.1": { 
+#                   "supports_function_calling": true
+#                 },
+#             })
+
+tools = [
+  {
+    "type": "function",
+    "function": {
+      "name": "get_current_weather",
+      "description": "Get the current weather in a given location",
+      "parameters": {
+        "type": "object",
+        "properties": {
+          "location": {
+            "type": "string",
+            "description": "The city and state, e.g. San Francisco, CA",
+          },
+          "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
+        },
+        "required": ["location"],
+      },
+    }
+  }
+]
+
+messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
+
+
+response = completion(
+  model="ollama_chat/llama3.1",
+  messages=messages,
+  tools=tools
+)
+```
+
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+1. Setup config.yaml 
+
+```yaml
+model_list:
+  - model_name: "llama3.1"             
+    litellm_params:
+      model: "ollama_chat/llama3.1"
+    model_info:
+      supports_function_calling: true
+```
+
+2. Start proxy 
+
+```bash
+litellm --config /path/to/config.yaml
+```
+
+3. Test it! 
+
+```bash
+curl -X POST 'http://0.0.0.0:4000/chat/completions' \
+-H 'Content-Type: application/json' \
+-H 'Authorization: Bearer sk-1234' \
+-d '{
+    "model": "llama3.1",
+  "messages": [
+    {
+      "role": "user",
+      "content": "What'\''s the weather like in Boston today?"
+    }
+  ],
+  "tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "get_current_weather",
+        "description": "Get the current weather in a given location",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "location": {
+              "type": "string",
+              "description": "The city and state, e.g. San Francisco, CA"
+            },
+            "unit": {
+              "type": "string",
+              "enum": ["celsius", "fahrenheit"]
+            }
+          },
+          "required": ["location"]
+        }
+      }
+    }
+  ],
+  "tool_choice": "auto",
+  "stream": true
+}'
+```
+</TabItem>
+</Tabs>
+
 ## Using ollama `api/chat` 
 In order to send ollama requests to `POST /api/chat` on your ollama server, set the model prefix to `ollama_chat`

--- a/litellm/llms/ollama_chat.py
+++ b/litellm/llms/ollama_chat.py
@ -149,7 +149,9 @@ class OllamaChatConfig:
            "response_format",
        ]

-    def map_openai_params(self, non_default_params: dict, optional_params: dict):
+    def map_openai_params(
+        self, model: str, non_default_params: dict, optional_params: dict
+    ):
        for param, value in non_default_params.items():
            if param == "max_tokens":
                optional_params["num_predict"] = value
@ -170,16 +172,26 @@ class OllamaChatConfig:
            ### FUNCTION CALLING LOGIC ###
            if param == "tools":
                # ollama actually supports json output
-                optional_params["format"] = "json"
-                litellm.add_function_to_prompt = (
-                    True  # so that main.py adds the function call to the prompt
-                )
-                optional_params["functions_unsupported_model"] = value
+                ## CHECK IF MODEL SUPPORTS TOOL CALLING ##
+                try:
+                    model_info = litellm.get_model_info(
+                        model=model, custom_llm_provider="ollama_chat"
+                    )
+                    if model_info.get("supports_function_calling") is True:
+                        optional_params["tools"] = value
+                    else:
+                        raise Exception
+                except Exception:
+                    optional_params["format"] = "json"
+                    litellm.add_function_to_prompt = (
+                        True  # so that main.py adds the function call to the prompt
+                    )
+                    optional_params["functions_unsupported_model"] = value

-                if len(optional_params["functions_unsupported_model"]) == 1:
-                    optional_params["function_name"] = optional_params[
-                        "functions_unsupported_model"
-                    ][0]["function"]["name"]
+                    if len(optional_params["functions_unsupported_model"]) == 1:
+                        optional_params["function_name"] = optional_params[
+                            "functions_unsupported_model"
+                        ][0]["function"]["name"]

            if param == "functions":
                # ollama actually supports json output
@ -198,11 +210,11 @@ class OllamaChatConfig:
 # ollama implementation
 def get_ollama_response(
    model_response: litellm.ModelResponse,
+    messages: list,
+    optional_params: dict,
    api_base="http://localhost:11434",
    api_key: Optional[str] = None,
    model="llama2",
-    messages=None,
-    optional_params=None,
    logging_obj=None,
    acompletion: bool = False,
    encoding=None,
@ -223,6 +235,7 @@ def get_ollama_response(
    stream = optional_params.pop("stream", False)
    format = optional_params.pop("format", None)
    function_name = optional_params.pop("function_name", None)
+    tools = optional_params.pop("tools", None)

    for m in messages:
        if "role" in m and m["role"] == "tool":
@ -236,6 +249,8 @@ def get_ollama_response(
    }
    if format is not None:
        data["format"] = format
+    if tools is not None:
+        data["tools"] = tools
    ## LOGGING
    logging_obj.pre_call(
        input=None,
@ -499,7 +514,8 @@ async def ollama_acompletion(

            ## RESPONSE OBJECT
            model_response.choices[0].finish_reason = "stop"
-            if data.get("format", "") == "json":
+
+            if data.get("format", "") == "json" and function_name is not None:
                function_call = json.loads(response_json["message"]["content"])
                message = litellm.Message(
                    content=None,
@ -519,11 +535,8 @@ async def ollama_acompletion(
                model_response.choices[0].message = message  # type: ignore
                model_response.choices[0].finish_reason = "tool_calls"
            else:
-                model_response.choices[0].message.content = response_json[  # type: ignore
-                    "message"
-                ][
-                    "content"
-                ]
+                _message = litellm.Message(**response_json["message"])
+                model_response.choices[0].message = _message  # type: ignore

            model_response.created = int(time.time())
            model_response.model = "ollama_chat/" + data["model"]
--- a/litellm/model_prices_and_context_window_backup.json
+++ b/litellm/model_prices_and_context_window_backup.json
@ -3956,6 +3956,16 @@
        "litellm_provider": "ollama",
        "mode": "chat"
    },
+    "ollama/llama3.1": {
+        "max_tokens": 8192,
+        "max_input_tokens": 8192,
+        "max_output_tokens": 8192,
+        "input_cost_per_token": 0.0,
+        "output_cost_per_token": 0.0,
+        "litellm_provider": "ollama",
+        "mode": "chat", 
+        "supports_function_calling": true
+    },
    "ollama/mistral": {
        "max_tokens": 8192,
        "max_input_tokens": 8192,
--- a/litellm/proxy/_new_secret_config.yaml
+++ b/litellm/proxy/_new_secret_config.yaml
@ -1,8 +1,6 @@
 model_list:
-  - model_name: "*"             
+  - model_name: "llama3.1"             
    litellm_params:
-      model: "*"
-
-litellm_settings:
-  success_callback: ["logfire"]
-  cache: true
+      model: "ollama_chat/llama3.1"
+    model_info:
+      supports_function_calling: true
--- a/litellm/router.py
+++ b/litellm/router.py
@ -3469,6 +3469,18 @@ class Router:
            model_info=_model_info,
        )

+        ## REGISTER MODEL INFO IN LITELLM MODEL COST MAP
+        _model_name = deployment.litellm_params.model
+        if deployment.litellm_params.custom_llm_provider is not None:
+            _model_name = (
+                deployment.litellm_params.custom_llm_provider + "/" + _model_name
+            )
+        litellm.register_model(
+            model_cost={
+                _model_name: _model_info,
+            }
+        )
+
        deployment = self._add_deployment(deployment=deployment)

        model = deployment.to_json(exclude_none=True)
--- a/litellm/types/utils.py
+++ b/litellm/types/utils.py
@ -74,6 +74,7 @@ class ModelInfo(TypedDict, total=False):
    supports_system_messages: Optional[bool]
    supports_response_schema: Optional[bool]
    supports_vision: Optional[bool]
+    supports_function_calling: Optional[bool]


 class GenericStreamingChunk(TypedDict):
--- a/litellm/utils.py
+++ b/litellm/utils.py
@ -2089,6 +2089,7 @@ def supports_function_calling(model: str) -> bool:
    Raises:
    Exception: If the given model is not found in model_prices_and_context_window.json.
    """
+
    if model in litellm.model_cost:
        model_info = litellm.model_cost[model]
        if model_info.get("supports_function_calling", False) is True:
@ -3293,7 +3294,9 @@ def get_optional_params(
        _check_valid_arg(supported_params=supported_params)

        optional_params = litellm.OllamaChatConfig().map_openai_params(
-            non_default_params=non_default_params, optional_params=optional_params
+            model=model,
+            non_default_params=non_default_params,
+            optional_params=optional_params,
        )
    elif custom_llm_provider == "nlp_cloud":
        supported_params = get_supported_openai_params(
@ -4877,6 +4880,7 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
            supports_system_messages: Optional[bool]
            supports_response_schema: Optional[bool]
            supports_vision: Optional[bool]
+            supports_function_calling: Optional[bool]
    Raises:
        Exception: If the model is not mapped yet.

@ -4951,6 +4955,7 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
                supported_openai_params=supported_openai_params,
                supports_system_messages=None,
                supports_response_schema=None,
+                supports_function_calling=None,
            )
        else:
            """
@ -5041,6 +5046,9 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
                    "supports_response_schema", None
                ),
                supports_vision=_model_info.get("supports_vision", False),
+                supports_function_calling=_model_info.get(
+                    "supports_function_calling", False
+                ),
            )
    except Exception:
        raise Exception(
--- a/model_prices_and_context_window.json
+++ b/model_prices_and_context_window.json
@ -3956,6 +3956,16 @@
        "litellm_provider": "ollama",
        "mode": "chat"
    },
+    "ollama/llama3.1": {
+        "max_tokens": 8192,
+        "max_input_tokens": 8192,
+        "max_output_tokens": 8192,
+        "input_cost_per_token": 0.0,
+        "output_cost_per_token": 0.0,
+        "litellm_provider": "ollama",
+        "mode": "chat", 
+        "supports_function_calling": true
+    },
    "ollama/mistral": {
        "max_tokens": 8192,
        "max_input_tokens": 8192,