(feat) add nvidia nim embeddings (#6032)

* nvidia nim support embedding config

* add nvidia config in init

* nvidia nim embeddings

* docs nvidia nim embeddings

* docs embeddings on nvidia nim

* fix llm translation test
Ishaan Jaff 2024-10-03 04:42:14 -07:00 committed by GitHub
parent 05df9cc6d0
commit d92696a303
8 changed files with 238 additions and 9 deletions


@@ -1,3 +1,6 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# Nvidia NIM
https://docs.api.nvidia.com/nim/reference/
@@ -65,6 +68,96 @@ for chunk in response:
```
## Usage - embedding
```python
import litellm
import os

os.environ["NVIDIA_NIM_API_KEY"] = ""  # your Nvidia NIM API key

response = litellm.embedding(
    model="nvidia_nim/nvidia/nv-embedqa-e5-v5",  # add `nvidia_nim/` prefix so litellm routes to Nvidia NIM
    input=["good morning from litellm"],
    encoding_format="float",
    user_id="user-1234",
    # Nvidia NIM specific parameters
    input_type="passage",  # optional
    truncate="NONE",  # optional
)
print(response)
```
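An async variant is available as well. A minimal sketch using `litellm.aembedding`, assuming it accepts the same parameters as the sync call:

```python
import asyncio
import litellm

async def main():
    # same routing: the nvidia_nim/ prefix selects the Nvidia NIM provider
    response = await litellm.aembedding(
        model="nvidia_nim/nvidia/nv-embedqa-e5-v5",
        input=["good morning from litellm"],
        input_type="passage",  # Nvidia NIM specific, forwarded via extra_body
    )
    print(response)

asyncio.run(main())
```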
## **Usage - LiteLLM Proxy Server**
Here's how to call an Nvidia NIM Endpoint with the LiteLLM Proxy Server
1. Modify the config.yaml
```yaml
model_list:
  - model_name: my-model
    litellm_params:
      model: nvidia_nim/<your-model-name>  # add nvidia_nim/ prefix to route as Nvidia NIM provider
      api_key: api-key                     # api key to send to your model
```
2. Start the proxy
```bash
$ litellm --config /path/to/config.yaml
```
3. Send Request to LiteLLM Proxy Server
<Tabs>
<TabItem value="openai" label="OpenAI Python v1.0.0+">
```python
import openai

client = openai.OpenAI(
    api_key="sk-1234",  # pass litellm proxy key, if you're using virtual keys
    base_url="http://0.0.0.0:4000"  # litellm proxy base url
)

response = client.chat.completions.create(
    model="my-model",
    messages=[
        {
            "role": "user",
            "content": "what llm are you"
        }
    ],
)

print(response)
```
</TabItem>
<TabItem value="curl" label="curl">
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
    "model": "my-model",
    "messages": [
        {
            "role": "user",
            "content": "what llm are you"
        }
    ]
}'
```
</TabItem>
</Tabs>
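Embedding models are served through the same proxy routes. A minimal sketch, assuming you add a second entry to `config.yaml` (the `my-embedding-model` name below is illustrative):

```python
import openai

client = openai.OpenAI(
    api_key="sk-1234",
    base_url="http://0.0.0.0:4000"
)

# assumes config.yaml maps my-embedding-model -> nvidia_nim/nvidia/nv-embedqa-e5-v5
response = client.embeddings.create(
    model="my-embedding-model",
    input=["good morning from litellm"]
)
print(response)
```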
## Supported Models - 💥 ALL Nvidia NIM Models Supported!
We support ALL `nvidia_nim` models; just set `nvidia_nim/` as a prefix when sending completion requests.


@@ -981,7 +981,13 @@ from .llms.OpenAI.chat.o1_transformation import (
from .llms.OpenAI.chat.gpt_transformation import (
    OpenAIGPTConfig,
)
from .llms.nvidia_nim.chat import NvidiaNimConfig
from .llms.nvidia_nim.embed import NvidiaNimEmbeddingConfig

nvidiaNimConfig = NvidiaNimConfig()
nvidiaNimEmbeddingConfig = NvidiaNimEmbeddingConfig()

from .llms.cerebras.chat import CerebrasConfig
from .llms.sambanova.chat import SambanovaConfig
from .llms.AI21.chat import AI21ChatConfig


@@ -0,0 +1,80 @@
"""
Nvidia NIM embeddings endpoint: https://docs.api.nvidia.com/nim/reference/nvidia-nv-embedqa-e5-v5-infer

This is OpenAI compatible

This file only contains param mapping logic

API calling is done using the OpenAI SDK with an api_base
"""

import types
from typing import Optional, Union


class NvidiaNimEmbeddingConfig:
    """
    Reference: https://docs.api.nvidia.com/nim/reference/nvidia-nv-embedqa-e5-v5-infer
    """

    # OpenAI params
    encoding_format: Optional[str] = None
    user: Optional[str] = None

    # Nvidia NIM params
    input_type: Optional[str] = None
    truncate: Optional[str] = None

    def __init__(
        self,
        encoding_format: Optional[str] = None,
        user: Optional[str] = None,
        input_type: Optional[str] = None,
        truncate: Optional[str] = None,
    ) -> None:
        locals_ = locals()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)

    @classmethod
    def get_config(cls):
        return {
            k: v
            for k, v in cls.__dict__.items()
            if not k.startswith("__")
            and not isinstance(
                v,
                (
                    types.FunctionType,
                    types.BuiltinFunctionType,
                    classmethod,
                    staticmethod,
                ),
            )
            and v is not None
        }

    def get_supported_openai_params(self):
        return ["encoding_format", "user"]

    def map_openai_params(
        self,
        non_default_params: dict,
        optional_params: dict,
        kwargs: Optional[dict] = None,
    ):
        if "extra_body" not in optional_params:
            optional_params["extra_body"] = {}
        for k, v in non_default_params.items():
            if k == "input_type":
                optional_params["extra_body"].update({"input_type": v})
            elif k == "truncate":
                optional_params["extra_body"].update({"truncate": v})
        if kwargs is not None:
            # pass kwargs in extra_body
            optional_params["extra_body"].update(kwargs)
        return optional_params
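A quick sketch of what this mapping produces (the dict values here are example inputs, not part of the commit):

```python
from litellm.llms.nvidia_nim.embed import NvidiaNimEmbeddingConfig

config = NvidiaNimEmbeddingConfig()
params = config.map_openai_params(
    non_default_params={"input_type": "passage", "truncate": "NONE"},
    optional_params={},
)
# NIM-specific params are tucked into extra_body, which the OpenAI SDK
# serializes into the request body:
print(params)  # {'extra_body': {'input_type': 'passage', 'truncate': 'NONE'}}
```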


@@ -3316,6 +3316,7 @@ def embedding(
    input=[],
    # Optional params
    dimensions: Optional[int] = None,
    encoding_format: Optional[str] = None,
    timeout=600,  # default to 10 minutes
    # set api_base, api_version, api_key
    api_base: Optional[str] = None,
@@ -3336,6 +3337,7 @@ def embedding(
    Parameters:
    - model: The embedding model to use.
    - input: The input for which embeddings are to be generated.
    - encoding_format: Optional[str]. The format to return the embeddings in. Can be either `float` or `base64`.
    - dimensions: The number of dimensions the resulting output embeddings should have. Only supported in text-embedding-3 and later models.
    - timeout: The timeout value for the API call, default 10 mins
    - litellm_call_id: The call ID for litellm logging.
@@ -3362,7 +3364,6 @@ def embedding(
    max_parallel_requests = kwargs.pop("max_parallel_requests", None)
    model_info = kwargs.get("model_info", None)
    metadata = kwargs.get("metadata", None)
    proxy_server_request = kwargs.get("proxy_server_request", None)
    aembedding = kwargs.get("aembedding", None)
    extra_headers = kwargs.get("extra_headers", None)
@@ -3556,6 +3557,7 @@ def embedding(
        model in litellm.open_ai_embedding_models
        or custom_llm_provider == "openai"
        or custom_llm_provider == "together_ai"
        or custom_llm_provider == "nvidia_nim"
    ):
        api_base = (
            api_base
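With `nvidia_nim` added to this branch, embedding requests ride the OpenAI-compatible path with a custom `api_base`. A hedged sketch of the roughly equivalent raw call via the OpenAI SDK, using the endpoint that appears in the test below:

```python
from openai import OpenAI

# roughly what the OpenAI-compatible path sends for nvidia_nim embeddings
client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key="<NVIDIA_NIM_API_KEY>",  # your Nvidia NIM key
)
response = client.embeddings.create(
    model="nvidia/nv-embedqa-e5-v5",  # nvidia_nim/ prefix already stripped by litellm
    input=["good morning from litellm"],
    extra_body={"input_type": "passage", "truncate": "NONE"},  # NIM-specific params
)
print(response.data[0].embedding[:3])
```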


@@ -2552,9 +2552,9 @@ def get_optional_params_image_gen(
def get_optional_params_embeddings(
    # 2 optional params
    model: str,
    user: Optional[str] = None,
    encoding_format: Optional[str] = None,
    dimensions: Optional[int] = None,
    custom_llm_provider="",
    drop_params: Optional[bool] = None,
    additional_drop_params: Optional[bool] = None,
@@ -2595,7 +2595,6 @@ def get_optional_params_embeddings(
        default_params=default_params,
        additional_drop_params=additional_drop_params,
    )
    ## raise exception if non-default value passed for non-openai/azure embedding calls
    if custom_llm_provider == "openai":
        # `dimensions` is only supported in `text-embedding-3` and later models
@@ -2627,6 +2626,17 @@ def get_optional_params_embeddings(
            )
        final_params = {**optional_params, **kwargs}
        return final_params
    elif custom_llm_provider == "nvidia_nim":
        supported_params = get_supported_openai_params(
            model=model or "",
            custom_llm_provider="nvidia_nim",
            request_type="embeddings",
        )
        _check_valid_arg(supported_params=supported_params)
        optional_params = litellm.nvidiaNimEmbeddingConfig.map_openai_params(
            non_default_params=non_default_params, optional_params={}, kwargs=kwargs
        )
        return optional_params
    elif custom_llm_provider == "vertex_ai":
        supported_params = get_supported_openai_params(
            model=model,
@@ -4308,7 +4318,10 @@ def get_supported_openai_params(
        else:
            return litellm.FireworksAIConfig().get_supported_openai_params()
    elif custom_llm_provider == "nvidia_nim":
        if request_type == "chat_completion":
            return litellm.nvidiaNimConfig.get_supported_openai_params(model=model)
        elif request_type == "embeddings":
            return litellm.nvidiaNimEmbeddingConfig.get_supported_openai_params()
    elif custom_llm_provider == "cerebras":
        return litellm.CerebrasConfig().get_supported_openai_params(model=model)
    elif custom_llm_provider == "ai21_chat":
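The dispatch above keys off `request_type`. A small illustrative check, using the `nvidiaNimEmbeddingConfig` instance this commit adds to `__init__`:

```python
import litellm

# chat completions and embeddings advertise different supported OpenAI params
print(litellm.nvidiaNimEmbeddingConfig.get_supported_openai_params())
# expected: ['encoding_format', 'user']
```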


@@ -165,7 +165,7 @@ def test_all_model_configs():
        "max_new_tokens": 10
    }

    from litellm.llms.nvidia_nim.chat import NvidiaNimConfig

    assert "max_completion_tokens" in NvidiaNimConfig().get_supported_openai_params(
        model="llama3"


@@ -14,7 +14,7 @@ import pytest
from respx import MockRouter

import litellm
from litellm import Choices, Message, ModelResponse, EmbeddingResponse, Usage
from litellm import completion
@@ -69,3 +69,38 @@ def test_completion_nvidia_nim(respx_mock: MockRouter):
        pass
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")


def test_embedding_nvidia_nim(respx_mock: MockRouter):
    litellm.set_verbose = True
    mock_response = EmbeddingResponse(
        model="nvidia_nim/databricks/dbrx-instruct",
        data=[
            {
                "embedding": [0.1, 0.2, 0.3],
                "index": 0,
            }
        ],
        usage=Usage(
            prompt_tokens=10,
            completion_tokens=0,
            total_tokens=10,
        ),
    )
    mock_request = respx_mock.post(
        "https://integrate.api.nvidia.com/v1/embeddings"
    ).mock(return_value=httpx.Response(200, json=mock_response.dict()))
    response = litellm.embedding(
        model="nvidia_nim/nvidia/nv-embedqa-e5-v5",
        input="What is the meaning of life?",
        input_type="passage",
    )
    assert mock_request.called
    request_body = json.loads(mock_request.calls[0].request.content)
    print("request_body: ", request_body)
    assert request_body == {
        "input": "What is the meaning of life?",
        "model": "nvidia/nv-embedqa-e5-v5",
        "input_type": "passage",
        "encoding_format": "base64",
    }