(feat) add nvidia nim embeddings (#6032)

* nvidia nim support embedding config

* add nvidia config in init

* nvidia nim embeddings

* docs nvidia nim embeddings

* docs embeddings on nvidia nim

* fix llm translation test
Ishaan Jaff 2024-10-03 04:42:14 -07:00 committed by GitHub
parent 05df9cc6d0
commit d92696a303
8 changed files with 238 additions and 9 deletions


@ -1,3 +1,6 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Nvidia NIM
https://docs.api.nvidia.com/nim/reference/
@ -65,6 +68,96 @@ for chunk in response:
```
## Usage - embedding
```python
import litellm
import os
response = litellm.embedding(
model="nvidia_nim/nvidia/nv-embedqa-e5-v5", # add `nvidia_nim/` prefix to model so litellm knows to route to Nvidia NIM
input=["good morning from litellm"],
encoding_format = "float",
user_id = "user-1234",
# Nvidia NIM Specific Parameters
input_type = "passage", # Optional
truncate = "NONE" # Optional
)
print(response)
```
## **Usage - LiteLLM Proxy Server**
Here's how to call an Nvidia NIM Endpoint with the LiteLLM Proxy Server
1. Modify the config.yaml
```yaml
model_list:
  - model_name: my-model
    litellm_params:
      model: nvidia_nim/<your-model-name>  # add nvidia_nim/ prefix to route as Nvidia NIM provider
      api_key: api-key                     # your Nvidia NIM API key
```
2. Start the proxy
```bash
$ litellm --config /path/to/config.yaml
```
3. Send Request to LiteLLM Proxy Server
<Tabs>
<TabItem value="openai" label="OpenAI Python v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="sk-1234", # pass litellm proxy key, if you're using virtual keys
base_url="http://0.0.0.0:4000" # litellm-proxy-base url
)
response = client.chat.completions.create(
model="my-model",
messages = [
{
"role": "user",
"content": "what llm are you"
}
],
)
print(response)
```
</TabItem>
<TabItem value="curl" label="curl">
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": "my-model",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
    ]
}'
```
</TabItem>
</Tabs>
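
Embedding requests go through the proxy the same way. A minimal sketch with the OpenAI SDK, assuming the `my-model` alias in your config.yaml points at an Nvidia NIM embedding model (e.g. `nvidia_nim/nvidia/nv-embedqa-e5-v5`):

```python
import openai

client = openai.OpenAI(
    api_key="sk-1234",              # litellm proxy key, if you're using virtual keys
    base_url="http://0.0.0.0:4000"  # litellm-proxy-base url
)

# routes to the nvidia_nim embedding model configured on the proxy
response = client.embeddings.create(
    model="my-model",
    input=["good morning from litellm"],
)
print(response)
```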
## Supported Models - 💥 ALL Nvidia NIM Models Supported!
We support ALL `nvidia_nim` models; just add the `nvidia_nim/` prefix to the model name when sending completion requests.
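
For example, a completion call to any NIM-hosted chat model only needs the prefix (the model name below is illustrative):

```python
import os
import litellm

os.environ["NVIDIA_NIM_API_KEY"] = "nvidia-api-key"  # same env var used in the usage section above

response = litellm.completion(
    model="nvidia_nim/meta/llama3-70b-instruct",  # nvidia_nim/ + any NIM model id
    messages=[{"role": "user", "content": "what llm are you"}],
)
print(response)
```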


@ -981,7 +981,13 @@ from .llms.OpenAI.chat.o1_transformation import (
from .llms.OpenAI.chat.gpt_transformation import (
OpenAIGPTConfig,
)
from .llms.nvidia_nim import NvidiaNimConfig
from .llms.nvidia_nim.chat import NvidiaNimConfig
from .llms.nvidia_nim.embed import NvidiaNimEmbeddingConfig
nvidiaNimConfig = NvidiaNimConfig()
nvidiaNimEmbeddingConfig = NvidiaNimEmbeddingConfig()
from .llms.cerebras.chat import CerebrasConfig
from .llms.sambanova.chat import SambanovaConfig
from .llms.AI21.chat import AI21ChatConfig


@ -0,0 +1,80 @@
"""
Nvidia NIM embeddings endpoint: https://docs.api.nvidia.com/nim/reference/nvidia-nv-embedqa-e5-v5-infer
This is OpenAI compatible
This file only contains param mapping logic
API calling is done using the OpenAI SDK with an api_base
"""
import types
from typing import Optional, Union
class NvidiaNimEmbeddingConfig:
"""
Reference: https://docs.api.nvidia.com/nim/reference/nvidia-nv-embedqa-e5-v5-infer
"""
# OpenAI params
encoding_format: Optional[str] = None
user: Optional[str] = None
# Nvidia NIM params
input_type: Optional[str] = None
truncate: Optional[str] = None
def __init__(
self,
encoding_format: Optional[str] = None,
user: Optional[str] = None,
input_type: Optional[str] = None,
truncate: Optional[str] = None,
) -> None:
locals_ = locals()
for key, value in locals_.items():
if key != "self" and value is not None:
setattr(self.__class__, key, value)
@classmethod
def get_config(cls):
return {
k: v
for k, v in cls.__dict__.items()
if not k.startswith("__")
and not isinstance(
v,
(
types.FunctionType,
types.BuiltinFunctionType,
classmethod,
staticmethod,
),
)
and v is not None
}
def get_supported_openai_params(
self,
):
return ["encoding_format", "user"]
def map_openai_params(
self,
non_default_params: dict,
optional_params: dict,
kwargs: Optional[dict] = None,
):
if "extra_body" not in optional_params:
optional_params["extra_body"] = {}
for k, v in non_default_params.items():
if k == "input_type":
optional_params["extra_body"].update({"input_type": v})
elif k == "truncate":
optional_params["extra_body"].update({"truncate": v})
if kwargs is not None:
# pass kwargs in extra_body
optional_params["extra_body"].update(kwargs)
return optional_params
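
As a quick illustration of the mapping above (a sketch, not part of this commit): the NIM-specific params are tucked into `extra_body`, which the OpenAI SDK then merges into the JSON request body.

```python
from litellm.llms.nvidia_nim.embed import NvidiaNimEmbeddingConfig

config = NvidiaNimEmbeddingConfig()
optional_params = config.map_openai_params(
    non_default_params={"input_type": "passage", "truncate": "NONE"},
    optional_params={},
    kwargs=None,
)
# NIM-specific params end up under extra_body:
# {"extra_body": {"input_type": "passage", "truncate": "NONE"}}
print(optional_params)
```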


@ -3316,6 +3316,7 @@ def embedding(
input=[],
# Optional params
dimensions: Optional[int] = None,
encoding_format: Optional[str] = None,
timeout=600, # default to 10 minutes
# set api_base, api_version, api_key
api_base: Optional[str] = None,
@ -3336,6 +3337,7 @@ def embedding(
Parameters:
- model: The embedding model to use.
- input: The input for which embeddings are to be generated.
- encoding_format: Optional[str] The format to return the embeddings in. Can be either `float` or `base64`
- dimensions: The number of dimensions the resulting output embeddings should have. Only supported in text-embedding-3 and later models.
- timeout: The timeout value for the API call, default 10 mins
- litellm_call_id: The call ID for litellm logging.
@ -3362,7 +3364,6 @@ def embedding(
max_parallel_requests = kwargs.pop("max_parallel_requests", None)
model_info = kwargs.get("model_info", None)
metadata = kwargs.get("metadata", None)
encoding_format = kwargs.get("encoding_format", None)
proxy_server_request = kwargs.get("proxy_server_request", None)
aembedding = kwargs.get("aembedding", None)
extra_headers = kwargs.get("extra_headers", None)
@ -3556,6 +3557,7 @@ def embedding(
model in litellm.open_ai_embedding_models
or custom_llm_provider == "openai"
or custom_llm_provider == "together_ai"
or custom_llm_provider == "nvidia_nim"
):
api_base = (
api_base


@ -2552,9 +2552,9 @@ def get_optional_params_image_gen(
def get_optional_params_embeddings(
# 2 optional params
model: str,
user=None,
encoding_format=None,
dimensions=None,
user: Optional[str] = None,
encoding_format: Optional[str] = None,
dimensions: Optional[int] = None,
custom_llm_provider="",
drop_params: Optional[bool] = None,
additional_drop_params: Optional[bool] = None,
@ -2595,7 +2595,6 @@ def get_optional_params_embeddings(
default_params=default_params,
additional_drop_params=additional_drop_params,
)
## raise exception if non-default value passed for non-openai/azure embedding calls
if custom_llm_provider == "openai":
# 'dimensions` is only supported in `text-embedding-3` and later models
@ -2627,6 +2626,17 @@ def get_optional_params_embeddings(
)
final_params = {**optional_params, **kwargs}
return final_params
elif custom_llm_provider == "nvidia_nim":
supported_params = get_supported_openai_params(
model=model or "",
custom_llm_provider="nvidia_nim",
request_type="embeddings",
)
_check_valid_arg(supported_params=supported_params)
optional_params = litellm.nvidiaNimEmbeddingConfig.map_openai_params(
non_default_params=non_default_params, optional_params={}, kwargs=kwargs
)
return optional_params
elif custom_llm_provider == "vertex_ai":
supported_params = get_supported_openai_params(
model=model,
@ -4308,7 +4318,10 @@ def get_supported_openai_params(
else:
return litellm.FireworksAIConfig().get_supported_openai_params()
elif custom_llm_provider == "nvidia_nim":
return litellm.NvidiaNimConfig().get_supported_openai_params(model=model)
if request_type == "chat_completion":
return litellm.nvidiaNimConfig.get_supported_openai_params(model=model)
elif request_type == "embeddings":
return litellm.nvidiaNimEmbeddingConfig.get_supported_openai_params()
elif custom_llm_provider == "cerebras":
return litellm.CerebrasConfig().get_supported_openai_params(model=model)
elif custom_llm_provider == "ai21_chat":


@ -165,7 +165,7 @@ def test_all_model_configs():
"max_new_tokens": 10
}
from litellm.llms.nvidia_nim import NvidiaNimConfig
from litellm.llms.nvidia_nim.chat import NvidiaNimConfig
assert "max_completion_tokens" in NvidiaNimConfig().get_supported_openai_params(
model="llama3"


@ -14,7 +14,7 @@ import pytest
from respx import MockRouter
import litellm
from litellm import Choices, Message, ModelResponse
from litellm import Choices, Message, ModelResponse, EmbeddingResponse, Usage
from litellm import completion
@ -69,3 +69,38 @@ def test_completion_nvidia_nim(respx_mock: MockRouter):
pass
except Exception as e:
pytest.fail(f"Error occurred: {e}")
def test_embedding_nvidia_nim(respx_mock: MockRouter):
litellm.set_verbose = True
mock_response = EmbeddingResponse(
model="nvidia_nim/databricks/dbrx-instruct",
data=[
{
"embedding": [0.1, 0.2, 0.3],
"index": 0,
}
],
usage=Usage(
prompt_tokens=10,
completion_tokens=0,
total_tokens=10,
),
)
mock_request = respx_mock.post(
"https://integrate.api.nvidia.com/v1/embeddings"
).mock(return_value=httpx.Response(200, json=mock_response.dict()))
response = litellm.embedding(
model="nvidia_nim/nvidia/nv-embedqa-e5-v5",
input="What is the meaning of life?",
input_type="passage",
)
assert mock_request.called
request_body = json.loads(mock_request.calls[0].request.content)
print("request_body: ", request_body)
assert request_body == {
"input": "What is the meaning of life?",
"model": "nvidia/nv-embedqa-e5-v5",
"input_type": "passage",
"encoding_format": "base64",
}