(feat) add nvidia nim embeddings (#6032)
* nvidia nim support embedding config
* add nvidia config in init
* nvidia nim embeddings
* docs nvidia nim embeddings
* docs embeddings on nvidia nim
* fix llm translation test
parent 05df9cc6d0 · commit d92696a303
8 changed files with 238 additions and 9 deletions
@@ -1,3 +1,6 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
 # Nvidia NIM
 https://docs.api.nvidia.com/nim/reference/
 
@@ -65,6 +68,96 @@ for chunk in response:
 ```
 
 
+## Usage - embedding
+
+```python
+import litellm
+import os
+
+response = litellm.embedding(
+    model="nvidia_nim/nvidia/nv-embedqa-e5-v5", # add `nvidia_nim/` prefix to model so litellm knows to route to Nvidia NIM
+    input=["good morning from litellm"],
+    encoding_format="float",
+    user="user-1234",
+
+    # Nvidia NIM specific parameters
+    input_type="passage",  # Optional
+    truncate="NONE",       # Optional
+)
+print(response)
+```
+
+
+## **Usage - LiteLLM Proxy Server**
+
+Here's how to call an Nvidia NIM endpoint with the LiteLLM Proxy Server
+
+1. Modify the config.yaml
+
+```yaml
+model_list:
+  - model_name: my-model
+    litellm_params:
+      model: nvidia_nim/<your-model-name>  # add nvidia_nim/ prefix to route as Nvidia NIM provider
+      api_key: api-key                     # api key to send with your requests
+```
+
+2. Start the proxy
+
+```bash
+$ litellm --config /path/to/config.yaml
+```
+
+3. Send a request to the LiteLLM Proxy Server
+
+<Tabs>
+
+<TabItem value="openai" label="OpenAI Python v1.0.0+">
+
+```python
+import openai
+
+client = openai.OpenAI(
+    api_key="sk-1234",              # pass litellm proxy key, if you're using virtual keys
+    base_url="http://0.0.0.0:4000"  # litellm-proxy base url
+)
+
+response = client.chat.completions.create(
+    model="my-model",
+    messages=[
+        {
+            "role": "user",
+            "content": "what llm are you"
+        }
+    ],
+)
+
+print(response)
+```
+</TabItem>
+
+<TabItem value="curl" label="curl">
+
+```shell
+curl --location 'http://0.0.0.0:4000/chat/completions' \
+--header 'Authorization: Bearer sk-1234' \
+--header 'Content-Type: application/json' \
+--data '{
+    "model": "my-model",
+    "messages": [
+        {
+            "role": "user",
+            "content": "what llm are you"
+        }
+    ]
+}'
+```
+</TabItem>
+
+</Tabs>
+
+
 ## Supported Models - 💥 ALL Nvidia NIM Models Supported!
 We support ALL `nvidia_nim` models, just set `nvidia_nim/` as a prefix when sending completion requests
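
For orientation, the embedding call documented above returns litellm's standard `EmbeddingResponse`; a rough sketch of its shape (field names mirror the mocked response in the new test at the bottom of this commit, values are illustrative):

```python
# Illustrative EmbeddingResponse shape - values are placeholders,
# field names taken from the mocked test later in this commit.
{
    "object": "list",
    "model": "nvidia/nv-embedqa-e5-v5",
    "data": [{"object": "embedding", "index": 0, "embedding": [0.1, 0.2, 0.3]}],
    "usage": {"prompt_tokens": 10, "completion_tokens": 0, "total_tokens": 10},
}
```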
@@ -981,7 +981,13 @@ from .llms.OpenAI.chat.o1_transformation import (
 from .llms.OpenAI.chat.gpt_transformation import (
     OpenAIGPTConfig,
 )
-from .llms.nvidia_nim import NvidiaNimConfig
+from .llms.nvidia_nim.chat import NvidiaNimConfig
+from .llms.nvidia_nim.embed import NvidiaNimEmbeddingConfig
+
+nvidiaNimConfig = NvidiaNimConfig()
+nvidiaNimEmbeddingConfig = NvidiaNimEmbeddingConfig()
+
 from .llms.cerebras.chat import CerebrasConfig
 from .llms.sambanova.chat import SambanovaConfig
 from .llms.AI21.chat import AI21ChatConfig
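
These module-level singletons are what the routing code later in the diff calls into; a minimal sketch of how they surface (illustrative, not part of the diff):

```python
import litellm

# The singletons created in __init__.py are reachable at module level.
print(litellm.nvidiaNimEmbeddingConfig.get_supported_openai_params())
# -> ["encoding_format", "user"]
```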

litellm/llms/nvidia_nim/embed.py (new file, 80 lines)

@@ -0,0 +1,80 @@
+"""
+Nvidia NIM embeddings endpoint: https://docs.api.nvidia.com/nim/reference/nvidia-nv-embedqa-e5-v5-infer
+
+This is OpenAI compatible
+
+This file only contains param mapping logic
+
+API calling is done using the OpenAI SDK with an api_base
+"""
+
+import types
+from typing import Optional, Union
+
+
+class NvidiaNimEmbeddingConfig:
+    """
+    Reference: https://docs.api.nvidia.com/nim/reference/nvidia-nv-embedqa-e5-v5-infer
+    """
+
+    # OpenAI params
+    encoding_format: Optional[str] = None
+    user: Optional[str] = None
+
+    # Nvidia NIM params
+    input_type: Optional[str] = None
+    truncate: Optional[str] = None
+
+    def __init__(
+        self,
+        encoding_format: Optional[str] = None,
+        user: Optional[str] = None,
+        input_type: Optional[str] = None,
+        truncate: Optional[str] = None,
+    ) -> None:
+        locals_ = locals()
+        for key, value in locals_.items():
+            if key != "self" and value is not None:
+                setattr(self.__class__, key, value)
+
+    @classmethod
+    def get_config(cls):
+        return {
+            k: v
+            for k, v in cls.__dict__.items()
+            if not k.startswith("__")
+            and not isinstance(
+                v,
+                (
+                    types.FunctionType,
+                    types.BuiltinFunctionType,
+                    classmethod,
+                    staticmethod,
+                ),
+            )
+            and v is not None
+        }
+
+    def get_supported_openai_params(
+        self,
+    ):
+        return ["encoding_format", "user"]
+
+    def map_openai_params(
+        self,
+        non_default_params: dict,
+        optional_params: dict,
+        kwargs: Optional[dict] = None,
+    ):
+        if "extra_body" not in optional_params:
+            optional_params["extra_body"] = {}
+        for k, v in non_default_params.items():
+            if k == "input_type":
+                optional_params["extra_body"].update({"input_type": v})
+            elif k == "truncate":
+                optional_params["extra_body"].update({"truncate": v})
+
+        if kwargs is not None:
+            # pass kwargs in extra_body
+            optional_params["extra_body"].update(kwargs)
+        return optional_params
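
To make the param mapping concrete, a small usage sketch (illustrative, not part of the commit): NIM-specific params land under `extra_body`, which the OpenAI SDK later merges into the request JSON.

```python
from litellm.llms.nvidia_nim.embed import NvidiaNimEmbeddingConfig

config = NvidiaNimEmbeddingConfig()
optional_params = config.map_openai_params(
    non_default_params={"input_type": "passage", "truncate": "NONE"},
    optional_params={},
    kwargs=None,
)
print(optional_params)
# -> {'extra_body': {'input_type': 'passage', 'truncate': 'NONE'}}
```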

@@ -3316,6 +3316,7 @@ def embedding(
     input=[],
     # Optional params
     dimensions: Optional[int] = None,
+    encoding_format: Optional[str] = None,
     timeout=600,  # default to 10 minutes
     # set api_base, api_version, api_key
     api_base: Optional[str] = None,
@@ -3336,6 +3337,7 @@ def embedding(
     Parameters:
     - model: The embedding model to use.
     - input: The input for which embeddings are to be generated.
+    - encoding_format: Optional[str] The format to return the embeddings in. Can be either `float` or `base64`
     - dimensions: The number of dimensions the resulting output embeddings should have. Only supported in text-embedding-3 and later models.
     - timeout: The timeout value for the API call, default 10 mins
     - litellm_call_id: The call ID for litellm logging.
@@ -3362,7 +3364,6 @@ def embedding(
     max_parallel_requests = kwargs.pop("max_parallel_requests", None)
     model_info = kwargs.get("model_info", None)
     metadata = kwargs.get("metadata", None)
-    encoding_format = kwargs.get("encoding_format", None)
     proxy_server_request = kwargs.get("proxy_server_request", None)
     aembedding = kwargs.get("aembedding", None)
     extra_headers = kwargs.get("extra_headers", None)
@@ -3556,6 +3557,7 @@ def embedding(
         model in litellm.open_ai_embedding_models
         or custom_llm_provider == "openai"
         or custom_llm_provider == "together_ai"
+        or custom_llm_provider == "nvidia_nim"
     ):
         api_base = (
             api_base
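
Because NIM exposes an OpenAI-compatible API, adding `nvidia_nim` to this branch routes embeddings through the existing OpenAI handler, pointed at a NIM api_base. A hedged sketch of the equivalent raw SDK call (base URL taken from the mocked test below; the key name is a placeholder):

```python
import openai

client = openai.OpenAI(
    api_key="nvidia-api-key",                       # placeholder
    base_url="https://integrate.api.nvidia.com/v1", # host seen in the mocked test below
)
response = client.embeddings.create(
    model="nvidia/nv-embedqa-e5-v5",
    input=["good morning from litellm"],
    extra_body={"input_type": "passage", "truncate": "NONE"},
)
```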
@@ -2552,9 +2552,9 @@ def get_optional_params_image_gen(
 def get_optional_params_embeddings(
     # 2 optional params
     model: str,
-    user=None,
-    encoding_format=None,
-    dimensions=None,
+    user: Optional[str] = None,
+    encoding_format: Optional[str] = None,
+    dimensions: Optional[int] = None,
     custom_llm_provider="",
     drop_params: Optional[bool] = None,
     additional_drop_params: Optional[bool] = None,
@@ -2595,7 +2595,6 @@ def get_optional_params_embeddings(
         default_params=default_params,
         additional_drop_params=additional_drop_params,
     )
-
     ## raise exception if non-default value passed for non-openai/azure embedding calls
     if custom_llm_provider == "openai":
         # 'dimensions` is only supported in `text-embedding-3` and later models
@@ -2627,6 +2626,17 @@ def get_optional_params_embeddings(
             )
         final_params = {**optional_params, **kwargs}
         return final_params
+    elif custom_llm_provider == "nvidia_nim":
+        supported_params = get_supported_openai_params(
+            model=model or "",
+            custom_llm_provider="nvidia_nim",
+            request_type="embeddings",
+        )
+        _check_valid_arg(supported_params=supported_params)
+        optional_params = litellm.nvidiaNimEmbeddingConfig.map_openai_params(
+            non_default_params=non_default_params, optional_params={}, kwargs=kwargs
+        )
+        return optional_params
     elif custom_llm_provider == "vertex_ai":
         supported_params = get_supported_openai_params(
             model=model,
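
End to end, the new branch should behave roughly as follows (a sketch against an internal helper, so treat the exact call as an assumption rather than public API):

```python
from litellm.utils import get_optional_params_embeddings

# input_type arrives as a stray kwarg and is forwarded via extra_body
optional_params = get_optional_params_embeddings(
    model="nvidia/nv-embedqa-e5-v5",
    custom_llm_provider="nvidia_nim",
    input_type="passage",
)
print(optional_params)
# expected, per map_openai_params above:
# {'extra_body': {'input_type': 'passage'}}
```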
@@ -4308,7 +4318,10 @@ def get_supported_openai_params(
         else:
             return litellm.FireworksAIConfig().get_supported_openai_params()
     elif custom_llm_provider == "nvidia_nim":
-        return litellm.NvidiaNimConfig().get_supported_openai_params(model=model)
+        if request_type == "chat_completion":
+            return litellm.nvidiaNimConfig.get_supported_openai_params(model=model)
+        elif request_type == "embeddings":
+            return litellm.nvidiaNimEmbeddingConfig.get_supported_openai_params()
     elif custom_llm_provider == "cerebras":
         return litellm.CerebrasConfig().get_supported_openai_params(model=model)
     elif custom_llm_provider == "ai21_chat":
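
A quick sanity check of the split dispatch (illustrative):

```python
from litellm import get_supported_openai_params

print(get_supported_openai_params(
    model="nvidia/nv-embedqa-e5-v5",
    custom_llm_provider="nvidia_nim",
    request_type="embeddings",
))
# -> ["encoding_format", "user"], per NvidiaNimEmbeddingConfig above
```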
@@ -165,7 +165,7 @@ def test_all_model_configs():
         "max_new_tokens": 10
     }
 
-    from litellm.llms.nvidia_nim import NvidiaNimConfig
+    from litellm.llms.nvidia_nim.chat import NvidiaNimConfig
 
     assert "max_completion_tokens" in NvidiaNimConfig().get_supported_openai_params(
         model="llama3"
@@ -14,7 +14,7 @@ import pytest
 from respx import MockRouter
 
 import litellm
-from litellm import Choices, Message, ModelResponse
+from litellm import Choices, Message, ModelResponse, EmbeddingResponse, Usage
 from litellm import completion
 
 
@@ -69,3 +69,38 @@ def test_completion_nvidia_nim(respx_mock: MockRouter):
         pass
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
+
+
+def test_embedding_nvidia_nim(respx_mock: MockRouter):
+    litellm.set_verbose = True
+    mock_response = EmbeddingResponse(
+        model="nvidia_nim/databricks/dbrx-instruct",
+        data=[
+            {
+                "embedding": [0.1, 0.2, 0.3],
+                "index": 0,
+            }
+        ],
+        usage=Usage(
+            prompt_tokens=10,
+            completion_tokens=0,
+            total_tokens=10,
+        ),
+    )
+    mock_request = respx_mock.post(
+        "https://integrate.api.nvidia.com/v1/embeddings"
+    ).mock(return_value=httpx.Response(200, json=mock_response.dict()))
+    response = litellm.embedding(
+        model="nvidia_nim/nvidia/nv-embedqa-e5-v5",
+        input="What is the meaning of life?",
+        input_type="passage",
+    )
+    assert mock_request.called
+    request_body = json.loads(mock_request.calls[0].request.content)
+    print("request_body: ", request_body)
+    assert request_body == {
+        "input": "What is the meaning of life?",
+        "model": "nvidia/nv-embedqa-e5-v5",
+        "input_type": "passage",
+        "encoding_format": "base64",
+    }
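
Two details of the asserted request body are easy to miss: `input_type` shows up at the top level because the OpenAI SDK merges `extra_body` keys into the request JSON, and `encoding_format` is `"base64"` because, as far as I can tell, the OpenAI Python SDK requests base64 by default and decodes to floats client-side when the caller doesn't pass an explicit format.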