Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-08 04:54:38 +00:00)
feat: add refresh_models support to inference adapters (default: false) (#3719)
# What does this PR do?

Inference adapters can now configure `refresh_models: bool` to control periodic model listing from their providers.

BREAKING CHANGE: the Together inference adapter's default changed. Previously it always refreshed; now it follows the config.

Addresses "models: refresh" on #3517.

## Test Plan

CI with new tests.
Parent: 8b9af03a1b
Commit: e892a3f7f4
31 changed files with 33 additions and 67 deletions
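Before the per-provider diffs, a hedged sketch of what opting in looks like: the `url` key mirrors the Ollama sample configuration further down, and expressing the run.yaml provider entry as a Python dict is an editorial assumption, not the project's documented syntax.

```python
# Hypothetical run.yaml provider entry for Ollama, written as the equivalent
# Python dict. Only refresh_models is new in this PR; its default (False)
# disables periodic model listing.
provider_config = {
    "url": "${env.OLLAMA_URL:=http://localhost:11434}",
    "refresh_models": True,  # opt in to periodic refresh from the provider
}
```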
@@ -15,6 +15,7 @@ Anthropic inference provider for accessing Claude models and Anthropic's AI serv

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None]` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `str \| None` | No | | API key for Anthropic models |

## Sample Configuration
@@ -22,6 +22,7 @@ https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None]` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `<class 'pydantic.types.SecretStr'>` | No | | Azure API key for Azure |
| `api_base` | `<class 'pydantic.networks.HttpUrl'>` | No | | Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com) |
| `api_version` | `str \| None` | No | | Azure API version for Azure (e.g., 2024-12-01-preview) |
@@ -15,6 +15,7 @@ AWS Bedrock inference provider for accessing various AI models through AWS's man

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None]` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `aws_access_key_id` | `str \| None` | No | | The AWS access key to use. Defaults to the environment variable AWS_ACCESS_KEY_ID |
| `aws_secret_access_key` | `str \| None` | No | | The AWS secret access key to use. Defaults to the environment variable AWS_SECRET_ACCESS_KEY |
| `aws_session_token` | `str \| None` | No | | The AWS session token to use. Defaults to the environment variable AWS_SESSION_TOKEN |
@@ -15,6 +15,7 @@ Cerebras inference provider for running models on Cerebras Cloud platform.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None]` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `base_url` | `<class 'str'>` | No | https://api.cerebras.ai | Base URL for the Cerebras API |
| `api_key` | `<class 'pydantic.types.SecretStr'>` | No | | Cerebras API Key |
@@ -15,6 +15,7 @@ Databricks inference provider for running models on Databricks' unified analytic

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None]` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `url` | `str \| None` | No | | The URL for the Databricks model serving endpoint |
| `api_token` | `<class 'pydantic.types.SecretStr'>` | No | | The Databricks API token |
@@ -15,6 +15,7 @@ Fireworks AI inference provider for Llama models and other AI models on the Fire

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None]` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `url` | `<class 'str'>` | No | https://api.fireworks.ai/inference/v1 | The URL for the Fireworks server |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | The Fireworks.ai API Key |
@@ -15,6 +15,7 @@ Google Gemini inference provider for accessing Gemini models and Google's AI ser

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None]` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `str \| None` | No | | API key for Gemini models |

## Sample Configuration
@@ -15,6 +15,7 @@ Groq inference provider for ultra-fast inference using Groq's LPU technology.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None]` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `str \| None` | No | | The Groq API key |
| `url` | `<class 'str'>` | No | https://api.groq.com | The URL for the Groq AI server |
@@ -15,6 +15,7 @@ Llama OpenAI-compatible provider for using Llama models with OpenAI API format.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None]` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `str \| None` | No | | The Llama API key |
| `openai_compat_api_base` | `<class 'str'>` | No | https://api.llama.com/compat/v1/ | The URL for the Llama API server |
@@ -15,6 +15,7 @@ NVIDIA inference provider for accessing NVIDIA NIM models and AI services.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None]` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `url` | `<class 'str'>` | No | https://integrate.api.nvidia.com | A base URL for accessing the NVIDIA NIM |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | The NVIDIA API key, only needed if using the hosted service |
| `timeout` | `<class 'int'>` | No | 60 | Timeout for the HTTP requests |
@@ -15,8 +15,8 @@ Ollama inference provider for running local models through the Ollama runtime.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None]` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `url` | `<class 'str'>` | No | http://localhost:11434 | |
-| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically |

## Sample Configuration
@@ -15,6 +15,7 @@ OpenAI inference provider for accessing GPT models and other OpenAI services.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None]` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `str \| None` | No | | API key for OpenAI models |
| `base_url` | `<class 'str'>` | No | https://api.openai.com/v1 | Base URL for OpenAI API |
@@ -15,6 +15,7 @@ Passthrough inference provider for connecting to any external inference service

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None]` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `url` | `<class 'str'>` | No | | The URL for the passthrough endpoint |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | API Key for the passthrough endpoint |
@@ -15,6 +15,7 @@ RunPod inference provider for running models on RunPod's cloud GPU platform.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None]` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `url` | `str \| None` | No | | The URL for the Runpod model serving endpoint |
| `api_token` | `str \| None` | No | | The API token |
@@ -15,6 +15,7 @@ SambaNova inference provider for running models on SambaNova's dataflow architec

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None]` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `url` | `<class 'str'>` | No | https://api.sambanova.ai/v1 | The URL for the SambaNova AI server |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | The SambaNova cloud API Key |
@@ -15,6 +15,7 @@ Text Generation Inference (TGI) provider for HuggingFace model serving.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None]` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `url` | `<class 'str'>` | No | | The URL for the TGI serving endpoint |

## Sample Configuration
@@ -15,6 +15,7 @@ Together AI inference provider for open-source models and collaborative AI devel

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None]` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `url` | `<class 'str'>` | No | https://api.together.xyz/v1 | The URL for the Together AI server |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | The Together AI API Key |
@@ -54,6 +54,7 @@ Available Models:

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None]` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `project` | `<class 'str'>` | No | | Google Cloud project ID for Vertex AI |
| `location` | `<class 'str'>` | No | us-central1 | Google Cloud location for Vertex AI |
@@ -15,11 +15,11 @@ Remote vLLM inference provider for connecting to vLLM servers.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None]` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `url` | `str \| None` | No | | The URL for the vLLM model serving endpoint |
| `max_tokens` | `<class 'int'>` | No | 4096 | Maximum number of tokens to generate. |
| `api_token` | `str \| None` | No | fake | The API token |
| `tls_verify` | `bool \| str` | No | True | Whether to verify TLS certificates. Can be a boolean or a path to a CA certificate file. |
-| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically |

## Sample Configuration
@@ -15,6 +15,7 @@ IBM WatsonX inference provider for accessing AI models on IBM's WatsonX platform

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None]` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `url` | `<class 'str'>` | No | https://us-south.ml.cloud.ibm.com | A base URL for accessing watsonx.ai |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | The watsonx API key |
| `project_id` | `str \| None` | No | | The Project ID key |
@@ -15,6 +15,7 @@ AWS Bedrock safety provider for content moderation using AWS's safety services.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None]` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `aws_access_key_id` | `str \| None` | No | | The AWS access key to use. Defaults to the environment variable AWS_ACCESS_KEY_ID |
| `aws_secret_access_key` | `str \| None` | No | | The AWS secret access key to use. Defaults to the environment variable AWS_SECRET_ACCESS_KEY |
| `aws_session_token` | `str \| None` | No | | The AWS session token to use. Defaults to the environment variable AWS_SESSION_TOKEN |
@@ -41,9 +41,6 @@ class DatabricksInferenceAdapter(OpenAIMixin):
            ).serving_endpoints.list()  # TODO: this is not async
        ]

-    async def should_refresh_models(self) -> bool:
-        return False
-
    async def openai_completion(
        self,
        model: str,
@@ -6,8 +6,6 @@

from typing import Any

-from pydantic import Field
-
from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig

DEFAULT_OLLAMA_URL = "http://localhost:11434"
@@ -15,10 +13,6 @@ DEFAULT_OLLAMA_URL = "http://localhost:11434"

class OllamaImplConfig(RemoteInferenceProviderConfig):
    url: str = DEFAULT_OLLAMA_URL
-    refresh_models: bool = Field(
-        default=False,
-        description="Whether to refresh models periodically",
-    )

    @classmethod
    def sample_run_config(cls, url: str = "${env.OLLAMA_URL:=http://localhost:11434}", **kwargs) -> dict[str, Any]:
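The field disappears from `OllamaImplConfig` itself but is still inherited from `RemoteInferenceProviderConfig` (see the model_registry hunk below), so configs that already set it keep validating. A minimal runnable sketch of that back-compat, using simplified stand-ins for the real classes:

```python
from pydantic import BaseModel, Field

# Simplified stand-ins; the real classes carry more fields.
class RemoteInferenceProviderConfig(BaseModel):
    refresh_models: bool = Field(default=False)

class OllamaImplConfig(RemoteInferenceProviderConfig):
    url: str = "http://localhost:11434"

# A config that set refresh_models before this change still works:
cfg = OllamaImplConfig(refresh_models=True)
print(cfg.refresh_models)  # True
```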
@@ -72,9 +72,6 @@ class OllamaInferenceAdapter(OpenAIMixin):
                f"Ollama Server is not running (message: {r['message']}). Make sure to start it using `ollama serve` in a separate terminal"
            )

-    async def should_refresh_models(self) -> bool:
-        return self.config.refresh_models
-
    async def health(self) -> HealthResponse:
        """
        Performs a health check by verifying connectivity to the Ollama server.
@@ -63,9 +63,6 @@ class TogetherInferenceAdapter(OpenAIMixin, NeedsRequestProviderData):
        # Together's /v1/models is not compatible with OpenAI's /v1/models. Together support ticket #13355 -> will not fix, use Together's own client
        return [m.id for m in await self._get_client().models.list()]

-    async def should_refresh_models(self) -> bool:
-        return True
-
    async def openai_embeddings(
        self,
        model: str,
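This hunk is the breaking change called out in the description: Together previously forced refresh on, and with the override deleted it falls back to the shared default of `False`. A hedged before/after sketch; `TogetherImplConfig` is a hypothetical stand-in here, not necessarily the adapter's real config class name:

```python
from pydantic import BaseModel

class TogetherImplConfig(BaseModel):  # hypothetical stand-in
    refresh_models: bool = False  # new shared default; refresh was previously unconditional

# Deployments that relied on automatic model refresh must now opt in:
config = TogetherImplConfig(refresh_models=True)
```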
@@ -30,10 +30,6 @@ class VLLMInferenceAdapterConfig(RemoteInferenceProviderConfig):
        default=True,
        description="Whether to verify TLS certificates. Can be a boolean or a path to a CA certificate file.",
    )
-    refresh_models: bool = Field(
-        default=False,
-        description="Whether to refresh models periodically",
-    )

    @field_validator("tls_verify")
    @classmethod
@@ -53,10 +53,6 @@ class VLLMInferenceAdapter(OpenAIMixin):
                "You must provide a URL in run.yaml (or via the VLLM_URL environment variable) to use vLLM."
            )

-    async def should_refresh_models(self) -> bool:
-        # Strictly respecting the refresh_models directive
-        return self.config.refresh_models
-
    async def health(self) -> HealthResponse:
        """
        Performs a health check by verifying connectivity to the remote vLLM server.
@@ -24,6 +24,10 @@ class RemoteInferenceProviderConfig(BaseModel):
        default=None,
        description="List of models that should be registered with the model registry. If None, all models are allowed.",
    )
+    refresh_models: bool = Field(
+        default=False,
+        description="Whether to refresh models periodically from the provider",
+    )


# TODO: this class is more confusing than useful right now. We need to make it
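Combined with the `OpenAIMixin` change in the next hunk, the whole mechanism is one shared field read by one shared method. A simplified, runnable sketch (the real classes carry many more fields and methods):

```python
import asyncio

from pydantic import BaseModel, Field

class RemoteInferenceProviderConfig(BaseModel):
    # New shared field; every adapter config inherits it.
    refresh_models: bool = Field(
        default=False,
        description="Whether to refresh models periodically from the provider",
    )

class OpenAIMixin(BaseModel):
    config: RemoteInferenceProviderConfig

    async def should_refresh_models(self) -> bool:
        # One shared implementation; the per-adapter overrides above are deleted.
        return self.config.refresh_models

mixin = OpenAIMixin(config=RemoteInferenceProviderConfig())
print(asyncio.run(mixin.should_refresh_models()))  # False unless configured
```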
@@ -484,7 +484,7 @@ class OpenAIMixin(NeedsRequestProviderData, ABC, BaseModel):
        return model in self._model_cache

    async def should_refresh_models(self) -> bool:
-        return False
+        return self.config.refresh_models

    #
    # The model_dump implementations are to avoid serializing the extra fields,
@@ -186,43 +186,3 @@ async def test_openai_chat_completion_is_async(vllm_inference_adapter):

    assert mock_create_client.call_count == 4  # no cheating
    assert total_time < (sleep_time * 2), f"Total time taken: {total_time}s exceeded expected max"
-
-
-async def test_should_refresh_models():
-    """
-    Test the should_refresh_models method with different refresh_models configurations.
-
-    This test verifies that:
-    1. When refresh_models is True, should_refresh_models returns True regardless of api_token
-    2. When refresh_models is False, should_refresh_models returns False regardless of api_token
-    """
-
-    # Test case 1: refresh_models is True, api_token is None
-    config1 = VLLMInferenceAdapterConfig(url="http://test.localhost", api_token=None, refresh_models=True)
-    adapter1 = VLLMInferenceAdapter(config=config1)
-    result1 = await adapter1.should_refresh_models()
-    assert result1 is True, "should_refresh_models should return True when refresh_models is True"
-
-    # Test case 2: refresh_models is True, api_token is empty string
-    config2 = VLLMInferenceAdapterConfig(url="http://test.localhost", api_token="", refresh_models=True)
-    adapter2 = VLLMInferenceAdapter(config=config2)
-    result2 = await adapter2.should_refresh_models()
-    assert result2 is True, "should_refresh_models should return True when refresh_models is True"
-
-    # Test case 3: refresh_models is True, api_token is "fake" (default)
-    config3 = VLLMInferenceAdapterConfig(url="http://test.localhost", api_token="fake", refresh_models=True)
-    adapter3 = VLLMInferenceAdapter(config=config3)
-    result3 = await adapter3.should_refresh_models()
-    assert result3 is True, "should_refresh_models should return True when refresh_models is True"
-
-    # Test case 4: refresh_models is True, api_token is real token
-    config4 = VLLMInferenceAdapterConfig(url="http://test.localhost", api_token="real-token-123", refresh_models=True)
-    adapter4 = VLLMInferenceAdapter(config=config4)
-    result4 = await adapter4.should_refresh_models()
-    assert result4 is True, "should_refresh_models should return True when refresh_models is True"
-
-    # Test case 5: refresh_models is False, api_token is real token
-    config5 = VLLMInferenceAdapterConfig(url="http://test.localhost", api_token="real-token-456", refresh_models=False)
-    adapter5 = VLLMInferenceAdapter(config=config5)
-    result5 = await adapter5.should_refresh_models()
-    assert result5 is False, "should_refresh_models should return False when refresh_models is False"
@@ -466,10 +466,16 @@ class TestOpenAIMixinModelRegistration:
        assert result is None

    async def test_should_refresh_models(self, mixin):
-        """Test should_refresh_models method (should always return False)"""
+        """Test should_refresh_models method returns config value"""
+        # Default config has refresh_models=False
        result = await mixin.should_refresh_models()
        assert result is False

+        config_with_refresh = RemoteInferenceProviderConfig(refresh_models=True)
+        mixin_with_refresh = OpenAIMixinImpl(config=config_with_refresh)
+        result_with_refresh = await mixin_with_refresh.should_refresh_models()
+        assert result_with_refresh is True
+
    async def test_register_model_error_propagation(self, mixin, mock_client_with_exception, mock_client_context):
        """Test that errors from provider API are properly propagated during registration"""
        model = Model(