diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index 0b6c1be3b..f8f01756d 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -71,7 +71,7 @@ jobs: - name: Build Llama Stack run: | - uv run llama stack build --template starter --image-type venv + uv run llama stack build --template ci-tests --image-type venv - name: Check Storage and Memory Available Before Tests if: ${{ always() }} @@ -92,9 +92,9 @@ jobs: shell: bash run: | if [ "${{ matrix.client-type }}" == "library" ]; then - stack_config="starter" + stack_config="ci-tests" else - stack_config="server:starter" + stack_config="server:ci-tests" fi uv run pytest -s -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \ -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \ diff --git a/.github/workflows/integration-vector-io-tests.yml b/.github/workflows/integration-vector-io-tests.yml index c11720b4b..ec236b33b 100644 --- a/.github/workflows/integration-vector-io-tests.yml +++ b/.github/workflows/integration-vector-io-tests.yml @@ -93,7 +93,7 @@ jobs: - name: Build Llama Stack run: | - uv run llama stack build --template starter --image-type venv + uv run llama stack build --template ci-tests --image-type venv - name: Check Storage and Memory Available Before Tests if: ${{ always() }} diff --git a/.github/workflows/providers-build.yml b/.github/workflows/providers-build.yml index 6de72cd60..392fddda6 100644 --- a/.github/workflows/providers-build.yml +++ b/.github/workflows/providers-build.yml @@ -97,9 +97,9 @@ jobs: - name: Build a single provider run: | - yq -i '.image_type = "container"' llama_stack/templates/starter/build.yaml - yq -i '.image_name = "test"' llama_stack/templates/starter/build.yaml - USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config llama_stack/templates/starter/build.yaml + yq -i '.image_type = "container"' llama_stack/templates/ci-tests/build.yaml + yq -i '.image_name = "test"' llama_stack/templates/ci-tests/build.yaml + USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config llama_stack/templates/ci-tests/build.yaml - name: Inspect the container image entrypoint run: | @@ -126,14 +126,14 @@ jobs: .image_type = "container" | .image_name = "ubi9-test" | .distribution_spec.container_image = "registry.access.redhat.com/ubi9:latest" - ' llama_stack/templates/starter/build.yaml + ' llama_stack/templates/ci-tests/build.yaml - name: Build dev container (UBI9) env: USE_COPY_NOT_MOUNT: "true" LLAMA_STACK_DIR: "." 
run: | - uv run llama stack build --config llama_stack/templates/starter/build.yaml + uv run llama stack build --config llama_stack/templates/ci-tests/build.yaml - name: Inspect UBI9 image run: | diff --git a/.github/workflows/python-build-test.yml b/.github/workflows/python-build-test.yml index 63ddd9b54..efd1f2cc9 100644 --- a/.github/workflows/python-build-test.yml +++ b/.github/workflows/python-build-test.yml @@ -20,7 +20,7 @@ jobs: uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Install uv - uses: astral-sh/setup-uv@bd01e18f51369d5a26f1651c3cb451d3417e3bba # v6.3.1 + uses: astral-sh/setup-uv@7edac99f961f18b581bbd960d59d049f04c0002f # v6.4.1 with: python-version: ${{ matrix.python-version }} activate-environment: true diff --git a/docs/source/providers/inference/index.md b/docs/source/providers/inference/index.md index 05773efce..6582e08de 100644 --- a/docs/source/providers/inference/index.md +++ b/docs/source/providers/inference/index.md @@ -4,7 +4,6 @@ This section contains documentation for all available providers for the **infere - [inline::meta-reference](inline_meta-reference.md) - [inline::sentence-transformers](inline_sentence-transformers.md) -- [inline::vllm](inline_vllm.md) - [remote::anthropic](remote_anthropic.md) - [remote::bedrock](remote_bedrock.md) - [remote::cerebras](remote_cerebras.md) diff --git a/docs/source/providers/inference/inline_vllm.md b/docs/source/providers/inference/inline_vllm.md deleted file mode 100644 index 6ea34acb8..000000000 --- a/docs/source/providers/inference/inline_vllm.md +++ /dev/null @@ -1,29 +0,0 @@ -# inline::vllm - -## Description - -vLLM inference provider for high-performance model serving with PagedAttention and continuous batching. - -## Configuration - -| Field | Type | Required | Default | Description | -|-------|------|----------|---------|-------------| -| `tensor_parallel_size` | `` | No | 1 | Number of tensor parallel replicas (number of GPUs to use). | -| `max_tokens` | `` | No | 4096 | Maximum number of tokens to generate. | -| `max_model_len` | `` | No | 4096 | Maximum context length to use during serving. | -| `max_num_seqs` | `` | No | 4 | Maximum parallel batch size for generation. | -| `enforce_eager` | `` | No | False | Whether to use eager mode for inference (otherwise cuda graphs are used). | -| `gpu_memory_utilization` | `` | No | 0.3 | How much GPU memory will be allocated when this provider has finished loading, including memory that was already allocated before loading. | - -## Sample Configuration - -```yaml -tensor_parallel_size: ${env.TENSOR_PARALLEL_SIZE:=1} -max_tokens: ${env.MAX_TOKENS:=4096} -max_model_len: ${env.MAX_MODEL_LEN:=4096} -max_num_seqs: ${env.MAX_NUM_SEQS:=4} -enforce_eager: ${env.ENFORCE_EAGER:=False} -gpu_memory_utilization: ${env.GPU_MEMORY_UTILIZATION:=0.3} - -``` - diff --git a/docs/source/providers/inference/remote_vllm.md b/docs/source/providers/inference/remote_vllm.md index 6c725fb41..5291199a4 100644 --- a/docs/source/providers/inference/remote_vllm.md +++ b/docs/source/providers/inference/remote_vllm.md @@ -12,11 +12,13 @@ Remote vLLM inference provider for connecting to vLLM servers. | `max_tokens` | `` | No | 4096 | Maximum number of tokens to generate. | | `api_token` | `str \| None` | No | fake | The API token | | `tls_verify` | `bool \| str` | No | True | Whether to verify TLS certificates. Can be a boolean or a path to a CA certificate file. 
| +| `refresh_models` | `` | No | False | Whether to refresh models periodically | +| `refresh_models_interval` | `` | No | 300 | Interval in seconds to refresh models | ## Sample Configuration ```yaml -url: ${env.VLLM_URL} +url: ${env.VLLM_URL:=} max_tokens: ${env.VLLM_MAX_TOKENS:=4096} api_token: ${env.VLLM_API_TOKEN:=fake} tls_verify: ${env.VLLM_TLS_VERIFY:=true} diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py index 26de04b68..b2bb8a8e6 100644 --- a/llama_stack/apis/inference/inference.py +++ b/llama_stack/apis/inference/inference.py @@ -819,7 +819,7 @@ class OpenAIEmbeddingsResponse(BaseModel): class ModelStore(Protocol): async def get_model(self, identifier: str) -> Model: ... - async def update_registered_models( + async def update_registered_llm_models( self, provider_id: str, models: list[Model], diff --git a/llama_stack/distribution/routing_tables/models.py b/llama_stack/distribution/routing_tables/models.py index 90f8afa1c..9a9db7257 100644 --- a/llama_stack/distribution/routing_tables/models.py +++ b/llama_stack/distribution/routing_tables/models.py @@ -81,7 +81,7 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models): raise ValueError(f"Model {model_id} not found") await self.unregister_object(existing_model) - async def update_registered_models( + async def update_registered_llm_models( self, provider_id: str, models: list[Model], @@ -92,12 +92,16 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models): # from run.yaml) that we need to keep track of model_ids = {} for model in existing_models: - if model.provider_id == provider_id: + # we leave embeddings models alone because often we don't get metadata + # (embedding dimension, etc.) from the provider + if model.provider_id == provider_id and model.model_type == ModelType.llm: model_ids[model.provider_resource_id] = model.identifier logger.debug(f"unregistering model {model.identifier}") await self.unregister_object(model) for model in models: + if model.model_type != ModelType.llm: + continue if model.provider_resource_id in model_ids: model.identifier = model_ids[model.provider_resource_id] diff --git a/llama_stack/providers/inline/files/localfs/__init__.py b/llama_stack/providers/inline/files/localfs/__init__.py index 7a04e61c6..71664efad 100644 --- a/llama_stack/providers/inline/files/localfs/__init__.py +++ b/llama_stack/providers/inline/files/localfs/__init__.py @@ -6,7 +6,7 @@ from typing import Any -from llama_stack.distribution.datatypes import Api +from llama_stack.distribution.datatypes import AccessRule, Api from .config import LocalfsFilesImplConfig from .files import LocalfsFilesImpl @@ -14,7 +14,7 @@ from .files import LocalfsFilesImpl __all__ = ["LocalfsFilesImpl", "LocalfsFilesImplConfig"] -async def get_provider_impl(config: LocalfsFilesImplConfig, deps: dict[Api, Any]): - impl = LocalfsFilesImpl(config) +async def get_provider_impl(config: LocalfsFilesImplConfig, deps: dict[Api, Any], policy: list[AccessRule]): + impl = LocalfsFilesImpl(config, policy) await impl.initialize() return impl diff --git a/llama_stack/providers/inline/files/localfs/files.py b/llama_stack/providers/inline/files/localfs/files.py index bdf8c42c7..433762c5a 100644 --- a/llama_stack/providers/inline/files/localfs/files.py +++ b/llama_stack/providers/inline/files/localfs/files.py @@ -19,16 +19,19 @@ from llama_stack.apis.files import ( OpenAIFileObject, OpenAIFilePurpose, ) +from llama_stack.distribution.datatypes import AccessRule from llama_stack.providers.utils.sqlstore.api 
import ColumnDefinition, ColumnType -from llama_stack.providers.utils.sqlstore.sqlstore import SqlStore, sqlstore_impl +from llama_stack.providers.utils.sqlstore.authorized_sqlstore import AuthorizedSqlStore +from llama_stack.providers.utils.sqlstore.sqlstore import sqlstore_impl from .config import LocalfsFilesImplConfig class LocalfsFilesImpl(Files): - def __init__(self, config: LocalfsFilesImplConfig) -> None: + def __init__(self, config: LocalfsFilesImplConfig, policy: list[AccessRule]) -> None: self.config = config - self.sql_store: SqlStore | None = None + self.policy = policy + self.sql_store: AuthorizedSqlStore | None = None async def initialize(self) -> None: """Initialize the files provider by setting up storage directory and metadata database.""" @@ -37,7 +40,7 @@ class LocalfsFilesImpl(Files): storage_path.mkdir(parents=True, exist_ok=True) # Initialize SQL store for metadata - self.sql_store = sqlstore_impl(self.config.metadata_store) + self.sql_store = AuthorizedSqlStore(sqlstore_impl(self.config.metadata_store)) await self.sql_store.create_table( "openai_files", { @@ -126,6 +129,7 @@ class LocalfsFilesImpl(Files): paginated_result = await self.sql_store.fetch_all( table="openai_files", + policy=self.policy, where=where_conditions if where_conditions else None, order_by=[("created_at", order.value)], cursor=("id", after) if after else None, @@ -156,7 +160,7 @@ class LocalfsFilesImpl(Files): if not self.sql_store: raise RuntimeError("Files provider not initialized") - row = await self.sql_store.fetch_one("openai_files", where={"id": file_id}) + row = await self.sql_store.fetch_one("openai_files", policy=self.policy, where={"id": file_id}) if not row: raise ValueError(f"File with id {file_id} not found") @@ -174,7 +178,7 @@ class LocalfsFilesImpl(Files): if not self.sql_store: raise RuntimeError("Files provider not initialized") - row = await self.sql_store.fetch_one("openai_files", where={"id": file_id}) + row = await self.sql_store.fetch_one("openai_files", policy=self.policy, where={"id": file_id}) if not row: raise ValueError(f"File with id {file_id} not found") @@ -197,7 +201,7 @@ class LocalfsFilesImpl(Files): raise RuntimeError("Files provider not initialized") # Get file metadata - row = await self.sql_store.fetch_one("openai_files", where={"id": file_id}) + row = await self.sql_store.fetch_one("openai_files", policy=self.policy, where={"id": file_id}) if not row: raise ValueError(f"File with id {file_id} not found") diff --git a/llama_stack/providers/inline/inference/vllm/__init__.py b/llama_stack/providers/inline/inference/vllm/__init__.py deleted file mode 100644 index d0ec3e084..000000000 --- a/llama_stack/providers/inline/inference/vllm/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from typing import Any - -from .config import VLLMConfig - - -async def get_provider_impl(config: VLLMConfig, _deps: dict[str, Any]): - from .vllm import VLLMInferenceImpl - - impl = VLLMInferenceImpl(config) - await impl.initialize() - return impl diff --git a/llama_stack/providers/inline/inference/vllm/config.py b/llama_stack/providers/inline/inference/vllm/config.py deleted file mode 100644 index 660ef206b..000000000 --- a/llama_stack/providers/inline/inference/vllm/config.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. 
-# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from typing import Any - -from pydantic import BaseModel, Field - -from llama_stack.schema_utils import json_schema_type - - -@json_schema_type -class VLLMConfig(BaseModel): - """Configuration for the vLLM inference provider. - - Note that the model name is no longer part of this static configuration. - You can bind an instance of this provider to a specific model with the - ``models.register()`` API call.""" - - tensor_parallel_size: int = Field( - default=1, - description="Number of tensor parallel replicas (number of GPUs to use).", - ) - max_tokens: int = Field( - default=4096, - description="Maximum number of tokens to generate.", - ) - max_model_len: int = Field(default=4096, description="Maximum context length to use during serving.") - max_num_seqs: int = Field(default=4, description="Maximum parallel batch size for generation.") - enforce_eager: bool = Field( - default=False, - description="Whether to use eager mode for inference (otherwise cuda graphs are used).", - ) - gpu_memory_utilization: float = Field( - default=0.3, - description=( - "How much GPU memory will be allocated when this provider has finished " - "loading, including memory that was already allocated before loading." - ), - ) - - @classmethod - def sample_run_config(cls, **kwargs: Any) -> dict[str, Any]: - return { - "tensor_parallel_size": "${env.TENSOR_PARALLEL_SIZE:=1}", - "max_tokens": "${env.MAX_TOKENS:=4096}", - "max_model_len": "${env.MAX_MODEL_LEN:=4096}", - "max_num_seqs": "${env.MAX_NUM_SEQS:=4}", - "enforce_eager": "${env.ENFORCE_EAGER:=False}", - "gpu_memory_utilization": "${env.GPU_MEMORY_UTILIZATION:=0.3}", - } diff --git a/llama_stack/providers/inline/inference/vllm/openai_utils.py b/llama_stack/providers/inline/inference/vllm/openai_utils.py deleted file mode 100644 index 77cbf0403..000000000 --- a/llama_stack/providers/inline/inference/vllm/openai_utils.py +++ /dev/null @@ -1,170 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - - -import vllm - -from llama_stack.apis.inference import ( - ChatCompletionRequest, - GrammarResponseFormat, - JsonSchemaResponseFormat, - Message, - ToolChoice, - ToolDefinition, - UserMessage, -) -from llama_stack.models.llama.datatypes import BuiltinTool -from llama_stack.providers.utils.inference.openai_compat import ( - convert_message_to_openai_dict, - get_sampling_options, -) - -############################################################################### -# This file contains OpenAI compatibility code that is currently only used -# by the inline vLLM connector. Some or all of this code may be moved to a -# central location at a later date. - - -def _merge_context_into_content(message: Message) -> Message: # type: ignore - """ - Merge the ``context`` field of a Llama Stack ``Message`` object into - the content field for compabilitiy with OpenAI-style APIs. - - Generates a content string that emulates the current behavior - of ``llama_models.llama3.api.chat_format.encode_message()``. - - :param message: Message that may include ``context`` field - - :returns: A version of ``message`` with any context merged into the - ``content`` field. 
- """ - if not isinstance(message, UserMessage): # Separate type check for linter - return message - if message.context is None: - return message - return UserMessage( - role=message.role, - # Emumate llama_models.llama3.api.chat_format.encode_message() - content=message.content + "\n\n" + message.context, - context=None, - ) - - -def _llama_stack_tools_to_openai_tools( - tools: list[ToolDefinition] | None = None, -) -> list[vllm.entrypoints.openai.protocol.ChatCompletionToolsParam]: - """ - Convert the list of available tools from Llama Stack's format to vLLM's - version of OpenAI's format. - """ - if tools is None: - return [] - - result = [] - for t in tools: - if isinstance(t.tool_name, BuiltinTool): - raise NotImplementedError("Built-in tools not yet implemented") - if t.parameters is None: - parameters = None - else: # if t.parameters is not None - # Convert the "required" flags to a list of required params - required_params = [k for k, v in t.parameters.items() if v.required] - parameters = { - "type": "object", # Mystery value that shows up in OpenAI docs - "properties": { - k: {"type": v.param_type, "description": v.description} for k, v in t.parameters.items() - }, - "required": required_params, - } - - function_def = vllm.entrypoints.openai.protocol.FunctionDefinition( - name=t.tool_name, description=t.description, parameters=parameters - ) - - # Every tool definition is double-boxed in a ChatCompletionToolsParam - result.append(vllm.entrypoints.openai.protocol.ChatCompletionToolsParam(function=function_def)) - return result - - -async def llama_stack_chat_completion_to_openai_chat_completion_dict( - request: ChatCompletionRequest, -) -> dict: - """ - Convert a chat completion request in Llama Stack format into an - equivalent set of arguments to pass to an OpenAI-compatible - chat completions API. - - :param request: Bundled request parameters in Llama Stack format. - - :returns: Dictionary of key-value pairs to use as an initializer - for a dataclass or to be converted directly to JSON and sent - over the wire. - """ - - converted_messages = [ - # This mystery async call makes the parent function also be async - await convert_message_to_openai_dict(_merge_context_into_content(m), download=True) - for m in request.messages - ] - converted_tools = _llama_stack_tools_to_openai_tools(request.tools) - - # Llama will try to use built-in tools with no tool catalog, so don't enable - # tool choice unless at least one tool is enabled. - converted_tool_choice = "none" - if ( - request.tool_config is not None - and request.tool_config.tool_choice == ToolChoice.auto - and request.tools is not None - and len(request.tools) > 0 - ): - converted_tool_choice = "auto" - - # TODO: Figure out what to do with the tool_prompt_format argument. - # Other connectors appear to drop it quietly. - - # Use Llama Stack shared code to translate sampling parameters. - sampling_options = get_sampling_options(request.sampling_params) - - # get_sampling_options() translates repetition penalties to an option that - # OpenAI's APIs don't know about. - # vLLM's OpenAI-compatible API also handles repetition penalties wrong. - # For now, translate repetition penalties into a format that vLLM's broken - # API will handle correctly. Two wrongs make a right... 
- if "repeat_penalty" in sampling_options: - del sampling_options["repeat_penalty"] - if request.sampling_params.repetition_penalty is not None and request.sampling_params.repetition_penalty != 1.0: - sampling_options["repetition_penalty"] = request.sampling_params.repetition_penalty - - # Convert a single response format into four different parameters, per - # the OpenAI spec - guided_decoding_options = dict() - if request.response_format is None: - # Use defaults - pass - elif isinstance(request.response_format, JsonSchemaResponseFormat): - guided_decoding_options["guided_json"] = request.response_format.json_schema - elif isinstance(request.response_format, GrammarResponseFormat): - guided_decoding_options["guided_grammar"] = request.response_format.bnf - else: - raise TypeError(f"ResponseFormat object is of unexpected subtype '{type(request.response_format)}'") - - logprob_options = dict() - if request.logprobs is not None: - logprob_options["logprobs"] = request.logprobs.top_k - - # Marshall together all the arguments for a ChatCompletionRequest - request_options = { - "model": request.model, - "messages": converted_messages, - "tools": converted_tools, - "tool_choice": converted_tool_choice, - "stream": request.stream, - **sampling_options, - **guided_decoding_options, - **logprob_options, - } - - return request_options diff --git a/llama_stack/providers/inline/inference/vllm/vllm.py b/llama_stack/providers/inline/inference/vllm/vllm.py deleted file mode 100644 index bf54462b5..000000000 --- a/llama_stack/providers/inline/inference/vllm/vllm.py +++ /dev/null @@ -1,811 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -import json -import re -import uuid -from collections.abc import AsyncGenerator, AsyncIterator - -# These vLLM modules contain names that overlap with Llama Stack names, so we import -# fully-qualified names -import vllm.entrypoints.openai.protocol -import vllm.sampling_params -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.entrypoints.openai.serving_chat import OpenAIServingChat -from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels - -from llama_stack.apis.common.content_types import ( - InterleavedContent, - InterleavedContentItem, - TextDelta, - ToolCallDelta, -) -from llama_stack.apis.inference import ( - ChatCompletionRequest, - ChatCompletionResponse, - ChatCompletionResponseEvent, - ChatCompletionResponseEventType, - ChatCompletionResponseStreamChunk, - CompletionMessage, - CompletionResponse, - CompletionResponseStreamChunk, - EmbeddingsResponse, - EmbeddingTaskType, - GrammarResponseFormat, - Inference, - JsonSchemaResponseFormat, - LogProbConfig, - Message, - OpenAIEmbeddingsResponse, - ResponseFormat, - SamplingParams, - TextTruncation, - TokenLogProbs, - ToolChoice, - ToolConfig, - TopKSamplingStrategy, - TopPSamplingStrategy, -) -from llama_stack.apis.models import Model -from llama_stack.log import get_logger -from llama_stack.models.llama import sku_list -from llama_stack.models.llama.datatypes import ( - StopReason, - ToolCall, - ToolDefinition, - ToolPromptFormat, -) -from llama_stack.models.llama.llama3.chat_format import ChatFormat -from llama_stack.models.llama.llama3.tokenizer import Tokenizer -from llama_stack.providers.remote.inference.vllm.vllm import build_hf_repo_model_entries -from llama_stack.providers.utils.inference.model_registry import ( - ModelRegistryHelper, - ModelsProtocolPrivate, -) -from llama_stack.providers.utils.inference.openai_compat import ( - OpenAIChatCompletionToLlamaStackMixin, - OpenAICompatCompletionChoice, - OpenAICompatCompletionResponse, - OpenAICompletionToLlamaStackMixin, - get_stop_reason, - process_chat_completion_stream_response, -) -from llama_stack.providers.utils.inference.prompt_adapter import ( - chat_completion_request_to_prompt, -) - -from .config import VLLMConfig -from .openai_utils import llama_stack_chat_completion_to_openai_chat_completion_dict - -# Map from Hugging Face model architecture name to appropriate tool parser. -# See vllm.entrypoints.openai.tool_parsers.ToolParserManager.tool_parsers for the full list of -# available parsers. -# TODO: Expand this list -CONFIG_TYPE_TO_TOOL_PARSER = { - "GraniteConfig": "granite", - "MllamaConfig": "llama3_json", - "LlamaConfig": "llama3_json", -} -DEFAULT_TOOL_PARSER = "pythonic" - - -logger = get_logger(__name__, category="inference") - - -def _random_uuid_str() -> str: - return str(uuid.uuid4().hex) - - -def _response_format_to_guided_decoding_params( - response_format: ResponseFormat | None, # type: ignore -) -> vllm.sampling_params.GuidedDecodingParams: - """ - Translate constrained decoding parameters from Llama Stack's format to vLLM's format. - - :param response_format: Llama Stack version of constrained decoding info. Can be ``None``, - indicating no constraints. - :returns: The equivalent dataclass object for the low-level inference layer of vLLM. - """ - if response_format is None: - # As of vLLM 0.6.3, the default constructor for GuidedDecodingParams() returns an invalid - # value that crashes the executor on some code paths. Use ``None`` instead. 
- return None - - # Llama Stack currently implements fewer types of constrained decoding than vLLM does. - # Translate the types that exist and detect if Llama Stack adds new ones. - if isinstance(response_format, JsonSchemaResponseFormat): - return vllm.sampling_params.GuidedDecodingParams(json=response_format.json_schema) - elif isinstance(response_format, GrammarResponseFormat): - # BNF grammar. - # Llama Stack uses the parse tree of the grammar, while vLLM uses the string - # representation of the grammar. - raise TypeError( - "Constrained decoding with BNF grammars is not currently implemented, because the " - "reference implementation does not implement it." - ) - else: - raise TypeError(f"ResponseFormat object is of unexpected subtype '{type(response_format)}'") - - -def _convert_sampling_params( - sampling_params: SamplingParams | None, - response_format: ResponseFormat | None, # type: ignore - log_prob_config: LogProbConfig | None, -) -> vllm.SamplingParams: - """Convert sampling and constrained decoding configuration from Llama Stack's format to vLLM's - format.""" - # In the absence of provided config values, use Llama Stack defaults as encoded in the Llama - # Stack dataclasses. These defaults are different from vLLM's defaults. - if sampling_params is None: - sampling_params = SamplingParams() - if log_prob_config is None: - log_prob_config = LogProbConfig() - - if isinstance(sampling_params.strategy, TopKSamplingStrategy): - if sampling_params.strategy.top_k == 0: - # vLLM treats "k" differently for top-k sampling - vllm_top_k = -1 - else: - vllm_top_k = sampling_params.strategy.top_k - else: - vllm_top_k = -1 - - if isinstance(sampling_params.strategy, TopPSamplingStrategy): - vllm_top_p = sampling_params.strategy.top_p - # Llama Stack only allows temperature with top-P. - vllm_temperature = sampling_params.strategy.temperature - else: - vllm_top_p = 1.0 - vllm_temperature = 0.0 - - # vLLM allows top-p and top-k at the same time. - vllm_sampling_params = vllm.SamplingParams.from_optional( - max_tokens=(None if sampling_params.max_tokens == 0 else sampling_params.max_tokens), - temperature=vllm_temperature, - top_p=vllm_top_p, - top_k=vllm_top_k, - repetition_penalty=sampling_params.repetition_penalty, - guided_decoding=_response_format_to_guided_decoding_params(response_format), - logprobs=log_prob_config.top_k, - ) - return vllm_sampling_params - - -class VLLMInferenceImpl( - Inference, - OpenAIChatCompletionToLlamaStackMixin, - OpenAICompletionToLlamaStackMixin, - ModelsProtocolPrivate, -): - """ - vLLM-based inference model adapter for Llama Stack with support for multiple models. - - Requires the configuration parameters documented in the :class:`VllmConfig2` class. - """ - - config: VLLMConfig - register_helper: ModelRegistryHelper - model_ids: set[str] - resolved_model_id: str | None - engine: AsyncLLMEngine | None - chat: OpenAIServingChat | None - is_meta_llama_model: bool - - def __init__(self, config: VLLMConfig): - self.config = config - logger.info(f"Config is: {self.config}") - - self.register_helper = ModelRegistryHelper(build_hf_repo_model_entries()) - self.formatter = ChatFormat(Tokenizer.get_instance()) - - # The following are initialized when paths are bound to this provider - self.resolved_model_id = None - self.model_ids = set() - self.engine = None - self.chat = None - self.is_meta_llama_model = False - - ########################################################################### - # METHODS INHERITED FROM IMPLICIT BASE CLASS. 
- # TODO: Make this class inherit from the new base class ProviderBase once that class exists. - - async def initialize(self) -> None: - """ - Callback that is invoked through many levels of indirection during provider class - instantiation, sometime after when __init__() is called and before any model registration - methods or methods connected to a REST API are called. - - It's not clear what assumptions the class can make about the platform's initialization - state here that can't be made during __init__(), and vLLM can't be started until we know - what model it's supposed to be serving, so nothing happens here currently. - """ - pass - - async def shutdown(self) -> None: - logger.info(f"Shutting down inline vLLM inference provider {self}.") - if self.engine is not None: - self.engine.shutdown_background_loop() - self.engine = None - self.chat = None - self.model_ids = set() - self.resolved_model_id = None - - ########################################################################### - # METHODS INHERITED FROM ModelsProtocolPrivate INTERFACE - - # Note that the return type of the superclass method is WRONG - async def register_model(self, model: Model) -> Model: - """ - Callback that is called when the server associates an inference endpoint with an - inference provider. - - :param model: Object that encapsulates parameters necessary for identifying a specific - LLM. - - :returns: The input ``Model`` object. It may or may not be permissible to change fields - before returning this object. - """ - logger.debug(f"In register_model({model})") - - # First attempt to interpret the model coordinates as a Llama model name - resolved_llama_model = sku_list.resolve_model(model.provider_model_id) - if resolved_llama_model is not None: - # Load from Hugging Face repo into default local cache dir - model_id_for_vllm = resolved_llama_model.huggingface_repo - - # Detect a genuine Meta Llama model to trigger Meta-specific preprocessing. - # Don't set self.is_meta_llama_model until we actually load the model. - is_meta_llama_model = True - else: # if resolved_llama_model is None - # Not a Llama model name. Pass the model id through to vLLM's loader - model_id_for_vllm = model.provider_model_id - is_meta_llama_model = False - - if self.resolved_model_id is not None: - if model_id_for_vllm != self.resolved_model_id: - raise ValueError( - f"Attempted to serve two LLMs (ids '{self.resolved_model_id}') and " - f"'{model_id_for_vllm}') from one copy of provider '{self}'. Use multiple " - f"copies of the provider instead." - ) - else: - # Model already loaded - logger.info( - f"Requested id {model} resolves to {model_id_for_vllm}, which is already loaded. Continuing." - ) - self.model_ids.add(model.model_id) - return model - - logger.info(f"Requested id {model} resolves to {model_id_for_vllm}. Loading {model_id_for_vllm}.") - if is_meta_llama_model: - logger.info(f"Model {model_id_for_vllm} is a Meta Llama model.") - self.is_meta_llama_model = is_meta_llama_model - - # If we get here, this is the first time registering a model. - # Preload so that the first inference request won't time out. 
- engine_args = AsyncEngineArgs( - model=model_id_for_vllm, - tokenizer=model_id_for_vllm, - tensor_parallel_size=self.config.tensor_parallel_size, - enforce_eager=self.config.enforce_eager, - gpu_memory_utilization=self.config.gpu_memory_utilization, - max_num_seqs=self.config.max_num_seqs, - max_model_len=self.config.max_model_len, - ) - self.engine = AsyncLLMEngine.from_engine_args(engine_args) - - # vLLM currently requires the user to specify the tool parser manually. To choose a tool - # parser, we need to determine what model architecture is being used. For now, we infer - # that information from what config class the model uses. - low_level_model_config = self.engine.engine.get_model_config() - hf_config = low_level_model_config.hf_config - hf_config_class_name = hf_config.__class__.__name__ - if hf_config_class_name in CONFIG_TYPE_TO_TOOL_PARSER: - tool_parser = CONFIG_TYPE_TO_TOOL_PARSER[hf_config_class_name] - else: - # No info -- choose a default so we can at least attempt tool - # use. - tool_parser = DEFAULT_TOOL_PARSER - logger.debug(f"{hf_config_class_name=}") - logger.debug(f"{tool_parser=}") - - # Wrap the lower-level engine in an OpenAI-compatible chat API - model_config = await self.engine.get_model_config() - self.chat = OpenAIServingChat( - engine_client=self.engine, - model_config=model_config, - models=OpenAIServingModels( - engine_client=self.engine, - model_config=model_config, - base_model_paths=[ - # The layer below us will only see resolved model IDs - BaseModelPath(model_id_for_vllm, model_id_for_vllm) - ], - ), - response_role="assistant", - request_logger=None, # Use default logging - chat_template=None, # Use default template from model checkpoint - enable_auto_tools=True, - tool_parser=tool_parser, - chat_template_content_format="auto", - ) - self.resolved_model_id = model_id_for_vllm - self.model_ids.add(model.model_id) - - logger.info(f"Finished preloading model: {model_id_for_vllm}") - - return model - - async def unregister_model(self, model_id: str) -> None: - """ - Callback that is called when the server removes an inference endpoint from an inference - provider. - - :param model_id: The same external ID that the higher layers of the stack previously passed - to :func:`register_model()` - """ - if model_id not in self.model_ids: - raise ValueError( - f"Attempted to unregister model ID '{model_id}', but that ID is not registered to this provider." - ) - self.model_ids.remove(model_id) - - if len(self.model_ids) == 0: - # Last model was just unregistered. Shut down the connection to vLLM and free up - # resources. - # Note that this operation may cause in-flight chat completion requests on the - # now-unregistered model to return errors. - self.resolved_model_id = None - self.chat = None - self.engine.shutdown_background_loop() - self.engine = None - - ########################################################################### - # METHODS INHERITED FROM Inference INTERFACE - - async def completion( - self, - model_id: str, - content: InterleavedContent, - sampling_params: SamplingParams | None = None, - response_format: ResponseFormat | None = None, - stream: bool | None = False, - logprobs: LogProbConfig | None = None, - ) -> CompletionResponse | AsyncIterator[CompletionResponseStreamChunk]: - if model_id not in self.model_ids: - raise ValueError( - f"This adapter is not registered to model id '{model_id}'. 
Registered IDs are: {self.model_ids}" - ) - if not isinstance(content, str): - raise NotImplementedError("Multimodal input not currently supported") - if sampling_params is None: - sampling_params = SamplingParams() - - converted_sampling_params = _convert_sampling_params(sampling_params, response_format, logprobs) - - logger.debug(f"{converted_sampling_params=}") - - if stream: - return self._streaming_completion(content, converted_sampling_params) - else: - streaming_result = None - async for _ in self._streaming_completion(content, converted_sampling_params): - pass - return CompletionResponse( - content=streaming_result.delta, - stop_reason=streaming_result.stop_reason, - logprobs=streaming_result.logprobs, - ) - - async def embeddings( - self, - model_id: str, - contents: list[str] | list[InterleavedContentItem], - text_truncation: TextTruncation | None = TextTruncation.none, - output_dimension: int | None = None, - task_type: EmbeddingTaskType | None = None, - ) -> EmbeddingsResponse: - raise NotImplementedError() - - async def openai_embeddings( - self, - model: str, - input: str | list[str], - encoding_format: str | None = "float", - dimensions: int | None = None, - user: str | None = None, - ) -> OpenAIEmbeddingsResponse: - raise NotImplementedError() - - async def chat_completion( - self, - model_id: str, - messages: list[Message], # type: ignore - sampling_params: SamplingParams | None = None, - response_format: ResponseFormat | None = None, # type: ignore - tools: list[ToolDefinition] | None = None, - tool_choice: ToolChoice | None = ToolChoice.auto, - tool_prompt_format: ToolPromptFormat | None = None, - stream: bool | None = False, - logprobs: LogProbConfig | None = None, - tool_config: ToolConfig | None = None, - ) -> ChatCompletionResponse | ChatCompletionResponseStreamChunk: - sampling_params = sampling_params or SamplingParams() - if model_id not in self.model_ids: - raise ValueError( - f"This adapter is not registered to model id '{model_id}'. Registered IDs are: {self.model_ids}" - ) - - # Convert to Llama Stack internal format for consistency - request = ChatCompletionRequest( - model=self.resolved_model_id, - messages=messages, - sampling_params=sampling_params, - response_format=response_format, - tools=tools, - tool_choice=tool_choice, - tool_prompt_format=tool_prompt_format, - stream=stream, - logprobs=logprobs, - ) - - if self.is_meta_llama_model: - # Bypass vLLM chat templating layer for Meta Llama models, because the - # templating layer in Llama Stack currently produces better results. - logger.debug( - f"Routing {self.resolved_model_id} chat completion through " - f"Llama Stack's templating layer instead of vLLM's." - ) - return await self._chat_completion_for_meta_llama(request) - - logger.debug(f"{self.resolved_model_id} is not a Meta Llama model") - - # Arguments to the vLLM call must be packaged as a ChatCompletionRequest dataclass. - # Note that this dataclass has the same name as a similar dataclass in Llama Stack. 
- request_options = await llama_stack_chat_completion_to_openai_chat_completion_dict(request) - chat_completion_request = vllm.entrypoints.openai.protocol.ChatCompletionRequest(**request_options) - - logger.debug(f"Converted request: {chat_completion_request}") - - vllm_result = await self.chat.create_chat_completion(chat_completion_request) - logger.debug(f"Result from vLLM: {vllm_result}") - if isinstance(vllm_result, vllm.entrypoints.openai.protocol.ErrorResponse): - raise ValueError(f"Error from vLLM layer: {vllm_result}") - - # Return type depends on "stream" argument - if stream: - if not isinstance(vllm_result, AsyncGenerator): - raise TypeError(f"Unexpected result type {type(vllm_result)} for streaming inference call") - # vLLM client returns a stream of strings, which need to be parsed. - # Stream comes in the form of an async generator. - return self._convert_streaming_results(vllm_result) - else: - if not isinstance(vllm_result, vllm.entrypoints.openai.protocol.ChatCompletionResponse): - raise TypeError(f"Unexpected result type {type(vllm_result)} for non-streaming inference call") - return self._convert_non_streaming_results(vllm_result) - - ########################################################################### - # INTERNAL METHODS - - async def _streaming_completion( - self, content: str, sampling_params: vllm.SamplingParams - ) -> AsyncIterator[CompletionResponseStreamChunk]: - """Internal implementation of :func:`completion()` API for the streaming case. Assumes - that arguments have been validated upstream. - - :param content: Must be a string - :param sampling_params: Paramters from public API's ``response_format`` - and ``sampling_params`` arguments, converted to VLLM format - """ - # We run agains the vLLM generate() call directly instead of using the OpenAI-compatible - # layer, because doing so simplifies the code here. - - # The vLLM engine requires a unique identifier for each call to generate() - request_id = _random_uuid_str() - - # The vLLM generate() API is streaming-only and returns an async generator. - # The generator returns objects of type vllm.RequestOutput. - results_generator = self.engine.generate(content, sampling_params, request_id) - - # Need to know the model's EOS token ID for the conversion code below. - # AsyncLLMEngine is a wrapper around LLMEngine, and the tokenizer is only available if - # we drill down to the LLMEngine inside the AsyncLLMEngine. - # Similarly, the tokenizer in an LLMEngine is a wrapper around a BaseTokenizerGroup, - # and we need to drill down to the Hugging Face tokenizer inside the BaseTokenizerGroup. - llm_engine = self.engine.engine - tokenizer_group = llm_engine.tokenizer - eos_token_id = tokenizer_group.tokenizer.eos_token_id - - request_output: vllm.RequestOutput = None - async for request_output in results_generator: - # Check for weird inference failures - if request_output.outputs is None or len(request_output.outputs) == 0: - # This case also should never happen - raise ValueError("Inference produced empty result") - - # If we get here, then request_output contains the final output of the generate() call. - # The result may include multiple alternate outputs, but Llama Stack APIs only allow - # us to return one. 
- output: vllm.CompletionOutput = request_output.outputs[0] - completion_string = output.text - - # Convert logprobs from vLLM's format to Llama Stack's format - logprobs = [ - TokenLogProbs(logprobs_by_token={v.decoded_token: v.logprob for _, v in logprob_dict.items()}) - for logprob_dict in output.logprobs - ] - - # The final output chunk should be labeled with the reason that the overall generate() - # call completed. - logger.debug(f"{output.stop_reason=}; {type(output.stop_reason)=}") - if output.stop_reason is None: - stop_reason = None # Still going - elif output.stop_reason == "stop": - stop_reason = StopReason.end_of_turn - elif output.stop_reason == "length": - stop_reason = StopReason.out_of_tokens - elif isinstance(output.stop_reason, int): - # If the model config specifies multiple end-of-sequence tokens, then vLLM - # will return the token ID of the EOS token in the stop_reason field. - stop_reason = StopReason.end_of_turn - else: - raise ValueError(f"Unrecognized stop reason '{output.stop_reason}'") - - # vLLM's protocol outputs the stop token, then sets end of message on the next step for - # some reason. - if request_output.outputs[-1].token_ids[-1] == eos_token_id: - stop_reason = StopReason.end_of_message - - yield CompletionResponseStreamChunk(delta=completion_string, stop_reason=stop_reason, logprobs=logprobs) - - # Llama Stack requires that the last chunk have a stop reason, but vLLM doesn't always - # provide one if it runs out of tokens. - if stop_reason is None: - yield CompletionResponseStreamChunk( - delta=completion_string, - stop_reason=StopReason.out_of_tokens, - logprobs=logprobs, - ) - - def _convert_non_streaming_results( - self, vllm_result: vllm.entrypoints.openai.protocol.ChatCompletionResponse - ) -> ChatCompletionResponse: - """ - Subroutine to convert the non-streaming output of vLLM's OpenAI-compatible API into an - equivalent Llama Stack object. - - The result from vLLM's non-streaming API is a dataclass with the same name as the Llama - Stack ChatCompletionResponse dataclass, but with more and different field names. We ignore - the fields that aren't currently present in the Llama Stack dataclass. - """ - - # There may be multiple responses, but we can only pass through the first one. - if len(vllm_result.choices) == 0: - raise ValueError("Don't know how to convert response object without any responses") - vllm_message = vllm_result.choices[0].message - vllm_finish_reason = vllm_result.choices[0].finish_reason - - converted_message = CompletionMessage( - role=vllm_message.role, - # Llama Stack API won't accept None for content field. - content=("" if vllm_message.content is None else vllm_message.content), - stop_reason=get_stop_reason(vllm_finish_reason), - tool_calls=[ - ToolCall( - call_id=t.id, - tool_name=t.function.name, - # vLLM function args come back as a string. Llama Stack expects JSON. - arguments=json.loads(t.function.arguments), - arguments_json=t.function.arguments, - ) - for t in vllm_message.tool_calls - ], - ) - - # TODO: Convert logprobs - - logger.debug(f"Converted message: {converted_message}") - - return ChatCompletionResponse( - completion_message=converted_message, - ) - - async def _chat_completion_for_meta_llama( - self, request: ChatCompletionRequest - ) -> ChatCompletionResponse | AsyncIterator[ChatCompletionResponseStreamChunk]: - """ - Subroutine that routes chat completions for Meta Llama models through Llama Stack's - chat template instead of using vLLM's version of that template. 
The Llama Stack version - of the chat template currently produces more reliable outputs. - - Once vLLM's support for Meta Llama models has matured more, we should consider routing - Meta Llama requests through the vLLM chat completions API instead of using this method. - """ - formatter = ChatFormat(Tokenizer.get_instance()) - - # Note that this function call modifies `request` in place. - prompt = await chat_completion_request_to_prompt(request, self.resolved_model_id) - - model_id = list(self.model_ids)[0] # Any model ID will do here - completion_response_or_iterator = await self.completion( - model_id=model_id, - content=prompt, - sampling_params=request.sampling_params, - response_format=request.response_format, - stream=request.stream, - logprobs=request.logprobs, - ) - - if request.stream: - if not isinstance(completion_response_or_iterator, AsyncIterator): - raise TypeError( - f"Received unexpected result type {type(completion_response_or_iterator)}for streaming request." - ) - return self._chat_completion_for_meta_llama_streaming(completion_response_or_iterator, request) - - # elsif not request.stream: - if not isinstance(completion_response_or_iterator, CompletionResponse): - raise TypeError( - f"Received unexpected result type {type(completion_response_or_iterator)}for non-streaming request." - ) - completion_response: CompletionResponse = completion_response_or_iterator - raw_message = formatter.decode_assistant_message_from_content( - completion_response.content, completion_response.stop_reason - ) - return ChatCompletionResponse( - completion_message=CompletionMessage( - content=raw_message.content, - stop_reason=raw_message.stop_reason, - tool_calls=raw_message.tool_calls, - ), - logprobs=completion_response.logprobs, - ) - - async def _chat_completion_for_meta_llama_streaming( - self, results_iterator: AsyncIterator, request: ChatCompletionRequest - ) -> AsyncIterator: - """ - Code from :func:`_chat_completion_for_meta_llama()` that needs to be a separate - method to keep asyncio happy. - """ - - # Convert to OpenAI format, then use shared code to convert to Llama Stack format. - async def _generate_and_convert_to_openai_compat(): - chunk: CompletionResponseStreamChunk # Make Pylance happy - last_text_len = 0 - async for chunk in results_iterator: - if chunk.stop_reason == StopReason.end_of_turn: - finish_reason = "stop" - elif chunk.stop_reason == StopReason.end_of_message: - finish_reason = "eos" - elif chunk.stop_reason == StopReason.out_of_tokens: - finish_reason = "length" - else: - finish_reason = None - - # Convert delta back to an actual delta - text_delta = chunk.delta[last_text_len:] - last_text_len = len(chunk.delta) - - logger.debug(f"{text_delta=}; {finish_reason=}") - - yield OpenAICompatCompletionResponse( - choices=[OpenAICompatCompletionChoice(finish_reason=finish_reason, text=text_delta)] - ) - - stream = _generate_and_convert_to_openai_compat() - async for chunk in process_chat_completion_stream_response(stream, request): - logger.debug(f"Returning chunk: {chunk}") - yield chunk - - async def _convert_streaming_results(self, vllm_result: AsyncIterator) -> AsyncIterator: - """ - Subroutine that wraps the streaming outputs of vLLM's OpenAI-compatible - API into a second async iterator that returns Llama Stack objects. - - :param vllm_result: Stream of strings that need to be parsed - """ - # Tool calls come in pieces, but Llama Stack expects them in bigger chunks. We build up - # those chunks and output them at the end. 
- # This data structure holds the current set of partial tool calls. - index_to_tool_call: dict[int, dict] = dict() - - # The Llama Stack event stream must always start with a start event. Use an empty one to - # simplify logic below - yield ChatCompletionResponseStreamChunk( - event=ChatCompletionResponseEvent( - event_type=ChatCompletionResponseEventType.start, - delta=TextDelta(text=""), - stop_reason=None, - ) - ) - - converted_stop_reason = None - async for chunk_str in vllm_result: - # Due to OpenAI compatibility, each event in the stream will start with "data: " and - # end with "\n\n". - _prefix = "data: " - _suffix = "\n\n" - if not chunk_str.startswith(_prefix) or not chunk_str.endswith(_suffix): - raise ValueError(f"Can't parse result string from vLLM: '{re.escape(chunk_str)}'") - - # In between the "data: " and newlines is an event record - data_str = chunk_str[len(_prefix) : -len(_suffix)] - - # The end of the stream is indicated with "[DONE]" - if data_str == "[DONE]": - yield ChatCompletionResponseStreamChunk( - event=ChatCompletionResponseEvent( - event_type=ChatCompletionResponseEventType.complete, - delta=TextDelta(text=""), - stop_reason=converted_stop_reason, - ) - ) - return - - # Anything that is not "[DONE]" should be a JSON record - parsed_chunk = json.loads(data_str) - - logger.debug(f"Parsed JSON event to:\n{json.dumps(parsed_chunk, indent=2)}") - - # The result may contain multiple completions, but Llama Stack APIs only support - # returning one. - first_choice = parsed_chunk["choices"][0] - converted_stop_reason = get_stop_reason(first_choice["finish_reason"]) - delta_record = first_choice["delta"] - - if "content" in delta_record: - # Text delta - yield ChatCompletionResponseStreamChunk( - event=ChatCompletionResponseEvent( - event_type=ChatCompletionResponseEventType.progress, - delta=TextDelta(text=delta_record["content"]), - stop_reason=converted_stop_reason, - ) - ) - elif "tool_calls" in delta_record: - # Tool call(s). Llama Stack APIs do not have a clear way to return partial tool - # calls, so buffer until we get a "tool calls" stop reason - for tc in delta_record["tool_calls"]: - index = tc["index"] - if index not in index_to_tool_call: - # First time this tool call is showing up - index_to_tool_call[index] = dict() - tool_call = index_to_tool_call[index] - if "id" in tc: - tool_call["call_id"] = tc["id"] - if "function" in tc: - if "name" in tc["function"]: - tool_call["tool_name"] = tc["function"]["name"] - if "arguments" in tc["function"]: - # Arguments comes in as pieces of a string - if "arguments_str" not in tool_call: - tool_call["arguments_str"] = "" - tool_call["arguments_str"] += tc["function"]["arguments"] - else: - raise ValueError(f"Don't know how to parse event delta: {delta_record}") - - if first_choice["finish_reason"] == "tool_calls": - # Special OpenAI code for "tool calls complete". - # Output the buffered tool calls. Llama Stack requires a separate event per tool - # call. - for tool_call_record in index_to_tool_call.values(): - # Arguments come in as a string. Parse the completed string. 
- tool_call_record["arguments"] = json.loads(tool_call_record["arguments_str"]) - del tool_call_record["arguments_str"] - - yield ChatCompletionResponseStreamChunk( - event=ChatCompletionResponseEvent( - event_type=ChatCompletionResponseEventType.progress, - delta=ToolCallDelta(tool_call=tool_call_record, parse_status="succeeded"), - stop_reason=converted_stop_reason, - ) - ) - - # If we get here, we've lost the connection with the vLLM event stream before it ended - # normally. - raise ValueError("vLLM event stream ended without [DONE] message.") diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py index 217870ec9..ffd30a5b5 100644 --- a/llama_stack/providers/registry/inference.py +++ b/llama_stack/providers/registry/inference.py @@ -37,16 +37,6 @@ def available_providers() -> list[ProviderSpec]: config_class="llama_stack.providers.inline.inference.meta_reference.MetaReferenceInferenceConfig", description="Meta's reference implementation of inference with support for various model formats and optimization techniques.", ), - InlineProviderSpec( - api=Api.inference, - provider_type="inline::vllm", - pip_packages=[ - "vllm", - ], - module="llama_stack.providers.inline.inference.vllm", - config_class="llama_stack.providers.inline.inference.vllm.VLLMConfig", - description="vLLM inference provider for high-performance model serving with PagedAttention and continuous batching.", - ), InlineProviderSpec( api=Api.inference, provider_type="inline::sentence-transformers", diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py index a1f7743d5..76d789d07 100644 --- a/llama_stack/providers/remote/inference/ollama/ollama.py +++ b/llama_stack/providers/remote/inference/ollama/ollama.py @@ -159,18 +159,18 @@ class OllamaInferenceAdapter( models = [] for m in response.models: model_type = ModelType.embedding if m.details.family in ["bert"] else ModelType.llm - # unfortunately, ollama does not provide embedding dimension in the model list :( - # we should likely add a hard-coded mapping of model name to embedding dimension + if model_type == ModelType.embedding: + continue models.append( Model( identifier=m.model, provider_resource_id=m.model, provider_id=provider_id, - metadata={"embedding_dimension": 384} if model_type == ModelType.embedding else {}, + metadata={}, model_type=model_type, ) ) - await self.model_store.update_registered_models(provider_id, models) + await self.model_store.update_registered_llm_models(provider_id, models) logger.debug(f"ollama refreshed model list ({len(models)} models)") await asyncio.sleep(self.config.refresh_models_interval) diff --git a/llama_stack/providers/remote/inference/vllm/config.py b/llama_stack/providers/remote/inference/vllm/config.py index e11efa7f0..ee72f974a 100644 --- a/llama_stack/providers/remote/inference/vllm/config.py +++ b/llama_stack/providers/remote/inference/vllm/config.py @@ -29,6 +29,14 @@ class VLLMInferenceAdapterConfig(BaseModel): default=True, description="Whether to verify TLS certificates. 
Can be a boolean or a path to a CA certificate file.", ) + refresh_models: bool = Field( + default=False, + description="Whether to refresh models periodically", + ) + refresh_models_interval: int = Field( + default=300, + description="Interval in seconds to refresh models", + ) @field_validator("tls_verify") @classmethod @@ -46,7 +54,7 @@ class VLLMInferenceAdapterConfig(BaseModel): @classmethod def sample_run_config( cls, - url: str = "${env.VLLM_URL}", + url: str = "${env.VLLM_URL:=}", **kwargs, ): return { diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py index d1455acaa..8bdba1e88 100644 --- a/llama_stack/providers/remote/inference/vllm/vllm.py +++ b/llama_stack/providers/remote/inference/vllm/vllm.py @@ -3,8 +3,8 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +import asyncio import json -import logging from collections.abc import AsyncGenerator, AsyncIterator from typing import Any @@ -38,6 +38,7 @@ from llama_stack.apis.inference import ( JsonSchemaResponseFormat, LogProbConfig, Message, + ModelStore, OpenAIChatCompletion, OpenAICompletion, OpenAIEmbeddingData, @@ -54,6 +55,7 @@ from llama_stack.apis.inference import ( ToolPromptFormat, ) from llama_stack.apis.models import Model, ModelType +from llama_stack.log import get_logger from llama_stack.models.llama.datatypes import BuiltinTool, StopReason, ToolCall from llama_stack.models.llama.sku_list import all_registered_models from llama_stack.providers.datatypes import ( @@ -84,7 +86,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import ( from .config import VLLMInferenceAdapterConfig -log = logging.getLogger(__name__) +log = get_logger(name=__name__, category="inference") def build_hf_repo_model_entries(): @@ -288,16 +290,76 @@ async def _process_vllm_chat_completion_stream_response( class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): + # automatically set by the resolver when instantiating the provider + __provider_id__: str + model_store: ModelStore | None = None + _refresh_task: asyncio.Task | None = None + def __init__(self, config: VLLMInferenceAdapterConfig) -> None: self.register_helper = ModelRegistryHelper(build_hf_repo_model_entries()) self.config = config self.client = None async def initialize(self) -> None: - pass + if not self.config.url: + # intentionally don't raise an error here, we want to allow the provider to be "dormant" + # or available in distributions like "starter" without causing a ruckus + return + + if self.config.refresh_models: + self._refresh_task = asyncio.create_task(self._refresh_models()) + + def cb(task): + import traceback + + if task.cancelled(): + log.error(f"vLLM background refresh task canceled:\n{''.join(traceback.format_stack())}") + elif task.exception(): + # print the stack trace for the exception + exc = task.exception() + log.error(f"vLLM background refresh task died: {exc}") + traceback.print_exception(exc) + else: + log.error("vLLM background refresh task completed unexpectedly") + + self._refresh_task.add_done_callback(cb) + + async def _refresh_models(self) -> None: + provider_id = self.__provider_id__ + waited_time = 0 + while not self.model_store and waited_time < 60: + await asyncio.sleep(1) + waited_time += 1 + + if not self.model_store: + raise ValueError("Model store not set after waiting 60 seconds") + + self._lazy_initialize_client() + assert self.client is not None # mypy + while True: + try: + models 
= [] + async for m in self.client.models.list(): + model_type = ModelType.llm # unclear how to determine embedding vs. llm models + models.append( + Model( + identifier=m.id, + provider_resource_id=m.id, + provider_id=provider_id, + metadata={}, + model_type=model_type, + ) + ) + await self.model_store.update_registered_llm_models(provider_id, models) + log.debug(f"vLLM refreshed model list ({len(models)} models)") + except Exception as e: + log.error(f"vLLM background refresh task failed: {e}") + await asyncio.sleep(self.config.refresh_models_interval) async def shutdown(self) -> None: - pass + if self._refresh_task: + self._refresh_task.cancel() + self._refresh_task = None async def unregister_model(self, model_id: str) -> None: pass @@ -312,6 +374,9 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): HealthResponse: A dictionary containing the health status. """ try: + if not self.config.url: + return HealthResponse(status=HealthStatus.ERROR, message="vLLM URL is not set") + client = self._create_client() if self.client is None else self.client _ = [m async for m in client.models.list()] # Ensure the client is initialized return HealthResponse(status=HealthStatus.OK) @@ -327,6 +392,11 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): if self.client is not None: return + if not self.config.url: + raise ValueError( + "You must provide a vLLM URL in the run.yaml file (or set the VLLM_URL environment variable)" + ) + log.info(f"Initializing vLLM client with base_url={self.config.url}") self.client = self._create_client() diff --git a/llama_stack/templates/vllm-gpu/__init__.py b/llama_stack/templates/ci-tests/__init__.py similarity index 76% rename from llama_stack/templates/vllm-gpu/__init__.py rename to llama_stack/templates/ci-tests/__init__.py index 7b3d59a01..b309587f5 100644 --- a/llama_stack/templates/vllm-gpu/__init__.py +++ b/llama_stack/templates/ci-tests/__init__.py @@ -4,4 +4,4 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-from .vllm import get_distribution_template # noqa: F401 +from .ci_tests import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/ci-tests/build.yaml b/llama_stack/templates/ci-tests/build.yaml new file mode 100644 index 000000000..0aed1d185 --- /dev/null +++ b/llama_stack/templates/ci-tests/build.yaml @@ -0,0 +1,65 @@ +version: 2 +distribution_spec: + description: CI tests for Llama Stack + providers: + inference: + - remote::cerebras + - remote::ollama + - remote::vllm + - remote::tgi + - remote::hf::serverless + - remote::hf::endpoint + - remote::fireworks + - remote::together + - remote::bedrock + - remote::databricks + - remote::nvidia + - remote::runpod + - remote::openai + - remote::anthropic + - remote::gemini + - remote::groq + - remote::fireworks-openai-compat + - remote::llama-openai-compat + - remote::together-openai-compat + - remote::groq-openai-compat + - remote::sambanova-openai-compat + - remote::cerebras-openai-compat + - remote::sambanova + - remote::passthrough + - inline::sentence-transformers + vector_io: + - inline::faiss + - inline::sqlite-vec + - inline::milvus + - remote::chromadb + - remote::pgvector + files: + - inline::localfs + safety: + - inline::llama-guard + agents: + - inline::meta-reference + telemetry: + - inline::meta-reference + post_training: + - inline::huggingface + eval: + - inline::meta-reference + datasetio: + - remote::huggingface + - inline::localfs + scoring: + - inline::basic + - inline::llm-as-judge + - inline::braintrust + tool_runtime: + - remote::brave-search + - remote::tavily-search + - inline::rag-runtime + - remote::model-context-protocol +image_type: conda +additional_pip_packages: +- aiosqlite +- asyncpg +- sqlalchemy[asyncio] diff --git a/llama_stack/templates/ci-tests/ci_tests.py b/llama_stack/templates/ci-tests/ci_tests.py new file mode 100644 index 000000000..49cb36e39 --- /dev/null +++ b/llama_stack/templates/ci-tests/ci_tests.py @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ + +from llama_stack.templates.template import DistributionTemplate + +from ..starter.starter import get_distribution_template as get_starter_distribution_template + + +def get_distribution_template() -> DistributionTemplate: + template = get_starter_distribution_template() + name = "ci-tests" + template.name = name + template.description = "CI tests for Llama Stack" + + return template diff --git a/llama_stack/templates/ci-tests/run.yaml b/llama_stack/templates/ci-tests/run.yaml new file mode 100644 index 000000000..cc7378c97 --- /dev/null +++ b/llama_stack/templates/ci-tests/run.yaml @@ -0,0 +1,1189 @@ +version: 2 +image_name: ci-tests +apis: +- agents +- datasetio +- eval +- files +- inference +- post_training +- safety +- scoring +- telemetry +- tool_runtime +- vector_io +providers: + inference: + - provider_id: ${env.ENABLE_CEREBRAS:=__disabled__} + provider_type: remote::cerebras + config: + base_url: https://api.cerebras.ai + api_key: ${env.CEREBRAS_API_KEY} + - provider_id: ${env.ENABLE_OLLAMA:=__disabled__} + provider_type: remote::ollama + config: + url: ${env.OLLAMA_URL:=http://localhost:11434} + - provider_id: ${env.ENABLE_VLLM:=__disabled__} + provider_type: remote::vllm + config: + url: ${env.VLLM_URL:=} + max_tokens: ${env.VLLM_MAX_TOKENS:=4096} + api_token: ${env.VLLM_API_TOKEN:=fake} + tls_verify: ${env.VLLM_TLS_VERIFY:=true} + - provider_id: ${env.ENABLE_TGI:=__disabled__} + provider_type: remote::tgi + config: + url: ${env.TGI_URL} + - provider_id: ${env.ENABLE_HF_SERVERLESS:=__disabled__} + provider_type: remote::hf::serverless + config: + huggingface_repo: ${env.INFERENCE_MODEL} + api_token: ${env.HF_API_TOKEN} + - provider_id: ${env.ENABLE_HF_ENDPOINT:=__disabled__} + provider_type: remote::hf::endpoint + config: + endpoint_name: ${env.INFERENCE_ENDPOINT_NAME} + api_token: ${env.HF_API_TOKEN} + - provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} + provider_type: remote::fireworks + config: + url: https://api.fireworks.ai/inference/v1 + api_key: ${env.FIREWORKS_API_KEY} + - provider_id: ${env.ENABLE_TOGETHER:=__disabled__} + provider_type: remote::together + config: + url: https://api.together.xyz/v1 + api_key: ${env.TOGETHER_API_KEY} + - provider_id: ${env.ENABLE_BEDROCK:=__disabled__} + provider_type: remote::bedrock + config: {} + - provider_id: ${env.ENABLE_DATABRICKS:=__disabled__} + provider_type: remote::databricks + config: + url: ${env.DATABRICKS_URL} + api_token: ${env.DATABRICKS_API_TOKEN} + - provider_id: ${env.ENABLE_NVIDIA:=__disabled__} + provider_type: remote::nvidia + config: + url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com} + api_key: ${env.NVIDIA_API_KEY:=} + append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True} + - provider_id: ${env.ENABLE_RUNPOD:=__disabled__} + provider_type: remote::runpod + config: + url: ${env.RUNPOD_URL:=} + api_token: ${env.RUNPOD_API_TOKEN} + - provider_id: ${env.ENABLE_OPENAI:=__disabled__} + provider_type: remote::openai + config: + api_key: ${env.OPENAI_API_KEY} + - provider_id: ${env.ENABLE_ANTHROPIC:=__disabled__} + provider_type: remote::anthropic + config: + api_key: ${env.ANTHROPIC_API_KEY} + - provider_id: ${env.ENABLE_GEMINI:=__disabled__} + provider_type: remote::gemini + config: + api_key: ${env.GEMINI_API_KEY} + - provider_id: ${env.ENABLE_GROQ:=__disabled__} + provider_type: remote::groq + config: + url: https://api.groq.com + api_key: ${env.GROQ_API_KEY} + - provider_id: ${env.ENABLE_FIREWORKS_OPENAI_COMPAT:=__disabled__} + provider_type: remote::fireworks-openai-compat + config: + 
openai_compat_api_base: https://api.fireworks.ai/inference/v1 + api_key: ${env.FIREWORKS_API_KEY} + - provider_id: ${env.ENABLE_LLAMA_OPENAI_COMPAT:=__disabled__} + provider_type: remote::llama-openai-compat + config: + openai_compat_api_base: https://api.llama.com/compat/v1/ + api_key: ${env.LLAMA_API_KEY} + - provider_id: ${env.ENABLE_TOGETHER_OPENAI_COMPAT:=__disabled__} + provider_type: remote::together-openai-compat + config: + openai_compat_api_base: https://api.together.xyz/v1 + api_key: ${env.TOGETHER_API_KEY} + - provider_id: ${env.ENABLE_GROQ_OPENAI_COMPAT:=__disabled__} + provider_type: remote::groq-openai-compat + config: + openai_compat_api_base: https://api.groq.com/openai/v1 + api_key: ${env.GROQ_API_KEY} + - provider_id: ${env.ENABLE_SAMBANOVA_OPENAI_COMPAT:=__disabled__} + provider_type: remote::sambanova-openai-compat + config: + openai_compat_api_base: https://api.sambanova.ai/v1 + api_key: ${env.SAMBANOVA_API_KEY} + - provider_id: ${env.ENABLE_CEREBRAS_OPENAI_COMPAT:=__disabled__} + provider_type: remote::cerebras-openai-compat + config: + openai_compat_api_base: https://api.cerebras.ai/v1 + api_key: ${env.CEREBRAS_API_KEY} + - provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} + provider_type: remote::sambanova + config: + url: https://api.sambanova.ai/v1 + api_key: ${env.SAMBANOVA_API_KEY} + - provider_id: ${env.ENABLE_PASSTHROUGH:=__disabled__} + provider_type: remote::passthrough + config: + url: ${env.PASSTHROUGH_URL} + api_key: ${env.PASSTHROUGH_API_KEY} + - provider_id: ${env.ENABLE_SENTENCE_TRANSFORMERS:=sentence-transformers} + provider_type: inline::sentence-transformers + config: {} + vector_io: + - provider_id: ${env.ENABLE_FAISS:=faiss} + provider_type: inline::faiss + config: + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/faiss_store.db + - provider_id: ${env.ENABLE_SQLITE_VEC:=__disabled__} + provider_type: inline::sqlite-vec + config: + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec.db + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec_registry.db + - provider_id: ${env.ENABLE_MILVUS:=__disabled__} + provider_type: inline::milvus + config: + db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter}/milvus.db + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/milvus_registry.db + - provider_id: ${env.ENABLE_CHROMADB:=__disabled__} + provider_type: remote::chromadb + config: + url: ${env.CHROMADB_URL:=} + - provider_id: ${env.ENABLE_PGVECTOR:=__disabled__} + provider_type: remote::pgvector + config: + host: ${env.PGVECTOR_HOST:=localhost} + port: ${env.PGVECTOR_PORT:=5432} + db: ${env.PGVECTOR_DB:=} + user: ${env.PGVECTOR_USER:=} + password: ${env.PGVECTOR_PASSWORD:=} + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/pgvector_registry.db + files: + - provider_id: meta-reference-files + provider_type: inline::localfs + config: + storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files} + metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: + excluded_categories: [] + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/agents_store.db + 
responses_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/responses_store.db + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" + sinks: ${env.TELEMETRY_SINKS:=console,sqlite} + sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/trace_store.db + otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=} + post_training: + - provider_id: huggingface + provider_type: inline::huggingface + config: + checkpoint_format: huggingface + distributed_backend: null + device: cpu + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/meta_reference_eval.db + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/huggingface_datasetio.db + - provider_id: localfs + provider_type: inline::localfs + config: + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/localfs_datasetio.db + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: + openai_api_key: ${env.OPENAI_API_KEY:=} + tool_runtime: + - provider_id: brave-search + provider_type: remote::brave-search + config: + api_key: ${env.BRAVE_SEARCH_API_KEY:=} + max_results: 3 + - provider_id: tavily-search + provider_type: remote::tavily-search + config: + api_key: ${env.TAVILY_SEARCH_API_KEY:=} + max_results: 3 + - provider_id: rag-runtime + provider_type: inline::rag-runtime + config: {} + - provider_id: model-context-protocol + provider_type: remote::model-context-protocol + config: {} +metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/registry.db +inference_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/inference_store.db +models: +- metadata: + embedding_dimension: 384 + model_id: all-MiniLM-L6-v2 + provider_id: ${env.ENABLE_SENTENCE_TRANSFORMERS:=sentence-transformers} + model_type: embedding +- metadata: {} + model_id: ${env.ENABLE_CEREBRAS:=__disabled__}/llama3.1-8b + provider_id: ${env.ENABLE_CEREBRAS:=__disabled__} + provider_model_id: llama3.1-8b + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_CEREBRAS:=__disabled__}/meta-llama/Llama-3.1-8B-Instruct + provider_id: ${env.ENABLE_CEREBRAS:=__disabled__} + provider_model_id: llama3.1-8b + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_CEREBRAS:=__disabled__}/llama-3.3-70b + provider_id: ${env.ENABLE_CEREBRAS:=__disabled__} + provider_model_id: llama-3.3-70b + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_CEREBRAS:=__disabled__}/meta-llama/Llama-3.3-70B-Instruct + provider_id: ${env.ENABLE_CEREBRAS:=__disabled__} + provider_model_id: llama-3.3-70b + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_CEREBRAS:=__disabled__}/llama-4-scout-17b-16e-instruct + provider_id: ${env.ENABLE_CEREBRAS:=__disabled__} + provider_model_id: llama-4-scout-17b-16e-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_CEREBRAS:=__disabled__}/meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: ${env.ENABLE_CEREBRAS:=__disabled__} + 
provider_model_id: llama-4-scout-17b-16e-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_OLLAMA:=__disabled__}/${env.OLLAMA_INFERENCE_MODEL:=__disabled__} + provider_id: ${env.ENABLE_OLLAMA:=__disabled__} + provider_model_id: ${env.OLLAMA_INFERENCE_MODEL:=__disabled__} + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_OLLAMA:=__disabled__}/${env.SAFETY_MODEL:=__disabled__} + provider_id: ${env.ENABLE_OLLAMA:=__disabled__} + provider_model_id: ${env.SAFETY_MODEL:=__disabled__} + model_type: llm +- metadata: + embedding_dimension: ${env.OLLAMA_EMBEDDING_DIMENSION:=384} + model_id: ${env.ENABLE_OLLAMA:=__disabled__}/${env.OLLAMA_EMBEDDING_MODEL:=__disabled__} + provider_id: ${env.ENABLE_OLLAMA:=__disabled__} + provider_model_id: ${env.OLLAMA_EMBEDDING_MODEL:=__disabled__} + model_type: embedding +- metadata: {} + model_id: ${env.ENABLE_VLLM:=__disabled__}/${env.VLLM_INFERENCE_MODEL:=__disabled__} + provider_id: ${env.ENABLE_VLLM:=__disabled__} + provider_model_id: ${env.VLLM_INFERENCE_MODEL:=__disabled__} + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/accounts/fireworks/models/llama-v3p1-8b-instruct + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} + provider_model_id: accounts/fireworks/models/llama-v3p1-8b-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/meta-llama/Llama-3.1-8B-Instruct + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} + provider_model_id: accounts/fireworks/models/llama-v3p1-8b-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/accounts/fireworks/models/llama-v3p1-70b-instruct + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} + provider_model_id: accounts/fireworks/models/llama-v3p1-70b-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/meta-llama/Llama-3.1-70B-Instruct + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} + provider_model_id: accounts/fireworks/models/llama-v3p1-70b-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/accounts/fireworks/models/llama-v3p1-405b-instruct + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} + provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/meta-llama/Llama-3.1-405B-Instruct-FP8 + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} + provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/accounts/fireworks/models/llama-v3p2-3b-instruct + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} + provider_model_id: accounts/fireworks/models/llama-v3p2-3b-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/meta-llama/Llama-3.2-3B-Instruct + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} + provider_model_id: accounts/fireworks/models/llama-v3p2-3b-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/accounts/fireworks/models/llama-v3p2-11b-vision-instruct + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} + provider_model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/meta-llama/Llama-3.2-11B-Vision-Instruct + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} + provider_model_id: 
accounts/fireworks/models/llama-v3p2-11b-vision-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/accounts/fireworks/models/llama-v3p2-90b-vision-instruct + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} + provider_model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/meta-llama/Llama-3.2-90B-Vision-Instruct + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} + provider_model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/accounts/fireworks/models/llama-v3p3-70b-instruct + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} + provider_model_id: accounts/fireworks/models/llama-v3p3-70b-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/meta-llama/Llama-3.3-70B-Instruct + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} + provider_model_id: accounts/fireworks/models/llama-v3p3-70b-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/accounts/fireworks/models/llama4-scout-instruct-basic + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} + provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} + provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/accounts/fireworks/models/llama4-maverick-instruct-basic + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} + provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/meta-llama/Llama-4-Maverick-17B-128E-Instruct + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} + provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic + model_type: llm +- metadata: + embedding_dimension: 768 + context_length: 8192 + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/nomic-ai/nomic-embed-text-v1.5 + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} + provider_model_id: nomic-ai/nomic-embed-text-v1.5 + model_type: embedding +- metadata: {} + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/accounts/fireworks/models/llama-guard-3-8b + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} + provider_model_id: accounts/fireworks/models/llama-guard-3-8b + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/meta-llama/Llama-Guard-3-8B + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} + provider_model_id: accounts/fireworks/models/llama-guard-3-8b + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/accounts/fireworks/models/llama-guard-3-11b-vision + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} + provider_model_id: accounts/fireworks/models/llama-guard-3-11b-vision + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/meta-llama/Llama-Guard-3-11B-Vision + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} + provider_model_id: accounts/fireworks/models/llama-guard-3-11b-vision + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo + provider_id: 
${env.ENABLE_TOGETHER:=__disabled__} + provider_model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-3.1-8B-Instruct + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} + provider_model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} + provider_model_id: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-3.1-70B-Instruct + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} + provider_model_id: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} + provider_model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-3.1-405B-Instruct-FP8 + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} + provider_model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-3.2-3B-Instruct-Turbo + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} + provider_model_id: meta-llama/Llama-3.2-3B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-3.2-3B-Instruct + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} + provider_model_id: meta-llama/Llama-3.2-3B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} + provider_model_id: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-3.2-11B-Vision-Instruct + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} + provider_model_id: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} + provider_model_id: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-3.2-90B-Vision-Instruct + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} + provider_model_id: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-3.3-70B-Instruct-Turbo + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} + provider_model_id: meta-llama/Llama-3.3-70B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-3.3-70B-Instruct + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} + provider_model_id: meta-llama/Llama-3.3-70B-Instruct-Turbo + model_type: llm +- metadata: + embedding_dimension: 768 + context_length: 8192 + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/togethercomputer/m2-bert-80M-8k-retrieval + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} + provider_model_id: togethercomputer/m2-bert-80M-8k-retrieval + model_type: embedding +- metadata: + 
embedding_dimension: 768 + context_length: 32768 + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/togethercomputer/m2-bert-80M-32k-retrieval + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} + provider_model_id: togethercomputer/m2-bert-80M-32k-retrieval + model_type: embedding +- metadata: {} + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} + provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} + provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/together/meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} + provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} + provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-4-Maverick-17B-128E-Instruct + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} + provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} + provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-Guard-3-8B + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} + provider_model_id: meta-llama/Llama-Guard-3-8B + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-Guard-3-8B + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} + provider_model_id: meta-llama/Llama-Guard-3-8B + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-Guard-3-11B-Vision-Turbo + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} + provider_model_id: meta-llama/Llama-Guard-3-11B-Vision-Turbo + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-Guard-3-11B-Vision + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} + provider_model_id: meta-llama/Llama-Guard-3-11B-Vision-Turbo + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_BEDROCK:=__disabled__}/meta.llama3-1-8b-instruct-v1:0 + provider_id: ${env.ENABLE_BEDROCK:=__disabled__} + provider_model_id: meta.llama3-1-8b-instruct-v1:0 + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_BEDROCK:=__disabled__}/meta-llama/Llama-3.1-8B-Instruct + provider_id: ${env.ENABLE_BEDROCK:=__disabled__} + provider_model_id: meta.llama3-1-8b-instruct-v1:0 + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_BEDROCK:=__disabled__}/meta.llama3-1-70b-instruct-v1:0 + provider_id: ${env.ENABLE_BEDROCK:=__disabled__} + provider_model_id: meta.llama3-1-70b-instruct-v1:0 + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_BEDROCK:=__disabled__}/meta-llama/Llama-3.1-70B-Instruct + provider_id: ${env.ENABLE_BEDROCK:=__disabled__} + provider_model_id: meta.llama3-1-70b-instruct-v1:0 + model_type: llm 
+- metadata: {} + model_id: ${env.ENABLE_BEDROCK:=__disabled__}/meta.llama3-1-405b-instruct-v1:0 + provider_id: ${env.ENABLE_BEDROCK:=__disabled__} + provider_model_id: meta.llama3-1-405b-instruct-v1:0 + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_BEDROCK:=__disabled__}/meta-llama/Llama-3.1-405B-Instruct-FP8 + provider_id: ${env.ENABLE_BEDROCK:=__disabled__} + provider_model_id: meta.llama3-1-405b-instruct-v1:0 + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_DATABRICKS:=__disabled__}/databricks-meta-llama-3-1-70b-instruct + provider_id: ${env.ENABLE_DATABRICKS:=__disabled__} + provider_model_id: databricks-meta-llama-3-1-70b-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_DATABRICKS:=__disabled__}/meta-llama/Llama-3.1-70B-Instruct + provider_id: ${env.ENABLE_DATABRICKS:=__disabled__} + provider_model_id: databricks-meta-llama-3-1-70b-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_DATABRICKS:=__disabled__}/databricks-meta-llama-3-1-405b-instruct + provider_id: ${env.ENABLE_DATABRICKS:=__disabled__} + provider_model_id: databricks-meta-llama-3-1-405b-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_DATABRICKS:=__disabled__}/meta-llama/Llama-3.1-405B-Instruct-FP8 + provider_id: ${env.ENABLE_DATABRICKS:=__disabled__} + provider_model_id: databricks-meta-llama-3-1-405b-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_NVIDIA:=__disabled__}/meta/llama3-8b-instruct + provider_id: ${env.ENABLE_NVIDIA:=__disabled__} + provider_model_id: meta/llama3-8b-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_NVIDIA:=__disabled__}/meta-llama/Llama-3-8B-Instruct + provider_id: ${env.ENABLE_NVIDIA:=__disabled__} + provider_model_id: meta/llama3-8b-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_NVIDIA:=__disabled__}/meta/llama3-70b-instruct + provider_id: ${env.ENABLE_NVIDIA:=__disabled__} + provider_model_id: meta/llama3-70b-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_NVIDIA:=__disabled__}/meta-llama/Llama-3-70B-Instruct + provider_id: ${env.ENABLE_NVIDIA:=__disabled__} + provider_model_id: meta/llama3-70b-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_NVIDIA:=__disabled__}/meta/llama-3.1-8b-instruct + provider_id: ${env.ENABLE_NVIDIA:=__disabled__} + provider_model_id: meta/llama-3.1-8b-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_NVIDIA:=__disabled__}/meta-llama/Llama-3.1-8B-Instruct + provider_id: ${env.ENABLE_NVIDIA:=__disabled__} + provider_model_id: meta/llama-3.1-8b-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_NVIDIA:=__disabled__}/meta/llama-3.1-70b-instruct + provider_id: ${env.ENABLE_NVIDIA:=__disabled__} + provider_model_id: meta/llama-3.1-70b-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_NVIDIA:=__disabled__}/meta-llama/Llama-3.1-70B-Instruct + provider_id: ${env.ENABLE_NVIDIA:=__disabled__} + provider_model_id: meta/llama-3.1-70b-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_NVIDIA:=__disabled__}/meta/llama-3.1-405b-instruct + provider_id: ${env.ENABLE_NVIDIA:=__disabled__} + provider_model_id: meta/llama-3.1-405b-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_NVIDIA:=__disabled__}/meta-llama/Llama-3.1-405B-Instruct-FP8 + provider_id: ${env.ENABLE_NVIDIA:=__disabled__} + provider_model_id: meta/llama-3.1-405b-instruct + model_type: llm +- metadata: {} + model_id: 
${env.ENABLE_NVIDIA:=__disabled__}/meta/llama-3.2-1b-instruct + provider_id: ${env.ENABLE_NVIDIA:=__disabled__} + provider_model_id: meta/llama-3.2-1b-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_NVIDIA:=__disabled__}/meta-llama/Llama-3.2-1B-Instruct + provider_id: ${env.ENABLE_NVIDIA:=__disabled__} + provider_model_id: meta/llama-3.2-1b-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_NVIDIA:=__disabled__}/meta/llama-3.2-3b-instruct + provider_id: ${env.ENABLE_NVIDIA:=__disabled__} + provider_model_id: meta/llama-3.2-3b-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_NVIDIA:=__disabled__}/meta-llama/Llama-3.2-3B-Instruct + provider_id: ${env.ENABLE_NVIDIA:=__disabled__} + provider_model_id: meta/llama-3.2-3b-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_NVIDIA:=__disabled__}/meta/llama-3.2-11b-vision-instruct + provider_id: ${env.ENABLE_NVIDIA:=__disabled__} + provider_model_id: meta/llama-3.2-11b-vision-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_NVIDIA:=__disabled__}/meta-llama/Llama-3.2-11B-Vision-Instruct + provider_id: ${env.ENABLE_NVIDIA:=__disabled__} + provider_model_id: meta/llama-3.2-11b-vision-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_NVIDIA:=__disabled__}/meta/llama-3.2-90b-vision-instruct + provider_id: ${env.ENABLE_NVIDIA:=__disabled__} + provider_model_id: meta/llama-3.2-90b-vision-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_NVIDIA:=__disabled__}/meta-llama/Llama-3.2-90B-Vision-Instruct + provider_id: ${env.ENABLE_NVIDIA:=__disabled__} + provider_model_id: meta/llama-3.2-90b-vision-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_NVIDIA:=__disabled__}/meta/llama-3.3-70b-instruct + provider_id: ${env.ENABLE_NVIDIA:=__disabled__} + provider_model_id: meta/llama-3.3-70b-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_NVIDIA:=__disabled__}/meta-llama/Llama-3.3-70B-Instruct + provider_id: ${env.ENABLE_NVIDIA:=__disabled__} + provider_model_id: meta/llama-3.3-70b-instruct + model_type: llm +- metadata: + embedding_dimension: 2048 + context_length: 8192 + model_id: ${env.ENABLE_NVIDIA:=__disabled__}/nvidia/llama-3.2-nv-embedqa-1b-v2 + provider_id: ${env.ENABLE_NVIDIA:=__disabled__} + provider_model_id: nvidia/llama-3.2-nv-embedqa-1b-v2 + model_type: embedding +- metadata: + embedding_dimension: 1024 + context_length: 512 + model_id: ${env.ENABLE_NVIDIA:=__disabled__}/nvidia/nv-embedqa-e5-v5 + provider_id: ${env.ENABLE_NVIDIA:=__disabled__} + provider_model_id: nvidia/nv-embedqa-e5-v5 + model_type: embedding +- metadata: + embedding_dimension: 4096 + context_length: 512 + model_id: ${env.ENABLE_NVIDIA:=__disabled__}/nvidia/nv-embedqa-mistral-7b-v2 + provider_id: ${env.ENABLE_NVIDIA:=__disabled__} + provider_model_id: nvidia/nv-embedqa-mistral-7b-v2 + model_type: embedding +- metadata: + embedding_dimension: 1024 + context_length: 512 + model_id: ${env.ENABLE_NVIDIA:=__disabled__}/snowflake/arctic-embed-l + provider_id: ${env.ENABLE_NVIDIA:=__disabled__} + provider_model_id: snowflake/arctic-embed-l + model_type: embedding +- metadata: {} + model_id: ${env.ENABLE_RUNPOD:=__disabled__}/Llama3.1-8B + provider_id: ${env.ENABLE_RUNPOD:=__disabled__} + provider_model_id: Llama3.1-8B + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_RUNPOD:=__disabled__}/Llama3.1-70B + provider_id: ${env.ENABLE_RUNPOD:=__disabled__} + provider_model_id: Llama3.1-70B + model_type: llm +- metadata: {} + 
model_id: ${env.ENABLE_RUNPOD:=__disabled__}/Llama3.1-405B:bf16-mp8 + provider_id: ${env.ENABLE_RUNPOD:=__disabled__} + provider_model_id: Llama3.1-405B:bf16-mp8 + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_RUNPOD:=__disabled__}/Llama3.1-405B + provider_id: ${env.ENABLE_RUNPOD:=__disabled__} + provider_model_id: Llama3.1-405B + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_RUNPOD:=__disabled__}/Llama3.1-405B:bf16-mp16 + provider_id: ${env.ENABLE_RUNPOD:=__disabled__} + provider_model_id: Llama3.1-405B:bf16-mp16 + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_RUNPOD:=__disabled__}/Llama3.1-8B-Instruct + provider_id: ${env.ENABLE_RUNPOD:=__disabled__} + provider_model_id: Llama3.1-8B-Instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_RUNPOD:=__disabled__}/Llama3.1-70B-Instruct + provider_id: ${env.ENABLE_RUNPOD:=__disabled__} + provider_model_id: Llama3.1-70B-Instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_RUNPOD:=__disabled__}/Llama3.1-405B-Instruct:bf16-mp8 + provider_id: ${env.ENABLE_RUNPOD:=__disabled__} + provider_model_id: Llama3.1-405B-Instruct:bf16-mp8 + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_RUNPOD:=__disabled__}/Llama3.1-405B-Instruct + provider_id: ${env.ENABLE_RUNPOD:=__disabled__} + provider_model_id: Llama3.1-405B-Instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_RUNPOD:=__disabled__}/Llama3.1-405B-Instruct:bf16-mp16 + provider_id: ${env.ENABLE_RUNPOD:=__disabled__} + provider_model_id: Llama3.1-405B-Instruct:bf16-mp16 + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_RUNPOD:=__disabled__}/Llama3.2-1B + provider_id: ${env.ENABLE_RUNPOD:=__disabled__} + provider_model_id: Llama3.2-1B + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_RUNPOD:=__disabled__}/Llama3.2-3B + provider_id: ${env.ENABLE_RUNPOD:=__disabled__} + provider_model_id: Llama3.2-3B + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_OPENAI:=__disabled__}/openai/gpt-4o + provider_id: ${env.ENABLE_OPENAI:=__disabled__} + provider_model_id: openai/gpt-4o + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_OPENAI:=__disabled__}/openai/gpt-4o-mini + provider_id: ${env.ENABLE_OPENAI:=__disabled__} + provider_model_id: openai/gpt-4o-mini + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_OPENAI:=__disabled__}/openai/chatgpt-4o-latest + provider_id: ${env.ENABLE_OPENAI:=__disabled__} + provider_model_id: openai/chatgpt-4o-latest + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_OPENAI:=__disabled__}/gpt-3.5-turbo-0125 + provider_id: ${env.ENABLE_OPENAI:=__disabled__} + provider_model_id: gpt-3.5-turbo-0125 + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_OPENAI:=__disabled__}/gpt-3.5-turbo + provider_id: ${env.ENABLE_OPENAI:=__disabled__} + provider_model_id: gpt-3.5-turbo + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_OPENAI:=__disabled__}/gpt-3.5-turbo-instruct + provider_id: ${env.ENABLE_OPENAI:=__disabled__} + provider_model_id: gpt-3.5-turbo-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_OPENAI:=__disabled__}/gpt-4 + provider_id: ${env.ENABLE_OPENAI:=__disabled__} + provider_model_id: gpt-4 + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_OPENAI:=__disabled__}/gpt-4-turbo + provider_id: ${env.ENABLE_OPENAI:=__disabled__} + provider_model_id: gpt-4-turbo + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_OPENAI:=__disabled__}/gpt-4o + provider_id: ${env.ENABLE_OPENAI:=__disabled__} + provider_model_id: 
gpt-4o + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_OPENAI:=__disabled__}/gpt-4o-2024-08-06 + provider_id: ${env.ENABLE_OPENAI:=__disabled__} + provider_model_id: gpt-4o-2024-08-06 + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_OPENAI:=__disabled__}/gpt-4o-mini + provider_id: ${env.ENABLE_OPENAI:=__disabled__} + provider_model_id: gpt-4o-mini + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_OPENAI:=__disabled__}/gpt-4o-audio-preview + provider_id: ${env.ENABLE_OPENAI:=__disabled__} + provider_model_id: gpt-4o-audio-preview + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_OPENAI:=__disabled__}/chatgpt-4o-latest + provider_id: ${env.ENABLE_OPENAI:=__disabled__} + provider_model_id: chatgpt-4o-latest + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_OPENAI:=__disabled__}/o1 + provider_id: ${env.ENABLE_OPENAI:=__disabled__} + provider_model_id: o1 + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_OPENAI:=__disabled__}/o1-mini + provider_id: ${env.ENABLE_OPENAI:=__disabled__} + provider_model_id: o1-mini + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_OPENAI:=__disabled__}/o3-mini + provider_id: ${env.ENABLE_OPENAI:=__disabled__} + provider_model_id: o3-mini + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_OPENAI:=__disabled__}/o4-mini + provider_id: ${env.ENABLE_OPENAI:=__disabled__} + provider_model_id: o4-mini + model_type: llm +- metadata: + embedding_dimension: 1536 + context_length: 8192 + model_id: ${env.ENABLE_OPENAI:=__disabled__}/openai/text-embedding-3-small + provider_id: ${env.ENABLE_OPENAI:=__disabled__} + provider_model_id: openai/text-embedding-3-small + model_type: embedding +- metadata: + embedding_dimension: 3072 + context_length: 8192 + model_id: ${env.ENABLE_OPENAI:=__disabled__}/openai/text-embedding-3-large + provider_id: ${env.ENABLE_OPENAI:=__disabled__} + provider_model_id: openai/text-embedding-3-large + model_type: embedding +- metadata: + embedding_dimension: 1536 + context_length: 8192 + model_id: ${env.ENABLE_OPENAI:=__disabled__}/text-embedding-3-small + provider_id: ${env.ENABLE_OPENAI:=__disabled__} + provider_model_id: text-embedding-3-small + model_type: embedding +- metadata: + embedding_dimension: 3072 + context_length: 8192 + model_id: ${env.ENABLE_OPENAI:=__disabled__}/text-embedding-3-large + provider_id: ${env.ENABLE_OPENAI:=__disabled__} + provider_model_id: text-embedding-3-large + model_type: embedding +- metadata: {} + model_id: ${env.ENABLE_ANTHROPIC:=__disabled__}/anthropic/claude-3-5-sonnet-latest + provider_id: ${env.ENABLE_ANTHROPIC:=__disabled__} + provider_model_id: anthropic/claude-3-5-sonnet-latest + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_ANTHROPIC:=__disabled__}/anthropic/claude-3-7-sonnet-latest + provider_id: ${env.ENABLE_ANTHROPIC:=__disabled__} + provider_model_id: anthropic/claude-3-7-sonnet-latest + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_ANTHROPIC:=__disabled__}/anthropic/claude-3-5-haiku-latest + provider_id: ${env.ENABLE_ANTHROPIC:=__disabled__} + provider_model_id: anthropic/claude-3-5-haiku-latest + model_type: llm +- metadata: + embedding_dimension: 1024 + context_length: 32000 + model_id: ${env.ENABLE_ANTHROPIC:=__disabled__}/anthropic/voyage-3 + provider_id: ${env.ENABLE_ANTHROPIC:=__disabled__} + provider_model_id: anthropic/voyage-3 + model_type: embedding +- metadata: + embedding_dimension: 512 + context_length: 32000 + model_id: ${env.ENABLE_ANTHROPIC:=__disabled__}/anthropic/voyage-3-lite + provider_id: 
${env.ENABLE_ANTHROPIC:=__disabled__} + provider_model_id: anthropic/voyage-3-lite + model_type: embedding +- metadata: + embedding_dimension: 1024 + context_length: 32000 + model_id: ${env.ENABLE_ANTHROPIC:=__disabled__}/anthropic/voyage-code-3 + provider_id: ${env.ENABLE_ANTHROPIC:=__disabled__} + provider_model_id: anthropic/voyage-code-3 + model_type: embedding +- metadata: {} + model_id: ${env.ENABLE_GEMINI:=__disabled__}/gemini/gemini-1.5-flash + provider_id: ${env.ENABLE_GEMINI:=__disabled__} + provider_model_id: gemini/gemini-1.5-flash + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_GEMINI:=__disabled__}/gemini/gemini-1.5-pro + provider_id: ${env.ENABLE_GEMINI:=__disabled__} + provider_model_id: gemini/gemini-1.5-pro + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_GEMINI:=__disabled__}/gemini/gemini-2.0-flash + provider_id: ${env.ENABLE_GEMINI:=__disabled__} + provider_model_id: gemini/gemini-2.0-flash + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_GEMINI:=__disabled__}/gemini/gemini-2.5-flash + provider_id: ${env.ENABLE_GEMINI:=__disabled__} + provider_model_id: gemini/gemini-2.5-flash + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_GEMINI:=__disabled__}/gemini/gemini-2.5-pro + provider_id: ${env.ENABLE_GEMINI:=__disabled__} + provider_model_id: gemini/gemini-2.5-pro + model_type: llm +- metadata: + embedding_dimension: 768 + context_length: 2048 + model_id: ${env.ENABLE_GEMINI:=__disabled__}/gemini/text-embedding-004 + provider_id: ${env.ENABLE_GEMINI:=__disabled__} + provider_model_id: gemini/text-embedding-004 + model_type: embedding +- metadata: {} + model_id: ${env.ENABLE_GROQ:=__disabled__}/groq/llama3-8b-8192 + provider_id: ${env.ENABLE_GROQ:=__disabled__} + provider_model_id: groq/llama3-8b-8192 + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_GROQ:=__disabled__}/meta-llama/Llama-3.1-8B-Instruct + provider_id: ${env.ENABLE_GROQ:=__disabled__} + provider_model_id: groq/llama3-8b-8192 + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_GROQ:=__disabled__}/groq/llama-3.1-8b-instant + provider_id: ${env.ENABLE_GROQ:=__disabled__} + provider_model_id: groq/llama-3.1-8b-instant + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_GROQ:=__disabled__}/groq/llama3-70b-8192 + provider_id: ${env.ENABLE_GROQ:=__disabled__} + provider_model_id: groq/llama3-70b-8192 + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_GROQ:=__disabled__}/meta-llama/Llama-3-70B-Instruct + provider_id: ${env.ENABLE_GROQ:=__disabled__} + provider_model_id: groq/llama3-70b-8192 + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_GROQ:=__disabled__}/groq/llama-3.3-70b-versatile + provider_id: ${env.ENABLE_GROQ:=__disabled__} + provider_model_id: groq/llama-3.3-70b-versatile + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_GROQ:=__disabled__}/meta-llama/Llama-3.3-70B-Instruct + provider_id: ${env.ENABLE_GROQ:=__disabled__} + provider_model_id: groq/llama-3.3-70b-versatile + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_GROQ:=__disabled__}/groq/llama-3.2-3b-preview + provider_id: ${env.ENABLE_GROQ:=__disabled__} + provider_model_id: groq/llama-3.2-3b-preview + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_GROQ:=__disabled__}/meta-llama/Llama-3.2-3B-Instruct + provider_id: ${env.ENABLE_GROQ:=__disabled__} + provider_model_id: groq/llama-3.2-3b-preview + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_GROQ:=__disabled__}/groq/llama-4-scout-17b-16e-instruct + provider_id: 
${env.ENABLE_GROQ:=__disabled__} + provider_model_id: groq/llama-4-scout-17b-16e-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_GROQ:=__disabled__}/meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: ${env.ENABLE_GROQ:=__disabled__} + provider_model_id: groq/llama-4-scout-17b-16e-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_GROQ:=__disabled__}/groq/meta-llama/llama-4-scout-17b-16e-instruct + provider_id: ${env.ENABLE_GROQ:=__disabled__} + provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_GROQ:=__disabled__}/meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: ${env.ENABLE_GROQ:=__disabled__} + provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_GROQ:=__disabled__}/groq/llama-4-maverick-17b-128e-instruct + provider_id: ${env.ENABLE_GROQ:=__disabled__} + provider_model_id: groq/llama-4-maverick-17b-128e-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_GROQ:=__disabled__}/meta-llama/Llama-4-Maverick-17B-128E-Instruct + provider_id: ${env.ENABLE_GROQ:=__disabled__} + provider_model_id: groq/llama-4-maverick-17b-128e-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_GROQ:=__disabled__}/groq/meta-llama/llama-4-maverick-17b-128e-instruct + provider_id: ${env.ENABLE_GROQ:=__disabled__} + provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_GROQ:=__disabled__}/meta-llama/Llama-4-Maverick-17B-128E-Instruct + provider_id: ${env.ENABLE_GROQ:=__disabled__} + provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/sambanova/Meta-Llama-3.1-8B-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} + provider_model_id: sambanova/Meta-Llama-3.1-8B-Instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/meta-llama/Llama-3.1-8B-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} + provider_model_id: sambanova/Meta-Llama-3.1-8B-Instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/sambanova/Meta-Llama-3.1-405B-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} + provider_model_id: sambanova/Meta-Llama-3.1-405B-Instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/meta-llama/Llama-3.1-405B-Instruct-FP8 + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} + provider_model_id: sambanova/Meta-Llama-3.1-405B-Instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/sambanova/Meta-Llama-3.2-1B-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} + provider_model_id: sambanova/Meta-Llama-3.2-1B-Instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/meta-llama/Llama-3.2-1B-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} + provider_model_id: sambanova/Meta-Llama-3.2-1B-Instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/sambanova/Meta-Llama-3.2-3B-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} + provider_model_id: sambanova/Meta-Llama-3.2-3B-Instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/meta-llama/Llama-3.2-3B-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} + 
provider_model_id: sambanova/Meta-Llama-3.2-3B-Instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/sambanova/Meta-Llama-3.3-70B-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} + provider_model_id: sambanova/Meta-Llama-3.3-70B-Instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/meta-llama/Llama-3.3-70B-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} + provider_model_id: sambanova/Meta-Llama-3.3-70B-Instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/sambanova/Llama-3.2-11B-Vision-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} + provider_model_id: sambanova/Llama-3.2-11B-Vision-Instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/meta-llama/Llama-3.2-11B-Vision-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} + provider_model_id: sambanova/Llama-3.2-11B-Vision-Instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/sambanova/Llama-3.2-90B-Vision-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} + provider_model_id: sambanova/Llama-3.2-90B-Vision-Instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/meta-llama/Llama-3.2-90B-Vision-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} + provider_model_id: sambanova/Llama-3.2-90B-Vision-Instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/sambanova/Llama-4-Scout-17B-16E-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} + provider_model_id: sambanova/Llama-4-Scout-17B-16E-Instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} + provider_model_id: sambanova/Llama-4-Scout-17B-16E-Instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/sambanova/Llama-4-Maverick-17B-128E-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} + provider_model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/meta-llama/Llama-4-Maverick-17B-128E-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} + provider_model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/sambanova/Meta-Llama-Guard-3-8B + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} + provider_model_id: sambanova/Meta-Llama-Guard-3-8B + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/meta-llama/Llama-Guard-3-8B + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} + provider_model_id: sambanova/Meta-Llama-Guard-3-8B + model_type: llm +shields: +- shield_id: ${env.SAFETY_MODEL:=__disabled__} + provider_shield_id: ${env.ENABLE_OLLAMA:=__disabled__}/${env.SAFETY_MODEL:=__disabled__} +vector_dbs: [] +datasets: [] +scoring_fns: [] +benchmarks: [] +tool_groups: +- toolgroup_id: builtin::websearch + provider_id: tavily-search +- toolgroup_id: builtin::rag + provider_id: rag-runtime +server: + port: 8321 diff --git a/llama_stack/templates/starter/run.yaml b/llama_stack/templates/starter/run.yaml index 27400348a..46573848c 100644 --- a/llama_stack/templates/starter/run.yaml +++ b/llama_stack/templates/starter/run.yaml @@ -26,7 +26,7 @@ providers: - provider_id: 
${env.ENABLE_VLLM:=__disabled__} provider_type: remote::vllm config: - url: ${env.VLLM_URL} + url: ${env.VLLM_URL:=} max_tokens: ${env.VLLM_MAX_TOKENS:=4096} api_token: ${env.VLLM_API_TOKEN:=fake} tls_verify: ${env.VLLM_TLS_VERIFY:=true} diff --git a/llama_stack/templates/vllm-gpu/build.yaml b/llama_stack/templates/vllm-gpu/build.yaml deleted file mode 100644 index 147dca50d..000000000 --- a/llama_stack/templates/vllm-gpu/build.yaml +++ /dev/null @@ -1,35 +0,0 @@ -version: 2 -distribution_spec: - description: Use a built-in vLLM engine for running LLM inference - providers: - inference: - - inline::vllm - - inline::sentence-transformers - vector_io: - - inline::faiss - - remote::chromadb - - remote::pgvector - safety: - - inline::llama-guard - agents: - - inline::meta-reference - telemetry: - - inline::meta-reference - eval: - - inline::meta-reference - datasetio: - - remote::huggingface - - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust - tool_runtime: - - remote::brave-search - - remote::tavily-search - - inline::rag-runtime - - remote::model-context-protocol -image_type: conda -additional_pip_packages: -- aiosqlite -- sqlalchemy[asyncio] diff --git a/llama_stack/templates/vllm-gpu/run.yaml b/llama_stack/templates/vllm-gpu/run.yaml deleted file mode 100644 index 4241569a4..000000000 --- a/llama_stack/templates/vllm-gpu/run.yaml +++ /dev/null @@ -1,132 +0,0 @@ -version: 2 -image_name: vllm-gpu -apis: -- agents -- datasetio -- eval -- inference -- safety -- scoring -- telemetry -- tool_runtime -- vector_io -providers: - inference: - - provider_id: vllm - provider_type: inline::vllm - config: - tensor_parallel_size: ${env.TENSOR_PARALLEL_SIZE:=1} - max_tokens: ${env.MAX_TOKENS:=4096} - max_model_len: ${env.MAX_MODEL_LEN:=4096} - max_num_seqs: ${env.MAX_NUM_SEQS:=4} - enforce_eager: ${env.ENFORCE_EAGER:=False} - gpu_memory_utilization: ${env.GPU_MEMORY_UTILIZATION:=0.3} - - provider_id: sentence-transformers - provider_type: inline::sentence-transformers - config: {} - vector_io: - - provider_id: faiss - provider_type: inline::faiss - config: - kvstore: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/vllm-gpu}/faiss_store.db - safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: - excluded_categories: [] - agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/vllm-gpu}/agents_store.db - responses_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/vllm-gpu}/responses_store.db - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" - sinks: ${env.TELEMETRY_SINKS:=console,sqlite} - sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/vllm-gpu}/trace_store.db - otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=} - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/vllm-gpu}/meta_reference_eval.db - datasetio: - - provider_id: huggingface - provider_type: remote::huggingface - config: - kvstore: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/vllm-gpu}/huggingface_datasetio.db - - provider_id: localfs - provider_type: inline::localfs - config: - kvstore: - type: sqlite - db_path: 
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:=}
-  tool_runtime:
-  - provider_id: brave-search
-    provider_type: remote::brave-search
-    config:
-      api_key: ${env.BRAVE_SEARCH_API_KEY:=}
-      max_results: 3
-  - provider_id: tavily-search
-    provider_type: remote::tavily-search
-    config:
-      api_key: ${env.TAVILY_SEARCH_API_KEY:=}
-      max_results: 3
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
-    config: {}
-  - provider_id: model-context-protocol
-    provider_type: remote::model-context-protocol
-    config: {}
-metadata_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/vllm-gpu}/registry.db
-inference_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/vllm-gpu}/inference_store.db
-models:
-- metadata: {}
-  model_id: ${env.INFERENCE_MODEL}
-  provider_id: vllm
-  model_type: llm
-- metadata:
-    embedding_dimension: 384
-  model_id: all-MiniLM-L6-v2
-  provider_id: sentence-transformers
-  model_type: embedding
-shields: []
-vector_dbs: []
-datasets: []
-scoring_fns: []
-benchmarks: []
-tool_groups:
-- toolgroup_id: builtin::websearch
-  provider_id: tavily-search
-- toolgroup_id: builtin::rag
-  provider_id: rag-runtime
-server:
-  port: 8321
diff --git a/llama_stack/templates/vllm-gpu/vllm.py b/llama_stack/templates/vllm-gpu/vllm.py
deleted file mode 100644
index 443fcd7a3..000000000
--- a/llama_stack/templates/vllm-gpu/vllm.py
+++ /dev/null
@@ -1,122 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack.apis.models import ModelType
-from llama_stack.distribution.datatypes import ModelInput, Provider
-from llama_stack.providers.inline.inference.sentence_transformers import (
-    SentenceTransformersInferenceConfig,
-)
-from llama_stack.providers.inline.inference.vllm import VLLMConfig
-from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
-from llama_stack.templates.template import (
-    DistributionTemplate,
-    RunConfigSettings,
-    ToolGroupInput,
-)
-
-
-def get_distribution_template() -> DistributionTemplate:
-    providers = {
-        "inference": ["inline::vllm", "inline::sentence-transformers"],
-        "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
-        "safety": ["inline::llama-guard"],
-        "agents": ["inline::meta-reference"],
-        "telemetry": ["inline::meta-reference"],
-        "eval": ["inline::meta-reference"],
-        "datasetio": ["remote::huggingface", "inline::localfs"],
-        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
-        "tool_runtime": [
-            "remote::brave-search",
-            "remote::tavily-search",
-            "inline::rag-runtime",
-            "remote::model-context-protocol",
-        ],
-    }
-
-    name = "vllm-gpu"
-    inference_provider = Provider(
-        provider_id="vllm",
-        provider_type="inline::vllm",
-        config=VLLMConfig.sample_run_config(),
-    )
-    vector_io_provider = Provider(
-        provider_id="faiss",
-        provider_type="inline::faiss",
-        config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
-    )
-    embedding_provider = Provider(
-        provider_id="sentence-transformers",
-        provider_type="inline::sentence-transformers",
-        config=SentenceTransformersInferenceConfig.sample_run_config(),
-    )
-
-    inference_model = ModelInput(
-        model_id="${env.INFERENCE_MODEL}",
-        provider_id="vllm",
-    )
-    embedding_model = ModelInput(
-        model_id="all-MiniLM-L6-v2",
-        provider_id="sentence-transformers",
-        model_type=ModelType.embedding,
-        metadata={
-            "embedding_dimension": 384,
-        },
-    )
-    default_tool_groups = [
-        ToolGroupInput(
-            toolgroup_id="builtin::websearch",
-            provider_id="tavily-search",
-        ),
-        ToolGroupInput(
-            toolgroup_id="builtin::rag",
-            provider_id="rag-runtime",
-        ),
-    ]
-
-    return DistributionTemplate(
-        name=name,
-        distro_type="self_hosted",
-        description="Use a built-in vLLM engine for running LLM inference",
-        container_image=None,
-        template_path=None,
-        providers=providers,
-        run_configs={
-            "run.yaml": RunConfigSettings(
-                provider_overrides={
-                    "inference": [inference_provider, embedding_provider],
-                    "vector_io": [vector_io_provider],
-                },
-                default_models=[inference_model, embedding_model],
-                default_tool_groups=default_tool_groups,
-            ),
-        },
-        run_config_env_vars={
-            "LLAMA_STACK_PORT": (
-                "8321",
-                "Port for the Llama Stack distribution server",
-            ),
-            "INFERENCE_MODEL": (
-                "meta-llama/Llama-3.2-3B-Instruct",
-                "Inference model loaded into the vLLM engine",
-            ),
-            "TENSOR_PARALLEL_SIZE": (
-                "1",
-                "Number of tensor parallel replicas (number of GPUs to use).",
-            ),
-            "MAX_TOKENS": (
-                "4096",
-                "Maximum number of tokens to generate.",
-            ),
-            "ENFORCE_EAGER": (
-                "False",
-                "Whether to use eager mode for inference (otherwise cuda graphs are used).",
-            ),
-            "GPU_MEMORY_UTILIZATION": (
-                "0.7",
-                "GPU memory utilization for the vLLM engine.",
-            ),
-        },
-    )
diff --git a/pyproject.toml b/pyproject.toml
index 15e2e10b4..30e768dcb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -257,7 +257,6 @@ exclude = [
     "^llama_stack/models/llama/llama4/",
"^llama_stack/providers/inline/inference/meta_reference/quantization/fp8_impls\\.py$", "^llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers\\.py$", - "^llama_stack/providers/inline/inference/vllm/", "^llama_stack/providers/inline/post_training/common/validator\\.py$", "^llama_stack/providers/inline/safety/code_scanner/", "^llama_stack/providers/inline/safety/llama_guard/", diff --git a/tests/integration/files/test_files.py b/tests/integration/files/test_files.py index 8547ef2f3..118a751f0 100644 --- a/tests/integration/files/test_files.py +++ b/tests/integration/files/test_files.py @@ -5,10 +5,12 @@ # the root directory of this source tree. from io import BytesIO +from unittest.mock import patch import pytest from openai import OpenAI +from llama_stack.distribution.datatypes import User from llama_stack.distribution.library_client import LlamaStackAsLibraryClient @@ -61,3 +63,218 @@ def test_openai_client_basic_operations(compat_client, client_with_models): except Exception: pass raise e + + +@patch("llama_stack.providers.utils.sqlstore.authorized_sqlstore.get_authenticated_user") +def test_files_authentication_isolation(mock_get_authenticated_user, compat_client, client_with_models): + """Test that users can only access their own files.""" + if isinstance(client_with_models, LlamaStackAsLibraryClient) and isinstance(compat_client, OpenAI): + pytest.skip("OpenAI files are not supported when testing with LlamaStackAsLibraryClient") + if not isinstance(client_with_models, LlamaStackAsLibraryClient): + pytest.skip("Authentication tests require LlamaStackAsLibraryClient (library mode)") + + client = compat_client + + # Create two test users + user1 = User("user1", {"roles": ["user"], "teams": ["team-a"]}) + user2 = User("user2", {"roles": ["user"], "teams": ["team-b"]}) + + # User 1 uploads a file + mock_get_authenticated_user.return_value = user1 + test_content_1 = b"User 1's private file content" + + with BytesIO(test_content_1) as file_buffer: + file_buffer.name = "user1_file.txt" + user1_file = client.files.create(file=file_buffer, purpose="assistants") + + # User 2 uploads a file + mock_get_authenticated_user.return_value = user2 + test_content_2 = b"User 2's private file content" + + with BytesIO(test_content_2) as file_buffer: + file_buffer.name = "user2_file.txt" + user2_file = client.files.create(file=file_buffer, purpose="assistants") + + try: + # User 1 can see their own file + mock_get_authenticated_user.return_value = user1 + user1_files = client.files.list() + user1_file_ids = [f.id for f in user1_files.data] + assert user1_file.id in user1_file_ids + assert user2_file.id not in user1_file_ids # Cannot see user2's file + + # User 2 can see their own file + mock_get_authenticated_user.return_value = user2 + user2_files = client.files.list() + user2_file_ids = [f.id for f in user2_files.data] + assert user2_file.id in user2_file_ids + assert user1_file.id not in user2_file_ids # Cannot see user1's file + + # User 1 can retrieve their own file + mock_get_authenticated_user.return_value = user1 + retrieved_file = client.files.retrieve(user1_file.id) + assert retrieved_file.id == user1_file.id + + # User 1 cannot retrieve user2's file + mock_get_authenticated_user.return_value = user1 + with pytest.raises(ValueError, match="not found"): + client.files.retrieve(user2_file.id) + + # User 1 can access their file content + mock_get_authenticated_user.return_value = user1 + content_response = client.files.content(user1_file.id) + if 
+            content = bytes(content_response, "utf-8")
+        else:
+            content = content_response.content
+        assert content == test_content_1
+
+        # User 1 cannot access user2's file content
+        mock_get_authenticated_user.return_value = user1
+        with pytest.raises(ValueError, match="not found"):
+            client.files.content(user2_file.id)
+
+        # User 1 can delete their own file
+        mock_get_authenticated_user.return_value = user1
+        delete_response = client.files.delete(user1_file.id)
+        assert delete_response.deleted is True
+
+        # User 1 cannot delete user2's file
+        mock_get_authenticated_user.return_value = user1
+        with pytest.raises(ValueError, match="not found"):
+            client.files.delete(user2_file.id)
+
+        # User 2 can still access their file after user1's file is deleted
+        mock_get_authenticated_user.return_value = user2
+        retrieved_file = client.files.retrieve(user2_file.id)
+        assert retrieved_file.id == user2_file.id
+
+        # Cleanup user2's file
+        mock_get_authenticated_user.return_value = user2
+        client.files.delete(user2_file.id)
+
+    except Exception as e:
+        # Cleanup in case of failure
+        try:
+            mock_get_authenticated_user.return_value = user1
+            client.files.delete(user1_file.id)
+        except Exception:
+            pass
+        try:
+            mock_get_authenticated_user.return_value = user2
+            client.files.delete(user2_file.id)
+        except Exception:
+            pass
+        raise e
+
+
+@patch("llama_stack.providers.utils.sqlstore.authorized_sqlstore.get_authenticated_user")
+def test_files_authentication_shared_attributes(mock_get_authenticated_user, compat_client, client_with_models):
+    """Test access control with users having identical attributes."""
+    if isinstance(client_with_models, LlamaStackAsLibraryClient) and isinstance(compat_client, OpenAI):
+        pytest.skip("OpenAI files are not supported when testing with LlamaStackAsLibraryClient")
+    if not isinstance(client_with_models, LlamaStackAsLibraryClient):
+        pytest.skip("Authentication tests require LlamaStackAsLibraryClient (library mode)")
+
+    client = compat_client
+
+    # Create users with identical attributes (required for default policy)
+    user_a = User("user-a", {"roles": ["user"], "teams": ["shared-team"]})
+    user_b = User("user-b", {"roles": ["user"], "teams": ["shared-team"]})
+
+    # User A uploads a file
+    mock_get_authenticated_user.return_value = user_a
+    test_content = b"Shared attributes file content"
+
+    with BytesIO(test_content) as file_buffer:
+        file_buffer.name = "shared_attributes_file.txt"
+        shared_file = client.files.create(file=file_buffer, purpose="assistants")
+
+    try:
+        # User B with identical attributes can access the file
+        mock_get_authenticated_user.return_value = user_b
+        files_list = client.files.list()
+        file_ids = [f.id for f in files_list.data]
+
+        # User B should be able to see the file due to identical attributes
+        assert shared_file.id in file_ids
+
+        # User B can retrieve file info
+        retrieved_file = client.files.retrieve(shared_file.id)
+        assert retrieved_file.id == shared_file.id
+
+        # User B can access file content
+        content_response = client.files.content(shared_file.id)
+        if isinstance(content_response, str):
+            content = bytes(content_response, "utf-8")
+        else:
+            content = content_response.content
+        assert content == test_content
+
+        # Cleanup
+        mock_get_authenticated_user.return_value = user_a
+        client.files.delete(shared_file.id)
+
+    except Exception as e:
+        # Cleanup in case of failure
+        try:
+            mock_get_authenticated_user.return_value = user_a
+            client.files.delete(shared_file.id)
+        except Exception:
+            pass
+        try:
+            mock_get_authenticated_user.return_value = user_b
+            client.files.delete(shared_file.id)
+        except Exception:
+            pass
+        raise e
+
+
+@patch("llama_stack.providers.utils.sqlstore.authorized_sqlstore.get_authenticated_user")
+def test_files_authentication_anonymous_access(mock_get_authenticated_user, compat_client, client_with_models):
+    """Test anonymous user behavior when no authentication is present."""
+    if isinstance(client_with_models, LlamaStackAsLibraryClient) and isinstance(compat_client, OpenAI):
+        pytest.skip("OpenAI files are not supported when testing with LlamaStackAsLibraryClient")
+    if not isinstance(client_with_models, LlamaStackAsLibraryClient):
+        pytest.skip("Authentication tests require LlamaStackAsLibraryClient (library mode)")
+
+    client = compat_client
+
+    # Simulate anonymous user (no authentication)
+    mock_get_authenticated_user.return_value = None
+
+    test_content = b"Anonymous file content"
+
+    with BytesIO(test_content) as file_buffer:
+        file_buffer.name = "anonymous_file.txt"
+        anonymous_file = client.files.create(file=file_buffer, purpose="assistants")
+
+    try:
+        # Anonymous user should be able to access their own uploaded file
+        files_list = client.files.list()
+        file_ids = [f.id for f in files_list.data]
+        assert anonymous_file.id in file_ids
+
+        # Can retrieve file info
+        retrieved_file = client.files.retrieve(anonymous_file.id)
+        assert retrieved_file.id == anonymous_file.id
+
+        # Can access file content
+        content_response = client.files.content(anonymous_file.id)
+        if isinstance(content_response, str):
+            content = bytes(content_response, "utf-8")
+        else:
+            content = content_response.content
+        assert content == test_content
+
+        # Can delete the file
+        delete_response = client.files.delete(anonymous_file.id)
+        assert delete_response.deleted is True
+
+    except Exception as e:
+        # Cleanup in case of failure
+        try:
+            client.files.delete(anonymous_file.id)
+        except Exception:
+            pass
+        raise e
diff --git a/tests/unit/files/test_files.py b/tests/unit/files/test_files.py
index 785077e91..c3ec25116 100644
--- a/tests/unit/files/test_files.py
+++ b/tests/unit/files/test_files.py
@@ -9,6 +9,7 @@ import pytest
 
 from llama_stack.apis.common.responses import Order
 from llama_stack.apis.files import OpenAIFilePurpose
+from llama_stack.distribution.access_control.access_control import default_policy
 from llama_stack.providers.inline.files.localfs import (
     LocalfsFilesImpl,
     LocalfsFilesImplConfig,
@@ -38,7 +39,7 @@ async def files_provider(tmp_path):
         storage_dir=storage_dir.as_posix(), metadata_store=SqliteSqlStoreConfig(db_path=db_path.as_posix())
     )
 
-    provider = LocalfsFilesImpl(config)
+    provider = LocalfsFilesImpl(config, default_policy())
     await provider.initialize()
 
     yield provider