From 35384770705f81702b1cbe3913bdece9191c53f0 Mon Sep 17 00:00:00 2001
From: Jiayi
Date: Fri, 12 Sep 2025 19:55:04 -0700
Subject: [PATCH] Update docs

---
 docs/docs/providers/inference/index.mdx |   2 +-
 docs/static/llama-stack-spec.html       |   2 +-
 docs/static/llama-stack-spec.yaml       |   3 +-
 llama_stack/apis/inference/inference.py |   2 +-
 llama_stack/apis/models/models.py       |   2 +-
 llama_stack/core/routers/inference.py   |   1 -
 .../remote/inference/nvidia/models.py   | 131 ------------------
 7 files changed, 6 insertions(+), 137 deletions(-)
 delete mode 100644 llama_stack/providers/remote/inference/nvidia/models.py

diff --git a/docs/docs/providers/inference/index.mdx b/docs/docs/providers/inference/index.mdx
index 1cbeb12f0..98ba10cc7 100644
--- a/docs/docs/providers/inference/index.mdx
+++ b/docs/docs/providers/inference/index.mdx
@@ -18,6 +18,6 @@ Llama Stack Inference API for generating completions, chat completions, and embe
 This API provides the raw interface to the underlying models. Three kinds of models are supported:
 - LLM models: these models generate "raw" and "chat" (conversational) completions.
 - Embedding models: these models generate embeddings to be used for semantic search.
-- Rerank models: these models reorder the documents by relevance.
+- Rerank models: these models reorder the documents based on their relevance to a query.
 
 This section contains documentation for all available providers for the **inference** API.
diff --git a/docs/static/llama-stack-spec.html b/docs/static/llama-stack-spec.html
index 8192a9cf6..0fdf3f415 100644
--- a/docs/static/llama-stack-spec.html
+++ b/docs/static/llama-stack-spec.html
@@ -17875,7 +17875,7 @@
     },
     {
         "name": "Inference",
-        "description": "This API provides the raw interface to the underlying models. Three kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.\n- Rerank models: these models reorder the documents by relevance.",
+        "description": "This API provides the raw interface to the underlying models. Three kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.\n- Rerank models: these models reorder the documents based on their relevance to a query.",
         "x-displayName": "Llama Stack Inference API for generating completions, chat completions, and embeddings."
     },
     {
diff --git a/docs/static/llama-stack-spec.yaml b/docs/static/llama-stack-spec.yaml
index 895b939ab..ec0409849 100644
--- a/docs/static/llama-stack-spec.yaml
+++ b/docs/static/llama-stack-spec.yaml
@@ -13460,7 +13460,8 @@ tags:
       - Embedding models: these models generate embeddings to be used for semantic
       search.
 
-      - Rerank models: these models reorder the documents by relevance.
+      - Rerank models: these models reorder the documents based on their relevance
+      to a query.
     x-displayName: >-
       Llama Stack Inference API for generating completions, chat completions,
       and embeddings.
diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index 7bd9f5918..6260ba552 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -1162,7 +1162,7 @@ class Inference(InferenceProvider):
     This API provides the raw interface to the underlying models. Three kinds of models are supported:
     - LLM models: these models generate "raw" and "chat" (conversational) completions.
     - Embedding models: these models generate embeddings to be used for semantic search.
-    - Rerank models: these models reorder the documents by relevance.
+    - Rerank models: these models reorder the documents based on their relevance to a query.
     """
 
     @webmethod(route="/openai/v1/chat/completions", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
diff --git a/llama_stack/apis/models/models.py b/llama_stack/apis/models/models.py
index 359f5bf0c..1275e90e3 100644
--- a/llama_stack/apis/models/models.py
+++ b/llama_stack/apis/models/models.py
@@ -27,7 +27,7 @@ class ModelType(StrEnum):
     """Enumeration of supported model types in Llama Stack.
     :cvar llm: Large language model for text generation and completion
     :cvar embedding: Embedding model for converting text to vector representations
-    :cvar rerank: Reranking model for reordering documents by relevance
+    :cvar rerank: Reranking model for reordering documents based on their relevance to a query
     """
 
     llm = "llm"
diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py
index e5826685e..c1d4203c2 100644
--- a/llama_stack/core/routers/inference.py
+++ b/llama_stack/core/routers/inference.py
@@ -191,7 +191,6 @@ class InferenceRouter(Inference):
         items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
         max_num_results: int | None = None,
     ) -> RerankResponse:
-        """Route rerank requests to the appropriate provider based on the model."""
         logger.debug(f"InferenceRouter.rerank: {model}")
         model_obj = await self._get_model(model, ModelType.rerank)
         provider = await self.routing_table.get_provider_impl(model_obj.identifier)
diff --git a/llama_stack/providers/remote/inference/nvidia/models.py b/llama_stack/providers/remote/inference/nvidia/models.py
deleted file mode 100644
index a79a1c6aa..000000000
--- a/llama_stack/providers/remote/inference/nvidia/models.py
+++ /dev/null
@@ -1,131 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack.apis.models import ModelType
-from llama_stack.models.llama.sku_types import CoreModelId
-from llama_stack.providers.utils.inference.model_registry import (
-    ProviderModelEntry,
-    build_hf_repo_model_entry,
-)
-
-SAFETY_MODELS_ENTRIES = []
-
-# https://docs.nvidia.com/nim/large-language-models/latest/supported-llm-agnostic-architectures.html
-MODEL_ENTRIES = [
-    build_hf_repo_model_entry(
-        "meta/llama3-8b-instruct",
-        CoreModelId.llama3_8b_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "meta/llama3-70b-instruct",
-        CoreModelId.llama3_70b_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "meta/llama-3.1-8b-instruct",
-        CoreModelId.llama3_1_8b_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "meta/llama-3.1-70b-instruct",
-        CoreModelId.llama3_1_70b_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "meta/llama-3.1-405b-instruct",
-        CoreModelId.llama3_1_405b_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "meta/llama-3.2-1b-instruct",
-        CoreModelId.llama3_2_1b_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "meta/llama-3.2-3b-instruct",
-        CoreModelId.llama3_2_3b_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "meta/llama-3.2-11b-vision-instruct",
-        CoreModelId.llama3_2_11b_vision_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "meta/llama-3.2-90b-vision-instruct",
-        CoreModelId.llama3_2_90b_vision_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "meta/llama-3.3-70b-instruct",
-        CoreModelId.llama3_3_70b_instruct.value,
-    ),
-    ProviderModelEntry(
-        provider_model_id="nvidia/vila",
-        model_type=ModelType.llm,
-    ),
-    # NeMo Retriever Text Embedding models -
-    #
-    # https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/support-matrix.html
-    #
-    # +-----------------------------------+--------+-----------+-----------+------------+
-    # | Model ID                          | Max    | Publisher | Embedding | Dynamic    |
-    # |                                   | Tokens |           | Dimension | Embeddings |
-    # +-----------------------------------+--------+-----------+-----------+------------+
-    # | nvidia/llama-3.2-nv-embedqa-1b-v2 | 8192   | NVIDIA    | 2048      | Yes        |
-    # | nvidia/nv-embedqa-e5-v5           | 512    | NVIDIA    | 1024      | No         |
-    # | nvidia/nv-embedqa-mistral-7b-v2   | 512    | NVIDIA    | 4096      | No         |
-    # | snowflake/arctic-embed-l          | 512    | Snowflake | 1024      | No         |
-    # +-----------------------------------+--------+-----------+-----------+------------+
-    ProviderModelEntry(
-        provider_model_id="nvidia/llama-3.2-nv-embedqa-1b-v2",
-        model_type=ModelType.embedding,
-        metadata={
-            "embedding_dimension": 2048,
-            "context_length": 8192,
-        },
-    ),
-    ProviderModelEntry(
-        provider_model_id="nvidia/nv-embedqa-e5-v5",
-        model_type=ModelType.embedding,
-        metadata={
-            "embedding_dimension": 1024,
-            "context_length": 512,
-        },
-    ),
-    ProviderModelEntry(
-        provider_model_id="nvidia/nv-embedqa-mistral-7b-v2",
-        model_type=ModelType.embedding,
-        metadata={
-            "embedding_dimension": 4096,
-            "context_length": 512,
-        },
-    ),
-    ProviderModelEntry(
-        provider_model_id="snowflake/arctic-embed-l",
-        model_type=ModelType.embedding,
-        metadata={
-            "embedding_dimension": 1024,
-            "context_length": 512,
-        },
-    ),
-    # NVIDIA Reranking models
-    ProviderModelEntry(
-        provider_model_id="nv-rerank-qa-mistral-4b:1",
-        model_type=ModelType.rerank,
-        metadata={
-            "endpoint": "https://ai.api.nvidia.com/v1/retrieval/nvidia/reranking",
-        },
-    ),
-    ProviderModelEntry(
-        provider_model_id="nvidia/nv-rerankqa-mistral-4b-v3",
-        model_type=ModelType.rerank,
-        metadata={
-            "endpoint": "https://ai.api.nvidia.com/v1/retrieval/nvidia/nv-rerankqa-mistral-4b-v3/reranking",
-        },
-    ),
-    ProviderModelEntry(
-        provider_model_id="nvidia/llama-3.2-nv-rerankqa-1b-v2",
-        model_type=ModelType.rerank,
-        metadata={
-            "endpoint": "https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v2/reranking",
-        },
-    ),
-    # TODO(mf): how do we handle Nemotron models?
-    # "Llama3.1-Nemotron-51B-Instruct" -> "meta/llama-3.1-nemotron-51b-instruct",
-] + SAFETY_MODELS_ENTRIES
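
Note for reviewers: each rerank entry deleted above carries an "endpoint" in its metadata. For context, below is a minimal sketch of what a raw request against one of those endpoints might look like. Only the endpoint URL and model ID come from the deleted models.py; the payload shape ("query"/"passages"), the "rankings" response field, and the NVIDIA_API_KEY environment variable are assumptions based on NVIDIA's published reranking examples, not anything defined by this patch.

```python
# Hypothetical standalone sketch; not a llama-stack client API.
import os

import requests

# Endpoint and model ID taken from the metadata in the deleted models.py.
ENDPOINT = "https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v2/reranking"
MODEL_ID = "nvidia/llama-3.2-nv-rerankqa-1b-v2"


def rerank(query: str, documents: list[str], top_n: int = 3) -> list[tuple[str, float]]:
    """Return the top_n documents reordered by relevance to the query."""
    response = requests.post(
        ENDPOINT,
        # NVIDIA_API_KEY is an assumed convention for supplying credentials.
        headers={"Authorization": f"Bearer {os.environ['NVIDIA_API_KEY']}"},
        json={
            "model": MODEL_ID,
            "query": {"text": query},
            "passages": [{"text": d} for d in documents],
        },
        timeout=30,
    )
    response.raise_for_status()
    # Assumed response shape: {"rankings": [{"index": int, "logit": float}, ...]},
    # sorted from most to least relevant.
    rankings = response.json()["rankings"]
    return [(documents[r["index"]], r["logit"]) for r in rankings[:top_n]]


if __name__ == "__main__":
    docs = [
        "The sky is blue.",
        "Rerank models reorder documents based on their relevance to a query.",
        "Cats sleep a lot.",
    ]
    for text, score in rerank("What does a rerank model do?", docs):
        print(f"{score:+.2f}  {text}")
```

Going over raw HTTP keeps the sketch independent of the llama-stack routing layer, since this patch removes the static NVIDIA model table without showing its replacement.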