From a19c16428fbc437994f5ab9f8b85a0c7905e91a2 Mon Sep 17 00:00:00 2001
From: Francisco Javier Arceo
Date: Wed, 6 Aug 2025 16:55:14 -0400
Subject: [PATCH 1/2] feat: Updating files/content response to return additional fields

Signed-off-by: Francisco Javier Arceo
---
 .github/workflows/integration-tests.yml | 2 +-
 CONTRIBUTING.md | 1 +
 docs/_static/llama-stack-spec.html | 209 +
 docs/_static/llama-stack-spec.yaml | 152 +
 docs/source/distributions/building_distro.md | 23 +-
 .../ondevice_distro/android_sdk.md | 8 +-
 .../self_hosted_distro/nvidia.md | 2 +-
 docs/source/getting_started/demo_script.py | 11 +-
 .../getting_started/detailed_tutorial.md | 8 +-
 docs/source/providers/agents/index.md | 2 +-
 docs/source/providers/datasetio/index.md | 2 +-
 docs/source/providers/eval/index.md | 2 +-
 docs/source/providers/files/index.md | 2 +-
 docs/source/providers/files/inline_localfs.md | 2 +-
 docs/source/providers/inference/index.md | 2 +-
 .../providers/inference/remote_hf_endpoint.md | 2 +-
 .../inference/remote_hf_serverless.md | 2 +-
 docs/source/providers/inference/remote_tgi.md | 2 +-
 docs/source/providers/post_training/index.md | 2 +-
 .../post_training/inline_huggingface.md | 3 +-
 docs/source/providers/safety/index.md | 2 +-
 docs/source/providers/scoring/index.md | 2 +-
 docs/source/providers/telemetry/index.md | 2 +-
 docs/source/providers/tool_runtime/index.md | 2 +-
 docs/source/providers/vector_io/index.md | 2 +-
 .../providers/vector_io/inline_chromadb.md | 2 +-
 .../providers/vector_io/inline_milvus.md | 2 +-
 .../providers/vector_io/inline_qdrant.md | 2 +-
 .../providers/vector_io/inline_sqlite-vec.md | 2 +-
 .../providers/vector_io/inline_sqlite_vec.md | 2 +-
 .../providers/vector_io/remote_chromadb.md | 2 +-
 .../providers/vector_io/remote_milvus.md | 4 +-
 .../llama_cli_reference/download_models.md | 2 +-
 .../references/llama_cli_reference/index.md | 2 +-
 llama_stack/apis/common/errors.py | 32 +-
 llama_stack/apis/safety/safety.py | 77 +-
 llama_stack/apis/vector_io/vector_io.py | 8 +
 llama_stack/core/routers/inference.py | 419 +-
 llama_stack/core/routers/safety.py | 39 +
 llama_stack/distributions/ci-tests/run.yaml | 1 +
 .../distributions/nvidia/doc_template.md | 2 +-
 .../postgres-demo/postgres_demo.py | 2 +-
 .../distributions/postgres-demo/run.yaml | 2 +-
 llama_stack/distributions/starter/run.yaml | 1 +
 .../inline/datasetio/localfs/datasetio.py | 6 +-
 .../post_training/huggingface/config.py | 9 +-
 .../huggingface/post_training.py | 19 +-
 .../post_training/torchtune/post_training.py | 11 +-
 .../inline/safety/llama_guard/llama_guard.py | 170 +
 .../meta_reference/console_span_processor.py | 5 +-
 .../telemetry/meta_reference/telemetry.py | 39 +-
 .../datasetio/huggingface/huggingface.py | 6 +-
 .../remote/inference/gemini/models.py | 2 +
 .../remote/inference/nvidia/NVIDIA.md | 16 +-
 .../remote/inference/ollama/ollama.py | 3 +-
 .../remote/vector_io/milvus/milvus.py | 51 +-
 .../providers/utils/datasetio/url_utils.py | 4 +-
 .../providers/utils/inference/stream_utils.py | 129 -
 .../utils/memory/openai_vector_store_mixin.py | 60 +-
 .../providers/utils/memory/vector_store.py | 16 +-
 .../providers/utils/telemetry/tracing.py | 31 +-
 .../[fileId]/contents/[contentId]/page.tsx | 383 ++
 .../[id]/files/[fileId]/contents/page.tsx | 297 ++
 .../[id]/files/[fileId]/page.tsx | 258 ++
 .../ui/app/logs/vector-stores/layout.tsx | 25 +-
 .../ui/app/logs/vector-stores/page.tsx | 148 +-
 .../vector-stores/vector-store-detail.tsx | 17 +-
 llama_stack/ui/lib/contents-api.ts | 112 +
 llama_stack/ui/package-lock.json | 10 +-
llama_stack/ui/package.json | 2 +- pyproject.toml | 9 +- scripts/provider_codegen.py | 7 +- .../test_supervied_fine_tuning.py | 60 - tests/integration/README.md | 8 +- .../agents/test_openai_responses.py | 37 +- tests/integration/conftest.py | 5 + tests/integration/fixtures/common.py | 39 +- .../inference/test_openai_completion.py | 17 - .../non_ci/responses}/__init__.py | 0 .../non_ci/responses/fixtures}/__init__.py | 0 .../non_ci/responses}/fixtures/fixtures.py | 10 - .../fixtures/images/vision_test_1.jpg | Bin .../fixtures/images/vision_test_2.jpg | Bin .../fixtures/images/vision_test_3.jpg | Bin .../non_ci/responses}/fixtures/load.py | 0 .../fixtures/pdfs/llama_stack_and_models.pdf | Bin .../fixtures/test_cases/chat_completion.yaml | 0 .../fixtures/test_cases/responses.yaml | 0 .../non_ci/responses}/test_responses.py | 252 +- tests/integration/recordings/index.sqlite | Bin 53248 -> 53248 bytes .../recordings/responses/140187e305dc.json | 56 + .../recordings/responses/382c2f22274c.json | 58 + .../recordings/responses/4096743baf8e.json | 56 + .../recordings/responses/4a3a4447b16b.json | 98 +- .../recordings/responses/67198cbad48f.json | 56 + .../recordings/responses/8295382a8e7c.json | 56 + .../recordings/responses/830a1fe14938.json | 56 + .../recordings/responses/9c007f300365.json | 58 + .../recordings/responses/a5187d9d5057.json | 56 + .../recordings/responses/b44cc7a7afc8.json | 3076 ++++++------- .../recordings/responses/c9667519ad7c.json | 58 + .../recordings/responses/cb3df2a1dc22.json | 56 + .../recordings/responses/d0ac68cbde69.json | 14 +- .../recordings/responses/d4f56d7d1996.json | 56 + .../recordings/responses/da531c71e64f.json | 421 ++ .../recordings/responses/dbc41d2417e1.json | 674 +++ .../recordings/responses/e2c9b07709fe.json | 58 + .../recordings/responses/f1ea938b0b0d.json | 56 + .../vision/responses/4096743baf8e.json | 56 + .../vision/responses/67198cbad48f.json | 56 + .../vision/responses/830a1fe14938.json | 56 + .../vision/responses/9c007f300365.json | 58 + .../vision/responses/c9667519ad7c.json | 58 + .../vision/responses/d4f56d7d1996.json | 56 + tests/integration/safety/test_safety.py | 45 + .../telemetry/test_openai_telemetry.py | 195 + .../tool_runtime/test_registration.py | 17 +- .../vector_io/test_openai_vector_stores.py | 94 +- .../providers/vector_io/remote/test_milvus.py | 141 + .../test_vector_io_openai_vector_stores.py | 34 +- tests/verifications/README.md | 79 - tests/verifications/REPORT.md | 232 - tests/verifications/conf/cerebras.yaml | 11 - .../conf/fireworks-llama-stack.yaml | 17 - tests/verifications/conf/fireworks.yaml | 15 - .../verifications/conf/groq-llama-stack.yaml | 17 - tests/verifications/conf/groq.yaml | 15 - tests/verifications/conf/meta_reference.yaml | 8 - .../conf/openai-llama-stack.yaml | 9 - tests/verifications/conf/openai.yaml | 9 - .../conf/together-llama-stack.yaml | 17 - tests/verifications/conf/together.yaml | 15 - tests/verifications/conftest.py | 96 - tests/verifications/generate_report.py | 502 --- .../openai-api-verification-run.yaml | 162 - tests/verifications/openai_api/__init__.py | 5 - tests/verifications/openai_api/conftest.py | 40 - .../openai_api/fixtures/__init__.py | 5 - .../openai_api/test_chat_completion.py | 717 ---- .../verifications/test_results/fireworks.json | 3751 ---------------- .../test_results/meta_reference.json | 1097 ----- tests/verifications/test_results/openai.json | 2161 ---------- .../verifications/test_results/together.json | 3821 ----------------- 143 files changed, 6907 insertions(+), 15104 
deletions(-) delete mode 100644 llama_stack/providers/utils/inference/stream_utils.py create mode 100644 llama_stack/ui/app/logs/vector-stores/[id]/files/[fileId]/contents/[contentId]/page.tsx create mode 100644 llama_stack/ui/app/logs/vector-stores/[id]/files/[fileId]/contents/page.tsx create mode 100644 llama_stack/ui/app/logs/vector-stores/[id]/files/[fileId]/page.tsx create mode 100644 llama_stack/ui/lib/contents-api.ts delete mode 100644 tests/client-sdk/post_training/test_supervied_fine_tuning.py rename tests/{client-sdk/post_training => integration/non_ci/responses}/__init__.py (100%) rename tests/{verifications => integration/non_ci/responses/fixtures}/__init__.py (100%) rename tests/{verifications/openai_api => integration/non_ci/responses}/fixtures/fixtures.py (91%) rename tests/{verifications/openai_api => integration/non_ci/responses}/fixtures/images/vision_test_1.jpg (100%) rename tests/{verifications/openai_api => integration/non_ci/responses}/fixtures/images/vision_test_2.jpg (100%) rename tests/{verifications/openai_api => integration/non_ci/responses}/fixtures/images/vision_test_3.jpg (100%) rename tests/{verifications/openai_api => integration/non_ci/responses}/fixtures/load.py (100%) rename tests/{verifications/openai_api => integration/non_ci/responses}/fixtures/pdfs/llama_stack_and_models.pdf (100%) rename tests/{verifications/openai_api => integration/non_ci/responses}/fixtures/test_cases/chat_completion.yaml (100%) rename tests/{verifications/openai_api => integration/non_ci/responses}/fixtures/test_cases/responses.yaml (100%) rename tests/{verifications/openai_api => integration/non_ci/responses}/test_responses.py (77%) create mode 100644 tests/integration/recordings/responses/140187e305dc.json create mode 100644 tests/integration/recordings/responses/382c2f22274c.json create mode 100644 tests/integration/recordings/responses/4096743baf8e.json create mode 100644 tests/integration/recordings/responses/67198cbad48f.json create mode 100644 tests/integration/recordings/responses/8295382a8e7c.json create mode 100644 tests/integration/recordings/responses/830a1fe14938.json create mode 100644 tests/integration/recordings/responses/9c007f300365.json create mode 100644 tests/integration/recordings/responses/a5187d9d5057.json create mode 100644 tests/integration/recordings/responses/c9667519ad7c.json create mode 100644 tests/integration/recordings/responses/cb3df2a1dc22.json create mode 100644 tests/integration/recordings/responses/d4f56d7d1996.json create mode 100644 tests/integration/recordings/responses/da531c71e64f.json create mode 100644 tests/integration/recordings/responses/dbc41d2417e1.json create mode 100644 tests/integration/recordings/responses/e2c9b07709fe.json create mode 100644 tests/integration/recordings/responses/f1ea938b0b0d.json create mode 100644 tests/integration/recordings/vision/responses/4096743baf8e.json create mode 100644 tests/integration/recordings/vision/responses/67198cbad48f.json create mode 100644 tests/integration/recordings/vision/responses/830a1fe14938.json create mode 100644 tests/integration/recordings/vision/responses/9c007f300365.json create mode 100644 tests/integration/recordings/vision/responses/c9667519ad7c.json create mode 100644 tests/integration/recordings/vision/responses/d4f56d7d1996.json create mode 100644 tests/integration/telemetry/test_openai_telemetry.py delete mode 100644 tests/verifications/README.md delete mode 100644 tests/verifications/REPORT.md delete mode 100644 tests/verifications/conf/cerebras.yaml delete mode 
100644 tests/verifications/conf/fireworks-llama-stack.yaml delete mode 100644 tests/verifications/conf/fireworks.yaml delete mode 100644 tests/verifications/conf/groq-llama-stack.yaml delete mode 100644 tests/verifications/conf/groq.yaml delete mode 100644 tests/verifications/conf/meta_reference.yaml delete mode 100644 tests/verifications/conf/openai-llama-stack.yaml delete mode 100644 tests/verifications/conf/openai.yaml delete mode 100644 tests/verifications/conf/together-llama-stack.yaml delete mode 100644 tests/verifications/conf/together.yaml delete mode 100644 tests/verifications/conftest.py delete mode 100755 tests/verifications/generate_report.py delete mode 100644 tests/verifications/openai-api-verification-run.yaml delete mode 100644 tests/verifications/openai_api/__init__.py delete mode 100644 tests/verifications/openai_api/conftest.py delete mode 100644 tests/verifications/openai_api/fixtures/__init__.py delete mode 100644 tests/verifications/openai_api/test_chat_completion.py delete mode 100644 tests/verifications/test_results/fireworks.json delete mode 100644 tests/verifications/test_results/meta_reference.json delete mode 100644 tests/verifications/test_results/openai.json delete mode 100644 tests/verifications/test_results/together.json diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index a2a56c003..a38d4971a 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -53,7 +53,7 @@ jobs: # Get test directories dynamically, excluding non-test directories # NOTE: we are excluding post_training since the tests take too long TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" | - grep -Ev "^(__pycache__|fixtures|test_cases|recordings|post_training)$" | + grep -Ev "^(__pycache__|fixtures|test_cases|recordings|non_ci|post_training)$" | sort | jq -R -s -c 'split("\n")[:-1]') echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index fb223dc40..066fcecf0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -157,6 +157,7 @@ uv sync that describes the configuration. These descriptions will be used to generate the provider documentation. * When possible, use keyword arguments only when calling functions. +* Llama Stack utilizes [custom Exception classes](llama_stack/apis/common/errors.py) for certain Resources that should be used where applicable. 
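As a quick illustration of the CONTRIBUTING.md guidance added above, the sketch below raises one of the shared exception classes this patch consolidates in `llama_stack/apis/common/errors.py`. The `registry` dict and the `"my-model"` identifier are hypothetical; only `ModelNotFoundError` and its message format come from the patch.

```python
# Minimal sketch (assumed lookup context): raise the shared not-found exception
# from llama_stack/apis/common/errors.py instead of a bare ValueError.
from llama_stack.apis.common.errors import ModelNotFoundError


def get_model_or_raise(registry: dict, model_id: str):
    model = registry.get(model_id)
    if model is None:
        # Produces: "Model 'my-model' not found. Use 'client.models.list()'
        # to list available Models."
        raise ModelNotFoundError(model_id)
    return model
```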
## Common Tasks diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 79b9ede30..cb5f8af87 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -4734,6 +4734,49 @@ } } }, + "/v1/openai/v1/moderations": { + "post": { + "responses": { + "200": { + "description": "A moderation object.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ModerationObject" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Safety" + ], + "description": "Classifies if text and/or image inputs are potentially harmful.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RunModerationRequest" + } + } + }, + "required": true + } + } + }, "/v1/safety/run-shield": { "post": { "responses": { @@ -14778,6 +14821,47 @@ "text": { "type": "string", "description": "The actual text content" + }, + "embedding": { + "type": "array", + "items": { + "type": "number" + }, + "description": "(Optional) Embedding vector for the content, if available" + }, + "created_timestamp": { + "type": "integer", + "description": "(Optional) Timestamp when the content was created" + }, + "metadata": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + }, + "description": "(Optional) Metadata associated with the content, such as source, author, etc." + }, + "chunk_metadata": { + "$ref": "#/components/schemas/ChunkMetadata", + "description": "(Optional) Metadata associated with the chunk, such as document ID, source, etc." } }, "additionalProperties": false, @@ -16401,6 +16485,131 @@ ], "title": "RunEvalRequest" }, + "RunModerationRequest": { + "type": "object", + "properties": { + "input": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ], + "description": "Input (or inputs) to classify. Can be a single string, an array of strings, or an array of multi-modal input objects similar to other models." + }, + "model": { + "type": "string", + "description": "The content moderation model you would like to use." + } + }, + "additionalProperties": false, + "required": [ + "input", + "model" + ], + "title": "RunModerationRequest" + }, + "ModerationObject": { + "type": "object", + "properties": { + "id": { + "type": "string", + "description": "The unique identifier for the moderation request." + }, + "model": { + "type": "string", + "description": "The model used to generate the moderation results." + }, + "results": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ModerationObjectResults" + }, + "description": "A list of moderation objects" + } + }, + "additionalProperties": false, + "required": [ + "id", + "model", + "results" + ], + "title": "ModerationObject", + "description": "A moderation object." + }, + "ModerationObjectResults": { + "type": "object", + "properties": { + "flagged": { + "type": "boolean", + "description": "Whether any of the below categories are flagged." 
+ }, + "categories": { + "type": "object", + "additionalProperties": { + "type": "boolean" + }, + "description": "A list of the categories, and whether they are flagged or not." + }, + "category_applied_input_types": { + "type": "object", + "additionalProperties": { + "type": "array", + "items": { + "type": "string" + } + }, + "description": "A list of the categories along with the input type(s) that the score applies to." + }, + "category_scores": { + "type": "object", + "additionalProperties": { + "type": "number" + }, + "description": "A list of the categories along with their scores as predicted by model. Required set of categories that need to be in response - violence - violence/graphic - harassment - harassment/threatening - hate - hate/threatening - illicit - illicit/violent - sexual - sexual/minors - self-harm - self-harm/intent - self-harm/instructions" + }, + "user_message": { + "type": "string" + }, + "metadata": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "flagged", + "metadata" + ], + "title": "ModerationObjectResults", + "description": "A moderation object." + }, "RunShieldRequest": { "type": "object", "properties": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index a15a2824e..92f360ff5 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -3358,6 +3358,36 @@ paths: schema: $ref: '#/components/schemas/RunEvalRequest' required: true + /v1/openai/v1/moderations: + post: + responses: + '200': + description: A moderation object. + content: + application/json: + schema: + $ref: '#/components/schemas/ModerationObject' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Safety + description: >- + Classifies if text and/or image inputs are potentially harmful. + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RunModerationRequest' + required: true /v1/safety/run-shield: post: responses: @@ -10966,6 +10996,34 @@ components: text: type: string description: The actual text content + embedding: + type: array + items: + type: number + description: >- + (Optional) Embedding vector for the content, if available + created_timestamp: + type: integer + description: >- + (Optional) Timestamp when the content was created + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + description: >- + (Optional) Metadata associated with the content, such as source, author, + etc. + chunk_metadata: + $ref: '#/components/schemas/ChunkMetadata' + description: >- + (Optional) Metadata associated with the chunk, such as document ID, source, + etc. additionalProperties: false required: - type @@ -12184,6 +12242,100 @@ components: required: - benchmark_config title: RunEvalRequest + RunModerationRequest: + type: object + properties: + input: + oneOf: + - type: string + - type: array + items: + type: string + description: >- + Input (or inputs) to classify. 
Can be a single string, an array of strings, + or an array of multi-modal input objects similar to other models. + model: + type: string + description: >- + The content moderation model you would like to use. + additionalProperties: false + required: + - input + - model + title: RunModerationRequest + ModerationObject: + type: object + properties: + id: + type: string + description: >- + The unique identifier for the moderation request. + model: + type: string + description: >- + The model used to generate the moderation results. + results: + type: array + items: + $ref: '#/components/schemas/ModerationObjectResults' + description: A list of moderation objects + additionalProperties: false + required: + - id + - model + - results + title: ModerationObject + description: A moderation object. + ModerationObjectResults: + type: object + properties: + flagged: + type: boolean + description: >- + Whether any of the below categories are flagged. + categories: + type: object + additionalProperties: + type: boolean + description: >- + A list of the categories, and whether they are flagged or not. + category_applied_input_types: + type: object + additionalProperties: + type: array + items: + type: string + description: >- + A list of the categories along with the input type(s) that the score applies + to. + category_scores: + type: object + additionalProperties: + type: number + description: >- + A list of the categories along with their scores as predicted by model. + Required set of categories that need to be in response - violence - violence/graphic + - harassment - harassment/threatening - hate - hate/threatening - illicit + - illicit/violent - sexual - sexual/minors - self-harm - self-harm/intent + - self-harm/instructions + user_message: + type: string + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - flagged + - metadata + title: ModerationObjectResults + description: A moderation object. RunShieldRequest: type: object properties: diff --git a/docs/source/distributions/building_distro.md b/docs/source/distributions/building_distro.md index d1c79052d..24098708f 100644 --- a/docs/source/distributions/building_distro.md +++ b/docs/source/distributions/building_distro.md @@ -53,24 +53,31 @@ The main points to consider are: ``` llama stack build -h -usage: llama stack build [-h] [--config CONFIG] [--template TEMPLATE] [--list-templates] [--image-type {container,venv}] [--image-name IMAGE_NAME] [--print-deps-only] [--run] +usage: llama stack build [-h] [--config CONFIG] [--template TEMPLATE] [--distro DISTRIBUTION] [--list-distros] [--image-type {container,venv}] [--image-name IMAGE_NAME] [--print-deps-only] + [--run] [--providers PROVIDERS] Build a Llama stack container options: -h, --help show this help message and exit - --config CONFIG Path to a config file to use for the build. You can find example configs in llama_stack.cores/**/build.yaml. If this argument is not provided, you will - be prompted to enter information interactively (default: None) - --template TEMPLATE Name of the example template config to use for build. You may use `llama stack build --list-templates` to check out the available templates (default: None) - --list-templates Show the available templates for building a Llama Stack distribution (default: False) + --config CONFIG Path to a config file to use for the build. 
You can find example configs in llama_stack.cores/**/build.yaml. If this argument is not provided, you will be prompted to + enter information interactively (default: None) + --template TEMPLATE (deprecated) Name of the example template config to use for build. You may use `llama stack build --list-distros` to check out the available distributions (default: + None) + --distro DISTRIBUTION, --distribution DISTRIBUTION + Name of the distribution to use for build. You may use `llama stack build --list-distros` to check out the available distributions (default: None) + --list-distros, --list-distributions + Show the available distributions for building a Llama Stack distribution (default: False) --image-type {container,venv} Image Type to use for the build. If not specified, will use the image type from the template config. (default: None) --image-name IMAGE_NAME - [for image-type=container|venv] Name of the virtual environment to use for the build. If not specified, currently active environment will be used if - found. (default: None) + [for image-type=container|venv] Name of the virtual environment to use for the build. If not specified, currently active environment will be used if found. (default: + None) --print-deps-only Print the dependencies for the stack only, without building the stack (default: False) --run Run the stack after building using the same image type, name, and other applicable arguments (default: False) - + --providers PROVIDERS + Build a config for a list of providers and only those providers. This list is formatted like: api1=provider1,api2=provider2. Where there can be multiple providers per + API. (default: None) ``` After this step is complete, a file named `-build.yaml` and template file `-run.yaml` will be generated and saved at the output file path specified at the end of the command. diff --git a/docs/source/distributions/ondevice_distro/android_sdk.md b/docs/source/distributions/ondevice_distro/android_sdk.md index 979acd913..9d16d07d7 100644 --- a/docs/source/distributions/ondevice_distro/android_sdk.md +++ b/docs/source/distributions/ondevice_distro/android_sdk.md @@ -56,12 +56,12 @@ Breaking down the demo app, this section will show the core pieces that are used ### Setup Remote Inferencing Start a Llama Stack server on localhost. Here is an example of how you can do this using the firework.ai distribution: ``` -python -m venv stack-fireworks -source stack-fireworks/bin/activate # On Windows: stack-fireworks\Scripts\activate +uv venv starter --python 3.12 +source starter/bin/activate # On Windows: starter\Scripts\activate pip install --no-cache llama-stack==0.2.2 -llama stack build --distro fireworks --image-type venv +llama stack build --distro starter --image-type venv export FIREWORKS_API_KEY= -llama stack run fireworks --port 5050 +llama stack run starter --port 5050 ``` Ensure the Llama Stack server version is the same as the Kotlin SDK Library for maximum compatibility. diff --git a/docs/source/distributions/self_hosted_distro/nvidia.md b/docs/source/distributions/self_hosted_distro/nvidia.md index 6e399e6ce..e845c3c48 100644 --- a/docs/source/distributions/self_hosted_distro/nvidia.md +++ b/docs/source/distributions/self_hosted_distro/nvidia.md @@ -157,7 +157,7 @@ docker run \ If you've set up your local development environment, you can also build the image using your local virtual environment. 
```bash -INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct +INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct llama stack build --distro nvidia --image-type venv llama stack run ./run.yaml \ --port 8321 \ diff --git a/docs/source/getting_started/demo_script.py b/docs/source/getting_started/demo_script.py index 298fd9899..777fc78c2 100644 --- a/docs/source/getting_started/demo_script.py +++ b/docs/source/getting_started/demo_script.py @@ -52,11 +52,16 @@ agent = Agent( prompt = "How do you do great work?" print("prompt>", prompt) +use_stream = True response = agent.create_turn( messages=[{"role": "user", "content": prompt}], session_id=agent.create_session("rag_session"), - stream=True, + stream=use_stream, ) -for log in AgentEventLogger().log(response): - log.print() +# Only call `AgentEventLogger().log(response)` for streaming responses. +if use_stream: + for log in AgentEventLogger().log(response): + log.print() +else: + print(response) diff --git a/docs/source/getting_started/detailed_tutorial.md b/docs/source/getting_started/detailed_tutorial.md index ff2eaead4..14f888628 100644 --- a/docs/source/getting_started/detailed_tutorial.md +++ b/docs/source/getting_started/detailed_tutorial.md @@ -150,13 +150,7 @@ pip install llama-stack-client ``` ::: -:::{tab-item} Install with `venv` -```bash -python -m venv stack-client -source stack-client/bin/activate # On Windows: stack-client\Scripts\activate -pip install llama-stack-client -``` -::: + :::: Now let's use the `llama-stack-client` [CLI](../references/llama_stack_client_cli_reference.md) to check the diff --git a/docs/source/providers/agents/index.md b/docs/source/providers/agents/index.md index a88f085ad..92bf9edc0 100644 --- a/docs/source/providers/agents/index.md +++ b/docs/source/providers/agents/index.md @@ -1,4 +1,4 @@ -# Agents +# Agents ## Overview diff --git a/docs/source/providers/datasetio/index.md b/docs/source/providers/datasetio/index.md index 9b0f385f4..94a97e2ed 100644 --- a/docs/source/providers/datasetio/index.md +++ b/docs/source/providers/datasetio/index.md @@ -1,4 +1,4 @@ -# Datasetio +# Datasetio ## Overview diff --git a/docs/source/providers/eval/index.md b/docs/source/providers/eval/index.md index f8d24a820..d180d256c 100644 --- a/docs/source/providers/eval/index.md +++ b/docs/source/providers/eval/index.md @@ -1,4 +1,4 @@ -# Eval +# Eval ## Overview diff --git a/docs/source/providers/files/index.md b/docs/source/providers/files/index.md index 8d4f8773a..692aad3ca 100644 --- a/docs/source/providers/files/index.md +++ b/docs/source/providers/files/index.md @@ -1,4 +1,4 @@ -# Files +# Files ## Overview diff --git a/docs/source/providers/files/inline_localfs.md b/docs/source/providers/files/inline_localfs.md index 54c489c7d..09267b7d8 100644 --- a/docs/source/providers/files/inline_localfs.md +++ b/docs/source/providers/files/inline_localfs.md @@ -8,7 +8,7 @@ Local filesystem-based file storage provider for managing files and documents lo | Field | Type | Required | Default | Description | |-------|------|----------|---------|-------------| -| `storage_dir` | `` | No | PydanticUndefined | Directory to store uploaded files | +| `storage_dir` | `` | No | | Directory to store uploaded files | | `metadata_store` | `utils.sqlstore.sqlstore.SqliteSqlStoreConfig \| utils.sqlstore.sqlstore.PostgresSqlStoreConfig` | No | sqlite | SQL store configuration for file metadata | | `ttl_secs` | `` | No | 31536000 | | diff --git a/docs/source/providers/inference/index.md b/docs/source/providers/inference/index.md index 
207c28c64..1c7bc86b9 100644 --- a/docs/source/providers/inference/index.md +++ b/docs/source/providers/inference/index.md @@ -1,4 +1,4 @@ -# Inference +# Inference ## Overview diff --git a/docs/source/providers/inference/remote_hf_endpoint.md b/docs/source/providers/inference/remote_hf_endpoint.md index f9ca6b538..8aaf13476 100644 --- a/docs/source/providers/inference/remote_hf_endpoint.md +++ b/docs/source/providers/inference/remote_hf_endpoint.md @@ -8,7 +8,7 @@ HuggingFace Inference Endpoints provider for dedicated model serving. | Field | Type | Required | Default | Description | |-------|------|----------|---------|-------------| -| `endpoint_name` | `` | No | PydanticUndefined | The name of the Hugging Face Inference Endpoint in the format of '{namespace}/{endpoint_name}' (e.g. 'my-cool-org/meta-llama-3-1-8b-instruct-rce'). Namespace is optional and will default to the user account if not provided. | +| `endpoint_name` | `` | No | | The name of the Hugging Face Inference Endpoint in the format of '{namespace}/{endpoint_name}' (e.g. 'my-cool-org/meta-llama-3-1-8b-instruct-rce'). Namespace is optional and will default to the user account if not provided. | | `api_token` | `pydantic.types.SecretStr \| None` | No | | Your Hugging Face user access token (will default to locally saved token if not provided) | ## Sample Configuration diff --git a/docs/source/providers/inference/remote_hf_serverless.md b/docs/source/providers/inference/remote_hf_serverless.md index 345af3e49..6764590b8 100644 --- a/docs/source/providers/inference/remote_hf_serverless.md +++ b/docs/source/providers/inference/remote_hf_serverless.md @@ -8,7 +8,7 @@ HuggingFace Inference API serverless provider for on-demand model inference. | Field | Type | Required | Default | Description | |-------|------|----------|---------|-------------| -| `huggingface_repo` | `` | No | PydanticUndefined | The model ID of the model on the Hugging Face Hub (e.g. 'meta-llama/Meta-Llama-3.1-70B-Instruct') | +| `huggingface_repo` | `` | No | | The model ID of the model on the Hugging Face Hub (e.g. 'meta-llama/Meta-Llama-3.1-70B-Instruct') | | `api_token` | `pydantic.types.SecretStr \| None` | No | | Your Hugging Face user access token (will default to locally saved token if not provided) | ## Sample Configuration diff --git a/docs/source/providers/inference/remote_tgi.md b/docs/source/providers/inference/remote_tgi.md index 125984fab..104bb4aab 100644 --- a/docs/source/providers/inference/remote_tgi.md +++ b/docs/source/providers/inference/remote_tgi.md @@ -8,7 +8,7 @@ Text Generation Inference (TGI) provider for HuggingFace model serving. 
| Field | Type | Required | Default | Description | |-------|------|----------|---------|-------------| -| `url` | `` | No | PydanticUndefined | The URL for the TGI serving endpoint | +| `url` | `` | No | | The URL for the TGI serving endpoint | ## Sample Configuration diff --git a/docs/source/providers/post_training/index.md b/docs/source/providers/post_training/index.md index fb6af2d57..c6c92c40e 100644 --- a/docs/source/providers/post_training/index.md +++ b/docs/source/providers/post_training/index.md @@ -1,4 +1,4 @@ -# Post_Training +# Post_Training ## Overview diff --git a/docs/source/providers/post_training/inline_huggingface.md b/docs/source/providers/post_training/inline_huggingface.md index 0a8745e71..8b10fe79c 100644 --- a/docs/source/providers/post_training/inline_huggingface.md +++ b/docs/source/providers/post_training/inline_huggingface.md @@ -27,7 +27,7 @@ HuggingFace-based post-training provider for fine-tuning models using the Huggin | `dpo_beta` | `` | No | 0.1 | | | `use_reference_model` | `` | No | True | | | `dpo_loss_type` | `Literal['sigmoid', 'hinge', 'ipo', 'kto_pair'` | No | sigmoid | | -| `dpo_output_dir` | `` | No | ./checkpoints/dpo | | +| `dpo_output_dir` | `` | No | | | ## Sample Configuration @@ -35,6 +35,7 @@ HuggingFace-based post-training provider for fine-tuning models using the Huggin checkpoint_format: huggingface distributed_backend: null device: cpu +dpo_output_dir: ~/.llama/dummy/dpo_output ``` diff --git a/docs/source/providers/safety/index.md b/docs/source/providers/safety/index.md index f82694ac8..5ddda2242 100644 --- a/docs/source/providers/safety/index.md +++ b/docs/source/providers/safety/index.md @@ -1,4 +1,4 @@ -# Safety +# Safety ## Overview diff --git a/docs/source/providers/scoring/index.md b/docs/source/providers/scoring/index.md index 31a87c555..f3bd48eb0 100644 --- a/docs/source/providers/scoring/index.md +++ b/docs/source/providers/scoring/index.md @@ -1,4 +1,4 @@ -# Scoring +# Scoring ## Overview diff --git a/docs/source/providers/telemetry/index.md b/docs/source/providers/telemetry/index.md index 2451e8f62..c7fbfed73 100644 --- a/docs/source/providers/telemetry/index.md +++ b/docs/source/providers/telemetry/index.md @@ -1,4 +1,4 @@ -# Telemetry +# Telemetry ## Overview diff --git a/docs/source/providers/tool_runtime/index.md b/docs/source/providers/tool_runtime/index.md index a0b835e3b..8d29aed43 100644 --- a/docs/source/providers/tool_runtime/index.md +++ b/docs/source/providers/tool_runtime/index.md @@ -1,4 +1,4 @@ -# Tool_Runtime +# Tool_Runtime ## Overview diff --git a/docs/source/providers/vector_io/index.md b/docs/source/providers/vector_io/index.md index a7703ae14..28ae523d7 100644 --- a/docs/source/providers/vector_io/index.md +++ b/docs/source/providers/vector_io/index.md @@ -1,4 +1,4 @@ -# Vector_Io +# Vector_Io ## Overview diff --git a/docs/source/providers/vector_io/inline_chromadb.md b/docs/source/providers/vector_io/inline_chromadb.md index 679c82830..518e3f689 100644 --- a/docs/source/providers/vector_io/inline_chromadb.md +++ b/docs/source/providers/vector_io/inline_chromadb.md @@ -41,7 +41,7 @@ See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introducti | Field | Type | Required | Default | Description | |-------|------|----------|---------|-------------| -| `db_path` | `` | No | PydanticUndefined | | +| `db_path` | `` | No | | | | `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| 
utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend | ## Sample Configuration diff --git a/docs/source/providers/vector_io/inline_milvus.md b/docs/source/providers/vector_io/inline_milvus.md index 3b3aad3fc..33ea4d179 100644 --- a/docs/source/providers/vector_io/inline_milvus.md +++ b/docs/source/providers/vector_io/inline_milvus.md @@ -10,7 +10,7 @@ Please refer to the remote provider documentation. | Field | Type | Required | Default | Description | |-------|------|----------|---------|-------------| -| `db_path` | `` | No | PydanticUndefined | | +| `db_path` | `` | No | | | | `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) | | `consistency_level` | `` | No | Strong | The consistency level of the Milvus server | diff --git a/docs/source/providers/vector_io/inline_qdrant.md b/docs/source/providers/vector_io/inline_qdrant.md index e989a3554..b5072d220 100644 --- a/docs/source/providers/vector_io/inline_qdrant.md +++ b/docs/source/providers/vector_io/inline_qdrant.md @@ -50,7 +50,7 @@ See the [Qdrant documentation](https://qdrant.tech/documentation/) for more deta | Field | Type | Required | Default | Description | |-------|------|----------|---------|-------------| -| `path` | `` | No | PydanticUndefined | | +| `path` | `` | No | | | | `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | | ## Sample Configuration diff --git a/docs/source/providers/vector_io/inline_sqlite-vec.md b/docs/source/providers/vector_io/inline_sqlite-vec.md index ae7c45b21..854bb9d08 100644 --- a/docs/source/providers/vector_io/inline_sqlite-vec.md +++ b/docs/source/providers/vector_io/inline_sqlite-vec.md @@ -205,7 +205,7 @@ See [sqlite-vec's GitHub repo](https://github.com/asg017/sqlite-vec/tree/main) f | Field | Type | Required | Default | Description | |-------|------|----------|---------|-------------| -| `db_path` | `` | No | PydanticUndefined | Path to the SQLite database file | +| `db_path` | `` | No | | Path to the SQLite database file | | `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) | ## Sample Configuration diff --git a/docs/source/providers/vector_io/inline_sqlite_vec.md b/docs/source/providers/vector_io/inline_sqlite_vec.md index 7e14bb8bd..7ad8eb252 100644 --- a/docs/source/providers/vector_io/inline_sqlite_vec.md +++ b/docs/source/providers/vector_io/inline_sqlite_vec.md @@ -10,7 +10,7 @@ Please refer to the sqlite-vec provider documentation. 
| Field | Type | Required | Default | Description | |-------|------|----------|---------|-------------| -| `db_path` | `` | No | PydanticUndefined | Path to the SQLite database file | +| `db_path` | `` | No | | Path to the SQLite database file | | `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) | ## Sample Configuration diff --git a/docs/source/providers/vector_io/remote_chromadb.md b/docs/source/providers/vector_io/remote_chromadb.md index 447ea6cd6..badfebe90 100644 --- a/docs/source/providers/vector_io/remote_chromadb.md +++ b/docs/source/providers/vector_io/remote_chromadb.md @@ -40,7 +40,7 @@ See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introducti | Field | Type | Required | Default | Description | |-------|------|----------|---------|-------------| -| `url` | `str \| None` | No | PydanticUndefined | | +| `url` | `str \| None` | No | | | | `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend | ## Sample Configuration diff --git a/docs/source/providers/vector_io/remote_milvus.md b/docs/source/providers/vector_io/remote_milvus.md index 6734d8315..3646f4acc 100644 --- a/docs/source/providers/vector_io/remote_milvus.md +++ b/docs/source/providers/vector_io/remote_milvus.md @@ -111,8 +111,8 @@ For more details on TLS configuration, refer to the [TLS setup guide](https://mi | Field | Type | Required | Default | Description | |-------|------|----------|---------|-------------| -| `uri` | `` | No | PydanticUndefined | The URI of the Milvus server | -| `token` | `str \| None` | No | PydanticUndefined | The token of the Milvus server | +| `uri` | `` | No | | The URI of the Milvus server | +| `token` | `str \| None` | No | | The token of the Milvus server | | `consistency_level` | `` | No | Strong | The consistency level of the Milvus server | | `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend | | `config` | `dict` | No | {} | This configuration allows additional fields to be passed through to the underlying Milvus client. See the [Milvus](https://milvus.io/docs/install-overview.md) documentation for more details about Milvus in general. 
| diff --git a/docs/source/references/llama_cli_reference/download_models.md b/docs/source/references/llama_cli_reference/download_models.md index c44ba7788..e32099023 100644 --- a/docs/source/references/llama_cli_reference/download_models.md +++ b/docs/source/references/llama_cli_reference/download_models.md @@ -19,7 +19,7 @@ You have two ways to install Llama Stack: cd ~/local git clone git@github.com:meta-llama/llama-stack.git - python -m venv myenv + uv venv myenv --python 3.12 source myenv/bin/activate # On Windows: myenv\Scripts\activate cd llama-stack diff --git a/docs/source/references/llama_cli_reference/index.md b/docs/source/references/llama_cli_reference/index.md index fc7751ebf..4ef76fe7d 100644 --- a/docs/source/references/llama_cli_reference/index.md +++ b/docs/source/references/llama_cli_reference/index.md @@ -19,7 +19,7 @@ You have two ways to install Llama Stack: cd ~/local git clone git@github.com:meta-llama/llama-stack.git - python -m venv myenv + uv venv myenv --python 3.12 source myenv/bin/activate # On Windows: myenv\Scripts\activate cd llama-stack diff --git a/llama_stack/apis/common/errors.py b/llama_stack/apis/common/errors.py index bef048191..95d6ac18e 100644 --- a/llama_stack/apis/common/errors.py +++ b/llama_stack/apis/common/errors.py @@ -10,6 +10,16 @@ # 3. All classes should propogate the inherited __init__ function otherwise via 'super().__init__(message)' +class ResourceNotFoundError(ValueError): + """generic exception for a missing Llama Stack resource""" + + def __init__(self, resource_name: str, resource_type: str, client_list: str) -> None: + message = ( + f"{resource_type} '{resource_name}' not found. Use '{client_list}' to list available {resource_type}s." + ) + super().__init__(message) + + class UnsupportedModelError(ValueError): """raised when model is not present in the list of supported models""" @@ -18,38 +28,32 @@ class UnsupportedModelError(ValueError): super().__init__(message) -class ModelNotFoundError(ValueError): +class ModelNotFoundError(ResourceNotFoundError): """raised when Llama Stack cannot find a referenced model""" def __init__(self, model_name: str) -> None: - message = f"Model '{model_name}' not found. Use client.models.list() to list available models." - super().__init__(message) + super().__init__(model_name, "Model", "client.models.list()") -class VectorStoreNotFoundError(ValueError): +class VectorStoreNotFoundError(ResourceNotFoundError): """raised when Llama Stack cannot find a referenced vector store""" def __init__(self, vector_store_name: str) -> None: - message = f"Vector store '{vector_store_name}' not found. Use client.vector_dbs.list() to list available vector stores." - super().__init__(message) + super().__init__(vector_store_name, "Vector Store", "client.vector_dbs.list()") -class DatasetNotFoundError(ValueError): +class DatasetNotFoundError(ResourceNotFoundError): """raised when Llama Stack cannot find a referenced dataset""" def __init__(self, dataset_name: str) -> None: - message = f"Dataset '{dataset_name}' not found. Use client.datasets.list() to list available datasets." - super().__init__(message) + super().__init__(dataset_name, "Dataset", "client.datasets.list()") -class ToolGroupNotFoundError(ValueError): +class ToolGroupNotFoundError(ResourceNotFoundError): """raised when Llama Stack cannot find a referenced tool group""" def __init__(self, toolgroup_name: str) -> None: - message = ( - f"Tool group '{toolgroup_name}' not found. Use client.toolgroups.list() to list available tool groups." 
- ) - super().__init__(message) + super().__init__(toolgroup_name, "Tool Group", "client.toolgroups.list()") class SessionNotFoundError(ValueError): diff --git a/llama_stack/apis/safety/safety.py b/llama_stack/apis/safety/safety.py index 468cfa63a..3f374460b 100644 --- a/llama_stack/apis/safety/safety.py +++ b/llama_stack/apis/safety/safety.py @@ -4,7 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from enum import Enum +from enum import Enum, StrEnum from typing import Any, Protocol, runtime_checkable from pydantic import BaseModel, Field @@ -15,6 +15,71 @@ from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol from llama_stack.schema_utils import json_schema_type, webmethod +# OpenAI Categories to return in the response +class OpenAICategories(StrEnum): + """ + Required set of categories in moderations api response + """ + + VIOLENCE = "violence" + VIOLENCE_GRAPHIC = "violence/graphic" + HARRASMENT = "harassment" + HARRASMENT_THREATENING = "harassment/threatening" + HATE = "hate" + HATE_THREATENING = "hate/threatening" + ILLICIT = "illicit" + ILLICIT_VIOLENT = "illicit/violent" + SEXUAL = "sexual" + SEXUAL_MINORS = "sexual/minors" + SELF_HARM = "self-harm" + SELF_HARM_INTENT = "self-harm/intent" + SELF_HARM_INSTRUCTIONS = "self-harm/instructions" + + +@json_schema_type +class ModerationObjectResults(BaseModel): + """A moderation object. + :param flagged: Whether any of the below categories are flagged. + :param categories: A list of the categories, and whether they are flagged or not. + :param category_applied_input_types: A list of the categories along with the input type(s) that the score applies to. + :param category_scores: A list of the categories along with their scores as predicted by model. + Required set of categories that need to be in response + - violence + - violence/graphic + - harassment + - harassment/threatening + - hate + - hate/threatening + - illicit + - illicit/violent + - sexual + - sexual/minors + - self-harm + - self-harm/intent + - self-harm/instructions + """ + + flagged: bool + categories: dict[str, bool] | None = None + category_applied_input_types: dict[str, list[str]] | None = None + category_scores: dict[str, float] | None = None + user_message: str | None = None + metadata: dict[str, Any] = Field(default_factory=dict) + + +@json_schema_type +class ModerationObject(BaseModel): + """A moderation object. + :param id: The unique identifier for the moderation request. + :param model: The model used to generate the moderation results. + :param results: A list of moderation objects + """ + + id: str + model: str + results: list[ModerationObjectResults] + + @json_schema_type class ViolationLevel(Enum): """Severity level of a safety violation. @@ -82,3 +147,13 @@ class Safety(Protocol): :returns: A RunShieldResponse. """ ... + + @webmethod(route="/openai/v1/moderations", method="POST") + async def run_moderation(self, input: str | list[str], model: str) -> ModerationObject: + """Classifies if text and/or image inputs are potentially harmful. + :param input: Input (or inputs) to classify. + Can be a single string, an array of strings, or an array of multi-modal input objects similar to other models. + :param model: The content moderation model you would like to use. + :returns: A moderation object. + """ + ... 
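The `run_moderation` method above is exposed at the new `/v1/openai/v1/moderations` route, so a client can POST a `RunModerationRequest` body and read back a `ModerationObject`. A minimal sketch, assuming a stack running locally on port 8321 and a hypothetical `"llama-guard"` moderation model id:

```python
# Hedged sketch of calling the moderations route added in this patch. The base URL
# and model id are assumptions; the request/response fields follow the
# RunModerationRequest / ModerationObject schemas defined above.
import requests

resp = requests.post(
    "http://localhost:8321/v1/openai/v1/moderations",
    json={"input": "I want to hurt someone.", "model": "llama-guard"},
    timeout=30,
)
resp.raise_for_status()
moderation = resp.json()

for result in moderation["results"]:
    print("flagged:", result["flagged"])
    # categories / category_scores are keyed by the OpenAICategories values
    # (e.g. "violence", "self-harm/intent") when the provider populates them.
    for category, is_flagged in (result.get("categories") or {}).items():
        score = (result.get("category_scores") or {}).get(category)
        print(f"  {category}: flagged={is_flagged} score={score}")
```

Because the route sits under the OpenAI-compatible `/openai/v1` prefix, an OpenAI SDK pointed at the stack should be able to call it as well; the required category names come straight from the `OpenAICategories` enum above.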
diff --git a/llama_stack/apis/vector_io/vector_io.py b/llama_stack/apis/vector_io/vector_io.py index 3e8065cfb..436a96254 100644 --- a/llama_stack/apis/vector_io/vector_io.py +++ b/llama_stack/apis/vector_io/vector_io.py @@ -226,10 +226,18 @@ class VectorStoreContent(BaseModel): :param type: Content type, currently only "text" is supported :param text: The actual text content + :param embedding: (Optional) Embedding vector for the content, if available + :param created_timestamp: (Optional) Timestamp when the content was created + :param metadata: (Optional) Metadata associated with the content, such as source, author, etc. + :param chunk_metadata: (Optional) Metadata associated with the chunk, such as document ID, source, etc. """ type: Literal["text"] text: str + embedding: list[float] | None = None + created_timestamp: int | None = None + metadata: dict[str, Any] | None = None + chunk_metadata: ChunkMetadata | None = None @json_schema_type diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py index 6152acd57..79ab7c34f 100644 --- a/llama_stack/core/routers/inference.py +++ b/llama_stack/core/routers/inference.py @@ -7,6 +7,7 @@ import asyncio import time from collections.abc import AsyncGenerator, AsyncIterator +from datetime import UTC, datetime from typing import Annotated, Any from openai.types.chat import ChatCompletionToolChoiceOptionParam as OpenAIChatCompletionToolChoiceOptionParam @@ -25,14 +26,21 @@ from llama_stack.apis.inference import ( ChatCompletionResponseEventType, ChatCompletionResponseStreamChunk, CompletionMessage, + CompletionResponse, + CompletionResponseStreamChunk, EmbeddingsResponse, EmbeddingTaskType, Inference, ListOpenAIChatCompletionResponse, LogProbConfig, Message, + OpenAIAssistantMessageParam, OpenAIChatCompletion, OpenAIChatCompletionChunk, + OpenAIChatCompletionToolCall, + OpenAIChatCompletionToolCallFunction, + OpenAIChoice, + OpenAIChoiceLogprobs, OpenAICompletion, OpenAICompletionWithInputMessages, OpenAIEmbeddingsResponse, @@ -55,7 +63,6 @@ from llama_stack.models.llama.llama3.chat_format import ChatFormat from llama_stack.models.llama.llama3.tokenizer import Tokenizer from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable from llama_stack.providers.utils.inference.inference_store import InferenceStore -from llama_stack.providers.utils.inference.stream_utils import stream_and_store_openai_completion from llama_stack.providers.utils.telemetry.tracing import get_current_span logger = get_logger(name=__name__, category="core") @@ -119,6 +126,7 @@ class InferenceRouter(Inference): if span is None: logger.warning("No span found for token usage metrics") return [] + metrics = [ ("prompt_tokens", prompt_tokens), ("completion_tokens", completion_tokens), @@ -132,7 +140,7 @@ class InferenceRouter(Inference): span_id=span.span_id, metric=metric_name, value=value, - timestamp=time.time(), + timestamp=datetime.now(UTC), unit="tokens", attributes={ "model_id": model.model_id, @@ -234,49 +242,26 @@ class InferenceRouter(Inference): prompt_tokens = await self._count_tokens(messages, tool_config.tool_prompt_format) if stream: - - async def stream_generator(): - completion_text = "" - async for chunk in await provider.chat_completion(**params): - if chunk.event.event_type == ChatCompletionResponseEventType.progress: - if chunk.event.delta.type == "text": - completion_text += chunk.event.delta.text - if chunk.event.event_type == ChatCompletionResponseEventType.complete: - completion_tokens = await 
self._count_tokens( - [ - CompletionMessage( - content=completion_text, - stop_reason=StopReason.end_of_turn, - ) - ], - tool_config.tool_prompt_format, - ) - total_tokens = (prompt_tokens or 0) + (completion_tokens or 0) - metrics = await self._compute_and_log_token_usage( - prompt_tokens or 0, - completion_tokens or 0, - total_tokens, - model, - ) - chunk.metrics = metrics if chunk.metrics is None else chunk.metrics + metrics - yield chunk - - return stream_generator() - else: - response = await provider.chat_completion(**params) - completion_tokens = await self._count_tokens( - [response.completion_message], - tool_config.tool_prompt_format, + response_stream = await provider.chat_completion(**params) + return self.stream_tokens_and_compute_metrics( + response=response_stream, + prompt_tokens=prompt_tokens, + model=model, + tool_prompt_format=tool_config.tool_prompt_format, ) - total_tokens = (prompt_tokens or 0) + (completion_tokens or 0) - metrics = await self._compute_and_log_token_usage( - prompt_tokens or 0, - completion_tokens or 0, - total_tokens, - model, - ) - response.metrics = metrics if response.metrics is None else response.metrics + metrics - return response + + response = await provider.chat_completion(**params) + metrics = await self.count_tokens_and_compute_metrics( + response=response, + prompt_tokens=prompt_tokens, + model=model, + tool_prompt_format=tool_config.tool_prompt_format, + ) + # these metrics will show up in the client response. + response.metrics = ( + metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics + ) + return response async def batch_chat_completion( self, @@ -332,39 +317,20 @@ class InferenceRouter(Inference): ) prompt_tokens = await self._count_tokens(content) - + response = await provider.completion(**params) if stream: - - async def stream_generator(): - completion_text = "" - async for chunk in await provider.completion(**params): - if hasattr(chunk, "delta"): - completion_text += chunk.delta - if hasattr(chunk, "stop_reason") and chunk.stop_reason and self.telemetry: - completion_tokens = await self._count_tokens(completion_text) - total_tokens = (prompt_tokens or 0) + (completion_tokens or 0) - metrics = await self._compute_and_log_token_usage( - prompt_tokens or 0, - completion_tokens or 0, - total_tokens, - model, - ) - chunk.metrics = metrics if chunk.metrics is None else chunk.metrics + metrics - yield chunk - - return stream_generator() - else: - response = await provider.completion(**params) - completion_tokens = await self._count_tokens(response.content) - total_tokens = (prompt_tokens or 0) + (completion_tokens or 0) - metrics = await self._compute_and_log_token_usage( - prompt_tokens or 0, - completion_tokens or 0, - total_tokens, - model, + return self.stream_tokens_and_compute_metrics( + response=response, + prompt_tokens=prompt_tokens, + model=model, ) - response.metrics = metrics if response.metrics is None else response.metrics + metrics - return response + + metrics = await self.count_tokens_and_compute_metrics( + response=response, prompt_tokens=prompt_tokens, model=model + ) + response.metrics = metrics if response.metrics is None else response.metrics + metrics + + return response async def batch_completion( self, @@ -457,9 +423,29 @@ class InferenceRouter(Inference): prompt_logprobs=prompt_logprobs, suffix=suffix, ) - provider = await self.routing_table.get_provider_impl(model_obj.identifier) - return await provider.openai_completion(**params) + if stream: + return await 
provider.openai_completion(**params) + # TODO: Metrics do NOT work with openai_completion stream=True due to the fact + # that we do not return an AsyncIterator, our tests expect a stream of chunks we cannot intercept currently. + # response_stream = await provider.openai_completion(**params) + + response = await provider.openai_completion(**params) + if self.telemetry: + metrics = self._construct_metrics( + prompt_tokens=response.usage.prompt_tokens, + completion_tokens=response.usage.completion_tokens, + total_tokens=response.usage.total_tokens, + model=model_obj, + ) + for metric in metrics: + await self.telemetry.log_event(metric) + + # these metrics will show up in the client response. + response.metrics = ( + metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics + ) + return response async def openai_chat_completion( self, @@ -537,18 +523,38 @@ class InferenceRouter(Inference): top_p=top_p, user=user, ) - provider = await self.routing_table.get_provider_impl(model_obj.identifier) if stream: response_stream = await provider.openai_chat_completion(**params) - if self.store: - return stream_and_store_openai_completion(response_stream, model, self.store, messages) - return response_stream - else: - response = await self._nonstream_openai_chat_completion(provider, params) - if self.store: - await self.store.store_chat_completion(response, messages) - return response + + # For streaming, the provider returns AsyncIterator[OpenAIChatCompletionChunk] + # We need to add metrics to each chunk and store the final completion + return self.stream_tokens_and_compute_metrics_openai_chat( + response=response_stream, + model=model_obj, + messages=messages, + ) + + response = await self._nonstream_openai_chat_completion(provider, params) + + # Store the response with the ID that will be returned to the client + if self.store: + await self.store.store_chat_completion(response, messages) + + if self.telemetry: + metrics = self._construct_metrics( + prompt_tokens=response.usage.prompt_tokens, + completion_tokens=response.usage.completion_tokens, + total_tokens=response.usage.total_tokens, + model=model_obj, + ) + for metric in metrics: + await self.telemetry.log_event(metric) + # these metrics will show up in the client response. 
+ response.metrics = ( + metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics + ) + return response async def openai_embeddings( self, @@ -625,3 +631,244 @@ class InferenceRouter(Inference): status=HealthStatus.ERROR, message=f"Health check failed: {str(e)}" ) return health_statuses + + async def stream_tokens_and_compute_metrics( + self, + response, + prompt_tokens, + model, + tool_prompt_format: ToolPromptFormat | None = None, + ) -> AsyncGenerator[ChatCompletionResponseStreamChunk, None] | AsyncGenerator[CompletionResponseStreamChunk, None]: + completion_text = "" + async for chunk in response: + complete = False + if hasattr(chunk, "event"): # only ChatCompletions have .event + if chunk.event.event_type == ChatCompletionResponseEventType.progress: + if chunk.event.delta.type == "text": + completion_text += chunk.event.delta.text + if chunk.event.event_type == ChatCompletionResponseEventType.complete: + complete = True + completion_tokens = await self._count_tokens( + [ + CompletionMessage( + content=completion_text, + stop_reason=StopReason.end_of_turn, + ) + ], + tool_prompt_format=tool_prompt_format, + ) + else: + if hasattr(chunk, "delta"): + completion_text += chunk.delta + if hasattr(chunk, "stop_reason") and chunk.stop_reason and self.telemetry: + complete = True + completion_tokens = await self._count_tokens(completion_text) + # if we are done receiving tokens + if complete: + total_tokens = (prompt_tokens or 0) + (completion_tokens or 0) + + # Create a separate span for streaming completion metrics + if self.telemetry: + # Log metrics in the new span context + completion_metrics = self._construct_metrics( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + model=model, + ) + for metric in completion_metrics: + if metric.metric in [ + "completion_tokens", + "total_tokens", + ]: # Only log completion and total tokens + await self.telemetry.log_event(metric) + + # Return metrics in response + async_metrics = [ + MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics + ] + chunk.metrics = async_metrics if chunk.metrics is None else chunk.metrics + async_metrics + else: + # Fallback if no telemetry + completion_metrics = self._construct_metrics( + prompt_tokens or 0, + completion_tokens or 0, + total_tokens, + model, + ) + async_metrics = [ + MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics + ] + chunk.metrics = async_metrics if chunk.metrics is None else chunk.metrics + async_metrics + yield chunk + + async def count_tokens_and_compute_metrics( + self, + response: ChatCompletionResponse | CompletionResponse, + prompt_tokens, + model, + tool_prompt_format: ToolPromptFormat | None = None, + ): + if isinstance(response, ChatCompletionResponse): + content = [response.completion_message] + else: + content = response.content + completion_tokens = await self._count_tokens(messages=content, tool_prompt_format=tool_prompt_format) + total_tokens = (prompt_tokens or 0) + (completion_tokens or 0) + + # Create a separate span for completion metrics + if self.telemetry: + # Log metrics in the new span context + completion_metrics = self._construct_metrics( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + model=model, + ) + for metric in completion_metrics: + if metric.metric in ["completion_tokens", "total_tokens"]: # Only log completion and total tokens + await 
self.telemetry.log_event(metric) + + # Return metrics in response + return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics] + + # Fallback if no telemetry + metrics = self._construct_metrics( + prompt_tokens or 0, + completion_tokens or 0, + total_tokens, + model, + ) + return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in metrics] + + async def stream_tokens_and_compute_metrics_openai_chat( + self, + response: AsyncIterator[OpenAIChatCompletionChunk], + model: Model, + messages: list[OpenAIMessageParam] | None = None, + ) -> AsyncIterator[OpenAIChatCompletionChunk]: + """Stream OpenAI chat completion chunks, compute metrics, and store the final completion.""" + id = None + created = None + choices_data: dict[int, dict[str, Any]] = {} + + try: + async for chunk in response: + # Skip None chunks + if chunk is None: + continue + + # Capture ID and created timestamp from first chunk + if id is None and chunk.id: + id = chunk.id + if created is None and chunk.created: + created = chunk.created + + # Accumulate choice data for final assembly + if chunk.choices: + for choice_delta in chunk.choices: + idx = choice_delta.index + if idx not in choices_data: + choices_data[idx] = { + "content_parts": [], + "tool_calls_builder": {}, + "finish_reason": None, + "logprobs_content_parts": [], + } + current_choice_data = choices_data[idx] + + if choice_delta.delta: + delta = choice_delta.delta + if delta.content: + current_choice_data["content_parts"].append(delta.content) + if delta.tool_calls: + for tool_call_delta in delta.tool_calls: + tc_idx = tool_call_delta.index + if tc_idx not in current_choice_data["tool_calls_builder"]: + current_choice_data["tool_calls_builder"][tc_idx] = { + "id": None, + "type": "function", + "function_name_parts": [], + "function_arguments_parts": [], + } + builder = current_choice_data["tool_calls_builder"][tc_idx] + if tool_call_delta.id: + builder["id"] = tool_call_delta.id + if tool_call_delta.type: + builder["type"] = tool_call_delta.type + if tool_call_delta.function: + if tool_call_delta.function.name: + builder["function_name_parts"].append(tool_call_delta.function.name) + if tool_call_delta.function.arguments: + builder["function_arguments_parts"].append( + tool_call_delta.function.arguments + ) + if choice_delta.finish_reason: + current_choice_data["finish_reason"] = choice_delta.finish_reason + if choice_delta.logprobs and choice_delta.logprobs.content: + current_choice_data["logprobs_content_parts"].extend(choice_delta.logprobs.content) + + # Compute metrics on final chunk + if chunk.choices and chunk.choices[0].finish_reason: + completion_text = "" + for choice_data in choices_data.values(): + completion_text += "".join(choice_data["content_parts"]) + + # Add metrics to the chunk + if self.telemetry and chunk.usage: + metrics = self._construct_metrics( + prompt_tokens=chunk.usage.prompt_tokens, + completion_tokens=chunk.usage.completion_tokens, + total_tokens=chunk.usage.total_tokens, + model=model, + ) + for metric in metrics: + await self.telemetry.log_event(metric) + + yield chunk + finally: + # Store the final assembled completion + if id and self.store and messages: + assembled_choices: list[OpenAIChoice] = [] + for choice_idx, choice_data in choices_data.items(): + content_str = "".join(choice_data["content_parts"]) + assembled_tool_calls: list[OpenAIChatCompletionToolCall] = [] + if choice_data["tool_calls_builder"]: + for tc_build_data in choice_data["tool_calls_builder"].values(): 
+                            if tc_build_data["id"]:
+                                func_name = "".join(tc_build_data["function_name_parts"])
+                                func_args = "".join(tc_build_data["function_arguments_parts"])
+                                assembled_tool_calls.append(
+                                    OpenAIChatCompletionToolCall(
+                                        id=tc_build_data["id"],
+                                        type=tc_build_data["type"],
+                                        function=OpenAIChatCompletionToolCallFunction(
+                                            name=func_name, arguments=func_args
+                                        ),
+                                    )
+                                )
+                    message = OpenAIAssistantMessageParam(
+                        role="assistant",
+                        content=content_str if content_str else None,
+                        tool_calls=assembled_tool_calls if assembled_tool_calls else None,
+                    )
+                    logprobs_content = choice_data["logprobs_content_parts"]
+                    final_logprobs = OpenAIChoiceLogprobs(content=logprobs_content) if logprobs_content else None
+
+                    assembled_choices.append(
+                        OpenAIChoice(
+                            finish_reason=choice_data["finish_reason"],
+                            index=choice_idx,
+                            message=message,
+                            logprobs=final_logprobs,
+                        )
+                    )
+
+                final_response = OpenAIChatCompletion(
+                    id=id,
+                    choices=assembled_choices,
+                    created=created or int(time.time()),
+                    model=model.identifier,
+                    object="chat.completion",
+                )
+                await self.store.store_chat_completion(final_response, messages)
diff --git a/llama_stack/core/routers/safety.py b/llama_stack/core/routers/safety.py
index f4273c7b5..9bf2b1bac 100644
--- a/llama_stack/core/routers/safety.py
+++ b/llama_stack/core/routers/safety.py
@@ -10,6 +10,7 @@ from llama_stack.apis.inference import (
     Message,
 )
 from llama_stack.apis.safety import RunShieldResponse, Safety
+from llama_stack.apis.safety.safety import ModerationObject, OpenAICategories
 from llama_stack.apis.shields import Shield
 from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import RoutingTable
@@ -60,3 +61,41 @@ class SafetyRouter(Safety):
             messages=messages,
             params=params,
         )
+
+    async def run_moderation(self, input: str | list[str], model: str) -> ModerationObject:
+        async def get_shield_id(self, model: str) -> str:
+            """Return the identifier of the shield whose provider_resource_id matches the given model."""
+            list_shields_response = await self.routing_table.list_shields()
+
+            matches = [s.identifier for s in list_shields_response.data if model == s.provider_resource_id]
+            if not matches:
+                raise ValueError(f"No shield associated with provider_resource id {model}")
+            if len(matches) > 1:
+                raise ValueError(f"Multiple shields associated with provider_resource id {model}")
+            return matches[0]
+
+        shield_id = await get_shield_id(self, model)
+        logger.debug(f"SafetyRouter.run_moderation: {shield_id}")
+        provider = await self.routing_table.get_provider_impl(shield_id)
+
+        response = await provider.run_moderation(
+            input=input,
+            model=model,
+        )
+        self._validate_required_categories_exist(response)
+
+        return response
+
+    def _validate_required_categories_exist(self, response: ModerationObject) -> None:
+        """Validate that the ProviderImpl response contains all of the required OpenAI moderation categories."""
+        required_categories = list(map(str, OpenAICategories))
+
+        categories = response.results[0].categories
+        category_applied_input_types = response.results[0].category_applied_input_types
+        category_scores = response.results[0].category_scores
+
+        for i in [categories, category_applied_input_types, category_scores]:
+            if not set(required_categories).issubset(set(i.keys())):
+                raise ValueError(
+                    f"ProviderImpl response is missing required categories: {set(required_categories) - set(i.keys())}"
+                )
diff --git a/llama_stack/distributions/ci-tests/run.yaml b/llama_stack/distributions/ci-tests/run.yaml
index becec81c6..188c66275 100644
--- 
a/llama_stack/distributions/ci-tests/run.yaml +++ b/llama_stack/distributions/ci-tests/run.yaml @@ -154,6 +154,7 @@ providers: checkpoint_format: huggingface distributed_backend: null device: cpu + dpo_output_dir: ~/.llama/distributions/ci-tests/dpo_output eval: - provider_id: meta-reference provider_type: inline::meta-reference diff --git a/llama_stack/distributions/nvidia/doc_template.md b/llama_stack/distributions/nvidia/doc_template.md index 3884e6b51..56e99e523 100644 --- a/llama_stack/distributions/nvidia/doc_template.md +++ b/llama_stack/distributions/nvidia/doc_template.md @@ -129,7 +129,7 @@ docker run \ If you've set up your local development environment, you can also build the image using your local virtual environment. ```bash -INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct +INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct llama stack build --distro nvidia --image-type venv llama stack run ./run.yaml \ --port 8321 \ diff --git a/llama_stack/distributions/postgres-demo/postgres_demo.py b/llama_stack/distributions/postgres-demo/postgres_demo.py index d3ee4261d..c04cfedfa 100644 --- a/llama_stack/distributions/postgres-demo/postgres_demo.py +++ b/llama_stack/distributions/postgres-demo/postgres_demo.py @@ -123,7 +123,7 @@ def get_distribution_template() -> DistributionTemplate: config=dict( service_name="${env.OTEL_SERVICE_NAME:=\u200b}", sinks="${env.TELEMETRY_SINKS:=console,otel_trace}", - otel_trace_endpoint="${env.OTEL_TRACE_ENDPOINT:=http://localhost:4318/v1/traces}", + otel_exporter_otlp_endpoint="${env.OTEL_EXPORTER_OTLP_ENDPOINT:=http://localhost:4318/v1/traces}", ), ) ], diff --git a/llama_stack/distributions/postgres-demo/run.yaml b/llama_stack/distributions/postgres-demo/run.yaml index 747b7dc53..0cf0e82e6 100644 --- a/llama_stack/distributions/postgres-demo/run.yaml +++ b/llama_stack/distributions/postgres-demo/run.yaml @@ -55,7 +55,7 @@ providers: config: service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" sinks: ${env.TELEMETRY_SINKS:=console,otel_trace} - otel_trace_endpoint: ${env.OTEL_TRACE_ENDPOINT:=http://localhost:4318/v1/traces} + otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=http://localhost:4318/v1/traces} tool_runtime: - provider_id: brave-search provider_type: remote::brave-search diff --git a/llama_stack/distributions/starter/run.yaml b/llama_stack/distributions/starter/run.yaml index d56559ebc..8bd737686 100644 --- a/llama_stack/distributions/starter/run.yaml +++ b/llama_stack/distributions/starter/run.yaml @@ -154,6 +154,7 @@ providers: checkpoint_format: huggingface distributed_backend: null device: cpu + dpo_output_dir: ~/.llama/distributions/starter/dpo_output eval: - provider_id: meta-reference provider_type: inline::meta-reference diff --git a/llama_stack/providers/inline/datasetio/localfs/datasetio.py b/llama_stack/providers/inline/datasetio/localfs/datasetio.py index da71ecb17..e8ebeb30d 100644 --- a/llama_stack/providers/inline/datasetio/localfs/datasetio.py +++ b/llama_stack/providers/inline/datasetio/localfs/datasetio.py @@ -5,8 +5,6 @@ # the root directory of this source tree. 
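The datasetio hunks below move the heavy `pandas` and `datasets` imports from module scope into the methods that actually use them, so importing the provider module stays cheap. A minimal sketch of that deferred-import pattern, using a hypothetical loader function rather than the provider's real classes:

```python
from typing import Any


def load_rows_from_csv(path: str) -> list[dict[str, Any]]:
    # Import pandas only when the function is called, so merely importing this
    # module (e.g. during provider registration) stays fast and does not require
    # pandas to be installed.
    import pandas

    df = pandas.read_csv(path)
    return df.to_dict(orient="records")
```

The trade-off is that a missing dependency now surfaces on first use instead of at import time, which the provider code accepts in exchange for faster startup.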
from typing import Any -import pandas - from llama_stack.apis.common.responses import PaginatedResponse from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Dataset @@ -44,6 +42,8 @@ class PandasDataframeDataset: if self.dataset_def.source.type == "uri": self.df = await get_dataframe_from_uri(self.dataset_def.source.uri) elif self.dataset_def.source.type == "rows": + import pandas + self.df = pandas.DataFrame(self.dataset_def.source.rows) else: raise ValueError(f"Unsupported dataset source type: {self.dataset_def.source.type}") @@ -103,6 +103,8 @@ class LocalFSDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate): return paginate_records(records, start_index, limit) async def append_rows(self, dataset_id: str, rows: list[dict[str, Any]]) -> None: + import pandas + dataset_def = self.dataset_infos[dataset_id] dataset_impl = PandasDataframeDataset(dataset_def) await dataset_impl.load() diff --git a/llama_stack/providers/inline/post_training/huggingface/config.py b/llama_stack/providers/inline/post_training/huggingface/config.py index dae8fcc04..04e286ff0 100644 --- a/llama_stack/providers/inline/post_training/huggingface/config.py +++ b/llama_stack/providers/inline/post_training/huggingface/config.py @@ -71,8 +71,13 @@ class HuggingFacePostTrainingConfig(BaseModel): dpo_beta: float = 0.1 use_reference_model: bool = True dpo_loss_type: Literal["sigmoid", "hinge", "ipo", "kto_pair"] = "sigmoid" - dpo_output_dir: str = "./checkpoints/dpo" + dpo_output_dir: str @classmethod def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]: - return {"checkpoint_format": "huggingface", "distributed_backend": None, "device": "cpu"} + return { + "checkpoint_format": "huggingface", + "distributed_backend": None, + "device": "cpu", + "dpo_output_dir": __distro_dir__ + "/dpo_output", + } diff --git a/llama_stack/providers/inline/post_training/huggingface/post_training.py b/llama_stack/providers/inline/post_training/huggingface/post_training.py index 81622e2b7..22ace1ae0 100644 --- a/llama_stack/providers/inline/post_training/huggingface/post_training.py +++ b/llama_stack/providers/inline/post_training/huggingface/post_training.py @@ -22,15 +22,8 @@ from llama_stack.apis.post_training import ( from llama_stack.providers.inline.post_training.huggingface.config import ( HuggingFacePostTrainingConfig, ) -from llama_stack.providers.inline.post_training.huggingface.recipes.finetune_single_device import ( - HFFinetuningSingleDevice, -) -from llama_stack.providers.inline.post_training.huggingface.recipes.finetune_single_device_dpo import ( - HFDPOAlignmentSingleDevice, -) from llama_stack.providers.utils.scheduler import JobArtifact, Scheduler from llama_stack.providers.utils.scheduler import JobStatus as SchedulerJobStatus -from llama_stack.schema_utils import webmethod class TrainingArtifactType(Enum): @@ -85,6 +78,10 @@ class HuggingFacePostTrainingImpl: algorithm_config: AlgorithmConfig | None = None, ) -> PostTrainingJob: async def handler(on_log_message_cb, on_status_change_cb, on_artifact_collected_cb): + from llama_stack.providers.inline.post_training.huggingface.recipes.finetune_single_device import ( + HFFinetuningSingleDevice, + ) + on_log_message_cb("Starting HF finetuning") recipe = HFFinetuningSingleDevice( @@ -124,6 +121,10 @@ class HuggingFacePostTrainingImpl: logger_config: dict[str, Any], ) -> PostTrainingJob: async def handler(on_log_message_cb, on_status_change_cb, on_artifact_collected_cb): + from 
llama_stack.providers.inline.post_training.huggingface.recipes.finetune_single_device_dpo import ( + HFDPOAlignmentSingleDevice, + ) + on_log_message_cb("Starting HF DPO alignment") recipe = HFDPOAlignmentSingleDevice( @@ -168,7 +169,6 @@ class HuggingFacePostTrainingImpl: data = cls._get_artifacts_metadata_by_type(job, TrainingArtifactType.RESOURCES_STATS.value) return data[0] if data else None - @webmethod(route="/post-training/job/status") async def get_training_job_status(self, job_uuid: str) -> PostTrainingJobStatusResponse | None: job = self._scheduler.get_job(job_uuid) @@ -195,16 +195,13 @@ class HuggingFacePostTrainingImpl: resources_allocated=self._get_resources_allocated(job), ) - @webmethod(route="/post-training/job/cancel") async def cancel_training_job(self, job_uuid: str) -> None: self._scheduler.cancel(job_uuid) - @webmethod(route="/post-training/job/artifacts") async def get_training_job_artifacts(self, job_uuid: str) -> PostTrainingJobArtifactsResponse | None: job = self._scheduler.get_job(job_uuid) return PostTrainingJobArtifactsResponse(job_uuid=job_uuid, checkpoints=self._get_checkpoints(job)) - @webmethod(route="/post-training/jobs", method="GET") async def get_training_jobs(self) -> ListPostTrainingJobsResponse: return ListPostTrainingJobsResponse( data=[PostTrainingJob(job_uuid=job.id) for job in self._scheduler.get_jobs()] diff --git a/llama_stack/providers/inline/post_training/torchtune/post_training.py b/llama_stack/providers/inline/post_training/torchtune/post_training.py index d20e11b11..765f6789d 100644 --- a/llama_stack/providers/inline/post_training/torchtune/post_training.py +++ b/llama_stack/providers/inline/post_training/torchtune/post_training.py @@ -23,12 +23,8 @@ from llama_stack.apis.post_training import ( from llama_stack.providers.inline.post_training.torchtune.config import ( TorchtunePostTrainingConfig, ) -from llama_stack.providers.inline.post_training.torchtune.recipes.lora_finetuning_single_device import ( - LoraFinetuningSingleDevice, -) from llama_stack.providers.utils.scheduler import JobArtifact, Scheduler from llama_stack.providers.utils.scheduler import JobStatus as SchedulerJobStatus -from llama_stack.schema_utils import webmethod class TrainingArtifactType(Enum): @@ -84,6 +80,10 @@ class TorchtunePostTrainingImpl: if isinstance(algorithm_config, LoraFinetuningConfig): async def handler(on_log_message_cb, on_status_change_cb, on_artifact_collected_cb): + from llama_stack.providers.inline.post_training.torchtune.recipes.lora_finetuning_single_device import ( + LoraFinetuningSingleDevice, + ) + on_log_message_cb("Starting Lora finetuning") recipe = LoraFinetuningSingleDevice( @@ -144,7 +144,6 @@ class TorchtunePostTrainingImpl: data = cls._get_artifacts_metadata_by_type(job, TrainingArtifactType.RESOURCES_STATS.value) return data[0] if data else None - @webmethod(route="/post-training/job/status") async def get_training_job_status(self, job_uuid: str) -> PostTrainingJobStatusResponse | None: job = self._scheduler.get_job(job_uuid) @@ -171,11 +170,9 @@ class TorchtunePostTrainingImpl: resources_allocated=self._get_resources_allocated(job), ) - @webmethod(route="/post-training/job/cancel") async def cancel_training_job(self, job_uuid: str) -> None: self._scheduler.cancel(job_uuid) - @webmethod(route="/post-training/job/artifacts") async def get_training_job_artifacts(self, job_uuid: str) -> PostTrainingJobArtifactsResponse | None: job = self._scheduler.get_job(job_uuid) return PostTrainingJobArtifactsResponse(job_uuid=job_uuid, 
checkpoints=self._get_checkpoints(job)) diff --git a/llama_stack/providers/inline/safety/llama_guard/llama_guard.py b/llama_stack/providers/inline/safety/llama_guard/llama_guard.py index 4a7e99e00..f83c39a6a 100644 --- a/llama_stack/providers/inline/safety/llama_guard/llama_guard.py +++ b/llama_stack/providers/inline/safety/llama_guard/llama_guard.py @@ -4,7 +4,9 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +import logging import re +import uuid from string import Template from typing import Any @@ -20,6 +22,7 @@ from llama_stack.apis.safety import ( SafetyViolation, ViolationLevel, ) +from llama_stack.apis.safety.safety import ModerationObject, ModerationObjectResults, OpenAICategories from llama_stack.apis.shields import Shield from llama_stack.core.datatypes import Api from llama_stack.models.llama.datatypes import Role @@ -67,6 +70,31 @@ SAFETY_CATEGORIES_TO_CODE_MAP = { CAT_ELECTIONS: "S13", CAT_CODE_INTERPRETER_ABUSE: "S14", } +SAFETY_CODE_TO_CATEGORIES_MAP = {v: k for k, v in SAFETY_CATEGORIES_TO_CODE_MAP.items()} + +OPENAI_TO_LLAMA_CATEGORIES_MAP = { + OpenAICategories.VIOLENCE: [CAT_VIOLENT_CRIMES], + OpenAICategories.VIOLENCE_GRAPHIC: [CAT_VIOLENT_CRIMES], + OpenAICategories.HARRASMENT: [CAT_CHILD_EXPLOITATION], + OpenAICategories.HARRASMENT_THREATENING: [CAT_VIOLENT_CRIMES, CAT_CHILD_EXPLOITATION], + OpenAICategories.HATE: [CAT_HATE], + OpenAICategories.HATE_THREATENING: [CAT_HATE, CAT_VIOLENT_CRIMES], + OpenAICategories.ILLICIT: [CAT_NON_VIOLENT_CRIMES], + OpenAICategories.ILLICIT_VIOLENT: [CAT_VIOLENT_CRIMES, CAT_INDISCRIMINATE_WEAPONS], + OpenAICategories.SEXUAL: [CAT_SEX_CRIMES, CAT_SEXUAL_CONTENT], + OpenAICategories.SEXUAL_MINORS: [CAT_CHILD_EXPLOITATION], + OpenAICategories.SELF_HARM: [CAT_SELF_HARM], + OpenAICategories.SELF_HARM_INTENT: [CAT_SELF_HARM], + OpenAICategories.SELF_HARM_INSTRUCTIONS: [CAT_SELF_HARM, CAT_SPECIALIZED_ADVICE], + # These are custom categories that are not in the OpenAI moderation categories + "custom/defamation": [CAT_DEFAMATION], + "custom/specialized_advice": [CAT_SPECIALIZED_ADVICE], + "custom/privacy_violation": [CAT_PRIVACY], + "custom/intellectual_property": [CAT_INTELLECTUAL_PROPERTY], + "custom/weapons": [CAT_INDISCRIMINATE_WEAPONS], + "custom/elections": [CAT_ELECTIONS], + "custom/code_interpreter_abuse": [CAT_CODE_INTERPRETER_ABUSE], +} DEFAULT_LG_V3_SAFETY_CATEGORIES = [ @@ -194,6 +222,34 @@ class LlamaGuardSafetyImpl(Safety, ShieldsProtocolPrivate): return await impl.run(messages) + async def run_moderation(self, input: str | list[str], model: str) -> ModerationObject: + if isinstance(input, list): + messages = input.copy() + else: + messages = [input] + + # convert to user messages format with role + messages = [UserMessage(content=m) for m in messages] + + # Determine safety categories based on the model type + # For known Llama Guard models, use specific categories + if model in LLAMA_GUARD_MODEL_IDS: + # Use the mapped model for categories but the original model_id for inference + mapped_model = LLAMA_GUARD_MODEL_IDS[model] + safety_categories = MODEL_TO_SAFETY_CATEGORIES_MAP.get(mapped_model, DEFAULT_LG_V3_SAFETY_CATEGORIES) + else: + # For unknown models, use default Llama Guard 3 8B categories + safety_categories = DEFAULT_LG_V3_SAFETY_CATEGORIES + [CAT_CODE_INTERPRETER_ABUSE] + + impl = LlamaGuardShield( + model=model, + inference_api=self.inference_api, + excluded_categories=self.config.excluded_categories, + safety_categories=safety_categories, + 
) + + return await impl.run_moderation(messages) + class LlamaGuardShield: def __init__( @@ -340,3 +396,117 @@ class LlamaGuardShield: ) raise ValueError(f"Unexpected response: {response}") + + async def run_moderation(self, messages: list[Message]) -> ModerationObject: + if not messages: + return self.create_moderation_object(self.model) + + # TODO: Add Image based support for OpenAI Moderations + shield_input_message = self.build_text_shield_input(messages) + + response = await self.inference_api.openai_chat_completion( + model=self.model, + messages=[shield_input_message], + stream=False, + ) + content = response.choices[0].message.content + content = content.strip() + return self.get_moderation_object(content) + + def create_moderation_object(self, model: str, unsafe_code: str | None = None) -> ModerationObject: + """Create a ModerationObject for either safe or unsafe content. + + Args: + model: The model name + unsafe_code: Optional comma-separated list of safety codes. If None, creates safe object. + + Returns: + ModerationObject with appropriate configuration + """ + # Set default values for safe case + categories = dict.fromkeys(OPENAI_TO_LLAMA_CATEGORIES_MAP.keys(), False) + category_scores = dict.fromkeys(OPENAI_TO_LLAMA_CATEGORIES_MAP.keys(), 1.0) + category_applied_input_types = {key: [] for key in OPENAI_TO_LLAMA_CATEGORIES_MAP.keys()} + flagged = False + user_message = None + metadata = {} + + # Handle unsafe case + if unsafe_code: + unsafe_code_list = [code.strip() for code in unsafe_code.split(",")] + invalid_codes = [code for code in unsafe_code_list if code not in SAFETY_CODE_TO_CATEGORIES_MAP] + if invalid_codes: + logging.warning(f"Invalid safety codes returned: {invalid_codes}") + # just returning safe object, as we don't know what the invalid codes can map to + return ModerationObject( + id=f"modr-{uuid.uuid4()}", + model=model, + results=[ + ModerationObjectResults( + flagged=flagged, + categories=categories, + category_applied_input_types=category_applied_input_types, + category_scores=category_scores, + user_message=user_message, + metadata=metadata, + ) + ], + ) + + # Get OpenAI categories for the unsafe codes + openai_categories = [] + for code in unsafe_code_list: + llama_guard_category = SAFETY_CODE_TO_CATEGORIES_MAP[code] + openai_categories.extend( + k for k, v_l in OPENAI_TO_LLAMA_CATEGORIES_MAP.items() if llama_guard_category in v_l + ) + + # Update categories for unsafe content + categories = {k: k in openai_categories for k in OPENAI_TO_LLAMA_CATEGORIES_MAP} + category_scores = {k: 1.0 if k in openai_categories else 0.0 for k in OPENAI_TO_LLAMA_CATEGORIES_MAP} + category_applied_input_types = { + k: ["text"] if k in openai_categories else [] for k in OPENAI_TO_LLAMA_CATEGORIES_MAP + } + flagged = True + user_message = CANNED_RESPONSE_TEXT + metadata = {"violation_type": unsafe_code_list} + + return ModerationObject( + id=f"modr-{uuid.uuid4()}", + model=model, + results=[ + ModerationObjectResults( + flagged=flagged, + categories=categories, + category_applied_input_types=category_applied_input_types, + category_scores=category_scores, + user_message=user_message, + metadata=metadata, + ) + ], + ) + + def is_content_safe(self, response: str, unsafe_code: str | None = None) -> bool: + """Check if content is safe based on response and unsafe code.""" + if response.strip() == SAFE_RESPONSE: + return True + + if unsafe_code: + unsafe_code_list = unsafe_code.split(",") + if set(unsafe_code_list).issubset(set(self.excluded_categories)): + return True + + return 
False + + def get_moderation_object(self, response: str) -> ModerationObject: + response = response.strip() + if self.is_content_safe(response): + return self.create_moderation_object(self.model) + unsafe_code = self.check_unsafe_response(response) + if not unsafe_code: + raise ValueError(f"Unexpected response: {response}") + + if self.is_content_safe(response, unsafe_code): + return self.create_moderation_object(self.model) + else: + return self.create_moderation_object(self.model, unsafe_code) diff --git a/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py b/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py index b4c77437d..78e49af94 100644 --- a/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py +++ b/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py @@ -28,9 +28,6 @@ class ConsoleSpanProcessor(SpanProcessor): logger.info(f"[dim]{timestamp}[/dim] [bold magenta][START][/bold magenta] [dim]{span.name}[/dim]") def on_end(self, span: ReadableSpan) -> None: - if span.attributes and span.attributes.get("__autotraced__"): - return - timestamp = datetime.fromtimestamp(span.end_time / 1e9, tz=UTC).strftime("%H:%M:%S.%f")[:-3] span_context = f"[dim]{timestamp}[/dim] [bold magenta][END][/bold magenta] [dim]{span.name}[/dim]" if span.status.status_code == StatusCode.ERROR: @@ -67,7 +64,7 @@ class ConsoleSpanProcessor(SpanProcessor): for key, value in event.attributes.items(): if key.startswith("__") or key in ["message", "severity"]: continue - logger.info(f"/r[dim]{key}[/dim]: {value}") + logger.info(f"[dim]{key}[/dim]: {value}") def shutdown(self) -> None: """Shutdown the processor.""" diff --git a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py index 623267172..d99255c79 100644 --- a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py +++ b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py @@ -4,10 +4,13 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
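The moderation path above turns a comma-separated list of Llama Guard safety codes (e.g. `"S1,S13"`) into OpenAI-style per-category flags by fanning each code out through `SAFETY_CODE_TO_CATEGORIES_MAP` and `OPENAI_TO_LLAMA_CATEGORIES_MAP`. A rough sketch of that fan-out, using toy stand-in mappings rather than the real tables:

```python
# Toy stand-ins for the code -> Llama Guard category and
# OpenAI category -> Llama Guard categories tables above.
CODE_TO_LLAMA = {"S1": "Violent Crimes", "S13": "Elections"}
OPENAI_TO_LLAMA = {
    "violence": ["Violent Crimes"],
    "illicit": ["Non-Violent Crimes"],
    "custom/elections": ["Elections"],
}


def flag_categories(unsafe_code: str) -> dict[str, bool]:
    """Map comma-separated safety codes onto OpenAI-style boolean category flags."""
    codes = [c.strip() for c in unsafe_code.split(",")]
    llama_categories = {CODE_TO_LLAMA[c] for c in codes if c in CODE_TO_LLAMA}
    return {
        openai_cat: any(cat in llama_categories for cat in llama_cats)
        for openai_cat, llama_cats in OPENAI_TO_LLAMA.items()
    }


print(flag_categories("S1,S13"))
# {'violence': True, 'illicit': False, 'custom/elections': True}
```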
+import logging import threading from typing import Any from opentelemetry import metrics, trace + +logger = logging.getLogger(__name__) from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter from opentelemetry.sdk.metrics import MeterProvider @@ -110,7 +113,7 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry): if TelemetrySink.SQLITE in self.config.sinks: trace.get_tracer_provider().add_span_processor(SQLiteSpanProcessor(self.config.sqlite_db_path)) if TelemetrySink.CONSOLE in self.config.sinks: - trace.get_tracer_provider().add_span_processor(ConsoleSpanProcessor()) + trace.get_tracer_provider().add_span_processor(ConsoleSpanProcessor(print_attributes=True)) if TelemetrySink.OTEL_METRIC in self.config.sinks: self.meter = metrics.get_meter(__name__) @@ -126,9 +129,11 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry): trace.get_tracer_provider().force_flush() async def log_event(self, event: Event, ttl_seconds: int = 604800) -> None: + logger.debug(f"DEBUG: log_event called with event type: {type(event).__name__}") if isinstance(event, UnstructuredLogEvent): self._log_unstructured(event, ttl_seconds) elif isinstance(event, MetricEvent): + logger.debug("DEBUG: Routing MetricEvent to _log_metric") self._log_metric(event) elif isinstance(event, StructuredLogEvent): self._log_structured(event, ttl_seconds) @@ -188,6 +193,38 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry): return _GLOBAL_STORAGE["gauges"][name] def _log_metric(self, event: MetricEvent) -> None: + # Always log to console if console sink is enabled (debug) + if TelemetrySink.CONSOLE in self.config.sinks: + logger.debug(f"METRIC: {event.metric}={event.value} {event.unit} {event.attributes}") + + # Add metric as an event to the current span + try: + with self._lock: + # Only try to add to span if we have a valid span_id + if event.span_id: + try: + span_id = int(event.span_id, 16) + span = _GLOBAL_STORAGE["active_spans"].get(span_id) + + if span: + timestamp_ns = int(event.timestamp.timestamp() * 1e9) + span.add_event( + name=f"metric.{event.metric}", + attributes={ + "value": event.value, + "unit": event.unit, + **(event.attributes or {}), + }, + timestamp=timestamp_ns, + ) + except (ValueError, KeyError): + # Invalid span_id or span not found, but we already logged to console above + pass + except Exception: + # Lock acquisition failed + logger.debug("Failed to acquire lock to add metric to span") + + # Log to OpenTelemetry meter if available if self.meter is None: return if isinstance(event.value, int): diff --git a/llama_stack/providers/remote/datasetio/huggingface/huggingface.py b/llama_stack/providers/remote/datasetio/huggingface/huggingface.py index fafd1d8ff..a34e354bf 100644 --- a/llama_stack/providers/remote/datasetio/huggingface/huggingface.py +++ b/llama_stack/providers/remote/datasetio/huggingface/huggingface.py @@ -6,8 +6,6 @@ from typing import Any from urllib.parse import parse_qs, urlparse -import datasets as hf_datasets - from llama_stack.apis.common.responses import PaginatedResponse from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Dataset @@ -73,6 +71,8 @@ class HuggingfaceDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate): start_index: int | None = None, limit: int | None = None, ) -> PaginatedResponse: + import datasets as hf_datasets + dataset_def = self.dataset_infos[dataset_id] path, params = parse_hf_params(dataset_def) 
loaded_dataset = hf_datasets.load_dataset(path, **params) @@ -81,6 +81,8 @@ class HuggingfaceDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate): return paginate_records(records, start_index, limit) async def append_rows(self, dataset_id: str, rows: list[dict[str, Any]]) -> None: + import datasets as hf_datasets + dataset_def = self.dataset_infos[dataset_id] path, params = parse_hf_params(dataset_def) loaded_dataset = hf_datasets.load_dataset(path, **params) diff --git a/llama_stack/providers/remote/inference/gemini/models.py b/llama_stack/providers/remote/inference/gemini/models.py index 6fda35e0f..bd696b0ac 100644 --- a/llama_stack/providers/remote/inference/gemini/models.py +++ b/llama_stack/providers/remote/inference/gemini/models.py @@ -13,7 +13,9 @@ LLM_MODEL_IDS = [ "gemini-1.5-flash", "gemini-1.5-pro", "gemini-2.0-flash", + "gemini-2.0-flash-lite", "gemini-2.5-flash", + "gemini-2.5-flash-lite", "gemini-2.5-pro", ] diff --git a/llama_stack/providers/remote/inference/nvidia/NVIDIA.md b/llama_stack/providers/remote/inference/nvidia/NVIDIA.md index 2505718e0..4a072215c 100644 --- a/llama_stack/providers/remote/inference/nvidia/NVIDIA.md +++ b/llama_stack/providers/remote/inference/nvidia/NVIDIA.md @@ -42,8 +42,8 @@ client.initialize() ### Create Completion ```python -response = client.completion( - model_id="meta-llama/Llama-3.1-8b-Instruct", +response = client.inference.completion( + model_id="meta-llama/Llama-3.1-8B-Instruct", content="Complete the sentence using one word: Roses are red, violets are :", stream=False, sampling_params={ @@ -56,8 +56,8 @@ print(f"Response: {response.content}") ### Create Chat Completion ```python -response = client.chat_completion( - model_id="meta-llama/Llama-3.1-8b-Instruct", +response = client.inference.chat_completion( + model_id="meta-llama/Llama-3.1-8B-Instruct", messages=[ { "role": "system", @@ -78,8 +78,10 @@ print(f"Response: {response.completion_message.content}") ### Create Embeddings ```python -response = client.embeddings( - model_id="meta-llama/Llama-3.1-8b-Instruct", contents=["foo", "bar", "baz"] +response = client.inference.embeddings( + model_id="nvidia/llama-3.2-nv-embedqa-1b-v2", + contents=["What is the capital of France?"], + task_type="query", ) print(f"Embeddings: {response.embeddings}") -``` +``` \ No newline at end of file diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py index 098e4d324..26b4dec76 100644 --- a/llama_stack/providers/remote/inference/ollama/ollama.py +++ b/llama_stack/providers/remote/inference/ollama/ollama.py @@ -112,7 +112,8 @@ class OllamaInferenceAdapter( @property def openai_client(self) -> AsyncOpenAI: if self._openai_client is None: - self._openai_client = AsyncOpenAI(base_url=f"{self.config.url}/v1", api_key="ollama") + url = self.config.url.rstrip("/") + self._openai_client = AsyncOpenAI(base_url=f"{url}/v1", api_key="ollama") return self._openai_client async def initialize(self) -> None: diff --git a/llama_stack/providers/remote/vector_io/milvus/milvus.py b/llama_stack/providers/remote/vector_io/milvus/milvus.py index db58bf6d3..b09edb65c 100644 --- a/llama_stack/providers/remote/vector_io/milvus/milvus.py +++ b/llama_stack/providers/remote/vector_io/milvus/milvus.py @@ -10,7 +10,7 @@ import os from typing import Any from numpy.typing import NDArray -from pymilvus import DataType, Function, FunctionType, MilvusClient +from pymilvus import AnnSearchRequest, DataType, Function, FunctionType, MilvusClient, RRFRanker, 
WeightedRanker from llama_stack.apis.common.errors import VectorStoreNotFoundError from llama_stack.apis.files.files import Files @@ -27,6 +27,7 @@ from llama_stack.providers.utils.kvstore import kvstore_impl from llama_stack.providers.utils.kvstore.api import KVStore from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin from llama_stack.providers.utils.memory.vector_store import ( + RERANKER_TYPE_WEIGHTED, EmbeddingIndex, VectorDBWithIndex, ) @@ -238,7 +239,53 @@ class MilvusIndex(EmbeddingIndex): reranker_type: str, reranker_params: dict[str, Any] | None = None, ) -> QueryChunksResponse: - raise NotImplementedError("Hybrid search is not supported in Milvus") + """ + Hybrid search using Milvus's native hybrid search capabilities. + + This implementation uses Milvus's hybrid_search method which combines + vector search and BM25 search with configurable reranking strategies. + """ + search_requests = [] + + # nprobe: Controls search accuracy vs performance trade-off + # 10 balances these trade-offs for RAG applications + search_requests.append( + AnnSearchRequest(data=[embedding.tolist()], anns_field="vector", param={"nprobe": 10}, limit=k) + ) + + # drop_ratio_search: Filters low-importance terms to improve search performance + # 0.2 balances noise reduction with recall + search_requests.append( + AnnSearchRequest(data=[query_string], anns_field="sparse", param={"drop_ratio_search": 0.2}, limit=k) + ) + + if reranker_type == RERANKER_TYPE_WEIGHTED: + alpha = (reranker_params or {}).get("alpha", 0.5) + rerank = WeightedRanker(alpha, 1 - alpha) + else: + impact_factor = (reranker_params or {}).get("impact_factor", 60.0) + rerank = RRFRanker(impact_factor) + + search_res = await asyncio.to_thread( + self.client.hybrid_search, + collection_name=self.collection_name, + reqs=search_requests, + ranker=rerank, + limit=k, + output_fields=["chunk_content"], + ) + + chunks = [] + scores = [] + for res in search_res[0]: + chunk = Chunk(**res["entity"]["chunk_content"]) + chunks.append(chunk) + scores.append(res["distance"]) + + filtered_chunks = [chunk for chunk, score in zip(chunks, scores, strict=False) if score >= score_threshold] + filtered_scores = [score for score in scores if score >= score_threshold] + + return QueryChunksResponse(chunks=filtered_chunks, scores=filtered_scores) async def delete_chunk(self, chunk_id: str) -> None: """Remove a chunk from the Milvus collection.""" diff --git a/llama_stack/providers/utils/datasetio/url_utils.py b/llama_stack/providers/utils/datasetio/url_utils.py index 386ee736d..77b047e2d 100644 --- a/llama_stack/providers/utils/datasetio/url_utils.py +++ b/llama_stack/providers/utils/datasetio/url_utils.py @@ -9,12 +9,12 @@ import base64 import io from urllib.parse import unquote -import pandas - from llama_stack.providers.utils.memory.vector_store import parse_data_url async def get_dataframe_from_uri(uri: str): + import pandas + df = None if uri.endswith(".csv"): # Moving to its own thread to avoid io from blocking the eventloop diff --git a/llama_stack/providers/utils/inference/stream_utils.py b/llama_stack/providers/utils/inference/stream_utils.py deleted file mode 100644 index bbfac13a3..000000000 --- a/llama_stack/providers/utils/inference/stream_utils.py +++ /dev/null @@ -1,129 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
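The Milvus hybrid search above fuses the dense-vector and BM25 result lists with either a `WeightedRanker(alpha, 1 - alpha)` or an `RRFRanker` whose impact factor defaults to 60. For reference, a small pure-Python illustration of reciprocal rank fusion itself, independent of pymilvus:

```python
def rrf_fuse(rankings: list[list[str]], k: float = 60.0) -> dict[str, float]:
    """Reciprocal rank fusion: score(d) = sum over rankings of 1 / (k + rank(d))."""
    scores: dict[str, float] = {}
    for ranking in rankings:
        for rank, doc_id in enumerate(ranking, start=1):
            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank)
    return dict(sorted(scores.items(), key=lambda kv: kv[1], reverse=True))


# Chunk "b" is ranked highly by both the vector search and the keyword search,
# so it comes out on top after fusion.
print(rrf_fuse([["a", "b", "c"], ["b", "d", "a"]]))
```

A larger impact factor flattens the differences between ranks, which is why 60 is a common, fairly conservative default.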
- -from collections.abc import AsyncIterator -from datetime import UTC, datetime -from typing import Any - -from llama_stack.apis.inference import ( - OpenAIAssistantMessageParam, - OpenAIChatCompletion, - OpenAIChatCompletionChunk, - OpenAIChatCompletionToolCall, - OpenAIChatCompletionToolCallFunction, - OpenAIChoice, - OpenAIChoiceLogprobs, - OpenAIMessageParam, -) -from llama_stack.providers.utils.inference.inference_store import InferenceStore - - -async def stream_and_store_openai_completion( - provider_stream: AsyncIterator[OpenAIChatCompletionChunk], - model: str, - store: InferenceStore, - input_messages: list[OpenAIMessageParam], -) -> AsyncIterator[OpenAIChatCompletionChunk]: - """ - Wraps a provider's stream, yields chunks, and stores the full completion at the end. - """ - id = None - created = None - choices_data: dict[int, dict[str, Any]] = {} - - try: - async for chunk in provider_stream: - if id is None and chunk.id: - id = chunk.id - if created is None and chunk.created: - created = chunk.created - - if chunk.choices: - for choice_delta in chunk.choices: - idx = choice_delta.index - if idx not in choices_data: - choices_data[idx] = { - "content_parts": [], - "tool_calls_builder": {}, - "finish_reason": None, - "logprobs_content_parts": [], - } - current_choice_data = choices_data[idx] - - if choice_delta.delta: - delta = choice_delta.delta - if delta.content: - current_choice_data["content_parts"].append(delta.content) - if delta.tool_calls: - for tool_call_delta in delta.tool_calls: - tc_idx = tool_call_delta.index - if tc_idx not in current_choice_data["tool_calls_builder"]: - # Initialize with correct structure for _ToolCallBuilderData - current_choice_data["tool_calls_builder"][tc_idx] = { - "id": None, - "type": "function", - "function_name_parts": [], - "function_arguments_parts": [], - } - builder = current_choice_data["tool_calls_builder"][tc_idx] - if tool_call_delta.id: - builder["id"] = tool_call_delta.id - if tool_call_delta.type: - builder["type"] = tool_call_delta.type - if tool_call_delta.function: - if tool_call_delta.function.name: - builder["function_name_parts"].append(tool_call_delta.function.name) - if tool_call_delta.function.arguments: - builder["function_arguments_parts"].append(tool_call_delta.function.arguments) - if choice_delta.finish_reason: - current_choice_data["finish_reason"] = choice_delta.finish_reason - if choice_delta.logprobs and choice_delta.logprobs.content: - # Ensure that we are extending with the correct type - current_choice_data["logprobs_content_parts"].extend(choice_delta.logprobs.content) - yield chunk - finally: - if id: - assembled_choices: list[OpenAIChoice] = [] - for choice_idx, choice_data in choices_data.items(): - content_str = "".join(choice_data["content_parts"]) - assembled_tool_calls: list[OpenAIChatCompletionToolCall] = [] - if choice_data["tool_calls_builder"]: - for tc_build_data in choice_data["tool_calls_builder"].values(): - if tc_build_data["id"]: - func_name = "".join(tc_build_data["function_name_parts"]) - func_args = "".join(tc_build_data["function_arguments_parts"]) - assembled_tool_calls.append( - OpenAIChatCompletionToolCall( - id=tc_build_data["id"], - type=tc_build_data["type"], # No or "function" needed, already set - function=OpenAIChatCompletionToolCallFunction(name=func_name, arguments=func_args), - ) - ) - message = OpenAIAssistantMessageParam( - role="assistant", - content=content_str if content_str else None, - tool_calls=assembled_tool_calls if assembled_tool_calls else None, - ) - 
logprobs_content = choice_data["logprobs_content_parts"] - final_logprobs = OpenAIChoiceLogprobs(content=logprobs_content) if logprobs_content else None - - assembled_choices.append( - OpenAIChoice( - finish_reason=choice_data["finish_reason"], - index=choice_idx, - message=message, - logprobs=final_logprobs, - ) - ) - - final_response = OpenAIChatCompletion( - id=id, - choices=assembled_choices, - created=created or int(datetime.now(UTC).timestamp()), - model=model, - object="chat.completion", - ) - await store.store_chat_completion(final_response, input_messages) diff --git a/llama_stack/providers/utils/memory/openai_vector_store_mixin.py b/llama_stack/providers/utils/memory/openai_vector_store_mixin.py index 7b6e69df1..be010f72f 100644 --- a/llama_stack/providers/utils/memory/openai_vector_store_mixin.py +++ b/llama_stack/providers/utils/memory/openai_vector_store_mixin.py @@ -18,6 +18,7 @@ from llama_stack.apis.files import Files, OpenAIFileObject from llama_stack.apis.vector_dbs import VectorDB from llama_stack.apis.vector_io import ( Chunk, + ChunkMetadata, QueryChunksResponse, SearchRankingOptions, VectorStoreChunkingStrategy, @@ -516,31 +517,68 @@ class OpenAIVectorStoreMixin(ABC): raise ValueError(f"Unsupported filter type: {filter_type}") def _chunk_to_vector_store_content(self, chunk: Chunk) -> list[VectorStoreContent]: + created_ts = None + if chunk.chunk_metadata is not None: + created_ts = getattr(chunk.chunk_metadata, "created_timestamp", None) + + metadata_dict = {} + if chunk.chunk_metadata: + if hasattr(chunk.chunk_metadata, "model_dump"): + metadata_dict = chunk.chunk_metadata.model_dump() + else: + metadata_dict = vars(chunk.chunk_metadata) + + user_metadata = chunk.metadata or {} + base_meta = {**metadata_dict, **user_metadata} + # content is InterleavedContent if isinstance(chunk.content, str): content = [ VectorStoreContent( type="text", text=chunk.content, + embedding=chunk.embedding, + created_timestamp=created_ts, + metadata=user_metadata, + chunk_metadata=ChunkMetadata(**base_meta) if base_meta else None, ) ] elif isinstance(chunk.content, list): # TODO: Add support for other types of content - content = [ - VectorStoreContent( - type="text", - text=item.text, - ) - for item in chunk.content - if item.type == "text" - ] + content = [] + for item in chunk.content: + if hasattr(item, "type") and item.type == "text": + item_meta = {**base_meta} + item_user_meta = getattr(item, "metadata", {}) or {} + if item_user_meta: + item_meta.update(item_user_meta) + + content.append( + VectorStoreContent( + type="text", + text=item.text, + embedding=getattr(item, "embedding", None), + created_timestamp=created_ts, + metadata=item_user_meta, + chunk_metadata=ChunkMetadata(**item_meta) if item_meta else None, + ) + ) else: - if chunk.content.type != "text": - raise ValueError(f"Unsupported content type: {chunk.content.type}") + content_item = chunk.content + if content_item.type != "text": + raise ValueError(f"Unsupported content type: {content_item.type}") + + item_user_meta = getattr(content_item, "metadata", {}) or {} + combined_meta = {**base_meta, **item_user_meta} + content = [ VectorStoreContent( type="text", - text=chunk.content.text, + text=content_item.text, + embedding=getattr(content_item, "embedding", None), + created_timestamp=created_ts, + metadata=item_user_meta, + chunk_metadata=ChunkMetadata(**combined_meta) if combined_meta else None, ) ] return content diff --git a/llama_stack/providers/utils/memory/vector_store.py 
b/llama_stack/providers/utils/memory/vector_store.py index 484475e9d..bb9002f30 100644 --- a/llama_stack/providers/utils/memory/vector_store.py +++ b/llama_stack/providers/utils/memory/vector_store.py @@ -302,23 +302,25 @@ class VectorDBWithIndex: mode = params.get("mode") score_threshold = params.get("score_threshold", 0.0) - # Get ranker configuration ranker = params.get("ranker") if ranker is None: - # Default to RRF with impact_factor=60.0 reranker_type = RERANKER_TYPE_RRF reranker_params = {"impact_factor": 60.0} else: - reranker_type = ranker.type - reranker_params = ( - {"impact_factor": ranker.impact_factor} if ranker.type == RERANKER_TYPE_RRF else {"alpha": ranker.alpha} - ) + strategy = ranker.get("strategy", "rrf") + if strategy == "weighted": + weights = ranker.get("params", {}).get("weights", [0.5, 0.5]) + reranker_type = RERANKER_TYPE_WEIGHTED + reranker_params = {"alpha": weights[0] if len(weights) > 0 else 0.5} + else: + reranker_type = RERANKER_TYPE_RRF + k_value = ranker.get("params", {}).get("k", 60.0) + reranker_params = {"impact_factor": k_value} query_string = interleaved_content_as_str(query) if mode == "keyword": return await self.index.query_keyword(query_string, k, score_threshold) - # Calculate embeddings for both vector and hybrid modes embeddings_response = await self.inference_api.embeddings(self.vector_db.embedding_model, [query_string]) query_vector = np.array(embeddings_response.embeddings[0], dtype=np.float32) if mode == "hybrid": diff --git a/llama_stack/providers/utils/telemetry/tracing.py b/llama_stack/providers/utils/telemetry/tracing.py index c85722bdc..7080e774a 100644 --- a/llama_stack/providers/utils/telemetry/tracing.py +++ b/llama_stack/providers/utils/telemetry/tracing.py @@ -9,7 +9,9 @@ import contextvars import logging import queue import random +import sys import threading +import time from collections.abc import Callable from datetime import UTC, datetime from functools import wraps @@ -30,6 +32,16 @@ from llama_stack.providers.utils.telemetry.trace_protocol import serialize_value logger = get_logger(__name__, category="core") +# Fallback logger that does NOT propagate to TelemetryHandler to avoid recursion +_fallback_logger = logging.getLogger("llama_stack.telemetry.background") +if not _fallback_logger.handlers: + _fallback_logger.propagate = False + _fallback_logger.setLevel(logging.ERROR) + _fallback_handler = logging.StreamHandler(sys.stderr) + _fallback_handler.setLevel(logging.ERROR) + _fallback_handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(name)s: %(message)s")) + _fallback_logger.addHandler(_fallback_handler) + INVALID_SPAN_ID = 0x0000000000000000 INVALID_TRACE_ID = 0x00000000000000000000000000000000 @@ -79,19 +91,32 @@ def generate_trace_id() -> str: CURRENT_TRACE_CONTEXT = contextvars.ContextVar("trace_context", default=None) BACKGROUND_LOGGER = None +LOG_QUEUE_FULL_LOG_INTERVAL_SECONDS = 60.0 + class BackgroundLogger: - def __init__(self, api: Telemetry, capacity: int = 1000): + def __init__(self, api: Telemetry, capacity: int = 100000): self.api = api - self.log_queue = queue.Queue(maxsize=capacity) + self.log_queue: queue.Queue[Any] = queue.Queue(maxsize=capacity) self.worker_thread = threading.Thread(target=self._process_logs, daemon=True) self.worker_thread.start() + self._last_queue_full_log_time: float = 0.0 + self._dropped_since_last_notice: int = 0 def log_event(self, event): try: self.log_queue.put_nowait(event) except queue.Full: - logger.error("Log queue is full, dropping event") + # Aggregate 
drops and emit at most once per interval via fallback logger + self._dropped_since_last_notice += 1 + current_time = time.time() + if current_time - self._last_queue_full_log_time >= LOG_QUEUE_FULL_LOG_INTERVAL_SECONDS: + _fallback_logger.error( + "Log queue is full; dropped %d events since last notice", + self._dropped_since_last_notice, + ) + self._last_queue_full_log_time = current_time + self._dropped_since_last_notice = 0 def _process_logs(self): while True: diff --git a/llama_stack/ui/app/logs/vector-stores/[id]/files/[fileId]/contents/[contentId]/page.tsx b/llama_stack/ui/app/logs/vector-stores/[id]/files/[fileId]/contents/[contentId]/page.tsx new file mode 100644 index 000000000..6896b992a --- /dev/null +++ b/llama_stack/ui/app/logs/vector-stores/[id]/files/[fileId]/contents/[contentId]/page.tsx @@ -0,0 +1,383 @@ +"use client"; + +import { useEffect, useState } from "react"; +import { useParams, useRouter } from "next/navigation"; +import { useAuthClient } from "@/hooks/use-auth-client"; +import { ContentsAPI, VectorStoreContentItem } from "@/lib/contents-api"; +import type { VectorStore } from "llama-stack-client/resources/vector-stores/vector-stores"; +import type { VectorStoreFile } from "llama-stack-client/resources/vector-stores/files"; +import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card"; +import { Button } from "@/components/ui/button"; +import { Input } from "@/components/ui/input"; +import { Edit, Save, X, Trash2 } from "lucide-react"; +import { + DetailLoadingView, + DetailErrorView, + DetailNotFoundView, + DetailLayout, + PropertiesCard, + PropertyItem, +} from "@/components/layout/detail-layout"; +import { PageBreadcrumb, BreadcrumbSegment } from "@/components/layout/page-breadcrumb"; + +export default function ContentDetailPage() { + const params = useParams(); + const router = useRouter(); + const vectorStoreId = params.id as string; + const fileId = params.fileId as string; + const contentId = params.contentId as string; + const client = useAuthClient(); + + const getTextFromContent = (content: any): string => { + if (typeof content === 'string') { + return content; + } else if (content && content.type === 'text') { + return content.text; + } + return ''; + }; + + const [store, setStore] = useState(null); + const [file, setFile] = useState(null); + const [content, setContent] = useState(null); + const [isLoading, setIsLoading] = useState(true); + const [error, setError] = useState(null); + const [isEditing, setIsEditing] = useState(false); + const [editedContent, setEditedContent] = useState(""); + const [editedMetadata, setEditedMetadata] = useState>({}); + const [isEditingEmbedding, setIsEditingEmbedding] = useState(false); + const [editedEmbedding, setEditedEmbedding] = useState([]); + + useEffect(() => { + if (!vectorStoreId || !fileId || !contentId) return; + + const fetchData = async () => { + setIsLoading(true); + setError(null); + try { + const [storeResponse, fileResponse] = await Promise.all([ + client.vectorStores.retrieve(vectorStoreId), + client.vectorStores.files.retrieve(vectorStoreId, fileId), + ]); + + setStore(storeResponse as VectorStore); + setFile(fileResponse as VectorStoreFile); + + const contentsAPI = new ContentsAPI(client); + const contentsResponse = await contentsAPI.listContents(vectorStoreId, fileId); + const targetContent = contentsResponse.data.find(c => c.id === contentId); + + if (targetContent) { + setContent(targetContent); + setEditedContent(getTextFromContent(targetContent.content)); + 
setEditedMetadata({ ...targetContent.metadata }); + setEditedEmbedding(targetContent.embedding || []); + } else { + throw new Error(`Content ${contentId} not found`); + } + } catch (err) { + setError(err instanceof Error ? err : new Error("Failed to load content.")); + } finally { + setIsLoading(false); + } + }; + fetchData(); + }, [vectorStoreId, fileId, contentId, client]); + + const handleSave = async () => { + if (!content) return; + + try { + const updates: { content?: string; metadata?: Record } = {}; + + if (editedContent !== getTextFromContent(content.content)) { + updates.content = editedContent; + } + + if (JSON.stringify(editedMetadata) !== JSON.stringify(content.metadata)) { + updates.metadata = editedMetadata; + } + + if (Object.keys(updates).length > 0) { + const contentsAPI = new ContentsAPI(client); + const updatedContent = await contentsAPI.updateContent(vectorStoreId, fileId, contentId, updates); + setContent(updatedContent); + } + + setIsEditing(false); + } catch (err) { + console.error('Failed to update content:', err); + } + }; + + const handleDelete = async () => { + if (!confirm('Are you sure you want to delete this content?')) return; + + try { + const contentsAPI = new ContentsAPI(client); + await contentsAPI.deleteContent(vectorStoreId, fileId, contentId); + router.push(`/logs/vector-stores/${vectorStoreId}/files/${fileId}/contents`); + } catch (err) { + console.error('Failed to delete content:', err); + } + }; + + const handleCancel = () => { + setEditedContent(content ? getTextFromContent(content.content) : ""); + setEditedMetadata({ ...content?.metadata }); + setEditedEmbedding(content?.embedding || []); + setIsEditing(false); + setIsEditingEmbedding(false); + }; + + const title = `Content: ${contentId}`; + + const breadcrumbSegments: BreadcrumbSegment[] = [ + { label: "Vector Stores", href: "/logs/vector-stores" }, + { label: store?.name || vectorStoreId, href: `/logs/vector-stores/${vectorStoreId}` }, + { label: "Files", href: `/logs/vector-stores/${vectorStoreId}` }, + { label: fileId, href: `/logs/vector-stores/${vectorStoreId}/files/${fileId}` }, + { label: "Contents", href: `/logs/vector-stores/${vectorStoreId}/files/${fileId}/contents` }, + { label: contentId }, + ]; + + if (error) { + return ; + } + if (isLoading) { + return ; + } + if (!content) { + return ; + } + + const mainContent = ( + <> + + + Content +
+ {isEditing ? ( + <> + + + + ) : ( + <> + + + + )} +
+
+ + {isEditing ? ( +