Merge branch 'main' into feat/litellm_sambanova_usage
This commit is contained in: commit 9c9f9577e2
173 changed files with 3073 additions and 3118 deletions
docs/_static/llama-stack-spec.html (vendored): 155 lines changed
@@ -818,14 +818,7 @@
       "delete": {
         "responses": {
           "200": {
-            "description": "OK",
-            "content": {
-              "application/json": {
-                "schema": {
-                  "$ref": "#/components/schemas/FileResponse"
-                }
-              }
-            }
+            "description": "OK"
           },
           "400": {
             "$ref": "#/components/responses/BadRequest400"
@@ -2122,7 +2115,7 @@
           "content": {
             "application/json": {
               "schema": {
-                "$ref": "#/components/schemas/IterrowsResponse"
+                "$ref": "#/components/schemas/PaginatedResponse"
               }
             }
           }
@@ -2143,7 +2136,7 @@
         "tags": [
           "DatasetIO"
         ],
-        "description": "Get a paginated list of rows from a dataset. Uses cursor-based pagination.",
+        "description": "Get a paginated list of rows from a dataset.\nUses offset-based pagination where:\n- start_index: The starting index (0-based). If None, starts from beginning.\n- limit: Number of items to return. If None or -1, returns all items.\n\nThe response includes:\n- data: List of items for the current page\n- has_more: Whether there are more items available after this set",
         "parameters": [
           {
             "name": "dataset_id",
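The new description above specifies offset-based pagination (`start_index`, `limit`) returning `data` plus `has_more`. A minimal client-side paging loop consistent with that contract could look like the sketch below; `fetch_rows` is a hypothetical stand-in for whatever client method wraps this endpoint, not something defined by the spec.

```python
# Sketch only: offset-based paging over the DatasetIO rows endpoint described above.
# `fetch_rows(dataset_id, start_index, limit)` is a hypothetical callable assumed to
# return a dict shaped like PaginatedResponse: {"data": [...], "has_more": bool}.
def iterate_all_rows(fetch_rows, dataset_id: str, page_size: int = 100):
    start_index = 0
    while True:
        page = fetch_rows(dataset_id=dataset_id, start_index=start_index, limit=page_size)
        yield from page["data"]
        if not page["has_more"]:
            break
        start_index += len(page["data"])
```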
@@ -2695,9 +2688,9 @@
           "200": {
             "description": "OK",
             "content": {
-              "application/jsonl": {
+              "application/json": {
                 "schema": {
-                  "$ref": "#/components/schemas/ToolDef"
+                  "$ref": "#/components/schemas/ListToolDefsResponse"
                 }
               }
             }
@@ -4053,22 +4046,33 @@
       "type": "object",
       "properties": {
         "strategy": {
-          "$ref": "#/components/schemas/SamplingStrategy"
+          "$ref": "#/components/schemas/SamplingStrategy",
+          "description": "The sampling strategy."
         },
         "max_tokens": {
           "type": "integer",
-          "default": 0
+          "default": 0,
+          "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
         },
         "repetition_penalty": {
           "type": "number",
-          "default": 1.0
+          "default": 1.0,
+          "description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics."
+        },
+        "stop": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          },
+          "description": "Up to 4 sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence."
         }
       },
       "additionalProperties": false,
       "required": [
         "strategy"
       ],
-      "title": "SamplingParams"
+      "title": "SamplingParams",
+      "description": "Sampling parameters."
     },
     "SamplingStrategy": {
       "oneOf": [
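For reference, an illustrative payload that exercises the fields documented above; the concrete values and the greedy strategy shape are assumptions for the example, not taken from the spec excerpt.

```python
# Illustrative SamplingParams payload; values and the strategy shape are assumed.
sampling_params = {
    "strategy": {"type": "greedy"},  # one of the SamplingStrategy variants (shape assumed)
    "max_tokens": 512,               # prompt tokens + max_tokens must fit the model's context length
    "repetition_penalty": 1.1,       # between -2.0 and 2.0 per the description above
    "stop": ["Observation:"],        # up to 4 stop sequences; stop text is excluded from the output
}
```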
@@ -6129,46 +6133,6 @@
       "title": "FileUploadResponse",
       "description": "Response after initiating a file upload session."
     },
-    "FileResponse": {
-      "type": "object",
-      "properties": {
-        "bucket": {
-          "type": "string",
-          "description": "Bucket under which the file is stored (valid chars: a-zA-Z0-9_-)"
-        },
-        "key": {
-          "type": "string",
-          "description": "Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)"
-        },
-        "mime_type": {
-          "type": "string",
-          "description": "MIME type of the file"
-        },
-        "url": {
-          "type": "string",
-          "description": "Upload URL for the file contents"
-        },
-        "bytes": {
-          "type": "integer",
-          "description": "Size of the file in bytes"
-        },
-        "created_at": {
-          "type": "integer",
-          "description": "Timestamp of when the file was created"
-        }
-      },
-      "additionalProperties": false,
-      "required": [
-        "bucket",
-        "key",
-        "mime_type",
-        "url",
-        "bytes",
-        "created_at"
-      ],
-      "title": "FileResponse",
-      "description": "Response representing a file entry."
-    },
     "EmbeddingsRequest": {
       "type": "object",
       "properties": {
@@ -6922,6 +6886,46 @@
       "title": "URIDataSource",
       "description": "A dataset that can be obtained from a URI."
     },
+    "FileResponse": {
+      "type": "object",
+      "properties": {
+        "bucket": {
+          "type": "string",
+          "description": "Bucket under which the file is stored (valid chars: a-zA-Z0-9_-)"
+        },
+        "key": {
+          "type": "string",
+          "description": "Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)"
+        },
+        "mime_type": {
+          "type": "string",
+          "description": "MIME type of the file"
+        },
+        "url": {
+          "type": "string",
+          "description": "Upload URL for the file contents"
+        },
+        "bytes": {
+          "type": "integer",
+          "description": "Size of the file in bytes"
+        },
+        "created_at": {
+          "type": "integer",
+          "description": "Timestamp of when the file was created"
+        }
+      },
+      "additionalProperties": false,
+      "required": [
+        "bucket",
+        "key",
+        "mime_type",
+        "url",
+        "bytes",
+        "created_at"
+      ],
+      "title": "FileResponse",
+      "description": "Response representing a file entry."
+    },
     "Model": {
       "type": "object",
       "properties": {
@@ -7660,7 +7664,8 @@
           "completed",
           "in_progress",
           "failed",
-          "scheduled"
+          "scheduled",
+          "cancelled"
         ],
         "title": "JobStatus"
       },
@@ -8068,7 +8073,7 @@
       "additionalProperties": false,
       "title": "ToolInvocationResult"
     },
-    "IterrowsResponse": {
+    "PaginatedResponse": {
       "type": "object",
       "properties": {
         "data": {
@@ -8098,19 +8103,20 @@
             ]
           }
         },
-        "description": "The rows in the current page."
+        "description": "The list of items for the current page"
       },
-      "next_start_index": {
-        "type": "integer",
-        "description": "Index into dataset for the first row in the next page. None if there are no more rows."
+      "has_more": {
+        "type": "boolean",
+        "description": "Whether there are more items available after this set"
       }
     },
     "additionalProperties": false,
     "required": [
-      "data"
+      "data",
+      "has_more"
     ],
-    "title": "IterrowsResponse",
-    "description": "A paginated list of rows from a dataset."
+    "title": "PaginatedResponse",
+    "description": "A generic paginated response that follows a simple format."
     },
     "Job": {
       "type": "object",
@@ -8124,7 +8130,8 @@
           "completed",
           "in_progress",
           "failed",
-          "scheduled"
+          "scheduled",
+          "cancelled"
         ],
         "title": "JobStatus"
       }
@@ -8321,6 +8328,22 @@
       ],
       "title": "ListRoutesResponse"
     },
+    "ListToolDefsResponse": {
+      "type": "object",
+      "properties": {
+        "data": {
+          "type": "array",
+          "items": {
+            "$ref": "#/components/schemas/ToolDef"
+          }
+        }
+      },
+      "additionalProperties": false,
+      "required": [
+        "data"
+      ],
+      "title": "ListToolDefsResponse"
+    },
     "ListScoringFunctionsResponse": {
       "type": "object",
       "properties": {
docs/_static/llama-stack-spec.yaml (vendored): 138 lines changed
@@ -557,10 +557,6 @@ paths:
       responses:
         '200':
           description: OK
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/FileResponse'
         '400':
           $ref: '#/components/responses/BadRequest400'
         '429':
@@ -1447,7 +1443,7 @@ paths:
           content:
             application/json:
               schema:
-                $ref: '#/components/schemas/IterrowsResponse'
+                $ref: '#/components/schemas/PaginatedResponse'
         '400':
           $ref: '#/components/responses/BadRequest400'
         '429':
@@ -1461,7 +1457,20 @@ paths:
       tags:
         - DatasetIO
       description: >-
-        Get a paginated list of rows from a dataset. Uses cursor-based pagination.
+        Get a paginated list of rows from a dataset.
+
+        Uses offset-based pagination where:
+
+        - start_index: The starting index (0-based). If None, starts from beginning.
+
+        - limit: Number of items to return. If None or -1, returns all items.
+
+
+        The response includes:
+
+        - data: List of items for the current page
+
+        - has_more: Whether there are more items available after this set
       parameters:
         - name: dataset_id
           in: path
@@ -1846,9 +1855,9 @@ paths:
         '200':
           description: OK
           content:
-            application/jsonl:
+            application/json:
               schema:
-                $ref: '#/components/schemas/ToolDef'
+                $ref: '#/components/schemas/ListToolDefsResponse'
         '400':
           $ref: '#/components/responses/BadRequest400'
         '429':
@@ -2787,16 +2796,33 @@ components:
       properties:
         strategy:
           $ref: '#/components/schemas/SamplingStrategy'
+          description: The sampling strategy.
         max_tokens:
          type: integer
          default: 0
+          description: >-
+            The maximum number of tokens that can be generated in the completion.
+            The token count of your prompt plus max_tokens cannot exceed the model's
+            context length.
        repetition_penalty:
          type: number
          default: 1.0
+          description: >-
+            Number between -2.0 and 2.0. Positive values penalize new tokens based
+            on whether they appear in the text so far, increasing the model's likelihood
+            to talk about new topics.
+        stop:
+          type: array
+          items:
+            type: string
+          description: >-
+            Up to 4 sequences where the API will stop generating further tokens. The
+            returned text will not contain the stop sequence.
      additionalProperties: false
      required:
        - strategy
      title: SamplingParams
+      description: Sampling parameters.
    SamplingStrategy:
      oneOf:
        - $ref: '#/components/schemas/GreedySamplingStrategy'
@@ -4269,39 +4295,6 @@ components:
      title: FileUploadResponse
      description: >-
        Response after initiating a file upload session.
-    FileResponse:
-      type: object
-      properties:
-        bucket:
-          type: string
-          description: >-
-            Bucket under which the file is stored (valid chars: a-zA-Z0-9_-)
-        key:
-          type: string
-          description: >-
-            Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)
-        mime_type:
-          type: string
-          description: MIME type of the file
-        url:
-          type: string
-          description: Upload URL for the file contents
-        bytes:
-          type: integer
-          description: Size of the file in bytes
-        created_at:
-          type: integer
-          description: Timestamp of when the file was created
-      additionalProperties: false
-      required:
-        - bucket
-        - key
-        - mime_type
-        - url
-        - bytes
-        - created_at
-      title: FileResponse
-      description: Response representing a file entry.
    EmbeddingsRequest:
      type: object
      properties:
@@ -4813,6 +4806,39 @@ components:
      title: URIDataSource
      description: >-
        A dataset that can be obtained from a URI.
+    FileResponse:
+      type: object
+      properties:
+        bucket:
+          type: string
+          description: >-
+            Bucket under which the file is stored (valid chars: a-zA-Z0-9_-)
+        key:
+          type: string
+          description: >-
+            Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)
+        mime_type:
+          type: string
+          description: MIME type of the file
+        url:
+          type: string
+          description: Upload URL for the file contents
+        bytes:
+          type: integer
+          description: Size of the file in bytes
+        created_at:
+          type: integer
+          description: Timestamp of when the file was created
+      additionalProperties: false
+      required:
+        - bucket
+        - key
+        - mime_type
+        - url
+        - bytes
+        - created_at
+      title: FileResponse
+      description: Response representing a file entry.
    Model:
      type: object
      properties:
@@ -5289,6 +5315,7 @@ components:
        - in_progress
        - failed
        - scheduled
+        - cancelled
        title: JobStatus
      scheduled_at:
        type: string
@@ -5528,7 +5555,7 @@ components:
      - type: object
      additionalProperties: false
      title: ToolInvocationResult
-    IterrowsResponse:
+    PaginatedResponse:
      type: object
      properties:
        data:
@@ -5543,17 +5570,18 @@ components:
            - type: string
            - type: array
            - type: object
-          description: The rows in the current page.
-        next_start_index:
-          type: integer
+          description: The list of items for the current page
+        has_more:
+          type: boolean
          description: >-
-            Index into dataset for the first row in the next page. None if there are
-            no more rows.
+            Whether there are more items available after this set
      additionalProperties: false
      required:
        - data
-      title: IterrowsResponse
-      description: A paginated list of rows from a dataset.
+        - has_more
+      title: PaginatedResponse
+      description: >-
+        A generic paginated response that follows a simple format.
    Job:
      type: object
      properties:
@@ -5566,6 +5594,7 @@ components:
        - in_progress
        - failed
        - scheduled
+        - cancelled
        title: JobStatus
      additionalProperties: false
      required:
@@ -5703,6 +5732,17 @@ components:
      required:
        - data
      title: ListRoutesResponse
+    ListToolDefsResponse:
+      type: object
+      properties:
+        data:
+          type: array
+          items:
+            $ref: '#/components/schemas/ToolDef'
+      additionalProperties: false
+      required:
+        - data
+      title: ListToolDefsResponse
    ListScoringFunctionsResponse:
      type: object
      properties:
docs/_static/providers/vector_io/read_time_comparison_sqlite-vec-faiss.png (vendored, new binary file, 33 KiB): not shown
docs/_static/providers/vector_io/write_time_comparison_sqlite-vec-faiss.png (vendored, new binary file, 37 KiB): not shown
docs/_static/providers/vector_io/write_time_sequence_sqlite-vec-faiss.png (vendored, new binary file, 56 KiB): not shown
File diff suppressed because one or more lines are too long

@@ -963,16 +963,19 @@
     "\n",
     "client.benchmarks.register(\n",
     "    benchmark_id=\"meta-reference::mmmu\",\n",
+    "    # Note: we can use any value as `dataset_id` because we'll be using the `evaluate_rows` API which accepts the \n",
+    "    # `input_rows` argument and does not fetch data from the dataset.\n",
     "    dataset_id=f\"mmmu-{subset}-{split}\",\n",
-    "    scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
+    "    # Note: for the same reason as above, we can use any value as `scoring_functions`.\n",
+    "    scoring_functions=[],\n",
     ")\n",
     "\n",
-    "response = client.eval.evaluate_rows_alpha(\n",
+    "response = client.eval.evaluate_rows(\n",
     "    benchmark_id=\"meta-reference::mmmu\",\n",
     "    input_rows=eval_rows,\n",
+    "    # Note: Here we define the actual scoring functions.\n",
     "    scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
     "    benchmark_config={\n",
     "        \"type\": \"benchmark\",\n",
     "        \"eval_candidate\": {\n",
     "            \"type\": \"model\",\n",
     "            \"model\": \"meta-llama/Llama-3.2-90B-Vision-Instruct\",\n",
@@ -1139,12 +1142,11 @@
     "    scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
     ")\n",
     "\n",
-    "response = client.eval.evaluate_rows_alpha(\n",
+    "response = client.eval.evaluate_rows(\n",
     "    benchmark_id=\"meta-reference::simpleqa\",\n",
     "    input_rows=eval_rows.data,\n",
     "    scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
     "    benchmark_config={\n",
     "        \"type\": \"benchmark\",\n",
     "        \"eval_candidate\": {\n",
     "            \"type\": \"model\",\n",
     "            \"model\": \"meta-llama/Llama-3.2-90B-Vision-Instruct\",\n",
@@ -1288,12 +1290,11 @@
     "    \"enable_session_persistence\": False,\n",
     "}\n",
     "\n",
-    "response = client.eval.evaluate_rows_alpha(\n",
+    "response = client.eval.evaluate_rows(\n",
     "    benchmark_id=\"meta-reference::simpleqa\",\n",
     "    input_rows=eval_rows.data,\n",
     "    scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
     "    benchmark_config={\n",
     "        \"type\": \"benchmark\",\n",
     "        \"eval_candidate\": {\n",
     "            \"type\": \"agent\",\n",
     "            \"config\": agent_config,\n",
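Stripped of the notebook's JSON escaping, the updated cells above amount to the following pattern; identifiers such as `client`, `subset`, `split`, and `eval_rows` are defined earlier in the notebook, and the elided candidate fields are not reproduced here.

```python
# Consolidated form of the updated notebook cells (identifiers taken from the diff above).
client.benchmarks.register(
    benchmark_id="meta-reference::mmmu",
    # Any value works for dataset_id/scoring_functions here, since evaluate_rows
    # receives input_rows directly and defines its own scoring functions.
    dataset_id=f"mmmu-{subset}-{split}",
    scoring_functions=[],
)

response = client.eval.evaluate_rows(
    benchmark_id="meta-reference::mmmu",
    input_rows=eval_rows,
    scoring_functions=["basic::regex_parser_multiple_choice_answer"],
    benchmark_config={
        "type": "benchmark",
        "eval_candidate": {
            "type": "model",
            "model": "meta-llama/Llama-3.2-90B-Vision-Instruct",
            # remaining candidate fields elided in the diff
        },
    },
)
```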

@@ -21,7 +21,7 @@ from llama_stack.distribution.stack import LlamaStack  # noqa: E402
 
 from .pyopenapi.options import Options  # noqa: E402
 from .pyopenapi.specification import Info, Server  # noqa: E402
-from .pyopenapi.utility import Specification, validate_api_method_return_types  # noqa: E402
+from .pyopenapi.utility import Specification, validate_api  # noqa: E402
 
 
 def str_presenter(dumper, data):
@@ -40,8 +40,7 @@ def main(output_dir: str):
         raise ValueError(f"Directory {output_dir} does not exist")
 
     # Validate API protocols before generating spec
-    print("Validating API method return types...")
-    return_type_errors = validate_api_method_return_types()
+    return_type_errors = validate_api()
     if return_type_errors:
         print("\nAPI Method Return Type Validation Errors:\n")
         for error in return_type_errors:

@@ -7,10 +7,9 @@
 import json
-import typing
 import inspect
 import os
 from pathlib import Path
 from typing import TextIO
-from typing import Any, Dict, List, Optional, Protocol, Type, Union, get_type_hints, get_origin, get_args
+from typing import Any, List, Optional, Union, get_type_hints, get_origin, get_args
 
 from llama_stack.strong_typing.schema import object_to_json, StrictJsonType
 from llama_stack.distribution.resolver import api_protocol_map
@@ -125,29 +124,89 @@ def is_optional_type(type_: Any) -> bool:
     return origin is Optional or (origin is Union and type(None) in args)
 
 
-def validate_api_method_return_types() -> List[str]:
-    """Validate that all API methods have proper return types."""
+def _validate_api_method_return_type(method) -> str | None:
+    hints = get_type_hints(method)
+
+    if 'return' not in hints:
+        return "has no return type annotation"
+
+    return_type = hints['return']
+    if is_optional_type(return_type):
+        return "returns Optional type where a return value is mandatory"
+
+
+def _validate_api_method_doesnt_return_list(method) -> str | None:
+    hints = get_type_hints(method)
+
+    if 'return' not in hints:
+        return "has no return type annotation"
+
+    return_type = hints['return']
+    if get_origin(return_type) is list:
+        return "returns a list where a PaginatedResponse or List*Response object is expected"
+
+
+def _validate_api_delete_method_returns_none(method) -> str | None:
+    hints = get_type_hints(method)
+
+    if 'return' not in hints:
+        return "has no return type annotation"
+
+    return_type = hints['return']
+    if return_type is not None and return_type is not type(None):
+        return "does not return None where None is mandatory"
+
+
+def _validate_list_parameters_contain_data(method) -> str | None:
+    hints = get_type_hints(method)
+
+    if 'return' not in hints:
+        return "has no return type annotation"
+
+    return_type = hints['return']
+    if not inspect.isclass(return_type):
+        return
+
+    if not return_type.__name__.startswith('List'):
+        return
+
+    if 'data' not in return_type.model_fields:
+        return "does not have a mandatory data attribute containing the list of objects"
+
+
+_VALIDATORS = {
+    "GET": [
+        _validate_api_method_return_type,
+        _validate_list_parameters_contain_data,
+        _validate_api_method_doesnt_return_list,
+    ],
+    "DELETE": [
+        _validate_api_delete_method_returns_none,
+    ],
+}
+
+
+def _get_methods_by_type(protocol, method_type: str):
+    members = inspect.getmembers(protocol, predicate=inspect.isfunction)
+    return {
+        method_name: method
+        for method_name, method in members
+        if (webmethod := getattr(method, '__webmethod__', None))
+        if webmethod and webmethod.method == method_type
+    }
+
+
+def validate_api() -> List[str]:
+    """Validate the API protocols."""
     errors = []
     protocols = api_protocol_map()
 
-    for protocol_name, protocol in protocols.items():
-        methods = inspect.getmembers(protocol, predicate=inspect.isfunction)
-
-        for method_name, method in methods:
-            if not hasattr(method, '__webmethod__'):
-                continue
-
-            # Only check GET methods
-            if method.__webmethod__.method != "GET":
-                continue
-
-            hints = get_type_hints(method)
-
-            if 'return' not in hints:
-                errors.append(f"Method {protocol_name}.{method_name} has no return type annotation")
-            else:
-                return_type = hints['return']
-                if is_optional_type(return_type):
-                    errors.append(f"Method {protocol_name}.{method_name} returns Optional type")
+    for target, validators in _VALIDATORS.items():
+        for protocol_name, protocol in protocols.items():
+            for validator in validators:
+                for method_name, method in _get_methods_by_type(protocol, target).items():
+                    err = validator(method)
+                    if err:
+                        errors.append(f"Method {protocol_name}.{method_name} {err}")
 
     return errors
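The refactor above replaces the single GET-only check with a `_VALIDATORS` registry keyed by HTTP method, where each validator returns an error string or `None`. A new rule can be plugged in as in the sketch below; this is illustrative only and not part of the commit.

```python
# Illustrative only: registering an additional rule in the _VALIDATORS mapping shown above.
def _validate_post_method_has_return_annotation(method) -> str | None:
    hints = get_type_hints(method)
    if "return" not in hints:
        return "has no return type annotation"

_VALIDATORS.setdefault("POST", []).append(_validate_post_method_has_return_annotation)
```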

@@ -1,4 +1,4 @@
-# Building AI Applications
+# Building AI Applications (Examples)
 
 Llama Stack provides all the building blocks needed to create sophisticated AI applications.
 

@@ -1,4 +1,4 @@
-## Using Retrieval Augmented Generation (RAG)
+## Retrieval Augmented Generation (RAG)
 
 RAG enables your applications to reference and recall information from previous interactions or external documents.
 

@@ -45,14 +45,16 @@ Here's an example that sends telemetry signals to all three sink types. Your con
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      sinks: ['console', 'sqlite', 'otel']
-      otel_endpoint: "http://localhost:4318/v1/traces"
+      sinks: ['console', 'sqlite', 'otel_trace', 'otel_metric']
+      otel_trace_endpoint: "http://localhost:4318/v1/traces"
+      otel_metric_endpoint: "http://localhost:4318/v1/metrics"
       sqlite_db_path: "/path/to/telemetry.db"
 ```
 
 ### Jaeger to visualize traces
 
-The `otel` sink works with any service compatible with the OpenTelemetry collector. Let's use Jaeger to visualize this data.
+The `otel` sink works with any service compatible with the OpenTelemetry collector, traces and metrics has two separate endpoints.
+Let's use Jaeger to visualize this data.
 
 Start a Jaeger instance with the OTLP HTTP endpoint at 4318 and the Jaeger UI at 16686 using the following command:
 

@@ -16,6 +16,7 @@ from docutils import nodes
 from pathlib import Path
 
 import requests
 import json
+from datetime import datetime
 
 # Read version from pyproject.toml
 with Path(__file__).parent.parent.parent.joinpath("pyproject.toml").open("rb") as f:
@@ -28,7 +29,7 @@ with Path(__file__).parent.parent.parent.joinpath("pyproject.toml").open("rb") a
 llama_stack_version_link = f"<a href='{llama_stack_version_url}'>release notes</a>"
 
 project = "llama-stack"
-copyright = "2025, Meta"
+copyright = f"{datetime.now().year}, Meta"
 author = "Meta"
 
 # -- General configuration ---------------------------------------------------
@@ -37,6 +38,7 @@ author = "Meta"
 extensions = [
     "myst_parser",
     "sphinx_rtd_theme",
+    "sphinx_rtd_dark_mode",
     "sphinx_copybutton",
     "sphinx_tabs.tabs",
     "sphinx_design",
@@ -103,6 +105,8 @@ source_suffix = {
 # html_theme = "alabaster"
 html_theme_options = {
+    "canonical_url": "https://github.com/meta-llama/llama-stack",
+    'collapse_navigation': False,
 
     # "style_nav_header_background": "#c3c9d4",
 }
 
@@ -1,14 +1,14 @@
 # Contributing to Llama Stack
 
-Start with the [Contributing Guide](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md) for some general tips. This section covers a few key topics in more detail.
+```{include} ../../../CONTRIBUTING.md
+```
 
-See the [Adding a New API Provider](new_api_provider.md) which describes how to add new API providers to the Stack.
 
+- [Adding a New API Provider](new_api_provider.md) describes adding new API providers to the Stack.
+- [Testing Llama Stack](testing.md) provides details about the testing framework and how to test providers and distributions.
 
 ```{toctree}
 :maxdepth: 1
 :hidden:
 
 new_api_provider
+testing
 ```
@@ -67,7 +67,7 @@ options:
                         Image Type to use for the build. This can be either conda or container or venv. If not specified, will use the image type from the template config. (default:
                         conda)
   --image-name IMAGE_NAME
-                        [for image-type=conda|venv] Name of the conda or virtual environment to use for the build. If not specified, currently active Conda environment will be used if
+                        [for image-type=conda|container|venv] Name of the conda or virtual environment to use for the build. If not specified, currently active Conda environment will be used if
                         found. (default: None)
   --print-deps-only     Print the dependencies for the stack only, without building the stack (default: False)
   --run                 Run the stack after building using the same image type, name, and other applicable arguments (default: False)
@@ -1,4 +1,4 @@
-# Configuring a Stack
+# Configuring a "Stack"
 
 The Llama Stack runtime configuration is specified as a YAML file. Here is a simplified version of an example configuration file for the Ollama distribution:
 
@@ -1,10 +1,12 @@
 # Using Llama Stack as a Library
 
-If you are planning to use an external service for Inference (even Ollama or TGI counts as external), it is often easier to use Llama Stack as a library. This avoids the overhead of setting up a server.
+## Setup Llama Stack without a Server
+If you are planning to use an external service for Inference (even Ollama or TGI counts as external), it is often easier to use Llama Stack as a library.
+This avoids the overhead of setting up a server.
 ```bash
 # setup
 uv pip install llama-stack
-llama stack build --template together --image-type venv
+llama stack build --template ollama --image-type venv
 ```
 
 ```python
@@ -1,34 +1,18 @@
-# Starting a Llama Stack Server
+# Distributions Overview
 
-You can run a Llama Stack server in one of the following ways:
-
-**As a Library**:
-
-This is the simplest way to get started. Using Llama Stack as a library means you do not need to start a server. This is especially useful when you are not running inference locally and relying on an external inference service (eg. fireworks, together, groq, etc.) See [Using Llama Stack as a Library](importing_as_library)
-
-
-**Container**:
-
-Another simple way to start interacting with Llama Stack is to just spin up a container (via Docker or Podman) which is pre-built with all the providers you need. We provide a number of pre-built images so you can start a Llama Stack server instantly. You can also build your own custom container. Which distribution to choose depends on the hardware you have. See [Selection of a Distribution](selection) for more details.
-
-
-**Conda**:
-
-If you have a custom or an advanced setup or you are developing on Llama Stack you can also build a custom Llama Stack server. Using `llama stack build` and `llama stack run` you can build/run a custom Llama Stack server containing the exact combination of providers you wish. We have also provided various templates to make getting started easier. See [Building a Custom Distribution](building_distro) for more details.
-
-
-**Kubernetes**:
-
-If you have built a container image and want to deploy it in a Kubernetes cluster instead of starting the Llama Stack server locally. See [Kubernetes Deployment Guide](kubernetes_deployment) for more details.
+A distribution is a pre-packaged set of Llama Stack components that can be deployed together.
 
+This section provides an overview of the distributions available in Llama Stack.
 
 ```{toctree}
-:maxdepth: 1
-:hidden:
+:maxdepth: 3
 
-importing_as_library
-building_distro
-configuration
-selection
+list_of_distributions
-kubernetes_deployment
+building_distro
+on_device_distro
+remote_hosted_distro
+self_hosted_distro
 ```
@@ -1,6 +1,9 @@
 # Kubernetes Deployment Guide
 
-Instead of starting the Llama Stack and vLLM servers locally. We can deploy them in a Kubernetes cluster. In this guide, we'll use a local [Kind](https://kind.sigs.k8s.io/) cluster and a vLLM inference service in the same cluster for demonstration purposes.
+Instead of starting the Llama Stack and vLLM servers locally. We can deploy them in a Kubernetes cluster.
+
+### Prerequisites
+In this guide, we'll use a local [Kind](https://kind.sigs.k8s.io/) cluster and a vLLM inference service in the same cluster for demonstration purposes.
 
 First, create a local Kubernetes cluster via Kind:
 
@@ -8,7 +11,7 @@ First, create a local Kubernetes cluster via Kind:
 kind create cluster --image kindest/node:v1.32.0 --name llama-stack-test
 ```
 
-Start vLLM server as a Kubernetes Pod and Service:
+First, create a Kubernetes PVC and Secret for downloading and storing Hugging Face model:
 
 ```bash
 cat <<EOF |kubectl apply -f -
@@ -31,7 +34,13 @@ metadata:
 type: Opaque
 data:
   token: $(HF_TOKEN)
 ---
+```
+
+Next, start the vLLM server as a Kubernetes Deployment and Service:
+
+```bash
+cat <<EOF |kubectl apply -f -
 apiVersion: apps/v1
 kind: Deployment
 metadata:
@@ -47,28 +56,23 @@ spec:
         app.kubernetes.io/name: vllm
     spec:
       containers:
-      - name: llama-stack
-        image: $(VLLM_IMAGE)
-        command:
-        - bash
-        - -c
-        - |
-          MODEL="meta-llama/Llama-3.2-1B-Instruct"
-          MODEL_PATH=/app/model/$(basename $MODEL)
-          huggingface-cli login --token $HUGGING_FACE_HUB_TOKEN
-          huggingface-cli download $MODEL --local-dir $MODEL_PATH --cache-dir $MODEL_PATH
-          python3 -m vllm.entrypoints.openai.api_server --model $MODEL_PATH --served-model-name $MODEL --port 8000
+      - name: vllm
+        image: vllm/vllm-openai:latest
+        command: ["/bin/sh", "-c"]
+        args: [
+          "vllm serve meta-llama/Llama-3.2-1B-Instruct"
+        ]
+        env:
+        - name: HUGGING_FACE_HUB_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: hf-token-secret
+              key: token
        ports:
        - containerPort: 8000
        volumeMounts:
        - name: llama-storage
-          mountPath: /app/model
-        env:
-        - name: HUGGING_FACE_HUB_TOKEN
-          valueFrom:
-            secretKeyRef:
-              name: hf-token-secret
-              key: token
+          mountPath: /root/.cache/huggingface
      volumes:
      - name: llama-storage
        persistentVolumeClaim:
@@ -127,6 +131,7 @@ EOF
 podman build -f /tmp/test-vllm-llama-stack/Containerfile.llama-stack-run-k8s -t llama-stack-run-k8s /tmp/test-vllm-llama-stack
 ```
 
+### Deploying Llama Stack Server in Kubernetes
 
 We can then start the Llama Stack server by deploying a Kubernetes Pod and Service:
 
@@ -187,6 +192,7 @@ spec:
 EOF
 ```
 
+### Verifying the Deployment
 We can check that the LlamaStack server has started:
 
 ```bash
@@ -1,4 +1,4 @@
-# List of Distributions
+# Available List of Distributions
 
 Here are a list of distributions you can use to start a Llama Stack server that are provided out of the box.
 
@@ -9,6 +9,7 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
 | datasetio | `inline::localfs` |
 | eval | `inline::meta-reference` |
 | inference | `remote::nvidia` |
+| post_training | `remote::nvidia` |
 | safety | `remote::nvidia` |
 | scoring | `inline::basic` |
 | telemetry | `inline::meta-reference` |
@@ -21,6 +22,12 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
 The following environment variables can be configured:
 
 - `NVIDIA_API_KEY`: NVIDIA API Key (default: ``)
+- `NVIDIA_USER_ID`: NVIDIA User ID (default: `llama-stack-user`)
+- `NVIDIA_DATASET_NAMESPACE`: NVIDIA Dataset Namespace (default: `default`)
+- `NVIDIA_ACCESS_POLICIES`: NVIDIA Access Policies (default: `{}`)
+- `NVIDIA_PROJECT_ID`: NVIDIA Project ID (default: `test-project`)
+- `NVIDIA_CUSTOMIZER_URL`: NVIDIA Customizer URL (default: `https://customizer.api.nvidia.com`)
+- `NVIDIA_OUTPUT_MODEL_DIR`: NVIDIA Output Model Directory (default: `test-example-model@v1`)
+- `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`)
 - `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`)
 - `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`)

@@ -98,11 +98,14 @@ export INFERENCE_PORT=8000
 export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
 export LLAMA_STACK_PORT=8321
 
+# You need a local checkout of llama-stack to run this, get it using
+# git clone https://github.com/meta-llama/llama-stack.git
+cd /path/to/llama-stack
 docker run \
   -it \
   --pull always \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./run.yaml:/root/my-run.yaml \
+  -v ./llama_stack/templates/remote-vllm/run.yaml:/root/my-run.yaml \
   llamastack/distribution-remote-vllm \
   --yaml-config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
@@ -121,7 +124,6 @@ export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
-cd /path/to/llama-stack
 
 docker run \
   -it \
   --pull always \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ~/.llama:/root/.llama \
docs/source/distributions/starting_llama_stack_server.md (new file): 32 lines
@@ -0,0 +1,32 @@
+# Starting a Llama Stack Server
+
+You can run a Llama Stack server in one of the following ways:
+
+**As a Library**:
+
+This is the simplest way to get started. Using Llama Stack as a library means you do not need to start a server. This is especially useful when you are not running inference locally and relying on an external inference service (eg. fireworks, together, groq, etc.) See [Using Llama Stack as a Library](importing_as_library)
+
+
+**Container**:
+
+Another simple way to start interacting with Llama Stack is to just spin up a container (via Docker or Podman) which is pre-built with all the providers you need. We provide a number of pre-built images so you can start a Llama Stack server instantly. You can also build your own custom container. Which distribution to choose depends on the hardware you have. See [Selection of a Distribution](selection) for more details.
+
+
+**Conda**:
+
+If you have a custom or an advanced setup or you are developing on Llama Stack you can also build a custom Llama Stack server. Using `llama stack build` and `llama stack run` you can build/run a custom Llama Stack server containing the exact combination of providers you wish. We have also provided various templates to make getting started easier. See [Building a Custom Distribution](building_distro) for more details.
+
+
+**Kubernetes**:
+
+If you have built a container image and want to deploy it in a Kubernetes cluster instead of starting the Llama Stack server locally. See [Kubernetes Deployment Guide](kubernetes_deployment) for more details.
+
+
+```{toctree}
+:maxdepth: 1
+:hidden:
+
+importing_as_library
+configuration
+kubernetes_deployment
+```
@@ -1,10 +1,11 @@
 # Quick Start
 
-In this guide, we'll walk through how you can use the Llama Stack (server and client SDK) to test a simple RAG agent.
+In this guide, we'll walk through how you can use the Llama Stack (server and client SDK) to build a simple [RAG (Retrieval Augmented Generation)](../building_applications/rag.md) agent.
 
 A Llama Stack agent is a simple integrated system that can perform tasks by combining a Llama model for reasoning with tools (e.g., RAG, web search, code execution, etc.) for taking actions.
 
 In Llama Stack, we provide a server exposing multiple APIs. These APIs are backed by implementations from different providers. For this guide, we will use [Ollama](https://ollama.com/) as the inference provider.
+Ollama is an LLM runtime that allows you to run Llama models locally.
 
 
 ### 1. Start Ollama
@@ -24,7 +25,7 @@ If you do not have ollama, you can install it from [here](https://ollama.com/dow
 
 ### 2. Pick a client environment
 
-Llama Stack has a service-oriented architecture, so every interaction with the Stack happens through an REST interface. You can interact with the Stack in two ways:
+Llama Stack has a service-oriented architecture, so every interaction with the Stack happens through a REST interface. You can interact with the Stack in two ways:
 
 * Install the `llama-stack-client` PyPI package and point `LlamaStackClient` to a local or remote Llama Stack server.
 * Or, install the `llama-stack` PyPI package and use the Stack as a library using `LlamaStackAsLibraryClient`.
@@ -6,6 +6,7 @@ Llama Stack {{ llama_stack_version }} is now available! See the {{ llama_stack_v
 
 # Llama Stack
 
+## What is Llama Stack?
 
 Llama Stack defines and standardizes the core building blocks needed to bring generative AI applications to market. It provides a unified set of APIs with implementations from leading service providers, enabling seamless transitions between development and production environments. More specifically, it provides
 
@@ -22,6 +23,12 @@ Llama Stack defines and standardizes the core building blocks needed to bring ge
 
 Our goal is to provide pre-packaged implementations (aka "distributions") which can be run in a variety of deployment environments. LlamaStack can assist you in your entire app development lifecycle - start iterating on local, mobile or desktop and seamlessly transition to on-prem or public cloud deployments. At every point in this transition, the same set of APIs and the same developer experience is available.
 
+## How does Llama Stack work?
+Llama Stack consists of a [server](./distributions/index.md) (with multiple pluggable API [providers](./providers/index.md)) and [client SDKs](#available-sdks) meant to
+be used in your applications. The server can be run in a variety of environments, including local (inline)
+development, on-premises, and cloud. The client SDKs are available for Python, Swift, Node, and
+Kotlin.
+
 ## Quick Links
 
 - New to Llama Stack? Start with the [Introduction](introduction/index) to understand our motivation and vision.
@@ -93,7 +100,6 @@ getting_started/index
 concepts/index
 providers/index
 distributions/index
-distributions/selection
 building_applications/index
 playground/index
 contributing/index
@@ -92,8 +92,6 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie
 
 ## Starting the Llama Stack Playground
 
-### Llama CLI
-
 To start the Llama Stack Playground, run the following commands:
 
 1. Start up the Llama Stack API server
@@ -109,29 +107,3 @@ cd llama_stack/distribution/ui
 pip install -r requirements.txt
 streamlit run app.py
 ```
-
-### Docker
-
-Playground can also be started in a docker image:
-
-```sh
-export LLAMA_STACK_URL=http://localhost:11434
-
-docker run \
-  --pull always \
-  -p 8501:8501 \
-  -e LLAMA_STACK_ENDPOINT=$LLAMA_STACK_URL \
-  quay.io/jland/llama-stack-playground
-```
-
-## Configurable Environment Variables
-
+## Environment Variables
 
 | Environment Variable | Description | Default Value |
 |----------------------------|------------------------------------|---------------------------|
 | LLAMA_STACK_ENDPOINT | The endpoint for the Llama Stack | http://localhost:8321 |
 | FIREWORKS_API_KEY | API key for Fireworks provider | (empty string) |
 | TOGETHER_API_KEY | API key for Together provider | (empty string) |
 | SAMBANOVA_API_KEY | API key for SambaNova provider | (empty string) |
 | OPENAI_API_KEY | API key for OpenAI provider | (empty string) |
@@ -10,11 +10,57 @@ That means you're not limited to storing vectors in memory or in a separate serv
 ## Features
 
 - Lightweight and easy to use
-- Fully integrated with Llama Stack
+- Fully integrated with Llama Stacks
 - Uses disk-based storage for persistence, allowing for larger vector storage
 
+### Comparison to Faiss
+
+The choice between Faiss and sqlite-vec should be made based on the needs of your application,
+as they have different strengths.
+
+#### Choosing the Right Provider
+
+Scenario | Recommended Tool | Reason
+-- |-----------------| --
+Online Analytical Processing (OLAP) | Faiss | Fast, in-memory searches
+Online Transaction Processing (OLTP) | sqlite-vec | Frequent writes and reads
+Frequent writes | sqlite-vec | Efficient disk-based storage and incremental indexing
+Large datasets | sqlite-vec | Disk-based storage for larger vector storage
+Datasets that can fit in memory, frequent reads | Faiss | Optimized for speed, indexing, and GPU acceleration
+
+#### Empirical Example
+
+Consider the histogram below in which 10,000 randomly generated strings were inserted
+in batches of 100 into both Faiss and sqlite-vec using `client.tool_runtime.rag_tool.insert()`.
+
+```{image} ../../../../_static/providers/vector_io/write_time_comparison_sqlite-vec-faiss.png
+:alt: Comparison of SQLite-Vec and Faiss write times
+:width: 400px
+```
+
+You will notice that the average write time for `sqlite-vec` was 788ms, compared to
+47,640ms for Faiss. While the number is jarring, if you look at the distribution, you can see that it is rather
+uniformly spread across the [1500, 100000] interval.
+
+Looking at each individual write in the order that the documents are inserted you'll see the increase in
+write speed as Faiss reindexes the vectors after each write.
+```{image} ../../../../_static/providers/vector_io/write_time_sequence_sqlite-vec-faiss.png
+:alt: Comparison of SQLite-Vec and Faiss write times
+:width: 400px
+```
+
+In comparison, the read times for Faiss was on average 10% faster than sqlite-vec.
+The modes of the two distributions highlight the differences much further where Faiss
+will likely yield faster read performance.
+
+```{image} ../../../../_static/providers/vector_io/read_time_comparison_sqlite-vec-faiss.png
+:alt: Comparison of SQLite-Vec and Faiss read times
+:width: 400px
+```
+
 ## Usage
 
-To use SQLite-Vec in your Llama Stack project, follow these steps:
+To use sqlite-vec in your Llama Stack project, follow these steps:
 
 1. Install the necessary dependencies.
 2. Configure your Llama Stack project to use SQLite-Vec.
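The write-time comparison in the sqlite-vec page above comes from timing batched inserts through `client.tool_runtime.rag_tool.insert()`. A rough sketch of that kind of measurement follows; the document-building helper, vector DB id, and chunk size are illustrative assumptions rather than the benchmark's actual code.

```python
import time

def time_batched_inserts(client, make_documents, vector_db_id: str, total: int = 10_000, batch: int = 100):
    """Insert `total` random documents in batches of `batch`, timing each rag_tool.insert() call.

    `make_documents(n)` is a hypothetical helper that builds n RAG documents with random string content.
    """
    timings = []
    for _ in range(total // batch):
        docs = make_documents(batch)
        t0 = time.perf_counter()
        client.tool_runtime.rag_tool.insert(
            documents=docs,
            vector_db_id=vector_db_id,      # assumed parameter names
            chunk_size_in_tokens=512,
        )
        timings.append(time.perf_counter() - t0)
    return timings
```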