Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-07-20 19:56:59 +00:00)

Merge branch 'main' into nvidia-e2e-notebook
Commit 51b68b4be6: 234 changed files with 21943 additions and 7540 deletions
docs/_static/llama-stack-spec.html (vendored, 3956 lines changed): file diff suppressed because it is too large
docs/_static/llama-stack-spec.yaml (vendored, 2930 lines changed): file diff suppressed because it is too large
@@ -1050,8 +1050,6 @@
 "text/html": [
     "ToolGroup(\n",
     "│   identifier='builtin::code_interpreter',\n",
     "│   provider_id='code-interpreter',\n",
     "│   provider_resource_id='builtin::code_interpreter',\n",
     "│   type='tool_group',\n",
     "│   args=None,\n",
     "│   mcp_endpoint=None\n",

@@ -1061,7 +1059,6 @@
 "text/plain": [
     "ToolGroup(\n",
     "│   identifier='builtin::code_interpreter',\n",
     "│   provider_id='code-interpreter',\n",
     "│   provider_resource_id='builtin::code_interpreter',\n",
     "│   type='tool_group',\n",
     "│   args=None,\n",
@@ -337,9 +337,6 @@
 "      provider_id: tavily-search\n",
 "      provider_type: remote::tavily-search\n",
 "    - config: {}\n",
 "      provider_id: code-interpreter\n",
 "      provider_type: inline::code-interpreter\n",
 "    - config: {}\n",
 "      provider_id: rag-runtime\n",
 "      provider_type: inline::rag-runtime\n",
 "    - config: {}\n",

@@ -378,10 +375,6 @@
 "  toolgroup_id: builtin::rag\n",
 "- args: null\n",
 "  mcp_endpoint: null\n",
 "  provider_id: code-interpreter\n",
 "  toolgroup_id: builtin::code_interpreter\n",
 "- args: null\n",
 "  mcp_endpoint: null\n",
 "  provider_id: wolfram-alpha\n",
 "  toolgroup_id: builtin::wolfram_alpha\n",
 "vector_dbs: []\n",

@@ -617,9 +610,6 @@
 "      provider_id: tavily-search\n",
 "      provider_type: remote::tavily-search\n",
 "    - config: {}\n",
 "      provider_id: code-interpreter\n",
 "      provider_type: inline::code-interpreter\n",
 "    - config: {}\n",
 "      provider_id: rag-runtime\n",
 "      provider_type: inline::rag-runtime\n",
 "    - config: {}\n",

@@ -658,10 +648,6 @@
 "  toolgroup_id: builtin::rag\n",
 "- args: null\n",
 "  mcp_endpoint: null\n",
 "  provider_id: code-interpreter\n",
 "  toolgroup_id: builtin::code_interpreter\n",
 "- args: null\n",
 "  mcp_endpoint: null\n",
 "  provider_id: wolfram-alpha\n",
 "  toolgroup_id: builtin::wolfram_alpha\n",
 "vector_dbs: []\n",
@@ -840,7 +840,6 @@
 "    \"memory_optimizations.rst\",\n",
 "    \"chat.rst\",\n",
 "    \"llama3.rst\",\n",
 "    \"datasets.rst\",\n",
 "    \"qat_finetune.rst\",\n",
 "    \"lora_finetune.rst\",\n",
 "]\n",

@@ -1586,7 +1585,6 @@
 "    \"memory_optimizations.rst\",\n",
 "    \"chat.rst\",\n",
 "    \"llama3.rst\",\n",
 "    \"datasets.rst\",\n",
 "    \"qat_finetune.rst\",\n",
 "    \"lora_finetune.rst\",\n",
 "]\n",
@@ -44,7 +44,7 @@ def main(output_dir: str):
     if return_type_errors:
         print("\nAPI Method Return Type Validation Errors:\n")
         for error in return_type_errors:
-            print(error)
+            print(error, file=sys.stderr)
         sys.exit(1)
     now = str(datetime.now())
     print(
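The change above routes validation errors to stderr so they no longer mix with whatever the generator writes to stdout. A minimal, hypothetical sketch of the pattern (the error string and script name are illustrative, not the repository's actual code):

```python
import sys

# Hypothetical list of validation errors collected earlier in the run.
return_type_errors = ["GET /v1/models: does not have a returns section in its docstring"]

if return_type_errors:
    for error in return_type_errors:
        # stderr keeps diagnostics visible even when stdout is redirected,
        # e.g. `python generate.py > openapi.html`.
        print(error, file=sys.stderr)
    sys.exit(1)
```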
@@ -759,7 +759,7 @@ class Generator:
         )

         return Operation(
-            tags=[op.defining_class.__name__],
+            tags=[getattr(op.defining_class, "API_NAMESPACE", op.defining_class.__name__)],
             summary=None,
             # summary=doc_string.short_description,
             description=description,

@@ -805,6 +805,8 @@ class Generator:
         operation_tags: List[Tag] = []
         for cls in endpoint_classes:
             doc_string = parse_type(cls)
+            if hasattr(cls, "API_NAMESPACE") and cls.API_NAMESPACE != cls.__name__:
+                continue
             operation_tags.append(
                 Tag(
                     name=cls.__name__,
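Taken together, these two hunks let a protocol class opt into another class's tag via an `API_NAMESPACE` attribute: its operations are grouped under that namespace and no duplicate tag is emitted for it. A rough sketch of the idea (the classes below are illustrative, not the actual Llama Stack protocols):

```python
class Inference:
    """Primary API class; its name becomes the OpenAPI tag."""


class BatchInference:
    # Operations defined here should appear under the "Inference" tag,
    # and no separate "BatchInference" tag should be generated for this class.
    API_NAMESPACE = "Inference"


def tag_for(cls) -> str:
    # Mirrors the getattr() fallback used when building each Operation.
    return getattr(cls, "API_NAMESPACE", cls.__name__)


print(tag_for(Inference))       # -> "Inference"
print(tag_for(BatchInference))  # -> "Inference"
```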
@@ -174,14 +174,64 @@ def _validate_list_parameters_contain_data(method) -> str | None:
        return "does not have a mandatory data attribute containing the list of objects"


def _validate_has_ellipsis(method) -> str | None:
    source = inspect.getsource(method)
    if "..." not in source and not "NotImplementedError" in source:
        return "does not contain ellipsis (...) in its implementation"


def _validate_has_return_in_docstring(method) -> str | None:
    source = inspect.getsource(method)
    return_type = method.__annotations__.get('return')
    if return_type is not None and return_type != type(None) and ":returns:" not in source:
        return "does not have a ':returns:' in its docstring"


def _validate_has_params_in_docstring(method) -> str | None:
    source = inspect.getsource(method)
    sig = inspect.signature(method)
    # Only check if the method has more than one parameter
    if len(sig.parameters) > 1 and ":param" not in source:
        return "does not have a ':param' in its docstring"


def _validate_has_no_return_none_in_docstring(method) -> str | None:
    source = inspect.getsource(method)
    return_type = method.__annotations__.get('return')
    if return_type is None and ":returns: None" in source:
        return "has a ':returns: None' in its docstring which is redundant for None-returning functions"


def _validate_docstring_lines_end_with_dot(method) -> str | None:
    docstring = inspect.getdoc(method)
    if docstring is None:
        return None

    lines = docstring.split('\n')
    for line in lines:
        line = line.strip()
        if line and not any(line.endswith(char) for char in '.:{}[]()",'):
            return f"docstring line '{line}' does not end with a valid character: . : {{ }} [ ] ( ) , \""


_VALIDATORS = {
    "GET": [
        _validate_api_method_return_type,
        _validate_list_parameters_contain_data,
        _validate_api_method_doesnt_return_list,
        _validate_has_ellipsis,
        _validate_has_return_in_docstring,
        _validate_has_params_in_docstring,
        _validate_docstring_lines_end_with_dot,
    ],
    "DELETE": [
        _validate_api_delete_method_returns_none,
        _validate_has_ellipsis,
        _validate_has_return_in_docstring,
        _validate_has_params_in_docstring,
        _validate_has_no_return_none_in_docstring
    ],
    "POST": [
        _validate_has_ellipsis,
        _validate_has_return_in_docstring,
        _validate_has_params_in_docstring,
        _validate_has_no_return_none_in_docstring,
        _validate_docstring_lines_end_with_dot,
    ],
}
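To make the new checks concrete, here is a small, self-contained sketch of how one of these docstring validators behaves on a sample method. The validator body matches the diff above; the `SampleAPI` class is invented purely for illustration:

```python
import inspect


def _validate_has_return_in_docstring(method) -> str | None:
    source = inspect.getsource(method)
    return_type = method.__annotations__.get('return')
    if return_type is not None and return_type != type(None) and ":returns:" not in source:
        return "does not have a ':returns:' in its docstring"


class SampleAPI:
    async def list_models(self) -> list:
        """List all registered models.

        :returns: A list of model descriptors.
        """
        ...

    async def get_model(self, model_id: str) -> dict:
        """Get a model by identifier."""  # docstring has no returns section, so it is flagged
        ...


for name in ("list_models", "get_model"):
    error = _validate_has_return_in_docstring(getattr(SampleAPI, name))
    print(name, "->", error or "ok")
# list_models -> ok
# get_model -> does not have a ':returns:' in its docstring
```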
@@ -51,6 +51,7 @@ chunks = [
        "mime_type": "text/plain",
        "metadata": {
            "document_id": "doc1",
            "author": "Jane Doe",
        },
    },
]

@@ -98,6 +99,17 @@ results = client.tool_runtime.rag_tool.query(
)
```

You can configure how the RAG tool adds metadata to the context if you find it useful for your application. Simply add:
```python
# Query documents
results = client.tool_runtime.rag_tool.query(
    vector_db_ids=[vector_db_id],
    content="What do you know about...",
    query_config={
        "chunk_template": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n",
    },
)
```
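The `chunk_template` placeholders look like Python `str.format` fields, so you can preview how a retrieved chunk would be rendered into the context. A minimal sketch, assuming format-style substitution (the chunk object and values are made up for illustration):

```python
from types import SimpleNamespace

chunk_template = "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n"

# Stand-in for one retrieved chunk and its metadata.
chunk = SimpleNamespace(content="Llama Stack supports external providers ...")
metadata = {"document_id": "doc1", "author": "Jane Doe"}

print(chunk_template.format(index=1, chunk=chunk, metadata=metadata))
# Result 1
# Content: Llama Stack supports external providers ...
# Metadata: {'document_id': 'doc1', 'author': 'Jane Doe'}
```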
### Building RAG-Enhanced Agents

One of the most powerful patterns is combining agents with RAG capabilities. Here's a complete example:
@@ -115,6 +127,12 @@ agent = Agent(
        "name": "builtin::rag/knowledge_search",
        "args": {
            "vector_db_ids": [vector_db_id],
            # Defaults
            "query_config": {
                "chunk_size_in_tokens": 512,
                "chunk_overlap_in_tokens": 0,
                "chunk_template": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n",
            },
        },
    }
],
@@ -165,34 +165,6 @@ all_tools = client.tools.list_tools()
group_tools = client.tools.list_tools(toolgroup_id="search_tools")
```

## Simple Example: Using an Agent with the Code-Interpreter Tool

```python
from llama_stack_client import Agent

# Instantiate the AI agent with the given configuration
agent = Agent(
    client,
    name="code-interpreter",
    description="A code interpreter agent for executing Python code snippets",
    instructions="""
    You are a highly reliable, concise, and precise assistant.
    Always show the generated code, never generate your own code, and never anticipate results.
    """,
    model="meta-llama/Llama-3.2-3B-Instruct",
    tools=["builtin::code_interpreter"],
    max_infer_iters=5,
)

# Start a session
session_id = agent.create_session("tool_session")

# Send a query to the AI agent for code execution
response = agent.create_turn(
    messages=[{"role": "user", "content": "Run this code: print(3 ** 4 - 5 * 2)"}],
    session_id=session_id,
)
```
## Simple Example 2: Using an Agent with the Web Search Tool
1. Start by registering a Tavily API key at [Tavily](https://tavily.com/).
2. [Optional] Provide the API key directly to the Llama Stack server
@@ -110,6 +110,8 @@ html_theme_options = {
     "canonical_url": "https://github.com/meta-llama/llama-stack",
     "collapse_navigation": False,
     # "style_nav_header_background": "#c3c9d4",
+    'display_version': True,
+    'version_selector': True,
 }

 default_dark_mode = False
@@ -6,7 +6,7 @@ This guide will walk you through the process of adding a new API provider to Lla
 - Begin by reviewing the [core concepts](../concepts/index.md) of Llama Stack and choose the API your provider belongs to (Inference, Safety, VectorIO, etc.)
 - Determine the provider type ({repopath}`Remote::llama_stack/providers/remote` or {repopath}`Inline::llama_stack/providers/inline`). Remote providers make requests to external services, while inline providers execute implementation locally.
 - Add your provider to the appropriate {repopath}`Registry::llama_stack/providers/registry/`. Specify pip dependencies necessary.
-- Update any distribution {repopath}`Templates::llama_stack/templates/` build.yaml and run.yaml files if they should include your provider by default. Run {repopath}`./scripts/distro_codegen.py` if necessary. Note that `distro_codegen.py` will fail if the new provider causes any distribution template to attempt to import provider-specific dependencies. This usually means the distribution's `get_distribution_template()` code path should only import any necessary Config or model alias definitions from each provider and not the provider's actual implementation.
+- Update any distribution {repopath}`Templates::llama_stack/templates/` `build.yaml` and `run.yaml` files if they should include your provider by default. Run {repopath}`./scripts/distro_codegen.py` if necessary. Note that `distro_codegen.py` will fail if the new provider causes any distribution template to attempt to import provider-specific dependencies. This usually means the distribution's `get_distribution_template()` code path should only import any necessary Config or model alias definitions from each provider and not the provider's actual implementation.


 Here are some example PRs to help you get started:
@@ -33,6 +33,7 @@ Note that each provider's `sample_run_config()` method (in the configuration cla

 Unit tests are located in {repopath}`tests/unit`. Provider-specific unit tests are located in {repopath}`tests/unit/providers`. These tests are all run automatically as part of the CI process.

+Consult {repopath}`tests/unit/README.md` for more details on how to run the tests manually.

 ### 3. Additional end-to-end testing
@@ -178,7 +178,7 @@ image_name: ollama
 image_type: conda

 # If some providers are external, you can specify the path to the implementation
-external_providers_dir: /etc/llama-stack/providers.d
+external_providers_dir: ~/.llama/providers.d
 ```

 ```

@@ -206,7 +206,7 @@ distribution_spec:
 image_type: container
 image_name: ci-test
 # Path to external provider implementations
-external_providers_dir: /etc/llama-stack/providers.d
+external_providers_dir: ~/.llama/providers.d
 ```

 Here's an example for a custom Ollama provider:
@@ -271,7 +271,7 @@ Now, let's start the Llama Stack Distribution Server. You will need the YAML con

 ```
 llama stack run -h
-usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--disable-ipv6] [--env KEY=VALUE] [--tls-keyfile TLS_KEYFILE] [--tls-certfile TLS_CERTFILE]
+usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--env KEY=VALUE] [--tls-keyfile TLS_KEYFILE] [--tls-certfile TLS_CERTFILE]
                        [--image-type {conda,container,venv}]
                        config

@@ -285,7 +285,6 @@ options:
   --port PORT           Port to run the server on. It can also be passed via the env var LLAMA_STACK_PORT. (default: 8321)
   --image-name IMAGE_NAME
                         Name of the image to run. Defaults to the current environment (default: None)
-  --disable-ipv6        Disable IPv6 support (default: False)
   --env KEY=VALUE       Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times. (default: [])
   --tls-keyfile TLS_KEYFILE
                         Path to TLS key file for HTTPS (default: None)
@@ -172,7 +172,7 @@ spec:
       - name: llama-stack
         image: localhost/llama-stack-run-k8s:latest
         imagePullPolicy: IfNotPresent
-        command: ["python", "-m", "llama_stack.distribution.server.server", "--yaml-config", "/app/config.yaml"]
+        command: ["python", "-m", "llama_stack.distribution.server.server", "--config", "/app/config.yaml"]
         ports:
           - containerPort: 5000
         volumeMounts:
@@ -18,7 +18,7 @@ The `llamastack/distribution-watsonx` distribution consists of the following pro
 | agents | `inline::meta-reference` |
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
-| inference | `remote::watsonx` |
+| inference | `remote::watsonx`, `inline::sentence-transformers` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |

@@ -70,7 +70,7 @@ docker run \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-watsonx \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env WATSONX_API_KEY=$WATSONX_API_KEY \
   --env WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID \
@@ -52,7 +52,7 @@ docker run \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-cerebras \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env CEREBRAS_API_KEY=$CEREBRAS_API_KEY
 ```
@@ -23,7 +23,7 @@ The `llamastack/distribution-dell` distribution consists of the following provid
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime` |
 | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |

@@ -155,7 +155,7 @@ docker run \
   -v $HOME/.llama:/root/.llama \
   -v ./llama_stack/templates/tgi/run-with-safety.yaml:/root/my-run.yaml \
   llamastack/distribution-dell \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env DEH_URL=$DEH_URL \
@@ -144,7 +144,7 @@ docker run \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-nvidia \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env NVIDIA_API_KEY=$NVIDIA_API_KEY
 ```
@@ -19,6 +19,7 @@ The `llamastack/distribution-ollama` distribution consists of the following prov
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
 | inference | `remote::ollama` |
+| post_training | `inline::huggingface` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |

@@ -97,7 +98,7 @@ docker run \
   -v ~/.llama:/root/.llama \
   -v ./llama_stack/templates/ollama/run-with-safety.yaml:/root/my-run.yaml \
   llamastack/distribution-ollama \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env SAFETY_MODEL=$SAFETY_MODEL \
@@ -233,7 +233,7 @@ docker run \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ./llama_stack/templates/remote-vllm/run.yaml:/root/my-run.yaml \
   llamastack/distribution-remote-vllm \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT/v1

@@ -255,7 +255,7 @@ docker run \
   -v ~/.llama:/root/.llama \
   -v ./llama_stack/templates/remote-vllm/run-with-safety.yaml:/root/my-run.yaml \
   llamastack/distribution-remote-vllm \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT/v1 \
@@ -16,10 +16,10 @@ The `llamastack/distribution-sambanova` distribution consists of the following p
 | API | Provider(s) |
 |-----|-------------|
 | agents | `inline::meta-reference` |
-| inference | `remote::sambanova` |
+| inference | `remote::sambanova`, `inline::sentence-transformers` |
 | safety | `inline::llama-guard` |
 | telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
 | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
@@ -28,22 +28,22 @@ The `llamastack/distribution-sambanova` distribution consists of the following p
 The following environment variables can be configured:

 - `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
-- `SAMBANOVA_API_KEY`: SambaNova.AI API Key (default: ``)
+- `SAMBANOVA_API_KEY`: SambaNova API Key (default: ``)

 ### Models

 The following models are available by default:

-- `Meta-Llama-3.1-8B-Instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)`
-- `Meta-Llama-3.1-70B-Instruct (aliases: meta-llama/Llama-3.1-70B-Instruct)`
-- `Meta-Llama-3.1-405B-Instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
-- `Meta-Llama-3.2-1B-Instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)`
-- `Meta-Llama-3.2-3B-Instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
-- `Meta-Llama-3.3-70B-Instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)`
-- `Llama-3.2-11B-Vision-Instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
-- `Llama-3.2-90B-Vision-Instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
-- `Meta-Llama-Guard-3-8B (aliases: meta-llama/Llama-Guard-3-8B)`
-- `Llama-4-Scout-17B-16E-Instruct (aliases: meta-llama/Llama-4-Scout-17B-16E-Instruct)`
+- `sambanova/Meta-Llama-3.1-8B-Instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)`
+- `sambanova/Meta-Llama-3.1-405B-Instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
+- `sambanova/Meta-Llama-3.2-1B-Instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)`
+- `sambanova/Meta-Llama-3.2-3B-Instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
+- `sambanova/Meta-Llama-3.3-70B-Instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)`
+- `sambanova/Llama-3.2-11B-Vision-Instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
+- `sambanova/Llama-3.2-90B-Vision-Instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
+- `sambanova/Llama-4-Scout-17B-16E-Instruct (aliases: meta-llama/Llama-4-Scout-17B-16E-Instruct)`
+- `sambanova/Llama-4-Maverick-17B-128E-Instruct (aliases: meta-llama/Llama-4-Maverick-17B-128E-Instruct)`
+- `sambanova/Meta-Llama-Guard-3-8B (aliases: meta-llama/Llama-Guard-3-8B)`


 ### Prerequisite: API Keys
@@ -117,7 +117,7 @@ docker run \
   -v ~/.llama:/root/.llama \
   -v ./llama_stack/templates/tgi/run-with-safety.yaml:/root/my-run.yaml \
   llamastack/distribution-tgi \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT \
@@ -42,7 +42,7 @@ powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | ie
 Setup your virtual environment.

 ```bash
-uv venv --python 3.10
+uv sync --python 3.10
 source .venv/bin/activate
 ```
 ## Step 2: Run Llama Stack
@@ -445,7 +445,6 @@ from llama_stack_client import LlamaStackClient
from llama_stack_client import Agent, AgentEventLogger
from llama_stack_client.types import Document
import uuid
from termcolor import cprint

client = LlamaStackClient(base_url="http://localhost:8321")
@@ -463,7 +462,6 @@ urls = [
    "memory_optimizations.rst",
    "chat.rst",
    "llama3.rst",
    "datasets.rst",
    "qat_finetune.rst",
    "lora_finetune.rst",
]
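In the surrounding example these `urls` entries are turned into `Document` objects (imported above) before being inserted into a vector database. A rough, hedged sketch of that pattern; the raw-file URL base and the `Document` field names are assumptions, so adjust them to your corpus:

```python
from llama_stack_client.types import Document

urls = ["memory_optimizations.rst", "chat.rst", "llama3.rst"]

# Assumed source location for the raw .rst files used in the walkthrough.
BASE = "https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials"

documents = [
    Document(
        document_id=f"num-{i}",
        content=f"{BASE}/{url}",
        mime_type="text/plain",
        metadata={},
    )
    for i, url in enumerate(urls)
]
```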
@@ -10,7 +10,7 @@ Llama Stack supports external providers that live outside of the main codebase.
 To enable external providers, you need to configure the `external_providers_dir` in your Llama Stack configuration. This directory should contain your external provider specifications:

 ```yaml
-external_providers_dir: /etc/llama-stack/providers.d/
+external_providers_dir: ~/.llama/providers.d/
 ```

 ## Directory Structure
@@ -53,7 +53,7 @@ Here's a list of known external providers that you can use with Llama Stack:
 | Name | Description | API | Type | Repository |
 |------|-------------|-----|------|------------|
 | KubeFlow Training | Train models with KubeFlow | Post Training | Remote | [llama-stack-provider-kft](https://github.com/opendatahub-io/llama-stack-provider-kft) |
-| KubeFlow Pipelines | Train models with KubeFlow Pipelines | Post Training | Remote | [llama-stack-provider-kfp-trainer](https://github.com/opendatahub-io/llama-stack-provider-kfp-trainer) |
+| KubeFlow Pipelines | Train models with KubeFlow Pipelines | Post Training | Inline **and** Remote | [llama-stack-provider-kfp-trainer](https://github.com/opendatahub-io/llama-stack-provider-kfp-trainer) |
 | RamaLama | Inference models with RamaLama | Inference | Remote | [ramalama-stack](https://github.com/containers/ramalama-stack) |
 | TrustyAI LM-Eval | Evaluate models with TrustyAI LM-Eval | Eval | Remote | [llama-stack-provider-lmeval](https://github.com/trustyai-explainability/llama-stack-provider-lmeval) |
@@ -182,7 +182,7 @@ dependencies = ["llama-stack", "pydantic", "ollama", "aiohttp"]
 3. Create the provider specification:

 ```yaml
-# /etc/llama-stack/providers.d/remote/inference/custom_ollama.yaml
+# ~/.llama/providers.d/remote/inference/custom_ollama.yaml
 adapter:
   adapter_type: custom_ollama
   pip_packages: ["ollama", "aiohttp"]
@@ -201,7 +201,7 @@ uv pip install -e .
 5. Configure Llama Stack to use external providers:

 ```yaml
-external_providers_dir: /etc/llama-stack/providers.d/
+external_providers_dir: ~/.llama/providers.d/
 ```

 The provider will now be available in Llama Stack with the type `remote::custom_ollama`.
@@ -253,8 +253,6 @@ llama-stack-client toolgroups list
 +---------------------------+------------------+------+---------------+
 | identifier                | provider_id      | args | mcp_endpoint  |
 +===========================+==================+======+===============+
-| builtin::code_interpreter | code-interpreter | None | None          |
-+---------------------------+------------------+------+---------------+
 | builtin::rag              | rag-runtime      | None | None          |
 +---------------------------+------------------+------+---------------+
 | builtin::websearch        | tavily-search    | None | None          |