Merge branch 'main' into add-watsonx-inference-adapter
Commit 6fe8b292b1
74 changed files with 5033 additions and 1685 deletions
@@ -68,7 +68,8 @@ chunks_response = client.vector_io.query(

### Using the RAG Tool

A better way to ingest documents is to use the RAG Tool. This tool allows you to ingest documents from URLs, files, etc.
and automatically chunks them into smaller pieces. More examples for how to format a RAGDocument can be found in the
[appendix](#more-ragdocument-examples).

```python
from llama_stack_client import RAGDocument
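# --- continuation sketch (added; the rest of the original snippet lies outside this
# hunk). It shows the documented rag_tool ingestion flow; the vector_db_id and the
# document URL below are illustrative assumptions, not values from the original. ---
document = RAGDocument(
    document_id="num-0",
    content="https://www.paulgraham.com/greatwork.html",
    mime_type="text/html",
    metadata={},
)
client.tool_runtime.rag_tool.insert(
    documents=[document],
    vector_db_id="my_documents",
    chunk_size_in_tokens=512,
)
```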
@@ -178,3 +179,38 @@ for vector_db_id in client.vector_dbs.list():
    print(f"Unregistering vector database: {vector_db_id.identifier}")
    client.vector_dbs.unregister(vector_db_id=vector_db_id.identifier)
```

### Appendix

#### More RAGDocument Examples

```python
from llama_stack_client import RAGDocument
import base64
import requests  # needed for the image download below

RAGDocument(document_id="num-0", content={"uri": "file://path/to/file"})
RAGDocument(document_id="num-1", content="plain text")
RAGDocument(
    document_id="num-2",
    content={
        "type": "text",
        "text": "plain text input",
    },  # for inputs that should be treated as text explicitly
)
RAGDocument(
    document_id="num-3",
    content={
        "type": "image",
        "image": {"url": {"uri": "https://mywebsite.com/image.jpg"}},
    },
)
B64_ENCODED_IMAGE = base64.b64encode(
    requests.get(
        "https://raw.githubusercontent.com/meta-llama/llama-stack/refs/heads/main/docs/_static/llama-stack.png"
    ).content
)
RAGDocument(
    document_id="num-4",
    content={"type": "image", "image": {"data": B64_ENCODED_IMAGE}},
)
```
For more strongly typed interaction, use the typed dicts found [here](https://github.com/meta-llama/llama-stack-client-python/blob/38cd91c9e396f2be0bec1ee96a19771582ba6f17/src/llama_stack_client/types/shared_params/document.py).

@@ -41,7 +41,7 @@ client.toolgroups.register(

The tool requires an API key which can be provided either in the configuration or through the request header `X-LlamaStack-Provider-Data`. The format of the header is `{"<provider_name>_api_key": <your api key>}`.
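A minimal sketch of that header format (not from the original doc): the header value is a JSON object mapping `<provider_name>_api_key` to your key. `tavily_search_api_key` below is an illustrative provider key name; substitute the provider you actually use.

```python
import json

# Build the per-request provider-data header described above.
provider_data_header = {
    "X-LlamaStack-Provider-Data": json.dumps({"tavily_search_api_key": "your-api-key"})
}
# Attach this header to any HTTP request sent to the Llama Stack server,
# e.g. requests.post(url, json=payload, headers=provider_data_header).
```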

> **NOTE:** When using Tavily Search and Bing Search, the inference output will still display "Brave Search." This is because Llama models have been trained with Brave Search as a built-in tool. Tavily and Bing are simply used in lieu of Brave Search.

#### Code Interpreter
@@ -214,3 +214,69 @@ response = agent.create_turn(
    session_id=session_id,
)
```

## Simple Example 2: Using an Agent with the Web Search Tool
1. Start by registering for a Tavily API key at [Tavily](https://tavily.com/).
2. [Optional] Provide the API key directly to the Llama Stack server: export it in your environment and pass it with `--env` when starting the server.
```bash
export TAVILY_SEARCH_API_KEY="your key"
```
```bash
--env TAVILY_SEARCH_API_KEY=${TAVILY_SEARCH_API_KEY}
```
3. Run the following script.
```python
from llama_stack_client.lib.agents.agent import Agent
from llama_stack_client.lib.agents.event_logger import EventLogger
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(
    base_url="http://localhost:8321",
    provider_data={
        "tavily_search_api_key": "your_TAVILY_SEARCH_API_KEY"
    },  # Set this from the client side. No need to provide it if it has already been configured on the Llama Stack server.
)

agent = Agent(
    client,
    model="meta-llama/Llama-3.2-3B-Instruct",
    instructions=(
        "You are a web search assistant; you must use the websearch tool to look up the most current and precise information available."
    ),
    tools=["builtin::websearch"],
)

session_id = agent.create_session("websearch-session")

response = agent.create_turn(
    messages=[
        {"role": "user", "content": "How did the USA perform in the last Olympics?"}
    ],
    session_id=session_id,
)
for log in EventLogger().log(response):
    log.print()
```

## Simple Example 3: Using an Agent with the WolframAlpha Tool
1. Start by registering for a WolframAlpha API key at the [WolframAlpha Developer Portal](https://developer.wolframalpha.com/access).
2. Provide the API key either when starting the Llama Stack server:
```bash
--env WOLFRAM_ALPHA_API_KEY=${WOLFRAM_ALPHA_API_KEY}
```
or from the client side:
```python
client = LlamaStackClient(
    base_url="http://localhost:8321",
    provider_data={"wolfram_alpha_api_key": wolfram_api_key},
)
```
3. Configure the tools in the Agent by setting `tools=["builtin::wolfram_alpha"]`.
4. Example user query:
```python
response = agent.create_turn(
    messages=[{"role": "user", "content": "Solve x^2 + 2x + 1 = 0 using WolframAlpha"}],
    session_id=session_id,
)
```
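Putting steps 2–4 together, here is a minimal end-to-end sketch (assumptions: the same model and agent setup style as Example 2; `wolfram_api_key` holds your key):

```python
from llama_stack_client import LlamaStackClient
from llama_stack_client.lib.agents.agent import Agent
from llama_stack_client.lib.agents.event_logger import EventLogger

wolfram_api_key = "your_WOLFRAM_ALPHA_API_KEY"  # illustrative placeholder

client = LlamaStackClient(
    base_url="http://localhost:8321",
    provider_data={"wolfram_alpha_api_key": wolfram_api_key},
)

# Model choice mirrors Example 2; swap in whichever model your server has registered.
agent = Agent(
    client,
    model="meta-llama/Llama-3.2-3B-Instruct",
    instructions="You are a math assistant; use the wolfram_alpha tool for calculations.",
    tools=["builtin::wolfram_alpha"],
)

session_id = agent.create_session("wolfram-session")
response = agent.create_turn(
    messages=[{"role": "user", "content": "Solve x^2 + 2x + 1 = 0 using WolframAlpha"}],
    session_id=session_id,
)
for log in EventLogger().log(response):
    log.print()
```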
@@ -176,7 +176,11 @@ distribution_spec:
    safety: inline::llama-guard
    agents: inline::meta-reference
    telemetry: inline::meta-reference
image_name: ollama
image_type: conda

# If some providers are external, you can specify the path to the implementation
external_providers_dir: /etc/llama-stack/providers.d
```
@@ -184,6 +188,57 @@ llama stack build --config llama_stack/templates/ollama/build.yaml
```
:::

:::{tab-item} Building with External Providers

Llama Stack supports external providers that live outside of the main codebase. This allows you to create and maintain your own providers independently or use community-provided providers.

To build a distribution with external providers, you need to:

1. Configure the `external_providers_dir` in your build configuration file:

```yaml
# Example my-external-stack.yaml with external providers
version: '2'
distribution_spec:
  description: Custom distro for CI tests
  providers:
    inference:
    - remote::custom_ollama
    # Add more providers as needed
image_type: container
image_name: ci-test
# Path to external provider implementations
external_providers_dir: /etc/llama-stack/providers.d
```

Here's an example for a custom Ollama provider:

```yaml
adapter:
  adapter_type: custom_ollama
  pip_packages:
  - ollama
  - aiohttp
  - llama-stack-provider-ollama # This is the provider package
  config_class: llama_stack_ollama_provider.config.OllamaImplConfig
  module: llama_stack_ollama_provider
api_dependencies: []
optional_api_dependencies: []
```

The `pip_packages` section lists the Python packages required by the provider, as well as the
provider package itself. The package must be available on PyPI or can be provided from a local
directory or a git repository (git must be installed in the build environment).

2. Build your distribution using the config file:

```
llama stack build --config my-external-stack.yaml
```

For more information on external providers, including directory structure, provider types, and implementation requirements, see the [External Providers documentation](../providers/external.md).
:::

:::{tab-item} Building Container

```{admonition} Podman Alternative
@@ -7,7 +7,7 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
|-----|-------------|
| agents | `inline::meta-reference` |
| datasetio | `inline::localfs` |
| eval | `remote::nvidia` |
| inference | `remote::nvidia` |
| post_training | `remote::nvidia` |
| safety | `remote::nvidia` |
@@ -22,13 +22,13 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
The following environment variables can be configured:

- `NVIDIA_API_KEY`: NVIDIA API Key (default: ``)
- `NVIDIA_USER_ID`: NVIDIA User ID (default: `llama-stack-user`)
- `NVIDIA_APPEND_API_VERSION`: Whether to append the API version to the base_url (default: `True`)
- `NVIDIA_DATASET_NAMESPACE`: NVIDIA Dataset Namespace (default: `default`)
- `NVIDIA_ACCESS_POLICIES`: NVIDIA Access Policies (default: `{}`)
- `NVIDIA_PROJECT_ID`: NVIDIA Project ID (default: `test-project`)
- `NVIDIA_CUSTOMIZER_URL`: NVIDIA Customizer URL (default: `https://customizer.api.nvidia.com`)
- `NVIDIA_OUTPUT_MODEL_DIR`: NVIDIA Output Model Directory (default: `test-example-model@v1`)
- `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`)
- `NVIDIA_EVALUATOR_URL`: URL for the NeMo Evaluator Service (default: `http://0.0.0.0:7331`)
- `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`)
- `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`)
@@ -58,7 +58,7 @@ The following models are available by default:
Make sure you have access to an NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/). Use this key for the `NVIDIA_API_KEY` environment variable.

### Deploy NeMo Microservices Platform
The NVIDIA NeMo microservices platform supports end-to-end microservice deployment of a complete AI flywheel on your Kubernetes cluster through the NeMo Microservices Helm Chart. Please reference the [NVIDIA NeMo Microservices documentation](https://docs.nvidia.com/nemo/microservices/latest/about/index.html) for platform prerequisites and instructions to install and deploy the platform.

## Supported Services
Each Llama Stack API corresponds to a specific NeMo microservice. The core microservices (Customizer, Evaluator, Guardrails) are exposed by the same endpoint. The platform components (Data Store) are each exposed by separate endpoints.
@@ -118,7 +118,7 @@ curl --location "$NEMO_URL/v1/deployment/model-deployments" \
  }
}'
```
This NIM deployment should take approximately 10 minutes to go live. [See the docs](https://docs.nvidia.com/nemo/microservices/latest/get-started/tutorials/deploy-nims.html) for more information on how to deploy a NIM and verify it's available for inference.

You can also remove a deployed NIM to free up GPU resources, if needed.
```sh
@@ -171,7 +171,3 @@ llama stack run ./run.yaml \
  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
  --env INFERENCE_MODEL=$INFERENCE_MODEL
```

### Example Notebooks
You can reference the Jupyter notebooks in `docs/notebooks/nvidia/` for example usage of these APIs.
- [Llama_Stack_NVIDIA_E2E_Flow.ipynb](/docs/notebooks/nvidia/Llama_Stack_NVIDIA_E2E_Flow.ipynb) contains an end-to-end workflow for running inference, customizing, and evaluating models using your deployed NeMo Microservices platform.
@@ -44,7 +44,7 @@ The following environment variables can be configured:
In the following sections, we'll use AMD, NVIDIA or Intel GPUs to serve as hardware accelerators for the vLLM
server, which acts as both the LLM inference provider and the safety provider. Note that vLLM also
[supports many other hardware accelerators](https://docs.vllm.ai/en/latest/getting_started/installation.html) and
that we only use GPUs here for demonstration purposes. If you run into issues, you can add `--env VLLM_DEBUG_LOG_API_SERVER_RESPONSE=true` (available in vLLM v0.8.3 and above) to the `docker run` command to enable logging of API server responses for debugging.

### Setting up vLLM server on AMD GPU
@@ -50,9 +50,10 @@ Llama Stack supports two types of external providers:

Here's a list of known external providers that you can use with Llama Stack:

| Name | Description | API | Type | Repository |
|------|-------------|-----|------|------------|
| KubeFlow Training | Train models with KubeFlow | Post Training | Remote | [llama-stack-provider-kft](https://github.com/opendatahub-io/llama-stack-provider-kft) |
| RamaLama | Inference models with RamaLama | Inference | Remote | [ramalama-stack](https://github.com/containers/ramalama-stack) |

### Remote Provider Specification