Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-05 12:21:52 +00:00)

Merge branch 'main' into content-extension

Commit 84a26339c8: 73 changed files with 2416 additions and 506 deletions

docs/_static/llama-stack-spec.html (vendored), 83 lines changed

@@ -8293,28 +8293,60 @@
"type": "array",
"items": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
"properties": {
"attributes": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
"description": "(Optional) Key-value attributes associated with the file"
},
"file_id": {
"type": "string",
"description": "Unique identifier of the file containing the result"
},
"filename": {
"type": "string",
"description": "Name of the file containing the result"
},
"score": {
"type": "number",
"description": "Relevance score for this search result (between 0 and 1)"
},
"text": {
"type": "string",
"description": "Text content of the search result"
}
},
"additionalProperties": false,
"required": [
"attributes",
"file_id",
"filename",
"score",
"text"
],
"title": "OpenAIResponseOutputMessageFileSearchToolCallResults",
"description": "Search results returned by the file search operation."
},
"description": "(Optional) Search results returned by the file search operation"
}

@@ -8515,6 +8547,13 @@
"$ref": "#/components/schemas/OpenAIResponseInputTool"
}
},
"include": {
"type": "array",
"items": {
"type": "string"
},
"description": "(Optional) Additional fields to include in the response."
},
"max_infer_iters": {
"type": "integer"
}

@@ -16571,7 +16610,7 @@
"additionalProperties": {
"type": "number"
},
"description": "A list of the categories along with their scores as predicted by model. Required set of categories that need to be in response - violence - violence/graphic - harassment - harassment/threatening - hate - hate/threatening - illicit - illicit/violent - sexual - sexual/minors - self-harm - self-harm/intent - self-harm/instructions"
"description": "A list of the categories along with their scores as predicted by model."
},
"user_message": {
"type": "string"

docs/_static/llama-stack-spec.yaml (vendored), 56 lines changed

@@ -6021,14 +6021,44 @@ components:
type: array
items:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
properties:
attributes:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: >-
(Optional) Key-value attributes associated with the file
file_id:
type: string
description: >-
Unique identifier of the file containing the result
filename:
type: string
description: Name of the file containing the result
score:
type: number
description: >-
Relevance score for this search result (between 0 and 1)
text:
type: string
description: Text content of the search result
additionalProperties: false
required:
- attributes
- file_id
- filename
- score
- text
title: >-
OpenAIResponseOutputMessageFileSearchToolCallResults
description: >-
Search results returned by the file search operation.
description: >-
(Optional) Search results returned by the file search operation
additionalProperties: false

@@ -6188,6 +6218,12 @@ components:
type: array
items:
$ref: '#/components/schemas/OpenAIResponseInputTool'
include:
type: array
items:
type: string
description: >-
(Optional) Additional fields to include in the response.
max_infer_iters:
type: integer
additionalProperties: false

@@ -12314,10 +12350,6 @@ components:
type: number
description: >-
A list of the categories along with their scores as predicted by model.
Required set of categories that need to be in response - violence - violence/graphic
- harassment - harassment/threatening - hate - hate/threatening - illicit
- illicit/violent - sexual - sexual/minors - self-harm - self-harm/intent
- self-harm/instructions
user_message:
type: string
metadata:

@@ -111,7 +111,7 @@ name = "llama-stack-api-weather"
version = "0.1.0"
description = "Weather API for Llama Stack"
readme = "README.md"
requires-python = ">=3.10"
requires-python = ">=3.12"
dependencies = ["llama-stack", "pydantic"]

[build-system]

@@ -231,7 +231,7 @@ name = "llama-stack-provider-kaze"
version = "0.1.0"
description = "Kaze weather provider for Llama Stack"
readme = "README.md"
requires-python = ">=3.10"
requires-python = ">=3.12"
dependencies = ["llama-stack", "pydantic", "aiohttp"]

[build-system]

@@ -2,7 +2,9 @@

Llama Stack (LLS) provides two different APIs for building AI applications with tool calling capabilities: the **Agents API** and the **OpenAI Responses API**. While both enable AI systems to use tools and maintain full conversation history, they serve different use cases and have distinct characteristics.

> **Note:** For simple and basic inferencing, you may want to use the [Chat Completions API](https://llama-stack.readthedocs.io/en/latest/providers/index.html#chat-completions) directly, before progressing to Agents or Responses API.
```{note}
For simple and basic inferencing, you may want to use the [Chat Completions API](https://llama-stack.readthedocs.io/en/latest/providers/index.html#chat-completions) directly, before progressing to the Agents or Responses API.
```
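
A minimal sketch of such a direct call, assuming a stack server on localhost:8321 and the OpenAI-compatible base path used elsewhere in this change (host, port, and model name are illustrative):

```bash
curl -s http://localhost:8321/v1/openai/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{"model": "meta-llama/Llama-3.2-3B-Instruct", "messages": [{"role": "user", "content": "Hello"}]}'
```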

## Overview

@@ -76,7 +76,9 @@ Features:
- Context retrieval with token limits


> **Note:** By default, llama stack run.yaml defines toolgroups for web search, wolfram alpha and rag, that are provided by tavily-search, wolfram-alpha and rag providers.
```{note}
By default, the llama stack run.yaml defines toolgroups for web search, wolfram alpha, and rag, provided by the tavily-search, wolfram-alpha, and rag providers.
```

## Model Context Protocol (MCP)

@@ -2,17 +2,6 @@
```{include} ../../../CONTRIBUTING.md
```

## Testing

See the [Test Page](testing.md) which describes how to test your changes.
```{toctree}
:maxdepth: 1
:hidden:
:caption: Testing

testing
```

## Adding a New Provider

See the [Adding a New API Provider Page](new_api_provider.md) which describes how to add new API providers to the Stack.

@@ -27,3 +16,14 @@ See the [External Provider Page](../providers/external/index.md) which describes
new_api_provider
new_vector_database
```

## Testing

See the [Test Page](testing.md) which describes how to test your changes.
```{toctree}
:maxdepth: 1
:hidden:
:caption: Testing

testing
```

docs/source/distributions/k8s-benchmark/apply.sh (new executable file), 57 lines

@@ -0,0 +1,57 @@
#!/usr/bin/env bash

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

# Deploys the benchmark-specific components on top of the base k8s deployment (../k8s/apply.sh).

export MOCK_INFERENCE_PORT=8080
export STREAM_DELAY_SECONDS=0.005

export POSTGRES_USER=llamastack
export POSTGRES_DB=llamastack
export POSTGRES_PASSWORD=llamastack

export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B

export MOCK_INFERENCE_MODEL=mock-inference

# Use llama-stack-benchmark-service as the benchmark server
export LOCUST_HOST=http://llama-stack-benchmark-service:8323
export LOCUST_BASE_PATH=/v1/openai/v1

# Use vllm-service as the benchmark server
# export LOCUST_HOST=http://vllm-server:8000
# export LOCUST_BASE_PATH=/v1


export BENCHMARK_INFERENCE_MODEL=$INFERENCE_MODEL

set -euo pipefail
set -x

# Deploy benchmark-specific components
# Deploy OpenAI mock server
kubectl create configmap openai-mock --from-file=openai-mock-server.py \
  --dry-run=client -o yaml | kubectl apply --validate=false -f -

envsubst < openai-mock-deployment.yaml | kubectl apply --validate=false -f -

# Create configmap with our custom stack config
kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
  --dry-run=client -o yaml > stack-configmap.yaml

kubectl apply --validate=false -f stack-configmap.yaml

# Deploy our custom llama stack server (overriding the base one)
envsubst < stack-k8s.yaml.template | kubectl apply --validate=false -f -

# Deploy Locust load testing
kubectl create configmap locust-script --from-file=locustfile.py \
  --dry-run=client -o yaml | kubectl apply --validate=false -f -

envsubst < locust-k8s.yaml | kubectl apply --validate=false -f -
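
After the script runs, a quick sanity check can confirm the pieces came up (a sketch; the label selectors and service name are taken from the manifests added below):

```bash
./apply.sh
kubectl get pods -l app=openai-mock
kubectl get pods -l app=locust
kubectl get svc llama-stack-benchmark-service
```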

docs/source/distributions/k8s-benchmark/locust-k8s.yaml (new file), 131 lines

@ -0,0 +1,131 @@
|
|||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: locust-master
|
||||
labels:
|
||||
app: locust
|
||||
role: master
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: locust
|
||||
role: master
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: locust
|
||||
role: master
|
||||
spec:
|
||||
containers:
|
||||
- name: locust-master
|
||||
image: locustio/locust:2.31.8
|
||||
ports:
|
||||
- containerPort: 8089 # Web UI
|
||||
- containerPort: 5557 # Master communication
|
||||
env:
|
||||
- name: LOCUST_HOST
|
||||
value: "${LOCUST_HOST}"
|
||||
- name: LOCUST_LOCUSTFILE
|
||||
value: "/locust/locustfile.py"
|
||||
- name: LOCUST_WEB_HOST
|
||||
value: "0.0.0.0"
|
||||
- name: LOCUST_MASTER
|
||||
value: "true"
|
||||
- name: LOCUST_BASE_PATH
|
||||
value: "${LOCUST_BASE_PATH}"
|
||||
- name: INFERENCE_MODEL
|
||||
value: "${BENCHMARK_INFERENCE_MODEL}"
|
||||
volumeMounts:
|
||||
- name: locust-script
|
||||
mountPath: /locust
|
||||
command: ["locust"]
|
||||
args:
|
||||
- "--master"
|
||||
- "--web-host=0.0.0.0"
|
||||
- "--web-port=8089"
|
||||
- "--host=${LOCUST_HOST}"
|
||||
- "--locustfile=/locust/locustfile.py"
|
||||
volumes:
|
||||
- name: locust-script
|
||||
configMap:
|
||||
name: locust-script
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: locust-worker
|
||||
labels:
|
||||
app: locust
|
||||
role: worker
|
||||
spec:
|
||||
replicas: 2 # Start with 2 workers, can be scaled up
|
||||
selector:
|
||||
matchLabels:
|
||||
app: locust
|
||||
role: worker
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: locust
|
||||
role: worker
|
||||
spec:
|
||||
containers:
|
||||
- name: locust-worker
|
||||
image: locustio/locust:2.31.8
|
||||
env:
|
||||
- name: LOCUST_HOST
|
||||
value: "${LOCUST_HOST}"
|
||||
- name: LOCUST_LOCUSTFILE
|
||||
value: "/locust/locustfile.py"
|
||||
- name: LOCUST_MASTER_HOST
|
||||
value: "locust-master-service"
|
||||
- name: LOCUST_MASTER_PORT
|
||||
value: "5557"
|
||||
- name: INFERENCE_MODEL
|
||||
value: "${BENCHMARK_INFERENCE_MODEL}"
|
||||
- name: LOCUST_BASE_PATH
|
||||
value: "${LOCUST_BASE_PATH}"
|
||||
volumeMounts:
|
||||
- name: locust-script
|
||||
mountPath: /locust
|
||||
command: ["locust"]
|
||||
args:
|
||||
- "--worker"
|
||||
- "--master-host=locust-master-service"
|
||||
- "--master-port=5557"
|
||||
- "--locustfile=/locust/locustfile.py"
|
||||
volumes:
|
||||
- name: locust-script
|
||||
configMap:
|
||||
name: locust-script
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: locust-master-service
|
||||
spec:
|
||||
selector:
|
||||
app: locust
|
||||
role: master
|
||||
ports:
|
||||
- name: web-ui
|
||||
port: 8089
|
||||
targetPort: 8089
|
||||
- name: master-comm
|
||||
port: 5557
|
||||
targetPort: 5557
|
||||
type: ClusterIP
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: locust-web-ui
|
||||
spec:
|
||||
selector:
|
||||
app: locust
|
||||
role: master
|
||||
ports:
|
||||
- port: 8089
|
||||
targetPort: 8089
|
||||
type: ClusterIP # Keep internal, use port-forward to access
|

docs/source/distributions/k8s-benchmark/locustfile.py (new file), 78 lines

@@ -0,0 +1,78 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

"""
Locust load testing script for Llama Stack with Prism mock OpenAI provider.
"""

import random
from locust import HttpUser, task, between
import os

base_path = os.getenv("LOCUST_BASE_PATH", "/v1/openai/v1")

MODEL_ID = os.getenv("INFERENCE_MODEL")


class LlamaStackUser(HttpUser):
    wait_time = between(0.0, 0.0001)

    def on_start(self):
        """Setup authentication and test data."""
        # No auth required for benchmark server
        self.headers = {
            "Content-Type": "application/json"
        }

        # Test messages of varying lengths
        self.test_messages = [
            [{"role": "user", "content": "Hi"}],
            [{"role": "user", "content": "What is the capital of France?"}],
            [{"role": "user", "content": "Explain quantum physics in simple terms."}],
            [{"role": "user", "content": "Write a short story about a robot learning to paint."}],
            [
                {"role": "user", "content": "What is machine learning?"},
                {"role": "assistant", "content": "Machine learning is a subset of AI..."},
                {"role": "user", "content": "Can you give me a practical example?"}
            ]
        ]

    @task(weight=100)
    def chat_completion_streaming(self):
        """Test streaming chat completion (the only task, so it serves every request)."""
        messages = random.choice(self.test_messages)
        payload = {
            "model": MODEL_ID,
            "messages": messages,
            "stream": True,
            "max_tokens": 100
        }

        with self.client.post(
            f"{base_path}/chat/completions",
            headers=self.headers,
            json=payload,
            stream=True,
            catch_response=True
        ) as response:
            if response.status_code == 200:
                chunks_received = 0
                try:
                    for line in response.iter_lines():
                        if line:
                            line_str = line.decode('utf-8')
                            if line_str.startswith('data: '):
                                chunks_received += 1
                                if line_str.strip() == 'data: [DONE]':
                                    break

                    if chunks_received > 0:
                        response.success()
                    else:
                        response.failure("No streaming chunks received")
                except Exception as e:
                    response.failure(f"Streaming error: {e}")
            else:
                response.failure(f"HTTP {response.status_code}: {response.text}")
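
The same locustfile can be exercised without Kubernetes (a sketch; assumes `locust` is installed locally and the benchmark server is port-forwarded to localhost:8323):

```bash
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export LOCUST_BASE_PATH=/v1/openai/v1
locust -f locustfile.py --host http://localhost:8323 --headless -u 10 -r 2 -t 1m
```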
|
@ -0,0 +1,52 @@
|
|||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: openai-mock
|
||||
labels:
|
||||
app: openai-mock
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: openai-mock
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: openai-mock
|
||||
spec:
|
||||
containers:
|
||||
- name: openai-mock
|
||||
image: python:3.12-slim
|
||||
ports:
|
||||
- containerPort: ${MOCK_INFERENCE_PORT}
|
||||
env:
|
||||
- name: PORT
|
||||
value: "${MOCK_INFERENCE_PORT}"
|
||||
- name: MOCK_MODELS
|
||||
value: "${MOCK_INFERENCE_MODEL}"
|
||||
- name: STREAM_DELAY_SECONDS
|
||||
value: "${STREAM_DELAY_SECONDS}"
|
||||
command: ["sh", "-c"]
|
||||
args:
|
||||
- |
|
||||
pip install flask &&
|
||||
python /app/openai-mock-server.py --port ${MOCK_INFERENCE_PORT}
|
||||
volumeMounts:
|
||||
- name: openai-mock-script
|
||||
mountPath: /app
|
||||
volumes:
|
||||
- name: openai-mock-script
|
||||
configMap:
|
||||
name: openai-mock
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: openai-mock-service
|
||||
spec:
|
||||
selector:
|
||||
app: openai-mock
|
||||
ports:
|
||||
- port: 8080
|
||||
targetPort: 8080
|
||||
type: ClusterIP
|

docs/source/distributions/k8s-benchmark/openai-mock-server.py (new file), 190 lines

@ -0,0 +1,190 @@
|
|||
#!/usr/bin/env python3
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
"""
|
||||
OpenAI-compatible mock server that returns:
|
||||
- Hardcoded /models response for consistent validation
|
||||
- Valid OpenAI-formatted chat completion responses with dynamic content
|
||||
"""
|
||||
|
||||
from flask import Flask, request, jsonify, Response
|
||||
import time
|
||||
import random
|
||||
import uuid
|
||||
import json
|
||||
import argparse
|
||||
import os
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
# Models from environment variables
|
||||
def get_models():
|
||||
models_str = os.getenv("MOCK_MODELS", "mock-inference")
|
||||
model_ids = [m.strip() for m in models_str.split(",") if m.strip()]
|
||||
|
||||
return {
|
||||
"object": "list",
|
||||
"data": [
|
||||
{
|
||||
"id": model_id,
|
||||
"object": "model",
|
||||
"created": 1234567890,
|
||||
"owned_by": "vllm"
|
||||
}
|
||||
for model_id in model_ids
|
||||
]
|
||||
}
|
||||
|
||||
def generate_random_text(length=50):
|
||||
"""Generate random but coherent text for responses."""
|
||||
words = [
|
||||
"Hello", "there", "I'm", "an", "AI", "assistant", "ready", "to", "help", "you",
|
||||
"with", "your", "questions", "and", "tasks", "today", "Let", "me","know", "what",
|
||||
"you'd", "like", "to", "discuss", "or", "explore", "together", "I", "can", "assist",
|
||||
"with", "various", "topics", "including", "coding", "writing", "analysis", "and", "more"
|
||||
]
|
||||
return " ".join(random.choices(words, k=length))
|
||||
|
||||
@app.route('/models', methods=['GET'])
|
||||
def list_models():
|
||||
models = get_models()
|
||||
print(f"[MOCK] Returning models: {[m['id'] for m in models['data']]}")
|
||||
return jsonify(models)
|
||||
|
||||
@app.route('/chat/completions', methods=['POST'])
|
||||
def chat_completions():
|
||||
"""Return OpenAI-formatted chat completion responses."""
|
||||
data = request.get_json()
|
||||
default_model = get_models()['data'][0]['id']
|
||||
model = data.get('model', default_model)
|
||||
messages = data.get('messages', [])
|
||||
stream = data.get('stream', False)
|
||||
|
||||
print(f"[MOCK] Chat completion request - model: {model}, stream: {stream}")
|
||||
|
||||
if stream:
|
||||
return handle_streaming_completion(model, messages)
|
||||
else:
|
||||
return handle_non_streaming_completion(model, messages)
|
||||
|
||||
def handle_non_streaming_completion(model, messages):
|
||||
response_text = generate_random_text(random.randint(20, 80))
|
||||
|
||||
# Calculate realistic token counts
|
||||
prompt_tokens = sum(len(str(msg.get('content', '')).split()) for msg in messages)
|
||||
completion_tokens = len(response_text.split())
|
||||
|
||||
response = {
|
||||
"id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
|
||||
"object": "chat.completion",
|
||||
"created": int(time.time()),
|
||||
"model": model,
|
||||
"choices": [
|
||||
{
|
||||
"index": 0,
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
"content": response_text
|
||||
},
|
||||
"finish_reason": "stop"
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
"prompt_tokens": prompt_tokens,
|
||||
"completion_tokens": completion_tokens,
|
||||
"total_tokens": prompt_tokens + completion_tokens
|
||||
}
|
||||
}
|
||||
|
||||
return jsonify(response)
|
||||
|
||||
def handle_streaming_completion(model, messages):
|
||||
def generate_stream():
|
||||
# Generate response text
|
||||
full_response = generate_random_text(random.randint(30, 100))
|
||||
words = full_response.split()
|
||||
|
||||
# Send initial chunk
|
||||
initial_chunk = {
|
||||
"id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
|
||||
"object": "chat.completion.chunk",
|
||||
"created": int(time.time()),
|
||||
"model": model,
|
||||
"choices": [
|
||||
{
|
||||
"index": 0,
|
||||
"delta": {"role": "assistant", "content": ""}
|
||||
}
|
||||
]
|
||||
}
|
||||
yield f"data: {json.dumps(initial_chunk)}\n\n"
|
||||
|
||||
# Send word by word
|
||||
for i, word in enumerate(words):
|
||||
chunk = {
|
||||
"id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
|
||||
"object": "chat.completion.chunk",
|
||||
"created": int(time.time()),
|
||||
"model": model,
|
||||
"choices": [
|
||||
{
|
||||
"index": 0,
|
||||
"delta": {"content": f"{word} " if i < len(words) - 1 else word}
|
||||
}
|
||||
]
|
||||
}
|
||||
yield f"data: {json.dumps(chunk)}\n\n"
|
||||
# Configurable delay to simulate realistic streaming
|
||||
stream_delay = float(os.getenv("STREAM_DELAY_SECONDS", "0.005"))
|
||||
time.sleep(stream_delay)
|
||||
|
||||
# Send final chunk
|
||||
final_chunk = {
|
||||
"id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
|
||||
"object": "chat.completion.chunk",
|
||||
"created": int(time.time()),
|
||||
"model": model,
|
||||
"choices": [
|
||||
{
|
||||
"index": 0,
|
||||
"delta": {"content": ""},
|
||||
"finish_reason": "stop"
|
||||
}
|
||||
]
|
||||
}
|
||||
yield f"data: {json.dumps(final_chunk)}\n\n"
|
||||
yield "data: [DONE]\n\n"
|
||||
|
||||
return Response(
|
||||
generate_stream(),
|
||||
mimetype='text/event-stream',
|
||||
headers={
|
||||
'Cache-Control': 'no-cache',
|
||||
'Connection': 'keep-alive',
|
||||
'Access-Control-Allow-Origin': '*',
|
||||
}
|
||||
)
|
||||
|
||||
@app.route('/health', methods=['GET'])
|
||||
def health():
|
||||
return jsonify({"status": "healthy", "type": "openai-mock"})
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description='OpenAI-compatible mock server')
|
||||
parser.add_argument('--port', type=int, default=8081,
|
||||
help='Port to run the server on (default: 8081)')
|
||||
args = parser.parse_args()
|
||||
|
||||
port = args.port
|
||||
|
||||
models = get_models()
|
||||
print("Starting OpenAI-compatible mock server...")
|
||||
print(f"- /models endpoint with: {[m['id'] for m in models['data']]}")
|
||||
print("- OpenAI-formatted chat/completion responses with dynamic content")
|
||||
print("- Streaming support with valid SSE format")
|
||||
print(f"- Listening on: http://0.0.0.0:{port}")
|
||||
app.run(host='0.0.0.0', port=port, debug=False)
|
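
To sanity-check the mock server outside the cluster (a sketch; assumes Flask is installed and port 8080 is free; the model name is the default from `MOCK_MODELS`):

```bash
python openai-mock-server.py --port 8080 &
curl -s http://localhost:8080/models
curl -s http://localhost:8080/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{"model": "mock-inference", "messages": [{"role": "user", "content": "Hi"}]}'
```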

docs/source/distributions/k8s-benchmark/stack-configmap.yaml (new file), 143 lines

@ -0,0 +1,143 @@
|
|||
apiVersion: v1
|
||||
data:
|
||||
stack_run_config.yaml: |
|
||||
version: '2'
|
||||
image_name: kubernetes-benchmark-demo
|
||||
apis:
|
||||
- agents
|
||||
- inference
|
||||
- safety
|
||||
- telemetry
|
||||
- tool_runtime
|
||||
- vector_io
|
||||
providers:
|
||||
inference:
|
||||
- provider_id: vllm-inference
|
||||
provider_type: remote::vllm
|
||||
config:
|
||||
url: ${env.VLLM_URL:=http://localhost:8000/v1}
|
||||
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
|
||||
api_token: ${env.VLLM_API_TOKEN:=fake}
|
||||
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
|
||||
- provider_id: vllm-safety
|
||||
provider_type: remote::vllm
|
||||
config:
|
||||
url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
|
||||
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
|
||||
api_token: ${env.VLLM_API_TOKEN:=fake}
|
||||
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
|
||||
- provider_id: mock-vllm-inference
|
||||
provider_type: remote::vllm
|
||||
config:
|
||||
url: http://openai-mock-service:${env.MOCK_INFERENCE_PORT}
|
||||
max_tokens: 4096
|
||||
api_token: fake
|
||||
tls_verify: false
|
||||
- provider_id: sentence-transformers
|
||||
provider_type: inline::sentence-transformers
|
||||
config: {}
|
||||
vector_io:
|
||||
- provider_id: ${env.ENABLE_CHROMADB:+chromadb}
|
||||
provider_type: remote::chromadb
|
||||
config:
|
||||
url: ${env.CHROMADB_URL:=}
|
||||
kvstore:
|
||||
type: postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
safety:
|
||||
- provider_id: llama-guard
|
||||
provider_type: inline::llama-guard
|
||||
config:
|
||||
excluded_categories: []
|
||||
agents:
|
||||
- provider_id: meta-reference
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
persistence_store:
|
||||
type: postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
responses_store:
|
||||
type: postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
telemetry:
|
||||
- provider_id: meta-reference
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
|
||||
sinks: ${env.TELEMETRY_SINKS:=console}
|
||||
tool_runtime:
|
||||
- provider_id: brave-search
|
||||
provider_type: remote::brave-search
|
||||
config:
|
||||
api_key: ${env.BRAVE_SEARCH_API_KEY:+}
|
||||
max_results: 3
|
||||
- provider_id: tavily-search
|
||||
provider_type: remote::tavily-search
|
||||
config:
|
||||
api_key: ${env.TAVILY_SEARCH_API_KEY:+}
|
||||
max_results: 3
|
||||
- provider_id: rag-runtime
|
||||
provider_type: inline::rag-runtime
|
||||
config: {}
|
||||
- provider_id: model-context-protocol
|
||||
provider_type: remote::model-context-protocol
|
||||
config: {}
|
||||
metadata_store:
|
||||
type: postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
table_name: llamastack_kvstore
|
||||
inference_store:
|
||||
type: postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
models:
|
||||
- metadata:
|
||||
embedding_dimension: 384
|
||||
model_id: all-MiniLM-L6-v2
|
||||
provider_id: sentence-transformers
|
||||
model_type: embedding
|
||||
- model_id: ${env.INFERENCE_MODEL}
|
||||
provider_id: vllm-inference
|
||||
model_type: llm
|
||||
- model_id: ${env.SAFETY_MODEL}
|
||||
provider_id: vllm-safety
|
||||
model_type: llm
|
||||
- model_id: ${env.MOCK_INFERENCE_MODEL}
|
||||
provider_id: mock-vllm-inference
|
||||
model_type: llm
|
||||
shields:
|
||||
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
|
||||
vector_dbs: []
|
||||
datasets: []
|
||||
scoring_fns: []
|
||||
benchmarks: []
|
||||
tool_groups:
|
||||
- toolgroup_id: builtin::websearch
|
||||
provider_id: tavily-search
|
||||
- toolgroup_id: builtin::rag
|
||||
provider_id: rag-runtime
|
||||
server:
|
||||
port: 8323
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
creationTimestamp: null
|
||||
name: llama-stack-config
|
|
@ -0,0 +1,87 @@
|
|||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: llama-benchmark-pvc
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 1Gi
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: llama-stack-benchmark-server
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: llama-stack-benchmark
|
||||
app.kubernetes.io/component: server
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: llama-stack-benchmark
|
||||
app.kubernetes.io/component: server
|
||||
spec:
|
||||
containers:
|
||||
- name: llama-stack-benchmark
|
||||
image: llamastack/distribution-starter:latest
|
||||
imagePullPolicy: Always # since we have specified latest instead of a version
|
||||
env:
|
||||
- name: ENABLE_CHROMADB
|
||||
value: "true"
|
||||
- name: CHROMADB_URL
|
||||
value: http://chromadb.default.svc.cluster.local:6000
|
||||
- name: POSTGRES_HOST
|
||||
value: postgres-server.default.svc.cluster.local
|
||||
- name: POSTGRES_PORT
|
||||
value: "5432"
|
||||
- name: INFERENCE_MODEL
|
||||
value: "${INFERENCE_MODEL}"
|
||||
- name: SAFETY_MODEL
|
||||
value: "${SAFETY_MODEL}"
|
||||
- name: TAVILY_SEARCH_API_KEY
|
||||
value: "${TAVILY_SEARCH_API_KEY}"
|
||||
- name: MOCK_INFERENCE_PORT
|
||||
value: "${MOCK_INFERENCE_PORT}"
|
||||
- name: VLLM_URL
|
||||
value: http://vllm-server.default.svc.cluster.local:8000/v1
|
||||
- name: VLLM_MAX_TOKENS
|
||||
value: "3072"
|
||||
- name: VLLM_SAFETY_URL
|
||||
value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
|
||||
- name: VLLM_TLS_VERIFY
|
||||
value: "false"
|
||||
- name: MOCK_INFERENCE_MODEL
|
||||
value: "${MOCK_INFERENCE_MODEL}"
|
||||
command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8323"]
|
||||
ports:
|
||||
- containerPort: 8323
|
||||
volumeMounts:
|
||||
- name: llama-storage
|
||||
mountPath: /root/.llama
|
||||
- name: llama-config
|
||||
mountPath: /etc/config
|
||||
volumes:
|
||||
- name: llama-storage
|
||||
persistentVolumeClaim:
|
||||
claimName: llama-benchmark-pvc
|
||||
- name: llama-config
|
||||
configMap:
|
||||
name: llama-stack-config
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: llama-stack-benchmark-service
|
||||
spec:
|
||||
selector:
|
||||
app.kubernetes.io/name: llama-stack-benchmark
|
||||
app.kubernetes.io/component: server
|
||||
ports:
|
||||
- name: http
|
||||
port: 8323
|
||||
targetPort: 8323
|
||||
type: ClusterIP
|

docs/source/distributions/k8s-benchmark/stack_run_config.yaml (new file), 136 lines

@ -0,0 +1,136 @@
|
|||
version: '2'
|
||||
image_name: kubernetes-benchmark-demo
|
||||
apis:
|
||||
- agents
|
||||
- inference
|
||||
- safety
|
||||
- telemetry
|
||||
- tool_runtime
|
||||
- vector_io
|
||||
providers:
|
||||
inference:
|
||||
- provider_id: vllm-inference
|
||||
provider_type: remote::vllm
|
||||
config:
|
||||
url: ${env.VLLM_URL:=http://localhost:8000/v1}
|
||||
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
|
||||
api_token: ${env.VLLM_API_TOKEN:=fake}
|
||||
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
|
||||
- provider_id: vllm-safety
|
||||
provider_type: remote::vllm
|
||||
config:
|
||||
url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
|
||||
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
|
||||
api_token: ${env.VLLM_API_TOKEN:=fake}
|
||||
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
|
||||
- provider_id: mock-vllm-inference
|
||||
provider_type: remote::vllm
|
||||
config:
|
||||
url: http://openai-mock-service:${env.MOCK_INFERENCE_PORT}
|
||||
max_tokens: 4096
|
||||
api_token: fake
|
||||
tls_verify: false
|
||||
- provider_id: sentence-transformers
|
||||
provider_type: inline::sentence-transformers
|
||||
config: {}
|
||||
vector_io:
|
||||
- provider_id: ${env.ENABLE_CHROMADB:+chromadb}
|
||||
provider_type: remote::chromadb
|
||||
config:
|
||||
url: ${env.CHROMADB_URL:=}
|
||||
kvstore:
|
||||
type: postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
safety:
|
||||
- provider_id: llama-guard
|
||||
provider_type: inline::llama-guard
|
||||
config:
|
||||
excluded_categories: []
|
||||
agents:
|
||||
- provider_id: meta-reference
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
persistence_store:
|
||||
type: postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
responses_store:
|
||||
type: postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
telemetry:
|
||||
- provider_id: meta-reference
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
|
||||
sinks: ${env.TELEMETRY_SINKS:=console}
|
||||
tool_runtime:
|
||||
- provider_id: brave-search
|
||||
provider_type: remote::brave-search
|
||||
config:
|
||||
api_key: ${env.BRAVE_SEARCH_API_KEY:+}
|
||||
max_results: 3
|
||||
- provider_id: tavily-search
|
||||
provider_type: remote::tavily-search
|
||||
config:
|
||||
api_key: ${env.TAVILY_SEARCH_API_KEY:+}
|
||||
max_results: 3
|
||||
- provider_id: rag-runtime
|
||||
provider_type: inline::rag-runtime
|
||||
config: {}
|
||||
- provider_id: model-context-protocol
|
||||
provider_type: remote::model-context-protocol
|
||||
config: {}
|
||||
metadata_store:
|
||||
type: postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
table_name: llamastack_kvstore
|
||||
inference_store:
|
||||
type: postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
models:
|
||||
- metadata:
|
||||
embedding_dimension: 384
|
||||
model_id: all-MiniLM-L6-v2
|
||||
provider_id: sentence-transformers
|
||||
model_type: embedding
|
||||
- model_id: ${env.INFERENCE_MODEL}
|
||||
provider_id: vllm-inference
|
||||
model_type: llm
|
||||
- model_id: ${env.SAFETY_MODEL}
|
||||
provider_id: vllm-safety
|
||||
model_type: llm
|
||||
- model_id: ${env.MOCK_INFERENCE_MODEL}
|
||||
provider_id: mock-vllm-inference
|
||||
model_type: llm
|
||||
shields:
|
||||
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
|
||||
vector_dbs: []
|
||||
datasets: []
|
||||
scoring_fns: []
|
||||
benchmarks: []
|
||||
tool_groups:
|
||||
- toolgroup_id: builtin::websearch
|
||||
provider_id: tavily-search
|
||||
- toolgroup_id: builtin::rag
|
||||
provider_id: rag-runtime
|
||||
server:
|
||||
port: 8323
|
|
@@ -40,19 +40,19 @@ spec:
          value: "3072"
        - name: VLLM_SAFETY_URL
          value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
        - name: VLLM_TLS_VERIFY
          value: "false"
        - name: POSTGRES_HOST
          value: postgres-server.default.svc.cluster.local
        - name: POSTGRES_PORT
          value: "5432"
        - name: VLLM_TLS_VERIFY
          value: "false"
        - name: INFERENCE_MODEL
          value: "${INFERENCE_MODEL}"
        - name: SAFETY_MODEL
          value: "${SAFETY_MODEL}"
        - name: TAVILY_SEARCH_API_KEY
          value: "${TAVILY_SEARCH_API_KEY}"
        command: ["python", "-m", "llama_stack.core.server.server", "--config", "/etc/config/stack_run_config.yaml", "--port", "8321"]
        command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8321"]
        ports:
          - containerPort: 8321
        volumeMounts:

@@ -226,7 +226,7 @@ uv init
name = "llama-stack-provider-ollama"
version = "0.1.0"
description = "Ollama provider for Llama Stack"
requires-python = ">=3.10"
requires-python = ">=3.12"
dependencies = ["llama-stack", "pydantic", "ollama", "aiohttp"]
```

@@ -21,5 +21,7 @@ kvstore:

## Deprecation Notice

⚠️ **Warning**: Please use the `inline::faiss` provider instead.
```{warning}
Please use the `inline::faiss` provider instead.
```

@@ -25,5 +25,7 @@ kvstore:

## Deprecation Notice

⚠️ **Warning**: Please use the `inline::sqlite-vec` provider (notice the hyphen instead of underscore) instead.
```{warning}
Please use the `inline::sqlite-vec` provider (notice the hyphen instead of underscore) instead.
```

@@ -204,7 +204,10 @@ For more details on TLS configuration, refer to the [TLS setup guide](https://mi
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend |
| `config` | `dict` | No | {} | This configuration allows additional fields to be passed through to the underlying Milvus client. See the [Milvus](https://milvus.io/docs/install-overview.md) documentation for more details about Milvus in general. |

> **Note**: This configuration class accepts additional fields beyond those listed above. You can pass any additional configuration options that will be forwarded to the underlying provider.
```{note}
This configuration class accepts additional fields beyond those listed above. You can pass any additional configuration options that will be forwarded to the underlying provider.
```

## Sample Configuration

@@ -128,7 +128,9 @@ llama download --source huggingface --model-id Prompt-Guard-86M --ignore-pattern

**Important:** Set your environment variable `HF_TOKEN` or pass in `--hf-token` to the command to validate your access. You can find your token at [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens).

> **Tip:** Default for `llama download` is to run with `--ignore-patterns *.safetensors` since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored.
```{tip}
The default for `llama download` is to run with `--ignore-patterns *.safetensors`, since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored.
```
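
For example, to pull one of these models with safetensors included (illustrative; uses the model ID shown above and assumes `HF_TOKEN` is set):

```bash
llama download --source huggingface --model-id Prompt-Guard-86M --ignore-patterns original
```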

## List the downloaded models

@@ -152,7 +152,9 @@ llama download --source huggingface --model-id Prompt-Guard-86M --ignore-pattern

**Important:** Set your environment variable `HF_TOKEN` or pass in `--hf-token` to the command to validate your access. You can find your token at [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens).

> **Tip:** Default for `llama download` is to run with `--ignore-patterns *.safetensors` since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored.
```{tip}
The default for `llama download` is to run with `--ignore-patterns *.safetensors`, since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored.
```

## List the downloaded models