mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-08-02 00:34:44 +00:00
Merge branch 'main' into pr2088
This commit is contained in:
commit
57b5449e85
12 changed files with 2341 additions and 2217 deletions
2
.github/workflows/providers-build.yml
vendored
2
.github/workflows/providers-build.yml
vendored
|
@ -153,7 +153,7 @@ jobs:
|
|||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0
|
||||
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
|
||||
with:
|
||||
python-version: '3.10'
|
||||
|
||||
|
|
|
@ -141,11 +141,18 @@ uv sync
|
|||
|
||||
## Coding Style
|
||||
|
||||
* Comments should provide meaningful insights into the code. Avoid filler comments that simply describe the next step, as they create unnecessary clutter, same goes for docstrings.
|
||||
* Prefer comments to clarify surprising behavior and/or relationships between parts of the code rather than explain what the next line of code does.
|
||||
* Catching exceptions, prefer using a specific exception type rather than a broad catch-all like `Exception`.
|
||||
* Comments should provide meaningful insights into the code. Avoid filler comments that simply
|
||||
describe the next step, as they create unnecessary clutter; the same goes for docstrings.
|
||||
* Prefer comments to clarify surprising behavior and/or relationships between parts of the code
|
||||
rather than explain what the next line of code does.
|
||||
* When catching exceptions, prefer a specific exception type rather than a broad catch-all like
|
||||
`Exception`.
|
||||
* Error messages should be prefixed with "Failed to ..."
|
||||
* 4 spaces for indentation rather than tabs
|
||||
* 4 spaces for indentation rather than tabs
|
||||
* When using `# noqa` to suppress a style or linter warning, include a comment explaining the
|
||||
justification for bypassing the check.
|
||||
* When using `# type: ignore` to suppress a mypy warning, include a comment explaining the
|
||||
justification for bypassing the check.
|
||||
|
||||
## Common Tasks
|
||||
|
||||
|
|
6
docs/_static/css/my_theme.css
vendored
6
docs/_static/css/my_theme.css
vendored
|
@ -27,3 +27,9 @@ pre {
|
|||
white-space: pre-wrap !important;
|
||||
word-break: break-all;
|
||||
}
|
||||
|
||||
[data-theme="dark"] .mermaid {
|
||||
background-color: #f4f4f6 !important;
|
||||
border-radius: 6px;
|
||||
padding: 0.5em;
|
||||
}
|
||||
|
|
|
@ -53,6 +53,7 @@ Here's a list of known external providers that you can use with Llama Stack:
|
|||
| Name | Description | API | Type | Repository |
|
||||
|------|-------------|-----|------|------------|
|
||||
| KubeFlow Training | Train models with KubeFlow | Post Training | Remote | [llama-stack-provider-kft](https://github.com/opendatahub-io/llama-stack-provider-kft) |
|
||||
| KubeFlow Pipelines | Train models with KubeFlow Pipelines | Post Training | Remote | [llama-stack-provider-kfp-trainer](https://github.com/opendatahub-io/llama-stack-provider-kfp-trainer) |
|
||||
| RamaLama | Inference models with RamaLama | Inference | Remote | [ramalama-stack](https://github.com/containers/ramalama-stack) |
|
||||
|
||||
### Remote Provider Specification
|
||||
|
|
107
install.sh
107
install.sh
|
@ -16,61 +16,120 @@ WAIT_TIMEOUT=300
|
|||
# Print all arguments, joined into one string, as a bold-green line on stdout.
log() {
  printf "\e[1;32m%s\e[0m\n" "$*"
}
|
||||
# Print all arguments as one bold-red "❌ …" line on stderr, then exit the
# shell with status 1.
die() {
  printf "\e[1;31m❌ %s\e[0m\n" "$*" >&2
  exit 1
}
|
||||
|
||||
# Poll an HTTP endpoint until its response body matches a pattern.
#
# Arguments:
#   $1 - URL to fetch
#   $2 - grep pattern that must appear in the response body
#   $3 - overall deadline in seconds
#   $4 - human-readable service name used in the progress message
# Outputs:
#   A "Waiting for …" line via log, then one '.' per failed poll on stdout,
#   followed by a newline if any dots were printed.
# Returns:
#   0 once the pattern is seen, 1 if the deadline passes first.
wait_for_service() {
  local url="$1"
  local pattern="$2"
  local timeout="$3"
  local name="$4"
  local start now
  local dots=0 status=0

  log "⏳ Waiting for ${name}…"
  start=$(date +%s)
  while true; do
    # --connect-timeout/--max-time bound every single attempt: without them a
    # connection that is accepted but never answered blocks curl indefinitely
    # and the deadline check below is never reached.
    if curl --retry 5 --retry-delay 1 --retry-max-time "$timeout" --retry-all-errors \
      --connect-timeout 5 --max-time 10 \
      --silent --fail "$url" 2>/dev/null | grep -q "$pattern"; then
      break
    fi
    now=$(date +%s)
    if (( now - start >= timeout )); then
      status=1
      break
    fi
    printf '.'
    dots=1
    sleep 1
  done

  # Terminate the progress-dot line so the caller's next message starts on a
  # fresh line instead of being appended to the dots.
  if (( dots )); then
    printf '\n'
  fi
  return "$status"
}
|
||||
|
||||
if command -v docker &> /dev/null; then
|
||||
ENGINE="docker"
|
||||
HOST_DNS="host.docker.internal"
|
||||
elif command -v podman &> /dev/null; then
|
||||
ENGINE="podman"
|
||||
HOST_DNS="host.containers.internal"
|
||||
else
|
||||
die "Docker or Podman is required. Install Docker: https://docs.docker.com/get-docker/ or Podman: https://podman.io/getting-started/installation"
|
||||
fi
|
||||
|
||||
# Explicitly set the platform for the host architecture
|
||||
HOST_ARCH="$(uname -m)"
|
||||
if [ "$HOST_ARCH" = "arm64" ]; then
|
||||
if [ "$ENGINE" = "docker" ]; then
|
||||
PLATFORM_OPTS=( --platform linux/amd64 )
|
||||
else
|
||||
PLATFORM_OPTS=( --os linux --arch amd64 )
|
||||
fi
|
||||
else
|
||||
PLATFORM_OPTS=()
|
||||
fi
|
||||
|
||||
# macOS + Podman: ensure VM is running before we try to launch containers
|
||||
# If you need GPU passthrough under Podman on macOS, init the VM with libkrun:
|
||||
# CONTAINERS_MACHINE_PROVIDER=libkrun podman machine init
|
||||
if [ "$ENGINE" = "podman" ] && [ "$(uname -s)" = "Darwin" ]; then
|
||||
if ! podman info &>/dev/null; then
|
||||
log "⌛️ Initializing Podman VM…"
|
||||
podman machine init &>/dev/null || true
|
||||
podman machine start &>/dev/null || true
|
||||
|
||||
log "⌛️ Waiting for Podman API…"
|
||||
until podman info &>/dev/null; do
|
||||
sleep 1
|
||||
done
|
||||
log "✅ Podman VM is up"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Clean up any leftovers from earlier runs
|
||||
for name in ollama-server llama-stack; do
|
||||
ids=$($ENGINE ps -aq --filter "name=^${name}$")
|
||||
if [ -n "$ids" ]; then
|
||||
log "⚠️ Found existing container(s) for '${name}', removing..."
|
||||
$ENGINE rm -f "$ids"
|
||||
log "⚠️ Found existing container(s) for '${name}', removing…"
|
||||
$ENGINE rm -f "$ids" > /dev/null 2>&1
|
||||
fi
|
||||
done
|
||||
|
||||
###############################################################################
|
||||
# 0. Create a shared network
|
||||
###############################################################################
|
||||
if ! $ENGINE network inspect llama-net >/dev/null 2>&1; then
|
||||
log "🌐 Creating network…"
|
||||
$ENGINE network create llama-net >/dev/null 2>&1
|
||||
fi
|
||||
|
||||
###############################################################################
|
||||
# 1. Ollama
|
||||
###############################################################################
|
||||
log "🦙 Starting Ollama…"
|
||||
$ENGINE run -d --name ollama-server \
|
||||
-p "${OLLAMA_PORT}:11434" \
|
||||
# Publish the host-side ${OLLAMA_PORT} onto Ollama's in-container port 11434.
# Mapping ${OLLAMA_PORT}:${OLLAMA_PORT} only works when the override happens to
# equal 11434, because nothing here tells the daemon to listen elsewhere.
$ENGINE run -d "${PLATFORM_OPTS[@]}" --name ollama-server \
  --network llama-net \
  -p "${OLLAMA_PORT}:11434" \
  ollama/ollama > /dev/null 2>&1
|
||||
|
||||
log "⏳ Waiting for Ollama daemon…"
|
||||
if ! timeout "$WAIT_TIMEOUT" bash -c \
|
||||
"until curl -fsS http://localhost:${OLLAMA_PORT}/ 2>/dev/null | grep -q 'Ollama'; do sleep 1; done"; then
|
||||
if ! wait_for_service "http://localhost:${OLLAMA_PORT}/" "Ollama" "$WAIT_TIMEOUT" "Ollama daemon"; then
|
||||
log "❌ Ollama daemon did not become ready in ${WAIT_TIMEOUT}s; dumping container logs:"
|
||||
$ENGINE logs ollama-server --tail=200
|
||||
$ENGINE logs --tail 200 ollama-server
|
||||
die "Ollama startup failed"
|
||||
fi
|
||||
|
||||
log "📦 Ensuring model is pulled: ${MODEL_ALIAS}..."
|
||||
$ENGINE exec ollama-server ollama pull "${MODEL_ALIAS}" > /dev/null 2>&1
|
||||
log "📦 Ensuring model is pulled: ${MODEL_ALIAS}…"
|
||||
if ! $ENGINE exec ollama-server ollama pull "${MODEL_ALIAS}" > /dev/null 2>&1; then
|
||||
log "❌ Failed to pull model ${MODEL_ALIAS}; dumping container logs:"
|
||||
$ENGINE logs --tail 200 ollama-server
|
||||
die "Model pull failed"
|
||||
fi
|
||||
|
||||
###############################################################################
|
||||
# 2. Llama‑Stack
|
||||
###############################################################################
|
||||
log "🦙📦 Starting Llama‑Stack…"
|
||||
$ENGINE run -d --name llama-stack \
|
||||
-p "${PORT}:${PORT}" \
|
||||
--add-host="${HOST_DNS}:host-gateway" \
|
||||
"${SERVER_IMAGE}" \
|
||||
--port "${PORT}" \
|
||||
--env INFERENCE_MODEL="${MODEL_ALIAS}" \
|
||||
--env OLLAMA_URL="http://${HOST_DNS}:${OLLAMA_PORT}" > /dev/null 2>&1
|
||||
# Arguments for "$ENGINE run" that start the Llama-Stack container.
# Containers on llama-net reach Ollama by container name on its in-container
# port (11434); the host-published ${OLLAMA_PORT} is not what applies to
# container-to-container traffic on the shared network.
cmd=( run -d "${PLATFORM_OPTS[@]}" --name llama-stack \
  --network llama-net \
  -p "${PORT}:${PORT}" \
  "${SERVER_IMAGE}" --port "${PORT}" \
  --env INFERENCE_MODEL="${MODEL_ALIAS}" \
  --env OLLAMA_URL="http://ollama-server:11434" )
|
||||
|
||||
log "⏳ Waiting for Llama-Stack API…"
|
||||
if ! timeout "$WAIT_TIMEOUT" bash -c \
|
||||
"until curl -fsS http://localhost:${PORT}/v1/health 2>/dev/null | grep -q 'OK'; do sleep 1; done"; then
|
||||
log "🦙 Starting Llama‑Stack…"
|
||||
$ENGINE "${cmd[@]}" > /dev/null 2>&1
|
||||
|
||||
if ! wait_for_service "http://127.0.0.1:${PORT}/v1/health" "OK" "$WAIT_TIMEOUT" "Llama-Stack API"; then
|
||||
log "❌ Llama-Stack did not become ready in ${WAIT_TIMEOUT}s; dumping container logs:"
|
||||
$ENGINE logs llama-stack --tail=200
|
||||
$ENGINE logs --tail 200 llama-stack
|
||||
die "Llama-Stack startup failed"
|
||||
fi
|
||||
|
||||
|
|
|
@ -108,6 +108,7 @@ from llama_stack.apis.inference.inference import (
|
|||
OpenAIChatCompletion,
|
||||
OpenAICompletion,
|
||||
OpenAICompletionChoice,
|
||||
OpenAIMessageParam,
|
||||
OpenAIResponseFormatParam,
|
||||
ToolConfig,
|
||||
)
|
||||
|
@ -987,7 +988,7 @@ def _convert_openai_sampling_params(
|
|||
|
||||
|
||||
def openai_messages_to_messages(
|
||||
messages: list[OpenAIChatCompletionMessage],
|
||||
messages: list[OpenAIMessageParam],
|
||||
) -> list[Message]:
|
||||
"""
|
||||
Convert a list of OpenAIMessageParam into a list of Message.
|
||||
|
@ -995,12 +996,12 @@ def openai_messages_to_messages(
|
|||
converted_messages = []
|
||||
for message in messages:
|
||||
if message.role == "system":
|
||||
converted_message = SystemMessage(content=message.content)
|
||||
converted_message = SystemMessage(content=openai_content_to_content(message.content))
|
||||
elif message.role == "user":
|
||||
converted_message = UserMessage(content=openai_content_to_content(message.content))
|
||||
elif message.role == "assistant":
|
||||
converted_message = CompletionMessage(
|
||||
content=message.content,
|
||||
content=openai_content_to_content(message.content),
|
||||
tool_calls=_convert_openai_tool_calls(message.tool_calls),
|
||||
stop_reason=StopReason.end_of_turn,
|
||||
)
|
||||
|
@ -1331,7 +1332,7 @@ class OpenAIChatCompletionToLlamaStackMixin:
|
|||
async def openai_chat_completion(
|
||||
self,
|
||||
model: str,
|
||||
messages: list[OpenAIChatCompletionMessage],
|
||||
messages: list[OpenAIMessageParam],
|
||||
frequency_penalty: float | None = None,
|
||||
function_call: str | dict[str, Any] | None = None,
|
||||
functions: list[dict[str, Any]] | None = None,
|
||||
|
|
|
@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|||
|
||||
[project]
|
||||
name = "llama_stack"
|
||||
version = "0.2.4"
|
||||
version = "0.2.5"
|
||||
authors = [{ name = "Meta Llama", email = "llama-oss@meta.com" }]
|
||||
description = "Llama Stack"
|
||||
readme = "README.md"
|
||||
|
@ -27,7 +27,7 @@ dependencies = [
|
|||
"huggingface-hub",
|
||||
"jinja2>=3.1.6",
|
||||
"jsonschema",
|
||||
"llama-stack-client>=0.2.4",
|
||||
"llama-stack-client>=0.2.5",
|
||||
"openai>=1.66",
|
||||
"prompt-toolkit",
|
||||
"python-dotenv",
|
||||
|
@ -105,7 +105,7 @@ codegen = ["rich", "pydantic", "jinja2>=3.1.6"]
|
|||
ui = [
|
||||
"streamlit",
|
||||
"pandas",
|
||||
"llama-stack-client>=0.2.4",
|
||||
"llama-stack-client>=0.2.5",
|
||||
"streamlit-option-menu",
|
||||
]
|
||||
|
||||
|
|
|
@ -26,7 +26,7 @@ jiter==0.8.2
|
|||
jsonschema==4.23.0
|
||||
jsonschema-specifications==2024.10.1
|
||||
kubernetes==32.0.1
|
||||
llama-stack-client==0.2.4
|
||||
llama-stack-client==0.2.5
|
||||
lxml==5.3.1
|
||||
markdown-it-py==3.0.0
|
||||
markupsafe==3.0.2
|
||||
|
|
|
@ -548,6 +548,7 @@ def test_rag_agent_with_attachments(llama_stack_client, agent_config):
|
|||
assert "lora" in response.output_message.content.lower()
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Code interpreter is currently disabled in the Stack")
|
||||
def test_rag_and_code_agent(llama_stack_client, agent_config):
|
||||
if "llama-4" in agent_config["model"].lower():
|
||||
pytest.xfail("Not working for llama4")
|
||||
|
|
|
@ -7,9 +7,20 @@
|
|||
import pytest
|
||||
|
||||
from llama_stack.apis.common.content_types import TextContentItem
|
||||
from llama_stack.apis.inference.inference import CompletionMessage, UserMessage
|
||||
from llama_stack.apis.inference.inference import (
|
||||
CompletionMessage,
|
||||
OpenAIAssistantMessageParam,
|
||||
OpenAIChatCompletionContentPartTextParam,
|
||||
OpenAISystemMessageParam,
|
||||
OpenAIUserMessageParam,
|
||||
SystemMessage,
|
||||
UserMessage,
|
||||
)
|
||||
from llama_stack.models.llama.datatypes import BuiltinTool, StopReason, ToolCall
|
||||
from llama_stack.providers.utils.inference.openai_compat import convert_message_to_openai_dict
|
||||
from llama_stack.providers.utils.inference.openai_compat import (
|
||||
convert_message_to_openai_dict,
|
||||
openai_messages_to_messages,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
@ -67,3 +78,39 @@ async def test_convert_message_to_openai_dict_with_builtin_tool_call():
|
|||
{"id": "123", "type": "function", "function": {"name": "brave_search", "arguments": '{"foo": "bar"}'}}
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_openai_messages_to_messages_with_content_str():
    """String content is carried over unchanged for system, user and assistant messages."""
    converted = openai_messages_to_messages(
        [
            OpenAISystemMessageParam(content="system message"),
            OpenAIUserMessageParam(content="user message"),
            OpenAIAssistantMessageParam(content="assistant message"),
        ]
    )

    # One (expected type, expected content) pair per input message, in order.
    expectations = [
        (SystemMessage, "system message"),
        (UserMessage, "user message"),
        (CompletionMessage, "assistant message"),
    ]
    assert len(converted) == 3
    for result, (message_type, _) in zip(converted, expectations):
        assert isinstance(result, message_type)
    for result, (_, text) in zip(converted, expectations):
        assert result.content == text
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_openai_messages_to_messages_with_content_list():
    """List-of-text-part content keeps its text in the first item for every role."""

    def text_parts(text):
        # Wrap a string as a one-element list of OpenAI text content parts.
        return [OpenAIChatCompletionContentPartTextParam(text=text)]

    converted = openai_messages_to_messages(
        [
            OpenAISystemMessageParam(content=text_parts("system message")),
            OpenAIUserMessageParam(content=text_parts("user message")),
            OpenAIAssistantMessageParam(content=text_parts("assistant message")),
        ]
    )

    # One (expected type, expected first-item text) pair per input message, in order.
    expectations = [
        (SystemMessage, "system message"),
        (UserMessage, "user message"),
        (CompletionMessage, "assistant message"),
    ]
    assert len(converted) == 3
    for result, (message_type, _) in zip(converted, expectations):
        assert isinstance(result, message_type)
    for result, (_, text) in zip(converted, expectations):
        assert result.content[0].text == text
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue