mirror of https://github.com/meta-llama/llama-stack.git
synced 2025-08-02 16:54:42 +00:00

commit 57b5449e85: Merge branch 'main' into pr2088

12 changed files with 2341 additions and 2217 deletions
.github/workflows/providers-build.yml (vendored, 2 changes)
@@ -153,7 +153,7 @@ jobs:
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

       - name: Set up Python
-        uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0
+        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
         with:
           python-version: '3.10'

@@ -141,11 +141,18 @@ uv sync

 ## Coding Style

-* Comments should provide meaningful insights into the code. Avoid filler comments that simply describe the next step, as they create unnecessary clutter, same goes for docstrings.
-* Prefer comments to clarify surprising behavior and/or relationships between parts of the code rather than explain what the next line of code does.
-* Catching exceptions, prefer using a specific exception type rather than a broad catch-all like `Exception`.
+* Comments should provide meaningful insights into the code. Avoid filler comments that simply
+  describe the next step, as they create unnecessary clutter, same goes for docstrings.
+* Prefer comments to clarify surprising behavior and/or relationships between parts of the code
+  rather than explain what the next line of code does.
+* Catching exceptions, prefer using a specific exception type rather than a broad catch-all like
+  `Exception`.
 * Error messages should be prefixed with "Failed to ..."
-* 4 spaces for indentation rather than tabs
+* 4 spaces for indentation rather than tab
+* When using `# noqa` to suppress a style or linter warning, include a comment explaining the
+  justification for bypassing the check.
+* When using `# type: ignore` to suppress a mypy warning, include a comment explaining the
+  justification for bypassing the check.

 ## Common Tasks

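For illustration only (not part of the diff), a minimal Python sketch of the style these reflowed bullets describe: a specific exception type instead of a broad catch-all, an error message prefixed with "Failed to ...", and a `# noqa` suppression accompanied by a justification comment. The file name and helper below are hypothetical.

import json

# Justification for the `# noqa` below: `Any` is kept only as a backwards-compatible re-export.
from typing import Any  # noqa: F401


def load_config(path: str) -> dict:
    # Catch specific exception types rather than a broad `except Exception`.
    try:
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    except (OSError, json.JSONDecodeError) as e:
        # Error message prefixed with "Failed to ..." per the guideline.
        raise RuntimeError(f"Failed to load config from {path}") from e
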
docs/_static/css/my_theme.css (vendored, 6 changes)
@@ -27,3 +27,9 @@ pre {
   white-space: pre-wrap !important;
   word-break: break-all;
 }
+
+[data-theme="dark"] .mermaid {
+  background-color: #f4f4f6 !important;
+  border-radius: 6px;
+  padding: 0.5em;
+}

@@ -53,6 +53,7 @@ Here's a list of known external providers that you can use with Llama Stack:
 | Name | Description | API | Type | Repository |
 |------|-------------|-----|------|------------|
 | KubeFlow Training | Train models with KubeFlow | Post Training | Remote | [llama-stack-provider-kft](https://github.com/opendatahub-io/llama-stack-provider-kft) |
+| KubeFlow Pipelines | Train models with KubeFlow Pipelines | Post Training | Remote | [llama-stack-provider-kfp-trainer](https://github.com/opendatahub-io/llama-stack-provider-kfp-trainer) |
 | RamaLama | Inference models with RamaLama | Inference | Remote | [ramalama-stack](https://github.com/containers/ramalama-stack) |

 ### Remote Provider Specification

install.sh (103 changes)
@@ -16,61 +16,120 @@ WAIT_TIMEOUT=300

 log(){ printf "\e[1;32m%s\e[0m\n" "$*"; }
 die(){ printf "\e[1;31m❌ %s\e[0m\n" "$*" >&2; exit 1; }

+wait_for_service() {
+  local url="$1"
+  local pattern="$2"
+  local timeout="$3"
+  local name="$4"
+  local start ts
+  log "⏳ Waiting for ${name}…"
+  start=$(date +%s)
+  while true; do
+    if curl --retry 5 --retry-delay 1 --retry-max-time "$timeout" --retry-all-errors --silent --fail "$url" 2>/dev/null | grep -q "$pattern"; then
+      break
+    fi
+    ts=$(date +%s)
+    if (( ts - start >= timeout )); then
+      return 1
+    fi
+    printf '.'
+    sleep 1
+  done
+  return 0
+}
+
 if command -v docker &> /dev/null; then
   ENGINE="docker"
-  HOST_DNS="host.docker.internal"
 elif command -v podman &> /dev/null; then
   ENGINE="podman"
-  HOST_DNS="host.containers.internal"
 else
   die "Docker or Podman is required. Install Docker: https://docs.docker.com/get-docker/ or Podman: https://podman.io/getting-started/installation"
 fi

+# Explicitly set the platform for the host architecture
+HOST_ARCH="$(uname -m)"
+if [ "$HOST_ARCH" = "arm64" ]; then
+  if [ "$ENGINE" = "docker" ]; then
+    PLATFORM_OPTS=( --platform linux/amd64 )
+  else
+    PLATFORM_OPTS=( --os linux --arch amd64 )
+  fi
+else
+  PLATFORM_OPTS=()
+fi
+
+# macOS + Podman: ensure VM is running before we try to launch containers
+# If you need GPU passthrough under Podman on macOS, init the VM with libkrun:
+#   CONTAINERS_MACHINE_PROVIDER=libkrun podman machine init
+if [ "$ENGINE" = "podman" ] && [ "$(uname -s)" = "Darwin" ]; then
+  if ! podman info &>/dev/null; then
+    log "⌛️ Initializing Podman VM…"
+    podman machine init &>/dev/null || true
+    podman machine start &>/dev/null || true
+
+    log "⌛️ Waiting for Podman API…"
+    until podman info &>/dev/null; do
+      sleep 1
+    done
+    log "✅ Podman VM is up"
+  fi
+fi
+
 # Clean up any leftovers from earlier runs
 for name in ollama-server llama-stack; do
   ids=$($ENGINE ps -aq --filter "name=^${name}$")
   if [ -n "$ids" ]; then
-    log "⚠️ Found existing container(s) for '${name}', removing..."
-    $ENGINE rm -f "$ids"
+    log "⚠️ Found existing container(s) for '${name}', removing…"
+    $ENGINE rm -f "$ids" > /dev/null 2>&1
   fi
 done

+###############################################################################
+# 0. Create a shared network
+###############################################################################
+if ! $ENGINE network inspect llama-net >/dev/null 2>&1; then
+  log "🌐 Creating network…"
+  $ENGINE network create llama-net >/dev/null 2>&1
+fi
+
 ###############################################################################
 # 1. Ollama
 ###############################################################################
 log "🦙 Starting Ollama…"
-$ENGINE run -d --name ollama-server \
-  -p "${OLLAMA_PORT}:11434" \
+$ENGINE run -d "${PLATFORM_OPTS[@]}" --name ollama-server \
+  --network llama-net \
+  -p "${OLLAMA_PORT}:${OLLAMA_PORT}" \
   ollama/ollama > /dev/null 2>&1

-log "⏳ Waiting for Ollama daemon…"
-if ! timeout "$WAIT_TIMEOUT" bash -c \
-    "until curl -fsS http://localhost:${OLLAMA_PORT}/ 2>/dev/null | grep -q 'Ollama'; do sleep 1; done"; then
+if ! wait_for_service "http://localhost:${OLLAMA_PORT}/" "Ollama" "$WAIT_TIMEOUT" "Ollama daemon"; then
   log "❌ Ollama daemon did not become ready in ${WAIT_TIMEOUT}s; dumping container logs:"
-  $ENGINE logs ollama-server --tail=200
+  $ENGINE logs --tail 200 ollama-server
   die "Ollama startup failed"
 fi

-log "📦 Ensuring model is pulled: ${MODEL_ALIAS}..."
-$ENGINE exec ollama-server ollama pull "${MODEL_ALIAS}" > /dev/null 2>&1
+log "📦 Ensuring model is pulled: ${MODEL_ALIAS}…"
+if ! $ENGINE exec ollama-server ollama pull "${MODEL_ALIAS}" > /dev/null 2>&1; then
+  log "❌ Failed to pull model ${MODEL_ALIAS}; dumping container logs:"
+  $ENGINE logs --tail 200 ollama-server
+  die "Model pull failed"
+fi

 ###############################################################################
 # 2. Llama‑Stack
 ###############################################################################
-log "🦙📦 Starting Llama‑Stack…"
-$ENGINE run -d --name llama-stack \
+cmd=( run -d "${PLATFORM_OPTS[@]}" --name llama-stack \
+  --network llama-net \
   -p "${PORT}:${PORT}" \
-  --add-host="${HOST_DNS}:host-gateway" \
-  "${SERVER_IMAGE}" \
-  --port "${PORT}" \
+  "${SERVER_IMAGE}" --port "${PORT}" \
   --env INFERENCE_MODEL="${MODEL_ALIAS}" \
-  --env OLLAMA_URL="http://${HOST_DNS}:${OLLAMA_PORT}" > /dev/null 2>&1
+  --env OLLAMA_URL="http://ollama-server:${OLLAMA_PORT}" )
+
+log "🦙 Starting Llama‑Stack…"
+$ENGINE "${cmd[@]}" > /dev/null 2>&1

-log "⏳ Waiting for Llama-Stack API…"
-if ! timeout "$WAIT_TIMEOUT" bash -c \
-    "until curl -fsS http://localhost:${PORT}/v1/health 2>/dev/null | grep -q 'OK'; do sleep 1; done"; then
+if ! wait_for_service "http://127.0.0.1:${PORT}/v1/health" "OK" "$WAIT_TIMEOUT" "Llama-Stack API"; then
   log "❌ Llama-Stack did not become ready in ${WAIT_TIMEOUT}s; dumping container logs:"
-  $ENGINE logs llama-stack --tail=200
+  $ENGINE logs --tail 200 llama-stack
   die "Llama-Stack startup failed"
 fi

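The new `wait_for_service` helper folds the two previous `timeout`/`curl` loops into one place: poll a URL until its body contains an expected pattern, or give up after the timeout so the caller can dump container logs. A rough stdlib-only Python equivalent of that polling pattern, shown purely as a sketch for illustration and not part of the repository:

import time
import urllib.request


def wait_for_service(url: str, pattern: str, timeout: float) -> bool:
    # Poll `url` until the response body contains `pattern` or `timeout` seconds elapse.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if pattern in resp.read().decode("utf-8", errors="replace"):
                    return True
        except OSError:
            pass  # service not up yet; fall through, sleep, and retry
        time.sleep(1)
    return False
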
@@ -108,6 +108,7 @@ from llama_stack.apis.inference.inference import (
     OpenAIChatCompletion,
     OpenAICompletion,
     OpenAICompletionChoice,
+    OpenAIMessageParam,
     OpenAIResponseFormatParam,
     ToolConfig,
 )

@@ -987,7 +988,7 @@ def _convert_openai_sampling_params(


 def openai_messages_to_messages(
-    messages: list[OpenAIChatCompletionMessage],
+    messages: list[OpenAIMessageParam],
 ) -> list[Message]:
     """
     Convert a list of OpenAIChatCompletionMessage into a list of Message.

@@ -995,12 +996,12 @@ def openai_messages_to_messages(
     converted_messages = []
     for message in messages:
         if message.role == "system":
-            converted_message = SystemMessage(content=message.content)
+            converted_message = SystemMessage(content=openai_content_to_content(message.content))
         elif message.role == "user":
             converted_message = UserMessage(content=openai_content_to_content(message.content))
         elif message.role == "assistant":
             converted_message = CompletionMessage(
-                content=message.content,
+                content=openai_content_to_content(message.content),
                 tool_calls=_convert_openai_tool_calls(message.tool_calls),
                 stop_reason=StopReason.end_of_turn,
             )

@@ -1331,7 +1332,7 @@ class OpenAIChatCompletionToLlamaStackMixin:
     async def openai_chat_completion(
         self,
         model: str,
-        messages: list[OpenAIChatCompletionMessage],
+        messages: list[OpenAIMessageParam],
         frequency_penalty: float | None = None,
         function_call: str | dict[str, Any] | None = None,
         functions: list[dict[str, Any]] | None = None,

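The reason `SystemMessage` and `CompletionMessage` content now routes through `openai_content_to_content` is that OpenAI-style message params accept either a plain string or a list of typed content parts. The repo's actual helper is not shown in this diff; the standalone sketch below only approximates the shape of that normalization (in the real code the list case becomes content items with a `.text` attribute, as the new tests assert):

def normalize_openai_content(content):
    # Plain strings pass through unchanged.
    if isinstance(content, str):
        return content
    # Lists of parts such as {"type": "text", "text": "..."} keep only their text payloads.
    return [part["text"] for part in content if part.get("type") == "text"]


print(normalize_openai_content("hello"))                              # -> 'hello'
print(normalize_openai_content([{"type": "text", "text": "hello"}]))  # -> ['hello']
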
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "llama_stack"
-version = "0.2.4"
+version = "0.2.5"
 authors = [{ name = "Meta Llama", email = "llama-oss@meta.com" }]
 description = "Llama Stack"
 readme = "README.md"

@@ -27,7 +27,7 @@ dependencies = [
     "huggingface-hub",
     "jinja2>=3.1.6",
     "jsonschema",
-    "llama-stack-client>=0.2.4",
+    "llama-stack-client>=0.2.5",
     "openai>=1.66",
     "prompt-toolkit",
     "python-dotenv",

@@ -105,7 +105,7 @@ codegen = ["rich", "pydantic", "jinja2>=3.1.6"]
 ui = [
     "streamlit",
     "pandas",
-    "llama-stack-client>=0.2.4",
+    "llama-stack-client>=0.2.5",
     "streamlit-option-menu",
 ]

@@ -26,7 +26,7 @@ jiter==0.8.2
 jsonschema==4.23.0
 jsonschema-specifications==2024.10.1
 kubernetes==32.0.1
-llama-stack-client==0.2.4
+llama-stack-client==0.2.5
 lxml==5.3.1
 markdown-it-py==3.0.0
 markupsafe==3.0.2

@@ -548,6 +548,7 @@ def test_rag_agent_with_attachments(llama_stack_client, agent_config):
     assert "lora" in response.output_message.content.lower()


+@pytest.mark.skip(reason="Code interpreter is currently disabled in the Stack")
 def test_rag_and_code_agent(llama_stack_client, agent_config):
     if "llama-4" in agent_config["model"].lower():
         pytest.xfail("Not working for llama4")

@@ -7,9 +7,20 @@
 import pytest

 from llama_stack.apis.common.content_types import TextContentItem
-from llama_stack.apis.inference.inference import CompletionMessage, UserMessage
+from llama_stack.apis.inference.inference import (
+    CompletionMessage,
+    OpenAIAssistantMessageParam,
+    OpenAIChatCompletionContentPartTextParam,
+    OpenAISystemMessageParam,
+    OpenAIUserMessageParam,
+    SystemMessage,
+    UserMessage,
+)
 from llama_stack.models.llama.datatypes import BuiltinTool, StopReason, ToolCall
-from llama_stack.providers.utils.inference.openai_compat import convert_message_to_openai_dict
+from llama_stack.providers.utils.inference.openai_compat import (
+    convert_message_to_openai_dict,
+    openai_messages_to_messages,
+)


 @pytest.mark.asyncio

@@ -67,3 +78,39 @@ async def test_convert_message_to_openai_dict_with_builtin_tool_call():
             {"id": "123", "type": "function", "function": {"name": "brave_search", "arguments": '{"foo": "bar"}'}}
         ],
     }
+
+
+@pytest.mark.asyncio
+async def test_openai_messages_to_messages_with_content_str():
+    openai_messages = [
+        OpenAISystemMessageParam(content="system message"),
+        OpenAIUserMessageParam(content="user message"),
+        OpenAIAssistantMessageParam(content="assistant message"),
+    ]
+
+    llama_messages = openai_messages_to_messages(openai_messages)
+    assert len(llama_messages) == 3
+    assert isinstance(llama_messages[0], SystemMessage)
+    assert isinstance(llama_messages[1], UserMessage)
+    assert isinstance(llama_messages[2], CompletionMessage)
+    assert llama_messages[0].content == "system message"
+    assert llama_messages[1].content == "user message"
+    assert llama_messages[2].content == "assistant message"
+
+
+@pytest.mark.asyncio
+async def test_openai_messages_to_messages_with_content_list():
+    openai_messages = [
+        OpenAISystemMessageParam(content=[OpenAIChatCompletionContentPartTextParam(text="system message")]),
+        OpenAIUserMessageParam(content=[OpenAIChatCompletionContentPartTextParam(text="user message")]),
+        OpenAIAssistantMessageParam(content=[OpenAIChatCompletionContentPartTextParam(text="assistant message")]),
+    ]
+
+    llama_messages = openai_messages_to_messages(openai_messages)
+    assert len(llama_messages) == 3
+    assert isinstance(llama_messages[0], SystemMessage)
+    assert isinstance(llama_messages[1], UserMessage)
+    assert isinstance(llama_messages[2], CompletionMessage)
+    assert llama_messages[0].content[0].text == "system message"
+    assert llama_messages[1].content[0].text == "user message"
+    assert llama_messages[2].content[0].text == "assistant message"