Merge branch 'main' into fix/divide-by-zero-exception-faiss-query-vector

Ibrahim Haroon 2025-06-06 11:29:14 -04:00 committed by GitHub
commit b05a3db358
19 changed files with 254 additions and 389 deletions

View file

@ -1,26 +1,9 @@
name: Setup Ollama
description: Start Ollama and cache model
inputs:
models:
description: Comma-separated list of models to pull
default: "llama3.2:3b-instruct-fp16,all-minilm:latest"
description: Start Ollama
runs:
using: "composite"
steps:
- name: Install and start Ollama
- name: Start Ollama
shell: bash
run: |
# the ollama installer also starts the ollama service
curl -fsSL https://ollama.com/install.sh | sh
# Do NOT cache models - pulling the cache is actually slower than just pulling the model.
# It takes ~45 seconds to pull the models from the cache and unpack them, but only 30 seconds to
# pull them directly.
# Maybe this is because the cache is being pulled at the same time by all the matrix jobs?
- name: Pull requested models
if: inputs.models != ''
shell: bash
run: |
for model in $(echo "${{ inputs.models }}" | tr ',' ' '); do
ollama pull "$model"
done
docker run -d --name ollama -p 11434:11434 docker.io/leseb/ollama-with-models

View file

@ -1,12 +1,17 @@
name: Setup runner
description: Prepare a runner for the tests (install uv, python, project dependencies, etc.)
inputs:
python-version:
description: The Python version to use
required: false
default: "3.10"
runs:
using: "composite"
steps:
- name: Install uv
uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
with:
python-version: "3.10"
python-version: ${{ inputs.python-version }}
activate-environment: true
version: 0.7.6

View file

@ -26,6 +26,7 @@ jobs:
# TODO: generate matrix list from tests/integration when fixed
test-type: [agents, inference, datasets, inspect, scoring, post_training, providers, tool_runtime]
client-type: [library, http]
python-version: ["3.10", "3.11", "3.12"]
fail-fast: false # we want to run all tests regardless of failure
steps:
@ -34,20 +35,22 @@ jobs:
- name: Install dependencies
uses: ./.github/actions/setup-runner
with:
python-version: ${{ matrix.python-version }}
- name: Setup ollama
uses: ./.github/actions/setup-ollama
- name: Build Llama Stack
run: |
llama stack build --template ollama --image-type venv
uv run llama stack build --template ollama --image-type venv
- name: Start Llama Stack server in background
if: matrix.client-type == 'http'
env:
INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
run: |
LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv &
LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv --env OLLAMA_URL="http://0.0.0.0:11434" &
- name: Wait for Llama Stack server to be ready
if: matrix.client-type == 'http'
@ -84,6 +87,7 @@ jobs:
- name: Run Integration Tests
env:
INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
OLLAMA_URL: "http://0.0.0.0:11434"
run: |
if [ "${{ matrix.client-type }}" == "library" ]; then
stack_config="ollama"
@ -104,13 +108,13 @@ jobs:
- name: Write ollama logs to file
if: ${{ always() }}
run: |
sudo journalctl -u ollama.service > ollama.log
sudo docker logs ollama > ollama.log
- name: Upload all logs to artifacts
if: ${{ always() }}
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}
name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}-${{ matrix.python-version }}
path: |
*.log
retention-days: 1
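
Since Ollama now runs as a plain container on port 11434 rather than a systemd service, the "wait for ready" step amounts to polling its HTTP API. Below is a minimal sketch of that idea, assuming Ollama's standard /api/tags endpoint and a 120-second budget (neither is specified in this diff):

import time
import urllib.error
import urllib.request

OLLAMA_URL = "http://0.0.0.0:11434"

def wait_for_ollama(timeout: float = 120.0) -> None:
    """Poll the Ollama HTTP API until the container answers or the timeout expires."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(f"{OLLAMA_URL}/api/tags", timeout=5) as resp:
                if resp.status == 200:
                    return
        except (urllib.error.URLError, OSError):
            pass
        time.sleep(2)
    raise TimeoutError(f"Ollama at {OLLAMA_URL} did not become ready within {timeout}s")

wait_for_ollama()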

View file

@ -43,23 +43,12 @@ def get_provider_dependencies(
config: BuildConfig | DistributionTemplate,
) -> tuple[list[str], list[str]]:
"""Get normal and special dependencies from provider configuration."""
# Extract providers based on config type
if isinstance(config, DistributionTemplate):
providers = config.providers
config = config.build_config()
providers = config.distribution_spec.providers
additional_pip_packages = config.additional_pip_packages
# TODO: This is a hack to get the dependencies for internal APIs into build
# We should have a better way to do this by formalizing the concept of "internal" APIs
# and providers, with a way to specify dependencies for them.
run_configs = config.run_configs
additional_pip_packages: list[str] = []
if run_configs:
for run_config in run_configs.values():
run_config_ = run_config.run_config(name="", providers={}, container_image=None)
if run_config_.inference_store:
additional_pip_packages.extend(run_config_.inference_store.pip_packages)
elif isinstance(config, BuildConfig):
providers = config.distribution_spec.providers
additional_pip_packages = config.additional_pip_packages
deps = []
registry = get_provider_registry(config)
for api_str, provider_or_providers in providers.items():
@ -87,8 +76,7 @@ def get_provider_dependencies(
else:
normal_deps.append(package)
if additional_pip_packages:
normal_deps.extend(additional_pip_packages)
normal_deps.extend(additional_pip_packages or [])
return list(set(normal_deps)), list(set(special_deps))

View file

@ -149,12 +149,13 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
logger.info(f"Removed handler {handler.__class__.__name__} from root logger")
def request(self, *args, **kwargs):
# NOTE: We are using AsyncLlamaStackClient under the hood
# A new event loop is needed to convert the AsyncStream
# from async client into SyncStream return type for streaming
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
if kwargs.get("stream"):
# NOTE: We are using AsyncLlamaStackClient under the hood
# A new event loop is needed to convert the AsyncStream
# from async client into SyncStream return type for streaming
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
def sync_generator():
try:
@ -172,7 +173,14 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
return sync_generator()
else:
return asyncio.run(self.async_client.request(*args, **kwargs))
try:
result = loop.run_until_complete(self.async_client.request(*args, **kwargs))
finally:
pending = asyncio.all_tasks(loop)
if pending:
loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True))
loop.close()
return result
class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
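
The non-streaming branch above runs the async client on a private event loop and, before closing that loop, awaits any tasks the request left behind so they are not destroyed while still pending. The same pattern in isolation, as a rough sketch with a stand-in coroutine instead of the real AsyncLlamaStackClient:

import asyncio

async def fake_request() -> str:
    # stand-in for self.async_client.request(*args, **kwargs)
    asyncio.get_running_loop().create_task(asyncio.sleep(0.05))  # background task left pending
    await asyncio.sleep(0.01)
    return "ok"

def sync_request() -> str:
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        result = loop.run_until_complete(fake_request())
    finally:
        # drain leftover tasks (gather swallows their exceptions) before tearing the loop down
        pending = asyncio.all_tasks(loop)
        if pending:
            loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True))
        loop.close()
    return result

print(sync_request())  # prints "ok" without "Task was destroyed but it is pending!" warnings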

View file

@ -8,7 +8,7 @@ import json
import time
import uuid
from collections.abc import AsyncIterator
from typing import Any, cast
from typing import Any
from openai.types.chat import ChatCompletionToolParam
from pydantic import BaseModel
@ -200,7 +200,6 @@ class ChatCompletionContext(BaseModel):
messages: list[OpenAIMessageParam]
tools: list[ChatCompletionToolParam] | None = None
mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP]
stream: bool
temperature: float | None
response_format: OpenAIResponseFormatParam
@ -281,49 +280,6 @@ class OpenAIResponsesImpl:
"""
return await self.responses_store.list_response_input_items(response_id, after, before, include, limit, order)
def _is_function_tool_call(
self,
tool_call: OpenAIChatCompletionToolCall,
tools: list[OpenAIResponseInputTool],
) -> bool:
if not tool_call.function:
return False
for t in tools:
if t.type == "function" and t.name == tool_call.function.name:
return True
return False
async def _process_response_choices(
self,
chat_response: OpenAIChatCompletion,
ctx: ChatCompletionContext,
tools: list[OpenAIResponseInputTool] | None,
) -> list[OpenAIResponseOutput]:
"""Handle tool execution and response message creation."""
output_messages: list[OpenAIResponseOutput] = []
# Execute tool calls if any
for choice in chat_response.choices:
if choice.message.tool_calls and tools:
# Assume if the first tool is a function, all tools are functions
if self._is_function_tool_call(choice.message.tool_calls[0], tools):
for tool_call in choice.message.tool_calls:
output_messages.append(
OpenAIResponseOutputMessageFunctionToolCall(
arguments=tool_call.function.arguments or "",
call_id=tool_call.id,
name=tool_call.function.name or "",
id=f"fc_{uuid.uuid4()}",
status="completed",
)
)
else:
tool_messages = await self._execute_tool_and_return_final_output(choice, ctx)
output_messages.extend(tool_messages)
else:
output_messages.append(await _convert_chat_choice_to_response_message(choice))
return output_messages
async def _store_response(
self,
response: OpenAIResponseObject,
@ -370,9 +326,48 @@ class OpenAIResponsesImpl:
tools: list[OpenAIResponseInputTool] | None = None,
max_infer_iters: int | None = 10,
):
stream = False if stream is None else stream
stream = bool(stream)
text = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")) if text is None else text
stream_gen = self._create_streaming_response(
input=input,
model=model,
instructions=instructions,
previous_response_id=previous_response_id,
store=store,
temperature=temperature,
text=text,
tools=tools,
max_infer_iters=max_infer_iters,
)
if stream:
return stream_gen
else:
response = None
async for stream_chunk in stream_gen:
if stream_chunk.type == "response.completed":
if response is not None:
raise ValueError("The response stream completed multiple times! Earlier response: {response}")
response = stream_chunk.response
# don't leave the generator half complete!
if response is None:
raise ValueError("The response stream never completed")
return response
async def _create_streaming_response(
self,
input: str | list[OpenAIResponseInput],
model: str,
instructions: str | None = None,
previous_response_id: str | None = None,
store: bool | None = True,
temperature: float | None = None,
text: OpenAIResponseText | None = None,
tools: list[OpenAIResponseInputTool] | None = None,
max_infer_iters: int | None = 10,
) -> AsyncIterator[OpenAIResponseObjectStream]:
output_messages: list[OpenAIResponseOutput] = []
# Input preprocessing
@ -383,7 +378,7 @@ class OpenAIResponsesImpl:
# Structured outputs
response_format = await _convert_response_text_to_chat_response_format(text)
# Tool setup
# Tool setup, TODO: refactor this slightly since this can also yield events
chat_tools, mcp_tool_to_server, mcp_list_message = (
await self._convert_response_tools_to_chat_tools(tools) if tools else (None, {}, None)
)
@ -395,136 +390,10 @@ class OpenAIResponsesImpl:
messages=messages,
tools=chat_tools,
mcp_tool_to_server=mcp_tool_to_server,
stream=stream,
temperature=temperature,
response_format=response_format,
)
# Fork to streaming vs non-streaming - let each handle ALL inference rounds
if stream:
return self._create_streaming_response(
ctx=ctx,
output_messages=output_messages,
input=input,
model=model,
store=store,
text=text,
tools=tools,
max_infer_iters=max_infer_iters,
)
else:
return await self._create_non_streaming_response(
ctx=ctx,
output_messages=output_messages,
input=input,
model=model,
store=store,
text=text,
tools=tools,
max_infer_iters=max_infer_iters,
)
async def _create_non_streaming_response(
self,
ctx: ChatCompletionContext,
output_messages: list[OpenAIResponseOutput],
input: str | list[OpenAIResponseInput],
model: str,
store: bool | None,
text: OpenAIResponseText,
tools: list[OpenAIResponseInputTool] | None,
max_infer_iters: int,
) -> OpenAIResponseObject:
n_iter = 0
messages = ctx.messages.copy()
while True:
# Do inference (including the first one)
inference_result = await self.inference_api.openai_chat_completion(
model=ctx.model,
messages=messages,
tools=ctx.tools,
stream=False,
temperature=ctx.temperature,
response_format=ctx.response_format,
)
completion = OpenAIChatCompletion(**inference_result.model_dump())
# Separate function vs non-function tool calls
function_tool_calls = []
non_function_tool_calls = []
for choice in completion.choices:
if choice.message.tool_calls and tools:
for tool_call in choice.message.tool_calls:
if self._is_function_tool_call(tool_call, tools):
function_tool_calls.append(tool_call)
else:
non_function_tool_calls.append(tool_call)
# Process response choices based on tool call types
if function_tool_calls:
# For function tool calls, use existing logic and return immediately
current_output_messages = await self._process_response_choices(
chat_response=completion,
ctx=ctx,
tools=tools,
)
output_messages.extend(current_output_messages)
break
elif non_function_tool_calls:
# For non-function tool calls, execute them and continue loop
for choice in completion.choices:
tool_outputs, tool_response_messages = await self._execute_tool_calls_only(choice, ctx)
output_messages.extend(tool_outputs)
# Add assistant message and tool responses to messages for next iteration
messages.append(choice.message)
messages.extend(tool_response_messages)
n_iter += 1
if n_iter >= max_infer_iters:
break
# Continue with next iteration of the loop
continue
else:
# No tool calls - convert response to message and we're done
for choice in completion.choices:
output_messages.append(await _convert_chat_choice_to_response_message(choice))
break
response = OpenAIResponseObject(
created_at=completion.created,
id=f"resp-{uuid.uuid4()}",
model=model,
object="response",
status="completed",
output=output_messages,
text=text,
)
logger.debug(f"OpenAI Responses response: {response}")
# Store response if requested
if store:
await self._store_response(
response=response,
input=input,
)
return response
async def _create_streaming_response(
self,
ctx: ChatCompletionContext,
output_messages: list[OpenAIResponseOutput],
input: str | list[OpenAIResponseInput],
model: str,
store: bool | None,
text: OpenAIResponseText,
tools: list[OpenAIResponseInputTool] | None,
max_infer_iters: int | None,
) -> AsyncIterator[OpenAIResponseObjectStream]:
# Create initial response and emit response.created immediately
response_id = f"resp-{uuid.uuid4()}"
created_at = int(time.time())
@ -539,15 +408,13 @@ class OpenAIResponsesImpl:
text=text,
)
# Emit response.created immediately
yield OpenAIResponseObjectStreamResponseCreated(response=initial_response)
# Implement tool execution loop for streaming - handle ALL inference rounds including the first
n_iter = 0
messages = ctx.messages.copy()
while True:
current_inference_result = await self.inference_api.openai_chat_completion(
completion_result = await self.inference_api.openai_chat_completion(
model=ctx.model,
messages=messages,
tools=ctx.tools,
@ -568,7 +435,7 @@ class OpenAIResponsesImpl:
# Create a placeholder message item for delta events
message_item_id = f"msg_{uuid.uuid4()}"
async for chunk in current_inference_result:
async for chunk in completion_result:
chat_response_id = chunk.id
chunk_created = chunk.created
chunk_model = chunk.model
@ -628,50 +495,55 @@ class OpenAIResponsesImpl:
model=chunk_model,
)
# Separate function vs non-function tool calls
function_tool_calls = []
non_function_tool_calls = []
next_turn_messages = messages.copy()
for choice in current_response.choices:
next_turn_messages.append(choice.message)
if choice.message.tool_calls and tools:
for tool_call in choice.message.tool_calls:
if self._is_function_tool_call(tool_call, tools):
if _is_function_tool_call(tool_call, tools):
function_tool_calls.append(tool_call)
else:
non_function_tool_calls.append(tool_call)
# Process response choices based on tool call types
if function_tool_calls:
# For function tool calls, use existing logic and break
current_output_messages = await self._process_response_choices(
chat_response=current_response,
ctx=ctx,
tools=tools,
)
output_messages.extend(current_output_messages)
break
elif non_function_tool_calls:
# For non-function tool calls, execute them and continue loop
for choice in current_response.choices:
tool_outputs, tool_response_messages = await self._execute_tool_calls_only(choice, ctx)
output_messages.extend(tool_outputs)
# Add assistant message and tool responses to messages for next iteration
messages.append(choice.message)
messages.extend(tool_response_messages)
n_iter += 1
if n_iter >= (max_infer_iters or 10):
break
# Continue with next iteration of the loop
continue
else:
# No tool calls - convert response to message and we're done
for choice in current_response.choices:
else:
output_messages.append(await _convert_chat_choice_to_response_message(choice))
# execute non-function tool calls
for tool_call in non_function_tool_calls:
tool_call_log, tool_response_message = await self._execute_tool_call(tool_call, ctx)
if tool_call_log:
output_messages.append(tool_call_log)
if tool_response_message:
next_turn_messages.append(tool_response_message)
for tool_call in function_tool_calls:
output_messages.append(
OpenAIResponseOutputMessageFunctionToolCall(
arguments=tool_call.function.arguments or "",
call_id=tool_call.id,
name=tool_call.function.name or "",
id=f"fc_{uuid.uuid4()}",
status="completed",
)
)
if not function_tool_calls and not non_function_tool_calls:
break
if function_tool_calls:
logger.info("Exiting inference loop since there is a function (client-side) tool call")
break
n_iter += 1
if n_iter >= max_infer_iters:
logger.info(f"Exiting inference loop since iteration count({n_iter}) exceeds {max_infer_iters=}")
break
messages = next_turn_messages
# Create final response
final_response = OpenAIResponseObject(
created_at=created_at,
@ -683,15 +555,15 @@ class OpenAIResponsesImpl:
output=output_messages,
)
# Emit response.completed
yield OpenAIResponseObjectStreamResponseCompleted(response=final_response)
if store:
await self._store_response(
response=final_response,
input=input,
)
# Emit response.completed
yield OpenAIResponseObjectStreamResponseCompleted(response=final_response)
async def _convert_response_tools_to_chat_tools(
self, tools: list[OpenAIResponseInputTool]
) -> tuple[
@ -784,73 +656,6 @@ class OpenAIResponsesImpl:
raise ValueError(f"Llama Stack OpenAI Responses does not yet support tool type: {input_tool.type}")
return chat_tools, mcp_tool_to_server, mcp_list_message
async def _execute_tool_calls_only(
self,
choice: OpenAIChoice,
ctx: ChatCompletionContext,
) -> tuple[list[OpenAIResponseOutput], list[OpenAIMessageParam]]:
"""Execute tool calls and return output messages and tool response messages for next inference."""
output_messages: list[OpenAIResponseOutput] = []
tool_response_messages: list[OpenAIMessageParam] = []
if not isinstance(choice.message, OpenAIAssistantMessageParam):
return output_messages, tool_response_messages
if not choice.message.tool_calls:
return output_messages, tool_response_messages
for tool_call in choice.message.tool_calls:
tool_call_log, further_input = await self._execute_tool_call(tool_call, ctx)
if tool_call_log:
output_messages.append(tool_call_log)
if further_input:
tool_response_messages.append(further_input)
return output_messages, tool_response_messages
async def _execute_tool_and_return_final_output(
self,
choice: OpenAIChoice,
ctx: ChatCompletionContext,
) -> list[OpenAIResponseOutput]:
output_messages: list[OpenAIResponseOutput] = []
if not isinstance(choice.message, OpenAIAssistantMessageParam):
return output_messages
if not choice.message.tool_calls:
return output_messages
next_turn_messages = ctx.messages.copy()
# Add the assistant message with tool_calls response to the messages list
next_turn_messages.append(choice.message)
for tool_call in choice.message.tool_calls:
# TODO: telemetry spans for tool calls
tool_call_log, further_input = await self._execute_tool_call(tool_call, ctx)
if tool_call_log:
output_messages.append(tool_call_log)
if further_input:
next_turn_messages.append(further_input)
tool_results_chat_response = await self.inference_api.openai_chat_completion(
model=ctx.model,
messages=next_turn_messages,
stream=ctx.stream,
temperature=ctx.temperature,
)
# type cast to appease mypy: this is needed because we don't handle streaming properly :)
tool_results_chat_response = cast(OpenAIChatCompletion, tool_results_chat_response)
# Huge TODO: these are NOT the final outputs, we must keep the loop going
tool_final_outputs = [
await _convert_chat_choice_to_response_message(choice) for choice in tool_results_chat_response.choices
]
# TODO: Wire in annotations with URLs, titles, etc to these output messages
output_messages.extend(tool_final_outputs)
return output_messages
async def _execute_tool_call(
self,
tool_call: OpenAIChatCompletionToolCall,
@ -939,3 +744,15 @@ class OpenAIResponsesImpl:
input_message = OpenAIToolMessageParam(content=text, tool_call_id=tool_call_id)
return message, input_message
def _is_function_tool_call(
tool_call: OpenAIChatCompletionToolCall,
tools: list[OpenAIResponseInputTool],
) -> bool:
if not tool_call.function:
return False
for t in tools:
if t.type == "function" and t.name == tool_call.function.name:
return True
return False
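
After this refactor there is a single streaming code path; the non-streaming case simply drains the stream and keeps the response carried by the final response.completed event (hence the "don't leave the generator half complete" comment). Reduced to its skeleton, with stand-in event objects instead of the real OpenAIResponseObjectStream types:

import asyncio
from dataclasses import dataclass

@dataclass
class Event:
    type: str
    response: str | None = None

async def stream_events():
    # stand-in for _create_streaming_response(...)
    yield Event(type="response.created")
    yield Event(type="response.completed", response="final response object")

async def non_streaming_result() -> str:
    response = None
    # drain the whole generator; only the response.completed chunk carries the result
    async for chunk in stream_events():
        if chunk.type == "response.completed":
            response = chunk.response
    if response is None:
        raise ValueError("The response stream never completed")
    return response

print(asyncio.run(non_streaming_result()))  # "final response object"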

View file

@ -24,7 +24,7 @@ def available_providers() -> list[ProviderSpec]:
"pandas",
"scikit-learn",
]
+ kvstore_dependencies(),
+ kvstore_dependencies(), # TODO make this dynamic based on the kvstore config
module="llama_stack.providers.inline.agents.meta_reference",
config_class="llama_stack.providers.inline.agents.meta_reference.MetaReferenceAgentsImplConfig",
api_dependencies=[

View file

@ -345,21 +345,27 @@ class OllamaInferenceAdapter(
model = await self.register_helper.register_model(model)
except ValueError:
pass # Ignore statically unknown model, will check live listing
if model.provider_resource_id is None:
raise ValueError("Model provider_resource_id cannot be None")
if model.model_type == ModelType.embedding:
logger.info(f"Pulling embedding model `{model.provider_resource_id}` if necessary...")
await self.client.pull(model.provider_resource_id)
# TODO: you should pull here only if the model is not found in a list
response = await self.client.list()
if model.provider_resource_id not in [m.model for m in response.models]:
await self.client.pull(model.provider_resource_id)
# we use list() here instead of ps():
# - ps() only lists running models, not available ones
# - models that are not currently running are started by the ollama server as needed
response = await self.client.list()
available_models = [m["model"] for m in response["models"]]
if model.provider_resource_id is None:
raise ValueError("Model provider_resource_id cannot be None")
available_models = [m.model for m in response.models]
provider_resource_id = self.register_helper.get_provider_model_id(model.provider_resource_id)
if provider_resource_id is None:
provider_resource_id = model.provider_resource_id
if provider_resource_id not in available_models:
available_models_latest = [m["model"].split(":latest")[0] for m in response["models"]]
available_models_latest = [m.model.split(":latest")[0] for m in response.models]
if provider_resource_id in available_models_latest:
logger.warning(
f"Imprecise provider resource id was used but 'latest' is available in Ollama - using '{model.provider_resource_id}:latest'"

View file

@ -36,6 +36,10 @@ class RedisKVStoreConfig(CommonConfig):
def url(self) -> str:
return f"redis://{self.host}:{self.port}"
@property
def pip_packages(self) -> list[str]:
return ["redis"]
@classmethod
def sample_run_config(cls):
return {
@ -53,6 +57,10 @@ class SqliteKVStoreConfig(CommonConfig):
description="File path for the sqlite database",
)
@property
def pip_packages(self) -> list[str]:
return ["aiosqlite"]
@classmethod
def sample_run_config(cls, __distro_dir__: str, db_name: str = "kvstore.db"):
return {
@ -100,6 +108,10 @@ class PostgresKVStoreConfig(CommonConfig):
raise ValueError("Table name must be less than 63 characters")
return v
@property
def pip_packages(self) -> list[str]:
return ["psycopg2-binary"]
class MongoDBKVStoreConfig(CommonConfig):
type: Literal[KVStoreType.mongodb.value] = KVStoreType.mongodb.value
@ -110,6 +122,10 @@ class MongoDBKVStoreConfig(CommonConfig):
password: str | None = None
collection_name: str = "llamastack_kvstore"
@property
def pip_packages(self) -> list[str]:
return ["pymongo"]
@classmethod
def sample_run_config(cls, collection_name: str = "llamastack_kvstore"):
return {

View file

@ -10,6 +10,13 @@ from .config import KVStoreConfig, KVStoreType
def kvstore_dependencies():
"""
Returns all possible kvstore dependencies for registry/provider specifications.
NOTE: For specific kvstore implementations, use config.pip_packages instead.
This function returns the union of all dependencies for cases where the specific
kvstore type is not known at declaration time (e.g., provider registries).
"""
return ["aiosqlite", "psycopg2-binary", "redis", "pymongo"]

View file

@ -21,4 +21,5 @@ distribution_spec:
image_type: conda
additional_pip_packages:
- asyncpg
- psycopg2-binary
- sqlalchemy[asyncio]

View file

@ -186,8 +186,14 @@ class DistributionTemplate(BaseModel):
additional_pip_packages: list[str] = []
for run_config in self.run_configs.values():
run_config_ = run_config.run_config(self.name, self.providers, self.container_image)
# TODO: This is a hack to get the dependencies for internal APIs into build
# We should have a better way to do this by formalizing the concept of "internal" APIs
# and providers, with a way to specify dependencies for them.
if run_config_.inference_store:
additional_pip_packages.extend(run_config_.inference_store.pip_packages)
if run_config_.metadata_store:
additional_pip_packages.extend(run_config_.metadata_store.pip_packages)
if self.additional_pip_packages:
additional_pip_packages.extend(self.additional_pip_packages)

View file

@ -19,7 +19,7 @@
"@radix-ui/react-tooltip": "^1.2.6",
"class-variance-authority": "^0.7.1",
"clsx": "^2.1.1",
"llama-stack-client": "0.2.9",
"llama-stack-client": "0.2.10",
"lucide-react": "^0.510.0",
"next": "15.3.2",
"next-themes": "^0.4.6",

View file

@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "llama_stack"
version = "0.2.9"
version = "0.2.10"
authors = [{ name = "Meta Llama", email = "llama-oss@meta.com" }]
description = "Llama Stack"
readme = "README.md"
@ -22,12 +22,13 @@ classifiers = [
]
dependencies = [
"aiohttp",
"fastapi>=0.115.0,<1.0",
"fire",
"httpx",
"huggingface-hub",
"jinja2>=3.1.6",
"jsonschema",
"llama-stack-client>=0.2.9",
"llama-stack-client>=0.2.10",
"openai>=1.66",
"prompt-toolkit",
"python-dotenv",
@ -48,7 +49,7 @@ dependencies = [
ui = [
"streamlit",
"pandas",
"llama-stack-client>=0.2.9",
"llama-stack-client>=0.2.10",
"streamlit-option-menu",
]
@ -67,7 +68,6 @@ dev = [
"types-setuptools",
"pre-commit",
"uvicorn",
"fastapi",
"ruamel.yaml", # needed for openapi generator
]
# These are the dependencies required for running unit tests.
@ -133,7 +133,8 @@ llama = "llama_stack.cli.llama:main"
install-wheel-from-presigned = "llama_stack.cli.scripts.run:install_wheel_from_presigned"
[tool.setuptools.packages.find]
include = ["llama_stack"]
where = ["."]
include = ["llama_stack", "llama_stack.*"]
[[tool.uv.index]]
name = "pytorch-cpu"

View file

@ -42,6 +42,8 @@ ecdsa==0.19.1
# via python-jose
exceptiongroup==1.2.2 ; python_full_version < '3.11'
# via anyio
fastapi==0.115.8
# via llama-stack
filelock==3.17.0
# via huggingface-hub
fire==0.7.0
@ -79,7 +81,7 @@ jsonschema==4.23.0
# via llama-stack
jsonschema-specifications==2024.10.1
# via jsonschema
llama-stack-client==0.2.9
llama-stack-client==0.2.10
# via llama-stack
markdown-it-py==3.0.0
# via rich
@ -117,6 +119,7 @@ pyasn1==0.4.8
# rsa
pydantic==2.10.6
# via
# fastapi
# llama-stack
# llama-stack-client
# openai
@ -171,7 +174,9 @@ sniffio==1.3.1
# llama-stack-client
# openai
starlette==0.45.3
# via llama-stack
# via
# fastapi
# llama-stack
termcolor==2.5.0
# via
# fire
@ -187,6 +192,7 @@ tqdm==4.67.1
typing-extensions==4.12.2
# via
# anyio
# fastapi
# huggingface-hub
# llama-stack-client
# multidict

View file

@ -15,11 +15,6 @@ from pathlib import Path
from rich.progress import Progress, SpinnerColumn, TextColumn
from llama_stack.distribution.build import (
SERVER_DEPENDENCIES,
get_provider_dependencies,
)
REPO_ROOT = Path(__file__).parent.parent
@ -90,23 +85,6 @@ def check_for_changes(change_tracker: ChangedPathTracker) -> bool:
return has_changes
def collect_template_dependencies(template_dir: Path) -> tuple[str | None, list[str]]:
try:
module_name = f"llama_stack.templates.{template_dir.name}"
module = importlib.import_module(module_name)
if template_func := getattr(module, "get_distribution_template", None):
template = template_func()
normal_deps, special_deps = get_provider_dependencies(template)
# Combine all dependencies in order: normal deps, special deps, server deps
all_deps = sorted(set(normal_deps + SERVER_DEPENDENCIES)) + sorted(set(special_deps))
return template.name, all_deps
except Exception:
return None, []
return None, []
def pre_import_templates(template_dirs: list[Path]) -> None:
# Pre-import all template modules to avoid deadlocks.
for template_dir in template_dirs:

tests/Containerfile (new file, 13 lines)
View file

@ -0,0 +1,13 @@
# Containerfile used to build our all-in-one Ollama image for running tests in CI
# podman build --platform linux/amd64 -f Containerfile -t ollama-with-models .
#
FROM --platform=linux/amd64 ollama/ollama:latest
# Start ollama and pull models in a single layer
RUN ollama serve & \
sleep 5 && \
ollama pull llama3.2:3b-instruct-fp16 && \
ollama pull all-minilm:latest
# Set the entrypoint to start ollama serve
ENTRYPOINT ["ollama", "serve"]

View file

@ -6,9 +6,15 @@
from io import BytesIO
import pytest
def test_openai_client_basic_operations(openai_client):
from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
def test_openai_client_basic_operations(openai_client, client_with_models):
"""Test basic file operations through OpenAI client."""
if isinstance(client_with_models, LlamaStackAsLibraryClient):
pytest.skip("OpenAI files are not supported when testing with library client yet.")
client = openai_client
test_content = b"files test content"

View file

@ -80,6 +80,37 @@ def openai_responses_impl(mock_inference_api, mock_tool_groups_api, mock_tool_ru
)
async def fake_stream(fixture: str = "simple_chat_completion.yaml"):
value = load_chat_completion_fixture(fixture)
yield ChatCompletionChunk(
id=value.id,
choices=[
Choice(
index=0,
delta=ChoiceDelta(
content=c.message.content,
role=c.message.role,
tool_calls=[
ChoiceDeltaToolCall(
index=0,
id=t.id,
function=ChoiceDeltaToolCallFunction(
name=t.function.name,
arguments=t.function.arguments,
),
)
for t in (c.message.tool_calls or [])
],
),
)
for c in value.choices
],
created=1,
model=value.model,
object="chat.completion.chunk",
)
@pytest.mark.asyncio
async def test_create_openai_response_with_string_input(openai_responses_impl, mock_inference_api):
"""Test creating an OpenAI response with a simple string input."""
@ -88,8 +119,7 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m
model = "meta-llama/Llama-3.1-8B-Instruct"
# Load the chat completion fixture
mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
mock_inference_api.openai_chat_completion.return_value = mock_chat_completion
mock_inference_api.openai_chat_completion.return_value = fake_stream()
# Execute
result = await openai_responses_impl.create_openai_response(
@ -104,7 +134,7 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m
messages=[OpenAIUserMessageParam(role="user", content="What is the capital of Ireland?", name=None)],
response_format=OpenAIResponseFormatText(),
tools=None,
stream=False,
stream=True,
temperature=0.1,
)
openai_responses_impl.responses_store.store_response_object.assert_called_once()
@ -121,20 +151,15 @@ async def test_create_openai_response_with_string_input_with_tools(openai_respon
input_text = "What is the capital of Ireland?"
model = "meta-llama/Llama-3.1-8B-Instruct"
# Load the chat completion fixtures
tool_call_completion = load_chat_completion_fixture("tool_call_completion.yaml")
tool_response_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
mock_inference_api.openai_chat_completion.side_effect = [
tool_call_completion,
tool_response_completion,
fake_stream("tool_call_completion.yaml"),
fake_stream(),
]
openai_responses_impl.tool_groups_api.get_tool.return_value = Tool(
identifier="web_search",
provider_id="client",
toolgroup_id="web_search",
tool_host="client",
description="Search the web for information",
parameters=[
ToolParameter(name="query", parameter_type="string", description="The query to search for", required=True)
@ -189,7 +214,7 @@ async def test_create_openai_response_with_tool_call_type_none(openai_responses_
input_text = "How hot it is in San Francisco today?"
model = "meta-llama/Llama-3.1-8B-Instruct"
async def fake_stream():
async def fake_stream_toolcall():
yield ChatCompletionChunk(
id="123",
choices=[
@ -212,7 +237,7 @@ async def test_create_openai_response_with_tool_call_type_none(openai_responses_
object="chat.completion.chunk",
)
mock_inference_api.openai_chat_completion.return_value = fake_stream()
mock_inference_api.openai_chat_completion.return_value = fake_stream_toolcall()
# Execute
result = await openai_responses_impl.create_openai_response(
@ -271,7 +296,7 @@ async def test_create_openai_response_with_multiple_messages(openai_responses_im
]
model = "meta-llama/Llama-3.1-8B-Instruct"
mock_inference_api.openai_chat_completion.return_value = load_chat_completion_fixture("simple_chat_completion.yaml")
mock_inference_api.openai_chat_completion.return_value = fake_stream()
# Execute
await openai_responses_impl.create_openai_response(
@ -399,9 +424,7 @@ async def test_create_openai_response_with_instructions(openai_responses_impl, m
model = "meta-llama/Llama-3.1-8B-Instruct"
instructions = "You are a geography expert. Provide concise answers."
# Load the chat completion fixture
mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
mock_inference_api.openai_chat_completion.return_value = mock_chat_completion
mock_inference_api.openai_chat_completion.return_value = fake_stream()
# Execute
await openai_responses_impl.create_openai_response(
@ -440,8 +463,7 @@ async def test_create_openai_response_with_instructions_and_multiple_messages(
model = "meta-llama/Llama-3.1-8B-Instruct"
instructions = "You are a geography expert. Provide concise answers."
mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
mock_inference_api.openai_chat_completion.return_value = mock_chat_completion
mock_inference_api.openai_chat_completion.return_value = fake_stream()
# Execute
await openai_responses_impl.create_openai_response(
@ -499,8 +521,8 @@ async def test_create_openai_response_with_instructions_and_previous_response(
model = "meta-llama/Llama-3.1-8B-Instruct"
instructions = "You are a geography expert. Provide concise answers."
mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
mock_inference_api.openai_chat_completion.return_value = mock_chat_completion
mock_inference_api.openai_chat_completion.return_value = fake_stream()
# Execute
await openai_responses_impl.create_openai_response(
@ -674,8 +696,8 @@ async def test_store_response_uses_rehydrated_input_with_previous_response(
current_input = "Now what is 3+3?"
model = "meta-llama/Llama-3.1-8B-Instruct"
mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
mock_inference_api.openai_chat_completion.return_value = mock_chat_completion
mock_inference_api.openai_chat_completion.return_value = fake_stream()
# Execute - Create response with previous_response_id
result = await openai_responses_impl.create_openai_response(
@ -732,9 +754,7 @@ async def test_create_openai_response_with_text_format(
input_text = "How hot it is in San Francisco today?"
model = "meta-llama/Llama-3.1-8B-Instruct"
# Load the chat completion fixture
mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
mock_inference_api.openai_chat_completion.return_value = mock_chat_completion
mock_inference_api.openai_chat_completion.return_value = fake_stream()
# Execute
_result = await openai_responses_impl.create_openai_response(