Merge branch 'main' into fix/divide-by-zero-exception-faiss-query-vector

Ibrahim Haroon 2025-06-06 11:29:14 -04:00 committed by GitHub
commit b05a3db358
19 changed files with 254 additions and 389 deletions

View file

@@ -1,26 +1,9 @@
 name: Setup Ollama
-description: Start Ollama and cache model
-inputs:
-  models:
-    description: Comma-separated list of models to pull
-    default: "llama3.2:3b-instruct-fp16,all-minilm:latest"
+description: Start Ollama
 runs:
   using: "composite"
   steps:
-    - name: Install and start Ollama
+    - name: Start Ollama
      shell: bash
      run: |
-        # the ollama installer also starts the ollama service
-        curl -fsSL https://ollama.com/install.sh | sh
-    # Do NOT cache models - pulling the cache is actually slower than just pulling the model.
-    # It takes ~45 seconds to pull the models from the cache and unpack it, but only 30 seconds to
-    # pull them directly.
-    # Maybe this is because the cache is being pulled at the same time by all the matrix jobs?
-    - name: Pull requested models
-      if: inputs.models != ''
-      shell: bash
-      run: |
-        for model in $(echo "${{ inputs.models }}" | tr ',' ' '); do
-          ollama pull "$model"
-        done
+        docker run -d --name ollama -p 11434:11434 docker.io/leseb/ollama-with-models

View file

@ -1,12 +1,17 @@
name: Setup runner name: Setup runner
description: Prepare a runner for the tests (install uv, python, project dependencies, etc.) description: Prepare a runner for the tests (install uv, python, project dependencies, etc.)
inputs:
python-version:
description: The Python version to use
required: false
default: "3.10"
runs: runs:
using: "composite" using: "composite"
steps: steps:
- name: Install uv - name: Install uv
uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1 uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
with: with:
python-version: "3.10" python-version: ${{ inputs.python-version }}
activate-environment: true activate-environment: true
version: 0.7.6 version: 0.7.6

View file

@@ -26,6 +26,7 @@ jobs:
         # TODO: generate matrix list from tests/integration when fixed
         test-type: [agents, inference, datasets, inspect, scoring, post_training, providers, tool_runtime]
         client-type: [library, http]
+        python-version: ["3.10", "3.11", "3.12"]
       fail-fast: false # we want to run all tests regardless of failure

     steps:
@@ -34,20 +35,22 @@ jobs:
       - name: Install dependencies
        uses: ./.github/actions/setup-runner
+        with:
+          python-version: ${{ matrix.python-version }}

      - name: Setup ollama
        uses: ./.github/actions/setup-ollama

      - name: Build Llama Stack
        run: |
-          llama stack build --template ollama --image-type venv
+          uv run llama stack build --template ollama --image-type venv

      - name: Start Llama Stack server in background
        if: matrix.client-type == 'http'
        env:
          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
        run: |
-          LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv &
+          LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv --env OLLAMA_URL="http://0.0.0.0:11434" &

      - name: Wait for Llama Stack server to be ready
        if: matrix.client-type == 'http'
@@ -84,6 +87,7 @@ jobs:
      - name: Run Integration Tests
        env:
          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
+          OLLAMA_URL: "http://0.0.0.0:11434"
        run: |
          if [ "${{ matrix.client-type }}" == "library" ]; then
            stack_config="ollama"
@@ -104,13 +108,13 @@ jobs:
      - name: Write ollama logs to file
        if: ${{ always() }}
        run: |
-          sudo journalctl -u ollama.service > ollama.log
+          sudo docker logs ollama > ollama.log

      - name: Upload all logs to artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
-          name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}
+          name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}-${{ matrix.python-version }}
        path: |
          *.log
        retention-days: 1
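
For illustration, a minimal readiness probe against the containerized Ollama could look like the following sketch. It is not part of the workflow; it assumes the OLLAMA_URL default used above and Ollama's /api/tags listing endpoint.

# Hypothetical readiness probe for the Ollama container started in the workflow above.
import os
import time

import httpx


def wait_for_ollama(timeout_s: float = 60.0) -> list[str]:
    base_url = os.environ.get("OLLAMA_URL", "http://0.0.0.0:11434")
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            resp = httpx.get(f"{base_url}/api/tags", timeout=5.0)
            resp.raise_for_status()
            # Each entry carries a "name" field such as "llama3.2:3b-instruct-fp16"
            return [m["name"] for m in resp.json().get("models", [])]
        except (httpx.HTTPError, ValueError):
            time.sleep(2.0)
    raise TimeoutError(f"Ollama at {base_url} did not become ready in {timeout_s}s")


if __name__ == "__main__":
    print(wait_for_ollama())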

View file

@@ -43,23 +43,12 @@ def get_provider_dependencies(
     config: BuildConfig | DistributionTemplate,
 ) -> tuple[list[str], list[str]]:
     """Get normal and special dependencies from provider configuration."""
-    # Extract providers based on config type
     if isinstance(config, DistributionTemplate):
-        providers = config.providers
-
-        # TODO: This is a hack to get the dependencies for internal APIs into build
-        # We should have a better way to do this by formalizing the concept of "internal" APIs
-        # and providers, with a way to specify dependencies for them.
-        run_configs = config.run_configs
-        additional_pip_packages: list[str] = []
-        if run_configs:
-            for run_config in run_configs.values():
-                run_config_ = run_config.run_config(name="", providers={}, container_image=None)
-                if run_config_.inference_store:
-                    additional_pip_packages.extend(run_config_.inference_store.pip_packages)
-    elif isinstance(config, BuildConfig):
-        providers = config.distribution_spec.providers
-        additional_pip_packages = config.additional_pip_packages
+        config = config.build_config()
+
+    providers = config.distribution_spec.providers
+    additional_pip_packages = config.additional_pip_packages

     deps = []
     registry = get_provider_registry(config)
     for api_str, provider_or_providers in providers.items():
@@ -87,8 +76,7 @@ def get_provider_dependencies(
             else:
                 normal_deps.append(package)

-    if additional_pip_packages:
-        normal_deps.extend(additional_pip_packages)
+    normal_deps.extend(additional_pip_packages or [])

     return list(set(normal_deps)), list(set(special_deps))
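
The refactor above collapses the two config shapes by converting a DistributionTemplate to its BuildConfig up front. A rough, self-contained sketch of that normalize-then-resolve pattern, with stand-in classes rather than the real llama_stack types:

from dataclasses import dataclass, field


@dataclass
class StubBuildConfig:
    providers: dict[str, list[str]]
    additional_pip_packages: list[str] = field(default_factory=list)


@dataclass
class StubTemplate:
    build: StubBuildConfig

    def build_config(self) -> StubBuildConfig:
        return self.build


def resolve_deps(config: StubBuildConfig | StubTemplate) -> list[str]:
    # Normalize the template form first so a single code path handles both shapes.
    if isinstance(config, StubTemplate):
        config = config.build_config()
    deps = [pkg for pkgs in config.providers.values() for pkg in pkgs]
    deps.extend(config.additional_pip_packages or [])
    return sorted(set(deps))


print(resolve_deps(StubTemplate(StubBuildConfig({"inference": ["ollama"]}, ["aiosqlite"]))))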

View file

@@ -149,13 +149,14 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
             logger.info(f"Removed handler {handler.__class__.__name__} from root logger")

     def request(self, *args, **kwargs):
-        if kwargs.get("stream"):
-            # NOTE: We are using AsyncLlamaStackClient under the hood
-            # A new event loop is needed to convert the AsyncStream
-            # from async client into SyncStream return type for streaming
-            loop = asyncio.new_event_loop()
-            asyncio.set_event_loop(loop)
+        # NOTE: We are using AsyncLlamaStackClient under the hood
+        # A new event loop is needed to convert the AsyncStream
+        # from async client into SyncStream return type for streaming
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)

+        if kwargs.get("stream"):

             def sync_generator():
                 try:
                     async_stream = loop.run_until_complete(self.async_client.request(*args, **kwargs))
@@ -172,7 +173,14 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
             return sync_generator()
         else:
-            return asyncio.run(self.async_client.request(*args, **kwargs))
+            try:
+                result = loop.run_until_complete(self.async_client.request(*args, **kwargs))
+            finally:
+                pending = asyncio.all_tasks(loop)
+                if pending:
+                    loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True))
+                loop.close()
+            return result


 class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
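
The request() change above keeps one dedicated event loop for both the streaming and non-streaming paths. A self-contained sketch of the underlying pattern (driving an async generator from synchronous code on its own loop), using hypothetical names such as fake_stream and sync_iter:

import asyncio
from collections.abc import AsyncIterator, Iterator


async def fake_stream() -> AsyncIterator[int]:
    # Stand-in for an async stream returned by an async client.
    for i in range(3):
        yield i


def sync_iter(loop: asyncio.AbstractEventLoop, agen: AsyncIterator[int]) -> Iterator[int]:
    # Pull items from the async generator one at a time on the given loop.
    try:
        while True:
            yield loop.run_until_complete(agen.__anext__())
    except StopAsyncIteration:
        pass
    finally:
        loop.close()


loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
print(list(sync_iter(loop, fake_stream())))  # [0, 1, 2]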

View file

@@ -8,7 +8,7 @@ import json
 import time
 import uuid
 from collections.abc import AsyncIterator
-from typing import Any, cast
+from typing import Any

 from openai.types.chat import ChatCompletionToolParam
 from pydantic import BaseModel
@@ -200,7 +200,6 @@ class ChatCompletionContext(BaseModel):
     messages: list[OpenAIMessageParam]
     tools: list[ChatCompletionToolParam] | None = None
     mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP]
-    stream: bool
     temperature: float | None
     response_format: OpenAIResponseFormatParam
@@ -281,49 +280,6 @@ class OpenAIResponsesImpl:
         """
         return await self.responses_store.list_response_input_items(response_id, after, before, include, limit, order)

-    def _is_function_tool_call(
-        self,
-        tool_call: OpenAIChatCompletionToolCall,
-        tools: list[OpenAIResponseInputTool],
-    ) -> bool:
-        if not tool_call.function:
-            return False
-        for t in tools:
-            if t.type == "function" and t.name == tool_call.function.name:
-                return True
-        return False
-
-    async def _process_response_choices(
-        self,
-        chat_response: OpenAIChatCompletion,
-        ctx: ChatCompletionContext,
-        tools: list[OpenAIResponseInputTool] | None,
-    ) -> list[OpenAIResponseOutput]:
-        """Handle tool execution and response message creation."""
-        output_messages: list[OpenAIResponseOutput] = []
-        # Execute tool calls if any
-        for choice in chat_response.choices:
-            if choice.message.tool_calls and tools:
-                # Assume if the first tool is a function, all tools are functions
-                if self._is_function_tool_call(choice.message.tool_calls[0], tools):
-                    for tool_call in choice.message.tool_calls:
-                        output_messages.append(
-                            OpenAIResponseOutputMessageFunctionToolCall(
-                                arguments=tool_call.function.arguments or "",
-                                call_id=tool_call.id,
-                                name=tool_call.function.name or "",
-                                id=f"fc_{uuid.uuid4()}",
-                                status="completed",
-                            )
-                        )
-                else:
-                    tool_messages = await self._execute_tool_and_return_final_output(choice, ctx)
-                    output_messages.extend(tool_messages)
-            else:
-                output_messages.append(await _convert_chat_choice_to_response_message(choice))
-        return output_messages
-
     async def _store_response(
         self,
         response: OpenAIResponseObject,
@@ -370,9 +326,48 @@ class OpenAIResponsesImpl:
         tools: list[OpenAIResponseInputTool] | None = None,
         max_infer_iters: int | None = 10,
     ):
-        stream = False if stream is None else stream
+        stream = bool(stream)
         text = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")) if text is None else text

+        stream_gen = self._create_streaming_response(
+            input=input,
+            model=model,
+            instructions=instructions,
+            previous_response_id=previous_response_id,
+            store=store,
+            temperature=temperature,
+            text=text,
+            tools=tools,
+            max_infer_iters=max_infer_iters,
+        )
+
+        if stream:
+            return stream_gen
+        else:
+            response = None
+            async for stream_chunk in stream_gen:
+                if stream_chunk.type == "response.completed":
+                    if response is not None:
+                        raise ValueError(f"The response stream completed multiple times! Earlier response: {response}")
+                    response = stream_chunk.response
+                # don't leave the generator half complete!
+
+            if response is None:
+                raise ValueError("The response stream never completed")
+            return response
+
+    async def _create_streaming_response(
+        self,
+        input: str | list[OpenAIResponseInput],
+        model: str,
+        instructions: str | None = None,
+        previous_response_id: str | None = None,
+        store: bool | None = True,
+        temperature: float | None = None,
+        text: OpenAIResponseText | None = None,
+        tools: list[OpenAIResponseInputTool] | None = None,
+        max_infer_iters: int | None = 10,
+    ) -> AsyncIterator[OpenAIResponseObjectStream]:
         output_messages: list[OpenAIResponseOutput] = []

         # Input preprocessing
@@ -383,7 +378,7 @@ class OpenAIResponsesImpl:
         # Structured outputs
         response_format = await _convert_response_text_to_chat_response_format(text)

-        # Tool setup
+        # Tool setup, TODO: refactor this slightly since this can also yield events
         chat_tools, mcp_tool_to_server, mcp_list_message = (
             await self._convert_response_tools_to_chat_tools(tools) if tools else (None, {}, None)
         )
@@ -395,136 +390,10 @@ class OpenAIResponsesImpl:
             messages=messages,
             tools=chat_tools,
             mcp_tool_to_server=mcp_tool_to_server,
-            stream=stream,
             temperature=temperature,
             response_format=response_format,
         )

-        # Fork to streaming vs non-streaming - let each handle ALL inference rounds
-        if stream:
-            return self._create_streaming_response(
-                ctx=ctx,
-                output_messages=output_messages,
-                input=input,
-                model=model,
-                store=store,
-                text=text,
-                tools=tools,
-                max_infer_iters=max_infer_iters,
-            )
-        else:
-            return await self._create_non_streaming_response(
-                ctx=ctx,
-                output_messages=output_messages,
-                input=input,
-                model=model,
-                store=store,
-                text=text,
-                tools=tools,
-                max_infer_iters=max_infer_iters,
-            )
-
-    async def _create_non_streaming_response(
-        self,
-        ctx: ChatCompletionContext,
-        output_messages: list[OpenAIResponseOutput],
-        input: str | list[OpenAIResponseInput],
-        model: str,
-        store: bool | None,
-        text: OpenAIResponseText,
-        tools: list[OpenAIResponseInputTool] | None,
-        max_infer_iters: int,
-    ) -> OpenAIResponseObject:
-        n_iter = 0
-        messages = ctx.messages.copy()
-
-        while True:
-            # Do inference (including the first one)
-            inference_result = await self.inference_api.openai_chat_completion(
-                model=ctx.model,
-                messages=messages,
-                tools=ctx.tools,
-                stream=False,
-                temperature=ctx.temperature,
-                response_format=ctx.response_format,
-            )
-            completion = OpenAIChatCompletion(**inference_result.model_dump())
-
-            # Separate function vs non-function tool calls
-            function_tool_calls = []
-            non_function_tool_calls = []
-
-            for choice in completion.choices:
-                if choice.message.tool_calls and tools:
-                    for tool_call in choice.message.tool_calls:
-                        if self._is_function_tool_call(tool_call, tools):
-                            function_tool_calls.append(tool_call)
-                        else:
-                            non_function_tool_calls.append(tool_call)
-
-            # Process response choices based on tool call types
-            if function_tool_calls:
-                # For function tool calls, use existing logic and return immediately
-                current_output_messages = await self._process_response_choices(
-                    chat_response=completion,
-                    ctx=ctx,
-                    tools=tools,
-                )
-                output_messages.extend(current_output_messages)
-                break
-            elif non_function_tool_calls:
-                # For non-function tool calls, execute them and continue loop
-                for choice in completion.choices:
-                    tool_outputs, tool_response_messages = await self._execute_tool_calls_only(choice, ctx)
-                    output_messages.extend(tool_outputs)
-
-                    # Add assistant message and tool responses to messages for next iteration
-                    messages.append(choice.message)
-                    messages.extend(tool_response_messages)
-
-                n_iter += 1
-                if n_iter >= max_infer_iters:
-                    break
-
-                # Continue with next iteration of the loop
-                continue
-            else:
-                # No tool calls - convert response to message and we're done
-                for choice in completion.choices:
-                    output_messages.append(await _convert_chat_choice_to_response_message(choice))
-                break
-
-        response = OpenAIResponseObject(
-            created_at=completion.created,
-            id=f"resp-{uuid.uuid4()}",
-            model=model,
-            object="response",
-            status="completed",
-            output=output_messages,
-            text=text,
-        )
-        logger.debug(f"OpenAI Responses response: {response}")
-
-        # Store response if requested
-        if store:
-            await self._store_response(
-                response=response,
-                input=input,
-            )
-
-        return response
-
-    async def _create_streaming_response(
-        self,
-        ctx: ChatCompletionContext,
-        output_messages: list[OpenAIResponseOutput],
-        input: str | list[OpenAIResponseInput],
-        model: str,
-        store: bool | None,
-        text: OpenAIResponseText,
-        tools: list[OpenAIResponseInputTool] | None,
-        max_infer_iters: int | None,
-    ) -> AsyncIterator[OpenAIResponseObjectStream]:
         # Create initial response and emit response.created immediately
         response_id = f"resp-{uuid.uuid4()}"
         created_at = int(time.time())
@@ -539,15 +408,13 @@ class OpenAIResponsesImpl:
             text=text,
         )

-        # Emit response.created immediately
         yield OpenAIResponseObjectStreamResponseCreated(response=initial_response)

-        # Implement tool execution loop for streaming - handle ALL inference rounds including the first
         n_iter = 0
         messages = ctx.messages.copy()

         while True:
-            current_inference_result = await self.inference_api.openai_chat_completion(
+            completion_result = await self.inference_api.openai_chat_completion(
                 model=ctx.model,
                 messages=messages,
                 tools=ctx.tools,
@@ -568,7 +435,7 @@ class OpenAIResponsesImpl:
             # Create a placeholder message item for delta events
             message_item_id = f"msg_{uuid.uuid4()}"

-            async for chunk in current_inference_result:
+            async for chunk in completion_result:
                 chat_response_id = chunk.id
                 chunk_created = chunk.created
                 chunk_model = chunk.model
@@ -628,49 +495,54 @@ class OpenAIResponsesImpl:
                 model=chunk_model,
             )

-            # Separate function vs non-function tool calls
             function_tool_calls = []
             non_function_tool_calls = []
+
+            next_turn_messages = messages.copy()
             for choice in current_response.choices:
+                next_turn_messages.append(choice.message)
+
                 if choice.message.tool_calls and tools:
                     for tool_call in choice.message.tool_calls:
-                        if self._is_function_tool_call(tool_call, tools):
+                        if _is_function_tool_call(tool_call, tools):
                             function_tool_calls.append(tool_call)
                         else:
                             non_function_tool_calls.append(tool_call)
+                else:
+                    output_messages.append(await _convert_chat_choice_to_response_message(choice))

-            # Process response choices based on tool call types
-            if function_tool_calls:
-                # For function tool calls, use existing logic and break
-                current_output_messages = await self._process_response_choices(
-                    chat_response=current_response,
-                    ctx=ctx,
-                    tools=tools,
-                )
-                output_messages.extend(current_output_messages)
-                break
-            elif non_function_tool_calls:
-                # For non-function tool calls, execute them and continue loop
-                for choice in current_response.choices:
-                    tool_outputs, tool_response_messages = await self._execute_tool_calls_only(choice, ctx)
-                    output_messages.extend(tool_outputs)
-
-                    # Add assistant message and tool responses to messages for next iteration
-                    messages.append(choice.message)
-                    messages.extend(tool_response_messages)
-
-                n_iter += 1
-                if n_iter >= (max_infer_iters or 10):
-                    break
-
-                # Continue with next iteration of the loop
-                continue
-            else:
-                # No tool calls - convert response to message and we're done
-                for choice in current_response.choices:
-                    output_messages.append(await _convert_chat_choice_to_response_message(choice))
-                break
+            # execute non-function tool calls
+            for tool_call in non_function_tool_calls:
+                tool_call_log, tool_response_message = await self._execute_tool_call(tool_call, ctx)
+                if tool_call_log:
+                    output_messages.append(tool_call_log)
+                if tool_response_message:
+                    next_turn_messages.append(tool_response_message)
+
+            for tool_call in function_tool_calls:
+                output_messages.append(
+                    OpenAIResponseOutputMessageFunctionToolCall(
+                        arguments=tool_call.function.arguments or "",
+                        call_id=tool_call.id,
+                        name=tool_call.function.name or "",
+                        id=f"fc_{uuid.uuid4()}",
+                        status="completed",
+                    )
+                )
+
+            if not function_tool_calls and not non_function_tool_calls:
+                break
+
+            if function_tool_calls:
+                logger.info("Exiting inference loop since there is a function (client-side) tool call")
+                break

             n_iter += 1
-            if n_iter >= (max_infer_iters or 10):
+            if n_iter >= max_infer_iters:
+                logger.info(f"Exiting inference loop since iteration count({n_iter}) exceeds {max_infer_iters=}")
                 break

+            messages = next_turn_messages
+
         # Create final response
         final_response = OpenAIResponseObject(
@@ -683,15 +555,15 @@ class OpenAIResponsesImpl:
             output=output_messages,
         )

-        # Emit response.completed
-        yield OpenAIResponseObjectStreamResponseCompleted(response=final_response)
-
         if store:
             await self._store_response(
                 response=final_response,
                 input=input,
             )

+        # Emit response.completed
+        yield OpenAIResponseObjectStreamResponseCompleted(response=final_response)
+
     async def _convert_response_tools_to_chat_tools(
         self, tools: list[OpenAIResponseInputTool]
     ) -> tuple[
@@ -784,73 +656,6 @@ class OpenAIResponsesImpl:
                 raise ValueError(f"Llama Stack OpenAI Responses does not yet support tool type: {input_tool.type}")
         return chat_tools, mcp_tool_to_server, mcp_list_message

-    async def _execute_tool_calls_only(
-        self,
-        choice: OpenAIChoice,
-        ctx: ChatCompletionContext,
-    ) -> tuple[list[OpenAIResponseOutput], list[OpenAIMessageParam]]:
-        """Execute tool calls and return output messages and tool response messages for next inference."""
-        output_messages: list[OpenAIResponseOutput] = []
-        tool_response_messages: list[OpenAIMessageParam] = []
-
-        if not isinstance(choice.message, OpenAIAssistantMessageParam):
-            return output_messages, tool_response_messages
-
-        if not choice.message.tool_calls:
-            return output_messages, tool_response_messages
-
-        for tool_call in choice.message.tool_calls:
-            tool_call_log, further_input = await self._execute_tool_call(tool_call, ctx)
-            if tool_call_log:
-                output_messages.append(tool_call_log)
-            if further_input:
-                tool_response_messages.append(further_input)
-
-        return output_messages, tool_response_messages
-
-    async def _execute_tool_and_return_final_output(
-        self,
-        choice: OpenAIChoice,
-        ctx: ChatCompletionContext,
-    ) -> list[OpenAIResponseOutput]:
-        output_messages: list[OpenAIResponseOutput] = []
-
-        if not isinstance(choice.message, OpenAIAssistantMessageParam):
-            return output_messages
-
-        if not choice.message.tool_calls:
-            return output_messages
-
-        next_turn_messages = ctx.messages.copy()
-
-        # Add the assistant message with tool_calls response to the messages list
-        next_turn_messages.append(choice.message)
-
-        for tool_call in choice.message.tool_calls:
-            # TODO: telemetry spans for tool calls
-            tool_call_log, further_input = await self._execute_tool_call(tool_call, ctx)
-            if tool_call_log:
-                output_messages.append(tool_call_log)
-            if further_input:
-                next_turn_messages.append(further_input)
-
-        tool_results_chat_response = await self.inference_api.openai_chat_completion(
-            model=ctx.model,
-            messages=next_turn_messages,
-            stream=ctx.stream,
-            temperature=ctx.temperature,
-        )
-        # type cast to appease mypy: this is needed because we don't handle streaming properly :)
-        tool_results_chat_response = cast(OpenAIChatCompletion, tool_results_chat_response)
-
-        # Huge TODO: these are NOT the final outputs, we must keep the loop going
-        tool_final_outputs = [
-            await _convert_chat_choice_to_response_message(choice) for choice in tool_results_chat_response.choices
-        ]
-        # TODO: Wire in annotations with URLs, titles, etc to these output messages
-        output_messages.extend(tool_final_outputs)
-        return output_messages
-
     async def _execute_tool_call(
         self,
         tool_call: OpenAIChatCompletionToolCall,
@@ -939,3 +744,15 @@ class OpenAIResponsesImpl:
             input_message = OpenAIToolMessageParam(content=text, tool_call_id=tool_call_id)

         return message, input_message
+
+
+def _is_function_tool_call(
+    tool_call: OpenAIChatCompletionToolCall,
+    tools: list[OpenAIResponseInputTool],
+) -> bool:
+    if not tool_call.function:
+        return False
+    for t in tools:
+        if t.type == "function" and t.name == tool_call.function.name:
+            return True
+    return False
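
With this refactor, the non-streaming path simply drains the streaming generator and keeps the response.completed payload. A minimal sketch of that consume-the-stream pattern, using stand-in event types rather than the llama_stack classes:

import asyncio
from collections.abc import AsyncIterator
from dataclasses import dataclass


@dataclass
class Event:
    type: str
    response: str | None = None


async def stream_events() -> AsyncIterator[Event]:
    # Stand-in for the streaming generator built above.
    yield Event("response.created")
    yield Event("response.output_text.delta")
    yield Event("response.completed", response="final payload")


async def non_streaming() -> str:
    response = None
    async for chunk in stream_events():
        if chunk.type == "response.completed":
            if response is not None:
                raise ValueError("stream completed multiple times")
            response = chunk.response
        # keep iterating so the generator is fully consumed
    if response is None:
        raise ValueError("stream never completed")
    return response


print(asyncio.run(non_streaming()))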

View file

@@ -24,7 +24,7 @@ def available_providers() -> list[ProviderSpec]:
                 "pandas",
                 "scikit-learn",
             ]
-            + kvstore_dependencies(),
+            + kvstore_dependencies(),  # TODO make this dynamic based on the kvstore config
             module="llama_stack.providers.inline.agents.meta_reference",
             config_class="llama_stack.providers.inline.agents.meta_reference.MetaReferenceAgentsImplConfig",
             api_dependencies=[

View file

@@ -345,21 +345,27 @@ class OllamaInferenceAdapter(
                 model = await self.register_helper.register_model(model)
             except ValueError:
                 pass  # Ignore statically unknown model, will check live listing

+        if model.provider_resource_id is None:
+            raise ValueError("Model provider_resource_id cannot be None")
+
         if model.model_type == ModelType.embedding:
             logger.info(f"Pulling embedding model `{model.provider_resource_id}` if necessary...")
-            # TODO: you should pull here only if the model is not found in a list
-            await self.client.pull(model.provider_resource_id)
+            response = await self.client.list()
+            if model.provider_resource_id not in [m.model for m in response.models]:
+                await self.client.pull(model.provider_resource_id)

         # we use list() here instead of ps() -
         #  - ps() only lists running models, not available models
         #  - models not currently running are run by the ollama server as needed
         response = await self.client.list()
-        available_models = [m["model"] for m in response["models"]]
-        if model.provider_resource_id is None:
-            raise ValueError("Model provider_resource_id cannot be None")
+        available_models = [m.model for m in response.models]
+
         provider_resource_id = self.register_helper.get_provider_model_id(model.provider_resource_id)
         if provider_resource_id is None:
             provider_resource_id = model.provider_resource_id
         if provider_resource_id not in available_models:
-            available_models_latest = [m["model"].split(":latest")[0] for m in response["models"]]
+            available_models_latest = [m.model.split(":latest")[0] for m in response.models]
             if provider_resource_id in available_models_latest:
                 logger.warning(
                     f"Imprecise provider resource id was used but 'latest' is available in Ollama - using '{model.provider_resource_id}:latest'"

View file

@@ -36,6 +36,10 @@ class RedisKVStoreConfig(CommonConfig):
     def url(self) -> str:
         return f"redis://{self.host}:{self.port}"

+    @property
+    def pip_packages(self) -> list[str]:
+        return ["redis"]
+
     @classmethod
     def sample_run_config(cls):
         return {
@@ -53,6 +57,10 @@ class SqliteKVStoreConfig(CommonConfig):
         description="File path for the sqlite database",
     )

+    @property
+    def pip_packages(self) -> list[str]:
+        return ["aiosqlite"]
+
     @classmethod
     def sample_run_config(cls, __distro_dir__: str, db_name: str = "kvstore.db"):
         return {
@@ -100,6 +108,10 @@ class PostgresKVStoreConfig(CommonConfig):
             raise ValueError("Table name must be less than 63 characters")
         return v

+    @property
+    def pip_packages(self) -> list[str]:
+        return ["psycopg2-binary"]
+

 class MongoDBKVStoreConfig(CommonConfig):
     type: Literal[KVStoreType.mongodb.value] = KVStoreType.mongodb.value
@@ -110,6 +122,10 @@ class MongoDBKVStoreConfig(CommonConfig):
     password: str | None = None
     collection_name: str = "llamastack_kvstore"

+    @property
+    def pip_packages(self) -> list[str]:
+        return ["pymongo"]
+
     @classmethod
     def sample_run_config(cls, collection_name: str = "llamastack_kvstore"):
         return {

View file

@@ -10,6 +10,13 @@ from .config import KVStoreConfig, KVStoreType


 def kvstore_dependencies():
+    """
+    Returns all possible kvstore dependencies for registry/provider specifications.
+
+    NOTE: For specific kvstore implementations, use config.pip_packages instead.
+    This function returns the union of all dependencies for cases where the specific
+    kvstore type is not known at declaration time (e.g., provider registries).
+    """
     return ["aiosqlite", "psycopg2-binary", "redis", "pymongo"]

View file

@@ -21,4 +21,5 @@ distribution_spec:
 image_type: conda
 additional_pip_packages:
 - asyncpg
+- psycopg2-binary
 - sqlalchemy[asyncio]

View file

@@ -186,8 +186,14 @@ class DistributionTemplate(BaseModel):
         additional_pip_packages: list[str] = []
         for run_config in self.run_configs.values():
             run_config_ = run_config.run_config(self.name, self.providers, self.container_image)
+
+            # TODO: This is a hack to get the dependencies for internal APIs into build
+            # We should have a better way to do this by formalizing the concept of "internal" APIs
+            # and providers, with a way to specify dependencies for them.
             if run_config_.inference_store:
                 additional_pip_packages.extend(run_config_.inference_store.pip_packages)

+            if run_config_.metadata_store:
+                additional_pip_packages.extend(run_config_.metadata_store.pip_packages)
+
         if self.additional_pip_packages:
             additional_pip_packages.extend(self.additional_pip_packages)

View file

@@ -19,7 +19,7 @@
     "@radix-ui/react-tooltip": "^1.2.6",
     "class-variance-authority": "^0.7.1",
     "clsx": "^2.1.1",
-    "llama-stack-client": "0.2.9",
+    "llama-stack-client": "0.2.10",
     "lucide-react": "^0.510.0",
     "next": "15.3.2",
     "next-themes": "^0.4.6",

View file

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "llama_stack"
-version = "0.2.9"
+version = "0.2.10"
 authors = [{ name = "Meta Llama", email = "llama-oss@meta.com" }]
 description = "Llama Stack"
 readme = "README.md"
@@ -22,12 +22,13 @@ classifiers = [
 ]
 dependencies = [
     "aiohttp",
+    "fastapi>=0.115.0,<1.0",
     "fire",
     "httpx",
     "huggingface-hub",
     "jinja2>=3.1.6",
     "jsonschema",
-    "llama-stack-client>=0.2.9",
+    "llama-stack-client>=0.2.10",
     "openai>=1.66",
     "prompt-toolkit",
     "python-dotenv",
@@ -48,7 +49,7 @@ dependencies = [
 ui = [
     "streamlit",
     "pandas",
-    "llama-stack-client>=0.2.9",
+    "llama-stack-client>=0.2.10",
     "streamlit-option-menu",
 ]
@@ -67,7 +68,6 @@ dev = [
     "types-setuptools",
     "pre-commit",
     "uvicorn",
-    "fastapi",
     "ruamel.yaml", # needed for openapi generator
 ]
 # These are the dependencies required for running unit tests.
@@ -133,7 +133,8 @@ llama = "llama_stack.cli.llama:main"
 install-wheel-from-presigned = "llama_stack.cli.scripts.run:install_wheel_from_presigned"

 [tool.setuptools.packages.find]
-include = ["llama_stack"]
+where = ["."]
+include = ["llama_stack", "llama_stack.*"]

 [[tool.uv.index]]
 name = "pytorch-cpu"

View file

@@ -42,6 +42,8 @@ ecdsa==0.19.1
     # via python-jose
 exceptiongroup==1.2.2 ; python_full_version < '3.11'
     # via anyio
+fastapi==0.115.8
+    # via llama-stack
 filelock==3.17.0
     # via huggingface-hub
 fire==0.7.0
@@ -79,7 +81,7 @@ jsonschema==4.23.0
     # via llama-stack
 jsonschema-specifications==2024.10.1
     # via jsonschema
-llama-stack-client==0.2.9
+llama-stack-client==0.2.10
     # via llama-stack
 markdown-it-py==3.0.0
     # via rich
@@ -117,6 +119,7 @@ pyasn1==0.4.8
     #   rsa
 pydantic==2.10.6
     # via
+    #   fastapi
     #   llama-stack
     #   llama-stack-client
     #   openai
@@ -171,7 +174,9 @@ sniffio==1.3.1
     #   llama-stack-client
     #   openai
 starlette==0.45.3
-    # via llama-stack
+    # via
+    #   fastapi
+    #   llama-stack
 termcolor==2.5.0
     # via
     #   fire
@@ -187,6 +192,7 @@ tqdm==4.67.1
 typing-extensions==4.12.2
     # via
     #   anyio
+    #   fastapi
     #   huggingface-hub
     #   llama-stack-client
     #   multidict

View file

@@ -15,11 +15,6 @@ from pathlib import Path

 from rich.progress import Progress, SpinnerColumn, TextColumn

-from llama_stack.distribution.build import (
-    SERVER_DEPENDENCIES,
-    get_provider_dependencies,
-)
-
 REPO_ROOT = Path(__file__).parent.parent
@@ -90,23 +85,6 @@ def check_for_changes(change_tracker: ChangedPathTracker) -> bool:
     return has_changes


-def collect_template_dependencies(template_dir: Path) -> tuple[str | None, list[str]]:
-    try:
-        module_name = f"llama_stack.templates.{template_dir.name}"
-        module = importlib.import_module(module_name)
-
-        if template_func := getattr(module, "get_distribution_template", None):
-            template = template_func()
-            normal_deps, special_deps = get_provider_dependencies(template)
-            # Combine all dependencies in order: normal deps, special deps, server deps
-            all_deps = sorted(set(normal_deps + SERVER_DEPENDENCIES)) + sorted(set(special_deps))
-
-            return template.name, all_deps
-    except Exception:
-        return None, []
-    return None, []
-
-
 def pre_import_templates(template_dirs: list[Path]) -> None:
     # Pre-import all template modules to avoid deadlocks.
     for template_dir in template_dirs:

tests/Containerfile (new file, 13 additions)
View file

@@ -0,0 +1,13 @@
+# Containerfile used to build our all in one ollama image to run tests in CI
+# podman build --platform linux/amd64 -f Containerfile -t ollama-with-models .
+#
+FROM --platform=linux/amd64 ollama/ollama:latest
+
+# Start ollama and pull models in a single layer
+RUN ollama serve & \
+    sleep 5 && \
+    ollama pull llama3.2:3b-instruct-fp16 && \
+    ollama pull all-minilm:latest
+
+# Set the entrypoint to start ollama serve
+ENTRYPOINT ["ollama", "serve"]

View file

@@ -6,9 +6,15 @@
 from io import BytesIO

+import pytest

-def test_openai_client_basic_operations(openai_client):
+from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
+
+
+def test_openai_client_basic_operations(openai_client, client_with_models):
     """Test basic file operations through OpenAI client."""
+    if isinstance(client_with_models, LlamaStackAsLibraryClient):
+        pytest.skip("OpenAI files are not supported when testing with library client yet.")
     client = openai_client
     test_content = b"files test content"

View file

@@ -80,6 +80,37 @@ def openai_responses_impl(mock_inference_api, mock_tool_groups_api, mock_tool_ru
     )


+async def fake_stream(fixture: str = "simple_chat_completion.yaml"):
+    value = load_chat_completion_fixture(fixture)
+    yield ChatCompletionChunk(
+        id=value.id,
+        choices=[
+            Choice(
+                index=0,
+                delta=ChoiceDelta(
+                    content=c.message.content,
+                    role=c.message.role,
+                    tool_calls=[
+                        ChoiceDeltaToolCall(
+                            index=0,
+                            id=t.id,
+                            function=ChoiceDeltaToolCallFunction(
+                                name=t.function.name,
+                                arguments=t.function.arguments,
+                            ),
+                        )
+                        for t in (c.message.tool_calls or [])
+                    ],
+                ),
+            )
+            for c in value.choices
+        ],
+        created=1,
+        model=value.model,
+        object="chat.completion.chunk",
+    )
+
+
 @pytest.mark.asyncio
 async def test_create_openai_response_with_string_input(openai_responses_impl, mock_inference_api):
     """Test creating an OpenAI response with a simple string input."""
@@ -88,8 +119,7 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m
     model = "meta-llama/Llama-3.1-8B-Instruct"

     # Load the chat completion fixture
-    mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
-    mock_inference_api.openai_chat_completion.return_value = mock_chat_completion
+    mock_inference_api.openai_chat_completion.return_value = fake_stream()

     # Execute
     result = await openai_responses_impl.create_openai_response(
@@ -104,7 +134,7 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m
         messages=[OpenAIUserMessageParam(role="user", content="What is the capital of Ireland?", name=None)],
         response_format=OpenAIResponseFormatText(),
         tools=None,
-        stream=False,
+        stream=True,
         temperature=0.1,
     )
     openai_responses_impl.responses_store.store_response_object.assert_called_once()
@@ -121,20 +151,15 @@ async def test_create_openai_response_with_string_input_with_tools(openai_respon
     input_text = "What is the capital of Ireland?"
     model = "meta-llama/Llama-3.1-8B-Instruct"

-    # Load the chat completion fixtures
-    tool_call_completion = load_chat_completion_fixture("tool_call_completion.yaml")
-    tool_response_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
-
     mock_inference_api.openai_chat_completion.side_effect = [
-        tool_call_completion,
-        tool_response_completion,
+        fake_stream("tool_call_completion.yaml"),
+        fake_stream(),
     ]

     openai_responses_impl.tool_groups_api.get_tool.return_value = Tool(
         identifier="web_search",
         provider_id="client",
         toolgroup_id="web_search",
-        tool_host="client",
         description="Search the web for information",
         parameters=[
             ToolParameter(name="query", parameter_type="string", description="The query to search for", required=True)
@@ -189,7 +214,7 @@ async def test_create_openai_response_with_tool_call_type_none(openai_responses_
     input_text = "How hot it is in San Francisco today?"
     model = "meta-llama/Llama-3.1-8B-Instruct"

-    async def fake_stream():
+    async def fake_stream_toolcall():
         yield ChatCompletionChunk(
             id="123",
             choices=[
@@ -212,7 +237,7 @@ async def test_create_openai_response_with_tool_call_type_none(openai_responses_
             object="chat.completion.chunk",
         )

-    mock_inference_api.openai_chat_completion.return_value = fake_stream()
+    mock_inference_api.openai_chat_completion.return_value = fake_stream_toolcall()

     # Execute
     result = await openai_responses_impl.create_openai_response(
@@ -271,7 +296,7 @@ async def test_create_openai_response_with_multiple_messages(openai_responses_im
     ]
     model = "meta-llama/Llama-3.1-8B-Instruct"

-    mock_inference_api.openai_chat_completion.return_value = load_chat_completion_fixture("simple_chat_completion.yaml")
+    mock_inference_api.openai_chat_completion.return_value = fake_stream()

     # Execute
     await openai_responses_impl.create_openai_response(
@@ -399,9 +424,7 @@ async def test_create_openai_response_with_instructions(openai_responses_impl, m
     model = "meta-llama/Llama-3.1-8B-Instruct"
     instructions = "You are a geography expert. Provide concise answers."

-    # Load the chat completion fixture
-    mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
-    mock_inference_api.openai_chat_completion.return_value = mock_chat_completion
+    mock_inference_api.openai_chat_completion.return_value = fake_stream()

     # Execute
     await openai_responses_impl.create_openai_response(
@@ -440,8 +463,7 @@ async def test_create_openai_response_with_instructions_and_multiple_messages(
     model = "meta-llama/Llama-3.1-8B-Instruct"
     instructions = "You are a geography expert. Provide concise answers."

-    mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
-    mock_inference_api.openai_chat_completion.return_value = mock_chat_completion
+    mock_inference_api.openai_chat_completion.return_value = fake_stream()

     # Execute
     await openai_responses_impl.create_openai_response(
@@ -499,8 +521,8 @@ async def test_create_openai_response_with_instructions_and_previous_response(
     model = "meta-llama/Llama-3.1-8B-Instruct"
     instructions = "You are a geography expert. Provide concise answers."

-    mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
-    mock_inference_api.openai_chat_completion.return_value = mock_chat_completion
+    mock_inference_api.openai_chat_completion.return_value = fake_stream()

     # Execute
     await openai_responses_impl.create_openai_response(
@@ -674,8 +696,8 @@ async def test_store_response_uses_rehydrated_input_with_previous_response(
     current_input = "Now what is 3+3?"
     model = "meta-llama/Llama-3.1-8B-Instruct"

-    mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
-    mock_inference_api.openai_chat_completion.return_value = mock_chat_completion
+    mock_inference_api.openai_chat_completion.return_value = fake_stream()

     # Execute - Create response with previous_response_id
     result = await openai_responses_impl.create_openai_response(
@@ -732,9 +754,7 @@ async def test_create_openai_response_with_text_format(
     input_text = "How hot it is in San Francisco today?"
     model = "meta-llama/Llama-3.1-8B-Instruct"

-    # Load the chat completion fixture
-    mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
-    mock_inference_api.openai_chat_completion.return_value = mock_chat_completion
+    mock_inference_api.openai_chat_completion.return_value = fake_stream()

     # Execute
     _result = await openai_responses_impl.create_openai_response(