diff --git a/.github/actions/setup-ollama/action.yml b/.github/actions/setup-ollama/action.yml
index 3dd6c940c..1f6e9818b 100644
--- a/.github/actions/setup-ollama/action.yml
+++ b/.github/actions/setup-ollama/action.yml
@@ -1,26 +1,9 @@
 name: Setup Ollama
-description: Start Ollama and cache model
-inputs:
-  models:
-    description: Comma-separated list of models to pull
-    default: "llama3.2:3b-instruct-fp16,all-minilm:latest"
+description: Start Ollama
 runs:
   using: "composite"
   steps:
-    - name: Install and start Ollama
+    - name: Start Ollama
       shell: bash
       run: |
-        # the ollama installer also starts the ollama service
-        curl -fsSL https://ollama.com/install.sh | sh
-
-    # Do NOT cache models - pulling the cache is actually slower than just pulling the model.
-    # It takes ~45 seconds to pull the models from the cache and unpack it, but only 30 seconds to
-    # pull them directly.
-    # Maybe this is because the cache is being pulled at the same time by all the matrix jobs?
-    - name: Pull requested models
-      if: inputs.models != ''
-      shell: bash
-      run: |
-        for model in $(echo "${{ inputs.models }}" | tr ',' ' '); do
-          ollama pull "$model"
-        done
+        docker run -d --name ollama -p 11434:11434 docker.io/leseb/ollama-with-models
diff --git a/.github/actions/setup-runner/action.yml b/.github/actions/setup-runner/action.yml
index 6cba4fdc3..cdd438eb2 100644
--- a/.github/actions/setup-runner/action.yml
+++ b/.github/actions/setup-runner/action.yml
@@ -1,12 +1,17 @@
 name: Setup runner
 description: Prepare a runner for the tests (install uv, python, project dependencies, etc.)
+inputs:
+  python-version:
+    description: The Python version to use
+    required: false
+    default: "3.10"
 runs:
   using: "composite"
   steps:
     - name: Install uv
       uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
       with:
-        python-version: "3.10"
+        python-version: ${{ inputs.python-version }}
         activate-environment: true
         version: 0.7.6
diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml
index d78e82c9d..7aa8b5807 100644
--- a/.github/workflows/integration-tests.yml
+++ b/.github/workflows/integration-tests.yml
@@ -26,6 +26,7 @@ jobs:
         # TODO: generate matrix list from tests/integration when fixed
         test-type: [agents, inference, datasets, inspect, scoring, post_training, providers, tool_runtime]
         client-type: [library, http]
+        python-version: ["3.10", "3.11", "3.12"]
       fail-fast: false # we want to run all tests regardless of failure

     steps:
@@ -34,20 +35,22 @@ jobs:

       - name: Install dependencies
         uses: ./.github/actions/setup-runner
+        with:
+          python-version: ${{ matrix.python-version }}

       - name: Setup ollama
         uses: ./.github/actions/setup-ollama

       - name: Build Llama Stack
         run: |
-          llama stack build --template ollama --image-type venv
+          uv run llama stack build --template ollama --image-type venv

       - name: Start Llama Stack server in background
         if: matrix.client-type == 'http'
         env:
           INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
         run: |
-          LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv &
+          LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv --env OLLAMA_URL="http://0.0.0.0:11434" &

       - name: Wait for Llama Stack server to be ready
         if: matrix.client-type == 'http'
@@ -84,6 +87,7 @@ jobs:
       - name: Run Integration Tests
         env:
           INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
+          OLLAMA_URL: "http://0.0.0.0:11434"
         run: |
           if [ "${{ matrix.client-type }}" == "library" ]; then
             stack_config="ollama"
@@ -104,13 +108,13 @@
       - name: Write ollama logs to file
         if: ${{ always() }}
         run: |
-          sudo journalctl -u ollama.service > ollama.log
+          sudo docker logs ollama > ollama.log

       - name: Upload all logs to artifacts
         if: ${{ always() }}
         uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
         with:
-          name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}
+          name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}-${{ matrix.python-version }}
         path: |
           *.log
         retention-days: 1
diff --git a/llama_stack/distribution/build.py b/llama_stack/distribution/build.py
index 5906614ed..4f9091a5d 100644
--- a/llama_stack/distribution/build.py
+++ b/llama_stack/distribution/build.py
@@ -43,23 +43,12 @@ def get_provider_dependencies(
     config: BuildConfig | DistributionTemplate,
 ) -> tuple[list[str], list[str]]:
     """Get normal and special dependencies from provider configuration."""
-    # Extract providers based on config type
     if isinstance(config, DistributionTemplate):
-        providers = config.providers
+        config = config.build_config()
+
+    providers = config.distribution_spec.providers
+    additional_pip_packages = config.additional_pip_packages

-        # TODO: This is a hack to get the dependencies for internal APIs into build
-        # We should have a better way to do this by formalizing the concept of "internal" APIs
-        # and providers, with a way to specify dependencies for them.
-        run_configs = config.run_configs
-        additional_pip_packages: list[str] = []
-        if run_configs:
-            for run_config in run_configs.values():
-                run_config_ = run_config.run_config(name="", providers={}, container_image=None)
-                if run_config_.inference_store:
-                    additional_pip_packages.extend(run_config_.inference_store.pip_packages)
-    elif isinstance(config, BuildConfig):
-        providers = config.distribution_spec.providers
-        additional_pip_packages = config.additional_pip_packages
     deps = []
     registry = get_provider_registry(config)
     for api_str, provider_or_providers in providers.items():
@@ -87,8 +76,7 @@ def get_provider_dependencies(
         else:
             normal_deps.append(package)

-    if additional_pip_packages:
-        normal_deps.extend(additional_pip_packages)
+    normal_deps.extend(additional_pip_packages or [])

     return list(set(normal_deps)), list(set(special_deps))
diff --git a/llama_stack/distribution/library_client.py b/llama_stack/distribution/library_client.py
index f32130cf9..cebfabba5 100644
--- a/llama_stack/distribution/library_client.py
+++ b/llama_stack/distribution/library_client.py
@@ -149,12 +149,13 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
                 logger.info(f"Removed handler {handler.__class__.__name__} from root logger")

     def request(self, *args, **kwargs):
+        # NOTE: We are using AsyncLlamaStackClient under the hood
+        # A new event loop is needed to convert the AsyncStream
+        # from async client into SyncStream return type for streaming
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+
         if kwargs.get("stream"):
-            # NOTE: We are using AsyncLlamaStackClient under the hood
-            # A new event loop is needed to convert the AsyncStream
-            # from async client into SyncStream return type for streaming
-            loop = asyncio.new_event_loop()
-            asyncio.set_event_loop(loop)

             def sync_generator():
                 try:
@@ -172,7 +173,14 @@ class LlamaStackAsLibraryClient(LlamaStackClient):

             return sync_generator()
         else:
-            return asyncio.run(self.async_client.request(*args, **kwargs))
+            try:
+                result = loop.run_until_complete(self.async_client.request(*args, **kwargs))
+            finally:
+                pending = asyncio.all_tasks(loop)
+                if pending:
+                    loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True))
+                loop.close()
+            return result


 class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
diff --git a/llama_stack/providers/inline/agents/meta_reference/openai_responses.py b/llama_stack/providers/inline/agents/meta_reference/openai_responses.py
index 06f445c18..0ff6dc2c5 100644
--- a/llama_stack/providers/inline/agents/meta_reference/openai_responses.py
+++ b/llama_stack/providers/inline/agents/meta_reference/openai_responses.py
@@ -8,7 +8,7 @@ import json
 import time
 import uuid
 from collections.abc import AsyncIterator
-from typing import Any, cast
+from typing import Any

 from openai.types.chat import ChatCompletionToolParam
 from pydantic import BaseModel
@@ -200,7 +200,6 @@ class ChatCompletionContext(BaseModel):
     messages: list[OpenAIMessageParam]
     tools: list[ChatCompletionToolParam] | None = None
     mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP]
-    stream: bool
    temperature: float | None
    response_format: OpenAIResponseFormatParam
@@ -281,49 +280,6 @@ class OpenAIResponsesImpl:
         """
         return await self.responses_store.list_response_input_items(response_id, after, before, include, limit, order)

-    def _is_function_tool_call(
-        self,
-        tool_call: OpenAIChatCompletionToolCall,
-        tools: list[OpenAIResponseInputTool],
-    ) -> bool:
-        if not tool_call.function:
-            return False
-        for t in tools:
-            if t.type == "function" and t.name == tool_call.function.name:
-                return True
-        return False
-
-    async def _process_response_choices(
-        self,
-        chat_response: OpenAIChatCompletion,
-        ctx: ChatCompletionContext,
-        tools: list[OpenAIResponseInputTool] | None,
-    ) -> list[OpenAIResponseOutput]:
-        """Handle tool execution and response message creation."""
-        output_messages: list[OpenAIResponseOutput] = []
-        # Execute tool calls if any
-        for choice in chat_response.choices:
-            if choice.message.tool_calls and tools:
-                # Assume if the first tool is a function, all tools are functions
-                if self._is_function_tool_call(choice.message.tool_calls[0], tools):
-                    for tool_call in choice.message.tool_calls:
-                        output_messages.append(
-                            OpenAIResponseOutputMessageFunctionToolCall(
-                                arguments=tool_call.function.arguments or "",
-                                call_id=tool_call.id,
-                                name=tool_call.function.name or "",
-                                id=f"fc_{uuid.uuid4()}",
-                                status="completed",
-                            )
-                        )
-                else:
-                    tool_messages = await self._execute_tool_and_return_final_output(choice, ctx)
-                    output_messages.extend(tool_messages)
-            else:
-                output_messages.append(await _convert_chat_choice_to_response_message(choice))
-
-        return output_messages
-
     async def _store_response(
         self,
         response: OpenAIResponseObject,
@@ -370,9 +326,48 @@ class OpenAIResponsesImpl:
         tools: list[OpenAIResponseInputTool] | None = None,
         max_infer_iters: int | None = 10,
     ):
-        stream = False if stream is None else stream
+        stream = bool(stream)
         text = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")) if text is None else text

+        stream_gen = self._create_streaming_response(
+            input=input,
+            model=model,
+            instructions=instructions,
+            previous_response_id=previous_response_id,
+            store=store,
+            temperature=temperature,
+            text=text,
+            tools=tools,
+            max_infer_iters=max_infer_iters,
+        )
+
+        if stream:
+            return stream_gen
+        else:
+            response = None
+            async for stream_chunk in stream_gen:
+                if stream_chunk.type == "response.completed":
+                    if response is not None:
+                        raise ValueError("The response stream completed multiple times! Earlier response: {response}")
+                    response = stream_chunk.response
+                # don't leave the generator half complete!
+
+            if response is None:
+                raise ValueError("The response stream never completed")
+            return response
+
+    async def _create_streaming_response(
+        self,
+        input: str | list[OpenAIResponseInput],
+        model: str,
+        instructions: str | None = None,
+        previous_response_id: str | None = None,
+        store: bool | None = True,
+        temperature: float | None = None,
+        text: OpenAIResponseText | None = None,
+        tools: list[OpenAIResponseInputTool] | None = None,
+        max_infer_iters: int | None = 10,
+    ) -> AsyncIterator[OpenAIResponseObjectStream]:
         output_messages: list[OpenAIResponseOutput] = []

         # Input preprocessing
@@ -383,7 +378,7 @@ class OpenAIResponsesImpl:
         # Structured outputs
         response_format = await _convert_response_text_to_chat_response_format(text)

-        # Tool setup
+        # Tool setup, TODO: refactor this slightly since this can also yield events
         chat_tools, mcp_tool_to_server, mcp_list_message = (
             await self._convert_response_tools_to_chat_tools(tools) if tools else (None, {}, None)
         )
@@ -395,136 +390,10 @@ class OpenAIResponsesImpl:
             messages=messages,
             tools=chat_tools,
             mcp_tool_to_server=mcp_tool_to_server,
-            stream=stream,
             temperature=temperature,
             response_format=response_format,
         )

-        # Fork to streaming vs non-streaming - let each handle ALL inference rounds
-        if stream:
-            return self._create_streaming_response(
-                ctx=ctx,
-                output_messages=output_messages,
-                input=input,
-                model=model,
-                store=store,
-                text=text,
-                tools=tools,
-                max_infer_iters=max_infer_iters,
-            )
-        else:
-            return await self._create_non_streaming_response(
-                ctx=ctx,
-                output_messages=output_messages,
-                input=input,
-                model=model,
-                store=store,
-                text=text,
-                tools=tools,
-                max_infer_iters=max_infer_iters,
-            )
-
-    async def _create_non_streaming_response(
-        self,
-        ctx: ChatCompletionContext,
-        output_messages: list[OpenAIResponseOutput],
-        input: str | list[OpenAIResponseInput],
-        model: str,
-        store: bool | None,
-        text: OpenAIResponseText,
-        tools: list[OpenAIResponseInputTool] | None,
-        max_infer_iters: int,
-    ) -> OpenAIResponseObject:
-        n_iter = 0
-        messages = ctx.messages.copy()
-
-        while True:
-            # Do inference (including the first one)
-            inference_result = await self.inference_api.openai_chat_completion(
-                model=ctx.model,
-                messages=messages,
-                tools=ctx.tools,
-                stream=False,
-                temperature=ctx.temperature,
-                response_format=ctx.response_format,
-            )
-            completion = OpenAIChatCompletion(**inference_result.model_dump())
-
-            # Separate function vs non-function tool calls
-            function_tool_calls = []
-            non_function_tool_calls = []
-
-            for choice in completion.choices:
-                if choice.message.tool_calls and tools:
-                    for tool_call in choice.message.tool_calls:
-                        if self._is_function_tool_call(tool_call, tools):
-                            function_tool_calls.append(tool_call)
-                        else:
-                            non_function_tool_calls.append(tool_call)
-
-            # Process response choices based on tool call types
-            if function_tool_calls:
-                # For function tool calls, use existing logic and return immediately
-                current_output_messages = await self._process_response_choices(
-                    chat_response=completion,
-                    ctx=ctx,
-                    tools=tools,
-                )
-                output_messages.extend(current_output_messages)
-                break
-            elif non_function_tool_calls:
-                # For non-function tool calls, execute them and continue loop
-                for choice in completion.choices:
-                    tool_outputs, tool_response_messages = await self._execute_tool_calls_only(choice, ctx)
-                    output_messages.extend(tool_outputs)
-
-                    # Add assistant message and tool responses to messages for next iteration
-                    messages.append(choice.message)
-                    messages.extend(tool_response_messages)
-
-                n_iter += 1
-                if n_iter >= max_infer_iters:
-                    break
-
-                # Continue with next iteration of the loop
-                continue
-            else:
-                # No tool calls - convert response to message and we're done
-                for choice in completion.choices:
-                    output_messages.append(await _convert_chat_choice_to_response_message(choice))
-                break
-
-        response = OpenAIResponseObject(
-            created_at=completion.created,
-            id=f"resp-{uuid.uuid4()}",
-            model=model,
-            object="response",
-            status="completed",
-            output=output_messages,
-            text=text,
-        )
-        logger.debug(f"OpenAI Responses response: {response}")
-
-        # Store response if requested
-        if store:
-            await self._store_response(
-                response=response,
-                input=input,
-            )
-
-        return response
-
-    async def _create_streaming_response(
-        self,
-        ctx: ChatCompletionContext,
-        output_messages: list[OpenAIResponseOutput],
-        input: str | list[OpenAIResponseInput],
-        model: str,
-        store: bool | None,
-        text: OpenAIResponseText,
-        tools: list[OpenAIResponseInputTool] | None,
-        max_infer_iters: int | None,
-    ) -> AsyncIterator[OpenAIResponseObjectStream]:
         # Create initial response and emit response.created immediately
         response_id = f"resp-{uuid.uuid4()}"
         created_at = int(time.time())
@@ -539,15 +408,13 @@ class OpenAIResponsesImpl:
             text=text,
         )

-        # Emit response.created immediately
         yield OpenAIResponseObjectStreamResponseCreated(response=initial_response)

-        # Implement tool execution loop for streaming - handle ALL inference rounds including the first
         n_iter = 0
         messages = ctx.messages.copy()

         while True:
-            current_inference_result = await self.inference_api.openai_chat_completion(
+            completion_result = await self.inference_api.openai_chat_completion(
                 model=ctx.model,
                 messages=messages,
                 tools=ctx.tools,
@@ -568,7 +435,7 @@ class OpenAIResponsesImpl:
             # Create a placeholder message item for delta events
             message_item_id = f"msg_{uuid.uuid4()}"

-            async for chunk in current_inference_result:
+            async for chunk in completion_result:
                 chat_response_id = chunk.id
                 chunk_created = chunk.created
                 chunk_model = chunk.model
@@ -628,50 +495,55 @@ class OpenAIResponsesImpl:
                 model=chunk_model,
             )

-            # Separate function vs non-function tool calls
             function_tool_calls = []
             non_function_tool_calls = []
+            next_turn_messages = messages.copy()

             for choice in current_response.choices:
+                next_turn_messages.append(choice.message)
+
                 if choice.message.tool_calls and tools:
                     for tool_call in choice.message.tool_calls:
-                        if self._is_function_tool_call(tool_call, tools):
+                        if _is_function_tool_call(tool_call, tools):
                             function_tool_calls.append(tool_call)
                         else:
                             non_function_tool_calls.append(tool_call)
-
-            # Process response choices based on tool call types
-            if function_tool_calls:
-                # For function tool calls, use existing logic and break
-                current_output_messages = await self._process_response_choices(
-                    chat_response=current_response,
-                    ctx=ctx,
-                    tools=tools,
-                )
-                output_messages.extend(current_output_messages)
-                break
-            elif non_function_tool_calls:
-                # For non-function tool calls, execute them and continue loop
-                for choice in current_response.choices:
-                    tool_outputs, tool_response_messages = await self._execute_tool_calls_only(choice, ctx)
-                    output_messages.extend(tool_outputs)
-
-                    # Add assistant message and tool responses to messages for next iteration
-                    messages.append(choice.message)
-                    messages.extend(tool_response_messages)
-
-                n_iter += 1
-                if n_iter >= (max_infer_iters or 10):
-                    break
-
-                # Continue with next iteration of the loop
-                continue
-            else:
-                # No tool calls - convert response to message and we're done
-                for choice in current_response.choices:
+                else:
                     output_messages.append(await _convert_chat_choice_to_response_message(choice))
+
+            # execute non-function tool calls
+            for tool_call in non_function_tool_calls:
+                tool_call_log, tool_response_message = await self._execute_tool_call(tool_call, ctx)
+                if tool_call_log:
+                    output_messages.append(tool_call_log)
+                if tool_response_message:
+                    next_turn_messages.append(tool_response_message)
+
+            for tool_call in function_tool_calls:
+                output_messages.append(
+                    OpenAIResponseOutputMessageFunctionToolCall(
+                        arguments=tool_call.function.arguments or "",
+                        call_id=tool_call.id,
+                        name=tool_call.function.name or "",
+                        id=f"fc_{uuid.uuid4()}",
+                        status="completed",
+                    )
+                )
+
+            if not function_tool_calls and not non_function_tool_calls:
                 break

+            if function_tool_calls:
+                logger.info("Exiting inference loop since there is a function (client-side) tool call")
+                break
+
+            n_iter += 1
+            if n_iter >= max_infer_iters:
+                logger.info(f"Exiting inference loop since iteration count({n_iter}) exceeds {max_infer_iters=}")
+                break
+
+            messages = next_turn_messages
+
         # Create final response
         final_response = OpenAIResponseObject(
             created_at=created_at,
@@ -683,15 +555,15 @@ class OpenAIResponsesImpl:
             output=output_messages,
         )

+        # Emit response.completed
+        yield OpenAIResponseObjectStreamResponseCompleted(response=final_response)
+
         if store:
             await self._store_response(
                 response=final_response,
                 input=input,
             )

-        # Emit response.completed
-        yield OpenAIResponseObjectStreamResponseCompleted(response=final_response)
-
     async def _convert_response_tools_to_chat_tools(
         self, tools: list[OpenAIResponseInputTool]
     ) -> tuple[
@@ -784,73 +656,6 @@ class OpenAIResponsesImpl:
                 raise ValueError(f"Llama Stack OpenAI Responses does not yet support tool type: {input_tool.type}")
         return chat_tools, mcp_tool_to_server, mcp_list_message

-    async def _execute_tool_calls_only(
-        self,
-        choice: OpenAIChoice,
-        ctx: ChatCompletionContext,
-    ) -> tuple[list[OpenAIResponseOutput], list[OpenAIMessageParam]]:
-        """Execute tool calls and return output messages and tool response messages for next inference."""
-        output_messages: list[OpenAIResponseOutput] = []
-        tool_response_messages: list[OpenAIMessageParam] = []
-
-        if not isinstance(choice.message, OpenAIAssistantMessageParam):
-            return output_messages, tool_response_messages
-
-        if not choice.message.tool_calls:
-            return output_messages, tool_response_messages
-
-        for tool_call in choice.message.tool_calls:
-            tool_call_log, further_input = await self._execute_tool_call(tool_call, ctx)
-            if tool_call_log:
-                output_messages.append(tool_call_log)
-            if further_input:
-                tool_response_messages.append(further_input)
-
-        return output_messages, tool_response_messages
-
-    async def _execute_tool_and_return_final_output(
-        self,
-        choice: OpenAIChoice,
-        ctx: ChatCompletionContext,
-    ) -> list[OpenAIResponseOutput]:
-        output_messages: list[OpenAIResponseOutput] = []
-
-        if not isinstance(choice.message, OpenAIAssistantMessageParam):
-            return output_messages
-
-        if not choice.message.tool_calls:
-            return output_messages
-
-        next_turn_messages = ctx.messages.copy()
-
-        # Add the assistant message with tool_calls response to the messages list
-        next_turn_messages.append(choice.message)
-
-        for tool_call in choice.message.tool_calls:
-            # TODO: telemetry spans for tool calls
-            tool_call_log, further_input = await self._execute_tool_call(tool_call, ctx)
-            if tool_call_log:
-                output_messages.append(tool_call_log)
-            if further_input:
-                next_turn_messages.append(further_input)
-
-        tool_results_chat_response = await self.inference_api.openai_chat_completion(
-            model=ctx.model,
-            messages=next_turn_messages,
-            stream=ctx.stream,
-            temperature=ctx.temperature,
-        )
-        # type cast to appease mypy: this is needed because we don't handle streaming properly :)
-        tool_results_chat_response = cast(OpenAIChatCompletion, tool_results_chat_response)
-
-        # Huge TODO: these are NOT the final outputs, we must keep the loop going
-        tool_final_outputs = [
-            await _convert_chat_choice_to_response_message(choice) for choice in tool_results_chat_response.choices
-        ]
-        # TODO: Wire in annotations with URLs, titles, etc to these output messages
-        output_messages.extend(tool_final_outputs)
-        return output_messages
-
     async def _execute_tool_call(
         self,
         tool_call: OpenAIChatCompletionToolCall,
@@ -939,3 +744,15 @@ class OpenAIResponsesImpl:
             input_message = OpenAIToolMessageParam(content=text, tool_call_id=tool_call_id)

         return message, input_message
+
+
+def _is_function_tool_call(
+    tool_call: OpenAIChatCompletionToolCall,
+    tools: list[OpenAIResponseInputTool],
+) -> bool:
+    if not tool_call.function:
+        return False
+    for t in tools:
+        if t.type == "function" and t.name == tool_call.function.name:
+            return True
+    return False
diff --git a/llama_stack/providers/registry/agents.py b/llama_stack/providers/registry/agents.py
index e0801a8d1..e47f84c65 100644
--- a/llama_stack/providers/registry/agents.py
+++ b/llama_stack/providers/registry/agents.py
@@ -24,7 +24,7 @@ def available_providers() -> list[ProviderSpec]:
                 "pandas",
                 "scikit-learn",
             ]
-            + kvstore_dependencies(),
+            + kvstore_dependencies(),  # TODO make this dynamic based on the kvstore config
             module="llama_stack.providers.inline.agents.meta_reference",
             config_class="llama_stack.providers.inline.agents.meta_reference.MetaReferenceAgentsImplConfig",
             api_dependencies=[
diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py
index 7415f0eb0..358a29d4c 100644
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@@ -345,21 +345,27 @@ class OllamaInferenceAdapter(
             model = await self.register_helper.register_model(model)
         except ValueError:
             pass  # Ignore statically unknown model, will check live listing
+
+        if model.provider_resource_id is None:
+            raise ValueError("Model provider_resource_id cannot be None")
+
         if model.model_type == ModelType.embedding:
             logger.info(f"Pulling embedding model `{model.provider_resource_id}` if necessary...")
-            await self.client.pull(model.provider_resource_id)
+            # TODO: you should pull here only if the model is not found in a list
+            response = await self.client.list()
+            if model.provider_resource_id not in [m.model for m in response.models]:
+                await self.client.pull(model.provider_resource_id)
+
         # we use list() here instead of ps() -
         #  - ps() only lists running models, not available models
         #  - models not currently running are run by the ollama server as needed
         response = await self.client.list()
-        available_models = [m["model"] for m in response["models"]]
-        if model.provider_resource_id is None:
-            raise ValueError("Model provider_resource_id cannot be None")
+        available_models = [m.model for m in response.models]
         provider_resource_id = self.register_helper.get_provider_model_id(model.provider_resource_id)
         if provider_resource_id is None:
             provider_resource_id = model.provider_resource_id
         if provider_resource_id not in available_models:
-            available_models_latest = [m["model"].split(":latest")[0] for m in response["models"]]
+            available_models_latest = [m.model.split(":latest")[0] for m in response.models]
             if provider_resource_id in available_models_latest:
                 logger.warning(
                     f"Imprecise provider resource id was used but 'latest' is available in Ollama - using '{model.provider_resource_id}:latest'"
diff --git a/llama_stack/providers/utils/kvstore/config.py b/llama_stack/providers/utils/kvstore/config.py
index 18a6eb8dd..e966e13ba 100644
--- a/llama_stack/providers/utils/kvstore/config.py
+++ b/llama_stack/providers/utils/kvstore/config.py
@@ -36,6 +36,10 @@ class RedisKVStoreConfig(CommonConfig):
     def url(self) -> str:
         return f"redis://{self.host}:{self.port}"

+    @property
+    def pip_packages(self) -> list[str]:
+        return ["redis"]
+
     @classmethod
     def sample_run_config(cls):
         return {
@@ -53,6 +57,10 @@ class SqliteKVStoreConfig(CommonConfig):
         description="File path for the sqlite database",
     )

+    @property
+    def pip_packages(self) -> list[str]:
+        return ["aiosqlite"]
+
     @classmethod
     def sample_run_config(cls, __distro_dir__: str, db_name: str = "kvstore.db"):
         return {
@@ -100,6 +108,10 @@ class PostgresKVStoreConfig(CommonConfig):
             raise ValueError("Table name must be less than 63 characters")
         return v

+    @property
+    def pip_packages(self) -> list[str]:
+        return ["psycopg2-binary"]
+

 class MongoDBKVStoreConfig(CommonConfig):
     type: Literal[KVStoreType.mongodb.value] = KVStoreType.mongodb.value
@@ -110,6 +122,10 @@ class MongoDBKVStoreConfig(CommonConfig):
     password: str | None = None
     collection_name: str = "llamastack_kvstore"

+    @property
+    def pip_packages(self) -> list[str]:
+        return ["pymongo"]
+
     @classmethod
     def sample_run_config(cls, collection_name: str = "llamastack_kvstore"):
         return {
diff --git a/llama_stack/providers/utils/kvstore/kvstore.py b/llama_stack/providers/utils/kvstore/kvstore.py
index 3a1ee8a26..426523d8e 100644
--- a/llama_stack/providers/utils/kvstore/kvstore.py
+++ b/llama_stack/providers/utils/kvstore/kvstore.py
@@ -10,6 +10,13 @@ from .config import KVStoreConfig, KVStoreType


 def kvstore_dependencies():
+    """
+    Returns all possible kvstore dependencies for registry/provider specifications.
+
+    NOTE: For specific kvstore implementations, use config.pip_packages instead.
+    This function returns the union of all dependencies for cases where the specific
+    kvstore type is not known at declaration time (e.g., provider registries).
+    """
     return ["aiosqlite", "psycopg2-binary", "redis", "pymongo"]

diff --git a/llama_stack/templates/postgres-demo/build.yaml b/llama_stack/templates/postgres-demo/build.yaml
index a7dee0787..6416cd00f 100644
--- a/llama_stack/templates/postgres-demo/build.yaml
+++ b/llama_stack/templates/postgres-demo/build.yaml
@@ -21,4 +21,5 @@ distribution_spec:
 image_type: conda
 additional_pip_packages:
 - asyncpg
+- psycopg2-binary
 - sqlalchemy[asyncio]
diff --git a/llama_stack/templates/template.py b/llama_stack/templates/template.py
index b1ffba5be..712d2dcb4 100644
--- a/llama_stack/templates/template.py
+++ b/llama_stack/templates/template.py
@@ -186,8 +186,14 @@ class DistributionTemplate(BaseModel):
         additional_pip_packages: list[str] = []
         for run_config in self.run_configs.values():
             run_config_ = run_config.run_config(self.name, self.providers, self.container_image)
+
+            # TODO: This is a hack to get the dependencies for internal APIs into build
+            # We should have a better way to do this by formalizing the concept of "internal" APIs
+            # and providers, with a way to specify dependencies for them.
             if run_config_.inference_store:
                 additional_pip_packages.extend(run_config_.inference_store.pip_packages)
+            if run_config_.metadata_store:
+                additional_pip_packages.extend(run_config_.metadata_store.pip_packages)

         if self.additional_pip_packages:
             additional_pip_packages.extend(self.additional_pip_packages)
diff --git a/llama_stack/ui/package.json b/llama_stack/ui/package.json
index c612a8078..e6c49f182 100644
--- a/llama_stack/ui/package.json
+++ b/llama_stack/ui/package.json
@@ -19,7 +19,7 @@
     "@radix-ui/react-tooltip": "^1.2.6",
     "class-variance-authority": "^0.7.1",
     "clsx": "^2.1.1",
-    "llama-stack-client": "0.2.9",
+    "llama-stack-client": "0.2.10",
     "lucide-react": "^0.510.0",
     "next": "15.3.2",
     "next-themes": "^0.4.6",
diff --git a/pyproject.toml b/pyproject.toml
index f709b9c97..bf40465fb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "llama_stack"
-version = "0.2.9"
+version = "0.2.10"
 authors = [{ name = "Meta Llama", email = "llama-oss@meta.com" }]
 description = "Llama Stack"
 readme = "README.md"
@@ -22,12 +22,13 @@ classifiers = [
 ]
 dependencies = [
     "aiohttp",
+    "fastapi>=0.115.0,<1.0",
     "fire",
     "httpx",
     "huggingface-hub",
     "jinja2>=3.1.6",
     "jsonschema",
-    "llama-stack-client>=0.2.9",
+    "llama-stack-client>=0.2.10",
     "openai>=1.66",
     "prompt-toolkit",
     "python-dotenv",
@@ -48,7 +49,7 @@ dependencies = [
 ui = [
     "streamlit",
     "pandas",
-    "llama-stack-client>=0.2.9",
+    "llama-stack-client>=0.2.10",
     "streamlit-option-menu",
 ]

@@ -67,7 +68,6 @@ dev = [
     "types-setuptools",
     "pre-commit",
     "uvicorn",
-    "fastapi",
     "ruamel.yaml", # needed for openapi generator
 ]
 # These are the dependencies required for running unit tests.
@@ -133,7 +133,8 @@ llama = "llama_stack.cli.llama:main"
 install-wheel-from-presigned = "llama_stack.cli.scripts.run:install_wheel_from_presigned"

 [tool.setuptools.packages.find]
-include = ["llama_stack"]
+where = ["."]
+include = ["llama_stack", "llama_stack.*"]

 [[tool.uv.index]]
 name = "pytorch-cpu"
diff --git a/requirements.txt b/requirements.txt
index 2934ebd8e..cfd63b456 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -42,6 +42,8 @@ ecdsa==0.19.1
     # via python-jose
 exceptiongroup==1.2.2 ; python_full_version < '3.11'
     # via anyio
+fastapi==0.115.8
+    # via llama-stack
 filelock==3.17.0
     # via huggingface-hub
 fire==0.7.0
@@ -79,7 +81,7 @@ jsonschema==4.23.0
     # via llama-stack
 jsonschema-specifications==2024.10.1
     # via jsonschema
-llama-stack-client==0.2.9
+llama-stack-client==0.2.10
     # via llama-stack
 markdown-it-py==3.0.0
     # via rich
@@ -117,6 +119,7 @@ pyasn1==0.4.8
     #   rsa
 pydantic==2.10.6
     # via
+    #   fastapi
     #   llama-stack
     #   llama-stack-client
     #   openai
@@ -171,7 +174,9 @@ sniffio==1.3.1
     #   llama-stack-client
     #   openai
 starlette==0.45.3
-    # via llama-stack
+    # via
+    #   fastapi
+    #   llama-stack
 termcolor==2.5.0
     # via
     #   fire
@@ -187,6 +192,7 @@ tqdm==4.67.1
 typing-extensions==4.12.2
     # via
     #   anyio
+    #   fastapi
     #   huggingface-hub
     #   llama-stack-client
     #   multidict
diff --git a/scripts/distro_codegen.py b/scripts/distro_codegen.py
index d33c5de67..b59cd3481 100755
--- a/scripts/distro_codegen.py
+++ b/scripts/distro_codegen.py
@@ -15,11 +15,6 @@ from pathlib import Path

 from rich.progress import Progress, SpinnerColumn, TextColumn

-from llama_stack.distribution.build import (
-    SERVER_DEPENDENCIES,
-    get_provider_dependencies,
-)
-
 REPO_ROOT = Path(__file__).parent.parent


@@ -90,23 +85,6 @@ def check_for_changes(change_tracker: ChangedPathTracker) -> bool:
     return has_changes


-def collect_template_dependencies(template_dir: Path) -> tuple[str | None, list[str]]:
-    try:
-        module_name = f"llama_stack.templates.{template_dir.name}"
-        module = importlib.import_module(module_name)
-
-        if template_func := getattr(module, "get_distribution_template", None):
-            template = template_func()
-            normal_deps, special_deps = get_provider_dependencies(template)
-            # Combine all dependencies in order: normal deps, special deps, server deps
-            all_deps = sorted(set(normal_deps + SERVER_DEPENDENCIES)) + sorted(set(special_deps))
-
-            return template.name, all_deps
-    except Exception:
-        return None, []
-    return None, []
-
-
 def pre_import_templates(template_dirs: list[Path]) -> None:
     # Pre-import all template modules to avoid deadlocks.
     for template_dir in template_dirs:
diff --git a/tests/Containerfile b/tests/Containerfile
new file mode 100644
index 000000000..3080d053a
--- /dev/null
+++ b/tests/Containerfile
@@ -0,0 +1,13 @@
+# Containerfile used to build our all in one ollama image to run tests in CI
+# podman build --platform linux/amd64 -f Containerfile -t ollama-with-models .
+#
+FROM --platform=linux/amd64 ollama/ollama:latest
+
+# Start ollama and pull models in a single layer
+RUN ollama serve & \
+    sleep 5 && \
+    ollama pull llama3.2:3b-instruct-fp16 && \
+    ollama pull all-minilm:latest
+
+# Set the entrypoint to start ollama serve
+ENTRYPOINT ["ollama", "serve"]
diff --git a/tests/integration/files/test_files.py b/tests/integration/files/test_files.py
index 2275e7eb9..8375507dc 100644
--- a/tests/integration/files/test_files.py
+++ b/tests/integration/files/test_files.py
@@ -6,9 +6,15 @@

 from io import BytesIO

+import pytest

-def test_openai_client_basic_operations(openai_client):
+from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
+
+
+def test_openai_client_basic_operations(openai_client, client_with_models):
     """Test basic file operations through OpenAI client."""
+    if isinstance(client_with_models, LlamaStackAsLibraryClient):
+        pytest.skip("OpenAI files are not supported when testing with library client yet.")
     client = openai_client

     test_content = b"files test content"
diff --git a/tests/unit/providers/agents/meta_reference/test_openai_responses.py b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
index e524cc7d0..34f22c39f 100644
--- a/tests/unit/providers/agents/meta_reference/test_openai_responses.py
+++ b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
@@ -80,6 +80,37 @@ def openai_responses_impl(mock_inference_api, mock_tool_groups_api, mock_tool_ru
     )


+async def fake_stream(fixture: str = "simple_chat_completion.yaml"):
+    value = load_chat_completion_fixture(fixture)
+    yield ChatCompletionChunk(
+        id=value.id,
+        choices=[
+            Choice(
+                index=0,
+                delta=ChoiceDelta(
+                    content=c.message.content,
+                    role=c.message.role,
+                    tool_calls=[
+                        ChoiceDeltaToolCall(
+                            index=0,
+                            id=t.id,
+                            function=ChoiceDeltaToolCallFunction(
+                                name=t.function.name,
+                                arguments=t.function.arguments,
+                            ),
+                        )
+                        for t in (c.message.tool_calls or [])
+                    ],
+                ),
+            )
+            for c in value.choices
+        ],
+        created=1,
+        model=value.model,
+        object="chat.completion.chunk",
+    )
+
+
 @pytest.mark.asyncio
 async def test_create_openai_response_with_string_input(openai_responses_impl, mock_inference_api):
     """Test creating an OpenAI response with a simple string input."""
@@ -88,8 +119,7 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m
     model = "meta-llama/Llama-3.1-8B-Instruct"

     # Load the chat completion fixture
-    mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
-    mock_inference_api.openai_chat_completion.return_value = mock_chat_completion
+    mock_inference_api.openai_chat_completion.return_value = fake_stream()

     # Execute
     result = await openai_responses_impl.create_openai_response(
@@ -104,7 +134,7 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m
         messages=[OpenAIUserMessageParam(role="user", content="What is the capital of Ireland?", name=None)],
         response_format=OpenAIResponseFormatText(),
         tools=None,
-        stream=False,
+        stream=True,
         temperature=0.1,
     )
     openai_responses_impl.responses_store.store_response_object.assert_called_once()
@@ -121,20 +151,15 @@ async def test_create_openai_response_with_string_input_with_tools(openai_respon
     input_text = "What is the capital of Ireland?"
     model = "meta-llama/Llama-3.1-8B-Instruct"

-    # Load the chat completion fixtures
-    tool_call_completion = load_chat_completion_fixture("tool_call_completion.yaml")
-    tool_response_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
-
     mock_inference_api.openai_chat_completion.side_effect = [
-        tool_call_completion,
-        tool_response_completion,
+        fake_stream("tool_call_completion.yaml"),
+        fake_stream(),
     ]

     openai_responses_impl.tool_groups_api.get_tool.return_value = Tool(
         identifier="web_search",
         provider_id="client",
         toolgroup_id="web_search",
-        tool_host="client",
         description="Search the web for information",
         parameters=[
             ToolParameter(name="query", parameter_type="string", description="The query to search for", required=True)
@@ -189,7 +214,7 @@ async def test_create_openai_response_with_tool_call_type_none(openai_responses_
     input_text = "How hot it is in San Francisco today?"
     model = "meta-llama/Llama-3.1-8B-Instruct"

-    async def fake_stream():
+    async def fake_stream_toolcall():
         yield ChatCompletionChunk(
             id="123",
             choices=[
@@ -212,7 +237,7 @@ async def test_create_openai_response_with_tool_call_type_none(openai_responses_
             object="chat.completion.chunk",
         )

-    mock_inference_api.openai_chat_completion.return_value = fake_stream()
+    mock_inference_api.openai_chat_completion.return_value = fake_stream_toolcall()

     # Execute
     result = await openai_responses_impl.create_openai_response(
@@ -271,7 +296,7 @@ async def test_create_openai_response_with_multiple_messages(openai_responses_im
     ]
     model = "meta-llama/Llama-3.1-8B-Instruct"

-    mock_inference_api.openai_chat_completion.return_value = load_chat_completion_fixture("simple_chat_completion.yaml")
+    mock_inference_api.openai_chat_completion.return_value = fake_stream()

     # Execute
     await openai_responses_impl.create_openai_response(
@@ -399,9 +424,7 @@ async def test_create_openai_response_with_instructions(openai_responses_impl, m
     model = "meta-llama/Llama-3.1-8B-Instruct"
     instructions = "You are a geography expert. Provide concise answers."

-    # Load the chat completion fixture
-    mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
-    mock_inference_api.openai_chat_completion.return_value = mock_chat_completion
+    mock_inference_api.openai_chat_completion.return_value = fake_stream()

     # Execute
     await openai_responses_impl.create_openai_response(
@@ -440,8 +463,7 @@ async def test_create_openai_response_with_instructions_and_multiple_messages(
     model = "meta-llama/Llama-3.1-8B-Instruct"
     instructions = "You are a geography expert. Provide concise answers."

-    mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
-    mock_inference_api.openai_chat_completion.return_value = mock_chat_completion
+    mock_inference_api.openai_chat_completion.return_value = fake_stream()

     # Execute
     await openai_responses_impl.create_openai_response(
@@ -499,8 +521,8 @@ async def test_create_openai_response_with_instructions_and_previous_response(
     model = "meta-llama/Llama-3.1-8B-Instruct"
     instructions = "You are a geography expert. Provide concise answers."

-    mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
-    mock_inference_api.openai_chat_completion.return_value = mock_chat_completion
+
+    mock_inference_api.openai_chat_completion.return_value = fake_stream()

     # Execute
     await openai_responses_impl.create_openai_response(
@@ -674,8 +696,8 @@ async def test_store_response_uses_rehydrated_input_with_previous_response(
     current_input = "Now what is 3+3?"
     model = "meta-llama/Llama-3.1-8B-Instruct"

-    mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
-    mock_inference_api.openai_chat_completion.return_value = mock_chat_completion
+
+    mock_inference_api.openai_chat_completion.return_value = fake_stream()

     # Execute - Create response with previous_response_id
     result = await openai_responses_impl.create_openai_response(
@@ -732,9 +754,7 @@ async def test_create_openai_response_with_text_format(
     input_text = "How hot it is in San Francisco today?"
     model = "meta-llama/Llama-3.1-8B-Instruct"

-    # Load the chat completion fixture
-    mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
-    mock_inference_api.openai_chat_completion.return_value = mock_chat_completion
+    mock_inference_api.openai_chat_completion.return_value = fake_stream()

     # Execute
     _result = await openai_responses_impl.create_openai_response(