From b6500974eca169ed053a7b95408ac756c160c004 Mon Sep 17 00:00:00 2001
From: Kai Wu
Date: Tue, 3 Dec 2024 20:11:19 -0800
Subject: [PATCH] removed assertion in ollama.py and fixed typo in the readme
 (#563)

# What does this PR do?

1. Removed an [incorrect assertion](https://github.com/meta-llama/llama-stack/blob/435f34b05e84f1747b28570234f25878cf0b31c4/llama_stack/providers/remote/inference/ollama/ollama.py#L183) in `ollama.py`.
2. Fixed a typo in [this line](https://github.com/meta-llama/llama-stack/blob/435f34b05e84f1747b28570234f25878cf0b31c4/docs/source/distributions/importing_as_library.md?plain=1#L24) of `importing_as_library.md`: `model=` should be `model_id=` (a corrected usage sketch follows the diff below).

- [x] Addresses issue [#562](https://github.com/meta-llama/llama-stack/issues/562)

## Test Plan

Tested with the following code:

```python
import asyncio
import os

# pip install aiosqlite ollama faiss
from llama_stack_client.lib.direct.direct import LlamaStackDirectClient
from llama_stack_client.types import SystemMessage, UserMessage


async def main():
    os.environ["INFERENCE_MODEL"] = "meta-llama/Llama-3.2-1B-Instruct"
    client = await LlamaStackDirectClient.from_template("ollama")
    await client.initialize()

    response = await client.models.list()
    print(response)
    model_name = response[0].identifier

    response = await client.inference.chat_completion(
        messages=[
            SystemMessage(content="You are a friendly assistant.", role="system"),
            UserMessage(
                content="hello world, write me a 2 sentence poem about the moon",
                role="user",
            ),
        ],
        model_id=model_name,
        stream=False,
    )
    print("\nChat completion response:")
    print(response, type(response))


asyncio.run(main())
```

Output:

```
python test.py
Using template ollama with config:
apis:
- agents
- inference
- memory
- safety
- telemetry
conda_env: ollama
datasets: []
docker_image: null
eval_tasks: []
image_name: ollama
memory_banks: []
metadata_store:
  db_path: /Users/kaiwu/.llama/distributions/ollama/registry.db
  namespace: null
  type: sqlite
models:
- metadata: {}
  model_id: meta-llama/Llama-3.2-1B-Instruct
  provider_id: ollama
  provider_model_id: null
providers:
  agents:
  - config:
      persistence_store:
        db_path: /Users/kaiwu/.llama/distributions/ollama/agents_store.db
        namespace: null
        type: sqlite
    provider_id: meta-reference
    provider_type: inline::meta-reference
  inference:
  - config:
      url: http://localhost:11434
    provider_id: ollama
    provider_type: remote::ollama
  memory:
  - config:
      kvstore:
        db_path: /Users/kaiwu/.llama/distributions/ollama/faiss_store.db
        namespace: null
        type: sqlite
    provider_id: faiss
    provider_type: inline::faiss
  safety:
  - config: {}
    provider_id: llama-guard
    provider_type: inline::llama-guard
  telemetry:
  - config: {}
    provider_id: meta-reference
    provider_type: inline::meta-reference
scoring_fns: []
shields: []
version: '2'
[Model(identifier='meta-llama/Llama-3.2-1B-Instruct', provider_resource_id='llama3.2:1b-instruct-fp16', provider_id='ollama', type='model', metadata={})]

Chat completion response:
completion_message=CompletionMessage(role='assistant', content='Here is a short poem about the moon:\n\nThe moon glows bright in the midnight sky,\nA silver crescent shining, catching the eye.', stop_reason=, tool_calls=[]) logprobs=None
```

## Sources

Please link relevant resources if necessary.

## Before submitting

- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Ran pre-commit to handle lint / formatting issues.
- [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section?
- [ ] Updated relevant documentation.
- [ ] Wrote necessary unit or integration tests.
---
 docs/source/distributions/importing_as_library.md       | 2 +-
 llama_stack/providers/remote/inference/ollama/ollama.py | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/docs/source/distributions/importing_as_library.md b/docs/source/distributions/importing_as_library.md
index 815660fd4..7e15062df 100644
--- a/docs/source/distributions/importing_as_library.md
+++ b/docs/source/distributions/importing_as_library.md
@@ -21,7 +21,7 @@ print(response)
 ```python
 response = await client.inference.chat_completion(
     messages=[UserMessage(content="What is the capital of France?", role="user")],
-    model="Llama3.1-8B-Instruct",
+    model_id="Llama3.1-8B-Instruct",
     stream=False,
 )
 print("\nChat completion response:")
diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py
index 74c0b8601..f89629afc 100644
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@@ -180,7 +180,6 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
     async def _nonstream_completion(self, request: CompletionRequest) -> AsyncGenerator:
         params = await self._get_params(request)
         r = await self.client.generate(**params)
-        assert isinstance(r, dict)
         choice = OpenAICompatCompletionChoice(
             finish_reason=r["done_reason"] if r["done"] else None,
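
For reference, a minimal sketch of the corrected documentation snippet (assuming an already-initialized `client`, e.g. one created with `LlamaStackDirectClient.from_template("ollama")` as in the test plan above, and run inside an async function):

```python
from llama_stack_client.types import UserMessage

# With this patch applied, the keyword argument is `model_id=` (not `model=`);
# it takes the model identifier registered with the distribution.
response = await client.inference.chat_completion(
    messages=[UserMessage(content="What is the capital of France?", role="user")],
    model_id="Llama3.1-8B-Instruct",
    stream=False,
)
print("\nChat completion response:")
print(response)
```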