diff --git a/zero_to_hero_guide/00_Inference101.ipynb b/zero_to_hero_guide/00_Inference101.ipynb
index 4da0d0df1..eb2816feb 100644
--- a/zero_to_hero_guide/00_Inference101.ipynb
+++ b/zero_to_hero_guide/00_Inference101.ipynb
@@ -85,7 +85,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "id": "77c29dba",
    "metadata": {},
    "outputs": [
@@ -104,7 +104,8 @@
     "        {\"role\": \"system\", \"content\": \"You are a friendly assistant.\"},\n",
     "        {\"role\": \"user\", \"content\": \"Write a two-sentence poem about llama.\"}\n",
     "    ],\n",
-    "    model='Llama3.2-11B-Vision-Instruct',\n",
+    "    model_id='Llama3.2-11B-Vision-Instruct',\n",
+    "    stream=True\n",
     ")\n",
     "\n",
     "print(response.completion_message.content)"
@@ -124,7 +125,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "id": "5c6812da",
    "metadata": {},
    "outputs": [
@@ -143,7 +144,8 @@
     "        {\"role\": \"system\", \"content\": \"You are shakespeare.\"},\n",
     "        {\"role\": \"user\", \"content\": \"Write a two-sentence poem about llama.\"}\n",
     "    ],\n",
-    "    model='Llama3.2-11B-Vision-Instruct',\n",
+    "    model_id='Llama3.2-11B-Vision-Instruct',\n",
+    "    stream=True\n",
     ")\n",
     "\n",
     "print(response.completion_message.content)"
@@ -226,7 +228,8 @@
     "        message = {\"role\": \"user\", \"content\": user_input}\n",
     "        response = client.inference.chat_completion(\n",
     "            messages=[message],\n",
-    "            model='Llama3.2-11B-Vision-Instruct',\n",
+    "            model_id='Llama3.2-11B-Vision-Instruct',\n",
+    "            stream=True\n",
     "        )\n",
     "        cprint(f'> Response: {response.completion_message.content}', 'cyan')\n",
     "\n",
@@ -274,7 +277,8 @@
     "\n",
     "        response = client.inference.chat_completion(\n",
     "            messages=conversation_history,\n",
-    "            model='Llama3.2-11B-Vision-Instruct',\n",
+    "            model_id='Llama3.2-11B-Vision-Instruct',\n",
+    "            stream=True\n",
     "        )\n",
     "        cprint(f'> Response: {response.completion_message.content}', 'cyan')\n",
     "\n",
@@ -299,7 +303,7 @@
    "source": [
     "## Streaming Responses\n",
     "\n",
-    "Llama Stack offers a `stream` parameter in the `chat_completion` function, which allows partial responses to be returned progressively as they are generated. This can enhance user experience by providing immediate feedback without waiting for the entire response to be processed."
+    "Llama Stack offers a `stream` parameter in the `chat_completion` function, which allows partial responses to be returned progressively as they are generated. This can enhance user experience by providing immediate feedback without waiting for the entire response to be processed. You can set this `bool` value to `True` or `False` to enable or disable streaming."
    ]
   },
   {
@@ -322,8 +326,8 @@
     "\n",
     "    response = client.inference.chat_completion(\n",
     "        messages=[message],\n",
-    "        model='Llama3.2-11B-Vision-Instruct',\n",
-    "        stream=stream,\n",
+    "        model_id='Llama3.2-11B-Vision-Instruct',\n",
+    "        stream=True\n",
     "    )\n",
     "\n",
     "    if not stream:\n",
@@ -341,7 +345,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "base",
    "language": "python",
    "name": "python3"
   },
@@ -355,7 +359,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.15"
+   "version": "3.12.2"
   }
  },
  "nbformat": 4,
diff --git a/zero_to_hero_guide/quickstart.md b/zero_to_hero_guide/quickstart.md
index 107bf0d12..c18d0ff03 100644
--- a/zero_to_hero_guide/quickstart.md
+++ b/zero_to_hero_guide/quickstart.md
@@ -102,6 +102,13 @@ If you're looking for more specific topics like tool calling or agent setup, we
    llama stack build --template ollama --image-type conda
    ```
 
+After this step, you should see console output similar to the following:
+```
+Build Successful! Next steps:
+   1. Set the environment variables: LLAMASTACK_PORT, OLLAMA_URL, INFERENCE_MODEL, SAFETY_MODEL
+   2. `llama stack run /Users/username/.llama/distributions/llamastack-ollama/ollama-run.yaml`
+```
+
 2. **Edit Configuration**:
    - Modify the `ollama-run.yaml` file located at `/Users/yourusername/.llama/distributions/llamastack-ollama/ollama-run.yaml`:
    - Change the `chromadb` port to `8000`.
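As context for the `stream=True` changes above, here is a minimal sketch of how a streaming `chat_completion` call is typically consumed. It is an illustration, not part of the diff: the `http://localhost:5000` base URL is a placeholder for the HOST/PORT configured earlier in the notebook, and it assumes the `EventLogger` helper from `llama_stack_client.lib.inference.event_logger` (the streaming branch of the cell is outside the hunks shown, and the exact chunk shape may differ across `llama-stack-client` versions).

```python
from llama_stack_client import LlamaStackClient
from llama_stack_client.lib.inference.event_logger import EventLogger

# Placeholder base URL; substitute the HOST/PORT configured earlier in the notebook.
client = LlamaStackClient(base_url="http://localhost:5000")

response = client.inference.chat_completion(
    messages=[{"role": "user", "content": "Write a two-sentence poem about llama."}],
    model_id="Llama3.2-11B-Vision-Instruct",
    stream=True,
)

# With stream=True the call returns an iterator of chunks rather than a single
# completion object, so the text is printed incrementally as it arrives.
for log in EventLogger().log(response):
    log.print()
```

Note that `response.completion_message.content` is only available on the non-streaming return value (`stream=False`); a streamed response has to be iterated as above.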