diff --git a/docs/zero_to_hero_guide/00_Inference101.ipynb b/docs/zero_to_hero_guide/00_Inference101.ipynb index b1c33b8dc..596814607 100644 --- a/docs/zero_to_hero_guide/00_Inference101.ipynb +++ b/docs/zero_to_hero_guide/00_Inference101.ipynb @@ -42,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "38a39e44", "metadata": {}, "outputs": [], @@ -52,11 +52,9 @@ ] }, { - "cell_type": "code", - "execution_count": null, - "id": "d1d097ab", + "cell_type": "markdown", + "id": "7dacaa2d-94e9-42e9-82a0-73522dfc7010", "metadata": {}, - "outputs": [], "source": [ "### 1. Set Up the Client\n", "\n", @@ -65,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "7a573752", "metadata": {}, "outputs": [], @@ -73,7 +71,7 @@ "from llama_stack_client import LlamaStackClient\n", "from llama_stack_client.types import SystemMessage, UserMessage\n", "\n", - "client = LlamaStackClient(base_url='http://{HOST}:{PORT}')" + "client = LlamaStackClient(base_url=f'http://{HOST}:{PORT}')" ] }, { @@ -88,10 +86,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "77c29dba", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "A gentle llama roams the land,\n", + "With soft fur and a gentle hand.\n" + ] + } + ], "source": [ "response = client.inference.chat_completion(\n", " messages=[\n", @@ -123,10 +130,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "5c6812da", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "O, fairest llama, with thy softest fleece,\n", + "Thy gentle eyes, like sapphires, in serenity do cease.\n" + ] + } + ], "source": [ "response = client.inference.chat_completion(\n", " messages=[\n", @@ -151,17 +167,48 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": 
"02211625", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "User> Write me a 3 sentence poem about alpaca\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[36m> Response: Softly grazing, gentle soul,\n", + "Alpaca's fleece, a treasure whole,\n", + "In Andean fields, they softly roll.\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "User> exit\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mEnding conversation. Goodbye!\u001b[0m\n" + ] + } + ], "source": [ "import asyncio\n", "from llama_stack_client import LlamaStackClient\n", "from llama_stack_client.types import UserMessage\n", "from termcolor import cprint\n", "\n", - "client = LlamaStackClient(base_url='http://{HOST}:{PORT}')\n", + "client = LlamaStackClient(base_url=f'http://{HOST}:{PORT}')\n", "\n", "async def chat_loop():\n", " while True:\n", @@ -177,7 +224,10 @@ " )\n", " cprint(f'> Response: {response.completion_message.content}', 'cyan')\n", "\n", - "asyncio.run(chat_loop())" + "# Run the chat loop in a Jupyter Notebook cell using `await`\n", + "await chat_loop()\n", + "# To run it in a python file, use this line instead\n", + "# asyncio.run(chat_loop())" ] }, { @@ -192,10 +242,69 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "9496f75c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "User> what is 1+1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[36m> Response: 1 + 1 = 2\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "User> what is llama + alpaca\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[36m> Response: That's a creative and imaginative question. 
However, since llamas and alpacas are animals, not numbers, we can't perform a mathematical operation on them.\n", + "\n", + "But if we were to interpret this as a creative or humorous question, we could say that the result of \"llama + alpaca\" is a fun and fuzzy bundle of South American camelid cuteness!\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "User> what was the first question\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[36m> Response: The first question was \"what is 1+1\"\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "User> exit\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mEnding conversation. Goodbye!\u001b[0m\n" + ] + } + ], "source": [ "async def chat_loop():\n", " conversation_history = []\n", @@ -217,7 +326,10 @@ " assistant_message = UserMessage(content=response.completion_message.content, role='user')\n", " conversation_history.append(assistant_message)\n", "\n", - "asyncio.run(chat_loop())" + "# Use `await` in the Jupyter Notebook cell to call the function\n", + "await chat_loop()\n", + "# To run it in a python file, use this line instead\n", + "# asyncio.run(chat_loop())" ] }, { @@ -234,10 +346,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "d119026e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32mUser> Write me a 3 sentence poem about llama\u001b[0m\n", + "\u001b[36mAssistant> \u001b[0m\u001b[33mSoft\u001b[0m\u001b[33mly\u001b[0m\u001b[33m padded\u001b[0m\u001b[33m feet\u001b[0m\u001b[33m on\u001b[0m\u001b[33m the\u001b[0m\u001b[33m ground\u001b[0m\u001b[33m,\n", + "\u001b[0m\u001b[33mA\u001b[0m\u001b[33m gentle\u001b[0m\u001b[33m llama\u001b[0m\u001b[33m's\u001b[0m\u001b[33m peaceful\u001b[0m\u001b[33m sound\u001b[0m\u001b[33m,\n", + 
"\u001b[0m\u001b[33mF\u001b[0m\u001b[33murry\u001b[0m\u001b[33m coat\u001b[0m\u001b[33m and\u001b[0m\u001b[33m calm\u001b[0m\u001b[33m,\u001b[0m\u001b[33m serene\u001b[0m\u001b[33m eyes\u001b[0m\u001b[33m all\u001b[0m\u001b[33m around\u001b[0m\u001b[33m.\u001b[0m\u001b[97m\u001b[0m\n" + ] + } + ], "source": [ "import asyncio\n", "from llama_stack_client import LlamaStackClient\n", @@ -246,12 +369,12 @@ "from termcolor import cprint\n", "\n", "async def run_main(stream: bool = True):\n", - " client = LlamaStackClient(base_url='http://{HOST}:{PORT}')\n", + " client = LlamaStackClient(base_url=f'http://{HOST}:{PORT}')\n", "\n", " message = UserMessage(\n", - " content='hello world, write me a 2 sentence poem about the moon', role='user'\n", + " content='Write me a 3 sentence poem about llama', role='user'\n", " )\n", - " print(f'User>{message.content}', 'green')\n", + " cprint(f'User> {message.content}', 'green')\n", "\n", " response = client.inference.chat_completion(\n", " messages=[message],\n", @@ -260,22 +383,37 @@ " )\n", "\n", " if not stream:\n", - " cprint(f'> Response: {response}', 'cyan')\n", + " cprint(f'> Response: {response.completion_message.content}', 'cyan')\n", " else:\n", " async for log in EventLogger().log(response):\n", " log.print()\n", - "\n", + " \n", " models_response = client.models.list()\n", "\n", - "if __name__ == '__main__':\n", - " asyncio.run(run_main())" + "# In a Jupyter Notebook cell, use `await` to call the function\n", + "await run_main()\n", + "# To run it in a python file, use this line instead\n", + "# asyncio.run(run_main())" ] } ], "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", 
+ "version": "3.10.15" } }, "nbformat": 4, diff --git a/docs/zero_to_hero_guide/00_Local_Cloud_Inference101.ipynb b/docs/zero_to_hero_guide/00_Local_Cloud_Inference101.ipynb index 84ea7c808..35f41d9d7 100644 --- a/docs/zero_to_hero_guide/00_Local_Cloud_Inference101.ipynb +++ b/docs/zero_to_hero_guide/00_Local_Cloud_Inference101.ipynb @@ -26,13 +26,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "d80c0926", "metadata": {}, "outputs": [], "source": [ "HOST = \"localhost\" # Replace with your host\n", - "PORT = 5000 # Replace with your port" + "LOCAL_PORT = 5000 # Replace with your local distro port\n", + "CLOUD_PORT = 5001 # Replace with your cloud distro port" ] }, { @@ -47,7 +48,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "7f868dfe", "metadata": {}, "outputs": [], @@ -55,8 +56,8 @@ "from llama_stack_client import LlamaStackClient\n", "\n", "# Configure local and cloud clients\n", - "local_client = LlamaStackClient(base_url='http://{HOST}:{LOCAL_PORT}')\n", - "cloud_client = LlamaStackClient(base_url='http://{HOST}:{CLOUD_PORT}')" + "local_client = LlamaStackClient(base_url=f'http://{HOST}:{LOCAL_PORT}')\n", + "cloud_client = LlamaStackClient(base_url=f'http://{HOST}:{CLOUD_PORT}')" ] }, { @@ -71,26 +72,47 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "ff0c8277", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mUsing local client.\u001b[0m\n" + ] + } + ], "source": [ "import httpx\n", "from termcolor import cprint\n", "\n", - "async def select_client() -> LlamaStackClient:\n", - " \"\"\"Use local client if available; otherwise, switch to cloud client.\"\"\"\n", - " try:\n", - " async with httpx.AsyncClient() as http_client:\n", - " response = await http_client.get(f'{local_client.base_url}/health')\n", - " if response.status_code == 200:\n", - " cprint('Using local 
client.', 'yellow')\n", - " return local_client\n", - " except httpx.RequestError:\n", - " pass\n", - " cprint('Local client unavailable. Switching to cloud client.', 'yellow')\n", - " return cloud_client" + "async def select_client(use_local: bool) -> LlamaStackClient:\n", + " \"\"\"\n", + " Selects the client based on the use_local flag.\n", + " \n", + " Parameters:\n", + " - use_local: bool, True to try the local client, False to use the cloud client.\n", + " \n", + " Returns:\n", + " - LlamaStackClient: the selected client instance.\n", + " \"\"\"\n", + " if use_local:\n", + " try:\n", + " async with httpx.AsyncClient() as http_client:\n", + " response = await http_client.get(f'{local_client.base_url}/health')\n", + " if response.status_code == 200:\n", + " cprint('Using local client.', 'yellow')\n", + " return local_client\n", + " except httpx.RequestError:\n", + " cprint('Failed to connect to local client.', 'red')\n", + "\n", + " cprint('Using cloud client.', 'yellow')\n", + " return cloud_client\n", + "\n", + "# Example usage: pass True for local, False for cloud\n", + "client = await select_client(use_local=True)\n" ] }, { @@ -105,15 +127,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "5e19cc20", "metadata": {}, "outputs": [], "source": [ "from llama_stack_client.types import UserMessage\n", + "from termcolor import cprint\n", + "from llama_stack_client.lib.inference.event_logger import EventLogger\n", "\n", - "async def get_llama_response(stream: bool = True):\n", - " client = await select_client() # Selects the available client\n", + "async def get_llama_response(stream: bool = True, use_local: bool = True):\n", + " client = await select_client(use_local) # Selects the available client\n", " message = UserMessage(content='hello world, write me a 2 sentence poem about the moon', role='user')\n", " cprint(f'User> {message.content}', 'green')\n", "\n", @@ -124,11 +148,10 @@ " )\n", "\n", " if not stream:\n", - " 
cprint(f'> Response: {response}', 'cyan')\n", + " cprint(f'> Response: {response.completion_message.content}', 'cyan')\n", " else:\n", - " # Stream tokens progressively\n", " async for log in EventLogger().log(response):\n", - " log.print()" + " log.print()\n" ] }, { @@ -136,82 +159,67 @@ "id": "6edf5e57", "metadata": {}, "source": [ - "#### 4. Run the Asynchronous Response Generation\n", + "#### 4. Run with Cloud Model\n", "\n", "Use `asyncio.run()` to execute `get_llama_response` in an asynchronous event loop.\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "c10f487e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mUsing cloud client.\u001b[0m\n", + "\u001b[32mUser> hello world, write me a 2 sentence poem about the moon\u001b[0m\n", + "\u001b[36mAssistant> \u001b[0m\u001b[33mSilver\u001b[0m\u001b[33m cres\u001b[0m\u001b[33mcent\u001b[0m\u001b[33m in\u001b[0m\u001b[33m the\u001b[0m\u001b[33m midnight\u001b[0m\u001b[33m sky\u001b[0m\u001b[33m,\n", + "\u001b[0m\u001b[33mA\u001b[0m\u001b[33m gentle\u001b[0m\u001b[33m glow\u001b[0m\u001b[33m that\u001b[0m\u001b[33m whispers\u001b[0m\u001b[33m,\u001b[0m\u001b[33m \"\u001b[0m\u001b[33mI\u001b[0m\u001b[33m'm\u001b[0m\u001b[33m passing\u001b[0m\u001b[33m by\u001b[0m\u001b[33m.\"\u001b[0m\u001b[97m\u001b[0m\n" + ] + } + ], "source": [ "import asyncio\n", "\n", - "# Initiate the response generation process\n", - "asyncio.run(get_llama_response())" + "\n", + "# Run this function directly in a Jupyter Notebook cell with `await`\n", + "await get_llama_response(use_local=False)\n", + "# To run it in a python file, use this line instead\n", + "# asyncio.run(get_llama_response(use_local=False))" ] }, { "cell_type": "markdown", - "id": "56aa9a09", + "id": "5c433511-9321-4718-ab7f-e21cf6b5ca79", "metadata": {}, "source": [ - "### Complete code\n", - "Summing it up, here's the complete code for local-cloud model 
implementation with Llama Stack:\n" + "#### 5. Run with Local Model\n" ] }, { "cell_type": "code", - "execution_count": null, - "id": "d9fd74ff", + "execution_count": 8, + "id": "02eacfaf-c7f1-494b-ac28-129d2a0258e3", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mUsing local client.\u001b[0m\n", + "\u001b[32mUser> hello world, write me a 2 sentence poem about the moon\u001b[0m\n", + "\u001b[36mAssistant> \u001b[0m\u001b[33mSilver\u001b[0m\u001b[33m cres\u001b[0m\u001b[33mcent\u001b[0m\u001b[33m in\u001b[0m\u001b[33m the\u001b[0m\u001b[33m midnight\u001b[0m\u001b[33m sky\u001b[0m\u001b[33m,\n", + "\u001b[0m\u001b[33mA\u001b[0m\u001b[33m gentle\u001b[0m\u001b[33m glow\u001b[0m\u001b[33m that\u001b[0m\u001b[33m whispers\u001b[0m\u001b[33m,\u001b[0m\u001b[33m \"\u001b[0m\u001b[33mI\u001b[0m\u001b[33m'm\u001b[0m\u001b[33m passing\u001b[0m\u001b[33m by\u001b[0m\u001b[33m.\"\u001b[0m\u001b[97m\u001b[0m\n" + ] + } + ], "source": [ "import asyncio\n", - "import httpx\n", - "from llama_stack_client import LlamaStackClient\n", - "from llama_stack_client.lib.inference.event_logger import EventLogger\n", - "from llama_stack_client.types import UserMessage\n", - "from termcolor import cprint\n", "\n", - "local_client = LlamaStackClient(base_url='http://{HOST}:{LOCAL_PORT}')\n", - "cloud_client = LlamaStackClient(base_url='http://{HOST}:{CLOUD_PORT}')\n", - "\n", - "async def select_client() -> LlamaStackClient:\n", - " try:\n", - " async with httpx.AsyncClient() as http_client:\n", - " response = await http_client.get(f'{local_client.base_url}/health')\n", - " if response.status_code == 200:\n", - " cprint('Using local client.', 'yellow')\n", - " return local_client\n", - " except httpx.RequestError:\n", - " pass\n", - " cprint('Local client unavailable. 
Switching to cloud client.', 'yellow')\n", - " return cloud_client\n", - "\n", - "async def get_llama_response(stream: bool = True):\n", - " client = await select_client()\n", - " message = UserMessage(\n", - " content='hello world, write me a 2 sentence poem about the moon', role='user'\n", - " )\n", - " cprint(f'User> {message.content}', 'green')\n", - "\n", - " response = client.inference.chat_completion(\n", - " messages=[message],\n", - " model='Llama3.2-11B-Vision-Instruct',\n", - " stream=stream,\n", - " )\n", - "\n", - " if not stream:\n", - " cprint(f'> Response: {response}', 'cyan')\n", - " else:\n", - " async for log in EventLogger().log(response):\n", - " log.print()\n", - "\n", - "asyncio.run(get_llama_response())" + "await get_llama_response(use_local=True)" ] }, { @@ -226,8 +234,22 @@ } ], "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" } }, "nbformat": 4, diff --git a/docs/zero_to_hero_guide/01_Prompt_Engineering101.ipynb b/docs/zero_to_hero_guide/01_Prompt_Engineering101.ipynb index 835a7adb2..0221a7f3a 100644 --- a/docs/zero_to_hero_guide/01_Prompt_Engineering101.ipynb +++ b/docs/zero_to_hero_guide/01_Prompt_Engineering101.ipynb @@ -9,7 +9,7 @@ "\n", "Prompt engineering is using natural language to produce a desired response from a large language model (LLM).\n", "\n", - "This interactive guide covers prompt engineering & best practices with Llama 3.1 and Llama Stack.\n", + "This interactive guide covers prompt engineering & best practices with Llama 3.2 and Llama Stack.\n", "\n", "Before you begin, please ensure Llama Stack is installed and set up by following the [Getting Started 
Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html)." ] @@ -41,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "df35d1e2", "metadata": {}, "outputs": [], @@ -62,14 +62,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "c2a0e359", "metadata": {}, "outputs": [], "source": [ "from llama_stack_client import LlamaStackClient\n", "\n", - "client = LlamaStackClient(base_url='http://{HOST}:{PORT}')" + "client = LlamaStackClient(base_url=f'http://{HOST}:{PORT}')" ] }, { @@ -84,7 +84,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "da140b33", "metadata": {}, "outputs": [], @@ -148,7 +148,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "8b321089", "metadata": {}, "outputs": [], @@ -170,10 +170,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "4ac1ac3e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[36m> Response: That's Llama!\u001b[0m\n" + ] + } + ], "source": [ "from termcolor import cprint\n", "\n", @@ -191,16 +199,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "524189bd", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[36m> Response: That's Llama!\u001b[0m\n" + ] + } + ], "source": [ "from llama_stack_client import LlamaStackClient\n", "from llama_stack_client.types import CompletionMessage, UserMessage\n", "from termcolor import cprint\n", "\n", - "client = LlamaStackClient(base_url='http://{HOST}:{PORT}')\n", + "client = LlamaStackClient(base_url=f'http://{HOST}:{PORT}')\n", "\n", "response = client.inference.chat_completion(\n", " messages=[\n", @@ -251,17 +267,25 @@ "\n", "The next one will be a guide on how to chat with images, continue to the notebook 
[here](./02_Image_Chat101.ipynb). Happy learning!" ] - }, - { - "cell_type": "markdown", - "id": "cce1f624", - "metadata": {}, - "source": [] } ], "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" } }, "nbformat": 4,