diff --git a/docs/image_chat101.ipynb b/docs/image_chat101.ipynb new file mode 100644 index 000000000..d62646a4a --- /dev/null +++ b/docs/image_chat101.ipynb @@ -0,0 +1,194 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "923343b0-d4bd-4361-b8d4-dd29f86a0fbd", + "metadata": {}, + "source": [ + "## Getting Started with LlamaStack Vision API\n", + "\n", + "Let's import the necessary packages" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "eae04594-49f9-43af-bb42-9df114d9ddd6", + "metadata": {}, + "outputs": [], + "source": [ + "import asyncio\n", + "import base64\n", + "import mimetypes\n", + "from llama_stack_client import LlamaStackClient\n", + "from llama_stack_client.lib.inference.event_logger import EventLogger\n", + "from llama_stack_client.types import UserMessage\n", + "from termcolor import cprint" + ] + }, + { + "cell_type": "markdown", + "id": "143837c6-1072-4015-8297-514712704087", + "metadata": {}, + "source": [ + "## Configuration\n", + "Set up your connection parameters:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1d293479-9dde-4b68-94ab-d0c4c61ab08c", + "metadata": {}, + "outputs": [], + "source": [ + "HOST = \"localhost\" # Replace with your host\n", + "PORT = 5001 # Replace with your port" + ] + }, + { + "cell_type": "markdown", + "id": "51984856-dfc7-4226-817a-1d44853e6661", + "metadata": {}, + "source": [ + "## Helper Functions\n", + "Let's create some utility functions to handle image processing and API interaction:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "8e65aae0-3ef0-4084-8c59-273a89ac9510", + "metadata": {}, + "outputs": [], + "source": [ + "def encode_image_to_data_url(file_path: str) -> str:\n", + " \"\"\"\n", + " Encode an image file to a data URL.\n", + " \n", + " Args:\n", + " file_path (str): Path to the image file\n", + " \n", + " Returns:\n", + " str: Data URL string\n", + " \"\"\"\n", + " mime_type, _ = mimetypes.guess_type(file_path)\n", + " if 
mime_type is None:\n", + " raise ValueError(\"Could not determine MIME type of the file\")\n", + " \n", + " with open(file_path, \"rb\") as image_file:\n", + " encoded_string = base64.b64encode(image_file.read()).decode(\"utf-8\")\n", + " \n", + " return f\"data:{mime_type};base64,{encoded_string}\"\n", + "\n", + "async def process_image(client: LlamaStackClient, image_path: str, stream: bool = True):\n", + " \"\"\"\n", + " Process an image through the LlamaStack Vision API.\n", + " \n", + " Args:\n", + " client (LlamaStackClient): Initialized client\n", + " image_path (str): Path to image file\n", + " stream (bool): Whether to stream the response\n", + " \"\"\"\n", + " data_url = encode_image_to_data_url(image_path)\n", + " \n", + " message = UserMessage(\n", + " role=\"user\",\n", + " content=[\n", + " {\"image\": {\"uri\": data_url}},\n", + " \"Describe what is in this image.\",\n", + " ],\n", + " )\n", + " \n", + " cprint(f\"User> Sending image for analysis...\", \"green\")\n", + " response = client.inference.chat_completion(\n", + " messages=[message],\n", + " model=\"Llama3.2-11B-Vision-Instruct\",\n", + " stream=stream,\n", + " )\n", + " \n", + " if not stream:\n", + " cprint(f\"> Response: {response}\", \"cyan\")\n", + " else:\n", + " async for log in EventLogger().log(response):\n", + " log.print()" + ] + }, + { + "cell_type": "markdown", + "id": "8073b673-e730-4557-8980-fd8b7ea11975", + "metadata": {}, + "source": [ + "## Chat with Image\n", + "\n", + "Now let's put it all together:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64d36476-95d7-49f9-a548-312cf8d8c49e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32mUser> Sending image for analysis...\u001b[0m\n", + "\u001b[36mAssistant> \u001b[0m\u001b[33mThe\u001b[0m\u001b[33m image\u001b[0m\u001b[33m features\u001b[0m\u001b[33m a\u001b[0m\u001b[33m styl\u001b[0m\u001b[33mized\u001b[0m\u001b[33m,\u001b[0m\u001b[33m 
mon\u001b[0m\u001b[33moch\u001b[0m\u001b[33mromatic\u001b[0m\u001b[33m logo\u001b[0m\u001b[33m for\u001b[0m\u001b[33m \"\u001b[0m\u001b[33mLL\u001b[0m\u001b[33mAMA\u001b[0m\u001b[33m STACK\u001b[0m\u001b[33m\"\u001b[0m\u001b[33m against\u001b[0m\u001b[33m a\u001b[0m\u001b[33m solid\u001b[0m\u001b[33m black\u001b[0m\u001b[33m background\u001b[0m\u001b[33m.\u001b[0m\u001b[33m The\u001b[0m\u001b[33m logo\u001b[0m\u001b[33m is\u001b[0m\u001b[33m centered\u001b[0m\u001b[33m and\u001b[0m\u001b[33m consists\u001b[0m\u001b[33m of\u001b[0m\u001b[33m a\u001b[0m\u001b[33m simple\u001b[0m\u001b[33m line\u001b[0m\u001b[33m drawing\u001b[0m\u001b[33m of\u001b[0m\u001b[33m a\u001b[0m\u001b[33m llama\u001b[0m\u001b[33m's\u001b[0m\u001b[33m head\u001b[0m\u001b[33m and\u001b[0m\u001b[33m neck\u001b[0m\u001b[33m,\u001b[0m\u001b[33m with\u001b[0m\u001b[33m its\u001b[0m\u001b[33m body\u001b[0m\u001b[33m replaced\u001b[0m\u001b[33m by\u001b[0m\u001b[33m a\u001b[0m\u001b[33m stack\u001b[0m\u001b[33m of\u001b[0m\u001b[33m three\u001b[0m\u001b[33m rounded\u001b[0m\u001b[33m rectangles\u001b[0m\u001b[33m resembling\u001b[0m\u001b[33m a\u001b[0m\u001b[33m pile\u001b[0m\u001b[33m of\u001b[0m\u001b[33m pancakes\u001b[0m\u001b[33m or\u001b[0m\u001b[33m a\u001b[0m\u001b[33m stack\u001b[0m\u001b[33m of\u001b[0m\u001b[33m books\u001b[0m\u001b[33m.\u001b[0m\u001b[33m The\u001b[0m\u001b[33m llama\u001b[0m\u001b[33m's\u001b[0m\u001b[33m head\u001b[0m\u001b[33m is\u001b[0m\u001b[33m depicted\u001b[0m\u001b[33m in\u001b[0m\u001b[33m profile\u001b[0m\u001b[33m,\u001b[0m\u001b[33m facing\u001b[0m\u001b[33m to\u001b[0m\u001b[33m the\u001b[0m\u001b[33m left\u001b[0m\u001b[33m,\u001b[0m\u001b[33m with\u001b[0m\u001b[33m a\u001b[0m\u001b[33m small\u001b[0m\u001b[33m circle\u001b[0m\u001b[33m representing\u001b[0m\u001b[33m the\u001b[0m\u001b[33m eye\u001b[0m\u001b[33m and\u001b[0m\u001b[33m a\u001b[0m\u001b[33m curved\u001b[0m\u001b[33m line\u001b[0m\u001b[33m indicating\u001b[0m\u001b[33m 
the\u001b[0m\u001b[33m ear\u001b[0m\u001b[33m.\u001b[0m\u001b[33m The\u001b[0m\u001b[33m llama\u001b[0m\u001b[33m's\u001b[0m\u001b[33m neck\u001b[0m" + ] + } + ], + "source": [ + "# Initialize the client and process the image\n", + "async def main():\n", + " # Initialize client\n", + " client = LlamaStackClient(\n", + " base_url=f\"http://{HOST}:{PORT}\",\n", + " )\n", + " \n", + " # Process image\n", + " await process_image(client, \"logo.png\")\n", + " \n", + " # Query available models\n", + " models_response = client.models.list()\n", + " print(\"\\nAvailable Models:\")\n", + " print(models_response)\n", + "\n", + "# Execute the main function\n", + "await main()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "277adb5d-a9cc-40ec-a961-2d194f88a00b", + "metadata": {}, + "outputs": [], + "source": [ + "#fin" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}