llama-stack-mirror/docs/zero_to_hero_guide/02_Image_Chat101.ipynb

{
"cells": [
{
"cell_type": "markdown",
"id": "923343b0-d4bd-4361-b8d4-dd29f86a0fbd",
"metadata": {},
"source": [
"## Getting Started with LlamaStack Vision API\n",
"\n",
"Before you begin, please ensure Llama Stack is installed and set up by following the [Getting Started Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html).\n",
"\n",
"Let's import the necessary packages"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "eae04594-49f9-43af-bb42-9df114d9ddd6",
"metadata": {},
"outputs": [],
"source": [
"import asyncio\n",
"import base64\n",
"import mimetypes\n",
"from llama_stack_client import LlamaStackClient\n",
"from llama_stack_client.lib.inference.event_logger import EventLogger\n",
"from llama_stack_client.types import UserMessage\n",
"from termcolor import cprint"
]
},
{
"cell_type": "markdown",
"id": "143837c6-1072-4015-8297-514712704087",
"metadata": {},
"source": [
"## Configuration\n",
"Set up your connection parameters:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "1d293479-9dde-4b68-94ab-d0c4c61ab08c",
"metadata": {},
"outputs": [],
"source": [
"HOST = \"localhost\" # Replace with your host\n",
"PORT = 5000 # Replace with your port"
]
},
{
"cell_type": "markdown",
"id": "51984856-dfc7-4226-817a-1d44853e6661",
"metadata": {},
"source": [
"## Helper Functions\n",
"Let's create some utility functions to handle image processing and API interaction:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8e65aae0-3ef0-4084-8c59-273a89ac9510",
"metadata": {},
"outputs": [],
"source": [
"def encode_image_to_data_url(file_path: str) -> str:\n",
" \"\"\"\n",
" Encode an image file to a data URL.\n",
"\n",
" Args:\n",
" file_path (str): Path to the image file\n",
"\n",
" Returns:\n",
" str: Data URL string\n",
" \"\"\"\n",
" mime_type, _ = mimetypes.guess_type(file_path)\n",
" if mime_type is None:\n",
" raise ValueError(\"Could not determine MIME type of the file\")\n",
"\n",
" with open(file_path, \"rb\") as image_file:\n",
" encoded_string = base64.b64encode(image_file.read()).decode(\"utf-8\")\n",
"\n",
" return f\"data:{mime_type};base64,{encoded_string}\"\n",
"\n",
"async def process_image(client: LlamaStackClient, image_path: str, stream: bool = True):\n",
" \"\"\"\n",
" Process an image through the LlamaStack Vision API.\n",
"\n",
" Args:\n",
" client (LlamaStackClient): Initialized client\n",
" image_path (str): Path to image file\n",
" stream (bool): Whether to stream the response\n",
" \"\"\"\n",
" data_url = encode_image_to_data_url(image_path)\n",
"\n",
" message = UserMessage(\n",
" role=\"user\",\n",
" content=[\n",
" {\"image\": {\"uri\": data_url}},\n",
" \"Describe what is in this image.\",\n",
" ],\n",
" )\n",
"\n",
" cprint(f\"User> Sending image for analysis...\", \"green\")\n",
" response = client.inference.chat_completion(\n",
" messages=[message],\n",
" model=\"Llama3.2-11B-Vision-Instruct\",\n",
" stream=stream,\n",
" )\n",
"\n",
" if not stream:\n",
" cprint(f\"> Response: {response}\", \"cyan\")\n",
" else:\n",
" async for log in EventLogger().log(response):\n",
" log.print()"
]
},
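{
"cell_type": "markdown",
"id": "4f6d1c2e-8a3b-4e5f-9c7d-2b1a0e9f8d7c",
"metadata": {},
"source": [
"Before chatting with an image, it can help to sanity-check the helper above. The cell below is an optional sketch (not part of the original flow): `YOUR_IMAGE_PATH` is a placeholder you should point at any image on your machine. It checks that the file exists and previews the start of the generated data URL so you can confirm the MIME type was detected."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c9e2b4a-1d5f-4a6b-8e3c-0f2d4a6b8c1e",
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"# Placeholder path -- change this to an image file (PNG/JPEG) on your machine\n",
"YOUR_IMAGE_PATH = \"logo.png\"\n",
"\n",
"if not Path(YOUR_IMAGE_PATH).is_file():\n",
"    print(f\"No file found at {YOUR_IMAGE_PATH!r}; update YOUR_IMAGE_PATH before running the next cell.\")\n",
"else:\n",
"    preview = encode_image_to_data_url(YOUR_IMAGE_PATH)\n",
"    # Print only the scheme and MIME type, not the full base64 payload\n",
"    print(preview[:40] + \"...\")"
]
},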
{
"cell_type": "markdown",
"id": "8073b673-e730-4557-8980-fd8b7ea11975",
"metadata": {},
"source": [
"## Chat with Image\n",
"\n",
"Now let's put it all together:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "64d36476-95d7-49f9-a548-312cf8d8c49e",
"metadata": {},
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "[Errno 2] No such file or directory: 'logo.png'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[4], line 17\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28mprint\u001b[39m(models_response)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;66;03m# Execute the main function\u001b[39;00m\n\u001b[0;32m---> 17\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m main()\n",
"Cell \u001b[0;32mIn[4], line 9\u001b[0m, in \u001b[0;36mmain\u001b[0;34m()\u001b[0m\n\u001b[1;32m 4\u001b[0m client \u001b[38;5;241m=\u001b[39m LlamaStackClient(\n\u001b[1;32m 5\u001b[0m base_url\u001b[38;5;241m=\u001b[39m\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhttp://\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mHOST\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m:\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mPORT\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 6\u001b[0m )\n\u001b[1;32m 8\u001b[0m \u001b[38;5;66;03m# Process image\u001b[39;00m\n\u001b[0;32m----> 9\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m process_image(client, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlogo.png\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 11\u001b[0m \u001b[38;5;66;03m# Query available models\u001b[39;00m\n\u001b[1;32m 12\u001b[0m models_response \u001b[38;5;241m=\u001b[39m client\u001b[38;5;241m.\u001b[39mmodels\u001b[38;5;241m.\u001b[39mlist()\n",
"Cell \u001b[0;32mIn[3], line 29\u001b[0m, in \u001b[0;36mprocess_image\u001b[0;34m(client, image_path, stream)\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mprocess_image\u001b[39m(client: LlamaStackClient, image_path: \u001b[38;5;28mstr\u001b[39m, stream: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m):\n\u001b[1;32m 21\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;124;03m Process an image through the LlamaStack Vision API.\u001b[39;00m\n\u001b[1;32m 23\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[38;5;124;03m stream (bool): Whether to stream the response\u001b[39;00m\n\u001b[1;32m 28\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 29\u001b[0m data_url \u001b[38;5;241m=\u001b[39m \u001b[43mencode_image_to_data_url\u001b[49m\u001b[43m(\u001b[49m\u001b[43mimage_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 31\u001b[0m message \u001b[38;5;241m=\u001b[39m UserMessage(\n\u001b[1;32m 32\u001b[0m role\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124muser\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 33\u001b[0m content\u001b[38;5;241m=\u001b[39m[\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 36\u001b[0m ],\n\u001b[1;32m 37\u001b[0m )\n\u001b[1;32m 39\u001b[0m cprint(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUser> Sending image for analysis...\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgreen\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
"Cell \u001b[0;32mIn[3], line 15\u001b[0m, in \u001b[0;36mencode_image_to_data_url\u001b[0;34m(file_path)\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m mime_type \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCould not determine MIME type of the file\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrb\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m image_file:\n\u001b[1;32m 16\u001b[0m encoded_string \u001b[38;5;241m=\u001b[39m base64\u001b[38;5;241m.\u001b[39mb64encode(image_file\u001b[38;5;241m.\u001b[39mread())\u001b[38;5;241m.\u001b[39mdecode(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 18\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdata:\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmime_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m;base64,\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mencoded_string\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n",
"File \u001b[0;32m~/miniconda3/envs/stack/lib/python3.10/site-packages/IPython/core/interactiveshell.py:324\u001b[0m, in \u001b[0;36m_modified_open\u001b[0;34m(file, *args, **kwargs)\u001b[0m\n\u001b[1;32m 317\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m {\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m}:\n\u001b[1;32m 318\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 319\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIPython won\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt let you open fd=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m by default \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 320\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mas it is likely to crash IPython. If you know what you are doing, \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 321\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124myou can use builtins\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m open.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 322\u001b[0m )\n\u001b[0;32m--> 324\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mio_open\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'logo.png'"
]
}
],
"source": [
"# [Cell 5] - Initialize client and process image\n",
"async def main():\n",
" # Initialize client\n",
" client = LlamaStackClient(\n",
" base_url=f\"http://{HOST}:{PORT}\",\n",
" )\n",
"\n",
" # Process image\n",
" await process_image(client, \"logo.png\")\n",
"\n",
" # Query available models\n",
" models_response = client.models.list()\n",
" print(\"\\nAvailable Models:\")\n",
" print(models_response)\n",
"\n",
"# Execute the main function\n",
"await main()"
]
},
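{
"cell_type": "markdown",
"id": "d3a7f1b9-6e2c-4c8d-b5a0-9f4e7c2d1a6b",
"metadata": {},
"source": [
"If you prefer a single response object over streamed tokens, the same helper can be reused with `stream=False`. The cell below is an optional sketch: it assumes the Llama Stack server at `HOST:PORT` is reachable and that the image path you pass in exists."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9b0c4d2e-7f1a-4b3c-8d6e-5a2f0c8b7d4e",
"metadata": {},
"outputs": [],
"source": [
"# Optional: non-streaming variant of the same request\n",
"client = LlamaStackClient(\n",
"    base_url=f\"http://{HOST}:{PORT}\",\n",
")\n",
"\n",
"await process_image(client, \"logo.png\", stream=False)  # uses the same image path as above"
]
},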
{
"cell_type": "markdown",
"id": "9b39efb4",
"metadata": {},
"source": [
"Thanks for checking out this notebook! \n",
"\n",
"The next one in the series will teach you one of the favorite applications of Large Language Models: [Tool Calling](./03_Tool_Calling101.ipynb). Enjoy!"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 5
}