llama-stack-mirror/docs/zero_to_hero_guide/02_Image_Chat101.ipynb

{
"cells": [
{
"cell_type": "markdown",
"id": "923343b0-d4bd-4361-b8d4-dd29f86a0fbd",
"metadata": {},
"source": [
"## Getting Started with LlamaStack Vision API\n",
"\n",
"Before you begin, please ensure Llama Stack is installed and set up by following the [Getting Started Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html).\n",
"\n",
"Let's import the necessary packages"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "eae04594-49f9-43af-bb42-9df114d9ddd6",
"metadata": {},
"outputs": [],
"source": [
"import asyncio\n",
"import base64\n",
"import mimetypes\n",
"from llama_stack_client import LlamaStackClient\n",
"from llama_stack_client.lib.inference.event_logger import EventLogger\n",
"from llama_stack_client.types import UserMessage\n",
"from termcolor import cprint"
]
},
{
"cell_type": "markdown",
"id": "143837c6-1072-4015-8297-514712704087",
"metadata": {},
"source": [
"## Configuration\n",
"Set up your connection parameters:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "1d293479-9dde-4b68-94ab-d0c4c61ab08c",
"metadata": {},
"outputs": [],
"source": [
"HOST = \"localhost\" # Replace with your host\n",
"PORT = 5000 # Replace with your port"
]
},
{
"cell_type": "markdown",
"id": "51984856-dfc7-4226-817a-1d44853e6661",
"metadata": {},
"source": [
"## Helper Functions\n",
"Let's create some utility functions to handle image processing and API interaction:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8e65aae0-3ef0-4084-8c59-273a89ac9510",
"metadata": {},
"outputs": [],
"source": [
"def encode_image_to_data_url(file_path: str) -> str:\n",
" \"\"\"\n",
" Encode an image file to a data URL.\n",
"\n",
" Args:\n",
" file_path (str): Path to the image file\n",
"\n",
" Returns:\n",
" str: Data URL string\n",
" \"\"\"\n",
" mime_type, _ = mimetypes.guess_type(file_path)\n",
" if mime_type is None:\n",
" raise ValueError(\"Could not determine MIME type of the file\")\n",
"\n",
" with open(file_path, \"rb\") as image_file:\n",
" encoded_string = base64.b64encode(image_file.read()).decode(\"utf-8\")\n",
"\n",
" return f\"data:{mime_type};base64,{encoded_string}\"\n",
"\n",
"async def process_image(client: LlamaStackClient, image_path: str, stream: bool = True):\n",
" \"\"\"\n",
" Process an image through the LlamaStack Vision API.\n",
"\n",
" Args:\n",
" client (LlamaStackClient): Initialized client\n",
" image_path (str): Path to image file\n",
" stream (bool): Whether to stream the response\n",
" \"\"\"\n",
" data_url = encode_image_to_data_url(image_path)\n",
"\n",
" message = UserMessage(\n",
" role=\"user\",\n",
" content=[\n",
" {\"image\": {\"uri\": data_url}},\n",
" \"Describe what is in this image.\",\n",
" ],\n",
" )\n",
"\n",
" cprint(f\"User> Sending image for analysis...\", \"green\")\n",
" response = client.inference.chat_completion(\n",
" messages=[message],\n",
" model=\"Llama3.2-11B-Vision-Instruct\",\n",
" stream=stream,\n",
" )\n",
"\n",
" if not stream:\n",
" cprint(f\"> Response: {response}\", \"cyan\")\n",
" else:\n",
" async for log in EventLogger().log(response):\n",
" log.print()"
]
},
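{
"cell_type": "markdown",
"id": "4f6d1c2e-8a3b-4e5f-9c7d-2b1a0e9f8d7c",
"metadata": {},
"source": [
"Before chatting with an image, it can help to sanity-check the helper above. The cell below is an optional sketch (not part of the original flow): `YOUR_IMAGE_PATH` is a placeholder you should point at any image on your machine. It checks that the file exists and previews the start of the generated data URL so you can confirm the MIME type was detected."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c9e2b4a-1d5f-4a6b-8e3c-0f2d4a6b8c1e",
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"# Placeholder path -- change this to an image file (PNG/JPEG) on your machine\n",
"YOUR_IMAGE_PATH = \"logo.png\"\n",
"\n",
"if not Path(YOUR_IMAGE_PATH).is_file():\n",
"    print(f\"No file found at {YOUR_IMAGE_PATH!r}; update YOUR_IMAGE_PATH before running the next cell.\")\n",
"else:\n",
"    preview = encode_image_to_data_url(YOUR_IMAGE_PATH)\n",
"    # Print only the scheme and MIME type, not the full base64 payload\n",
"    print(preview[:40] + \"...\")"
]
},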
{
"cell_type": "markdown",
"id": "8073b673-e730-4557-8980-fd8b7ea11975",
"metadata": {},
"source": [
"## Chat with Image\n",
"\n",
"Now let's put it all together:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "64d36476-95d7-49f9-a548-312cf8d8c49e",
"metadata": {},
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "[Errno 2] No such file or directory: 'logo.png'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[4], line 17\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28mprint\u001b[39m(models_response)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;66;03m# Execute the main function\u001b[39;00m\n\u001b[0;32m---> 17\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m main()\n",
"Cell \u001b[0;32mIn[4], line 9\u001b[0m, in \u001b[0;36mmain\u001b[0;34m()\u001b[0m\n\u001b[1;32m 4\u001b[0m client \u001b[38;5;241m=\u001b[39m LlamaStackClient(\n\u001b[1;32m 5\u001b[0m base_url\u001b[38;5;241m=\u001b[39m\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhttp://\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mHOST\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m:\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mPORT\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 6\u001b[0m )\n\u001b[1;32m 8\u001b[0m \u001b[38;5;66;03m# Process image\u001b[39;00m\n\u001b[0;32m----> 9\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m process_image(client, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlogo.png\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 11\u001b[0m \u001b[38;5;66;03m# Query available models\u001b[39;00m\n\u001b[1;32m 12\u001b[0m models_response \u001b[38;5;241m=\u001b[39m client\u001b[38;5;241m.\u001b[39mmodels\u001b[38;5;241m.\u001b[39mlist()\n",
"Cell \u001b[0;32mIn[3], line 29\u001b[0m, in \u001b[0;36mprocess_image\u001b[0;34m(client, image_path, stream)\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mprocess_image\u001b[39m(client: LlamaStackClient, image_path: \u001b[38;5;28mstr\u001b[39m, stream: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m):\n\u001b[1;32m 21\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;124;03m Process an image through the LlamaStack Vision API.\u001b[39;00m\n\u001b[1;32m 23\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[38;5;124;03m stream (bool): Whether to stream the response\u001b[39;00m\n\u001b[1;32m 28\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 29\u001b[0m data_url \u001b[38;5;241m=\u001b[39m \u001b[43mencode_image_to_data_url\u001b[49m\u001b[43m(\u001b[49m\u001b[43mimage_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 31\u001b[0m message \u001b[38;5;241m=\u001b[39m UserMessage(\n\u001b[1;32m 32\u001b[0m role\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124muser\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 33\u001b[0m content\u001b[38;5;241m=\u001b[39m[\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 36\u001b[0m ],\n\u001b[1;32m 37\u001b[0m )\n\u001b[1;32m 39\u001b[0m cprint(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUser> Sending image for analysis...\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgreen\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
"Cell \u001b[0;32mIn[3], line 15\u001b[0m, in \u001b[0;36mencode_image_to_data_url\u001b[0;34m(file_path)\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m mime_type \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCould not determine MIME type of the file\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrb\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m image_file:\n\u001b[1;32m 16\u001b[0m encoded_string \u001b[38;5;241m=\u001b[39m base64\u001b[38;5;241m.\u001b[39mb64encode(image_file\u001b[38;5;241m.\u001b[39mread())\u001b[38;5;241m.\u001b[39mdecode(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 18\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdata:\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmime_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m;base64,\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mencoded_string\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n",
"File \u001b[0;32m~/miniconda3/envs/stack/lib/python3.10/site-packages/IPython/core/interactiveshell.py:324\u001b[0m, in \u001b[0;36m_modified_open\u001b[0;34m(file, *args, **kwargs)\u001b[0m\n\u001b[1;32m 317\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m {\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m}:\n\u001b[1;32m 318\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 319\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIPython won\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt let you open fd=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m by default \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 320\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mas it is likely to crash IPython. If you know what you are doing, \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 321\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124myou can use builtins\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m open.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 322\u001b[0m )\n\u001b[0;32m--> 324\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mio_open\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'logo.png'"
]
}
],
"source": [
"# [Cell 5] - Initialize client and process image\n",
"async def main():\n",
" # Initialize client\n",
" client = LlamaStackClient(\n",
" base_url=f\"http://{HOST}:{PORT}\",\n",
" )\n",
"\n",
" # Process image\n",
" await process_image(client, \"logo.png\")\n",
"\n",
" # Query available models\n",
" models_response = client.models.list()\n",
" print(\"\\nAvailable Models:\")\n",
" print(models_response)\n",
"\n",
"# Execute the main function\n",
"await main()"
]
},
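{
"cell_type": "markdown",
"id": "d3a7f1b9-6e2c-4c8d-b5a0-9f4e7c2d1a6b",
"metadata": {},
"source": [
"If you prefer a single response object over streamed tokens, the same helper can be reused with `stream=False`. The cell below is an optional sketch: it assumes the Llama Stack server at `HOST:PORT` is reachable and that the image path you pass in exists."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9b0c4d2e-7f1a-4b3c-8d6e-5a2f0c8b7d4e",
"metadata": {},
"outputs": [],
"source": [
"# Optional: non-streaming variant of the same request\n",
"client = LlamaStackClient(\n",
"    base_url=f\"http://{HOST}:{PORT}\",\n",
")\n",
"\n",
"await process_image(client, \"logo.png\", stream=False)  # uses the same image path as above"
]
},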
{
"cell_type": "markdown",
"id": "9b39efb4",
"metadata": {},
"source": [
"Thanks for checking out this notebook! \n",
"\n",
"The next one in the series will teach you one of the favorite applications of Large Language Models: [Tool Calling](./03_Tool_Calling101.ipynb). Enjoy!"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 5
}