From 0fffcc15791860c1c0685e29ce87cbbdbf691a8c Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 2 Jan 2024 14:41:28 +0530 Subject: [PATCH] fix(utils.py): support token counting for gpt-4-vision models --- litellm/tests/test_img_resize.py | 78 +++++++++++++++ litellm/tests/test_token_counter.py | 20 ++++ litellm/utils.py | 146 ++++++++++++++++++++++++++-- 3 files changed, 237 insertions(+), 7 deletions(-) create mode 100644 litellm/tests/test_img_resize.py diff --git a/litellm/tests/test_img_resize.py b/litellm/tests/test_img_resize.py new file mode 100644 index 000000000..b5ea9364f --- /dev/null +++ b/litellm/tests/test_img_resize.py @@ -0,0 +1,78 @@ +from typing import Literal + + +def calculage_img_tokens( + width, + height, + mode: Literal["low", "high", "auto"] = "auto", + base_tokens: int = 85, # openai default - https://openai.com/pricing +): + if mode == "low": + return base_tokens + elif mode == "high" or mode == "auto": + resized_width, resized_height = resize_image_high_res( + width=width, height=height + ) + tiles_needed_high_res = calculate_tiles_needed(resized_width, resized_height) + tile_tokens = (base_tokens * 2) * tiles_needed_high_res + total_tokens = base_tokens + tile_tokens + return total_tokens + + +def resize_image_high_res(width, height): + # Maximum dimensions for high res mode + max_short_side = 768 + max_long_side = 2000 + + # Determine the longer and shorter sides + longer_side = max(width, height) + shorter_side = min(width, height) + + # Calculate the aspect ratio + aspect_ratio = longer_side / shorter_side + + # Resize based on the short side being 768px + if width <= height: # Portrait or square + resized_width = max_short_side + resized_height = int(resized_width * aspect_ratio) + # if the long side exceeds the limit after resizing, adjust both sides accordingly + if resized_height > max_long_side: + resized_height = max_long_side + resized_width = int(resized_height / aspect_ratio) + else: # Landscape + resized_height = max_short_side + resized_width = int(resized_height * aspect_ratio) + # if the long side exceeds the limit after resizing, adjust both sides accordingly + if resized_width > max_long_side: + resized_width = max_long_side + resized_height = int(resized_width / aspect_ratio) + + return resized_width, resized_height + + +# Test the function with the given example +def calculate_tiles_needed( + resized_width, resized_height, tile_width=512, tile_height=512 +): + tiles_across = (resized_width + tile_width - 1) // tile_width + tiles_down = (resized_height + tile_height - 1) // tile_height + total_tiles = tiles_across * tiles_down + return total_tiles + + +# Test high res mode with 1875 x 768 image +resized_width_high_res = 1875 +resized_height_high_res = 768 +tiles_needed_high_res = calculate_tiles_needed( + resized_width_high_res, resized_height_high_res +) +print( + f"Tiles needed for high res image ({resized_width_high_res}x{resized_height_high_res}): {tiles_needed_high_res}" +) + +# If you had the original size and needed to resize and then calculate tiles: +original_size = (10000, 4096) +resized_size_high_res = resize_image_high_res(*original_size) +print(f"Resized dimensions in high res mode: {resized_size_high_res}") +tiles_needed = calculate_tiles_needed(*resized_size_high_res) +print(f"Tiles needed for high res image {resized_size_high_res}: {tiles_needed}") diff --git a/litellm/tests/test_token_counter.py b/litellm/tests/test_token_counter.py index c127dbefd..bf05232d9 100644 --- a/litellm/tests/test_token_counter.py +++ b/litellm/tests/test_token_counter.py @@ -119,3 +119,23 @@ def test_encoding_and_decoding(): # test_encoding_and_decoding() + + +def test_gpt_vision_token_counting(): + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "What’s in this image?"}, + { + "type": "image_url", + "image_url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", + }, + ], + } + ] + tokens = token_counter(model="gpt-4-vision-preview", messages=messages) + print(f"tokens: {tokens}") + + +# test_gpt_vision_token_counting() diff --git a/litellm/utils.py b/litellm/utils.py index 46fdd87f4..d8a75934a 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -7,7 +7,7 @@ # # Thank you users! We ❤️ you! - Krrish & Ishaan -import sys, re, binascii +import sys, re, binascii, struct import litellm import dotenv, json, traceback, threading, base64 import subprocess, os @@ -2495,15 +2495,127 @@ def openai_token_counter( for message in messages: num_tokens += tokens_per_message for key, value in message.items(): - num_tokens += len(encoding.encode(value, disallowed_special=())) - if key == "name": - num_tokens += tokens_per_name + if isinstance(value, str): + num_tokens += len(encoding.encode(value, disallowed_special=())) + if key == "name": + num_tokens += tokens_per_name + elif isinstance(value, List): + for c in value: + if c["type"] == "text": + text += c["text"] + elif c["type"] == "image_url": + if isinstance(c["image_url"], dict): + image_url_dict = c["image_url"] + detail = image_url_dict.get("detail", "auto") + url = image_url_dict.get("url") + num_tokens += calculage_img_tokens( + data=url, mode=detail + ) + elif isinstance(c["image_url"], str): + image_url_str = c["image_url"] + num_tokens += calculage_img_tokens( + data=image_url_str, mode="auto" + ) elif text is not None: num_tokens = len(encoding.encode(text, disallowed_special=())) num_tokens += 3 # every reply is primed with <|start|>assistant<|message|> return num_tokens +def resize_image_high_res(width, height): + # Maximum dimensions for high res mode + max_short_side = 768 + max_long_side = 2000 + + # Determine the longer and shorter sides + longer_side = max(width, height) + shorter_side = min(width, height) + + # Calculate the aspect ratio + aspect_ratio = longer_side / shorter_side + + # Resize based on the short side being 768px + if width <= height: # Portrait or square + resized_width = max_short_side + resized_height = int(resized_width * aspect_ratio) + # if the long side exceeds the limit after resizing, adjust both sides accordingly + if resized_height > max_long_side: + resized_height = max_long_side + resized_width = int(resized_height / aspect_ratio) + else: # Landscape + resized_height = max_short_side + resized_width = int(resized_height * aspect_ratio) + # if the long side exceeds the limit after resizing, adjust both sides accordingly + if resized_width > max_long_side: + resized_width = max_long_side + resized_height = int(resized_width / aspect_ratio) + + return resized_width, resized_height + + +# Test the function with the given example +def calculate_tiles_needed( + resized_width, resized_height, tile_width=512, tile_height=512 +): + tiles_across = (resized_width + tile_width - 1) // tile_width + tiles_down = (resized_height + tile_height - 1) // tile_height + total_tiles = tiles_across * tiles_down + return total_tiles + + +def get_image_dimensions(data): + img_data = None + + # Check if data is a URL by trying to parse it + try: + response = requests.get(data) + response.raise_for_status() # Check if the request was successful + img_data = response.content + except Exception: + # Data is not a URL, handle as base64 + header, encoded = data.split(",", 1) + img_data = base64.b64decode(encoded) + + # Try to determine dimensions from headers + # This is a very simplistic check, primarily works with PNG and non-progressive JPEG + if img_data[:8] == b"\x89PNG\r\n\x1a\n": + # PNG Image; width and height are 4 bytes each and start at offset 16 + width, height = struct.unpack(">ii", img_data[16:24]) + return width, height + elif img_data[:2] == b"\xff\xd8": + # JPEG Image; for dimensions, SOF0 block (0xC0) gives dimensions at offset 3 for length, and then 5 and 7 for height and width + # This will NOT find dimensions for all JPEGs (e.g., progressive JPEGs) + # Find SOF0 marker (0xFF followed by 0xC0) + sof = re.search(b"\xff\xc0....", img_data) + if sof: + # Parse SOF0 block to find dimensions + height, width = struct.unpack(">HH", sof.group()[5:9]) + return width, height + else: + return None, None + else: + # Unsupported format + return None, None + + +def calculage_img_tokens( + data, + mode: Literal["low", "high", "auto"] = "auto", + base_tokens: int = 85, # openai default - https://openai.com/pricing +): + if mode == "low" or mode == "auto": + return base_tokens + elif mode == "high": + width, height = get_image_dimensions(data=data) + resized_width, resized_height = resize_image_high_res( + width=width, height=height + ) + tiles_needed_high_res = calculate_tiles_needed(resized_width, resized_height) + tile_tokens = (base_tokens * 2) * tiles_needed_high_res + total_tokens = base_tokens + tile_tokens + return total_tokens + + def token_counter( model="", text: Optional[Union[str, List[str]]] = None, @@ -2522,13 +2634,33 @@ def token_counter( """ # use tiktoken, anthropic, cohere or llama2's tokenizer depending on the model is_tool_call = False + num_tokens = 0 if text == None: if messages is not None: print_verbose(f"token_counter messages received: {messages}") text = "" for message in messages: - if message.get("content", None): - text += message["content"] + if message.get("content", None) is not None: + content = message.get("content") + if isinstance(content, str): + text += message["content"] + elif isinstance(content, List): + for c in content: + if c["type"] == "text": + text += c["text"] + elif c["type"] == "image_url": + if isinstance(c["image_url"], dict): + image_url_dict = c["image_url"] + detail = image_url_dict.get("detail", "auto") + url = image_url_dict.get("url") + num_tokens += calculage_img_tokens( + data=url, mode=detail + ) + elif isinstance(c["image_url"], str): + image_url_str = c["image_url"] + num_tokens += calculage_img_tokens( + data=image_url_str, mode="auto" + ) if "tool_calls" in message: is_tool_call = True for tool_call in message["tool_calls"]: @@ -2539,7 +2671,7 @@ def token_counter( raise ValueError("text and messages cannot both be None") elif isinstance(text, List): text = "".join(t for t in text if isinstance(t, str)) - num_tokens = 0 + if model is not None: tokenizer_json = _select_tokenizer(model=model) if tokenizer_json["type"] == "huggingface_tokenizer":