diff --git a/litellm/__pycache__/__init__.cpython-311.pyc b/litellm/__pycache__/__init__.cpython-311.pyc
index fd30776861..c047a81442 100644
Binary files a/litellm/__pycache__/__init__.cpython-311.pyc and b/litellm/__pycache__/__init__.cpython-311.pyc differ
diff --git a/litellm/__pycache__/main.cpython-311.pyc b/litellm/__pycache__/main.cpython-311.pyc
index 6b6ee11b22..06cc9494ee 100644
Binary files a/litellm/__pycache__/main.cpython-311.pyc and b/litellm/__pycache__/main.cpython-311.pyc differ
diff --git a/litellm/__pycache__/utils.cpython-311.pyc b/litellm/__pycache__/utils.cpython-311.pyc
index c4a41fd5a0..93c419413c 100644
Binary files a/litellm/__pycache__/utils.cpython-311.pyc and b/litellm/__pycache__/utils.cpython-311.pyc differ
diff --git a/litellm/main.py b/litellm/main.py
index adc8df9d3a..a51498140a 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -1,14 +1,13 @@
 import os, openai, cohere, replicate, sys
 from typing import Any
 from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
-import traceback
 from functools import partial
-import dotenv
-import traceback
+import dotenv, traceback, random, asyncio, time
+from copy import deepcopy
 import litellm
 from litellm import client, logging, exception_type, timeout, get_optional_params
-import random
-import asyncio
+import tiktoken
+encoding = tiktoken.get_encoding("cl100k_base")
 from tenacity import (
     retry,
     stop_after_attempt,
@@ -17,6 +16,17 @@ from tenacity import (
 ####### ENVIRONMENT VARIABLES ###################
 dotenv.load_dotenv() # Loading env variables using dotenv
+new_response = {
+    "choices": [
+        {
+            "finish_reason": "stop",
+            "index": 0,
+            "message": {
+                "role": "assistant"
+            }
+        }
+    ]
+}
 
 # TODO move this to utils.py
 # TODO add translations
 # TODO see if this worked - model_name == krrish
@@ -44,6 +54,8 @@ def completion(
     *, return_async=False, api_key=None, force_timeout=60, azure=False, logger_fn=None, verbose=False
 ):
     try:
+        global new_response
+        model_response = deepcopy(new_response) # deep copy the default response format so we can mutate it and it's thread-safe.
         # check if user passed in any of the OpenAI optional params
         optional_params = get_optional_params(
             functions=functions, function_call=function_call,
@@ -128,6 +140,15 @@ def completion(
                 model=model,
                 prompt = prompt
             )
+            completion_response = response["choices"][0]["text"]
+            ## LOGGING
+            logging(model=model, input=prompt, azure=azure, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn)
+            ## RESPONSE OBJECT
+            model_response["choices"][0]["message"]["content"] = completion_response
+            model_response["created"] = response["created"]
+            model_response["model"] = model
+            model_response["usage"] = response["usage"]
+            response = model_response
         elif "replicate" in model:
             # replicate defaults to os.environ.get("REPLICATE_API_TOKEN")
             # checking in case user set it to REPLICATE_API_KEY instead
@@ -153,19 +174,21 @@ def completion(
             response = ""
             for item in output:
                 response += item
-            new_response = {
-                "choices": [
-                    {
-                        "finish_reason": "stop",
-                        "index": 0,
-                        "message": {
-                            "content": response,
-                            "role": "assistant"
-                        }
-                    }
-                ]
-            }
-            response = new_response
+            completion_response = response
+            ## LOGGING
+            logging(model=model, input=prompt, azure=azure, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn)
+            prompt_tokens = len(encoding.encode(prompt))
+            completion_tokens = len(encoding.encode(completion_response))
+            ## RESPONSE OBJECT
+            model_response["choices"][0]["message"]["content"] = completion_response
+            model_response["created"] = time.time()
+            model_response["model"] = model
+            model_response["usage"] = {
+                "prompt_tokens": prompt_tokens,
+                "completion_tokens": completion_tokens,
+                "total_tokens": prompt_tokens + completion_tokens
+            }
+            response = model_response
         elif model in litellm.anthropic_models:
             #anthropic defaults to os.environ.get("ANTHROPIC_API_KEY")
             if api_key:
@@ -183,7 +206,6 @@ def completion(
                     prompt += f"{HUMAN_PROMPT}{message['content']}"
             prompt += f"{AI_PROMPT}"
             anthropic = Anthropic()
-            # check if user passed in max_tokens
             if max_tokens != float('inf'):
                 max_tokens_to_sample = max_tokens
             else:
@@ -196,20 +218,22 @@ def completion(
                 prompt=prompt,
                 max_tokens_to_sample=max_tokens_to_sample
             )
-            new_response = {
-                "choices": [
-                    {
-                        "finish_reason": "stop",
-                        "index": 0,
-                        "message": {
-                            "content": completion.completion,
-                            "role": "assistant"
-                        }
-                    }
-                ]
-            }
-            print_verbose(f"new response: {new_response}")
-            response = new_response
+            completion_response = completion.completion
+            ## LOGGING
+            logging(model=model, input=prompt, azure=azure, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn)
+            prompt_tokens = anthropic.count_tokens(prompt)
+            completion_tokens = anthropic.count_tokens(completion_response)
+            ## RESPONSE OBJECT
+            print(f"model_response: {model_response}")
+            model_response["choices"][0]["message"]["content"] = completion_response
+            model_response["created"] = time.time()
+            model_response["model"] = model
+            model_response["usage"] = {
+                "prompt_tokens": prompt_tokens,
+                "completion_tokens": completion_tokens,
+                "total_tokens": prompt_tokens + completion_tokens
+            }
+            response = model_response
         elif model in litellm.cohere_models:
             if api_key:
                 cohere_key = api_key
@@ -226,19 +250,21 @@ def completion(
                 model=model,
                 prompt = prompt
             )
-            new_response = {
-                "choices": [
-                    {
-                        "finish_reason": "stop",
-                        "index": 0,
-                        "message": {
-                            "content": response[0].text,
-                            "role": "assistant"
-                        }
-                    }
-                ],
-            }
-            response = new_response
+            completion_response = response[0].text
+            ## LOGGING
+            logging(model=model, input=prompt, azure=azure, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn)
+            prompt_tokens = len(encoding.encode(prompt))
+            completion_tokens = len(encoding.encode(completion_response))
+            ## RESPONSE OBJECT
+            model_response["choices"][0]["message"]["content"] = completion_response
+            model_response["created"] = time.time()
+            model_response["model"] = model
+            model_response["usage"] = {
+                "prompt_tokens": prompt_tokens,
+                "completion_tokens": completion_tokens,
+                "total_tokens": prompt_tokens + completion_tokens
+            }
+            response = model_response
         else:
             ## LOGGING
             logging(model=model, input=messages, azure=azure, logger_fn=logger_fn)
diff --git a/litellm/tests/__pycache__/test_completion.cpython-311-pytest-7.4.0.pyc b/litellm/tests/__pycache__/test_completion.cpython-311-pytest-7.4.0.pyc
index 2baa7bc5f0..0c5fa166d7 100644
Binary files a/litellm/tests/__pycache__/test_completion.cpython-311-pytest-7.4.0.pyc and b/litellm/tests/__pycache__/test_completion.cpython-311-pytest-7.4.0.pyc differ
diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index 88d2ef782b..c99b278b13 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -8,11 +8,14 @@
 import pytest
 import litellm
 from litellm import embedding, completion
-litellm.set_verbose = True
+# litellm.set_verbose = True
 
 user_message = "Hello, whats the weather in San Francisco??"
 messages = [{ "content": user_message,"role": "user"}]
 
+def logger_fn(user_model_dict):
+    print(f"user_model_dict: {user_model_dict}")
+
 def test_completion_openai():
     try:
         response = completion(model="gpt-3.5-turbo", messages=messages)
@@ -83,7 +86,7 @@ def test_completion_azure():
 
 def test_completion_claude():
     try:
-        response = completion(model="claude-instant-1", messages=messages)
+        response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn)
         # Add any assertions here to check the response
         print(response)
     except Exception as e:
@@ -97,7 +100,8 @@ def test_completion_cohere():
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
-
+# Replicate API endpoints are unstable -> throw random CUDA errors -> this means our tests can fail even if our tests weren't incorrect.
+# [TODO] improve our try-except block to handle for these
 # def test_completion_replicate_llama():
 #     model_name = "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1"
 #     try:
diff --git a/litellm/utils.py b/litellm/utils.py
index 94880e6699..8e62d3470a 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -33,13 +33,16 @@ def logging(model=None, input=None, azure=False, additional_args={}, logger_fn=N
     if azure:
         model_call_details["azure"] = azure
     if exception:
-        model_call_details["original_exception"] = exception
+        model_call_details["exception"] = exception
 
     if litellm.telemetry:
         safe_crash_reporting(model=model, exception=exception, azure=azure) # log usage-crash details. Do not log any user details. If you want to turn this off, set `litellm.telemetry=False`.
 
     if input:
         model_call_details["input"] = input
+
+    if len(additional_args):
+        model_call_details["additional_args"] = additional_args # log additional call details -> api key, etc.
     if model:
         if azure == True or model in litellm.open_ai_chat_completion_models or model in litellm.open_ai_chat_completion_models or model in litellm.open_ai_embedding_models:
@@ -53,7 +56,6 @@ def logging(model=None, input=None, azure=False, additional_args={}, logger_fn=N
             model_call_details["api_key"] = os.environ.get("ANTHROPIC_API_KEY")
         elif model in litellm.cohere_models:
             model_call_details["api_key"] = os.environ.get("COHERE_API_KEY")
-    model_call_details["additional_args"] = additional_args
     ## User Logging -> if you pass in a custom logging function or want to use sentry breadcrumbs
     print_verbose(f"Logging Details: logger_fn - {logger_fn} | callable(logger_fn) - {callable(logger_fn)}")
     if logger_fn and callable(logger_fn):
@@ -318,6 +320,7 @@ def exception_type(model, original_exception):
             exception_type = type(original_exception).__name__
         else:
             exception_type = ""
+        logging(model=model, additional_args={"error_str": error_str, "exception_type": exception_type, "original_exception": original_exception}, logger_fn=user_logger_fn)
         if "claude" in model: #one of the anthropics
             if "status_code" in original_exception:
                 print_verbose(f"status_code: {original_exception.status_code}")
@@ -357,7 +360,7 @@ def exception_type(model, original_exception):
             raise original_exception
     except Exception as e:
         ## LOGGING
-        logging(logger_fn=user_logger_fn, additional_args={"original_exception": original_exception}, exception=e)
+        logging(logger_fn=user_logger_fn, additional_args={"exception_mapping_worked": exception_mapping_worked, "original_exception": original_exception}, exception=e)
         if exception_mapping_worked:
             raise e
         else: # don't let an error with mapping interrupt the user from receiving an error from the llm api calls
diff --git a/pyproject.toml b/pyproject.toml
index ea45eb1cef..4fa2b2fd3f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "0.1.331"
+version = "0.1.341"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT License"
@@ -15,6 +15,7 @@ anthropic = "^0.3.7"
 replicate = "^0.10.0"
 python-dotenv = "^1.0.0"
 tenacity = "^8.0.1"
+tiktoken = "^0.4.0"
 
 [build-system]
 requires = ["poetry-core"]
diff --git a/requirements.txt b/requirements.txt
index 098bd48021..eca980d36a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,4 +5,5 @@ replicate
 pytest
 python-dotenv
 openai[datalib]
-tenacity
\ No newline at end of file
+tenacity
+tiktoken
\ No newline at end of file
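
A minimal, self-contained sketch (not code from the library) of the response-shaping pattern this patch introduces in completion(): deep-copy a default OpenAI-style response template, then fill in the provider's text, a timestamp, and tiktoken-based usage counts. The DEFAULT_RESPONSE constant and build_model_response helper below are illustrative names, not litellm APIs.

import time
from copy import deepcopy

import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")

# illustrative stand-in for the module-level `new_response` template added by the patch
DEFAULT_RESPONSE = {
    "choices": [
        {
            "finish_reason": "stop",
            "index": 0,
            "message": {"role": "assistant"}
        }
    ]
}

def build_model_response(model, prompt, completion_text):
    # deepcopy so concurrent calls never mutate the shared template
    model_response = deepcopy(DEFAULT_RESPONSE)
    prompt_tokens = len(encoding.encode(prompt))
    completion_tokens = len(encoding.encode(completion_text))
    model_response["choices"][0]["message"]["content"] = completion_text
    model_response["created"] = time.time()
    model_response["model"] = model
    model_response["usage"] = {
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "total_tokens": prompt_tokens + completion_tokens
    }
    return model_response

This is also why pyproject.toml and requirements.txt gain a tiktoken dependency: Cohere and Replicate do not return OpenAI-style usage counts, so the patch approximates them with the cl100k_base encoding.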
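
The patch counts tokens two different ways: Anthropic's client-side counter for Claude models, and tiktoken for everything else. A rough sketch of that split, assuming ANTHROPIC_API_KEY is set in the environment (the count_tokens helper name is hypothetical):

import tiktoken
from anthropic import Anthropic

def count_tokens(model, text):
    # hypothetical helper mirroring the per-provider split used in completion()
    if "claude" in model:
        # Anthropic ships its own tokenizer; Anthropic() reads ANTHROPIC_API_KEY from the env
        return Anthropic().count_tokens(text)
    # every other provider falls back to tiktoken's cl100k_base encoding
    return len(tiktoken.get_encoding("cl100k_base").encode(text))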
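
Finally, a hypothetical usage sketch of the logger_fn hook the updated tests exercise, assuming litellm is installed and the relevant provider key (here ANTHROPIC_API_KEY) is exported. The callback receives the model-call details dict that litellm.utils.logging assembles, which after this patch also carries additional_args and exception details:

from litellm import completion

def logger_fn(model_call_dict):
    # called with the model call details: model, input, api key source, additional args, ...
    print(f"model_call_dict: {model_call_dict}")

messages = [{"role": "user", "content": "Hello, whats the weather in San Francisco??"}]
response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn)
print(response["choices"][0]["message"]["content"])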