import os
import sys
import traceback

from dotenv import load_dotenv

load_dotenv()
import io

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path

from typing import Literal

import pytest
from pydantic import BaseModel, ConfigDict

import litellm
from litellm import Router, completion_cost, stream_chunk_builder
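# Repro for https://github.com/BerriAI/litellm/issues/4965: send the same
# request once normally and once with stream=True, then compare the costs
# litellm computes for each path.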
models = [
    dict(
        model_name="openai/gpt-3.5-turbo",
    ),
    dict(
        model_name="anthropic/claude-3-haiku-20240307",
    ),
    dict(
        model_name="together_ai/meta-llama/Llama-2-7b-chat-hf",
    ),
]

router = Router(
    model_list=[
        {
            "model_name": m["model_name"],
            "litellm_params": {
                "model": m.get("model", m["model_name"]),
            },
        }
        for m in models
    ],
    routing_strategy="simple-shuffle",
    num_retries=3,
    retry_after=1,
    timeout=60.0,
    allowed_fails=2,
    cooldown_time=0,
    debug_level="INFO",
)
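# Router configuration notes (my reading of litellm's documented behavior,
# not asserted by this file):
# - routing_strategy="simple-shuffle" picks a deployment at random per call.
# - allowed_fails / cooldown_time control when a failing deployment is benched;
#   cooldown_time=0 effectively keeps every deployment in rotation.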
@pytest.mark.parametrize(
    "model",
    [
        "openai/gpt-3.5-turbo",
        # "anthropic/claude-3-haiku-20240307",
        # "together_ai/meta-llama/Llama-2-7b-chat-hf",
    ],
)
def test_run(model: str):
    """
    Verify that a streamed completion is costed the same as its
    non-streamed equivalent.

    Relevant issue: https://github.com/BerriAI/litellm/issues/4965
    """
    # litellm.set_verbose = True
    prompt = "Hi"
    kwargs = dict(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.001,
        top_p=0.001,
        max_tokens=20,
        input_cost_per_token=2,
        output_cost_per_token=2,
    )
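    # input_cost_per_token / output_cost_per_token override the registered
    # pricing for the model, so both code paths should price tokens at the
    # same (deliberately inflated) rate regardless of provider.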
    print(f"--------- {model} ---------")
    print(f"Prompt: {prompt}")

    response = router.completion(**kwargs)  # type: ignore
    non_stream_output = response.choices[0].message.content.replace("\n", "")  # type: ignore
    # response_cost is the cost litellm tracked internally for this call.
    non_stream_cost_calc = response._hidden_params["response_cost"] * 100

    print(f"Non-stream output: {non_stream_output}")
    print(f"Non-stream usage : {response.usage}")  # type: ignore
    try:
        print(
            f"Non-stream cost : {response._hidden_params['response_cost'] * 100:.4f}"
        )
    except TypeError:
        # response_cost may be None, which makes the :.4f format raise TypeError.
        print("Non-stream cost : NONE")
    print(f"Non-stream cost : {completion_cost(response) * 100:.4f} (response)")
    response = router.completion(**kwargs, stream=True)  # type: ignore
    response = stream_chunk_builder(list(response), messages=kwargs["messages"])  # type: ignore
    output = response.choices[0].message.content.replace("\n", "")  # type: ignore
    streaming_cost_calc = completion_cost(response) * 100
    print(f"Stream output : {output}")

    print(f"Stream usage : {response.usage}")  # type: ignore
    print(f"Stream cost : {streaming_cost_calc} (response)")
    print("")
    # Costs are only comparable when both paths produced identical text.
    if output == non_stream_output:
        # assert the streaming and non-streaming costs match
        assert streaming_cost_calc == non_stream_cost_calc
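# Minimal convenience runner (not part of the original test file) so the
# comparison can be eyeballed without invoking pytest; the model name is the
# one case left enabled in the parametrize list above.
if __name__ == "__main__":
    test_run("openai/gpt-3.5-turbo")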