litellm-mirror/tests/test_fallbacks.py

# What is this?
## This tests if the proxy fallbacks work as expected
import pytest
import asyncio
import aiohttp
from large_text import text
import time
from typing import Optional


async def generate_key(
    session,
    i,
    models: list,
    calling_key="sk-1234",
):
    url = "http://0.0.0.0:4000/key/generate"
    headers = {
        "Authorization": f"Bearer {calling_key}",
        "Content-Type": "application/json",
    }
    data = {
        "models": models,
    }

    print(f"data: {data}")

    async with session.post(url, headers=headers, json=data) as response:
        status = response.status
        response_text = await response.text()

        print(f"Response {i} (Status code: {status}):")
        print(response_text)
        print()

        if status != 200:
            raise Exception(f"Request {i} did not return a 200 status code: {status}")

        return await response.json()


async def chat_completion(
    session,
    key: str,
    model: str,
    messages: list,
    return_headers: bool = False,
    extra_headers: Optional[dict] = None,
    **kwargs,
):
    url = "http://0.0.0.0:4000/chat/completions"
    headers = {
        "Authorization": f"Bearer {key}",
        "Content-Type": "application/json",
    }
    if extra_headers is not None:
        headers.update(extra_headers)
    data = {"model": model, "messages": messages, **kwargs}

    async with session.post(url, headers=headers, json=data) as response:
        status = response.status
        response_text = await response.text()

        print(response_text)
        print()

        if status != 200:
            if return_headers:
                return None, response.headers
            else:
                raise Exception(f"Request did not return a 200 status code: {status}")

        if return_headers:
            return await response.json(), response.headers
        else:
            return await response.json()


@pytest.mark.asyncio
async def test_chat_completion():
    """
    make chat completion call with prompt > context window. expect it to work with fallback
    """
    async with aiohttp.ClientSession() as session:
        model = "gpt-3.5-turbo"
        messages = [
            {"role": "system", "content": text},
            {"role": "user", "content": "Who was Alexander?"},
        ]
        await chat_completion(
            session=session, key="sk-1234", model=model, messages=messages
        )


@pytest.mark.parametrize("has_access", [True, False])
@pytest.mark.asyncio
async def test_chat_completion_client_fallbacks(has_access):
    """
    make chat completion call with prompt > context window. expect it to work with fallback
    """

    async with aiohttp.ClientSession() as session:
        models = ["gpt-3.5-turbo"]

        if has_access:
            models.append("gpt-instruct")

        ## CREATE KEY WITH MODELS
        generated_key = await generate_key(session=session, i=0, models=models)
        calling_key = generated_key["key"]
        model = "gpt-3.5-turbo"
        messages = [
            {"role": "user", "content": "Who was Alexander?"},
        ]

        ## CALL PROXY
        try:
            await chat_completion(
                session=session,
                key=calling_key,
                model=model,
                messages=messages,
                mock_testing_fallbacks=True,
                fallbacks=["gpt-instruct"],
            )
            if not has_access:
                pytest.fail(
                    "Expected this to fail, submitted fallback model that key did not have access to"
                )
        except Exception as e:
            if has_access:
                pytest.fail("Expected this to work: {}".format(str(e)))


@pytest.mark.asyncio
async def test_chat_completion_with_retries():
    """
    make chat completion call with prompt > context window. expect it to work with fallback
    """
    async with aiohttp.ClientSession() as session:
        model = "fake-openai-endpoint-4"
        messages = [
            {"role": "system", "content": text},
            {"role": "user", "content": "Who was Alexander?"},
        ]
        response, headers = await chat_completion(
            session=session,
            key="sk-1234",
            model=model,
            messages=messages,
            mock_testing_rate_limit_error=True,
            return_headers=True,
        )
        print(f"headers: {headers}")
        assert headers["x-litellm-attempted-retries"] == "1"
        assert headers["x-litellm-max-retries"] == "50"


@pytest.mark.asyncio
async def test_chat_completion_with_fallbacks():
    """
    make chat completion call with prompt > context window. expect it to work with fallback
    """
    async with aiohttp.ClientSession() as session:
        model = "badly-configured-openai-endpoint"
        messages = [
            {"role": "system", "content": text},
            {"role": "user", "content": "Who was Alexander?"},
        ]
        response, headers = await chat_completion(
            session=session,
            key="sk-1234",
            model=model,
            messages=messages,
            fallbacks=["fake-openai-endpoint-5"],
            return_headers=True,
        )
        print(f"headers: {headers}")
        assert headers["x-litellm-attempted-fallbacks"] == "1"


@pytest.mark.asyncio
async def test_chat_completion_with_timeout():
    """
    make chat completion call with low timeout and `mock_timeout`: true. Expect it to fail and correct timeout to be set in headers.
    """
    async with aiohttp.ClientSession() as session:
        model = "fake-openai-endpoint-5"
        messages = [
            {"role": "system", "content": text},
            {"role": "user", "content": "Who was Alexander?"},
        ]
        start_time = time.time()
        response, headers = await chat_completion(
            session=session,
            key="sk-1234",
            model=model,
            messages=messages,
            num_retries=0,
            mock_timeout=True,
            return_headers=True,
        )
        end_time = time.time()
        print(f"headers: {headers}")
        assert (
            headers["x-litellm-timeout"] == "1.0"
        )  # assert model-specific timeout used


@pytest.mark.asyncio
async def test_chat_completion_with_timeout_from_request():
    """
    make chat completion call with low timeout and `mock_timeout`: true. Expect it to fail and correct timeout to be set in headers.
    """
    async with aiohttp.ClientSession() as session:
        model = "fake-openai-endpoint-5"
        messages = [
            {"role": "system", "content": text},
            {"role": "user", "content": "Who was Alexander?"},
        ]
        extra_headers = {
            "x-litellm-timeout": "0.001",
        }
        start_time = time.time()
        response, headers = await chat_completion(
            session=session,
            key="sk-1234",
            model=model,
            messages=messages,
            num_retries=0,
            mock_timeout=True,
            extra_headers=extra_headers,
            return_headers=True,
        )
        end_time = time.time()
        print(f"headers: {headers}")
        assert (
            headers["x-litellm-timeout"] == "0.001"
        )  # assert model-specific timeout used


@pytest.mark.parametrize("has_access", [True, False])
@pytest.mark.asyncio
async def test_chat_completion_client_fallbacks_with_custom_message(has_access):
    """
    make chat completion call with prompt > context window. expect it to work with fallback
    """

    async with aiohttp.ClientSession() as session:
        models = ["gpt-3.5-turbo"]

        if has_access:
            models.append("gpt-instruct")

        ## CREATE KEY WITH MODELS
        generated_key = await generate_key(session=session, i=0, models=models)
        calling_key = generated_key["key"]
        model = "gpt-3.5-turbo"
        messages = [
            {"role": "user", "content": "Who was Alexander?"},
        ]

        ## CALL PROXY
        try:
            await chat_completion(
                session=session,
                key=calling_key,
                model=model,
                messages=messages,
                mock_testing_fallbacks=True,
                fallbacks=[
                    {
                        "model": "gpt-instruct",
                        "messages": [
                            {
                                "role": "assistant",
                                "content": "This is a custom message",
                            }
                        ],
                    }
                ],
            )
            if not has_access:
                pytest.fail(
                    "Expected this to fail, submitted fallback model that key did not have access to"
                )
        except Exception as e:
            if has_access:
                pytest.fail("Expected this to work: {}".format(str(e)))


import asyncio
from openai import AsyncOpenAI
from typing import List
import time


async def make_request(client: AsyncOpenAI, model: str) -> bool:
    try:
        await client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": "Who was Alexander?"}],
        )
        return True
    except Exception as e:
        print(f"Error with {model}: {str(e)}")
        return False


async def run_good_model_test(client: AsyncOpenAI, num_requests: int) -> bool:
    tasks = [make_request(client, "good-model") for _ in range(num_requests)]
    good_results = await asyncio.gather(*tasks)
    return all(good_results)


@pytest.mark.asyncio
async def test_chat_completion_bad_and_good_model():
    """
    Prod test - ensure even if bad model is down, good model is still working.
    """
    client = AsyncOpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
    num_requests = 100
    num_iterations = 3

    for iteration in range(num_iterations):
        print(f"\nIteration {iteration + 1}/{num_iterations}")
        start_time = time.time()

        # Fire and forget bad model requests
        for _ in range(num_requests):
            asyncio.create_task(make_request(client, "bad-model"))

        # Wait only for good model requests
        success = await run_good_model_test(client, num_requests)
        print(
            f"Iteration {iteration + 1}: {'✓' if success else '✗'} ({time.time() - start_time:.2f}s)"
        )
        assert success, "Not all good model requests succeeded"