#### What this tests ####
# This tests the litellm Router with batch completion:
# one request fanned out to multiple model deployments.

import os
import sys

import pytest
from dotenv import load_dotenv

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path, so the local litellm is imported

import litellm
from litellm import Router

load_dotenv()


@pytest.mark.parametrize("mode", ["all_responses", "fastest_response"])
@pytest.mark.asyncio
async def test_batch_completion_multiple_models(mode):
    litellm.set_verbose = True

    router = litellm.Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                },
            },
            {
                "model_name": "groq-llama",
                "litellm_params": {
                    "model": "groq/llama3-8b-8192",
                },
            },
        ]
    )

    if mode == "all_responses":
        # abatch_completion sends the same request to every listed model
        # and returns one response per model.
        response = await router.abatch_completion(
            models=["gpt-3.5-turbo", "groq-llama"],
            messages=[
                {"role": "user", "content": "is litellm becoming a better product?"}
            ],
            max_tokens=15,
        )

        print(response)
        assert len(response) == 2

        models_in_responses = []
        for individual_response in response:
            _model = individual_response["model"]
            models_in_responses.append(_model)

        # assert both responses came from different models
        assert models_in_responses[0] != models_in_responses[1]
    elif mode == "fastest_response":
        from openai.types.chat.chat_completion import ChatCompletion

        # abatch_completion_fastest_response races the listed models and
        # returns only the first response to arrive.
        response = await router.abatch_completion_fastest_response(
            model="gpt-3.5-turbo, groq-llama",
            messages=[
                {"role": "user", "content": "is litellm becoming a better product?"}
            ],
            max_tokens=15,
        )

        # the winning response must still be a valid OpenAI ChatCompletion
        ChatCompletion.model_validate(response.model_dump(), strict=True)


@pytest.mark.asyncio
async def test_batch_completion_fastest_response_unit_test():
    """
    Unit test to confirm fastest_response always returns the response that arrives first.

    2 models -> one is mocked (returns instantly), the other is a real LLM API call
    => assert the mocked response is always the one returned.
    """
    litellm.set_verbose = True

    router = litellm.Router(
        model_list=[
            {
                "model_name": "gpt-4",
                "litellm_params": {
                    "model": "gpt-4",
                },
                "model_info": {"id": "1"},
            },
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                    # mock_response short-circuits the API call, so this
                    # deployment always responds first
                    "mock_response": "This is a fake response",
                },
                "model_info": {"id": "2"},
            },
        ]
    )

    response = await router.abatch_completion_fastest_response(
        model="gpt-4, gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": "is litellm becoming a better product?"}
        ],
        max_tokens=500,
    )

    # the mocked deployment (id "2") should always win the race
    assert response._hidden_params["model_id"] == "2"
    assert response.choices[0].message.content == "This is a fake response"
    print(f"response: {response}")