LiteLLM Minor Fixes & Improvements (10/07/2024) (#6101)

* fix(utils.py): support dropping temperature param for azure o1 models
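
  A minimal usage sketch of the intended behavior (not the internal implementation): the Azure deployment name is a placeholder, and this assumes the per-call `drop_params` flag is used to strip parameters the model rejects.

  ```python
  import litellm

  # Sketch only: o1 models reject `temperature`, so ask litellm to drop
  # unsupported params instead of erroring. Model/deployment name is a placeholder.
  response = litellm.completion(
      model="azure/o1-preview",
      messages=[{"role": "user", "content": "ping"}],
      temperature=0.2,   # would normally be rejected by o1
      drop_params=True,  # drop it instead of failing the request
  )
  ```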

* fix(main.py): handle azure o1 streaming requests

o1 doesn't support streaming, so fake it to ensure calling code works as expected
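
  The idea, roughly (a sketch, not litellm's actual implementation): make the regular non-streaming call, then wrap the complete response in a single-chunk iterator so callers that passed `stream=True` still get something iterable. The `fake_stream` helper below is illustrative, not a litellm internal.

  ```python
  from typing import Iterator


  def fake_stream(full_text: str, model: str) -> Iterator[dict]:
      """Sketch: yield the complete response as one chat.completion.chunk-shaped dict."""
      yield {
          "object": "chat.completion.chunk",
          "model": model,
          "choices": [
              {"index": 0, "delta": {"content": full_text}, "finish_reason": "stop"}
          ],
      }


  # Usage: for chunk in fake_stream(response_text, "azure/o1-preview"): ...
  ```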

* feat(utils.py): expose `hosted_vllm/` endpoint, with tool handling for vllm

Fixes https://github.com/BerriAI/litellm/issues/6088
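
  Example call against the new route (the `api_base` and model name below are placeholders for a local vLLM server exposing the OpenAI-compatible API):

  ```python
  import litellm

  # Assumes a vLLM OpenAI-compatible server at this address; adjust for your deployment.
  response = litellm.completion(
      model="hosted_vllm/meta-llama/Llama-3.1-8B-Instruct",
      api_base="http://localhost:8000/v1",
      messages=[{"role": "user", "content": "ping"}],
  )
  print(response.choices[0].message.content)
  ```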

* refactor(internal_user_endpoints.py): cleanup unused params + update docstring

Closes https://github.com/BerriAI/litellm/issues/6100

* fix(main.py): expose custom image generation api support

Fixes https://github.com/BerriAI/litellm/issues/6097
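
  Rough sketch of the pattern the new docs describe: subclass `CustomLLM`, implement `image_generation`, and register the handler under a custom provider name. The method signature is abbreviated here, and the provider/model names and returned URL are placeholders; see `custom_llm_server.md` for the full interface.

  ```python
  import litellm
  from litellm import CustomLLM
  from litellm.types.utils import ImageObject, ImageResponse


  class MyCustomImageLLM(CustomLLM):
      # Signature abbreviated for the sketch; the real hook receives more arguments.
      def image_generation(self, *args, **kwargs) -> ImageResponse:
          return ImageResponse(
              created=1725635339,
              data=[ImageObject(url="https://example.com/image.png")],
          )


  litellm.custom_provider_map = [
      {"provider": "my-custom-llm", "custom_handler": MyCustomImageLLM()}
  ]

  image = litellm.image_generation(
      model="my-custom-llm/fake-image-model", prompt="a sunset"
  )
  ```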

* fix: fix linting errors

* docs(custom_llm_server.md): add docs on custom api for image gen calls

* fix(types/utils.py): handle dict type

* fix(types/utils.py): fix linting errors
Commit 6729c9ca7f (parent 5de69cb1b2)
Author: Krish Dholakia, 2024-10-08 01:17:22 -04:00 (committed by GitHub)
17 changed files with 643 additions and 76 deletions


@@ -4223,7 +4223,8 @@ def mock_post(*args, **kwargs):
     return mock_response


-def test_completion_vllm():
+@pytest.mark.parametrize("provider", ["openai", "hosted_vllm"])
+def test_completion_vllm(provider):
     """
     Asserts a text completion call for vllm actually goes to the text completion endpoint
     """
@@ -4235,7 +4236,10 @@ def test_completion_vllm():
         client.completions.with_raw_response, "create", side_effect=mock_post
     ) as mock_call:
         response = text_completion(
-            model="openai/gemini-1.5-flash", prompt="ping", client=client, hello="world"
+            model="{provider}/gemini-1.5-flash".format(provider=provider),
+            prompt="ping",
+            client=client,
+            hello="world",
         )

         print("raw response", response)