litellm-mirror/tests/local_testing/test_azure_perf.py
Krish Dholakia 859b47f08b
LiteLLM Minor Fixes & Improvements (11/29/2024) (#6965)
* fix(factory.py): ensure tool call converts image url

Fixes https://github.com/BerriAI/litellm/issues/6953

* fix(transformation.py): support mp4 + pdf urls for vertex ai

Fixes https://github.com/BerriAI/litellm/issues/6936

* fix(http_handler.py): mask gemini api key in error logs

Fixes https://github.com/BerriAI/litellm/issues/6963

* docs(prometheus.md): update prometheus FAQs

* feat(auth_checks.py): ensure specific model access > wildcard model access

if the wildcard model is in the access group but the specific model is not, deny access

* fix(auth_checks.py): handle auth checks for team based model access groups

handles the scenario where a model access group is used for wildcard models

* fix(internal_user_endpoints.py): support adding guardrails on `/user/update`

Fixes https://github.com/BerriAI/litellm/issues/6942

* fix(key_management_endpoints.py): fix prepare_metadata_fields helper

* fix: fix tests

* build(requirements.txt): bump openai dep version

fixes proxies argument

* test: fix tests

* fix(http_handler.py): fix error message masking

* fix(bedrock_guardrails.py): pass in prepped data

* test: fix test

* test: fix nvidia nim test

* fix(http_handler.py): return original response headers

* fix: revert maskedhttpstatuserror

* test: update tests

* test: cleanup test

* fix(key_management_endpoints.py): fix metadata field update logic

* fix(key_management_endpoints.py): maintain initial order of guardrails in key update

* fix(key_management_endpoints.py): handle prepare metadata

* fix: fix linting errors

* fix: fix linting errors

* fix: fix linting errors

* fix: fix key management errors

* fix(key_management_endpoints.py): update metadata

* test: update test

* refactor: add more debug statements

* test: skip flaky test

* test: fix test

* fix: fix test

* fix: fix update metadata logic

* fix: fix test

* ci(config.yml): change db url for e2e ui testing
2024-12-01 05:24:11 -08:00


# #### What this tests ####
# #  This adds perf testing to the router, to ensure its time to first token is never more than 0.5s slower than the azure-openai sdk (the bound asserted below).
# import sys, os, time, inspect, asyncio, traceback
# from datetime import datetime
# import pytest

# sys.path.insert(0, os.path.abspath("../.."))
# import openai, litellm, uuid
# from openai import AsyncAzureOpenAI

# client = AsyncAzureOpenAI(
#     api_key=os.getenv("AZURE_API_KEY"),
#     azure_endpoint=os.getenv("AZURE_API_BASE"),  # type: ignore
#     api_version=os.getenv("AZURE_API_VERSION"),
# )

# model_list = [
#     {
#         "model_name": "azure-test",
#         "litellm_params": {
#             "model": "azure/chatgpt-v-2",
#             "api_key": os.getenv("AZURE_API_KEY"),
#             "api_base": os.getenv("AZURE_API_BASE"),
#             "api_version": os.getenv("AZURE_API_VERSION"),
#         },
#     }
# ]

# router = litellm.Router(model_list=model_list)  # type: ignore

# async def _openai_completion():
#     try:
#         start_time = time.time()
#         response = await client.chat.completions.create(
#             model="chatgpt-v-2",
#             messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
#             stream=True,
#         )
#         time_to_first_token = None
#         first_token_ts = None
#         init_chunk = None
#         async for chunk in response:
#             if (
#                 time_to_first_token is None
#                 and len(chunk.choices) > 0
#                 and chunk.choices[0].delta.content is not None
#             ):
#                 first_token_ts = time.time()
#                 time_to_first_token = first_token_ts - start_time
#                 init_chunk = chunk
#         end_time = time.time()
#         print(
#             "OpenAI Call: ",
#             init_chunk,
#             start_time,
#             first_token_ts,
#             time_to_first_token,
#             end_time,
#         )
#         return time_to_first_token
#     except Exception as e:
#         print(e)
#         return None

# async def _router_completion():
#     try:
#         start_time = time.time()
#         response = await router.acompletion(
#             model="azure-test",
#             messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
#             stream=True,
#         )
#         time_to_first_token = None
#         first_token_ts = None
#         init_chunk = None
#         async for chunk in response:
#             if (
#                 time_to_first_token is None
#                 and len(chunk.choices) > 0
#                 and chunk.choices[0].delta.content is not None
#             ):
#                 first_token_ts = time.time()
#                 time_to_first_token = first_token_ts - start_time
#                 init_chunk = chunk
#         end_time = time.time()
#         print(
#             "Router Call: ",
#             init_chunk,
#             start_time,
#             first_token_ts,
#             time_to_first_token,
#             end_time - first_token_ts,
#         )
#         return time_to_first_token
#     except Exception as e:
#         print(e)
#         return None

# async def test_azure_completion_streaming():
#     """
#     Test azure streaming call - measure time to first (non-null) token.
#     """
#     n = 3  # Number of concurrent tasks
#     ## OPENAI AVG. TIME
#     tasks = [_openai_completion() for _ in range(n)]
#     chat_completions = await asyncio.gather(*tasks)
#     successful_completions = [c for c in chat_completions if c is not None]
#     total_time = 0
#     for item in successful_completions:
#         total_time += item
#     avg_openai_time = total_time / len(successful_completions)  # average only the runs that returned a first token
#     ## ROUTER AVG. TIME
#     tasks = [_router_completion() for _ in range(n)]
#     chat_completions = await asyncio.gather(*tasks)
#     successful_completions = [c for c in chat_completions if c is not None]
#     total_time = 0
#     for item in successful_completions:
#         total_time += item
#     avg_router_time = total_time / len(successful_completions)  # average only the runs that returned a first token
#     ## COMPARE
#     print(f"avg_router_time: {avg_router_time}; avg_openai_time: {avg_openai_time}")
#     assert avg_router_time < avg_openai_time + 0.5

# # asyncio.run(test_azure_completion_streaming())
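
Below is a minimal sketch of how the disabled comparison above could be re-enabled as a live test under the pytest-asyncio plugin. It assumes the commented-out _openai_completion and _router_completion helpers are restored; the test name, the statistics.mean averaging, the skip guard, and the reuse of the 0.5s tolerance are illustrative assumptions, not code that exists in the repository.

import asyncio
import statistics

import pytest


@pytest.mark.asyncio
async def test_router_ttft_overhead():
    # Hypothetical re-enabled version of the commented-out test above.
    n = 3  # number of concurrent streaming calls per client
    openai_ttfts = [
        t
        for t in await asyncio.gather(*[_openai_completion() for _ in range(n)])
        if t is not None
    ]
    router_ttfts = [
        t
        for t in await asyncio.gather(*[_router_completion() for _ in range(n)])
        if t is not None
    ]
    # Skip instead of dividing by zero when every call failed (e.g. missing Azure credentials).
    if not openai_ttfts or not router_ttfts:
        pytest.skip("no successful streaming completions to compare")
    # Router time-to-first-token should stay within 0.5s of the raw AsyncAzureOpenAI SDK call.
    assert statistics.mean(router_ttfts) < statistics.mean(openai_ttfts) + 0.5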