Litellm dev 11 07 2024 (#6649)
* fix(streaming_handler.py): save finish_reasons which might show up mid-stream (store last received one). Fixes https://github.com/BerriAI/litellm/issues/6104
* refactor: add readme to litellm_core_utils/ to make it easier to navigate
* fix(team_endpoints.py): return team id + object for invalid team in `/team/list`
* fix(streaming_handler.py): remove import
* fix(pattern_match_deployments.py): default to user input if unable to map based on wildcards (#6646)
* fix(pattern_match_deployments.py): default to user input if unable to… (#6632)
  * fix(pattern_match_deployments.py): default to user input if unable to map based on wildcards
  * test: fix test
  * test: reset test name
  * test: update conftest to reload proxy server module between tests
  * ci(config.yml): move langfuse out of local_testing to reduce ci/cd time
  * ci(config.yml): cleanup langfuse ci/cd tests
  * fix: update test to not use global proxy_server app module
  * ci: move caching to a separate test pipeline to speed up ci pipeline
  * test: update conftest to check if proxy_server attr exists before reloading
  * build(conftest.py): don't block on inability to reload proxy_server
  * ci(config.yml): update caching unit test filter to work on 'cache' keyword as well
  * fix(encrypt_decrypt_utils.py): use function to get salt key
  * test: mark flaky test
  * test: handle anthropic overloaded errors
  * refactor: create separate ci/cd pipeline for proxy unit tests to make ci/cd faster
  * ci(config.yml): add litellm_proxy_unit_testing to build_and_test jobs
  * ci(config.yml): generate prisma binaries for proxy unit tests
  * test: readd vertex_key.json
  * ci(config.yml): remove `-s` from proxy_unit_test cmd to speed up test
  * ci: remove any 'debug' logging flag to speed up ci pipeline
  * test: fix test
  * test(test_braintrust.py): rerun
  * test: add delay for braintrust test
* chore: comment for maritalk (#6607)
* Update gpt-4o-2024-08-06, o1-preview, and o1-mini models in model cost map (#6654)
  * Adding supports_response_schema to gpt-4o-2024-08-06 models
  * o1 models do not support vision
* (QOL improvement) add unit testing for all static_methods in litellm_logging.py (#6640)
  * add unit testing for standard logging payload
  * unit testing for static methods in litellm_logging
  * add code coverage check for litellm_logging
  * litellm_logging_code_coverage
  * test_get_final_response_obj
  * fix validate_redacted_message_span_attributes
  * test validate_redacted_message_span_attributes
* (feat) log error class, function_name on prometheus service failure hook + only log DB-related failures on DB service hook (#6650)
  * log error on prometheus service failure hook
  * use a more accurate function name for wrapper that handles logging db metrics
  * fix log_db_metrics
  * test_log_db_metrics_failure_error_types
  * fix linting
  * fix auth checks
* Update several Azure AI models in model cost map (#6655)
  * Adding Azure Phi 3/3.5 models to model cost map
  * Update gpt-4o-mini models
  * Adding missing Azure Mistral models to model cost map
  * Adding Azure Llama3.2 models to model cost map
  * Fix Gemini-1.5-flash pricing
  * Fix Gemini-1.5-flash output pricing
  * Fix Gemini-1.5-pro prices
  * Fix Gemini-1.5-flash output prices
  * Correct gemini-1.5-pro prices
  * Correction on Vertex Llama3.2 entry
* fix(streaming_handler.py): fix linting error
* test: remove duplicate test (causes gemini ratelimit error)

Co-authored-by: nobuo kawasaki <nobu007@users.noreply.github.com>
Co-authored-by: Emerson Gomes <emerson.gomes@gmail.com>
Co-authored-by: Emerson Gomes <emerson.gomes@thalesgroup.com>
Co-authored-by: Ishaan Jaff <ishaanjaffer0324@gmail.com>
This commit is contained in:
parent
9f2053e4af
commit
1bef6457c7
10 changed files with 4253 additions and 2151 deletions
litellm/litellm_core_utils/README.md (new file, 11 lines)
@@ -0,0 +1,11 @@

## Folder Contents

This folder contains general-purpose utilities that are used in multiple places in the codebase.

Core files:

- `streaming_handler.py`: The core streaming logic + streaming-related helper utils
- `core_helpers.py`: code used in `types/` - e.g. `map_finish_reason`.
- `exception_mapping_utils.py`: utils for mapping exceptions to openai-compatible error types.
- `default_encoding.py`: code for loading the default encoding (tiktoken)
- `get_llm_provider_logic.py`: code for inferring the LLM provider from a given model name.
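As a quick illustration of the last bullet, a hedged sketch of calling the provider-inference helper from application code; the package-level re-export and the 4-tuple return shape follow common litellm usage and are assumptions, not part of this diff:

```python
import litellm

# Assumption: get_llm_provider is re-exported at the package level, as in
# typical litellm usage; it infers the provider from the model string.
model, custom_llm_provider, dynamic_api_key, api_base = litellm.get_llm_provider(
    model="gemini/gemini-1.5-pro"
)
print(model, custom_llm_provider)  # expected: "gemini-1.5-pro" "gemini"
```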
@@ -3,6 +3,8 @@

import os
from typing import TYPE_CHECKING, Any, List, Literal, Optional, Tuple, Union

import httpx

from litellm._logging import verbose_logger

if TYPE_CHECKING:

@@ -99,3 +101,28 @@ def _get_parent_otel_span_from_kwargs(
            "Error in _get_parent_otel_span_from_kwargs: " + str(e)
        )
        return None


def process_response_headers(response_headers: Union[httpx.Headers, dict]) -> dict:
    from litellm.types.utils import OPENAI_RESPONSE_HEADERS

    openai_headers = {}
    processed_headers = {}
    additional_headers = {}

    for k, v in response_headers.items():
        if k in OPENAI_RESPONSE_HEADERS:  # return openai-compatible headers
            openai_headers[k] = v
        if k.startswith(
            "llm_provider-"
        ):  # return raw provider headers (incl. openai-compatible ones)
            processed_headers[k] = v
        else:
            additional_headers["{}-{}".format("llm_provider", k)] = v

    additional_headers = {
        **openai_headers,
        **processed_headers,
        **additional_headers,
    }
    return additional_headers
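A minimal usage sketch of the new `process_response_headers` helper, assuming the hunk above belongs to `litellm/litellm_core_utils/core_helpers.py` (the file header was not captured in this view); the header names and values are made up for illustration:

```python
import httpx

# Assumption: the helper lives in core_helpers.py alongside
# _get_parent_otel_span_from_kwargs, as the hunk above suggests.
from litellm.litellm_core_utils.core_helpers import process_response_headers

raw_headers = httpx.Headers(
    {
        "x-ratelimit-remaining-requests": "99",  # assumed to be in OPENAI_RESPONSE_HEADERS
        "llm_provider-x-request-id": "abc123",   # already provider-prefixed
        "server": "uvicorn",                     # any other header gets prefixed
    }
)

headers = process_response_headers(raw_headers)
# openai-compatible keys pass through under their own name (and also get an
# "llm_provider-" prefixed copy); other keys only appear with the prefix.
print(headers)
```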
litellm/litellm_core_utils/default_encoding.py (new file, 21 lines)
@@ -0,0 +1,21 @@

import os

import litellm

try:
    # New and recommended way to access resources
    from importlib import resources

    filename = str(resources.files(litellm).joinpath("llms/tokenizers"))
except (ImportError, AttributeError):
    # Old way to access resources, which setuptools deprecated some time ago
    import pkg_resources  # type: ignore

    filename = pkg_resources.resource_filename(__name__, "llms/tokenizers")

os.environ["TIKTOKEN_CACHE_DIR"] = os.getenv(
    "CUSTOM_TIKTOKEN_CACHE_DIR", filename
)  # use local copy of tiktoken b/c of - https://github.com/BerriAI/litellm/issues/1071
import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")
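A short, hedged usage sketch of the module above; the sample string and cache path are illustrative only:

```python
import os

# Optional: point tiktoken at a custom cache directory. This must be set
# before default_encoding is imported, since the module reads it at import time.
os.environ["CUSTOM_TIKTOKEN_CACHE_DIR"] = "/tmp/tiktoken-cache"  # example path

from litellm.litellm_core_utils.default_encoding import encoding

tokens = encoding.encode("Hey, how's it going?")
print(len(tokens))              # rough token count for the sample string
print(encoding.decode(tokens))  # round-trips back to the original text
```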
litellm/litellm_core_utils/rules.py (new file, 50 lines)
@@ -0,0 +1,50 @@

from typing import Optional

import litellm


class Rules:
    """
    Fail calls based on the input or llm api output

    Example usage:
    import litellm
    def my_custom_rule(input): # receives the model response
        if "i don't think i can answer" in input: # trigger fallback if the model refuses to answer
            return False
        return True

    litellm.post_call_rules = [my_custom_rule] # have these be functions that can be called to fail a call

    response = litellm.completion(model="gpt-3.5-turbo", messages=[{"role": "user",
        "content": "Hey, how's it going?"}], fallbacks=["openrouter/mythomax"])
    """

    def __init__(self) -> None:
        pass

    def pre_call_rules(self, input: str, model: str):
        for rule in litellm.pre_call_rules:
            if callable(rule):
                decision = rule(input)
                if decision is False:
                    raise litellm.APIResponseValidationError(message="LLM Response failed post-call-rule check", llm_provider="", model=model)  # type: ignore
        return True

    def post_call_rules(self, input: Optional[str], model: str) -> bool:
        if input is None:
            return True
        for rule in litellm.post_call_rules:
            if callable(rule):
                decision = rule(input)
                if isinstance(decision, bool):
                    if decision is False:
                        raise litellm.APIResponseValidationError(message="LLM Response failed post-call-rule check", llm_provider="", model=model)  # type: ignore
                elif isinstance(decision, dict):
                    decision_val = decision.get("decision", True)
                    decision_message = decision.get(
                        "message", "LLM Response failed post-call-rule check"
                    )
                    if decision_val is False:
                        raise litellm.APIResponseValidationError(message=decision_message, llm_provider="", model=model)  # type: ignore
        return True
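The class docstring already sketches the intended flow; below is a slightly fuller, hedged example of wiring up a post-call rule that returns a dict decision. The refusal phrase and model name are placeholders:

```python
import litellm
from litellm.litellm_core_utils.rules import Rules


def block_refusals(output: str):
    # A dict return lets the rule attach a custom failure message.
    if "i don't think i can answer" in output.lower():
        return {"decision": False, "message": "Model refused to answer"}
    return {"decision": True}


litellm.post_call_rules = [block_refusals]

rules = Rules()
try:
    rules.post_call_rules(input="I don't think I can answer that.", model="gpt-3.5-turbo")
except litellm.APIResponseValidationError as e:
    print(f"post-call rule rejected the response: {e}")
```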
litellm/litellm_core_utils/streaming_handler.py (new file, 2020 lines)
File diff suppressed because it is too large.
@@ -1,14 +0,0 @@

-from litellm.types.utils import GenericStreamingChunk as GChunk
-
-
-def generic_chunk_has_all_required_fields(chunk: dict) -> bool:
-    """
-    Checks if the provided chunk dictionary contains all required fields for GenericStreamingChunk.
-
-    :param chunk: The dictionary to check.
-    :return: True if all required fields are present, False otherwise.
-    """
-    _all_fields = GChunk.__annotations__
-
-    decision = all(key in _all_fields for key in chunk)
-    return decision
@@ -1,5 +1,5 @@

 import json
-from typing import Optional
+from typing import List, Optional

 import litellm
 from litellm import verbose_logger

@@ -10,7 +10,7 @@ from litellm.types.llms.openai import (
     ChatCompletionToolCallFunctionChunk,
     ChatCompletionUsageBlock,
 )
-from litellm.types.utils import GenericStreamingChunk
+from litellm.types.utils import GenericStreamingChunk, ModelResponse, Usage


 class ModelResponseIterator:
@@ -1281,12 +1281,20 @@ async def list_team(
             where={"team_id": team.team_id}
         )

-        returned_responses.append(
-            TeamListResponseObject(
-                **team.model_dump(),
-                team_memberships=_team_memberships,
-                keys=keys,
-            )
-        )
+        try:
+            returned_responses.append(
+                TeamListResponseObject(
+                    **team.model_dump(),
+                    team_memberships=_team_memberships,
+                    keys=keys,
+                )
+            )
+        except Exception as e:
+            team_exception = """Invalid team object for team_id: {}. team_object={}.
+            Error: {}
+            """.format(
+                team.team_id, team.model_dump(), str(e)
+            )
+            raise HTTPException(status_code=400, detail={"error": team_exception})

     return returned_responses
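For context, a hedged sketch of how this change looks from a client's side: listing teams against a running proxy now returns an explicit 400 that names the offending `team_id` instead of failing with an unhandled validation error. The base URL and admin key below are placeholders:

```python
import httpx

PROXY_BASE_URL = "http://localhost:4000"  # placeholder proxy address
ADMIN_KEY = "sk-1234"                     # placeholder admin key

resp = httpx.get(
    f"{PROXY_BASE_URL}/team/list",
    headers={"Authorization": f"Bearer {ADMIN_KEY}"},
)

if resp.status_code == 400:
    # With the hunk above, the error detail includes the invalid team_id
    # and the serialized team object.
    print(resp.json())
else:
    for team in resp.json():
        print(team)
```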
litellm/utils.py (4157 changed lines)
File diff suppressed because it is too large.
@@ -3470,6 +3470,86 @@ def test_unit_test_custom_stream_wrapper_repeating_chunk(
            continue


def test_unit_test_gemini_streaming_content_filter():
    chunks = [
        {
            "text": "##",
            "tool_use": None,
            "is_finished": False,
            "finish_reason": "stop",
            "usage": {"prompt_tokens": 37, "completion_tokens": 1, "total_tokens": 38},
            "index": 0,
        },
        {
            "text": "",
            "is_finished": False,
            "finish_reason": "",
            "usage": None,
            "index": 0,
            "tool_use": None,
        },
        {
            "text": " Downsides of Prompt Hacking in a Customer Portal\n\nWhile prompt engineering can be incredibly",
            "tool_use": None,
            "is_finished": False,
            "finish_reason": "stop",
            "usage": {"prompt_tokens": 37, "completion_tokens": 17, "total_tokens": 54},
            "index": 0,
        },
        {
            "text": "",
            "is_finished": False,
            "finish_reason": "",
            "usage": None,
            "index": 0,
            "tool_use": None,
        },
        {
            "text": "",
            "tool_use": None,
            "is_finished": False,
            "finish_reason": "content_filter",
            "usage": {"prompt_tokens": 37, "completion_tokens": 17, "total_tokens": 54},
            "index": 0,
        },
        {
            "text": "",
            "is_finished": False,
            "finish_reason": "",
            "usage": None,
            "index": 0,
            "tool_use": None,
        },
    ]

    completion_stream = ModelResponseListIterator(model_responses=chunks)

    response = litellm.CustomStreamWrapper(
        completion_stream=completion_stream,
        model="gemini/gemini-1.5-pro",
        custom_llm_provider="gemini",
        logging_obj=litellm.Logging(
            model="gemini/gemini-1.5-pro",
            messages=[{"role": "user", "content": "Hey"}],
            stream=True,
            call_type="completion",
            start_time=time.time(),
            litellm_call_id="12345",
            function_id="1245",
        ),
    )

    stream_finish_reason: Optional[str] = None
    idx = 0
    for chunk in response:
        print(f"chunk: {chunk}")
        if chunk.choices[0].finish_reason is not None:
            stream_finish_reason = chunk.choices[0].finish_reason
        idx += 1
    print(f"num chunks: {idx}")
    assert stream_finish_reason == "content_filter"


def test_unit_test_custom_stream_wrapper_openai():
    """
    Test if last streaming chunk ends with '?', if the message repeats itself.
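The behavior this test pins down (from the first bullet of the commit message) is essentially: remember the last non-empty finish_reason seen anywhere in the stream, so a later `content_filter` overrides an earlier `stop`. A standalone sketch of that idea, not the actual `CustomStreamWrapper` implementation:

```python
from typing import Optional


class FinishReasonTracker:
    """Toy stand-in for the fix the test exercises: keep the most recent
    non-empty finish_reason, even when it shows up mid-stream."""

    def __init__(self) -> None:
        self.finish_reason: Optional[str] = None

    def observe(self, chunk: dict) -> None:
        reason = chunk.get("finish_reason") or None  # treat "" as missing
        if reason is not None:
            # A later chunk (e.g. "content_filter") overrides an earlier "stop".
            self.finish_reason = reason


tracker = FinishReasonTracker()
for chunk in [
    {"finish_reason": "stop"},
    {"finish_reason": ""},
    {"finish_reason": "content_filter"},
    {"finish_reason": ""},
]:
    tracker.observe(chunk)

assert tracker.finish_reason == "content_filter"
```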