Mirror of https://github.com/BerriAI/litellm.git, synced 2025-04-25 02:34:29 +00:00
LiteLLM Minor Fixes & Improvements (11/05/2024) (#6590)
* fix(pattern_matching_router.py): update model name using correct function
* fix(langfuse.py): metadata deepcopy can cause unhandled error (#6563)
Co-authored-by: seva <seva@inita.com>
* fix(stream_chunk_builder_utils.py): correctly set prompt tokens + log correct streaming usage
Closes https://github.com/BerriAI/litellm/issues/6488
* build(deps): bump cookie and express in /docs/my-website (#6566)
Bumps [cookie](https://github.com/jshttp/cookie) and [express](https://github.com/expressjs/express). These dependencies needed to be updated together.
Updates `cookie` from 0.6.0 to 0.7.1
- [Release notes](https://github.com/jshttp/cookie/releases)
- [Commits](https://github.com/jshttp/cookie/compare/v0.6.0...v0.7.1)
Updates `express` from 4.20.0 to 4.21.1
- [Release notes](https://github.com/expressjs/express/releases)
- [Changelog](https://github.com/expressjs/express/blob/4.21.1/History.md)
- [Commits](https://github.com/expressjs/express/compare/4.20.0...4.21.1)
---
updated-dependencies:
- dependency-name: cookie
dependency-type: indirect
- dependency-name: express
dependency-type: indirect
...
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
* docs(virtual_keys.md): update Dockerfile reference (#6554)
Signed-off-by: Emmanuel Ferdman <emmanuelferdman@gmail.com>
* (proxy fix) - call connect on prisma client when running setup (#6534)
* critical fix - call connect on prisma client when running setup
* fix test_proxy_server_prisma_setup
* fix test_proxy_server_prisma_setup
* Add 3.5 haiku (#6588)
* feat: add claude-3-5-haiku-20241022 entries
* feat: add claude-3-5-haiku-20241022 and vertex_ai/claude-3-5-haiku@20241022 models
* add missing entries, remove vision
* remove image token costs
* Litellm perf improvements 3 (#6573)
* perf: move writing key to cache, to background task
* perf(litellm_pre_call_utils.py): add otel tracing for pre-call utils
adds 200ms on calls with pgdb connected
* fix(litellm_pre_call_utils.py): rename call_type to actual call used
* perf(proxy_server.py): remove db logic from _get_config_from_file
was causing db calls to occur on every llm request, if team_id was set on key
* fix(auth_checks.py): add check for reducing db calls if user/team id does not exist in db
reduces latency/call by ~100ms
* fix(proxy_server.py): minor fix on existing_settings not incl alerting
* fix(exception_mapping_utils.py): map databricks exception string
* fix(auth_checks.py): fix auth check logic
* test: correctly mark flaky test
* fix(utils.py): handle auth token error for tokenizers.from_pretrained
* build: fix map
* build: fix map
* build: fix json for model map
* fix ImageObject conversion (#6584)
* (fix) litellm.text_completion raises a non-blocking error on simple usage (#6546)
* unit test test_huggingface_text_completion_logprobs
* fix return TextCompletionHandler convert_chat_to_text_completion
* fix hf rest api
* fix test_huggingface_text_completion_logprobs
* fix linting errors
* fix importLiteLLMResponseObjectHandler
* fix test for LiteLLMResponseObjectHandler
* fix test text completion
* fix allow using 15 seconds for premium license check
* testing fix bedrock deprecated cohere.command-text-v14
* (feat) add `Predicted Outputs` for OpenAI (#6594)
* bump openai to openai==1.54.0
* add 'prediction' param
* testing fix bedrock deprecated cohere.command-text-v14
* test test_openai_prediction_param.py
* test_openai_prediction_param_with_caching
* doc Predicted Outputs
* doc Predicted Output
* (fix) Vertex Improve Performance when using `image_url` (#6593)
* fix transformation vertex
* test test_process_gemini_image
* test_image_completion_request
* testing fix - bedrock has deprecated cohere.command-text-v14
* fix vertex pdf
* bump: version 1.51.5 → 1.52.0
* fix(lowest_tpm_rpm_routing.py): fix parallel rate limit check (#6577)
* fix(lowest_tpm_rpm_routing.py): fix parallel rate limit check
* fix(lowest_tpm_rpm_v2.py): return headers in correct format
* test: update test
* build(deps): bump cookie and express in /docs/my-website (#6566)
Bumps [cookie](https://github.com/jshttp/cookie) and [express](https://github.com/expressjs/express). These dependencies needed to be updated together.
Updates `cookie` from 0.6.0 to 0.7.1
- [Release notes](https://github.com/jshttp/cookie/releases)
- [Commits](https://github.com/jshttp/cookie/compare/v0.6.0...v0.7.1)
Updates `express` from 4.20.0 to 4.21.1
- [Release notes](https://github.com/expressjs/express/releases)
- [Changelog](https://github.com/expressjs/express/blob/4.21.1/History.md)
- [Commits](https://github.com/expressjs/express/compare/4.20.0...4.21.1)
---
updated-dependencies:
- dependency-name: cookie
dependency-type: indirect
- dependency-name: express
dependency-type: indirect
...
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
* docs(virtual_keys.md): update Dockerfile reference (#6554)
Signed-off-by: Emmanuel Ferdman <emmanuelferdman@gmail.com>
* (proxy fix) - call connect on prisma client when running setup (#6534)
* critical fix - call connect on prisma client when running setup
* fix test_proxy_server_prisma_setup
* fix test_proxy_server_prisma_setup
* Add 3.5 haiku (#6588)
* feat: add claude-3-5-haiku-20241022 entries
* feat: add claude-3-5-haiku-20241022 and vertex_ai/claude-3-5-haiku@20241022 models
* add missing entries, remove vision
* remove image token costs
* Litellm perf improvements 3 (#6573)
* perf: move writing key to cache, to background task
* perf(litellm_pre_call_utils.py): add otel tracing for pre-call utils
adds 200ms on calls with pgdb connected
* fix(litellm_pre_call_utils.py): rename call_type to actual call used
* perf(proxy_server.py): remove db logic from _get_config_from_file
was causing db calls to occur on every llm request, if team_id was set on key
* fix(auth_checks.py): add check for reducing db calls if user/team id does not exist in db
reduces latency/call by ~100ms
* fix(proxy_server.py): minor fix on existing_settings not incl alerting
* fix(exception_mapping_utils.py): map databricks exception string
* fix(auth_checks.py): fix auth check logic
* test: correctly mark flaky test
* fix(utils.py): handle auth token error for tokenizers.from_pretrained
* build: fix map
* build: fix map
* build: fix json for model map
* test: remove eol model
* fix(proxy_server.py): fix db config loading logic
* fix(proxy_server.py): fix order of config / db updates, to ensure fields not overwritten
* test: skip test if required env var is missing
* test: fix test
---------
Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: Emmanuel Ferdman <emmanuelferdman@gmail.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Emmanuel Ferdman <emmanuelferdman@gmail.com>
Co-authored-by: Ishaan Jaff <ishaanjaffer0324@gmail.com>
Co-authored-by: paul-gauthier <69695708+paul-gauthier@users.noreply.github.com>
* test: mark flaky test
* test: handle anthropic api instability
* test(test_proxy_utils.py): add testing for db config update logic
* Update setuptools in docker and fastapi to latest version, in order to upgrade starlette version (#6597)
* build(deps): bump cookie and express in /docs/my-website (#6566)
Bumps [cookie](https://github.com/jshttp/cookie) and [express](https://github.com/expressjs/express). These dependencies needed to be updated together.
Updates `cookie` from 0.6.0 to 0.7.1
- [Release notes](https://github.com/jshttp/cookie/releases)
- [Commits](https://github.com/jshttp/cookie/compare/v0.6.0...v0.7.1)
Updates `express` from 4.20.0 to 4.21.1
- [Release notes](https://github.com/expressjs/express/releases)
- [Changelog](https://github.com/expressjs/express/blob/4.21.1/History.md)
- [Commits](https://github.com/expressjs/express/compare/4.20.0...4.21.1)
---
updated-dependencies:
- dependency-name: cookie
dependency-type: indirect
- dependency-name: express
dependency-type: indirect
...
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
* docs(virtual_keys.md): update Dockerfile reference (#6554)
Signed-off-by: Emmanuel Ferdman <emmanuelferdman@gmail.com>
* (proxy fix) - call connect on prisma client when running setup (#6534)
* critical fix - call connect on prisma client when running setup
* fix test_proxy_server_prisma_setup
* fix test_proxy_server_prisma_setup
* Add 3.5 haiku (#6588)
* feat: add claude-3-5-haiku-20241022 entries
* feat: add claude-3-5-haiku-20241022 and vertex_ai/claude-3-5-haiku@20241022 models
* add missing entries, remove vision
* remove image token costs
* Litellm perf improvements 3 (#6573)
* perf: move writing key to cache, to background task
* perf(litellm_pre_call_utils.py): add otel tracing for pre-call utils
adds 200ms on calls with pgdb connected
* fix(litellm_pre_call_utils.py): rename call_type to actual call used
* perf(proxy_server.py): remove db logic from _get_config_from_file
was causing db calls to occur on every llm request, if team_id was set on key
* fix(auth_checks.py): add check for reducing db calls if user/team id does not exist in db
reduces latency/call by ~100ms
* fix(proxy_server.py): minor fix on existing_settings not incl alerting
* fix(exception_mapping_utils.py): map databricks exception string
* fix(auth_checks.py): fix auth check logic
* test: correctly mark flaky test
* fix(utils.py): handle auth token error for tokenizers.from_pretrained
* build: fix map
* build: fix map
* build: fix json for model map
* fix ImageObject conversion (#6584)
* (fix) litellm.text_completion raises a non-blocking error on simple usage (#6546)
* unit test test_huggingface_text_completion_logprobs
* fix return TextCompletionHandler convert_chat_to_text_completion
* fix hf rest api
* fix test_huggingface_text_completion_logprobs
* fix linting errors
* fix importLiteLLMResponseObjectHandler
* fix test for LiteLLMResponseObjectHandler
* fix test text completion
* fix allow using 15 seconds for premium license check
* testing fix bedrock deprecated cohere.command-text-v14
* (feat) add `Predicted Outputs` for OpenAI (#6594)
* bump openai to openai==1.54.0
* add 'prediction' param
* testing fix bedrock deprecated cohere.command-text-v14
* test test_openai_prediction_param.py
* test_openai_prediction_param_with_caching
* doc Predicted Outputs
* doc Predicted Output
* (fix) Vertex Improve Performance when using `image_url` (#6593)
* fix transformation vertex
* test test_process_gemini_image
* test_image_completion_request
* testing fix - bedrock has deprecated cohere.command-text-v14
* fix vertex pdf
* bump: version 1.51.5 → 1.52.0
* Update setuptools in docker and fastapi to latest version, in order to upgrade starlette version
---------
Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: Emmanuel Ferdman <emmanuelferdman@gmail.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Emmanuel Ferdman <emmanuelferdman@gmail.com>
Co-authored-by: Ishaan Jaff <ishaanjaffer0324@gmail.com>
Co-authored-by: paul-gauthier <69695708+paul-gauthier@users.noreply.github.com>
Co-authored-by: Krish Dholakia <krrishdholakia@gmail.com>
Co-authored-by: Jacob Hagstedt <wcgs@novonordisk.com>
* fix(langfuse.py): fix linting errors
* fix: fix linting errors
* fix: fix casting error
* fix: fix typing error
* fix: add more tests
* fix(utils.py): fix return_processed_chunk_logic
* Revert "Update setuptools in docker and fastapi to latest verison, in order t…" (#6615)
This reverts commit 1a7f7bdfb7.
* docs fix clarify team_id on team based logging
* doc fix team based logging with langfuse
* fix flake8 checks
* test: bump sleep time
* refactor: replace claude-instant-1.2 with haiku in testing
* fix(proxy_server.py): move to using sl payload in track_cost_callback
* fix(proxy_server.py): fix linting errors
* fix(proxy_server.py): fallback to kwargs(response_cost) if given
* test: remove claude-instant-1 from tests
* test: fix claude test
* docs fix clarify team_id on team based logging
* doc fix team based logging with langfuse
* build: remove lint.yml
---------
Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: Emmanuel Ferdman <emmanuelferdman@gmail.com>
Co-authored-by: Vsevolod Karvetskiy <56288164+karvetskiy@users.noreply.github.com>
Co-authored-by: seva <seva@inita.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Emmanuel Ferdman <emmanuelferdman@gmail.com>
Co-authored-by: Ishaan Jaff <ishaanjaffer0324@gmail.com>
Co-authored-by: paul-gauthier <69695708+paul-gauthier@users.noreply.github.com>
Co-authored-by: Jacob Hagstedt P Suorra <Jacobh2@users.noreply.github.com>
Co-authored-by: Jacob Hagstedt <wcgs@novonordisk.com>
parent 66c1ee09cf
commit 136693cac4
32 changed files with 634 additions and 533 deletions
@@ -1,9 +1,9 @@
 #### What this does ####
 # On success, logs events to Langfuse
 import copy
-import inspect
 import os
 import traceback
+from collections.abc import MutableMapping, MutableSequence, MutableSet
 from typing import TYPE_CHECKING, Any, Dict, Optional
 
 from packaging.version import Version

@@ -14,7 +14,7 @@ from litellm._logging import verbose_logger
 from litellm.litellm_core_utils.redact_messages import redact_user_api_key_info
 from litellm.secret_managers.main import str_to_bool
 from litellm.types.integrations.langfuse import *
-from litellm.types.utils import StandardCallbackDynamicParams, StandardLoggingPayload
+from litellm.types.utils import StandardLoggingPayload
 
 if TYPE_CHECKING:
     from litellm.litellm_core_utils.litellm_logging import DynamicLoggingCache

@@ -355,6 +355,47 @@ class LangFuseLogger:
             )
         )
 
+    def _prepare_metadata(self, metadata) -> Any:
+        try:
+            return copy.deepcopy(metadata)  # Avoid modifying the original metadata
+        except (TypeError, copy.Error) as e:
+            verbose_logger.warning(f"Langfuse Layer Error - {e}")
+
+        new_metadata: Dict[str, Any] = {}
+
+        # if metadata is not a MutableMapping, return an empty dict since we can't call items() on it
+        if not isinstance(metadata, MutableMapping):
+            verbose_logger.warning(
+                "Langfuse Layer Logging - metadata is not a MutableMapping, returning empty dict"
+            )
+            return new_metadata
+
+        for key, value in metadata.items():
+            try:
+                if isinstance(value, MutableMapping):
+                    new_metadata[key] = self._prepare_metadata(value)
+                elif isinstance(value, (MutableSequence, MutableSet)):
+                    new_metadata[key] = type(value)(
+                        *(
+                            (
+                                self._prepare_metadata(v)
+                                if isinstance(v, MutableMapping)
+                                else copy.deepcopy(v)
+                            )
+                            for v in value
+                        )
+                    )
+                elif isinstance(value, BaseModel):
+                    new_metadata[key] = value.model_dump()
+                else:
+                    new_metadata[key] = copy.deepcopy(value)
+            except (TypeError, copy.Error):
+                verbose_logger.warning(
+                    f"Langfuse Layer Error - Couldn't copy metadata key: {key} - {traceback.format_exc()}"
+                )
+
+        return new_metadata
+
     def _log_langfuse_v2(  # noqa: PLR0915
         self,
         user_id,

@@ -373,40 +414,19 @@ class LangFuseLogger:
     ) -> tuple:
         import langfuse
 
+        print_verbose("Langfuse Layer Logging - logging to langfuse v2")
+
         try:
-            tags = []
-            try:
-                optional_params.pop("metadata")
-                metadata = copy.deepcopy(
-                    metadata
-                )  # Avoid modifying the original metadata
-            except Exception:
-                new_metadata = {}
-                for key, value in metadata.items():
-                    if (
-                        isinstance(value, list)
-                        or isinstance(value, dict)
-                        or isinstance(value, str)
-                        or isinstance(value, int)
-                        or isinstance(value, float)
-                    ):
-                        new_metadata[key] = copy.deepcopy(value)
-                    elif isinstance(value, BaseModel):
-                        new_metadata[key] = value.model_dump()
-                metadata = new_metadata
-
-            supports_tags = Version(langfuse.version.__version__) >= Version("2.6.3")
-            supports_prompt = Version(langfuse.version.__version__) >= Version("2.7.3")
-            supports_costs = Version(langfuse.version.__version__) >= Version("2.7.3")
-            supports_completion_start_time = Version(
-                langfuse.version.__version__
-            ) >= Version("2.7.3")
-
-            print_verbose("Langfuse Layer Logging - logging to langfuse v2 ")
-
-            if supports_tags:
-                metadata_tags = metadata.pop("tags", [])
-                tags = metadata_tags
+            metadata = self._prepare_metadata(metadata)
+
+            langfuse_version = Version(langfuse.version.__version__)
+
+            supports_tags = langfuse_version >= Version("2.6.3")
+            supports_prompt = langfuse_version >= Version("2.7.3")
+            supports_costs = langfuse_version >= Version("2.7.3")
+            supports_completion_start_time = langfuse_version >= Version("2.7.3")
+
+            tags = metadata.pop("tags", []) if supports_tags else []
 
             # Clean Metadata before logging - never log raw metadata
             # the raw metadata can contain circular references which leads to infinite recursion
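The new `_prepare_metadata` helper exists because `copy.deepcopy` on caller-supplied metadata can raise when a value is not copyable (thread locks, client handles, etc.). A minimal standalone sketch of the same fallback idea (not litellm's implementation; `prepare_metadata` and the sample metadata are illustrative):

```python
import copy
import threading
from collections.abc import MutableMapping


def prepare_metadata(metadata):
    """Deep-copy metadata; fall back to a key-by-key copy that skips
    values deepcopy cannot handle (illustrative sketch only)."""
    try:
        return copy.deepcopy(metadata)
    except (TypeError, copy.Error):
        pass
    if not isinstance(metadata, MutableMapping):
        return {}
    cleaned = {}
    for key, value in metadata.items():
        try:
            cleaned[key] = copy.deepcopy(value)
        except (TypeError, copy.Error):
            # drop values that cannot be copied instead of raising
            continue
    return cleaned


metadata = {"user": "abc", "lock": threading.Lock()}  # deepcopy of a Lock raises TypeError
print(prepare_metadata(metadata))  # {'user': 'abc'} - the lock is dropped, nothing raises
```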
@@ -243,6 +243,49 @@ class ChunkProcessor:
             id=id,
         )
 
+    def _usage_chunk_calculation_helper(self, usage_chunk: Usage) -> dict:
+        prompt_tokens = 0
+        completion_tokens = 0
+        ## anthropic prompt caching information ##
+        cache_creation_input_tokens: Optional[int] = None
+        cache_read_input_tokens: Optional[int] = None
+        completion_tokens_details: Optional[CompletionTokensDetails] = None
+        prompt_tokens_details: Optional[PromptTokensDetails] = None
+
+        if "prompt_tokens" in usage_chunk:
+            prompt_tokens = usage_chunk.get("prompt_tokens", 0) or 0
+        if "completion_tokens" in usage_chunk:
+            completion_tokens = usage_chunk.get("completion_tokens", 0) or 0
+        if "cache_creation_input_tokens" in usage_chunk:
+            cache_creation_input_tokens = usage_chunk.get("cache_creation_input_tokens")
+        if "cache_read_input_tokens" in usage_chunk:
+            cache_read_input_tokens = usage_chunk.get("cache_read_input_tokens")
+        if hasattr(usage_chunk, "completion_tokens_details"):
+            if isinstance(usage_chunk.completion_tokens_details, dict):
+                completion_tokens_details = CompletionTokensDetails(
+                    **usage_chunk.completion_tokens_details
+                )
+            elif isinstance(
+                usage_chunk.completion_tokens_details, CompletionTokensDetails
+            ):
+                completion_tokens_details = usage_chunk.completion_tokens_details
+        if hasattr(usage_chunk, "prompt_tokens_details"):
+            if isinstance(usage_chunk.prompt_tokens_details, dict):
+                prompt_tokens_details = PromptTokensDetails(
+                    **usage_chunk.prompt_tokens_details
+                )
+            elif isinstance(usage_chunk.prompt_tokens_details, PromptTokensDetails):
+                prompt_tokens_details = usage_chunk.prompt_tokens_details
+
+        return {
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": completion_tokens,
+            "cache_creation_input_tokens": cache_creation_input_tokens,
+            "cache_read_input_tokens": cache_read_input_tokens,
+            "completion_tokens_details": completion_tokens_details,
+            "prompt_tokens_details": prompt_tokens_details,
+        }
+
     def calculate_usage(
         self,
         chunks: List[Union[Dict[str, Any], ModelResponse]],

@@ -269,37 +312,30 @@ class ChunkProcessor:
             elif isinstance(chunk, ModelResponse) and hasattr(chunk, "_hidden_params"):
                 usage_chunk = chunk._hidden_params.get("usage", None)
             if usage_chunk is not None:
-                if "prompt_tokens" in usage_chunk:
-                    prompt_tokens = usage_chunk.get("prompt_tokens", 0) or 0
-                if "completion_tokens" in usage_chunk:
-                    completion_tokens = usage_chunk.get("completion_tokens", 0) or 0
-                if "cache_creation_input_tokens" in usage_chunk:
-                    cache_creation_input_tokens = usage_chunk.get(
-                        "cache_creation_input_tokens"
-                    )
-                if "cache_read_input_tokens" in usage_chunk:
-                    cache_read_input_tokens = usage_chunk.get("cache_read_input_tokens")
-                if hasattr(usage_chunk, "completion_tokens_details"):
-                    if isinstance(usage_chunk.completion_tokens_details, dict):
-                        completion_tokens_details = CompletionTokensDetails(
-                            **usage_chunk.completion_tokens_details
-                        )
-                    elif isinstance(
-                        usage_chunk.completion_tokens_details, CompletionTokensDetails
-                    ):
-                        completion_tokens_details = (
-                            usage_chunk.completion_tokens_details
-                        )
-                if hasattr(usage_chunk, "prompt_tokens_details"):
-                    if isinstance(usage_chunk.prompt_tokens_details, dict):
-                        prompt_tokens_details = PromptTokensDetails(
-                            **usage_chunk.prompt_tokens_details
-                        )
-                    elif isinstance(
-                        usage_chunk.prompt_tokens_details, PromptTokensDetails
-                    ):
-                        prompt_tokens_details = usage_chunk.prompt_tokens_details
+                usage_chunk_dict = self._usage_chunk_calculation_helper(usage_chunk)
+                if (
+                    usage_chunk_dict["prompt_tokens"] is not None
+                    and usage_chunk_dict["prompt_tokens"] > 0
+                ):
+                    prompt_tokens = usage_chunk_dict["prompt_tokens"]
+                if (
+                    usage_chunk_dict["completion_tokens"] is not None
+                    and usage_chunk_dict["completion_tokens"] > 0
+                ):
+                    completion_tokens = usage_chunk_dict["completion_tokens"]
+                if usage_chunk_dict["cache_creation_input_tokens"] is not None:
+                    cache_creation_input_tokens = usage_chunk_dict[
+                        "cache_creation_input_tokens"
+                    ]
+                if usage_chunk_dict["cache_read_input_tokens"] is not None:
+                    cache_read_input_tokens = usage_chunk_dict[
+                        "cache_read_input_tokens"
+                    ]
+                if usage_chunk_dict["completion_tokens_details"] is not None:
+                    completion_tokens_details = usage_chunk_dict[
+                        "completion_tokens_details"
+                    ]
+                prompt_tokens_details = usage_chunk_dict["prompt_tokens_details"]
 
         try:
             returned_usage.prompt_tokens = prompt_tokens or token_counter(
                 model=model, messages=messages
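The added `_usage_chunk_calculation_helper` normalizes each streamed usage payload (including Anthropic's prompt-caching counters) before the totals are set. A simplified, self-contained sketch of that aggregation on plain dicts; the function name and sample chunks are hypothetical:

```python
def summarize_usage_chunk(usage_chunk: dict) -> dict:
    """Pull token counts out of a single streaming usage payload (sketch)."""
    return {
        "prompt_tokens": usage_chunk.get("prompt_tokens", 0) or 0,
        "completion_tokens": usage_chunk.get("completion_tokens", 0) or 0,
        # Anthropic prompt-caching counters may be absent entirely
        "cache_creation_input_tokens": usage_chunk.get("cache_creation_input_tokens"),
        "cache_read_input_tokens": usage_chunk.get("cache_read_input_tokens"),
    }


chunks = [
    {"prompt_tokens": 12, "completion_tokens": 0},
    {"prompt_tokens": 0, "completion_tokens": 40, "cache_read_input_tokens": 8},
]
totals = {"prompt_tokens": 0, "completion_tokens": 0, "cache_read_input_tokens": None}
for c in chunks:
    s = summarize_usage_chunk(c)
    if s["prompt_tokens"] > 0:
        totals["prompt_tokens"] = s["prompt_tokens"]
    if s["completion_tokens"] > 0:
        totals["completion_tokens"] = s["completion_tokens"]
    if s["cache_read_input_tokens"] is not None:
        totals["cache_read_input_tokens"] = s["cache_read_input_tokens"]
print(totals)  # {'prompt_tokens': 12, 'completion_tokens': 40, 'cache_read_input_tokens': 8}
```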
@@ -769,6 +769,7 @@ class ModelResponseIterator:
                 message=message,
                 status_code=500,  # it looks like Anthropic API does not return a status code in the chunk error - default to 500
             )
+
         returned_chunk = GenericStreamingChunk(
             text=text,
             tool_use=tool_use,
@@ -24,13 +24,20 @@ model_list:
       api_key: my-fake-key
       api_base: https://exampleopenaiendpoint-production.up.railway.app/
 
+  - model_name: gpt-4
+    litellm_params:
+      model: azure/chatgpt-v-2
+      api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
+      api_version: "2023-05-15"
+      api_key: os.environ/AZURE_API_KEY # The `os.environ/` prefix tells litellm to read this from the env. See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault
+      rpm: 480
+      timeout: 300
+      stream_timeout: 60
+
 # litellm_settings:
 #   fallbacks: [{ "claude-3-5-sonnet-20240620": ["claude-3-5-sonnet-aihubmix"] }]
 #   callbacks: ["otel", "prometheus"]
 #   default_redis_batch_cache_expiry: 10
 
 
 # litellm_settings:
 #   cache: True
 #   cache_params:

@@ -74,4 +81,4 @@ model_list:
 
 #   # # see https://docs.litellm.ai/docs/proxy/caching#advanced---user-api-key-cache-ttl
 #   # # our api keys rarely change
 #   # user_api_key_cache_ttl: 3600
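The `os.environ/AZURE_API_KEY` value above is litellm's convention for reading a secret from the environment when the config is loaded. A rough illustration of the idea (this resolver is a sketch, not the proxy's actual loader):

```python
import os


def resolve_secret(value: str) -> str:
    """Resolve 'os.environ/NAME' references to environment variables (sketch)."""
    prefix = "os.environ/"
    if isinstance(value, str) and value.startswith(prefix):
        return os.environ.get(value[len(prefix):], "")
    return value


os.environ["AZURE_API_KEY"] = "sk-example"            # hypothetical value for the demo
print(resolve_secret("os.environ/AZURE_API_KEY"))     # sk-example
print(resolve_secret("my-fake-key"))                  # literal keys pass through unchanged
```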
@@ -770,8 +770,16 @@ async def _PROXY_track_cost_callback(
         org_id = metadata.get("user_api_key_org_id", None)
         key_alias = metadata.get("user_api_key_alias", None)
         end_user_max_budget = metadata.get("user_api_end_user_max_budget", None)
-        if kwargs.get("response_cost", None) is not None:
-            response_cost = kwargs["response_cost"]
+        sl_object: Optional[StandardLoggingPayload] = kwargs.get(
+            "standard_logging_object", None
+        )
+        response_cost = (
+            sl_object.get("response_cost", None)
+            if sl_object is not None
+            else kwargs.get("response_cost", None)
+        )
+
+        if response_cost is not None:
             user_api_key = metadata.get("user_api_key", None)
             if kwargs.get("cache_hit", False) is True:
                 response_cost = 0.0

@@ -824,9 +832,15 @@ async def _PROXY_track_cost_callback(
             if kwargs["stream"] is not True or (
                 kwargs["stream"] is True and "complete_streaming_response" in kwargs
             ):
-                cost_tracking_failure_debug_info = kwargs.get(
-                    "response_cost_failure_debug_information"
-                )
+                if sl_object is not None:
+                    cost_tracking_failure_debug_info: Union[dict, str] = (
+                        sl_object["response_cost_failure_debug_info"]  # type: ignore
+                        or "response_cost_failure_debug_info is None in standard_logging_object"
+                    )
+                else:
+                    cost_tracking_failure_debug_info = (
+                        "standard_logging_object not found"
+                    )
                 model = kwargs.get("model")
                 raise Exception(
                     f"Cost tracking failed for model={model}.\nDebug info - {cost_tracking_failure_debug_info}\nAdd custom pricing - https://docs.litellm.ai/docs/proxy/custom_pricing"

@@ -842,7 +856,7 @@ async def _PROXY_track_cost_callback(
                     failing_model=model,
                 )
             )
-        verbose_proxy_logger.debug("error in tracking cost callback - %s", e)
+        verbose_proxy_logger.debug(error_msg)
 
 
 def error_tracking():
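With this change, cost tracking reads `response_cost` from the standard logging payload first and only falls back to the raw kwargs value. A small standalone sketch of that precedence (`pick_response_cost` is illustrative, not the proxy function):

```python
from typing import Optional


def pick_response_cost(kwargs: dict) -> Optional[float]:
    """Prefer the standard logging payload's cost; fall back to kwargs (sketch)."""
    sl_object = kwargs.get("standard_logging_object")
    if sl_object is not None:
        return sl_object.get("response_cost")
    return kwargs.get("response_cost")


print(pick_response_cost({"standard_logging_object": {"response_cost": 0.0021}}))  # 0.0021
print(pick_response_cost({"response_cost": 0.5}))  # 0.5 (fallback path)
print(pick_response_cost({}))  # None -> cost tracking raises/alerts upstream
```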
@@ -61,6 +61,24 @@ class PatternMatchRouter:
         # return f"^{regex}$"
         return re.escape(pattern).replace(r"\*", "(.*)")
 
+    def _return_pattern_matched_deployments(
+        self, matched_pattern: Match, deployments: List[Dict]
+    ) -> List[Dict]:
+        new_deployments = []
+        for deployment in deployments:
+            new_deployment = copy.deepcopy(deployment)
+            new_deployment["litellm_params"]["model"] = (
+                PatternMatchRouter.set_deployment_model_name(
+                    matched_pattern=matched_pattern,
+                    litellm_deployment_litellm_model=deployment["litellm_params"][
+                        "model"
+                    ],
+                )
+            )
+            new_deployments.append(new_deployment)
+
+        return new_deployments
+
     def route(self, request: Optional[str]) -> Optional[List[Dict]]:
         """
         Route a requested model to the corresponding llm deployments based on the regex pattern

@@ -79,8 +97,11 @@ class PatternMatchRouter:
             if request is None:
                 return None
             for pattern, llm_deployments in self.patterns.items():
-                if re.match(pattern, request):
-                    return llm_deployments
+                pattern_match = re.match(pattern, request)
+                if pattern_match:
+                    return self._return_pattern_matched_deployments(
+                        matched_pattern=pattern_match, deployments=llm_deployments
+                    )
         except Exception as e:
             verbose_router_logger.debug(f"Error in PatternMatchRouter.route: {str(e)}")

@@ -102,6 +123,7 @@ class PatternMatchRouter:
 
         if model_name = "llmengine/foo" -> model = "openai/foo"
         """
+
         ## BASE CASE: if the deployment model name does not contain a wildcard, return the deployment model name
         if "*" not in litellm_deployment_litellm_model:
             return litellm_deployment_litellm_model

@@ -165,12 +187,7 @@ class PatternMatchRouter:
         """
         pattern_match = self.get_pattern(model, custom_llm_provider)
         if pattern_match:
-            provider_deployments = []
-            for deployment in pattern_match:
-                dep = copy.deepcopy(deployment)
-                dep["litellm_params"]["model"] = model
-                provider_deployments.append(dep)
-            return provider_deployments
+            return pattern_match
         return []
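The router now keeps the regex match object so the wildcard portion of the requested model can be substituted into each deployment's `litellm_params.model` (the docstring's `llmengine/foo -> openai/foo` case). A self-contained sketch of the mechanism, with simplified helper names:

```python
import copy
import re


def pattern_to_regex(pattern: str) -> str:
    # "llmengine/*" -> r"llmengine/(.*)"
    return re.escape(pattern).replace(r"\*", "(.*)")


def route(requested: str, pattern: str, deployments: list) -> list:
    match = re.match(pattern_to_regex(pattern), requested)
    if not match:
        return []
    routed = []
    for deployment in deployments:
        d = copy.deepcopy(deployment)
        model = d["litellm_params"]["model"]
        if "*" in model:
            # substitute the captured wildcard into the deployment's model name
            d["litellm_params"]["model"] = model.replace("*", match.group(1))
        routed.append(d)
    return routed


deployments = [{"model_name": "llmengine/*", "litellm_params": {"model": "openai/*"}}]
print(route("llmengine/foo", "llmengine/*", deployments))
# [{'model_name': 'llmengine/*', 'litellm_params': {'model': 'openai/foo'}}]
```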
@@ -745,13 +745,13 @@ class StreamingChatCompletionChunk(OpenAIChatCompletionChunk):
         super().__init__(**kwargs)
 
 
-class ModelResponse(OpenAIObject):
+from openai.types.chat import ChatCompletionChunk
+
+
+class ModelResponseBase(OpenAIObject):
     id: str
     """A unique identifier for the completion."""
 
-    choices: List[Union[Choices, StreamingChoices]]
-    """The list of completion choices the model generated for the input prompt."""
-
     created: int
     """The Unix timestamp (in seconds) of when the completion was created."""

@@ -772,6 +772,55 @@ class ModelResponse(OpenAIObject):
     _response_headers: Optional[dict] = None
 
 
+class ModelResponseStream(ModelResponseBase):
+    choices: List[StreamingChoices]
+
+    def __init__(
+        self,
+        choices: Optional[List[Union[StreamingChoices, dict, BaseModel]]] = None,
+        **kwargs,
+    ):
+        if choices is not None and isinstance(choices, list):
+            new_choices = []
+            for choice in choices:
+                _new_choice = None
+                if isinstance(choice, StreamingChoices):
+                    _new_choice = choice
+                elif isinstance(choice, dict):
+                    _new_choice = StreamingChoices(**choice)
+                elif isinstance(choice, BaseModel):
+                    _new_choice = StreamingChoices(**choice.model_dump())
+                new_choices.append(_new_choice)
+            kwargs["choices"] = new_choices
+        else:
+            kwargs["choices"] = [StreamingChoices()]
+        super().__init__(**kwargs)
+
+    def __contains__(self, key):
+        # Define custom behavior for the 'in' operator
+        return hasattr(self, key)
+
+    def get(self, key, default=None):
+        # Custom .get() method to access attributes with a default value if the attribute doesn't exist
+        return getattr(self, key, default)
+
+    def __getitem__(self, key):
+        # Allow dictionary-style access to attributes
+        return getattr(self, key)
+
+    def json(self, **kwargs):  # type: ignore
+        try:
+            return self.model_dump()  # noqa
+        except Exception:
+            # if using pydantic v1
+            return self.dict()
+
+
+class ModelResponse(ModelResponseBase):
+    choices: List[Union[Choices, StreamingChoices]]
+    """The list of completion choices the model generated for the input prompt."""
+
     def __init__(
         self,
         id=None,
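`ModelResponseStream` keeps dict-style access (`in`, `.get()`, `[...]`) on top of a pydantic model so existing streaming call sites keep working. A minimal pydantic sketch of that pattern (illustrative class, not the litellm type):

```python
from typing import Optional
from pydantic import BaseModel


class DictLikeModel(BaseModel):
    id: str
    usage: Optional[dict] = None

    def __contains__(self, key):        # support `"usage" in obj`
        return hasattr(self, key)

    def get(self, key, default=None):   # support obj.get("usage")
        return getattr(self, key, default)

    def __getitem__(self, key):         # support obj["id"]
        return getattr(self, key)


chunk = DictLikeModel(id="chatcmpl-123")
print("usage" in chunk, chunk.get("usage"), chunk["id"])  # True None chatcmpl-123
```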
litellm/utils.py (419 changed lines)

@@ -114,6 +114,7 @@ from litellm.types.utils import (
     Message,
     ModelInfo,
     ModelResponse,
+    ModelResponseStream,
     ProviderField,
     StreamingChoices,
     TextChoices,
@@ -5642,6 +5643,9 @@ class CustomStreamWrapper:
         )
         self.messages = getattr(logging_obj, "messages", None)
         self.sent_stream_usage = False
+        self.send_stream_usage = (
+            True if self.check_send_stream_usage(self.stream_options) else False
+        )
         self.tool_call = False
         self.chunks: List = (
             []

@@ -5654,6 +5658,12 @@ class CustomStreamWrapper:
     def __aiter__(self):
         return self
 
+    def check_send_stream_usage(self, stream_options: Optional[dict]):
+        return (
+            stream_options is not None
+            and stream_options.get("include_usage", False) is True
+        )
+
     def check_is_function_call(self, logging_obj) -> bool:
         if hasattr(logging_obj, "optional_params") and isinstance(
             logging_obj.optional_params, dict
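`send_stream_usage` is derived from the OpenAI-style `stream_options` passed by the caller; a final usage chunk is only emitted when `include_usage` was explicitly requested. A standalone sketch of the check (the helper name mirrors the diff, but the snippet is illustrative):

```python
from typing import Optional


def should_send_stream_usage(stream_options: Optional[dict]) -> bool:
    """True only when the caller explicitly asked for a final usage chunk."""
    return stream_options is not None and stream_options.get("include_usage", False) is True


print(should_send_stream_usage(None))                      # False
print(should_send_stream_usage({"include_usage": False}))  # False
print(should_send_stream_usage({"include_usage": True}))   # True
```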
@@ -6506,9 +6516,148 @@ class CustomStreamWrapper:
                 is_empty = False
         return is_empty
 
+    def return_processed_chunk_logic(  # noqa
+        self,
+        completion_obj: dict,
+        model_response: ModelResponseStream,
+        response_obj: dict,
+    ):
+
+        print_verbose(
+            f"completion_obj: {completion_obj}, model_response.choices[0]: {model_response.choices[0]}, response_obj: {response_obj}"
+        )
+        if (
+            "content" in completion_obj
+            and (
+                isinstance(completion_obj["content"], str)
+                and len(completion_obj["content"]) > 0
+            )
+            or (
+                "tool_calls" in completion_obj
+                and completion_obj["tool_calls"] is not None
+                and len(completion_obj["tool_calls"]) > 0
+            )
+            or (
+                "function_call" in completion_obj
+                and completion_obj["function_call"] is not None
+            )
+        ):  # cannot set content of an OpenAI Object to be an empty string
+            self.safety_checker()
+            hold, model_response_str = self.check_special_tokens(
+                chunk=completion_obj["content"],
+                finish_reason=model_response.choices[0].finish_reason,
+            )  # filter out bos/eos tokens from openai-compatible hf endpoints
+            print_verbose(f"hold - {hold}, model_response_str - {model_response_str}")
+            if hold is False:
+                ## check if openai/azure chunk
+                original_chunk = response_obj.get("original_chunk", None)
+                if original_chunk:
+                    model_response.id = original_chunk.id
+                    self.response_id = original_chunk.id
+                    if len(original_chunk.choices) > 0:
+                        choices = []
+                        for choice in original_chunk.choices:
+                            try:
+                                if isinstance(choice, BaseModel):
+                                    choice_json = choice.model_dump()
+                                    choice_json.pop(
+                                        "finish_reason", None
+                                    )  # for mistral etc. which return a value in their last chunk (not-openai compatible).
+                                    print_verbose(f"choice_json: {choice_json}")
+                                    choices.append(StreamingChoices(**choice_json))
+                            except Exception:
+                                choices.append(StreamingChoices())
+                        print_verbose(f"choices in streaming: {choices}")
+                        setattr(model_response, "choices", choices)
+                    else:
+                        return
+                    model_response.system_fingerprint = (
+                        original_chunk.system_fingerprint
+                    )
+                    setattr(
+                        model_response,
+                        "citations",
+                        getattr(original_chunk, "citations", None),
+                    )
+                    print_verbose(f"self.sent_first_chunk: {self.sent_first_chunk}")
+                    if self.sent_first_chunk is False:
+                        model_response.choices[0].delta["role"] = "assistant"
+                        self.sent_first_chunk = True
+                    elif self.sent_first_chunk is True and hasattr(
+                        model_response.choices[0].delta, "role"
+                    ):
+                        _initial_delta = model_response.choices[0].delta.model_dump()
+                        _initial_delta.pop("role", None)
+                        model_response.choices[0].delta = Delta(**_initial_delta)
+                    print_verbose(
+                        f"model_response.choices[0].delta: {model_response.choices[0].delta}"
+                    )
+                else:
+                    ## else
+                    completion_obj["content"] = model_response_str
+                    if self.sent_first_chunk is False:
+                        completion_obj["role"] = "assistant"
+                        self.sent_first_chunk = True
+
+                    model_response.choices[0].delta = Delta(**completion_obj)
+                    _index: Optional[int] = completion_obj.get("index")
+                    if _index is not None:
+                        model_response.choices[0].index = _index
+                print_verbose(f"returning model_response: {model_response}")
+                return model_response
+            else:
+                return
+        elif self.received_finish_reason is not None:
+            if self.sent_last_chunk is True:
+                # Bedrock returns the guardrail trace in the last chunk - we want to return this here
+                if self.custom_llm_provider == "bedrock" and "trace" in model_response:
+                    return model_response
+
+                # Default - return StopIteration
+                raise StopIteration
+            # flush any remaining holding chunk
+            if len(self.holding_chunk) > 0:
+                if model_response.choices[0].delta.content is None:
+                    model_response.choices[0].delta.content = self.holding_chunk
+                else:
+                    model_response.choices[0].delta.content = (
+                        self.holding_chunk + model_response.choices[0].delta.content
+                    )
+                self.holding_chunk = ""
+            # if delta is None
+            _is_delta_empty = self.is_delta_empty(delta=model_response.choices[0].delta)
+
+            if _is_delta_empty:
+                # get any function call arguments
+                model_response.choices[0].finish_reason = map_finish_reason(
+                    finish_reason=self.received_finish_reason
+                )  # ensure consistent output to openai
+
+                self.sent_last_chunk = True
+
+            return model_response
+        elif (
+            model_response.choices[0].delta.tool_calls is not None
+            or model_response.choices[0].delta.function_call is not None
+        ):
+            if self.sent_first_chunk is False:
+                model_response.choices[0].delta["role"] = "assistant"
+                self.sent_first_chunk = True
+            return model_response
+        elif (
+            len(model_response.choices) > 0
+            and hasattr(model_response.choices[0].delta, "audio")
+            and model_response.choices[0].delta.audio is not None
+        ):
+            return model_response
+        else:
+            if hasattr(model_response, "usage"):
+                self.chunks.append(model_response)
+            return
+
     def chunk_creator(self, chunk):  # type: ignore  # noqa: PLR0915
         model_response = self.model_response_creator()
-        response_obj = {}
+        response_obj: dict = {}
         try:
             # return this for all models
             completion_obj = {"content": ""}
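`return_processed_chunk_logic` centralizes the decision of whether a processed chunk is surfaced to the caller, held back, or treated as the final chunk. A heavily simplified sketch of just the top-level emit/hold rules (illustrative only; the real method also rewrites deltas and finish reasons):

```python
def should_emit_chunk(completion_obj: dict, finish_reason_received: bool) -> bool:
    """Simplified version of the top-level emit/hold decision (sketch)."""
    has_content = bool(completion_obj.get("content"))
    has_tool_calls = bool(completion_obj.get("tool_calls"))
    has_function_call = completion_obj.get("function_call") is not None
    if has_content or has_tool_calls or has_function_call:
        return True          # something to show the caller
    if finish_reason_received:
        return True          # final chunk: flush held text / set finish_reason
    return False             # empty delta: keep buffering


print(should_emit_chunk({"content": "Hel"}, False))  # True
print(should_emit_chunk({"content": ""}, False))     # False
print(should_emit_chunk({"content": ""}, True))      # True
```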
@@ -6559,6 +6708,7 @@ class CustomStreamWrapper:
                     "provider_specific_fields"
                 ].items():
                     setattr(model_response, key, value)
+
                 response_obj = anthropic_response_obj
             elif (
                 self.custom_llm_provider

@@ -6626,7 +6776,7 @@ class CustomStreamWrapper:
                 if self.sent_first_chunk is False:
                     raise Exception("An unknown error occurred with the stream")
                 self.received_finish_reason = "stop"
-            elif self.custom_llm_provider and (self.custom_llm_provider == "vertex_ai"):
+            elif self.custom_llm_provider == "vertex_ai":
                 import proto  # type: ignore
 
                 if self.model.startswith("claude-3"):
@@ -7009,145 +7159,12 @@ class CustomStreamWrapper:
                 self.tool_call = True
 
             ## RETURN ARG
-            if (
-                "content" in completion_obj
-                and (
-                    isinstance(completion_obj["content"], str)
-                    and len(completion_obj["content"]) > 0
-                )
-                or (
-                    "tool_calls" in completion_obj
-                    and completion_obj["tool_calls"] is not None
-                    and len(completion_obj["tool_calls"]) > 0
-                )
-                or (
-                    "function_call" in completion_obj
-                    and completion_obj["function_call"] is not None
-                )
-            ):  # cannot set content of an OpenAI Object to be an empty string
-                self.safety_checker()
-                hold, model_response_str = self.check_special_tokens(
-                    chunk=completion_obj["content"],
-                    finish_reason=model_response.choices[0].finish_reason,
-                )  # filter out bos/eos tokens from openai-compatible hf endpoints
-                print_verbose(
-                    f"hold - {hold}, model_response_str - {model_response_str}"
-                )
-                if hold is False:
-                    ## check if openai/azure chunk
-                    original_chunk = response_obj.get("original_chunk", None)
-                    if original_chunk:
-                        model_response.id = original_chunk.id
-                        self.response_id = original_chunk.id
-                        if len(original_chunk.choices) > 0:
-                            choices = []
-                            for idx, choice in enumerate(original_chunk.choices):
-                                try:
-                                    if isinstance(choice, BaseModel):
-                                        try:
-                                            choice_json = choice.model_dump()
-                                        except Exception:
-                                            choice_json = choice.dict()
-                                        choice_json.pop(
-                                            "finish_reason", None
-                                        )  # for mistral etc. which return a value in their last chunk (not-openai compatible).
-                                        print_verbose(f"choice_json: {choice_json}")
-                                        choices.append(StreamingChoices(**choice_json))
-                                except Exception:
-                                    choices.append(StreamingChoices())
-                            print_verbose(f"choices in streaming: {choices}")
-                            model_response.choices = choices
-                        else:
-                            return
-                        model_response.system_fingerprint = (
-                            original_chunk.system_fingerprint
-                        )
-                        model_response.citations = getattr(
-                            original_chunk, "citations", None
-                        )
-                        print_verbose(f"self.sent_first_chunk: {self.sent_first_chunk}")
-                        if self.sent_first_chunk is False:
-                            model_response.choices[0].delta["role"] = "assistant"
-                            self.sent_first_chunk = True
-                        elif self.sent_first_chunk is True and hasattr(
-                            model_response.choices[0].delta, "role"
-                        ):
-                            _initial_delta = model_response.choices[
-                                0
-                            ].delta.model_dump()
-                            _initial_delta.pop("role", None)
-                            model_response.choices[0].delta = Delta(**_initial_delta)
-                        print_verbose(
-                            f"model_response.choices[0].delta: {model_response.choices[0].delta}"
-                        )
-                    else:
-                        ## else
-                        completion_obj["content"] = model_response_str
-                        if self.sent_first_chunk is False:
-                            completion_obj["role"] = "assistant"
-                            self.sent_first_chunk = True
-
-                        model_response.choices[0].delta = Delta(**completion_obj)
-                        if completion_obj.get("index") is not None:
-                            model_response.choices[0].index = completion_obj.get(
-                                "index"
-                            )
-                    print_verbose(f"returning model_response: {model_response}")
-                    return model_response
-                else:
-                    return
-            elif self.received_finish_reason is not None:
-                if self.sent_last_chunk is True:
-                    # Bedrock returns the guardrail trace in the last chunk - we want to return this here
-                    if (
-                        self.custom_llm_provider == "bedrock"
-                        and "trace" in model_response
-                    ):
-                        return model_response
-
-                    # Default - return StopIteration
-                    raise StopIteration
-                # flush any remaining holding chunk
-                if len(self.holding_chunk) > 0:
-                    if model_response.choices[0].delta.content is None:
-                        model_response.choices[0].delta.content = self.holding_chunk
-                    else:
-                        model_response.choices[0].delta.content = (
-                            self.holding_chunk + model_response.choices[0].delta.content
-                        )
-                    self.holding_chunk = ""
-                # if delta is None
-                _is_delta_empty = self.is_delta_empty(
-                    delta=model_response.choices[0].delta
-                )
-
-                if _is_delta_empty:
-                    # get any function call arguments
-                    model_response.choices[0].finish_reason = map_finish_reason(
-                        finish_reason=self.received_finish_reason
-                    )  # ensure consistent output to openai
-
-                    self.sent_last_chunk = True
-
-                return model_response
-            elif (
-                model_response.choices[0].delta.tool_calls is not None
-                or model_response.choices[0].delta.function_call is not None
-            ):
-                if self.sent_first_chunk is False:
-                    model_response.choices[0].delta["role"] = "assistant"
-                    self.sent_first_chunk = True
-                return model_response
-            elif (
-                len(model_response.choices) > 0
-                and hasattr(model_response.choices[0].delta, "audio")
-                and model_response.choices[0].delta.audio is not None
-            ):
-                return model_response
-            else:
-                if hasattr(model_response, "usage"):
-                    self.chunks.append(model_response)
-                return
+            return self.return_processed_chunk_logic(
+                completion_obj=completion_obj,
+                model_response=model_response,  # type: ignore
+                response_obj=response_obj,
+            )
+
         except StopIteration:
             raise StopIteration
         except Exception as e:
@@ -7293,27 +7310,24 @@ class CustomStreamWrapper:
 
         except StopIteration:
             if self.sent_last_chunk is True:
-                if (
-                    self.sent_stream_usage is False
-                    and self.stream_options is not None
-                    and self.stream_options.get("include_usage", False) is True
-                ):
-                    # send the final chunk with stream options
-                    complete_streaming_response = litellm.stream_chunk_builder(
-                        chunks=self.chunks, messages=self.messages
-                    )
-                    response = self.model_response_creator()
-                    if complete_streaming_response is not None:
-                        setattr(
-                            response,
-                            "usage",
-                            getattr(complete_streaming_response, "usage"),
-                        )
-                    ## LOGGING
-                    threading.Thread(
-                        target=self.logging_obj.success_handler,
-                        args=(response, None, None, cache_hit),
-                    ).start()  # log response
+                complete_streaming_response = litellm.stream_chunk_builder(
+                    chunks=self.chunks, messages=self.messages
+                )
+                response = self.model_response_creator()
+                if complete_streaming_response is not None:
+                    setattr(
+                        response,
+                        "usage",
+                        getattr(complete_streaming_response, "usage"),
+                    )
+
+                ## LOGGING
+                threading.Thread(
+                    target=self.logging_obj.success_handler,
+                    args=(response, None, None, cache_hit),
+                ).start()  # log response
+
+                if self.sent_stream_usage is False and self.send_stream_usage is True:
                     self.sent_stream_usage = True
                     return response
                 raise  # Re-raise StopIteration
@@ -7401,7 +7415,6 @@ class CustomStreamWrapper:
                 or self.custom_llm_provider in litellm._custom_providers
             ):
                 async for chunk in self.completion_stream:
-                    print_verbose(f"value of async chunk: {chunk}")
                     if chunk == "None" or chunk is None:
                         raise Exception
                     elif (

@@ -7431,10 +7444,7 @@ class CustomStreamWrapper:
                             end_time=None,
                             cache_hit=cache_hit,
                         )
-                        # threading.Thread(
-                        #     target=self.logging_obj.success_handler,
-                        #     args=(processed_chunk, None, None, cache_hit),
-                        # ).start()  # log response
                         asyncio.create_task(
                             self.logging_obj.async_success_handler(
                                 processed_chunk, cache_hit=cache_hit
@ -7515,82 +7525,33 @@ class CustomStreamWrapper:
|
||||||
# RETURN RESULT
|
# RETURN RESULT
|
||||||
self.chunks.append(processed_chunk)
|
self.chunks.append(processed_chunk)
|
||||||
return processed_chunk
|
return processed_chunk
|
||||||
except StopAsyncIteration:
|
except (StopAsyncIteration, StopIteration):
|
||||||
if self.sent_last_chunk is True:
|
if self.sent_last_chunk is True:
|
||||||
if (
|
# log the final chunk with accurate streaming values
|
||||||
self.sent_stream_usage is False
|
complete_streaming_response = litellm.stream_chunk_builder(
|
||||||
and self.stream_options is not None
|
chunks=self.chunks, messages=self.messages
|
||||||
and self.stream_options.get("include_usage", False) is True
|
)
|
||||||
):
|
response = self.model_response_creator()
|
||||||
# send the final chunk with stream options
|
if complete_streaming_response is not None:
|
||||||
complete_streaming_response = litellm.stream_chunk_builder(
|
setattr(
|
||||||
chunks=self.chunks, messages=self.messages
|
response,
|
||||||
|
"usage",
|
||||||
|
getattr(complete_streaming_response, "usage"),
|
||||||
)
|
)
|
||||||
response = self.model_response_creator()
|
|
||||||
if complete_streaming_response is not None:
|
|
||||||
setattr(
|
|
||||||
response,
|
|
||||||
"usage",
|
|
||||||
getattr(complete_streaming_response, "usage"),
|
|
||||||
)
|
|
||||||
## LOGGING
|
|
||||||
threading.Thread(
|
|
||||||
target=self.logging_obj.success_handler,
|
|
||||||
args=(response, None, None, cache_hit),
|
|
||||||
).start() # log response
|
|
||||||
asyncio.create_task(
|
|
||||||
self.logging_obj.async_success_handler(
|
|
||||||
response, cache_hit=cache_hit
|
|
||||||
)
|
|
||||||
)
|
|
||||||
self.sent_stream_usage = True
|
|
||||||
return response
|
|
||||||
raise # Re-raise StopIteration
|
|
||||||
else:
|
|
||||||
self.sent_last_chunk = True
|
|
||||||
processed_chunk = self.finish_reason_handler()
|
|
||||||
## LOGGING
|
## LOGGING
|
||||||
threading.Thread(
|
threading.Thread(
|
||||||
target=self.logging_obj.success_handler,
|
target=self.logging_obj.success_handler,
|
||||||
args=(processed_chunk, None, None, cache_hit),
|
args=(response, None, None, cache_hit),
|
||||||
).start() # log response
|
).start() # log response
|
||||||
asyncio.create_task(
|
asyncio.create_task(
|
||||||
self.logging_obj.async_success_handler(
|
self.logging_obj.async_success_handler(
|
||||||
processed_chunk, cache_hit=cache_hit
|
response, cache_hit=cache_hit
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
return processed_chunk
|
if self.sent_stream_usage is False and self.send_stream_usage is True:
|
||||||
except StopIteration:
|
|
||||||
if self.sent_last_chunk is True:
|
|
||||||
if (
|
|
||||||
self.sent_stream_usage is False
|
|
||||||
and self.stream_options is not None
|
|
||||||
and self.stream_options.get("include_usage", False) is True
|
|
||||||
):
|
|
||||||
# send the final chunk with stream options
|
|
||||||
complete_streaming_response = litellm.stream_chunk_builder(
|
|
||||||
chunks=self.chunks, messages=self.messages
|
|
||||||
)
|
|
||||||
response = self.model_response_creator()
|
|
||||||
if complete_streaming_response is not None:
|
|
||||||
setattr(
|
|
||||||
response,
|
|
||||||
"usage",
|
|
||||||
getattr(complete_streaming_response, "usage"),
|
|
||||||
)
|
|
||||||
## LOGGING
|
|
||||||
threading.Thread(
|
|
||||||
target=self.logging_obj.success_handler,
|
|
||||||
args=(response, None, None, cache_hit),
|
|
||||||
).start() # log response
|
|
||||||
asyncio.create_task(
|
|
||||||
self.logging_obj.async_success_handler(
|
|
||||||
response, cache_hit=cache_hit
|
|
||||||
)
|
|
||||||
)
|
|
||||||
self.sent_stream_usage = True
|
self.sent_stream_usage = True
|
||||||
return response
|
return response
|
||||||
raise StopAsyncIteration
|
raise StopAsyncIteration # Re-raise StopIteration
|
||||||
else:
|
else:
|
||||||
self.sent_last_chunk = True
|
self.sent_last_chunk = True
|
||||||
processed_chunk = self.finish_reason_handler()
|
processed_chunk = self.finish_reason_handler()
|
||||||
|
|
|
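Note: the hunk above drops the duplicated `StopIteration` branch and always rebuilds the final response with `litellm.stream_chunk_builder`, so logged usage now comes from the accumulated chunks rather than from `stream_options={"include_usage": True}`. A minimal sketch of the same idea from caller code (the model name and prompt are placeholders, and an `ANTHROPIC_API_KEY` is assumed):

```python
import litellm

messages = [{"role": "user", "content": "Say hi"}]

# Stream a completion and keep every chunk.
chunks = []
for chunk in litellm.completion(
    model="anthropic/claude-3-5-haiku-20241022",
    messages=messages,
    stream=True,
):
    chunks.append(chunk)

# Rebuild a complete response from the chunks; the rebuilt object carries a
# usage block even though stream_options={"include_usage": True} was never set.
complete_response = litellm.stream_chunk_builder(chunks=chunks, messages=messages)
if complete_response is not None:
    print(complete_response.usage)
```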
@@ -13,7 +13,7 @@ import litellm
 ## case 1: set_function_to_prompt not set
 def test_function_call_non_openai_model():
     try:
-        model = "claude-instant-1"
+        model = "claude-3-5-haiku-20241022"
         messages = [{"role": "user", "content": "what's the weather in sf?"}]
         functions = [
             {
@@ -43,38 +43,4 @@ def test_function_call_non_openai_model():

 # test_function_call_non_openai_model()


-## case 2: add_function_to_prompt set
-@pytest.mark.skip(reason="Anthropic now supports tool calling")
-def test_function_call_non_openai_model_litellm_mod_set():
-    litellm.add_function_to_prompt = True
-    litellm.set_verbose = True
-    try:
-        model = "claude-instant-1.2"
-        messages = [{"role": "user", "content": "what's the weather in sf?"}]
-        functions = [
-            {
-                "name": "get_current_weather",
-                "description": "Get the current weather in a given location",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "location": {
-                            "type": "string",
-                            "description": "The city and state, e.g. San Francisco, CA",
-                        },
-                        "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
-                    },
-                    "required": ["location"],
-                },
-            }
-        ]
-        response = litellm.completion(
-            model=model, messages=messages, functions=functions
-        )
-        print(f"response: {response}")
-    except Exception as e:
-        pytest.fail(f"An error occurred {e}")
-
-
 # test_function_call_non_openai_model_litellm_mod_set()

@@ -480,28 +480,6 @@ async def test_aaalangfuse_logging_metadata(langfuse_client):
         print("generation_from_langfuse", generation)


-@pytest.mark.skip(reason="beta test - checking langfuse output")
-def test_langfuse_logging():
-    try:
-        pre_langfuse_setup()
-        litellm.set_verbose = True
-        response = completion(
-            model="claude-instant-1.2",
-            messages=[{"role": "user", "content": "Hi 👋 - i'm claude"}],
-            max_tokens=10,
-            temperature=0.2,
-        )
-        print(response)
-        # time.sleep(5)
-        # # check langfuse.log to see if there was a failed response
-        # search_logs("langfuse.log")
-
-    except litellm.Timeout as e:
-        pass
-    except Exception as e:
-        pytest.fail(f"An exception occurred - {e}")
-
-
 # test_langfuse_logging()


@@ -69,7 +69,7 @@ def test_batch_completions_models():
 def test_batch_completion_models_all_responses():
     try:
         responses = batch_completion_models_all_responses(
-            models=["j2-light", "claude-instant-1.2"],
+            models=["j2-light", "claude-3-haiku-20240307"],
             messages=[{"role": "user", "content": "write a poem"}],
             max_tokens=10,
         )
@@ -343,7 +343,7 @@ def test_completion_claude():
     try:
         # test without max tokens
         response = completion(
-            model="claude-instant-1", messages=messages, request_timeout=10
+            model="claude-3-5-haiku-20241022", messages=messages, request_timeout=10
         )
         # Add any assertions here to check response args
         print(response)
@@ -1562,3 +1562,65 @@ def test_logging_key_masking_gemini():
         trimmed_key = key.split("key=")[1]
         trimmed_key = trimmed_key.replace("*", "")
         assert "PART" == trimmed_key


+@pytest.mark.parametrize("sync_mode", [True, False])
+@pytest.mark.asyncio
+async def test_standard_logging_payload_stream_usage(sync_mode):
+    """
+    Even if stream_options is not provided, correct usage should be logged
+    """
+    from litellm.types.utils import StandardLoggingPayload
+    from litellm.main import stream_chunk_builder
+
+    stream = True
+    try:
+        # sync completion
+        customHandler = CompletionCustomHandler()
+        litellm.callbacks = [customHandler]
+
+        if sync_mode:
+            patch_event = "log_success_event"
+            return_val = MagicMock()
+        else:
+            patch_event = "async_log_success_event"
+            return_val = AsyncMock()
+
+        with patch.object(customHandler, patch_event, new=return_val) as mock_client:
+            if sync_mode:
+                resp = litellm.completion(
+                    model="anthropic/claude-3-5-sonnet-20240620",
+                    messages=[{"role": "user", "content": "Hey, how's it going?"}],
+                    stream=stream,
+                )
+
+                chunks = []
+                for chunk in resp:
+                    chunks.append(chunk)
+                time.sleep(2)
+            else:
+                resp = await litellm.acompletion(
+                    model="anthropic/claude-3-5-sonnet-20240620",
+                    messages=[{"role": "user", "content": "Hey, how's it going?"}],
+                    stream=stream,
+                )
+
+                chunks = []
+                async for chunk in resp:
+                    chunks.append(chunk)
+                await asyncio.sleep(2)
+
+            mock_client.assert_called_once()
+
+            standard_logging_object: StandardLoggingPayload = (
+                mock_client.call_args.kwargs["kwargs"]["standard_logging_object"]
+            )
+
+            built_response = stream_chunk_builder(chunks=chunks)
+            assert (
+                built_response.usage.total_tokens
+                != standard_logging_object["total_tokens"]
+            )
+            print(f"standard_logging_object usage: {built_response.usage}")
+    except litellm.InternalServerError:
+        pass
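Note: the new test reads the `standard_logging_object` that litellm attaches to callback kwargs. A hedged sketch of a custom callback doing the same outside the test suite (the class name is made up; the `CustomLogger` hook signature is litellm's documented one):

```python
import litellm
from litellm.integrations.custom_logger import CustomLogger


class UsageAuditLogger(CustomLogger):
    """Hypothetical callback that inspects the standard logging payload."""

    def log_success_event(self, kwargs, response_obj, start_time, end_time):
        payload = kwargs.get("standard_logging_object") or {}
        # total_tokens here comes from litellm's own accounting of the stream,
        # not from a usage chunk returned by the provider.
        print("logged total_tokens:", payload.get("total_tokens"))


litellm.callbacks = [UsageAuditLogger()]
```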
@@ -163,7 +163,7 @@ def invalid_auth(model):  # set the model key to an invalid key, depending on th
     elif model == "azure/chatgpt-v-2":
         temporary_key = os.environ["AZURE_API_KEY"]
         os.environ["AZURE_API_KEY"] = "bad-key"
-    elif model == "claude-instant-1":
+    elif model == "claude-3-5-haiku-20241022":
         temporary_key = os.environ["ANTHROPIC_API_KEY"]
         os.environ["ANTHROPIC_API_KEY"] = "bad-key"
     elif model == "command-nightly":
@@ -213,7 +213,7 @@ def invalid_auth(model):  # set the model key to an invalid key, depending on th
         elif model == "chatgpt-test":
             os.environ["AZURE_API_KEY"] = temporary_key
             azure = True
-        elif model == "claude-instant-1":
+        elif model == "claude-3-5-haiku-20241022":
             os.environ["ANTHROPIC_API_KEY"] = temporary_key
         elif model == "command-nightly":
             os.environ["COHERE_API_KEY"] = temporary_key
@@ -77,71 +77,6 @@ async def test_langsmith_queue_logging():
         pytest.fail(f"Error occurred: {e}")


-@pytest.mark.skip(reason="Flaky test. covered by unit tests on custom logger.")
-@pytest.mark.asyncio()
-async def test_async_langsmith_logging():
-    try:
-        test_langsmith_logger = LangsmithLogger()
-        run_id = str(uuid.uuid4())
-        litellm.set_verbose = True
-        litellm.callbacks = ["langsmith"]
-        response = await litellm.acompletion(
-            model="claude-instant-1.2",
-            messages=[{"role": "user", "content": "what llm are u"}],
-            max_tokens=10,
-            temperature=0.2,
-            metadata={
-                "id": run_id,
-                "tags": ["tag1", "tag2"],
-                "user_api_key": "6eb81e014497d89f3cc1aa9da7c2b37bda6b7fea68e4b710d33d94201e68970c",
-                "user_api_key_alias": "ishaans-langmsith-key",
-                "user_api_end_user_max_budget": None,
-                "litellm_api_version": "1.40.19",
-                "global_max_parallel_requests": None,
-                "user_api_key_user_id": "admin",
-                "user_api_key_org_id": None,
-                "user_api_key_team_id": "dbe2f686-a686-4896-864a-4c3924458709",
-                "user_api_key_team_alias": "testing-team",
-            },
-        )
-        print(response)
-        await asyncio.sleep(3)
-
-        print("run_id", run_id)
-        logged_run_on_langsmith = test_langsmith_logger.get_run_by_id(run_id=run_id)
-
-        print("logged_run_on_langsmith", logged_run_on_langsmith)
-
-        print("fields in logged_run_on_langsmith", logged_run_on_langsmith.keys())
-
-        input_fields_on_langsmith = logged_run_on_langsmith.get("inputs")
-        extra_fields_on_langsmith = logged_run_on_langsmith.get("extra").get(
-            "invocation_params"
-        )
-
-        print("\nLogged INPUT ON LANGSMITH", input_fields_on_langsmith)
-
-        print("\nextra fields on langsmith", extra_fields_on_langsmith)
-
-        assert isinstance(input_fields_on_langsmith, dict)
-        assert "api_key" not in input_fields_on_langsmith
-        assert "api_key" not in extra_fields_on_langsmith
-
-        # assert user_api_key in extra_fields_on_langsmith
-        assert "user_api_key" in extra_fields_on_langsmith
-        assert "user_api_key_user_id" in extra_fields_on_langsmith
-        assert "user_api_key_team_alias" in extra_fields_on_langsmith
-
-        for cb in litellm.callbacks:
-            if isinstance(cb, LangsmithLogger):
-                await cb.async_httpx_client.client.aclose()
-        # test_langsmith_logger.async_httpx_client.close()
-
-    except Exception as e:
-        print(e)
-        pytest.fail(f"Error occurred: {e}")
-
-
 # test_langsmith_logging()


@@ -72,7 +72,7 @@
 # # old_stdout = sys.stdout
 # # sys.stdout = new_stdout = io.StringIO()

-# # response = completion(model="claude-instant-1", messages=messages)
+# # response = completion(model="claude-3-5-haiku-20241022", messages=messages)

 # # # Restore stdout
 # # sys.stdout = old_stdout
@@ -154,7 +154,7 @@
 # old_stdout = sys.stdout
 # sys.stdout = new_stdout = io.StringIO()

-# response = completion(model="claude-instant-1", messages=messages, stream=True)
+# response = completion(model="claude-3-5-haiku-20241022", messages=messages, stream=True)
 # for idx, chunk in enumerate(response):
 # pass

@@ -255,7 +255,7 @@
 # # sys.stdout = new_stdout = io.StringIO()

 # # try:
-# # response = completion(model="claude-instant-1", messages=messages)
+# # response = completion(model="claude-3-5-haiku-20241022", messages=messages)
 # # except AuthenticationError:
 # # pass

@@ -327,7 +327,7 @@
 # # sys.stdout = new_stdout = io.StringIO()

 # # try:
-# # response = completion(model="claude-instant-1", messages=messages)
+# # response = completion(model="claude-3-5-haiku-20241022", messages=messages)
 # # except AuthenticationError:
 # # pass

@@ -3,7 +3,7 @@
 # BASE_URL = 'http://localhost:8080'

 # def test_hello_route():
-# data = {"model": "claude-instant-1", "messages": [{"role": "user", "content": "hey, how's it going?"}]}
+# data = {"model": "claude-3-5-haiku-20241022", "messages": [{"role": "user", "content": "hey, how's it going?"}]}
 # headers = {'Content-Type': 'application/json'}
 # response = requests.get(BASE_URL, headers=headers, data=json.dumps(data))
 # print(response.text)
@@ -31,63 +31,6 @@ litellm.set_verbose = True
 import time


-@pytest.mark.skip(reason="duplicate test of logging with callbacks")
-@pytest.mark.asyncio()
-async def test_async_prometheus_success_logging():
-    from litellm.integrations.prometheus import PrometheusLogger
-
-    pl = PrometheusLogger()
-    run_id = str(uuid.uuid4())
-
-    litellm.set_verbose = True
-    litellm.callbacks = [pl]
-
-    response = await litellm.acompletion(
-        model="claude-instant-1.2",
-        messages=[{"role": "user", "content": "what llm are u"}],
-        max_tokens=10,
-        mock_response="hi",
-        temperature=0.2,
-        metadata={
-            "id": run_id,
-            "tags": ["tag1", "tag2"],
-            "user_api_key": "6eb81e014497d89f3cc1aa9da7c2b37bda6b7fea68e4b710d33d94201e68970c",
-            "user_api_key_alias": "ishaans-prometheus-key",
-            "user_api_end_user_max_budget": None,
-            "litellm_api_version": "1.40.19",
-            "global_max_parallel_requests": None,
-            "user_api_key_user_id": "admin",
-            "user_api_key_org_id": None,
-            "user_api_key_team_id": "dbe2f686-a686-4896-864a-4c3924458709",
-            "user_api_key_team_alias": "testing-team",
-        },
-    )
-    print(response)
-    await asyncio.sleep(3)
-
-    # get prometheus logger
-    test_prometheus_logger = pl
-    print("done with success request")
-
-    print(
-        "vars of test_prometheus_logger",
-        vars(test_prometheus_logger.litellm_requests_metric),
-    )
-
-    # Get the metrics
-    metrics = {}
-    for metric in REGISTRY.collect():
-        for sample in metric.samples:
-            metrics[sample.name] = sample.value
-
-    print("metrics from prometheus", metrics)
-    assert metrics["litellm_requests_metric_total"] == 1.0
-    assert metrics["litellm_total_tokens_total"] == 30.0
-    assert metrics["litellm_deployment_success_responses_total"] == 1.0
-    assert metrics["litellm_deployment_total_requests_total"] == 1.0
-    assert metrics["litellm_deployment_latency_per_output_token_bucket"] == 1.0
-
-
 @pytest.mark.asyncio()
 async def test_async_prometheus_success_logging_with_callbacks():

@@ -107,7 +50,7 @@ async def test_async_prometheus_success_logging_with_callbacks():
             initial_metrics[sample.name] = sample.value

     response = await litellm.acompletion(
-        model="claude-instant-1.2",
+        model="claude-3-haiku-20240307",
         messages=[{"role": "user", "content": "what llm are u"}],
         max_tokens=10,
         mock_response="hi",
@@ -18,7 +18,7 @@ import time
 # sys.stdout = new_stdout = io.StringIO()


-# response = completion(model="claude-instant-1.2",
+# response = completion(model="claude-3-5-haiku-20241022",
 # messages=[{
 # "role": "user",
 # "content": "Hi 👋 - i'm claude"
@@ -56,7 +56,7 @@ def claude_test_completion():
     try:
         # OVERRIDE WITH DYNAMIC MAX TOKENS
         response_1 = litellm.completion(
-            model="claude-instant-1.2",
+            model="claude-3-haiku-20240307",
             messages=[{"content": "Hello, how are you?", "role": "user"}],
             max_tokens=10,
         )
@@ -66,7 +66,7 @@ def claude_test_completion():

         # USE CONFIG TOKENS
         response_2 = litellm.completion(
-            model="claude-instant-1.2",
+            model="claude-3-haiku-20240307",
             messages=[{"content": "Hello, how are you?", "role": "user"}],
         )
         # Add any assertions here to check the response
@@ -77,7 +77,7 @@ def claude_test_completion():

     try:
         response_3 = litellm.completion(
-            model="claude-instant-1.2",
+            model="claude-3-5-haiku-20241022",
             messages=[{"content": "Hello, how are you?", "role": "user"}],
             n=2,
         )
@@ -10,7 +10,7 @@ sys.path.insert(
     0, os.path.abspath("../..")
 )  # Adds the parent directory to the system path
 import litellm
+from unittest.mock import MagicMock, patch, AsyncMock

 from litellm.proxy._types import LitellmUserRoles, UserAPIKeyAuth
 from litellm.proxy.auth.auth_utils import is_request_body_safe
@@ -465,3 +465,48 @@ def test_update_internal_user_params():
         updated_data_json["budget_duration"]
         == litellm.default_internal_user_params["budget_duration"]
     )


+@pytest.mark.asyncio
+async def test_proxy_config_update_from_db():
+    from litellm.proxy.proxy_server import ProxyConfig
+    from pydantic import BaseModel
+
+    proxy_config = ProxyConfig()
+
+    pc = AsyncMock()
+
+    test_config = {
+        "litellm_settings": {
+            "callbacks": ["prometheus", "otel"],
+        }
+    }
+
+    class ReturnValue(BaseModel):
+        param_name: str
+        param_value: dict
+
+    with patch.object(
+        pc,
+        "get_generic_data",
+        new=AsyncMock(
+            return_value=ReturnValue(
+                param_name="litellm_settings",
+                param_value={
+                    "success_callback": "langfuse",
+                },
+            )
+        ),
+    ):
+        new_config = await proxy_config._update_config_from_db(
+            prisma_client=pc,
+            config=test_config,
+            store_model_in_db=True,
+        )
+
+        assert new_config == {
+            "litellm_settings": {
+                "callbacks": ["prometheus", "otel"],
+                "success_callback": "langfuse",
+            }
+        }
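Note: the assertion above pins down the merge semantics: params read from the DB are layered onto the config file's `litellm_settings` without dropping existing keys. A rough, illustrative equivalent of that merge in plain Python (not the proxy's actual implementation):

```python
def merge_litellm_settings(file_config: dict, db_params: dict) -> dict:
    """Overlay DB-stored litellm_settings onto the file config (sketch only)."""
    merged = dict(file_config)
    settings = dict(merged.get("litellm_settings") or {})
    settings.update(db_params)  # DB values win on key collisions
    merged["litellm_settings"] = settings
    return merged


config = {"litellm_settings": {"callbacks": ["prometheus", "otel"]}}
print(merge_litellm_settings(config, {"success_callback": "langfuse"}))
# {'litellm_settings': {'callbacks': ['prometheus', 'otel'], 'success_callback': 'langfuse'}}
```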
@@ -1807,7 +1807,7 @@ def test_router_anthropic_key_dynamic():
         {
             "model_name": "anthropic-claude",
             "litellm_params": {
-                "model": "claude-instant-1.2",
+                "model": "claude-3-5-haiku-20241022",
                 "api_key": anthropic_api_key,
             },
         }
@@ -824,8 +824,8 @@ def test_ausage_based_routing_fallbacks():
             "rpm": OPENAI_RPM,
         },
         {
-            "model_name": "anthropic-claude-instant-1.2",
-            "litellm_params": get_anthropic_params("claude-instant-1.2"),
+            "model_name": "anthropic-claude-3-5-haiku-20241022",
+            "litellm_params": get_anthropic_params("claude-3-5-haiku-20241022"),
             "model_info": {"id": 4},
             "rpm": ANTHROPIC_RPM,
         },
@@ -834,7 +834,7 @@ def test_ausage_based_routing_fallbacks():
     fallbacks_list = [
         {"azure/gpt-4-fast": ["azure/gpt-4-basic"]},
         {"azure/gpt-4-basic": ["openai-gpt-4"]},
-        {"openai-gpt-4": ["anthropic-claude-instant-1.2"]},
+        {"openai-gpt-4": ["anthropic-claude-3-5-haiku-20241022"]},
     ]

     router = Router(
@@ -864,7 +864,7 @@ def test_ausage_based_routing_fallbacks():
     assert response._hidden_params["model_id"] == "1"

     for i in range(10):
-        # now make 100 mock requests to OpenAI - expect it to fallback to anthropic-claude-instant-1.2
+        # now make 100 mock requests to OpenAI - expect it to fallback to anthropic-claude-3-5-haiku-20241022
         response = router.completion(
             model="azure/gpt-4-fast",
             messages=messages,
@@ -17,6 +17,7 @@ from litellm.router import Deployment, LiteLLM_Params, ModelInfo
 from concurrent.futures import ThreadPoolExecutor
 from collections import defaultdict
 from dotenv import load_dotenv
+from unittest.mock import patch, MagicMock, AsyncMock

 load_dotenv()

@@ -155,3 +156,35 @@ def test_route_with_exception():

     result = router.route("openai/gpt-3.5-turbo")
     assert result is None


+def test_router_pattern_match_e2e():
+    """
+    Tests the end to end flow of the router
+    """
+    from litellm.llms.custom_httpx.http_handler import HTTPHandler
+
+    client = HTTPHandler()
+    router = Router(
+        model_list=[
+            {
+                "model_name": "llmengine/*",
+                "litellm_params": {"model": "anthropic/*", "api_key": "test"},
+            }
+        ]
+    )
+
+    with patch.object(client, "post", new=MagicMock()) as mock_post:
+
+        router.completion(
+            model="llmengine/my-custom-model",
+            messages=[{"role": "user", "content": "Hello, how are you?"}],
+            client=client,
+            api_key="test",
+        )
+        mock_post.assert_called_once()
+        print(mock_post.call_args.kwargs["data"])
+        mock_post.call_args.kwargs["data"] == {
+            "model": "gpt-4o",
+            "messages": [{"role": "user", "content": "Hello, how are you?"}],
+        }
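Note: the end-to-end test above exercises the router's wildcard pattern matching, where a deployment registered as `llmengine/*` with `litellm_params.model = "anthropic/*"` rewrites the matched suffix onto the provider route. A self-contained sketch of the same configuration (the `llmengine` alias is arbitrary and `mock_response` keeps the sketch offline):

```python
from litellm import Router

router = Router(
    model_list=[
        {
            # requests for "llmengine/<anything>" are rewritten to "anthropic/<anything>"
            "model_name": "llmengine/*",
            "litellm_params": {
                "model": "anthropic/*",
                "api_key": "test",
                "mock_response": "hello world",  # avoids a real provider call
            },
        }
    ]
)

resp = router.completion(
    model="llmengine/claude-3-5-haiku-20241022",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
)
print(resp.choices[0].message.content)
```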
@@ -38,9 +38,9 @@ def test_router_timeouts():
             "tpm": 80000,
         },
         {
-            "model_name": "anthropic-claude-instant-1.2",
+            "model_name": "anthropic-claude-3-5-haiku-20241022",
             "litellm_params": {
-                "model": "claude-instant-1.2",
+                "model": "claude-3-5-haiku-20241022",
                 "api_key": "os.environ/ANTHROPIC_API_KEY",
                 "mock_response": "hello world",
             },
@@ -49,7 +49,7 @@ def test_router_timeouts():
     ]

     fallbacks_list = [
-        {"openai-gpt-4": ["anthropic-claude-instant-1.2"]},
+        {"openai-gpt-4": ["anthropic-claude-3-5-haiku-20241022"]},
     ]

     # Configure router
@@ -681,7 +681,7 @@ def test_completion_ollama_hosted_stream():
 @pytest.mark.parametrize(
     "model",
     [
-        # "claude-instant-1.2",
+        # "claude-3-5-haiku-20241022",
         # "claude-2",
         # "mistral/mistral-medium",
         "openrouter/openai/gpt-4o-mini",
@@ -1112,7 +1112,7 @@ def test_completion_claude_stream_bad_key():
         },
     ]
     response = completion(
-        model="claude-instant-1",
+        model="claude-3-5-haiku-20241022",
         messages=messages,
         stream=True,
         max_tokens=50,
@@ -1,6 +1,6 @@
 #### What this tests ####
 # This tests litellm.token_counter() function
+import traceback
 import os
 import sys
 import time
@@ -116,7 +116,9 @@ def test_tokenizers():
         openai_tokens = token_counter(model="gpt-3.5-turbo", text=sample_text)

         # claude tokenizer
-        claude_tokens = token_counter(model="claude-instant-1", text=sample_text)
+        claude_tokens = token_counter(
+            model="claude-3-5-haiku-20241022", text=sample_text
+        )

         # cohere tokenizer
         cohere_tokens = token_counter(model="command-nightly", text=sample_text)
@@ -167,8 +169,9 @@ def test_encoding_and_decoding():
         assert openai_text == sample_text

         # claude encoding + decoding
-        claude_tokens = encode(model="claude-instant-1", text=sample_text)
-        claude_text = decode(model="claude-instant-1", tokens=claude_tokens.ids)
+        claude_tokens = encode(model="claude-3-5-haiku-20241022", text=sample_text)
+        claude_text = decode(model="claude-3-5-haiku-20241022", tokens=claude_tokens)

         assert claude_text == sample_text

@@ -186,7 +189,7 @@ def test_encoding_and_decoding():

         assert llama2_text == sample_text
     except Exception as e:
-        pytest.fail(f"An exception occured: {e}")
+        pytest.fail(f"An exception occured: {e}\n{traceback.format_exc()}")


 # test_encoding_and_decoding()
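Note: for reference, the updated tokenizer helpers look like this when called directly; `token_counter`, `encode`, and `decode` are the same helpers the test imports, and the sample text is arbitrary:

```python
from litellm import token_counter, encode, decode

sample_text = "The quick brown fox jumps over the lazy dog."

# token_counter picks the tokenizer that matches the model name
n_tokens = token_counter(model="claude-3-5-haiku-20241022", text=sample_text)

# encode/decode round-trip with the same model's tokenizer; note the diff now
# passes the token list straight to decode instead of claude_tokens.ids
tokens = encode(model="claude-3-5-haiku-20241022", text=sample_text)
round_tripped = decode(model="claude-3-5-haiku-20241022", tokens=tokens)

print(n_tokens, round_tripped == sample_text)
```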
@@ -26,7 +26,7 @@ def exporter():
     return exporter


-@pytest.mark.parametrize("model", ["claude-instant-1.2", "gpt-3.5-turbo"])
+@pytest.mark.parametrize("model", ["claude-3-5-haiku-20241022", "gpt-3.5-turbo"])
 def test_traceloop_logging(exporter, model):
     litellm.completion(
         model=model,
@@ -57,7 +57,7 @@ test_wandb_logging_async()
 def test_wandb_logging():
     try:
         response = completion(
-            model="claude-instant-1.2",
+            model="claude-3-5-haiku-20241022",
            messages=[{"role": "user", "content": "Hi 👋 - i'm claude"}],
            max_tokens=10,
            temperature=0.2,
@@ -1,19 +1,13 @@
-import json
 import os
 import sys
+import threading
 from datetime import datetime

-from pydantic.main import Model
-
 sys.path.insert(
     0, os.path.abspath("../..")
 )  # Adds the parent directory to the system-path

 import pytest
-import litellm
-import asyncio
-import logging
-from litellm._logging import verbose_logger
 from litellm.integrations.langfuse.langfuse import (
     LangFuseLogger,
 )
@@ -217,3 +211,27 @@ def test_get_langfuse_logger_for_request_with_cached_logger():

     assert result == cached_logger
     mock_cache.get_cache.assert_called_once()

+@pytest.mark.parametrize("metadata", [
+    {'a': 1, 'b': 2, 'c': 3},
+    {'a': {'nested_a': 1}, 'b': {'nested_b': 2}},
+    {'a': [1, 2, 3], 'b': {4, 5, 6}},
+    {'a': (1, 2), 'b': frozenset([3, 4]), 'c': {'d': [5, 6]}},
+    {'lock': threading.Lock()},
+    {'func': lambda x: x + 1},
+    {
+        'int': 42,
+        'str': 'hello',
+        'list': [1, 2, 3],
+        'set': {4, 5},
+        'dict': {'nested': 'value'},
+        'non_copyable': threading.Lock(),
+        'function': print
+    },
+    ['list', 'not', 'a', 'dict'],
+    {'timestamp': datetime.now()},
+    {},
+    None,
+])
+def test_langfuse_logger_prepare_metadata(metadata):
+    global_langfuse_logger._prepare_metadata(metadata)
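Note: the parametrized cases above feed `_prepare_metadata` values that `copy.deepcopy` rejects (locks, lambdas) plus non-dict inputs, matching the metadata-deepcopy fix mentioned in the commit message. An illustrative sketch of the failure mode and the defensive pattern under test (this helper is not the logger's actual code):

```python
import copy
import threading


def prepare_metadata(metadata):
    """Illustrative only: deep-copy metadata when possible, degrade gracefully otherwise."""
    if not isinstance(metadata, dict):
        return metadata
    try:
        # deepcopy raises TypeError on locks, open handles, and similar objects
        return copy.deepcopy(metadata)
    except (TypeError, copy.Error):
        # fall back to a per-key copy so logging never raises
        cleaned = {}
        for key, value in metadata.items():
            try:
                cleaned[key] = copy.deepcopy(value)
            except (TypeError, copy.Error):
                cleaned[key] = str(value)
        return cleaned


print(prepare_metadata({"id": "run-1", "lock": threading.Lock()}))
```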
@@ -986,3 +986,16 @@ def test_pattern_match_deployment_set_model_name(

     print(updated_model)  # Expected output: "openai/fo::hi:static::hello"
     assert updated_model == expected_model

+    updated_models = pattern_router._return_pattern_matched_deployments(
+        match,
+        deployments=[
+            {
+                "model_name": model_name,
+                "litellm_params": {"model": litellm_model},
+            }
+        ],
+    )
+
+    for model in updated_models:
+        assert model["litellm_params"]["model"] == expected_model
@@ -523,8 +523,8 @@ async def test_key_info_spend_values():


 @pytest.mark.asyncio
-@pytest.mark.flaky(retries=3, delay=1)
-async def test_key_info_spend_values_streaming():
+@pytest.mark.flaky(retries=6, delay=2)
+async def test_aaaaakey_info_spend_values_streaming():
     """
     Test to ensure spend is correctly calculated.
     - create key
@@ -545,7 +545,7 @@ async def test_key_info_spend_values_streaming():
         completion_tokens=completion_tokens,
     )
     response_cost = prompt_cost + completion_cost
-    await asyncio.sleep(5)  # allow db log to be updated
+    await asyncio.sleep(8)  # allow db log to be updated
     print(f"new_key: {new_key}")
     key_info = await get_key_info(
         session=session, get_key=new_key, call_key=new_key