LiteLLM Minor Fixes & Improvements (09/27/2024) (#5938)

* fix(langfuse.py): prevent double logging requester metadata

Fixes https://github.com/BerriAI/litellm/issues/5935

* build(model_prices_and_context_window.json): add mistral pixtral cost tracking

Closes https://github.com/BerriAI/litellm/issues/5837

* handle streaming for azure ai studio error

* [Perf Proxy] parallel request limiter - use one cache update call (#5932)

* fix parallel request limiter - use one cache update call

* ci/cd run again

* run ci/cd again

* use docker username password

* fix config.yml

* fix config

* fix config

* fix config.yml

* ci/cd run again

* use correct typing for batch set cache

* fix async_set_cache_pipeline

* fix only check user id tpm / rpm limits when limits set

* fix test_openai_azure_embedding_with_oidc_and_cf

* fix(groq/chat/transformation.py): Fixes https://github.com/BerriAI/litellm/issues/5839

* feat(anthropic/chat.py): return 'retry-after' headers from anthropic

Fixes https://github.com/BerriAI/litellm/issues/4387

* feat: raise validation error if message has tool calls without passing `tools` param for anthropic/bedrock

Closes https://github.com/BerriAI/litellm/issues/5747

* [Feature]#5940, add max_workers parameter for the batch_completion (#5947)

* handle streaming for azure ai studio error

* bump: version 1.48.2 → 1.48.3

* docs(data_security.md): add legal/compliance faq's

Make it easier for companies to use litellm

* docs: resolve imports

* [Feature]#5940, add max_workers parameter for the batch_completion method

---------

Co-authored-by: Ishaan Jaff <ishaanjaffer0324@gmail.com>
Co-authored-by: Krrish Dholakia <krrishdholakia@gmail.com>
Co-authored-by: josearangos <josearangos@Joses-MacBook-Pro.local>

* fix(converse_transformation.py): fix default message value

* fix(utils.py): fix get_model_info to handle finetuned models

Fixes issue for standard logging payloads, where model_map_value was null for finetuned openai models

* fix(litellm_pre_call_utils.py): add debug statement for data sent after updating with team/key callbacks

* fix: fix linting errors

* fix(anthropic/chat/handler.py): fix cache creation input tokens

* fix(exception_mapping_utils.py): fix missing imports

* fix(anthropic/chat/handler.py): fix usage block translation

* test: fix test

* test: fix tests

* style(types/utils.py): trigger new build

* test: fix test

---------

Co-authored-by: Ishaan Jaff <ishaanjaffer0324@gmail.com>
Co-authored-by: Jose Alberto Arango Sanchez <jose.arangos@udea.edu.co>
Co-authored-by: josearangos <josearangos@Joses-MacBook-Pro.local>
This commit is contained in:
Krish Dholakia 2024-09-27 22:52:57 -07:00 committed by GitHub
parent 754981a78f
commit 0b30e212da
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
35 changed files with 3657 additions and 2820 deletions

View file

@ -96,6 +96,7 @@ from .llms.cohere import completion as cohere_completion # type: ignore
from .llms.cohere import embed as cohere_embed
from .llms.custom_llm import CustomLLM, custom_chat_llm_router
from .llms.databricks.chat import DatabricksChatCompletion
from .llms.groq.chat.handler import GroqChatCompletion
from .llms.huggingface_restapi import Huggingface
from .llms.OpenAI.audio_transcriptions import OpenAIAudioTranscription
from .llms.OpenAI.chat.o1_handler import OpenAIO1ChatCompletion
@ -168,6 +169,7 @@ openai_text_completions = OpenAITextCompletion()
openai_o1_chat_completions = OpenAIO1ChatCompletion()
openai_audio_transcriptions = OpenAIAudioTranscription()
databricks_chat_completions = DatabricksChatCompletion()
groq_chat_completions = GroqChatCompletion()
azure_ai_chat_completions = AzureAIChatCompletion()
azure_ai_embedding = AzureAIEmbedding()
anthropic_chat_completions = AnthropicChatCompletion()
@ -958,6 +960,7 @@ def completion(
extra_headers=extra_headers,
api_version=api_version,
parallel_tool_calls=parallel_tool_calls,
messages=messages,
**non_default_params,
)
@ -1318,13 +1321,56 @@ def completion(
additional_args={"headers": headers},
)
response = _response
elif custom_llm_provider == "groq":
api_base = (
api_base # for deepinfra/perplexity/anyscale/groq/friendliai we check in get_llm_provider and pass in the api base from there
or litellm.api_base
or get_secret("GROQ_API_BASE")
or "https://api.groq.com/openai/v1"
)
# set API KEY
api_key = (
api_key
or litellm.api_key # for deepinfra/perplexity/anyscale/friendliai we check in get_llm_provider and pass in the api key from there
or litellm.groq_key
or get_secret("GROQ_API_KEY")
)
headers = headers or litellm.headers
## LOAD CONFIG - if set
config = litellm.GroqChatConfig.get_config()
for k, v in config.items():
if (
k not in optional_params
): # completion(top_k=3) > openai_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
response = groq_chat_completions.completion(
model=model,
messages=messages,
headers=headers,
model_response=model_response,
print_verbose=print_verbose,
api_key=api_key,
api_base=api_base,
acompletion=acompletion,
logging_obj=logging,
optional_params=optional_params,
litellm_params=litellm_params,
logger_fn=logger_fn,
timeout=timeout, # type: ignore
custom_prompt_dict=custom_prompt_dict,
client=client, # pass AsyncOpenAI, OpenAI client
organization=organization,
custom_llm_provider=custom_llm_provider,
)
elif (
model in litellm.open_ai_chat_completion_models
or custom_llm_provider == "custom_openai"
or custom_llm_provider == "deepinfra"
or custom_llm_provider == "perplexity"
or custom_llm_provider == "groq"
or custom_llm_provider == "nvidia_nim"
or custom_llm_provider == "cerebras"
or custom_llm_provider == "sambanova"
@ -1431,6 +1477,7 @@ def completion(
original_response=response,
additional_args={"headers": headers},
)
elif (
"replicate" in model
or custom_llm_provider == "replicate"
@ -2933,6 +2980,7 @@ def batch_completion(
deployment_id=None,
request_timeout: Optional[int] = None,
timeout: Optional[int] = 600,
max_workers:Optional[int]= 100,
# Optional liteLLM function params
**kwargs,
):
@ -2956,6 +3004,7 @@ def batch_completion(
user (str, optional): The user string for generating completions. Defaults to "".
deployment_id (optional): The deployment ID for generating completions. Defaults to None.
request_timeout (int, optional): The request timeout for generating completions. Defaults to None.
max_workers (int,optional): The maximum number of threads to use for parallel processing.
Returns:
list: A list of completion results.
@ -3001,7 +3050,7 @@ def batch_completion(
for i in range(0, len(lst), n):
yield lst[i : i + n]
with ThreadPoolExecutor(max_workers=100) as executor:
with ThreadPoolExecutor(max_workers=max_workers) as executor:
for sub_batch in chunks(batch_messages, 100):
for message_list in sub_batch:
kwargs_modified = args.copy()