Mirror of https://github.com/BerriAI/litellm.git, synced 2025-04-26 03:04:13 +00:00
Merge pull request #1344 from BerriAI/litellm_speed_improvements

Litellm speed improvements

Commit: 439ee3bafc

7 changed files with 46 additions and 82 deletions
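Most of the diff replaces `json.loads(response.model_dump_json())` with `response.model_dump()` across the Azure and OpenAI handlers (completion, embedding, and image generation, sync and async). Instead of serializing the pydantic response model to a JSON string and immediately parsing that string back into a dict, `model_dump()` produces the dict directly, which `convert_to_model_response_object` consumes as-is. A minimal sketch of the two patterns, using a hypothetical pydantic v2 model in place of the SDK response types:

```python
import json
import timeit

from pydantic import BaseModel


class Choice(BaseModel):
    # Hypothetical stand-in for the SDK's nested response types.
    index: int
    message: str


class ChatResponse(BaseModel):
    id: str
    choices: list[Choice]


resp = ChatResponse(
    id="chatcmpl-123",
    choices=[Choice(index=i, message="hi") for i in range(10)],
)


def old() -> dict:
    # Old pattern: model -> JSON string -> dict (serialize, then re-parse).
    return json.loads(resp.model_dump_json())


def new() -> dict:
    # New pattern: model -> dict directly, no string round-trip.
    return resp.model_dump()


assert old() == new()  # identical output either way
print("old:", timeit.timeit(old, number=10_000))
print("new:", timeit.timeit(new, number=10_000))
```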
@@ -248,7 +248,7 @@ class AzureChatCompletion(BaseLLM):
             else:
                 azure_client = client
             response = azure_client.chat.completions.create(**data, timeout=timeout)  # type: ignore
-            stringified_response = response.model_dump_json()
+            stringified_response = response.model_dump()
             ## LOGGING
             logging_obj.post_call(
                 input=messages,
@@ -261,7 +261,7 @@ class AzureChatCompletion(BaseLLM):
                 },
             )
             return convert_to_model_response_object(
-                response_object=json.loads(stringified_response),
+                response_object=stringified_response,
                 model_response_object=model_response,
             )
         except AzureOpenAIError as e:
@@ -323,7 +323,7 @@ class AzureChatCompletion(BaseLLM):
                 **data, timeout=timeout
             )
             return convert_to_model_response_object(
-                response_object=json.loads(response.model_dump_json()),
+                response_object=response.model_dump(),
                 model_response_object=model_response,
             )
         except AzureOpenAIError as e:
@@ -465,7 +465,7 @@ class AzureChatCompletion(BaseLLM):
             else:
                 openai_aclient = client
             response = await openai_aclient.embeddings.create(**data, timeout=timeout)
-            stringified_response = response.model_dump_json()
+            stringified_response = response.model_dump()
             ## LOGGING
             logging_obj.post_call(
                 input=input,
@@ -474,7 +474,7 @@ class AzureChatCompletion(BaseLLM):
                 original_response=stringified_response,
             )
             return convert_to_model_response_object(
-                response_object=json.loads(stringified_response),
+                response_object=stringified_response,
                 model_response_object=model_response,
                 response_type="embedding",
             )
@@ -564,7 +564,7 @@ class AzureChatCompletion(BaseLLM):
                 original_response=response,
             )

-            return convert_to_model_response_object(response_object=json.loads(response.model_dump_json()), model_response_object=model_response, response_type="embedding")  # type: ignore
+            return convert_to_model_response_object(response_object=response.model_dump(), model_response_object=model_response, response_type="embedding")  # type: ignore
         except AzureOpenAIError as e:
             exception_mapping_worked = True
             raise e
@@ -599,7 +599,7 @@ class AzureChatCompletion(BaseLLM):
             else:
                 openai_aclient = client
             response = await openai_aclient.images.generate(**data, timeout=timeout)
-            stringified_response = response.model_dump_json()
+            stringified_response = response.model_dump()
             ## LOGGING
             logging_obj.post_call(
                 input=input,
@@ -608,7 +608,7 @@ class AzureChatCompletion(BaseLLM):
                 original_response=stringified_response,
             )
             return convert_to_model_response_object(
-                response_object=json.loads(stringified_response),
+                response_object=stringified_response,
                 model_response_object=model_response,
                 response_type="image_generation",
             )
@@ -697,7 +697,7 @@ class AzureChatCompletion(BaseLLM):
                 original_response=response,
             )
             # return response
-            return convert_to_model_response_object(response_object=json.loads(response.model_dump_json()), model_response_object=model_response, response_type="image_generation")  # type: ignore
+            return convert_to_model_response_object(response_object=response.model_dump(), model_response_object=model_response, response_type="image_generation")  # type: ignore
         except AzureOpenAIError as e:
             exception_mapping_worked = True
             raise e

@@ -280,18 +280,6 @@ class OpenAIChatCompletion(BaseLLM):
                     max_retries=max_retries,
                 )
             else:
-                ## LOGGING
-                logging_obj.pre_call(
-                    input=messages,
-                    api_key=api_key,
-                    additional_args={
-                        "headers": headers,
-                        "api_base": api_base,
-                        "acompletion": acompletion,
-                        "complete_input_dict": data,
-                    },
-                )
-
                 if not isinstance(max_retries, int):
                     raise OpenAIError(
                         status_code=422, message="max retries must be an int"
@@ -306,8 +294,21 @@ class OpenAIChatCompletion(BaseLLM):
                     )
                 else:
                     openai_client = client
+
+                ## LOGGING
+                logging_obj.pre_call(
+                    input=messages,
+                    api_key=openai_client.api_key,
+                    additional_args={
+                        "headers": headers,
+                        "api_base": openai_client._base_url._uri_reference,
+                        "acompletion": acompletion,
+                        "complete_input_dict": data,
+                    },
+                )
+
                 response = openai_client.chat.completions.create(**data, timeout=timeout)  # type: ignore
-                stringified_response = response.model_dump_json()
+                stringified_response = response.model_dump()
                 logging_obj.post_call(
                     input=messages,
                     api_key=api_key,
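These two hunks also move the pre-call log from the top of the non-streaming branch to just after the client is resolved, so the log records the key and base URL of the client actually used: a caller-supplied `client` may carry different credentials than the `api_key`/`api_base` arguments. A sketch of the distinction, assuming the `openai` v1 SDK with placeholder values; note the diff reads the private `_base_url._uri_reference` attribute, while the sketch sticks to the public `base_url` property:

```python
from openai import OpenAI

# A caller-supplied client can carry its own credentials and endpoint.
client = OpenAI(api_key="sk-client-key", base_url="https://proxy.example.com/v1")

api_key = "sk-argument-key"  # what the old code logged, even when unused

# The moved pre-call log reads the fields off the client it is about to use.
print(client.api_key)   # sk-client-key
print(client.base_url)  # https://proxy.example.com/v1
```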
@@ -315,7 +316,7 @@ class OpenAIChatCompletion(BaseLLM):
                     additional_args={"complete_input_dict": data},
                 )
                 return convert_to_model_response_object(
-                    response_object=json.loads(stringified_response),
+                    response_object=stringified_response,
                     model_response_object=model_response,
                 )
         except Exception as e:
@@ -386,7 +387,7 @@ class OpenAIChatCompletion(BaseLLM):
                 response = await openai_aclient.chat.completions.create(
                     **data, timeout=timeout
                 )
-                stringified_response = response.model_dump_json()
+                stringified_response = response.model_dump()
                 logging_obj.post_call(
                     input=data["messages"],
                     api_key=api_key,
@@ -394,7 +395,7 @@ class OpenAIChatCompletion(BaseLLM):
                     additional_args={"complete_input_dict": data},
                 )
                 return convert_to_model_response_object(
-                    response_object=json.loads(stringified_response),
+                    response_object=stringified_response,
                     model_response_object=model_response,
                 )
         except Exception as e:
@@ -527,7 +528,7 @@ class OpenAIChatCompletion(BaseLLM):
             else:
                 openai_aclient = client
             response = await openai_aclient.embeddings.create(**data, timeout=timeout)  # type: ignore
-            stringified_response = response.model_dump_json()
+            stringified_response = response.model_dump()
             ## LOGGING
             logging_obj.post_call(
                 input=input,
@@ -535,7 +536,7 @@ class OpenAIChatCompletion(BaseLLM):
                 additional_args={"complete_input_dict": data},
                 original_response=stringified_response,
             )
-            return convert_to_model_response_object(response_object=json.loads(stringified_response), model_response_object=model_response, response_type="embedding")  # type: ignore
+            return convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, response_type="embedding")  # type: ignore
         except Exception as e:
             ## LOGGING
             logging_obj.post_call(
@@ -597,7 +598,7 @@ class OpenAIChatCompletion(BaseLLM):
                 original_response=response,
             )

-            return convert_to_model_response_object(response_object=json.loads(response.model_dump_json()), model_response_object=model_response, response_type="embedding")  # type: ignore
+            return convert_to_model_response_object(response_object=response.model_dump(), model_response_object=model_response, response_type="embedding")  # type: ignore
         except OpenAIError as e:
             exception_mapping_worked = True
             raise e
@@ -634,7 +635,7 @@ class OpenAIChatCompletion(BaseLLM):
             else:
                 openai_aclient = client
             response = await openai_aclient.images.generate(**data, timeout=timeout)  # type: ignore
-            stringified_response = response.model_dump_json()
+            stringified_response = response.model_dump()
             ## LOGGING
             logging_obj.post_call(
                 input=prompt,
@@ -642,7 +643,7 @@ class OpenAIChatCompletion(BaseLLM):
                 additional_args={"complete_input_dict": data},
                 original_response=stringified_response,
             )
-            return convert_to_model_response_object(response_object=json.loads(stringified_response), model_response_object=model_response, response_type="image_generation")  # type: ignore
+            return convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, response_type="image_generation")  # type: ignore
         except Exception as e:
             ## LOGGING
             logging_obj.post_call(
@@ -710,7 +711,7 @@ class OpenAIChatCompletion(BaseLLM):
                 original_response=response,
             )
             # return response
-            return convert_to_model_response_object(response_object=json.loads(response.model_dump_json()), model_response_object=model_response, response_type="image_generation")  # type: ignore
+            return convert_to_model_response_object(response_object=response.model_dump(), model_response_object=model_response, response_type="image_generation")  # type: ignore
         except OpenAIError as e:
             exception_mapping_worked = True
             raise e

@@ -360,14 +360,6 @@ def embedding(
     except Exception as e:
         raise SagemakerError(status_code=500, message=f"{str(e)}")

-    ## LOGGING
-    logging_obj.post_call(
-        input=input,
-        api_key="",
-        additional_args={"complete_input_dict": data},
-        original_response=response,
-    )
-
     response = json.loads(response["Body"].read().decode("utf8"))
     ## LOGGING
     logging_obj.post_call(
@@ -376,6 +368,7 @@ def embedding(
         original_response=response,
         additional_args={"complete_input_dict": data},
     )

     print_verbose(f"raw model_response: {response}")
     if "embedding" not in response:
+        raise SagemakerError(status_code=500, message="embedding not found in response")

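The SageMaker embedding path previously logged the raw `invoke_endpoint` response once before decoding and again after; the first post-call log is dropped, so the body is read and decoded once and only the decoded payload is logged. The second hunk adds a guard that raises when the payload has no `embedding` key. A sketch of the resulting flow, with an in-memory stand-in for the boto3 streaming body (`response` here is a hypothetical fake, not a real AWS call):

```python
import io
import json

# Hypothetical stand-in for the boto3 invoke_endpoint response dict.
response = {
    "Body": io.BytesIO(json.dumps({"embedding": [[0.1, 0.2, 0.3]]}).encode("utf8"))
}

# Decode once; the single post-call log would receive `decoded` here.
decoded = json.loads(response["Body"].read().decode("utf8"))

if "embedding" not in decoded:
    # SagemakerError(status_code=500, ...) in the real code
    raise RuntimeError("embedding not found in response")
print(decoded["embedding"])
```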
@@ -352,18 +352,16 @@ class Router:
             else:
                 model_client = potential_model_client
             self.total_calls[model_name] += 1
-            response = await asyncio.wait_for(
-                litellm.acompletion(
+            response = await litellm.acompletion(
                 **{
                     **data,
                     "messages": messages,
                     "caching": self.cache_responses,
                     "client": model_client,
+                    "timeout": self.timeout,
                     **kwargs,
                 }
-                ),
-                timeout=self.timeout,
-            )
+            )
             self.success_calls[model_name] += 1
             return response
         except Exception as e:
@@ -614,18 +612,16 @@ class Router:
             else:
                 model_client = potential_model_client
             self.total_calls[model_name] += 1
-            response = await asyncio.wait_for(
-                litellm.atext_completion(
+            response = await litellm.atext_completion(
                 **{
                     **data,
                     "prompt": prompt,
                     "caching": self.cache_responses,
                     "client": model_client,
+                    "timeout": self.timeout,
                     **kwargs,
                 }
-                ),
-                timeout=self.timeout,
-            )
+            )
             self.success_calls[model_name] += 1
             return response
         except Exception as e:
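Both router hunks drop the `asyncio.wait_for` wrapper and instead pass `timeout` through to the underlying call. `wait_for` schedules its awaitable as a separate task, so every routed request paid for an extra task plus a second timeout layered on top of one the inner call already enforces via its `timeout` kwarg. A minimal sketch of the before/after shape, with a hypothetical `inner_call` standing in for `litellm.acompletion`:

```python
import asyncio


async def inner_call(timeout: float) -> str:
    # Pretend the client library enforces `timeout` itself.
    return await asyncio.wait_for(asyncio.sleep(0.01, result="ok"), timeout)


async def main() -> None:
    # Old shape: wrap the call in wait_for, duplicating the timeout and
    # spawning an extra task per request.
    r_old = await asyncio.wait_for(inner_call(timeout=1.0), timeout=1.0)

    # New shape: pass the timeout down and await the coroutine directly.
    r_new = await inner_call(timeout=1.0)
    assert r_old == r_new


asyncio.run(main())
```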
@@ -956,6 +956,8 @@ class Logging:
     ):
         # Log the exact result from the LLM API, for streaming - log the type of response received
         litellm.error_logs["POST_CALL"] = locals()
+        if isinstance(original_response, dict):
+            original_response = json.dumps(original_response)
         try:
             self.model_call_details["input"] = input
             self.model_call_details["api_key"] = api_key
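With callers now handing `post_call` a dict (from `model_dump()`) rather than a pre-serialized JSON string, the logger normalizes dicts back to a string once, at the one place a string is actually needed. A sketch of just that guard; `normalize` is a hypothetical name for the added lines:

```python
import json
from typing import Union


def normalize(original_response: Union[str, dict]) -> str:
    # Mirrors the guard added to Logging.post_call: dicts are serialized
    # once here instead of by every caller.
    if isinstance(original_response, dict):
        original_response = json.dumps(original_response)
    return original_response


assert normalize({"id": "chatcmpl-123"}) == '{"id": "chatcmpl-123"}'
assert normalize("already a string") == "already a string"
```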