litellm-mirror/litellm/llms/replicate/chat/handler.py
Krish Dholakia 3671829e39
All checks were successful
Read Version from pyproject.toml / read-version (push) Successful in 12s
Complete 'requests' library removal (#7350)
* refactor: initial commit moving watsonx_text to base_llm_http_handler + clarifying new provider directory structure

* refactor(watsonx/completion/handler.py): move to using base llm http handler

removes 'requests' library usage

* fix(watsonx_text/transformation.py): fix result transformation

migrates to transformation.py, for usage with base llm http handler

* fix(streaming_handler.py): migrate watsonx streaming to transformation.py

ensures streaming works with base llm http handler

* fix(streaming_handler.py): fix streaming linting errors and remove watsonx conditional logic

* fix(watsonx/): fix chat route post completion route refactor

* refactor(watsonx/embed): refactor watsonx to use base llm http handler for embedding calls as well

* refactor(base.py): remove requests library usage from litellm

* build(pyproject.toml): remove requests library usage

* fix: fix linting errors

* fix: fix linting errors

* fix(types/utils.py): fix validation errors for modelresponsestream

* fix(replicate/handler.py): fix linting errors

* fix(litellm_logging.py): handle modelresponsestream object

* fix(streaming_handler.py): fix modelresponsestream args

* fix: remove unused imports

* test: fix test

* fix: fix test

* test: fix test

* test: fix tests

* test: fix test

* test: fix patch target

* test: fix test
2024-12-22 07:21:25 -08:00

284 lines
10 KiB
Python

import asyncio
import json
import time
from typing import Callable, List, Union
import litellm
from litellm.llms.custom_httpx.http_handler import (
AsyncHTTPHandler,
HTTPHandler,
_get_httpx_client,
get_async_httpx_client,
)
from litellm.types.llms.openai import AllMessageValues
from litellm.utils import CustomStreamWrapper, ModelResponse
from ..common_utils import ReplicateError
from .transformation import ReplicateConfig
# Module-level ReplicateConfig instance reused by the sync and async completion
# entry points in this file (environment validation, request transformation,
# URL construction).
replicate_config = ReplicateConfig()
# Function to handle prediction response (streaming, synchronous)
def handle_prediction_response_streaming(
    prediction_url,
    api_token,
    print_verbose,
    headers: dict,
    http_client: "HTTPHandler",
    poll_interval: float = 0.5,
):
    """Poll a Replicate prediction and yield its output incrementally.

    The prediction URL is fetched repeatedly until the reported status is one
    of ``"succeeded"``, ``"failed"`` or ``"canceled"``. After every successful
    (HTTP 200) poll whose payload contains an ``output`` key, a dict of the
    form ``{"output": <text added since the previous poll>, "status": <status>}``
    is yielded.

    Args:
        prediction_url: URL that is polled with GET requests.
        api_token: Not used by this function; kept so existing callers that
            pass it positionally keep working.
        print_verbose: Callable used for debug logging.
        headers: Headers sent with every poll request.
        http_client: Client whose ``get(url, headers=...)`` returns a response
            exposing ``status_code``, ``json()``, ``text`` and ``headers``.
        poll_interval: Seconds to sleep before each poll. Defaults to 0.5,
            the value that used to be hard-coded.

    Yields:
        dict: ``{"output": str, "status": str}`` for each poll that carried an
        ``output`` key.

    Raises:
        ReplicateError: With status 422 if ``output`` cannot be joined into a
            string, or with status 400 once the prediction reports
            ``status == "failed"``.
    """
    terminal_statuses = ("succeeded", "failed", "canceled")
    previous_output = ""
    status = ""
    # NOTE: there is no overall timeout; the loop only ends on a terminal status.
    while status not in terminal_statuses:
        time.sleep(poll_interval)  # prevent being rate limited by replicate
        print_verbose(f"replicate: polling endpoint: {prediction_url}")
        response = http_client.get(prediction_url, headers=headers)

        if response.status_code != 200:
            # this can fail temporarily but it does not mean the replicate request failed, replicate request fails when status=="failed"
            print_verbose(
                f"Replicate: Failed to fetch prediction status and output.{response.status_code}{response.text}"
            )
            continue

        response_data = response.json()
        status = response_data["status"]

        if "output" in response_data:
            raw_output = response_data["output"]
            try:
                # A null `output` means nothing has been generated yet (or the
                # prediction failed before producing anything). Treat it as
                # empty instead of failing with a misleading 422, so a
                # "failed" status below can still report the real error.
                output_string = "" if raw_output is None else "".join(raw_output)
            except Exception:
                raise ReplicateError(
                    status_code=422,
                    message="Unable to parse response. Got={}".format(raw_output),
                    headers=response.headers,
                )
            # Only forward the part of the text not seen on the previous poll.
            new_output = output_string[len(previous_output) :]
            print_verbose(f"New chunk: {new_output}")
            yield {"output": new_output, "status": status}
            previous_output = output_string

        if status == "failed":
            replicate_error = response_data.get("error", "")
            raise ReplicateError(
                status_code=400,
                message=f"Error: {replicate_error}",
                headers=response.headers,
            )
# Function to handle prediction response (streaming, asynchronous)
async def async_handle_prediction_response_streaming(
    prediction_url,
    api_token,
    print_verbose,
    headers: dict,
    http_client: "AsyncHTTPHandler",
    poll_interval: float = 0.5,
):
    """Asynchronously poll a Replicate prediction and yield its output incrementally.

    The prediction URL is fetched repeatedly until the reported status is one
    of ``"succeeded"``, ``"failed"`` or ``"canceled"``. After every successful
    (HTTP 200) poll whose payload contains an ``output`` key, a dict of the
    form ``{"output": <text added since the previous poll>, "status": <status>}``
    is yielded.

    Args:
        prediction_url: URL that is polled with GET requests.
        api_token: Not used by this function; kept so existing callers that
            pass it positionally keep working.
        print_verbose: Callable used for debug logging.
        headers: Headers sent with every poll request.
        http_client: Client whose awaitable ``get(url, headers=...)`` returns
            a response exposing ``status_code``, ``json()``, ``text`` and
            ``headers``.
        poll_interval: Seconds to sleep (without blocking the event loop)
            before each poll. Defaults to 0.5, the value that used to be
            hard-coded.

    Yields:
        dict: ``{"output": str, "status": str}`` for each poll that carried an
        ``output`` key.

    Raises:
        ReplicateError: With status 422 if ``output`` cannot be joined into a
            string, or with status 400 once the prediction reports
            ``status == "failed"``.
    """
    terminal_statuses = ("succeeded", "failed", "canceled")
    previous_output = ""
    status = ""
    # NOTE: there is no overall timeout; the loop only ends on a terminal status.
    while status not in terminal_statuses:
        await asyncio.sleep(poll_interval)  # prevent being rate limited by replicate
        print_verbose(f"replicate: polling endpoint: {prediction_url}")
        response = await http_client.get(prediction_url, headers=headers)

        if response.status_code != 200:
            # this can fail temporarily but it does not mean the replicate request failed, replicate request fails when status=="failed"
            print_verbose(
                f"Replicate: Failed to fetch prediction status and output.{response.status_code}{response.text}"
            )
            continue

        response_data = response.json()
        status = response_data["status"]

        if "output" in response_data:
            raw_output = response_data["output"]
            try:
                # A null `output` means nothing has been generated yet (or the
                # prediction failed before producing anything). Treat it as
                # empty instead of failing with a misleading 422, so a
                # "failed" status below can still report the real error.
                output_string = "" if raw_output is None else "".join(raw_output)
            except Exception:
                raise ReplicateError(
                    status_code=422,
                    message="Unable to parse response. Got={}".format(raw_output),
                    headers=response.headers,
                )
            # Only forward the part of the text not seen on the previous poll.
            new_output = output_string[len(previous_output) :]
            print_verbose(f"New chunk: {new_output}")
            yield {"output": new_output, "status": status}
            previous_output = output_string

        if status == "failed":
            replicate_error = response_data.get("error", "")
            raise ReplicateError(
                status_code=400,
                message=f"Error: {replicate_error}",
                headers=response.headers,
            )
# Main function for prediction completion
def completion(
    model: str,
    messages: list,
    api_base: str,
    model_response: ModelResponse,
    print_verbose: Callable,
    optional_params: dict,
    litellm_params: dict,
    logging_obj,
    api_key,
    encoding,
    custom_prompt_dict=None,
    logger_fn=None,
    acompletion=None,
    headers=None,
) -> Union[ModelResponse, CustomStreamWrapper]:
    """Create a Replicate prediction and return its result.

    The request is built via the module-level ``replicate_config``. Depending
    on the arguments this either

    * delegates to ``async_completion`` (``acompletion is True``) and returns
      its coroutine,
    * returns a ``CustomStreamWrapper`` around the streaming poller
      (``optional_params["stream"] is True``), or
    * polls the prediction URL until it is no longer in progress and returns
      the transformed response.

    Args:
        model: Replicate model identifier.
        messages: Chat messages forwarded to the request transformation.
        api_base: Base URL used to build the prediction-creation URL.
        model_response: Response object to populate; its ``created`` field is
            stamped right before the API call.
        print_verbose: Callable used for debug logging.
        optional_params: Provider parameters; ``"stream"`` selects streaming.
        litellm_params: Forwarded to the request/response transformations.
        logging_obj: Logging object forwarded to the response handling.
        api_key: Replicate API key.
        encoding: Forwarded to the response transformation.
        custom_prompt_dict: Unused here; accepted for interface compatibility.
        logger_fn: Unused here; accepted for interface compatibility.
        acompletion: When ``True``, the async code path is used.
        headers: Extra request headers. Defaults to a fresh empty dict per
            call (previously a shared mutable default).

    Raises:
        ReplicateError: With status 500 when the prediction is still in
            progress after the configured number of polling attempts.
    """
    # A fresh dict per call: the previous `headers={}` default was shared
    # between calls and is handed to validate_environment below.
    if headers is None:
        headers = {}

    headers = replicate_config.validate_environment(
        api_key=api_key,
        headers=headers,
        model=model,
        messages=messages,
        optional_params=optional_params,
    )
    # Start a prediction and get the prediction URL
    version_id = replicate_config.model_to_version_id(model)
    input_data = replicate_config.transform_request(
        model=model,
        messages=messages,
        optional_params=optional_params,
        litellm_params=litellm_params,
        headers=headers,
    )

    if acompletion is True:
        return async_completion(
            model_response=model_response,
            model=model,
            encoding=encoding,
            messages=messages,
            optional_params=optional_params,
            litellm_params=litellm_params,
            version_id=version_id,
            input_data=input_data,
            api_key=api_key,
            api_base=api_base,
            logging_obj=logging_obj,
            print_verbose=print_verbose,
            headers=headers,
        )  # type: ignore

    ## COMPLETION CALL
    model_response.created = int(
        time.time()
    )  # for pricing this must remain right before calling api

    prediction_url = replicate_config.get_complete_url(
        api_base=api_base, model=model, optional_params=optional_params
    )

    httpx_client = _get_httpx_client(
        params={"timeout": 600.0},
    )
    response = httpx_client.post(
        url=prediction_url,
        headers=headers,
        data=json.dumps(input_data),
    )
    # From here on `prediction_url` points at the created prediction.
    prediction_url = replicate_config.get_prediction_url(response)

    # Handle the prediction response (streaming or non-streaming)
    if optional_params.get("stream") is True:
        print_verbose("streaming request")
        _response = handle_prediction_response_streaming(
            prediction_url,
            api_key,
            print_verbose,
            headers=headers,
            http_client=httpx_client,
        )
        return CustomStreamWrapper(_response, model, logging_obj=logging_obj, custom_llm_provider="replicate")  # type: ignore

    # Same polling constants as the async path, so both behave alike.
    for _ in range(litellm.DEFAULT_REPLICATE_POLLING_RETRIES):
        time.sleep(
            litellm.DEFAULT_REPLICATE_POLLING_DELAY_SECONDS
        )  # wait to allow response to be generated by replicate - else partial output is generated with status=="processing"
        response = httpx_client.get(url=prediction_url, headers=headers)
        if response.status_code == 200 and response.json().get("status") in (
            "starting",
            "processing",
        ):
            # Prediction still running: poll again instead of transforming a
            # partial result. Non-200 responses fall through unchanged.
            continue
        return litellm.ReplicateConfig().transform_response(
            model=model,
            raw_response=response,
            model_response=model_response,
            logging_obj=logging_obj,
            api_key=api_key,
            request_data=input_data,
            messages=messages,
            optional_params=optional_params,
            litellm_params=litellm_params,
            encoding=encoding,
        )

    raise ReplicateError(
        status_code=500,
        message="No response received from Replicate API after max retries",
        headers=None,
    )
async def async_completion(
    model_response: ModelResponse,
    model: str,
    messages: List[AllMessageValues],
    encoding,
    optional_params: dict,
    litellm_params: dict,
    version_id,
    input_data,
    api_key,
    api_base,
    logging_obj,
    print_verbose,
    headers: dict,
) -> Union[ModelResponse, CustomStreamWrapper]:
    """Async counterpart of ``completion``: create a prediction and return its result.

    Posts ``input_data`` to the prediction-creation URL, then either wraps the
    async streaming poller in a ``CustomStreamWrapper``
    (``optional_params["stream"] is True``) or polls the prediction URL until
    the prediction is no longer in progress and returns the transformed
    response.

    Args:
        model_response: Response object forwarded to the response transformation.
        model: Replicate model identifier.
        messages: Chat messages forwarded to the response transformation.
        encoding: Forwarded to the response transformation.
        optional_params: Provider parameters; ``"stream"`` selects streaming.
        litellm_params: Forwarded to the response transformation.
        version_id: Unused here; accepted for interface compatibility.
        input_data: Already-transformed request payload (JSON-serialisable).
        api_key: Replicate API key.
        api_base: Base URL used to build the prediction-creation URL.
        logging_obj: Logging object forwarded to the response handling.
        print_verbose: Callable used for debug logging.
        headers: Request headers, expected to be validated by the caller.

    Raises:
        ReplicateError: With status 500 when the prediction is still in
            progress after the configured number of polling attempts.
    """
    prediction_url = replicate_config.get_complete_url(
        api_base=api_base, model=model, optional_params=optional_params
    )
    async_handler = get_async_httpx_client(
        llm_provider=litellm.LlmProviders.REPLICATE,
        params={"timeout": 600.0},
    )
    response = await async_handler.post(
        url=prediction_url, headers=headers, data=json.dumps(input_data)
    )
    # From here on `prediction_url` points at the created prediction.
    prediction_url = replicate_config.get_prediction_url(response)

    if optional_params.get("stream") is True:
        _response = async_handle_prediction_response_streaming(
            prediction_url,
            api_key,
            print_verbose,
            headers=headers,
            http_client=async_handler,
        )
        return CustomStreamWrapper(_response, model, logging_obj=logging_obj, custom_llm_provider="replicate")  # type: ignore

    for _ in range(litellm.DEFAULT_REPLICATE_POLLING_RETRIES):
        await asyncio.sleep(
            litellm.DEFAULT_REPLICATE_POLLING_DELAY_SECONDS
        )  # wait to allow response to be generated by replicate - else partial output is generated with status=="processing"
        response = await async_handler.get(url=prediction_url, headers=headers)
        if response.status_code == 200 and response.json().get("status") in (
            "starting",
            "processing",
        ):
            # Prediction still running: poll again instead of transforming a
            # partial result. Non-200 responses fall through unchanged.
            continue
        return litellm.ReplicateConfig().transform_response(
            model=model,
            raw_response=response,
            model_response=model_response,
            logging_obj=logging_obj,
            api_key=api_key,
            request_data=input_data,
            messages=messages,
            optional_params=optional_params,
            litellm_params=litellm_params,
            encoding=encoding,
        )

    # Reached only when every polling attempt still reported an in-progress prediction.
    raise ReplicateError(
        status_code=500,
        message="No response received from Replicate API after max retries",
        headers=None,
    )