Retry for replicate completion response of status=processing (#7901) (#7965)

We use the DEFAULT_REPLICATE_* constants for the retry count and initial
delay. If the completion response returns status=processing, we
loop and retry.

Fixes https://github.com/BerriAI/litellm/issues/7900

Signed-off-by: BJ Hargrave <hargrave@us.ibm.com>
Co-authored-by: BJ Hargrave <bj@hargrave.dev>
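For intuition, the new schedule waits DEFAULT_REPLICATE_POLLING_DELAY_SECONDS + 2 * retry seconds before poll number retry, i.e. a linear backoff. A minimal sketch of the resulting delays, assuming hypothetical values of 5 retries and a 1-second initial delay (the real defaults live in litellm's constants and are not shown in this diff):

# Illustrative only: the constant values below are assumptions,
# not litellm's actual defaults.
DEFAULT_REPLICATE_POLLING_RETRIES = 5        # assumed value
DEFAULT_REPLICATE_POLLING_DELAY_SECONDS = 1  # assumed value

delays = [
    DEFAULT_REPLICATE_POLLING_DELAY_SECONDS + 2 * retry
    for retry in range(DEFAULT_REPLICATE_POLLING_RETRIES)
]
print(delays)       # [1, 3, 5, 7, 9]
print(sum(delays))  # 25 -> worst-case total wait of 25s under these assumptions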
Krish Dholakia 2025-01-23 22:45:43 -08:00 committed by GitHub
parent fe460f19f5
commit b94f60632a

@@ -196,11 +196,16 @@ def completion(
         )
         return CustomStreamWrapper(_response, model, logging_obj=logging_obj, custom_llm_provider="replicate") # type: ignore
     else:
-        for _ in range(litellm.DEFAULT_MAX_RETRIES):
+        for retry in range(litellm.DEFAULT_REPLICATE_POLLING_RETRIES):
             time.sleep(
-                1
-            ) # wait 1s to allow response to be generated by replicate - else partial output is generated with status=="processing"
+                litellm.DEFAULT_REPLICATE_POLLING_DELAY_SECONDS + 2 * retry
+            ) # wait to allow response to be generated by replicate - else partial output is generated with status=="processing"
             response = httpx_client.get(url=prediction_url, headers=headers)
+            if (
+                response.status_code == 200
+                and response.json().get("status") == "processing"
+            ):
+                continue
             return litellm.ReplicateConfig().transform_response(
                 model=model,
                 raw_response=response,
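Read outside the diff context, the synchronous polling loop now behaves roughly as in the sketch below. This is a simplified stand-in, not the actual litellm code path: the constants are placeholders, plain httpx.get replaces litellm's shared client, and the raw JSON is returned instead of going through ReplicateConfig().transform_response().

import time

import httpx

POLLING_RETRIES = 5        # placeholder for litellm.DEFAULT_REPLICATE_POLLING_RETRIES
POLLING_DELAY_SECONDS = 1  # placeholder for litellm.DEFAULT_REPLICATE_POLLING_DELAY_SECONDS


def poll_prediction(prediction_url: str, headers: dict) -> dict:
    for retry in range(POLLING_RETRIES):
        # Linear backoff: each attempt waits 2s longer than the previous one.
        time.sleep(POLLING_DELAY_SECONDS + 2 * retry)
        response = httpx.get(url=prediction_url, headers=headers)
        # A 200 with status=="processing" means replicate is still generating;
        # returning now would hand back partial output, so poll again.
        if response.status_code == 200 and response.json().get("status") == "processing":
            continue
        return response.json()
    raise TimeoutError("prediction still processing after all polling retries")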
@@ -259,11 +264,16 @@ async def async_completion(
         )
         return CustomStreamWrapper(_response, model, logging_obj=logging_obj, custom_llm_provider="replicate") # type: ignore
-    for _ in range(litellm.DEFAULT_REPLICATE_POLLING_RETRIES):
+    for retry in range(litellm.DEFAULT_REPLICATE_POLLING_RETRIES):
         await asyncio.sleep(
-            litellm.DEFAULT_REPLICATE_POLLING_DELAY_SECONDS
-        ) # wait 1s to allow response to be generated by replicate - else partial output is generated with status=="processing"
+            litellm.DEFAULT_REPLICATE_POLLING_DELAY_SECONDS + 2 * retry
+        ) # wait to allow response to be generated by replicate - else partial output is generated with status=="processing"
         response = await async_handler.get(url=prediction_url, headers=headers)
+        if (
+            response.status_code == 200
+            and response.json().get("status") == "processing"
+        ):
+            continue
         return litellm.ReplicateConfig().transform_response(
             model=model,
             raw_response=response,
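The async hunk applies the same schedule with asyncio.sleep so the event loop is not blocked while waiting. A matching sketch under the same assumptions as above, with httpx.AsyncClient standing in for litellm's async handler:

import asyncio

import httpx

POLLING_RETRIES = 5        # placeholder, as in the sync sketch above
POLLING_DELAY_SECONDS = 1  # placeholder


async def poll_prediction_async(prediction_url: str, headers: dict) -> dict:
    async with httpx.AsyncClient() as client:
        for retry in range(POLLING_RETRIES):
            # Same linear backoff as the sync path, without blocking the loop.
            await asyncio.sleep(POLLING_DELAY_SECONDS + 2 * retry)
            response = await client.get(url=prediction_url, headers=headers)
            if response.status_code == 200 and response.json().get("status") == "processing":
                continue
            return response.json()
    raise TimeoutError("prediction still processing after all polling retries")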