From 825ce39879f55acf77681e1a38b1e32366884c4b Mon Sep 17 00:00:00 2001
From: Ben Browning
Date: Tue, 22 Apr 2025 11:47:53 -0400
Subject: [PATCH] fix: Together provider shutdown and default to non-streaming
 (#2001)

# What does this PR do?

The Together inference provider was throwing a stack trace every time it
shut down, because it was trying to call a non-existent `close` method on
the AsyncTogether client. While fixing that, I also adjusted the shutdown
logic to close the OpenAI client if we've created one, since that client
does have a `close` method. In testing that, I also realized we were
defaulting to treating all requests as streaming requests instead of
defaulting to non-streaming. So, this flips that default to non-streaming
to match how the other providers work.

## Test Plan

I tested this by ensuring the Together inference provider no longer
emits a long stack trace when shutting down, and by running the OpenAI
API chat completion verification suite to ensure the change to the
default streaming logic didn't break anything else.

Signed-off-by: Ben Browning
---
 .../providers/remote/inference/together/together.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/llama_stack/providers/remote/inference/together/together.py b/llama_stack/providers/remote/inference/together/together.py
index 001e6aac4..48e41f5b0 100644
--- a/llama_stack/providers/remote/inference/together/together.py
+++ b/llama_stack/providers/remote/inference/together/together.py
@@ -76,8 +76,11 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
 
     async def shutdown(self) -> None:
         if self._client:
-            await self._client.close()
+            # Together client has no close method, so just set to None
             self._client = None
+        if self._openai_client:
+            await self._openai_client.close()
+            self._openai_client = None
 
     async def completion(
         self,
@@ -359,7 +362,7 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
             top_p=top_p,
             user=user,
         )
-        if params.get("stream", True):
+        if params.get("stream", False):
             return self._stream_openai_chat_completion(params)
         return await self._get_openai_client().chat.completions.create(**params)  # type: ignore
 
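
As a supplement to the diff above, here is a minimal, self-contained sketch of
the shutdown pattern the patch adopts: drop the Together client reference (its
`AsyncTogether` class exposes no `close` method) and close the OpenAI client
only if one was lazily created. The `ExampleAdapter` class, its client
construction, and the base URL and API key are illustrative assumptions, not
the provider's actual code.

```python
import asyncio

from openai import AsyncOpenAI


class ExampleAdapter:
    """Illustrative stand-in for the Together adapter's client handling."""

    def __init__(self) -> None:
        self._client: object | None = object()  # stand-in for AsyncTogether
        self._openai_client: AsyncOpenAI | None = None

    def _get_openai_client(self) -> AsyncOpenAI:
        # Lazily create the OpenAI-compatible client, as the adapter does
        # (endpoint and key here are placeholders, not the real config)
        if self._openai_client is None:
            self._openai_client = AsyncOpenAI(
                base_url="https://api.together.xyz/v1", api_key="example-key"
            )
        return self._openai_client

    async def shutdown(self) -> None:
        if self._client:
            # The Together client has no close method; just drop the reference
            self._client = None
        if self._openai_client:
            # The OpenAI client does support close(), so shut it down cleanly
            await self._openai_client.close()
            self._openai_client = None


asyncio.run(ExampleAdapter().shutdown())  # exits without a stack trace
```

Running the snippet exits cleanly whether or not the OpenAI client was ever
created, which mirrors the test plan's check that shutdown no longer emits a
stack trace.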