From a3c70a099e3fdfcbc85d3e3994643211991c3d40 Mon Sep 17 00:00:00 2001 From: Sunny Wan Date: Tue, 4 Mar 2025 18:12:26 -0500 Subject: [PATCH 1/2] fixed memory leak and added test --- litellm/llms/openai/openai.py | 1 + tests/load_tests/test_memory_usage.py | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/litellm/llms/openai/openai.py b/litellm/llms/openai/openai.py index 5465a24945..fdbe4d3475 100644 --- a/litellm/llms/openai/openai.py +++ b/litellm/llms/openai/openai.py @@ -460,6 +460,7 @@ class OpenAIChatCompletion(BaseLLM): else: headers = {} response = raw_response.parse() + raw_response.http_response.close() return headers, response except Exception as e: if raw_response is not None: diff --git a/tests/load_tests/test_memory_usage.py b/tests/load_tests/test_memory_usage.py index f273865a29..63c54f2052 100644 --- a/tests/load_tests/test_memory_usage.py +++ b/tests/load_tests/test_memory_usage.py @@ -83,6 +83,13 @@ async def make_text_completion_request(): api_base="https://exampleopenaiendpoint-production.up.railway.app/", ) +def make_streaming_completion_request(): + return litellm.acompletion( + model="openai/gpt-4o", + messages=[{"role": "user", "content": "Test message for memory usage"}], + stream=True, + ) + @pytest.mark.asyncio @pytest.mark.skip( @@ -102,6 +109,20 @@ async def test_atext_completion_memory(): await run_memory_test(make_text_completion_request, "atext_completion") +@pytest.mark.skip( + reason="This test is too slow to run on every commit. We can use this after nightly release" +) +def test_streaming_completion_memory(): + """Test memory usage for streaming litellm.acompletion""" + run_memory_test(make_streaming_completion_request,"completion") + +@pytest.mark.skip( + reason="This test is too slow to run on every commit. 
We can use this after nightly release" ) def test_streaming_acompletion_memory(): """Test memory usage for streaming litellm.acompletion""" run_memory_test(make_streaming_completion_request,"acompletion") + litellm_router = Router( model_list=[ { From 2ab4fc96e460dd3fc10fda27711d063b00345406 Mon Sep 17 00:00:00 2001 From: Sunny Wan Date: Wed, 5 Mar 2025 17:21:16 -0500 Subject: [PATCH 2/2] removed close Can't just close() the response because you still need to stream the results --- litellm/llms/openai/openai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/llms/openai/openai.py b/litellm/llms/openai/openai.py index fdbe4d3475..b5d24ad605 100644 --- a/litellm/llms/openai/openai.py +++ b/litellm/llms/openai/openai.py @@ -460,7 +460,7 @@ class OpenAIChatCompletion(BaseLLM): else: headers = {} response = raw_response.parse() - raw_response.http_response.close() + # raw_response.http_response.close() return headers, response except Exception as e: if raw_response is not None: