mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-10-07 12:47:37 +00:00
base: 35 RPS; safety, 75 RPS
# What does this PR do? ## Test Plan # What does this PR do? ## Test Plan
This commit is contained in:
parent
faf891b40c
commit
c3fa3e6333
5 changed files with 41 additions and 4 deletions
|
@ -525,9 +525,8 @@ class InferenceRouter(Inference):
|
|||
|
||||
response = await self._nonstream_openai_chat_completion(provider, params)
|
||||
|
||||
# Store the response with the ID that will be returned to the client
|
||||
if self.store:
|
||||
await self.store.store_chat_completion(response, messages)
|
||||
asyncio.create_task(self.store.store_chat_completion(response, messages))
|
||||
|
||||
if self.telemetry:
|
||||
metrics = self._construct_metrics(
|
||||
|
@ -855,4 +854,4 @@ class InferenceRouter(Inference):
|
|||
object="chat.completion",
|
||||
)
|
||||
logger.debug(f"InferenceRouter.completion_response: {final_response}")
|
||||
await self.store.store_chat_completion(final_response, messages)
|
||||
asyncio.create_task(self.store.store_chat_completion(final_response, messages))
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue