fix: Revert "feat: record token usage for inference API (#1300)" (#1476)

This reverts commit b8535417e0.

Test plan:
LLAMA_STACK_DISABLE_VERSION_CHECK=true llama stack run
~/.llama/distributions/together/together-run.yaml
python -m examples.agents.e2e_loop_with_client_tools localhost 8321
This commit is contained in:
Dinesh Yeduguru 2025-03-07 10:16:47 -08:00 committed by GitHub
parent df4fbae35c
commit 60e7f3d705
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 14 additions and 161 deletions

View file

@@ -285,7 +285,7 @@ class CompletionRequest(BaseModel):
@json_schema_type
class CompletionResponse(MetricResponseMixin):
class CompletionResponse(BaseModel):
"""Response from a completion request.
:param content: The generated completion text
@@ -299,7 +299,7 @@ class CompletionResponse(MetricResponseMixin):
@json_schema_type
class CompletionResponseStreamChunk(MetricResponseMixin):
class CompletionResponseStreamChunk(BaseModel):
"""A chunk of a streamed completion response.
:param delta: New content generated since last chunk. This can be one or more tokens.
@@ -368,7 +368,7 @@ class ChatCompletionRequest(BaseModel):
@json_schema_type
class ChatCompletionResponseStreamChunk(MetricResponseMixin):
class ChatCompletionResponseStreamChunk(MetricResponseMixin, BaseModel):
"""A chunk of a streamed chat completion response.
:param event: The event containing the new content
@@ -378,7 +378,7 @@ class ChatCompletionResponseStreamChunk(MetricResponseMixin):
@json_schema_type
class ChatCompletionResponse(MetricResponseMixin):
class ChatCompletionResponse(MetricResponseMixin, BaseModel):
"""Response from a chat completion request.
:param completion_message: The complete response message