diff --git a/docs/docs/references/python_sdk_reference/index.md b/docs/docs/references/python_sdk_reference/index.md
index e0b29363e..bce87e14a 100644
--- a/docs/docs/references/python_sdk_reference/index.md
+++ b/docs/docs/references/python_sdk_reference/index.md
@@ -217,7 +217,6 @@ from llama_stack_client.types import (
Methods:
- client.inference.chat_completion(\*\*params) -> InferenceChatCompletionResponse
-- client.inference.completion(\*\*params) -> InferenceCompletionResponse
- client.inference.embeddings(\*\*params) -> EmbeddingsResponse
## VectorIo
diff --git a/docs/getting_started.ipynb b/docs/getting_started.ipynb
index 449bd2be1..56aef2b7d 100644
--- a/docs/getting_started.ipynb
+++ b/docs/getting_started.ipynb
@@ -824,16 +824,11 @@
"\n",
"\n",
"user_input = \"Michael Jordan was born in 1963. He played basketball for the Chicago Bulls. He retired in 2003. Extract this information into JSON for me. \"\n",
- "response = client.inference.completion(\n",
- " model_id=\"meta-llama/Llama-3.1-8B-Instruct\",\n",
- " content=user_input,\n",
- " stream=False,\n",
- " sampling_params={\n",
- " \"strategy\": {\n",
- " \"type\": \"greedy\",\n",
- " },\n",
- " \"max_tokens\": 50,\n",
- " },\n",
+ "response = client.chat.completions.create(\n",
+ " model=\"meta-llama/Llama-3.1-8B-Instruct\",\n",
+ " messages=[{\"role\": \"user\", \"content\": user_input}],\n",
+ " max_tokens=50,\n",
" response_format={\n",
" \"type\": \"json_schema\",\n",
" \"json_schema\": Output.model_json_schema(),\n",
diff --git a/docs/notebooks/nvidia/beginner_e2e/Llama_Stack_NVIDIA_E2E_Flow.ipynb b/docs/notebooks/nvidia/beginner_e2e/Llama_Stack_NVIDIA_E2E_Flow.ipynb
index d8f29d999..601276526 100644
--- a/docs/notebooks/nvidia/beginner_e2e/Llama_Stack_NVIDIA_E2E_Flow.ipynb
+++ b/docs/notebooks/nvidia/beginner_e2e/Llama_Stack_NVIDIA_E2E_Flow.ipynb
@@ -706,20 +706,16 @@
" provider_id=\"nvidia\",\n",
")\n",
"\n",
- "response = client.inference.completion(\n",
- " content=\"Complete the sentence using one word: Roses are red, violets are \",\n",
+ "response = client.completions.create(\n",
+ " prompt=\"Complete the sentence using one word: Roses are red, violets are \",\n",
" stream=False,\n",
- " model_id=CUSTOMIZED_MODEL_DIR,\n",
- " sampling_params={\n",
- " \"strategy\": {\n",
- " \"type\": \"top_p\",\n",
- " \"temperature\": 0.7,\n",
- " \"top_p\": 0.9\n",
- " },\n",
- " \"max_tokens\": 20,\n",
- " },\n",
+ " model=CUSTOMIZED_MODEL_DIR,\n",
+ " temperature=0.7,\n",
+ " top_p=0.9,\n",
+ " max_tokens=20,\n",
")\n",
- "print(f\"Inference response: {response.content}\")"
+ "print(f\"Inference response: {response.choices[0].text}\")"
]
},
{
@@ -1233,20 +1228,16 @@
" provider_id=\"nvidia\",\n",
")\n",
"\n",
- "response = client.inference.completion(\n",
- " content=\"Complete the sentence using one word: Roses are red, violets are \",\n",
+ "response = client.completions.create(\n",
+ " prompt=\"Complete the sentence using one word: Roses are red, violets are \",\n",
" stream=False,\n",
- " model_id=customized_chat_model_dir,\n",
- " sampling_params={\n",
- " \"strategy\": {\n",
- " \"type\": \"top_p\",\n",
- " \"temperature\": 0.7,\n",
- " \"top_p\": 0.9\n",
- " },\n",
- " \"max_tokens\": 20,\n",
- " },\n",
+ " model=customized_chat_model_dir,\n",
+ " temperature=0.7,\n",
+ " top_p=0.9,\n",
+ " max_tokens=20,\n",
")\n",
- "print(f\"Inference response: {response.content}\")"
+ "print(f\"Inference response: {response.choices[0].text}\")"
]
},
{
diff --git a/llama_stack/providers/remote/inference/nvidia/NVIDIA.md b/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
index d9c18533a..4cb2dc394 100644
--- a/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
+++ b/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
@@ -39,25 +39,20 @@
client.initialize()
```
-### Create Completion
-
-The following example shows how to create a completion for an NVIDIA NIM.
-
-> [!NOTE]
-> The hosted NVIDIA Llama NIMs (for example ```meta-llama/Llama-3.1-8B-Instruct```) that have ```NVIDIA_BASE_URL="https://integrate.api.nvidia.com"``` do not support the ```completion``` method, while locally deployed NIMs do.
-
-```python
-response = client.inference.completion(
- model_id="meta-llama/Llama-3.1-8B-Instruct",
- content="Complete the sentence using one word: Roses are red, violets are :",
- stream=False,
- sampling_params={
- "max_tokens": 50,
- },
-)
-print(f"Response: {response.content}")
-```
-
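+### Create Completion
+
+Text completions are served through the OpenAI-compatible completions API. Below is a minimal sketch, assuming a locally deployed NIM; the hosted NVIDIA Llama NIMs (`NVIDIA_BASE_URL="https://integrate.api.nvidia.com"`) do not support text completions.
+
+```python
+response = client.completions.create(
+    model="meta-llama/Llama-3.1-8B-Instruct",
+    prompt="Complete the sentence using one word: Roses are red, violets are :",
+    stream=False,
+    max_tokens=50,
+)
+print(f"Response: {response.choices[0].text}")
+```
+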
### Create Chat Completion
The following example shows how to create a chat completion for an NVIDIA NIM.
diff --git a/llama_stack/providers/remote/post_training/nvidia/README.md b/llama_stack/providers/remote/post_training/nvidia/README.md
index 6647316df..9b088a615 100644
--- a/llama_stack/providers/remote/post_training/nvidia/README.md
+++ b/llama_stack/providers/remote/post_training/nvidia/README.md
@@ -140,13 +140,12 @@
#### 2. Inference with the fine-tuned model
```python
-response = client.inference.completion(
- content="Complete the sentence using one word: Roses are red, violets are ",
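+# Query the fine-tuned model through the OpenAI-compatible completions API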
+response = client.completions.create(
+ prompt="Complete the sentence using one word: Roses are red, violets are ",
stream=False,
- model_id="test-example-model@v1",
- sampling_params={
- "max_tokens": 50,
- },
+ model="test-example-model@v1",
+ max_tokens=50,
)
-print(response.content)
+print(response.choices[0].text)
```
diff --git a/tests/integration/README.md b/tests/integration/README.md
index 467f97e02..b68526410 100644
--- a/tests/integration/README.md
+++ b/tests/integration/README.md
@@ -178,10 +178,11 @@
### Basic Test Pattern
```python
-def test_basic_completion(llama_stack_client, text_model_id):
- response = llama_stack_client.inference.completion(
+def test_basic_chat_completion(llama_stack_client, text_model_id):
+ response = llama_stack_client.inference.chat_completion(
model_id=text_model_id,
- content=CompletionMessage(role="user", content="Hello"),
+ messages=[{"role": "user", "content": "Hello"}],
)
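+    # The response is an InferenceChatCompletionResponse; assert on its structure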
# Test structure, not AI output quality