diff --git a/docs/docs/references/python_sdk_reference/index.md b/docs/docs/references/python_sdk_reference/index.md
index e0b29363e..bce87e14a 100644
--- a/docs/docs/references/python_sdk_reference/index.md
+++ b/docs/docs/references/python_sdk_reference/index.md
@@ -217,7 +217,6 @@ from llama_stack_client.types import (
 Methods:
 
 - client.inference.chat_completion(\*\*params) -> InferenceChatCompletionResponse
-- client.inference.completion(\*\*params) -> InferenceCompletionResponse
 - client.inference.embeddings(\*\*params) -> EmbeddingsResponse
 
 ## VectorIo
diff --git a/docs/getting_started.ipynb b/docs/getting_started.ipynb
index 449bd2be1..56aef2b7d 100644
--- a/docs/getting_started.ipynb
+++ b/docs/getting_started.ipynb
@@ -824,16 +824,10 @@
     "\n",
     "\n",
     "user_input = \"Michael Jordan was born in 1963. He played basketball for the Chicago Bulls. He retired in 2003. Extract this information into JSON for me. \"\n",
-    "response = client.inference.completion(\n",
-    "    model_id=\"meta-llama/Llama-3.1-8B-Instruct\",\n",
-    "    content=user_input,\n",
-    "    stream=False,\n",
-    "    sampling_params={\n",
-    "        \"strategy\": {\n",
-    "            \"type\": \"greedy\",\n",
-    "        },\n",
-    "        \"max_tokens\": 50,\n",
-    "    },\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"meta-llama/Llama-3.1-8B-Instruct\",\n",
+    "    messages=[{\"role\": \"user\", \"content\": user_input}],\n",
+    "    max_tokens=50,\n",
     "    response_format={\n",
     "        \"type\": \"json_schema\",\n",
     "        \"json_schema\": Output.model_json_schema(),\n",
diff --git a/docs/notebooks/nvidia/beginner_e2e/Llama_Stack_NVIDIA_E2E_Flow.ipynb b/docs/notebooks/nvidia/beginner_e2e/Llama_Stack_NVIDIA_E2E_Flow.ipynb
index d8f29d999..601276526 100644
--- a/docs/notebooks/nvidia/beginner_e2e/Llama_Stack_NVIDIA_E2E_Flow.ipynb
+++ b/docs/notebooks/nvidia/beginner_e2e/Llama_Stack_NVIDIA_E2E_Flow.ipynb
@@ -706,20 +706,15 @@
     "    provider_id=\"nvidia\",\n",
     ")\n",
     "\n",
-    "response = client.inference.completion(\n",
-    "    content=\"Complete the sentence using one word: Roses are red, violets are \",\n",
+    "response = client.completions.create(\n",
+    "    prompt=\"Complete the sentence using one word: Roses are red, violets are \",\n",
     "    stream=False,\n",
-    "    model_id=CUSTOMIZED_MODEL_DIR,\n",
-    "    sampling_params={\n",
-    "        \"strategy\": {\n",
-    "            \"type\": \"top_p\",\n",
-    "            \"temperature\": 0.7,\n",
-    "            \"top_p\": 0.9\n",
-    "        },\n",
-    "        \"max_tokens\": 20,\n",
-    "    },\n",
+    "    model=CUSTOMIZED_MODEL_DIR,\n",
+    "    temperature=0.7,\n",
+    "    top_p=0.9,\n",
+    "    max_tokens=20,\n",
     ")\n",
-    "print(f\"Inference response: {response.content}\")"
+    "print(f\"Inference response: {response.choices[0].text}\")"
    ]
   },
   {
@@ -1233,20 +1228,15 @@
     "    provider_id=\"nvidia\",\n",
     ")\n",
     "\n",
-    "response = client.inference.completion(\n",
-    "    content=\"Complete the sentence using one word: Roses are red, violets are \",\n",
+    "response = client.completions.create(\n",
+    "    prompt=\"Complete the sentence using one word: Roses are red, violets are \",\n",
     "    stream=False,\n",
-    "    model_id=customized_chat_model_dir,\n",
-    "    sampling_params={\n",
-    "        \"strategy\": {\n",
-    "            \"type\": \"top_p\",\n",
-    "            \"temperature\": 0.7,\n",
-    "            \"top_p\": 0.9\n",
-    "        },\n",
-    "        \"max_tokens\": 20,\n",
-    "    },\n",
+    "    model=customized_chat_model_dir,\n",
+    "    temperature=0.7,\n",
+    "    top_p=0.9,\n",
+    "    max_tokens=20,\n",
     ")\n",
-    "print(f\"Inference response: {response.content}\")"
+    "print(f\"Inference response: {response.choices[0].text}\")"
    ]
   },
   {
diff --git a/llama_stack/providers/remote/inference/nvidia/NVIDIA.md b/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
index d9c18533a..4cb2dc394 100644
--- a/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
+++ b/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
@@ -39,25 +39,6 @@ client = LlamaStackAsLibraryClient("nvidia")
 client.initialize()
 ```
 
-### Create Completion
-
-The following example shows how to create a completion for an NVIDIA NIM.
-
-> [!NOTE]
-> The hosted NVIDIA Llama NIMs (for example ```meta-llama/Llama-3.1-8B-Instruct```) that have ```NVIDIA_BASE_URL="https://integrate.api.nvidia.com"``` do not support the ```completion``` method, while locally deployed NIMs do.
-
-```python
-response = client.inference.completion(
-    model_id="meta-llama/Llama-3.1-8B-Instruct",
-    content="Complete the sentence using one word: Roses are red, violets are :",
-    stream=False,
-    sampling_params={
-        "max_tokens": 50,
-    },
-)
-print(f"Response: {response.content}")
-```
-
 ### Create Chat Completion
 
 The following example shows how to create a chat completion for an NVIDIA NIM.
diff --git a/llama_stack/providers/remote/post_training/nvidia/README.md b/llama_stack/providers/remote/post_training/nvidia/README.md
index 6647316df..9b088a615 100644
--- a/llama_stack/providers/remote/post_training/nvidia/README.md
+++ b/llama_stack/providers/remote/post_training/nvidia/README.md
@@ -140,13 +140,11 @@ client.models.register(
 #### 2. Inference with the fine-tuned model
 
 ```python
-response = client.inference.completion(
-    content="Complete the sentence using one word: Roses are red, violets are ",
+response = client.completions.create(
+    prompt="Complete the sentence using one word: Roses are red, violets are ",
     stream=False,
-    model_id="test-example-model@v1",
-    sampling_params={
-        "max_tokens": 50,
-    },
+    model="test-example-model@v1",
+    max_tokens=50,
 )
-print(response.content)
+print(response.choices[0].text)
 ```
diff --git a/tests/integration/README.md b/tests/integration/README.md
index 467f97e02..b68526410 100644
--- a/tests/integration/README.md
+++ b/tests/integration/README.md
@@ -178,10 +178,10 @@ Note that when re-recording tests, you must use a Stack pointing to a server (i.
 
 ### Basic Test Pattern
 
 ```python
-def test_basic_completion(llama_stack_client, text_model_id):
-    response = llama_stack_client.inference.completion(
+def test_basic_chat_completion(llama_stack_client, text_model_id):
+    response = llama_stack_client.inference.chat_completion(
         model_id=text_model_id,
-        content=CompletionMessage(role="user", content="Hello"),
+        messages=[{"role": "user", "content": "Hello"}],
     )
     # Test structure, not AI output quality
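For reference, the single pattern this diff applies across every touched example is the migration from the removed `client.inference.completion(...)` API to the OpenAI-compatible `completions` / `chat.completions` surface, with the response shape changing from `response.content` to `response.choices[0].text`. Below is a minimal before/after sketch, not part of the patch itself; it assumes a locally running Llama Stack server at the default port and reuses the model IDs and parameters from the examples above.

```python
from llama_stack_client import LlamaStackClient

# Assumed local server; substitute your own base_url.
client = LlamaStackClient(base_url="http://localhost:8321")

# Old (removed in this diff):
# response = client.inference.completion(
#     model_id="meta-llama/Llama-3.1-8B-Instruct",
#     content="Complete the sentence using one word: Roses are red, violets are ",
#     sampling_params={
#         "strategy": {"type": "top_p", "temperature": 0.7, "top_p": 0.9},
#         "max_tokens": 20,
#     },
# )
# print(response.content)

# New: OpenAI-compatible text completion; sampling params become flat kwargs.
response = client.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    prompt="Complete the sentence using one word: Roses are red, violets are ",
    temperature=0.7,
    top_p=0.9,
    max_tokens=20,
)
print(response.choices[0].text)

# New: OpenAI-compatible chat completion for message-based prompts.
# The choices[0].message.content access follows the standard OpenAI response
# shape, not something shown in the diff itself.
chat = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": "Hello"}],
    max_tokens=50,
)
print(chat.choices[0].message.content)
```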