Mirror of https://github.com/meta-llama/llama-stack.git — synced 2025-10-04 04:04:14 +00:00
fix: remove inference.completion from docs (#3589)
# What does this PR do?

Now that /v1/inference/completion has been removed, no docs should refer to it. This cleans up the remaining references.

## Test Plan

CI.

Co-authored-by: Ashwin Bharambe <ashwin.bharambe@gmail.com>
This commit is contained in:
parent 498be131a1
commit e9eb004bf8

6 changed files with 26 additions and 64 deletions
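The updated docs follow one migration: `client.inference.completion(...)` is replaced by the OpenAI-compatible `client.completions.create(...)` / `client.chat.completions.create(...)`. A minimal sketch of that migration, assuming a Llama Stack server reachable at `http://localhost:8321` and a registered Llama 3.1 8B Instruct model (both assumptions, not part of this PR):

```python
from llama_stack_client import LlamaStackClient

# Assumed setup: a running Llama Stack server and a registered model id.
client = LlamaStackClient(base_url="http://localhost:8321")

# Previously: client.inference.completion(model_id=..., content=..., sampling_params={...})
# Now, OpenAI-compatible text completion:
completion = client.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    prompt="Complete the sentence using one word: Roses are red, violets are ",
    max_tokens=20,
)
print(completion.choices[0].text)

# Or OpenAI-compatible chat completion:
chat = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": "Hello"}],
    max_tokens=50,
)
print(chat.choices[0].message.content)
```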
@@ -217,7 +217,6 @@ from llama_stack_client.types import (
 Methods:
 
 - <code title="post /v1/inference/chat-completion">client.inference.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/inference.py">chat_completion</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_chat_completion_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_chat_completion_response.py">InferenceChatCompletionResponse</a></code>
-- <code title="post /v1/inference/completion">client.inference.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/inference.py">completion</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_completion_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_completion_response.py">InferenceCompletionResponse</a></code>
 - <code title="post /v1/inference/embeddings">client.inference.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/inference.py">embeddings</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_embeddings_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/embeddings_response.py">EmbeddingsResponse</a></code>
 
 ## VectorIo
@@ -824,16 +824,10 @@
 "\n",
 "\n",
 "user_input = \"Michael Jordan was born in 1963. He played basketball for the Chicago Bulls. He retired in 2003. Extract this information into JSON for me. \"\n",
-"response = client.inference.completion(\n",
-"    model_id=\"meta-llama/Llama-3.1-8B-Instruct\",\n",
-"    content=user_input,\n",
-"    stream=False,\n",
-"    sampling_params={\n",
-"        \"strategy\": {\n",
-"            \"type\": \"greedy\",\n",
-"        },\n",
-"        \"max_tokens\": 50,\n",
-"    },\n",
+"response = client.chat.completions.create(\n",
+"    model=\"meta-llama/Llama-3.1-8B-Instruct\",\n",
+"    messages=[{\"role\": \"user\", \"content\": user_input}],\n",
+"    max_tokens=50,\n",
 "    response_format={\n",
 "        \"type\": \"json_schema\",\n",
 "        \"json_schema\": Output.model_json_schema(),\n",
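Unescaped from the notebook JSON, the updated structured-output cell reads roughly as below. The `Output` schema and client setup shown here are illustrative assumptions; the notebook defines its own versions in earlier cells.

```python
from llama_stack_client import LlamaStackClient
from pydantic import BaseModel

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed setup


class Output(BaseModel):
    # Hypothetical fields; the notebook defines its own schema earlier.
    name: str
    year_born: str
    year_retired: str


user_input = (
    "Michael Jordan was born in 1963. He played basketball for the Chicago Bulls. "
    "He retired in 2003. Extract this information into JSON for me. "
)
response = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": user_input}],
    max_tokens=50,
    response_format={
        "type": "json_schema",
        "json_schema": Output.model_json_schema(),
    },
)
print(response.choices[0].message.content)
```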
@@ -706,20 +706,15 @@
 "    provider_id=\"nvidia\",\n",
 ")\n",
 "\n",
-"response = client.inference.completion(\n",
-"    content=\"Complete the sentence using one word: Roses are red, violets are \",\n",
+"response = client.completions.create(\n",
+"    prompt=\"Complete the sentence using one word: Roses are red, violets are \",\n",
 "    stream=False,\n",
-"    model_id=CUSTOMIZED_MODEL_DIR,\n",
-"    sampling_params={\n",
-"        \"strategy\": {\n",
-"            \"type\": \"top_p\",\n",
-"            \"temperature\": 0.7,\n",
-"            \"top_p\": 0.9\n",
-"        },\n",
-"        \"max_tokens\": 20,\n",
-"    },\n",
+"    model=CUSTOMIZED_MODEL_DIR,\n",
+"    temperature=0.7,\n",
+"    top_p=0.9,\n",
+"    max_tokens=20,\n",
 ")\n",
-"print(f\"Inference response: {response.content}\")"
+"print(f\"Inference response: {response.choices[0].text}\")"
 ]
 },
 {
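The same change in plain Python: the customized checkpoint is now queried through the OpenAI-compatible completions API, with sampling parameters passed as flat keyword arguments instead of a `sampling_params` dict. `client` and `CUSTOMIZED_MODEL_DIR` come from earlier notebook cells; the values below are illustrative placeholders only.

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed setup
CUSTOMIZED_MODEL_DIR = "test-example-model@v1"  # hypothetical placeholder for the registered checkpoint

response = client.completions.create(
    prompt="Complete the sentence using one word: Roses are red, violets are ",
    stream=False,
    model=CUSTOMIZED_MODEL_DIR,
    temperature=0.7,
    top_p=0.9,
    max_tokens=20,
)
print(f"Inference response: {response.choices[0].text}")
```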
@@ -1233,20 +1228,15 @@
 "    provider_id=\"nvidia\",\n",
 ")\n",
 "\n",
-"response = client.inference.completion(\n",
-"    content=\"Complete the sentence using one word: Roses are red, violets are \",\n",
+"response = client.completions.create(\n",
+"    prompt=\"Complete the sentence using one word: Roses are red, violets are \",\n",
 "    stream=False,\n",
-"    model_id=customized_chat_model_dir,\n",
-"    sampling_params={\n",
-"        \"strategy\": {\n",
-"            \"type\": \"top_p\",\n",
-"            \"temperature\": 0.7,\n",
-"            \"top_p\": 0.9\n",
-"        },\n",
-"        \"max_tokens\": 20,\n",
-"    },\n",
+"    model=customized_chat_model_dir,\n",
+"    temperature=0.7,\n",
+"    top_p=0.9,\n",
+"    max_tokens=20,\n",
 ")\n",
-"print(f\"Inference response: {response.content}\")"
+"print(f\"Inference response: {response.choices[0].text}\")"
 ]
 },
 {
@@ -39,25 +39,6 @@ client = LlamaStackAsLibraryClient("nvidia")
 client.initialize()
 ```
 
-### Create Completion
-
-The following example shows how to create a completion for an NVIDIA NIM.
-
-> [!NOTE]
-> The hosted NVIDIA Llama NIMs (for example ```meta-llama/Llama-3.1-8B-Instruct```) that have ```NVIDIA_BASE_URL="https://integrate.api.nvidia.com"``` do not support the ```completion``` method, while locally deployed NIMs do.
-
-```python
-response = client.inference.completion(
-    model_id="meta-llama/Llama-3.1-8B-Instruct",
-    content="Complete the sentence using one word: Roses are red, violets are :",
-    stream=False,
-    sampling_params={
-        "max_tokens": 50,
-    },
-)
-print(f"Response: {response.content}")
-```
-
 ### Create Chat Completion
 
 The following example shows how to create a chat completion for an NVIDIA NIM.
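The "Create Chat Completion" example itself sits outside this hunk and is unchanged by the PR. For context, a minimal sketch of such a call, assuming the `client` initialized just above (`LlamaStackAsLibraryClient("nvidia")`); the prompt content is illustrative:

```python
response = client.inference.chat_completion(
    model_id="meta-llama/Llama-3.1-8B-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Write a limerick about GPUs."},
    ],
)
# completion_message carries the assistant reply in the native API response.
print(response.completion_message.content)
```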
@@ -140,13 +140,11 @@ client.models.register(
 #### 2. Inference with the fine-tuned model
 
 ```python
-response = client.inference.completion(
-    content="Complete the sentence using one word: Roses are red, violets are ",
+response = client.completions.create(
+    prompt="Complete the sentence using one word: Roses are red, violets are ",
     stream=False,
-    model_id="test-example-model@v1",
-    sampling_params={
-        "max_tokens": 50,
-    },
+    model="test-example-model@v1",
+    max_tokens=50,
 )
-print(response.content)
+print(response.choices[0].text)
 ```
@@ -178,10 +178,10 @@ Note that when re-recording tests, you must use a Stack pointing to a server (i.
 
 ### Basic Test Pattern
 ```python
-def test_basic_completion(llama_stack_client, text_model_id):
-    response = llama_stack_client.inference.completion(
+def test_basic_chat_completion(llama_stack_client, text_model_id):
+    response = llama_stack_client.inference.chat_completion(
         model_id=text_model_id,
-        content=CompletionMessage(role="user", content="Hello"),
+        messages=[{"role": "user", "content": "Hello"}],
     )
 
     # Test structure, not AI output quality
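Filled out with the structural assertions the trailing comment points to: checks on response shape, not on the model's wording. The exact assertions in the repo may differ; this is a sketch assuming the native `chat_completion` response type.

```python
def test_basic_chat_completion(llama_stack_client, text_model_id):
    response = llama_stack_client.inference.chat_completion(
        model_id=text_model_id,
        messages=[{"role": "user", "content": "Hello"}],
    )

    # Test structure, not AI output quality
    assert response.completion_message is not None
    assert isinstance(response.completion_message.content, str)
    assert len(response.completion_message.content) > 0
```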