Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-10-04 04:04:14 +00:00
chore: unpublish /inference/chat-completion (#3609)
# What does this PR do?
BREAKING CHANGE: removes the /inference/chat-completion route and updates the relevant documentation.
## Test Plan
🤷
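
For anyone consuming the notebooks touched here, the change boils down to swapping the removed Llama Stack inference route for the OpenAI-compatible chat completions API. A minimal before/after sketch, assuming a local server at the default port and using a model id taken from the diffs below:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed local deployment

# Old call (route removed by this PR):
# response = client.inference.chat_completion(
#     model_id="meta-llama/Llama-3.2-3B-Instruct",
#     messages=[{"role": "user", "content": "Hello!"}],
#     sampling_params={"max_tokens": 20, "strategy": {"type": "top_p", "temperature": 0.7, "top_p": 0.9}},
# )
# print(response.completion_message.content)

# New call (OpenAI-compatible):
response = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "Hello!"}],
    max_tokens=20,
    temperature=0.7,
)
print(response.choices[0].message.content)
```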
This commit is contained in:
parent 62e302613f
commit cb33f45c11

23 changed files with 1448 additions and 2137 deletions
@@ -3615,7 +3615,7 @@
 "from rich.pretty import pprint\n",
 "\n",
 "response = client.models.register(\n",
-" model_id=\"meta-llama/Llama-3.2-3B-Instruct\",\n",
+" model=\"meta-llama/Llama-3.2-3B-Instruct\",\n",
 " provider_id=\"ollama\",\n",
 " provider_model_id=\"llama3.2:3b\",\n",
 " # base model id\n",
@@ -5762,7 +5762,7 @@
 "source": [
 "response = client.models.register(\n",
 " # the model id here needs to be the finetuned checkpoint identifier\n",
-" model_id=\"meta-llama/Llama-3.2-3B-Instruct-sft-0\",\n",
+" model=\"meta-llama/Llama-3.2-3B-Instruct-sft-0\",\n",
 " provider_id=\"ollama\",\n",
 " provider_model_id=\"llama_3_2_finetuned:latest\",\n",
 " # base model id\n",
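
Both hunks above make the same mechanical change to model registration: the keyword argument `model_id` becomes `model`. A short sketch of the updated call, reusing the Ollama values from the first hunk and assuming `client` is constructed as in the earlier sketch:

```python
# Register the base model with the Ollama provider (values from the hunk above).
response = client.models.register(
    model="meta-llama/Llama-3.2-3B-Instruct",  # was model_id= before this PR
    provider_id="ollama",
    provider_model_id="llama3.2:3b",           # base model id on the provider side
)
```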
@@ -5816,14 +5816,14 @@
 }
 ],
 "source": [
-"response = client.inference.chat_completion(\n",
-" model_id=\"meta-llama/Llama-3.2-3B-Instruct-sft-0\",\n",
+"response = client.chat.completions.create(\n",
+" model=\"meta-llama/Llama-3.2-3B-Instruct-sft-0\",\n",
 " messages=[\n",
 " {\"role\": \"user\", \"content\": \"What is the primary purpose of a W-2 form in relation to income tax?\"}\n",
 " ],\n",
 ")\n",
 "\n",
-"print(response.completion_message.content)"
+"print(response.choices[0].message.content)"
 ]
 },
 {
@@ -1003,7 +1003,7 @@
 "source": [
 "# register 405B as LLM Judge model\n",
 "client.models.register(\n",
-" model_id=\"meta-llama/Llama-3.1-405B-Instruct\",\n",
+" model=\"meta-llama/Llama-3.1-405B-Instruct\",\n",
 " provider_model_id=\"meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo\",\n",
 " provider_id=\"together\",\n",
 ")\n",
@@ -419,21 +419,15 @@
 "outputs": [],
 "source": [
 "# Test inference\n",
-"response = client.inference.chat_completion(\n",
+"response = client.chat.completions.create(\n",
 " messages=[\n",
 " {\"role\": \"user\", \"content\": sample_prompt}\n",
 " ],\n",
-" model_id=BASE_MODEL,\n",
-" sampling_params={\n",
-" \"max_tokens\": 20,\n",
-" \"strategy\": {\n",
-" \"type\": \"top_p\",\n",
-" \"temperature\": 0.7,\n",
-" \"top_p\": 0.9\n",
-" }\n",
-" }\n",
+" model=BASE_MODEL,\n",
+" max_tokens=20,\n",
+" temperature=0.7,\n",
")\n",
-"print(f\"Inference response: {response.completion_message.content}\")"
+"print(f\"Inference response: {response.choices[0].message.content}\")"
 ]
 },
 {
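
The inference hunks in this notebook all follow one pattern: the nested `sampling_params` dict disappears and its knobs become flat, OpenAI-style keyword arguments. A sketch of the mapping, assuming `BASE_MODEL` and `sample_prompt` are defined earlier in the notebook; note the updated cell keeps only `max_tokens` and `temperature`, and passing `top_p` through as its own argument is an assumption shown for completeness:

```python
# Before: sampling_params={"max_tokens": 20,
#                          "strategy": {"type": "top_p", "temperature": 0.7, "top_p": 0.9}}
# After: the same settings as top-level keyword arguments.
response = client.chat.completions.create(
    model=BASE_MODEL,
    messages=[{"role": "user", "content": sample_prompt}],
    max_tokens=20,
    temperature=0.7,
    top_p=0.9,  # not kept in the updated notebook cell; an assumption here
)
print(f"Inference response: {response.choices[0].message.content}")
```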
@@ -945,20 +939,14 @@
 "outputs": [],
 "source": [
 "# Test inference\n",
-"response = client.inference.chat_completion(\n",
+"response = client.chat.completions.create(\n",
 " messages=sample_messages,\n",
-" model_id=BASE_MODEL,\n",
-" sampling_params={\n",
-" \"max_tokens\": 20,\n",
-" \"strategy\": {\n",
-" \"type\": \"top_p\",\n",
-" \"temperature\": 0.7,\n",
-" \"top_p\": 0.9\n",
-" }\n",
-" }\n",
+" model=BASE_MODEL,\n",
+" max_tokens=20,\n",
+" temperature=0.7,\n",
")\n",
-"assert response.completion_message.content is not None\n",
-"print(f\"Inference response: {response.completion_message.content}\")"
+"assert response.choices[0].message.content is not None\n",
+"print(f\"Inference response: {response.choices[0].message.content}\")"
 ]
 },
 {
@@ -1438,15 +1426,13 @@
 "outputs": [],
 "source": [
 "# Check inference without guardrails\n",
-"response = client.inference.chat_completion(\n",
+"response = client.chat.completions.create(\n",
 " messages=[message],\n",
-" model_id=BASE_MODEL,\n",
-" sampling_params={\n",
-" \"max_tokens\": 150,\n",
-" }\n",
+" model=BASE_MODEL,\n",
+" max_tokens=150,\n",
")\n",
-"assert response.completion_message.content is not None\n",
-"print(f\"Inference response: {response.completion_message.content}\")"
+"assert response.choices[0].message.content is not None\n",
+"print(f\"Inference response: {response.choices[0].message.content}\")"
 ]
 },
 {
@@ -687,23 +687,17 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"completion = client.inference.chat_completion(\n",
-" model_id=CUSTOMIZED_MODEL,\n",
+"completion = client.chat.completions.create(\n",
+" model=CUSTOMIZED_MODEL,\n",
 " messages=test_sample[\"messages\"],\n",
 " tools=test_sample[\"tools\"],\n",
 " tool_choice=\"auto\",\n",
 " stream=False,\n",
-" sampling_params={\n",
-" \"max_tokens\": 512,\n",
-" \"strategy\": {\n",
-" \"type\": \"top_p\",\n",
-" \"temperature\": 0.1,\n",
-" \"top_p\": 0.7,\n",
-" }\n",
-" },\n",
+" max_tokens=512,\n",
+" temperature=0.1,\n",
")\n",
 "\n",
-"completion.completion_message.tool_calls"
+"completion.choices[0].message.tool_calls"
 ]
 },
 {
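
Tool calling survives the migration with `tools`, `tool_choice`, and `stream` passed through unchanged; only the sampling arguments flatten and the result moves under `choices[0].message`. A hedged sketch, assuming the response follows the OpenAI tool-call schema and that `CUSTOMIZED_MODEL` and `test_sample` are defined by the notebook:

```python
completion = client.chat.completions.create(
    model=CUSTOMIZED_MODEL,
    messages=test_sample["messages"],
    tools=test_sample["tools"],
    tool_choice="auto",
    stream=False,
    max_tokens=512,
    temperature=0.1,
)

# tool_calls is typically None when the model answers in plain text.
for tool_call in completion.choices[0].message.tool_calls or []:
    print(tool_call.function.name, tool_call.function.arguments)
```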
@@ -423,42 +423,30 @@
 " violation = self.check_guardrails(user_message.get(\"content\"))\n",
 " \n",
 " if violation is None:\n",
-" completion = client.inference.chat_completion(\n",
-" model_id=self.customized_model,\n",
+" completion = client.chat.completions.create(\n",
+" model=self.customized_model,\n",
 " messages=[user_message],\n",
 " tools=tools,\n",
 " tool_choice=\"auto\",\n",
 " stream=False,\n",
-" sampling_params={\n",
-" \"max_tokens\": 1024,\n",
-" \"strategy\": {\n",
-" \"type\": \"top_p\",\n",
-" \"top_p\": 0.7,\n",
-" \"temperature\": 0.2\n",
-" }\n",
-" }\n",
+" max_tokens=1024,\n",
+" temperature=0.2,\n",
 " )\n",
-" return completion.completion_message\n",
+" return completion.choices[0].message.content\n",
 " else:\n",
 " return f\"Not a safe input, the guardrails has resulted in a violation: {violation}. Tool-calling shall not happen\"\n",
 " \n",
 " elif self.guardrails == \"OFF\":\n",
-" completion = client.inference.chat_completion(\n",
-" model_id=self.customized_model,\n",
+" completion = client.chat.completions.create(\n",
+" model=self.customized_model,\n",
 " messages=[user_message],\n",
 " tools=tools,\n",
 " tool_choice=\"auto\",\n",
 " stream=False,\n",
-" sampling_params={\n",
-" \"max_tokens\": 1024,\n",
-" \"strategy\": {\n",
-" \"type\": \"top_p\",\n",
-" \"top_p\": 0.7,\n",
-" \"temperature\": 0.2\n",
-" }\n",
-" }\n",
+" max_tokens=1024,\n",
+" temperature=0.2,\n",
 " )\n",
-" return completion.completion_message"
+" return completion.choices[0].message.content"
 ]
 },
 {
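
One behavioral detail worth flagging in this last hunk: the guarded branches used to return `completion.completion_message` (a message object, tool calls included), whereas the updated code returns `completion.choices[0].message.content` (plain text). If downstream code still needs the tool calls, a variant like the following sketch would preserve them; the helper function and its name are purely illustrative, not part of the PR:

```python
def run_guarded_completion(client, model, user_message, tools):
    """Illustrative helper: run the migrated call and keep tool calls visible."""
    completion = client.chat.completions.create(
        model=model,
        messages=[user_message],
        tools=tools,
        tool_choice="auto",
        stream=False,
        max_tokens=1024,
        temperature=0.2,
    )
    message = completion.choices[0].message
    # Returning the whole message (rather than message.content, as the updated
    # notebook does) keeps message.tool_calls available to the caller.
    return message
```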