From d5b136ac6660e457db044b98fef0e997f5f978b4 Mon Sep 17 00:00:00 2001
From: Francisco Arceo
Date: Tue, 7 Oct 2025 14:00:56 -0400
Subject: [PATCH 1/6] feat: Enabling Annotations in Responses (#3698)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# What does this PR do?
Implements annotations for the `file_search` tool. Also adds some logs and tests.

## How does this work?
1. **Citation Markers**: The model is instructed, via the formatted search results, to insert `<|file-id|>` markers during generation
2. **Post-Processing**: Markers are extracted with a regex to compute character positions and build `AnnotationFileCitation` objects (a standalone sketch of this step follows the example below)
3. **File Mapping**: Filename metadata is stored during vector store operations so citations display the proper filename

## Example
This is the updated `quickstart.py` script, which uses `extra_body` to register the embedding model.

```python
import io, requests
from openai import OpenAI

url="https://www.paulgraham.com/greatwork.html"
model = "gpt-4o-mini"
client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

vs = client.vector_stores.create(
    name="my_citations_db",
    extra_body={
        "embedding_model": "ollama/nomic-embed-text:latest",
        "embedding_dimension": 768,
    }
)
response = requests.get(url)
pseudo_file = io.BytesIO(str(response.content).encode('utf-8'))
file_id = client.files.create(file=(url, pseudo_file, "text/html"), purpose="assistants").id
client.vector_stores.files.create(vector_store_id=vs.id, file_id=file_id)

resp = client.responses.create(
    model=model,
    input="How do you do great work? Use our existing knowledge_search tool.",
    tools=[{"type": "file_search", "vector_store_ids": [vs.id]}],
    include=["file_search_call.results"],
)
print(resp)
```
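For reference, here is a simplified, standalone sketch of the post-processing step (step 2 above). The actual helper added by this PR is `_extract_citations_from_text` in `responses/utils.py`; the `FileCitation` dataclass, the sample text, and the file id below are illustrative stand-ins.

```python
import re
from dataclasses import dataclass


# Simplified stand-in for OpenAIResponseAnnotationFileCitation
@dataclass
class FileCitation:
    file_id: str
    filename: str
    index: int  # character position in the cleaned text (points at the sentence punctuation)


FILE_ID_MARKER = re.compile(r"<\|(file-[A-Za-z0-9_-]+)\|>")


def extract_citations(text: str, citation_files: dict[str, str]) -> tuple[list[FileCitation], str]:
    """Strip <|file-id|> markers from `text` and return (annotations, cleaned text)."""
    annotations, parts, total_len, last_end = [], [], 0, 0
    for m in FILE_ID_MARKER.finditer(text):
        prefix = text[last_end:m.start()]
        # drop the space before the marker so the index lands on the punctuation
        if prefix.endswith(" "):
            prefix = prefix[:-1]
        parts.append(prefix)
        total_len += len(prefix)
        fid = m.group(1)
        if fid in citation_files:
            annotations.append(FileCitation(file_id=fid, filename=citation_files[fid], index=total_len))
        last_end = m.end()
    parts.append(text[last_end:])
    return annotations, "".join(parts)


# Example with a hypothetical file id and filename:
raw = "Make something you yourself want <|file-abc123|>."
anns, clean = extract_citations(raw, {"file-abc123": "greatwork.html"})
print(clean)          # Make something you yourself want.
print(anns[0].index)  # 32 -> clean[32] == "."
```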
Example of the full response ```python INFO:httpx:HTTP Request: POST http://localhost:8321/v1/openai/v1/vector_stores "HTTP/1.1 200 OK" INFO:httpx:HTTP Request: POST http://localhost:8321/v1/openai/v1/files "HTTP/1.1 200 OK" INFO:httpx:HTTP Request: POST http://localhost:8321/v1/openai/v1/vector_stores/vs_0f6f7e35-f48b-4850-8604-8117d9a50e0a/files "HTTP/1.1 200 OK" INFO:httpx:HTTP Request: POST http://localhost:8321/v1/openai/v1/responses "HTTP/1.1 200 OK" Response(id='resp-28f5793d-3272-4de3-81f6-8cbf107d5bcd', created_at=1759797954.0, error=None, incomplete_details=None, instructions=None, metadata=None, model='gpt-4o-mini', object='response', output=[ResponseFileSearchToolCall(id='call_xWtvEQETN5GNiRLLiBIDKntg', queries=['how to do great work tips'], status='completed', type='file_search_call', results=[Result(attributes={}, file_id='file-a98ada68681c4fbeba2201e9c7213fc3', filename='file-a98ada68681c4fbeba2201e9c7213fc3', score=1.3722624322210302, text='\\\'re looking where few have looked before.

One sign that you\\\'re suited for some kind of work is when you like\\neven the parts that other people find tedious or frightening.

But fields aren\\\'t people; you don\\\'t owe them any loyalty. If in the\\ncourse of working on one thing you discover another that\\\'s more\\nexciting, don\\\'t be afraid to switch.

If you\\\'re making something for people, make sure it\\\'s something\\nthey actually want. The best way to do this is to make something\\nyou yourself want. Write the story you want to read; build the tool\\nyou want to use. Since your friends probably have similar interests,\\nthis will also get you your initial audience.

This should follow from the excitingness rule. Obviously the most\\nexciting story to write will be the one you want to read. The reason\\nI mention this case explicitly is that so many people get it wrong.\\nInstead of making what they want, they try to make what some\\nimaginary, more sophisticated audience wants. And once you go down\\nthat route, you\\\'re lost.\\n[6]

There are a lot of forces that will lead you astray when you\\\'re\\ntrying to figure out what to work on. Pretentiousness, fashion,\\nfear, money, politics, other people\\\'s wishes, eminent frauds. But\\nif you stick to what you find genuinely interesting, you\\\'ll be proof\\nagainst all of them. If you\\\'re interested, you\\\'re not astray.





\\nFollowing your interests may sound like a rather passive strategy,\\nbut in practice it usually means following them past all sorts of\\nobstacles. You usually have to risk rejection and failure. So it\\ndoes take a good deal of boldness.

But while you need boldness, you don\\\'t usually need much planning.\\nIn most cases the recipe for doing great work is simply: work hard\\non excitingly ambitious projects, and something good will come of\\nit. Instead of making a plan and then executing it, you just try\\nto preserve certain invariants.

The trouble with planning is that it only works for achievements\\nyou can describe in advance. You can win a gold medal or get rich\\nby deciding to as a child and then tenaciously pursuing that goal,\\nbut you can\\\'t discover natural selection that way.

I think for most people who want to do great work, the right strategy\\nis not to plan too much. At each stage do whatever seems most\\ninteresting and gives you the best options for the future. I call\\nthis approach "staying upwind." This is how most people who\\\'ve done\\ngreat work seem to have done it.





\\nEven when you\\\'ve found something exciting to work on, working on\\nit is not always straightforward. There will be times when some new\\nidea makes you leap out of bed in the morning and get straight to\\nwork. But there will also be plenty of times when things aren\\\'t\\nlike that.

You don\\\'t just put out your sail and get blown forward by inspiration.\\nThere are headwinds and currents and hidden shoals. So there\\\'s a\\ntechnique to working, just as there is to sailing.

For example, while you must work hard, it\\\'s possible to work too\\nhard, and if'), Result(attributes={}, file_id='file-a98ada68681c4fbeba2201e9c7213fc3', filename='file-a98ada68681c4fbeba2201e9c7213fc3', score=1.2532794607643494, text=' with anyone who\\\'s genuinely interested. If they\\\'re\\nreally good at their work, then they probably have a hobbyist\\\'s\\ninterest in it, and hobbyists always want to talk about their\\nhobbies.

It may take some effort to find the people who are really good,\\nthough. Doing great work has such prestige that in some places,\\nparticularly universities, there\\\'s a polite fiction that everyone\\nis engaged in it. And that is far from true. People within universities\\ncan\\\'t say so openly, but the quality of the work being done in\\ndifferent departments varies immensely. Some departments have people\\ndoing great work; others have in the past; others never have.





\\nSeek out the best colleagues. There are a lot of projects that can\\\'t\\nbe done alone, and even if you\\\'re working on one that can be, it\\\'s\\ngood to have other people to encourage you and to bounce ideas off.

Colleagues don\\\'t just affect your work, though; they also affect\\nyou. So work with people you want to become like, because you will.

Quality is more important than quantity in colleagues. It\\\'s better\\nto have one or two great ones than a building full of pretty good\\nones. In fact it\\\'s not merely better, but necessary, judging from\\nhistory: the degree to which great work happens in clusters suggests\\nthat one\\\'s colleagues often make the difference between doing great\\nwork and not.

How do you know when you have sufficiently good colleagues? In my\\nexperience, when you do, you know. Which means if you\\\'re unsure,\\nyou probably don\\\'t. But it may be possible to give a more concrete\\nanswer than that. Here\\\'s an attempt: sufficiently good colleagues\\noffer surprising insights. They can see and do things that you\\ncan\\\'t. So if you have a handful of colleagues good enough to keep\\nyou on your toes in this sense, you\\\'re probably over the threshold.

Most of us can benefit from collaborating with colleagues, but some\\nprojects require people on a larger scale, and starting one of those\\nis not for everyone. If you want to run a project like that, you\\\'ll\\nhave to become a manager, and managing well takes aptitude and\\ninterest like any other kind of work. If you don\\\'t have them, there\\nis no middle path: you must either force yourself to learn management\\nas a second language, or avoid such projects.\\n[27]





\\nHusband your morale. It\\\'s the basis of everything when you\\\'re working\\non ambitious projects. You have to nurture and protect it like a\\nliving organism.

Morale starts with your view of life. You\\\'re more likely to do great\\nwork if you\\\'re an optimist, and more likely to if you think of\\nyourself as lucky than if you think of yourself as a victim.

Indeed, work can to some extent protect you from your problems. If\\nyou choose work that\\\'s pure, its very difficulties will serve as a\\nrefuge from the difficulties of everyday life. If this is escapism,\\nit\\\'s a very productive form of it, and one that has been used by\\nsome of the greatest minds in history.

Morale compounds via work: high morale helps you do good work, which\\nincreases your morale and helps you do even'), Result(attributes={}, file_id='file-a98ada68681c4fbeba2201e9c7213fc3', filename='file-a98ada68681c4fbeba2201e9c7213fc3', score=1.1973485818164222, text=' your\\nability and interest can take you. And you can only answer that by\\ntrying.

Many more people could try to do great work than do. What holds\\nthem back is a combination of modesty and fear. It seems presumptuous\\nto try to be Newton or Shakespeare. It also seems hard; surely if\\nyou tried something like that, you\\\'d fail. Presumably the calculation\\nis rarely explicit. Few people consciously decide not to try to do\\ngreat work. But that\\\'s what\\\'s going on subconsciously; they shy\\naway from the question.

So I\\\'m going to pull a sneaky trick on you. Do you want to do great\\nwork, or not? Now you have to decide consciously. Sorry about that.\\nI wouldn\\\'t have done it to a general audience. But we already know\\nyou\\\'re interested.

Don\\\'t worry about being presumptuous. You don\\\'t have to tell anyone.\\nAnd if it\\\'s too hard and you fail, so what? Lots of people have\\nworse problems than that. In fact you\\\'ll be lucky if it\\\'s the worst\\nproblem you have.

Yes, you\\\'ll have to work hard. But again, lots of people have to\\nwork hard. And if you\\\'re working on something you find very\\ninteresting, which you necessarily will if you\\\'re on the right path,\\nthe work will probably feel less burdensome than a lot of your\\npeers\\\'.

The discoveries are out there, waiting to be made. Why not by you?









\\nNotes

[1]\\nI don\\\'t think you could give a precise definition of what\\ncounts as great work. Doing great work means doing something important\\nso well that you expand people\\\'s ideas of what\\\'s possible. But\\nthere\\\'s no threshold for importance. It\\\'s a matter of degree, and\\noften hard to judge at the time anyway. So I\\\'d rather people focused\\non developing their interests rather than worrying about whether\\nthey\\\'re important or not. Just try to do something amazing, and\\nleave it to future generations to say if you succeeded.

[2]\\nA lot of standup comedy is based on noticing anomalies in\\neveryday life. "Did you ever notice...?" New ideas come from doing\\nthis about nontrivial things. Which may help explain why people\\\'s\\nreaction to a new idea is often the first half of laughing: Ha!

[3]\\nThat second qualifier is critical. If you\\\'re excited about\\nsomething most authorities discount, but you can\\\'t give a more\\nprecise explanation than "they don\\\'t get it," then you\\\'re starting\\nto drift into the territory of cranks.

[4]\\nFinding something to work on is not simply a matter of finding\\na match between the current version of you and a list of known\\nproblems. You\\\'ll often have to coevolve with the problem. That\\\'s\\nwhy it can sometimes be so hard to figure out what to work on. The\\nsearch space is huge. It\\\'s the cartesian product of all possible\\nt'), Result(attributes={}, file_id='file-a98ada68681c4fbeba2201e9c7213fc3', filename='file-a98ada68681c4fbeba2201e9c7213fc3', score=1.1764591706535943, text='\\noptimistic, and even though one of the sources of their optimism\\nis ignorance, in this case ignorance can sometimes beat knowledge.

Try to finish what you start, though, even if it turns out to be\\nmore work than you expected. Finishing things is not just an exercise\\nin tidiness or self-discipline. In many projects a lot of the best\\nwork happens in what was meant to be the final stage.

Another permissible lie is to exaggerate the importance of what\\nyou\\\'re working on, at least in your own mind. If that helps you\\ndiscover something new, it may turn out not to have been a lie after\\nall.\\n[7]





\\nSince there are two senses of starting work — per day and per\\nproject — there are also two forms of procrastination. Per-project\\nprocrastination is far the more dangerous. You put off starting\\nthat ambitious project from year to year because the time isn\\\'t\\nquite right. When you\\\'re procrastinating in units of years, you can\\nget a lot not done.\\n[8]

One reason per-project procrastination is so dangerous is that it\\nusually camouflages itself as work. You\\\'re not just sitting around\\ndoing nothing; you\\\'re working industriously on something else. So\\nper-project procrastination doesn\\\'t set off the alarms that per-day\\nprocrastination does. You\\\'re too busy to notice it.

The way to beat it is to stop occasionally and ask yourself: Am I\\nworking on what I most want to work on? When you\\\'re young it\\\'s ok\\nif the answer is sometimes no, but this gets increasingly dangerous\\nas you get older.\\n[9]





\\nGreat work usually entails spending what would seem to most people\\nan unreasonable amount of time on a problem. You can\\\'t think of\\nthis time as a cost, or it will seem too high. You have to find the\\nwork sufficiently engaging as it\\\'s happening.

There may be some jobs where you have to work diligently for years\\nat things you hate before you get to the good part, but this is not\\nhow great work happens. Great work happens by focusing consistently\\non something you\\\'re genuinely interested in. When you pause to take\\nstock, you\\\'re surprised how far you\\\'ve come.

The reason we\\\'re surprised is that we underestimate the cumulative\\neffect of work. Writing a page a day doesn\\\'t sound like much, but\\nif you do it every day you\\\'ll write a book a year. That\\\'s the key:\\nconsistency. People who do great things don\\\'t get a lot done every\\nday. They get something done, rather than nothing.

If you do work that compounds, you\\\'ll get exponential growth. Most\\npeople who do this do it unconsciously, but it\\\'s worth stopping to\\nthink about. Learning, for example, is an instance of this phenomenon:\\nthe more you learn about something, the easier it is to learn more.\\nGrowing an audience is another: the more fans you have, the more\\nnew fans they\\\'ll bring you.

'), Result(attributes={}, file_id='file-a98ada68681c4fbeba2201e9c7213fc3', filename='file-a98ada68681c4fbeba2201e9c7213fc3', score=1.174069664815369, text='\\ninside.





Let\\\'s talk a little more about the complicated business of figuring\\nout what to work on. The main reason it\\\'s hard is that you can\\\'t\\ntell what most kinds of work are like except by doing them. Which\\nmeans the four steps overlap: you may have to work at something for\\nyears before you know how much you like it or how good you are at\\nit. And in the meantime you\\\'re not doing, and thus not learning\\nabout, most other kinds of work. So in the worst case you choose\\nlate based on very incomplete information.\\n[4]

The nature of ambition exacerbates this problem. Ambition comes in\\ntwo forms, one that precedes interest in the subject and one that\\ngrows out of it. Most people who do great work have a mix, and the\\nmore you have of the former, the harder it will be to decide what\\nto do.

The educational systems in most countries pretend it\\\'s easy. They\\nexpect you to commit to a field long before you could know what\\nit\\\'s really like. And as a result an ambitious person on an optimal\\ntrajectory will often read to the system as an instance of breakage.

It would be better if they at least admitted it — if they admitted\\nthat the system not only can\\\'t do much to help you figure out what\\nto work on, but is designed on the assumption that you\\\'ll somehow\\nmagically guess as a teenager. They don\\\'t tell you, but I will:\\nwhen it comes to figuring out what to work on, you\\\'re on your own.\\nSome people get lucky and do guess correctly, but the rest will\\nfind themselves scrambling diagonally across tracks laid down on\\nthe assumption that everyone does.

What should you do if you\\\'re young and ambitious but don\\\'t know\\nwhat to work on? What you should not do is drift along passively,\\nassuming the problem will solve itself. You need to take action.\\nBut there is no systematic procedure you can follow. When you read\\nbiographies of people who\\\'ve done great work, it\\\'s remarkable how\\nmuch luck is involved. They discover what to work on as a result\\nof a chance meeting, or by reading a book they happen to pick up.\\nSo you need to make yourself a big target for luck, and the way to\\ndo that is to be curious. Try lots of things, meet lots of people,\\nread lots of books, ask lots of questions.\\n[5]

When in doubt, optimize for interestingness. Fields change as you\\nlearn more about them. What mathematicians do, for example, is very\\ndifferent from what you do in high school math classes. So you need\\nto give different types of work a chance to show you what they\\\'re\\nlike. But a field should become increasingly interesting as you\\nlearn more about it. If it doesn\\\'t, it\\\'s probably not for you.

Don\\\'t worry if you find you\\\'re interested in different things than\\nother people. The stranger your tastes in interestingness, the\\nbetter. Strange tastes are often strong ones, and a strong taste\\nfor work means you\\\'ll be productive. And you\\\'re more likely to find\\nnew things if you'), Result(attributes={}, file_id='file-a98ada68681c4fbeba2201e9c7213fc3', filename='file-a98ada68681c4fbeba2201e9c7213fc3', score=1.158095578895721, text='. Don\\\'t copy the manner of\\nan eminent 50 year old professor if you\\\'re 18, for example, or the\\nidiom of a Renaissance poem hundreds of years later.

Some of the features of things you admire are flaws they succeeded\\ndespite. Indeed, the features that are easiest to imitate are the\\nmost likely to be the flaws.

This is particularly true for behavior. Some talented people are\\njerks, and this sometimes makes it seem to the inexperienced that\\nbeing a jerk is part of being talented. It isn\\\'t; being talented\\nis merely how they get away with it.

One of the most powerful kinds of copying is to copy something from\\none field into another. History is so full of chance discoveries\\nof this type that it\\\'s probably worth giving chance a hand by\\ndeliberately learning about other kinds of work. You can take ideas\\nfrom quite distant fields if you let them be metaphors.

Negative examples can be as inspiring as positive ones. In fact you\\ncan sometimes learn more from things done badly than from things\\ndone well; sometimes it only becomes clear what\\\'s needed when it\\\'s\\nmissing.





\\nIf a lot of the best people in your field are collected in one\\nplace, it\\\'s usually a good idea to visit for a while. It will\\nincrease your ambition, and also, by showing you that these people\\nare human, increase your self-confidence.\\n[26]

If you\\\'re earnest you\\\'ll probably get a warmer welcome than you\\nmight expect. Most people who are very good at something are happy\\nto talk about it with anyone who\\\'s genuinely interested. If they\\\'re\\nreally good at their work, then they probably have a hobbyist\\\'s\\ninterest in it, and hobbyists always want to talk about their\\nhobbies.

It may take some effort to find the people who are really good,\\nthough. Doing great work has such prestige that in some places,\\nparticularly universities, there\\\'s a polite fiction that everyone\\nis engaged in it. And that is far from true. People within universities\\ncan\\\'t say so openly, but the quality of the work being done in\\ndifferent departments varies immensely. Some departments have people\\ndoing great work; others have in the past; others never have.





\\nSeek out the best colleagues. There are a lot of projects that can\\\'t\\nbe done alone, and even if you\\\'re working on one that can be, it\\\'s\\ngood to have other people to encourage you and to bounce ideas off.

Colleagues don\\\'t just affect your work, though; they also affect\\nyou. So work with people you want to become like, because you will.

Quality is more important than quantity in colleagues. It\\\'s better\\nto have one or two great ones than a building full of pretty good\\nones. In fact it\\\'s not merely better, but necessary, judging from\\nhistory: the degree to which great work happens in clusters suggests\\nthat one\\\'s colleagues often make the difference between doing great\\nwork and not.

How do you know when you have sufficiently good colleagues? In my\\nexperience, when you do, you know. Which means if you\\\'re unsure,\\nyou probably don\\\'t. But it may be possible to give a more concrete\\nanswer than that. Here\\\'s an attempt: sufficiently good'), Result(attributes={}, file_id='file-a98ada68681c4fbeba2201e9c7213fc3', filename='file-a98ada68681c4fbeba2201e9c7213fc3', score=1.1566747762241967, text=',\\nbut in practice it usually means following them past all sorts of\\nobstacles. You usually have to risk rejection and failure. So it\\ndoes take a good deal of boldness.

But while you need boldness, you don\\\'t usually need much planning.\\nIn most cases the recipe for doing great work is simply: work hard\\non excitingly ambitious projects, and something good will come of\\nit. Instead of making a plan and then executing it, you just try\\nto preserve certain invariants.

The trouble with planning is that it only works for achievements\\nyou can describe in advance. You can win a gold medal or get rich\\nby deciding to as a child and then tenaciously pursuing that goal,\\nbut you can\\\'t discover natural selection that way.

I think for most people who want to do great work, the right strategy\\nis not to plan too much. At each stage do whatever seems most\\ninteresting and gives you the best options for the future. I call\\nthis approach "staying upwind." This is how most people who\\\'ve done\\ngreat work seem to have done it.





\\nEven when you\\\'ve found something exciting to work on, working on\\nit is not always straightforward. There will be times when some new\\nidea makes you leap out of bed in the morning and get straight to\\nwork. But there will also be plenty of times when things aren\\\'t\\nlike that.

You don\\\'t just put out your sail and get blown forward by inspiration.\\nThere are headwinds and currents and hidden shoals. So there\\\'s a\\ntechnique to working, just as there is to sailing.

For example, while you must work hard, it\\\'s possible to work too\\nhard, and if you do that you\\\'ll find you get diminishing returns:\\nfatigue will make you stupid, and eventually even damage your health.\\nThe point at which work yields diminishing returns depends on the\\ntype. Some of the hardest types you might only be able to do for\\nfour or five hours a day.

Ideally those hours will be contiguous. To the extent you can, try\\nto arrange your life so you have big blocks of time to work in.\\nYou\\\'ll shy away from hard tasks if you know you might be interrupted.

It will probably be harder to start working than to keep working.\\nYou\\\'ll often have to trick yourself to get over that initial\\nthreshold. Don\\\'t worry about this; it\\\'s the nature of work, not a\\nflaw in your character. Work has a sort of activation energy, both\\nper day and per project. And since this threshold is fake in the\\nsense that it\\\'s higher than the energy required to keep going, it\\\'s\\nok to tell yourself a lie of corresponding magnitude to get over\\nit.

It\\\'s usually a mistake to lie to yourself if you want to do great\\nwork, but this is one of the rare cases where it isn\\\'t. When I\\\'m\\nreluctant to start work in the morning, I often trick myself by\\nsaying "I\\\'ll just read over what I\\\'ve got so far." Five minutes\\nlater I\\\'ve found something that seems mistaken or incomplete, and\\nI\\\'m off.

Similar techniques work for starting new projects. It\\\'s ok to lie\\nto yourself about how much work a project will entail, for example.\\nLots of great things began with someone saying "How hard could it\\nbe?"

This is one case where the young have an advantage. They\\\'re more'), Result(attributes={}, file_id='file-a98ada68681c4fbeba2201e9c7213fc3', filename='file-a98ada68681c4fbeba2201e9c7213fc3', score=1.1349744395573516, text=' audience\\nin the traditional sense. Either way it doesn\\\'t need to be big.\\nThe value of an audience doesn\\\'t grow anything like linearly with\\nits size. Which is bad news if you\\\'re famous, but good news if\\nyou\\\'re just starting out, because it means a small but dedicated\\naudience can be enough to sustain you. If a handful of people\\ngenuinely love what you\\\'re doing, that\\\'s enough.

To the extent you can, avoid letting intermediaries come between\\nyou and your audience. In some types of work this is inevitable,\\nbut it\\\'s so liberating to escape it that you might be better off\\nswitching to an adjacent type if that will let you go direct.\\n[28]

The people you spend time with will also have a big effect on your\\nmorale. You\\\'ll find there are some who increase your energy and\\nothers who decrease it, and the effect someone has is not always\\nwhat you\\\'d expect. Seek out the people who increase your energy and\\navoid those who decrease it. Though of course if there\\\'s someone\\nyou need to take care of, that takes precedence.

Don\\\'t marry someone who doesn\\\'t understand that you need to work,\\nor sees your work as competition for your attention. If you\\\'re\\nambitious, you need to work; it\\\'s almost like a medical condition;\\nso someone who won\\\'t let you work either doesn\\\'t understand you,\\nor does and doesn\\\'t care.

Ultimately morale is physical. You think with your body, so it\\\'s\\nimportant to take care of it. That means exercising regularly,\\neating and sleeping well, and avoiding the more dangerous kinds of\\ndrugs. Running and walking are particularly good forms of exercise\\nbecause they\\\'re good for thinking.\\n[29]

People who do great work are not necessarily happier than everyone\\nelse, but they\\\'re happier than they\\\'d be if they didn\\\'t. In fact,\\nif you\\\'re smart and ambitious, it\\\'s dangerous not to be productive.\\nPeople who are smart and ambitious but don\\\'t achieve much tend to\\nbecome bitter.





\\nIt\\\'s ok to want to impress other people, but choose the right people.\\nThe opinion of people you respect is signal. Fame, which is the\\nopinion of a much larger group you might or might not respect, just\\nadds noise.

The prestige of a type of work is at best a trailing indicator and\\nsometimes completely mistaken. If you do anything well enough,\\nyou\\\'ll make it prestigious. So the question to ask about a type of\\nwork is not how much prestige it has, but how well it could be done.

Competition can be an effective motivator, but don\\\'t let it choose\\nthe problem for you; don\\\'t let yourself get drawn into chasing\\nsomething just because others are. In fact, don\\\'t let competitors\\nmake you do anything much more specific than work harder.

Curiosity is the best guide. Your curiosity never lies, and it knows\\nmore than you do about what\\\'s worth paying attention to.





\\nNotice how often that word has come up. If you asked an oracle the\\nsecret to doing great work and the oracle replied'), Result(attributes={}, file_id='file-a98ada68681c4fbeba2201e9c7213fc3', filename='file-a98ada68681c4fbeba2201e9c7213fc3', score=1.123214818076958, text='b\'How to Do Great Work\\n\\n


How to Do Great Work

July 2023

If you collected lists of techniques for doing great work in a lot\\nof different fields, what would the intersection look like? I decided\\nto find out'), Result(attributes={}, file_id='file-a98ada68681c4fbeba2201e9c7213fc3', filename='file-a98ada68681c4fbeba2201e9c7213fc3', score=1.1193194369249235, text=' dangerous kinds of\\ndrugs. Running and walking are particularly good forms of exercise\\nbecause they\\\'re good for thinking.\\n[29]

People who do great work are not necessarily happier than everyone\\nelse, but they\\\'re happier than they\\\'d be if they didn\\\'t. In fact,\\nif you\\\'re smart and ambitious, it\\\'s dangerous not to be productive.\\nPeople who are smart and ambitious but don\\\'t achieve much tend to\\nbecome bitter.





\\nIt\\\'s ok to want to impress other people, but choose the right people.\\nThe opinion of people you respect is signal. Fame, which is the\\nopinion of a much larger group you might or might not respect, just\\nadds noise.

The prestige of a type of work is at best a trailing indicator and\\nsometimes completely mistaken. If you do anything well enough,\\nyou\\\'ll make it prestigious. So the question to ask about a type of\\nwork is not how much prestige it has, but how well it could be done.

Competition can be an effective motivator, but don\\\'t let it choose\\nthe problem for you; don\\\'t let yourself get drawn into chasing\\nsomething just because others are. In fact, don\\\'t let competitors\\nmake you do anything much more specific than work harder.

Curiosity is the best guide. Your curiosity never lies, and it knows\\nmore than you do about what\\\'s worth paying attention to.





\\nNotice how often that word has come up. If you asked an oracle the\\nsecret to doing great work and the oracle replied with a single\\nword, my bet would be on "curiosity."

That doesn\\\'t translate directly to advice. It\\\'s not enough just to\\nbe curious, and you can\\\'t command curiosity anyway. But you can\\nnurture it and let it drive you.

Curiosity is the key to all four steps in doing great work: it will\\nchoose the field for you, get you to the frontier, cause you to\\nnotice the gaps in it, and drive you to explore them. The whole\\nprocess is a kind of dance with curiosity.





\\nBelieve it or not, I tried to make this essay as short as I could.\\nBut its length at least means it acts as a filter. If you made it\\nthis far, you must be interested in doing great work. And if so\\nyou\\\'re already further along than you might realize, because the\\nset of people willing to want to is small.

The factors in doing great work are factors in the literal,\\nmathematical sense, and they are: ability, interest, effort, and\\nluck. Luck by definition you can\\\'t do anything about, so we can\\nignore that. And we can assume effort, if you do in fact want to\\ndo great work. So the problem boils down to ability and interest.\\nCan you find a kind of work where your ability and interest will\\ncombine to yield an explosion of new ideas?

Here there are grounds for optimism. There are so many different\\nways to do great work, and even more that are still undiscovered.\\nOut of all those different types of work, the one you\\\'re most suited\\nfor is probably a pretty close match. Probably a comically close\\nmatch. It\\\'s just a question of finding it, and how far into it')]), ResponseOutputMessage(id='msg_3591ea71-8b35-4efd-a5ad-c1c250801971', content=[ResponseOutputText(annotations=[AnnotationFileCitation(file_id='file-a98ada68681c4fbeba2201e9c7213fc3', filename='https://www.paulgraham.com/greatwork.html', index=361, type='file_citation'), AnnotationFileCitation(file_id='file-a98ada68681c4fbeba2201e9c7213fc3', filename='https://www.paulgraham.com/greatwork.html', index=676, type='file_citation'), AnnotationFileCitation(file_id='file-a98ada68681c4fbeba2201e9c7213fc3', filename='https://www.paulgraham.com/greatwork.html', index=948, type='file_citation'), AnnotationFileCitation(file_id='file-a98ada68681c4fbeba2201e9c7213fc3', filename='https://www.paulgraham.com/greatwork.html', index=1259, type='file_citation'), AnnotationFileCitation(file_id='file-a98ada68681c4fbeba2201e9c7213fc3', filename='https://www.paulgraham.com/greatwork.html', index=1520, type='file_citation'), AnnotationFileCitation(file_id='file-a98ada68681c4fbeba2201e9c7213fc3', filename='https://www.paulgraham.com/greatwork.html', index=1747, type='file_citation')], text='To do great work, consider the following principles:\n\n1. **Follow Your Interests**: Engage in work that genuinely excites you. If you find an area intriguing, pursue it without being overly concerned about external pressures or norms. You should create things that you would want for yourself, as this often aligns with what others in your circle might want too.\n\n2. **Work Hard on Ambitious Projects**: Ambition is vital, but it should be tempered by genuine interest. Instead of detailed planning for the future, focus on exciting projects that keep your options open. This approach, known as "staying upwind," allows for adaptability and can lead to unforeseen achievements.\n\n3. **Choose Quality Colleagues**: Collaborating with talented colleagues can significantly affect your own work. Seek out individuals who offer surprising insights and whom you admire. The presence of good colleagues can elevate the quality of your work and inspire you.\n\n4. **Maintain High Morale**: Your attitude towards work and life affects your performance. Cultivating optimism and viewing yourself as lucky rather than victimized can boost your productivity. It’s essential to care for your physical health as well since it directly impacts your mental faculties and morale.\n\n5. **Be Consistent**: Great work often comes from cumulative effort. Daily progress, even in small amounts, can result in substantial achievements over time. Emphasize consistency and make the work engaging, as this reduces the perceived burden of hard labor.\n\n6. **Embrace Curiosity**: Curiosity is a driving force that can guide you in selecting fields of interest, pushing you to explore uncharted territories. 
Allow it to shape your work and continually seek knowledge and insights.\n\nBy focusing on these aspects, you can create an environment conducive to great work and personal fulfillment.', type='output_text', logprobs=None)], role='assistant', status='completed', type='message')], parallel_tool_calls=False, temperature=None, tool_choice=None, tools=None, top_p=None, background=None, conversation=None, max_output_tokens=None, max_tool_calls=None, previous_response_id=None, prompt=None, prompt_cache_key=None, reasoning=None, safety_identifier=None, service_tier=None, status='completed', text=ResponseTextConfig(format=ResponseFormatText(type='text'), verbosity=None), top_logprobs=None, truncation=None, usage=None, user=None) In [34]: resp.output[1].content[0].text Out[34]: 'To do great work, consider the following principles:\n\n1. **Follow Your Interests**: Engage in work that genuinely excites you. If you find an area intriguing, pursue it without being overly concerned about external pressures or norms. You should create things that you would want for yourself, as this often aligns with what others in your circle might want too.\n\n2. **Work Hard on Ambitious Projects**: Ambition is vital, but it should be tempered by genuine interest. Instead of detailed planning for the future, focus on exciting projects that keep your options open. This approach, known as "staying upwind," allows for adaptability and can lead to unforeseen achievements.\n\n3. **Choose Quality Colleagues**: Collaborating with talented colleagues can significantly affect your own work. Seek out individuals who offer surprising insights and whom you admire. The presence of good colleagues can elevate the quality of your work and inspire you.\n\n4. **Maintain High Morale**: Your attitude towards work and life affects your performance. Cultivating optimism and viewing yourself as lucky rather than victimized can boost your productivity. It’s essential to care for your physical health as well since it directly impacts your mental faculties and morale.\n\n5. **Be Consistent**: Great work often comes from cumulative effort. Daily progress, even in small amounts, can result in substantial achievements over time. Emphasize consistency and make the work engaging, as this reduces the perceived burden of hard labor.\n\n6. **Embrace Curiosity**: Curiosity is a driving force that can guide you in selecting fields of interest, pushing you to explore uncharted territories. Allow it to shape your work and continually seek knowledge and insights.\n\nBy focusing on these aspects, you can create an environment conducive to great work and personal fulfillment.' 
``` The relevant output looks like this: ```python >resp.output[1].content[0].annotations [AnnotationFileCitation(file_id='file-a98ada68681c4fbeba2201e9c7213fc3', filename='https://www.paulgraham.com/greatwork.html', index=361, type='file_citation'), AnnotationFileCitation(file_id='file-a98ada68681c4fbeba2201e9c7213fc3', filename='https://www.paulgraham.com/greatwork.html', index=676, type='file_citation'), AnnotationFileCitation(file_id='file-a98ada68681c4fbeba2201e9c7213fc3', filename='https://www.paulgraham.com/greatwork.html', index=948, type='file_citation'), AnnotationFileCitation(file_id='file-a98ada68681c4fbeba2201e9c7213fc3', filename='https://www.paulgraham.com/greatwork.html', index=1259, type='file_citation'), AnnotationFileCitation(file_id='file-a98ada68681c4fbeba2201e9c7213fc3', filename='https://www.paulgraham.com/greatwork.html', index=1520, type='file_citation'), AnnotationFileCitation(file_id='file-a98ada68681c4fbeba2201e9c7213fc3', filename='https://www.paulgraham.com/greatwork.html', index=1747, type='file_citation')]``` And ```python In [144]: print(resp.output[1].content[0].text) To do great work, consider the following principles: 1. **Follow Your Interests**: Engage in work that genuinely excites you. If you find an area intriguing, pursue it without being overly concerned about external pressures or norms. You should create things that you would want for yourself, as this often aligns with what others in your circle might want too. 2. **Work Hard on Ambitious Projects**: Ambition is vital, but it should be tempered by genuine interest. Instead of detailed planning for the future, focus on exciting projects that keep your options open. This approach, known as "staying upwind," allows for adaptability and can lead to unforeseen achievements. 3. **Choose Quality Colleagues**: Collaborating with talented colleagues can significantly affect your own work. Seek out individuals who offer surprising insights and whom you admire. The presence of good colleagues can elevate the quality of your work and inspire you. 4. **Maintain High Morale**: Your attitude towards work and life affects your performance. Cultivating optimism and viewing yourself as lucky rather than victimized can boost your productivity. It’s essential to care for your physical health as well since it directly impacts your mental faculties and morale. 5. **Be Consistent**: Great work often comes from cumulative effort. Daily progress, even in small amounts, can result in substantial achievements over time. Emphasize consistency and make the work engaging, as this reduces the perceived burden of hard labor. 6. **Embrace Curiosity**: Curiosity is a driving force that can guide you in selecting fields of interest, pushing you to explore uncharted territories. Allow it to shape your work and continually seek knowledge and insights. By focusing on these aspects, you can create an environment conducive to great work and personal fulfillment. ``` And the code below outputs only periods highlighting that the position/index behaves as expected—i.e., the annotation happens at the end of the sentence. ```python print([resp.output[1].content[0].text[j.index] for j in resp.output[1].content[0].annotations]) Out[41]: ['.', '.', '.', '.', '.', '.'] ``` ## Test Plan Unit tests added. 
--------- Signed-off-by: Francisco Javier Arceo --- .../meta_reference/responses/streaming.py | 9 ++- .../meta_reference/responses/tool_executor.py | 44 ++++++++++++-- .../agents/meta_reference/responses/types.py | 1 + .../agents/meta_reference/responses/utils.py | 57 ++++++++++++++++++- .../inline/tool_runtime/rag/memory.py | 5 +- .../utils/memory/openai_vector_store_mixin.py | 7 ++- .../test_response_conversion_utils.py | 25 ++++++++ 7 files changed, 136 insertions(+), 12 deletions(-) diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py index 0bb524f5c..8a662e6db 100644 --- a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +++ b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py @@ -97,6 +97,8 @@ class StreamingResponseOrchestrator: self.mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP] = {} # Track final messages after all tool executions self.final_messages: list[OpenAIMessageParam] = [] + # mapping for annotations + self.citation_files: dict[str, str] = {} async def create_response(self) -> AsyncIterator[OpenAIResponseObjectStream]: # Initialize output messages @@ -126,6 +128,7 @@ class StreamingResponseOrchestrator: # Text is the default response format for chat completion so don't need to pass it # (some providers don't support non-empty response_format when tools are present) response_format = None if self.ctx.response_format.type == "text" else self.ctx.response_format + logger.debug(f"calling openai_chat_completion with tools: {self.ctx.chat_tools}") completion_result = await self.inference_api.openai_chat_completion( model=self.ctx.model, messages=messages, @@ -160,7 +163,7 @@ class StreamingResponseOrchestrator: # Handle choices with no tool calls for choice in current_response.choices: if not (choice.message.tool_calls and self.ctx.response_tools): - output_messages.append(await convert_chat_choice_to_response_message(choice)) + output_messages.append(await convert_chat_choice_to_response_message(choice, self.citation_files)) # Execute tool calls and coordinate results async for stream_event in self._coordinate_tool_execution( @@ -211,6 +214,8 @@ class StreamingResponseOrchestrator: for choice in current_response.choices: next_turn_messages.append(choice.message) + logger.debug(f"Choice message content: {choice.message.content}") + logger.debug(f"Choice message tool_calls: {choice.message.tool_calls}") if choice.message.tool_calls and self.ctx.response_tools: for tool_call in choice.message.tool_calls: @@ -470,6 +475,8 @@ class StreamingResponseOrchestrator: tool_call_log = result.final_output_message tool_response_message = result.final_input_message self.sequence_number = result.sequence_number + if result.citation_files: + self.citation_files.update(result.citation_files) if tool_call_log: output_messages.append(tool_call_log) diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py b/llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py index b028c018b..b33b47454 100644 --- a/llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py +++ b/llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py @@ -94,7 +94,10 @@ class ToolExecutor: # Yield the final result yield ToolExecutionResult( - sequence_number=sequence_number, final_output_message=output_message, final_input_message=input_message + 
sequence_number=sequence_number, + final_output_message=output_message, + final_input_message=input_message, + citation_files=result.metadata.get("citation_files") if result and result.metadata else None, ) async def _execute_knowledge_search_via_vector_store( @@ -129,8 +132,6 @@ class ToolExecutor: for results in all_results: search_results.extend(results) - # Convert search results to tool result format matching memory.py - # Format the results as interleaved content similar to memory.py content_items = [] content_items.append( TextContentItem( @@ -138,27 +139,58 @@ class ToolExecutor: ) ) + unique_files = set() for i, result_item in enumerate(search_results): chunk_text = result_item.content[0].text if result_item.content else "" - metadata_text = f"document_id: {result_item.file_id}, score: {result_item.score}" + # Get file_id from attributes if result_item.file_id is empty + file_id = result_item.file_id or ( + result_item.attributes.get("document_id") if result_item.attributes else None + ) + metadata_text = f"document_id: {file_id}, score: {result_item.score}" if result_item.attributes: metadata_text += f", attributes: {result_item.attributes}" - text_content = f"[{i + 1}] {metadata_text}\n{chunk_text}\n" + + text_content = f"[{i + 1}] {metadata_text} (cite as <|{file_id}|>)\n{chunk_text}\n" content_items.append(TextContentItem(text=text_content)) + unique_files.add(file_id) content_items.append(TextContentItem(text="END of knowledge_search tool results.\n")) + + citation_instruction = "" + if unique_files: + citation_instruction = ( + " Cite sources immediately at the end of sentences before punctuation, using `<|file-id|>` format (e.g., 'This is a fact <|file-Cn3MSNn72ENTiiq11Qda4A|>.'). " + "Do not add extra punctuation. Use only the file IDs provided (do not invent new ones)." + ) + content_items.append( TextContentItem( - text=f'The above results were retrieved to help answer the user\'s query: "{query}". Use them as supporting information only in answering this query.\n', + text=f'The above results were retrieved to help answer the user\'s query: "{query}". 
Use them as supporting information only in answering this query.{citation_instruction}\n', ) ) + # handling missing attributes for old versions + citation_files = {} + for result in search_results: + file_id = result.file_id + if not file_id and result.attributes: + file_id = result.attributes.get("document_id") + + filename = result.filename + if not filename and result.attributes: + filename = result.attributes.get("filename") + if not filename: + filename = "unknown" + + citation_files[file_id] = filename + return ToolInvocationResult( content=content_items, metadata={ "document_ids": [r.file_id for r in search_results], "chunks": [r.content[0].text if r.content else "" for r in search_results], "scores": [r.score for r in search_results], + "citation_files": citation_files, }, ) diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/types.py b/llama_stack/providers/inline/agents/meta_reference/responses/types.py index d3b5a16bd..fd5f44242 100644 --- a/llama_stack/providers/inline/agents/meta_reference/responses/types.py +++ b/llama_stack/providers/inline/agents/meta_reference/responses/types.py @@ -27,6 +27,7 @@ class ToolExecutionResult(BaseModel): sequence_number: int final_output_message: OpenAIResponseOutput | None = None final_input_message: OpenAIMessageParam | None = None + citation_files: dict[str, str] | None = None @dataclass diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/utils.py b/llama_stack/providers/inline/agents/meta_reference/responses/utils.py index 310a88298..5b013b9c4 100644 --- a/llama_stack/providers/inline/agents/meta_reference/responses/utils.py +++ b/llama_stack/providers/inline/agents/meta_reference/responses/utils.py @@ -4,9 +4,11 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+import re import uuid from llama_stack.apis.agents.openai_responses import ( + OpenAIResponseAnnotationFileCitation, OpenAIResponseInput, OpenAIResponseInputFunctionToolCallOutput, OpenAIResponseInputMessageContent, @@ -45,7 +47,9 @@ from llama_stack.apis.inference import ( ) -async def convert_chat_choice_to_response_message(choice: OpenAIChoice) -> OpenAIResponseMessage: +async def convert_chat_choice_to_response_message( + choice: OpenAIChoice, citation_files: dict[str, str] | None = None +) -> OpenAIResponseMessage: """Convert an OpenAI Chat Completion choice into an OpenAI Response output message.""" output_content = "" if isinstance(choice.message.content, str): @@ -57,9 +61,11 @@ async def convert_chat_choice_to_response_message(choice: OpenAIChoice) -> OpenA f"Llama Stack OpenAI Responses does not yet support output content type: {type(choice.message.content)}" ) + annotations, clean_text = _extract_citations_from_text(output_content, citation_files or {}) + return OpenAIResponseMessage( id=f"msg_{uuid.uuid4()}", - content=[OpenAIResponseOutputMessageContentOutputText(text=output_content)], + content=[OpenAIResponseOutputMessageContentOutputText(text=clean_text, annotations=annotations)], status="completed", role="assistant", ) @@ -200,6 +206,53 @@ async def get_message_type_by_role(role: str): return role_to_type.get(role) +def _extract_citations_from_text( + text: str, citation_files: dict[str, str] +) -> tuple[list[OpenAIResponseAnnotationFileCitation], str]: + """Extract citation markers from text and create annotations + + Args: + text: The text containing citation markers like [file-Cn3MSNn72ENTiiq11Qda4A] + citation_files: Dictionary mapping file_id to filename + + Returns: + Tuple of (annotations_list, clean_text_without_markers) + """ + file_id_regex = re.compile(r"<\|(?P<file_id>file-[A-Za-z0-9_-]+)\|>") + + annotations = [] + parts = [] + total_len = 0 + last_end = 0 + + for m in file_id_regex.finditer(text): + # segment before the marker + prefix = text[last_end : m.start()] + + # drop one space if it exists (since marker is at sentence end) + if prefix.endswith(" "): + prefix = prefix[:-1] + + parts.append(prefix) + total_len += len(prefix) + + fid = m.group(1) + if fid in citation_files: + annotations.append( + OpenAIResponseAnnotationFileCitation( + file_id=fid, + filename=citation_files[fid], + index=total_len, # index points to punctuation + ) + ) + + last_end = m.end() + + parts.append(text[last_end:]) + cleaned_text = "".join(parts) + return annotations, cleaned_text + + def is_function_tool_call( tool_call: OpenAIChatCompletionToolCall, tools: list[OpenAIResponseInputTool], diff --git a/llama_stack/providers/inline/tool_runtime/rag/memory.py b/llama_stack/providers/inline/tool_runtime/rag/memory.py index c8499a9b8..aac86a056 100644 --- a/llama_stack/providers/inline/tool_runtime/rag/memory.py +++ b/llama_stack/providers/inline/tool_runtime/rag/memory.py @@ -331,5 +331,8 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti return ToolInvocationResult( content=result.content or [], - metadata=result.metadata, + metadata={ + **(result.metadata or {}), + "citation_files": getattr(result, "citation_files", None), + }, ) diff --git a/llama_stack/providers/utils/memory/openai_vector_store_mixin.py b/llama_stack/providers/utils/memory/openai_vector_store_mixin.py index 0d0aa25a4..97079c3b3 100644 --- a/llama_stack/providers/utils/memory/openai_vector_store_mixin.py +++ b/llama_stack/providers/utils/memory/openai_vector_store_mixin.py @@ -587,7
+587,7 @@ class OpenAIVectorStoreMixin(ABC): content = self._chunk_to_vector_store_content(chunk) response_data_item = VectorStoreSearchResponse( - file_id=chunk.metadata.get("file_id", ""), + file_id=chunk.metadata.get("document_id", ""), filename=chunk.metadata.get("filename", ""), score=score, attributes=chunk.metadata, @@ -746,12 +746,15 @@ class OpenAIVectorStoreMixin(ABC): content = content_from_data_and_mime_type(content_response.body, mime_type) + chunk_attributes = attributes.copy() + chunk_attributes["filename"] = file_response.filename + chunks = make_overlapped_chunks( file_id, content, max_chunk_size_tokens, chunk_overlap_tokens, - attributes, + chunk_attributes, ) if not chunks: vector_store_file_object.status = "failed" diff --git a/tests/unit/providers/agents/meta_reference/test_response_conversion_utils.py b/tests/unit/providers/agents/meta_reference/test_response_conversion_utils.py index 187540f82..2698b88c8 100644 --- a/tests/unit/providers/agents/meta_reference/test_response_conversion_utils.py +++ b/tests/unit/providers/agents/meta_reference/test_response_conversion_utils.py @@ -8,6 +8,7 @@ import pytest from llama_stack.apis.agents.openai_responses import ( + OpenAIResponseAnnotationFileCitation, OpenAIResponseInputFunctionToolCallOutput, OpenAIResponseInputMessageContentImage, OpenAIResponseInputMessageContentText, @@ -35,6 +36,7 @@ from llama_stack.apis.inference import ( OpenAIUserMessageParam, ) from llama_stack.providers.inline.agents.meta_reference.responses.utils import ( + _extract_citations_from_text, convert_chat_choice_to_response_message, convert_response_content_to_chat_content, convert_response_input_to_chat_messages, @@ -340,3 +342,26 @@ class TestIsFunctionToolCall: result = is_function_tool_call(tool_call, tools) assert result is False + + +class TestExtractCitationsFromText: + def test_extract_citations_and_annotations(self): + text = "Start [not-a-file]. New source <|file-abc123|>. " + text += "Other source <|file-def456|>? Repeat source <|file-abc123|>! No citation." + file_mapping = {"file-abc123": "doc1.pdf", "file-def456": "doc2.txt"} + + annotations, cleaned_text = _extract_citations_from_text(text, file_mapping) + + expected_annotations = [ + OpenAIResponseAnnotationFileCitation(file_id="file-abc123", filename="doc1.pdf", index=30), + OpenAIResponseAnnotationFileCitation(file_id="file-def456", filename="doc2.txt", index=44), + OpenAIResponseAnnotationFileCitation(file_id="file-abc123", filename="doc1.pdf", index=59), + ] + expected_clean_text = "Start [not-a-file]. New source. Other source? Repeat source! No citation." + + assert cleaned_text == expected_clean_text + assert annotations == expected_annotations + # OpenAI cites at the end of the sentence + assert cleaned_text[expected_annotations[0].index] == "." + assert cleaned_text[expected_annotations[1].index] == "?" + assert cleaned_text[expected_annotations[2].index] == "!" From 1970b4aa4b581d18c4a0a14b8ddcfd0041064d0c Mon Sep 17 00:00:00 2001 From: Akram Ben Aissi Date: Tue, 7 Oct 2025 19:27:24 +0100 Subject: [PATCH 2/6] fix: improve model availability checks: Allows use of unavailable models on startup (#3717) - Allows use of unavailable models on startup - Add has_model method to ModelsRoutingTable for checking pre-registered models - Update check_model_availability to check model_store before provider APIs # What does this PR do? 
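A condensed, illustrative sketch of the new check order (the real change is to `OpenAIMixin.check_model_availability`, shown in the diff below; the `ModelStore` protocol and the `OpenAIMixinSketch` class here are stand-ins for illustration):

```python
from typing import Any, Protocol


class ModelStore(Protocol):
    """Stand-in for the routing table / model store interface."""

    async def has_model(self, model_id: str) -> bool: ...


class OpenAIMixinSketch:
    def __init__(self, model_store: ModelStore | None = None):
        self.model_store = model_store
        self._model_cache: dict[str, Any] = {}

    async def list_models(self) -> None:
        """Populate self._model_cache from the provider's /v1/models endpoint (omitted here)."""

    async def check_model_availability(self, model: str) -> bool:
        # 1. Pre-registered models are accepted even if the provider endpoint is unreachable.
        if self.model_store and await self.model_store.has_model(model):
            return True
        # 2. Otherwise fall back to the provider's dynamic model listing.
        if not self._model_cache:
            await self.list_models()
        return model in self._model_cache
```

This ordering is why, in the test plan below, the stack starts even though the vLLM endpoint never answers `/models`.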
## Test Plan Start llama stack and point unavailable vLLM ``` VLLM_URL=https://my-unavailable-vllm/v1 MILVUS_DB_PATH=./milvus.db INFERENCE_MODEL=vllm uv run --with llama-stack llama stack build --distro starter --image-type venv --run ``` llama stack will start without crashing but only notifying error. ``` - provider_id: rag-runtime toolgroup_id: builtin::rag vector_dbs: [] version: 2 INFO 2025-10-07 06:40:41,804 llama_stack.providers.utils.inference.inference_store:74 inference: Write queue disabled for SQLite to avoid concurrency issues INFO 2025-10-07 06:40:42,066 llama_stack.providers.utils.responses.responses_store:96 openai_responses: Write queue disabled for SQLite to avoid concurrency issues ERROR 2025-10-07 06:40:58,882 llama_stack.providers.utils.inference.openai_mixin:436 providers::utils: VLLMInferenceAdapter.list_provider_model_ids() failed with: Request timed out. WARNING 2025-10-07 06:40:58,883 llama_stack.core.routing_tables.models:36 core::routing_tables: Model refresh failed for provider vllm: Request timed out. [...] INFO 2025-10-07 06:40:59,036 uvicorn.error:216 uncategorized: Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit) INFO 2025-10-07 06:41:04,064 openai._base_client:1618 uncategorized: Retrying request to /models in 0.398814 seconds INFO 2025-10-07 06:41:09,497 openai._base_client:1618 uncategorized: Retrying request to /models in 0.781908 seconds ERROR 2025-10-07 06:41:15,282 llama_stack.providers.utils.inference.openai_mixin:436 providers::utils: VLLMInferenceAdapter.list_provider_model_ids() failed with: Request timed out. WARNING 2025-10-07 06:41:15,283 llama_stack.core.routing_tables.models:36 core::routing_tables: Model refresh failed for provider vllm: Request timed out. ``` --- llama_stack/core/routing_tables/models.py | 13 +++++++ .../providers/utils/inference/openai_mixin.py | 10 ++++- .../routers/test_routing_tables.py | 6 +++ .../utils/inference/test_openai_mixin.py | 39 ++++++++++++++++++- 4 files changed, 64 insertions(+), 4 deletions(-) diff --git a/llama_stack/core/routing_tables/models.py b/llama_stack/core/routing_tables/models.py index 69d7e9b6f..716be936a 100644 --- a/llama_stack/core/routing_tables/models.py +++ b/llama_stack/core/routing_tables/models.py @@ -67,6 +67,19 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models): raise ValueError(f"Provider {model.provider_id} not found in the routing table") return self.impls_by_provider_id[model.provider_id] + async def has_model(self, model_id: str) -> bool: + """ + Check if a model exists in the routing table. + + :param model_id: The model identifier to check + :return: True if the model exists, False otherwise + """ + try: + await lookup_model(self, model_id) + return True + except ModelNotFoundError: + return False + async def register_model( self, model_id: str, diff --git a/llama_stack/providers/utils/inference/openai_mixin.py b/llama_stack/providers/utils/inference/openai_mixin.py index 3c5c5b4de..cba7508a2 100644 --- a/llama_stack/providers/utils/inference/openai_mixin.py +++ b/llama_stack/providers/utils/inference/openai_mixin.py @@ -474,11 +474,17 @@ class OpenAIMixin(NeedsRequestProviderData, ABC, BaseModel): async def check_model_availability(self, model: str) -> bool: """ - Check if a specific model is available from the provider's /v1/models. + Check if a specific model is available from the provider's /v1/models or pre-registered. :param model: The model identifier to check. - :return: True if the model is available dynamically, False otherwise. 
+ :return: True if the model is available dynamically or pre-registered, False otherwise. """ + # First check if the model is pre-registered in the model store + if hasattr(self, "model_store") and self.model_store: + if await self.model_store.has_model(model): + return True + + # Then check the provider's dynamic model cache if not self._model_cache: await self.list_models() return model in self._model_cache diff --git a/tests/unit/distribution/routers/test_routing_tables.py b/tests/unit/distribution/routers/test_routing_tables.py index 54a9dd72e..a1c3d1e95 100644 --- a/tests/unit/distribution/routers/test_routing_tables.py +++ b/tests/unit/distribution/routers/test_routing_tables.py @@ -201,6 +201,12 @@ async def test_models_routing_table(cached_disk_dist_registry): non_existent = await table.get_object_by_identifier("model", "non-existent-model") assert non_existent is None + # Test has_model + assert await table.has_model("test_provider/test-model") + assert await table.has_model("test_provider/test-model-2") + assert not await table.has_model("non-existent-model") + assert not await table.has_model("test_provider/non-existent-model") + await table.unregister_model(model_id="test_provider/test-model") await table.unregister_model(model_id="test_provider/test-model-2") diff --git a/tests/unit/providers/utils/inference/test_openai_mixin.py b/tests/unit/providers/utils/inference/test_openai_mixin.py index 2e3a62ca6..ad9406951 100644 --- a/tests/unit/providers/utils/inference/test_openai_mixin.py +++ b/tests/unit/providers/utils/inference/test_openai_mixin.py @@ -44,11 +44,12 @@ def mixin(): config = RemoteInferenceProviderConfig() mixin_instance = OpenAIMixinImpl(config=config) - # just enough to satisfy _get_provider_model_id calls - mock_model_store = MagicMock() + # Mock model_store with async methods + mock_model_store = AsyncMock() mock_model = MagicMock() mock_model.provider_resource_id = "test-provider-resource-id" mock_model_store.get_model = AsyncMock(return_value=mock_model) + mock_model_store.has_model = AsyncMock(return_value=False) # Default to False, tests can override mixin_instance.model_store = mock_model_store return mixin_instance @@ -189,6 +190,40 @@ class TestOpenAIMixinCheckModelAvailability: assert len(mixin._model_cache) == 3 + async def test_check_model_availability_with_pre_registered_model( + self, mixin, mock_client_with_models, mock_client_context + ): + """Test that check_model_availability returns True for pre-registered models in model_store""" + # Mock model_store.has_model to return True for a specific model + mock_model_store = AsyncMock() + mock_model_store.has_model = AsyncMock(return_value=True) + mixin.model_store = mock_model_store + + # Test that pre-registered model is found without calling the provider's API + with mock_client_context(mixin, mock_client_with_models): + mock_client_with_models.models.list.assert_not_called() + assert await mixin.check_model_availability("pre-registered-model") + # Should not call the provider's list_models since model was found in store + mock_client_with_models.models.list.assert_not_called() + mock_model_store.has_model.assert_called_once_with("pre-registered-model") + + async def test_check_model_availability_fallback_to_provider_when_not_in_store( + self, mixin, mock_client_with_models, mock_client_context + ): + """Test that check_model_availability falls back to provider when model not in store""" + # Mock model_store.has_model to return False + mock_model_store = AsyncMock() + mock_model_store.has_model = 
AsyncMock(return_value=False) + mixin.model_store = mock_model_store + + # Test that it falls back to provider's model cache + with mock_client_context(mixin, mock_client_with_models): + mock_client_with_models.models.list.assert_not_called() + assert await mixin.check_model_availability("some-mock-model-id") + # Should call the provider's list_models since model was not found in store + mock_client_with_models.models.list.assert_called_once() + mock_model_store.has_model.assert_called_once_with("some-mock-model-id") + class TestOpenAIMixinCacheBehavior: """Test cases for cache behavior and edge cases""" From c2d97a9db95965d4272c4bf4ee2a70f57637e456 Mon Sep 17 00:00:00 2001 From: slekkala1 Date: Tue, 7 Oct 2025 14:23:14 -0700 Subject: [PATCH 3/6] chore: fix flaky unit test and add proper shutdown for file batches (#3725) # What does this PR do? Have been running into flaky unit test failures: https://github.com/llamastack/llama-stack/actions/runs/18319987543/job/52170354944?pr=3711 Fixing below 1. Shutting down properly by cancelling any stale file batches tasks running in background. 2. Also, use unique_kvstore_config, so the test dont use same db path and maintain test isolation ## Test Plan Ran unit test locally and CI --- .../providers/inline/vector_io/faiss/faiss.py | 4 ++-- .../inline/vector_io/sqlite_vec/sqlite_vec.py | 4 ++-- .../remote/vector_io/chroma/chroma.py | 3 ++- .../remote/vector_io/milvus/milvus.py | 2 ++ .../remote/vector_io/pgvector/pgvector.py | 2 ++ .../remote/vector_io/qdrant/qdrant.py | 2 ++ .../remote/vector_io/weaviate/weaviate.py | 2 ++ .../utils/memory/openai_vector_store_mixin.py | 13 ++++++++++ tests/unit/providers/vector_io/conftest.py | 24 +++++++++---------- 9 files changed, 39 insertions(+), 17 deletions(-) diff --git a/llama_stack/providers/inline/vector_io/faiss/faiss.py b/llama_stack/providers/inline/vector_io/faiss/faiss.py index 405c134e5..5a456c7c9 100644 --- a/llama_stack/providers/inline/vector_io/faiss/faiss.py +++ b/llama_stack/providers/inline/vector_io/faiss/faiss.py @@ -225,8 +225,8 @@ class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPr await self.initialize_openai_vector_stores() async def shutdown(self) -> None: - # Cleanup if needed - pass + # Clean up mixin resources (file batch tasks) + await super().shutdown() async def health(self) -> HealthResponse: """ diff --git a/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py b/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py index 26231a9b7..a433257b2 100644 --- a/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +++ b/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py @@ -434,8 +434,8 @@ class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoc await self.initialize_openai_vector_stores() async def shutdown(self) -> None: - # nothing to do since we don't maintain a persistent connection - pass + # Clean up mixin resources (file batch tasks) + await super().shutdown() async def list_vector_dbs(self) -> list[VectorDB]: return [v.vector_db for v in self.cache.values()] diff --git a/llama_stack/providers/remote/vector_io/chroma/chroma.py b/llama_stack/providers/remote/vector_io/chroma/chroma.py index 511123d6e..331e5432e 100644 --- a/llama_stack/providers/remote/vector_io/chroma/chroma.py +++ b/llama_stack/providers/remote/vector_io/chroma/chroma.py @@ -167,7 +167,8 @@ class ChromaVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolP self.openai_vector_stores = await 
self._load_openai_vector_stores() async def shutdown(self) -> None: - pass + # Clean up mixin resources (file batch tasks) + await super().shutdown() async def register_vector_db( self, diff --git a/llama_stack/providers/remote/vector_io/milvus/milvus.py b/llama_stack/providers/remote/vector_io/milvus/milvus.py index 0acc90595..029eacfe3 100644 --- a/llama_stack/providers/remote/vector_io/milvus/milvus.py +++ b/llama_stack/providers/remote/vector_io/milvus/milvus.py @@ -349,6 +349,8 @@ class MilvusVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolP async def shutdown(self) -> None: self.client.close() + # Clean up mixin resources (file batch tasks) + await super().shutdown() async def register_vector_db( self, diff --git a/llama_stack/providers/remote/vector_io/pgvector/pgvector.py b/llama_stack/providers/remote/vector_io/pgvector/pgvector.py index dfdfef6eb..21c388b1d 100644 --- a/llama_stack/providers/remote/vector_io/pgvector/pgvector.py +++ b/llama_stack/providers/remote/vector_io/pgvector/pgvector.py @@ -390,6 +390,8 @@ class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoco if self.conn is not None: self.conn.close() log.info("Connection to PGVector database server closed") + # Clean up mixin resources (file batch tasks) + await super().shutdown() async def register_vector_db(self, vector_db: VectorDB) -> None: # Persist vector DB metadata in the KV store diff --git a/llama_stack/providers/remote/vector_io/qdrant/qdrant.py b/llama_stack/providers/remote/vector_io/qdrant/qdrant.py index 6b386840c..021938afd 100644 --- a/llama_stack/providers/remote/vector_io/qdrant/qdrant.py +++ b/llama_stack/providers/remote/vector_io/qdrant/qdrant.py @@ -191,6 +191,8 @@ class QdrantVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolP async def shutdown(self) -> None: await self.client.close() + # Clean up mixin resources (file batch tasks) + await super().shutdown() async def register_vector_db( self, diff --git a/llama_stack/providers/remote/vector_io/weaviate/weaviate.py b/llama_stack/providers/remote/vector_io/weaviate/weaviate.py index 54ac6f8d3..21df3bc45 100644 --- a/llama_stack/providers/remote/vector_io/weaviate/weaviate.py +++ b/llama_stack/providers/remote/vector_io/weaviate/weaviate.py @@ -347,6 +347,8 @@ class WeaviateVectorIOAdapter( async def shutdown(self) -> None: for client in self.client_cache.values(): client.close() + # Clean up mixin resources (file batch tasks) + await super().shutdown() async def register_vector_db( self, diff --git a/llama_stack/providers/utils/memory/openai_vector_store_mixin.py b/llama_stack/providers/utils/memory/openai_vector_store_mixin.py index 97079c3b3..2a5177f93 100644 --- a/llama_stack/providers/utils/memory/openai_vector_store_mixin.py +++ b/llama_stack/providers/utils/memory/openai_vector_store_mixin.py @@ -293,6 +293,19 @@ class OpenAIVectorStoreMixin(ABC): await self._resume_incomplete_batches() self._last_file_batch_cleanup_time = 0 + async def shutdown(self) -> None: + """Clean up mixin resources including background tasks.""" + # Cancel any running file batch tasks gracefully + if hasattr(self, "_file_batch_tasks"): + tasks_to_cancel = list(self._file_batch_tasks.items()) + for _, task in tasks_to_cancel: + if not task.done(): + task.cancel() + try: + await task + except asyncio.CancelledError: + pass + @abstractmethod async def delete_chunks(self, store_id: str, chunks_for_deletion: list[ChunkForDeletion]) -> None: """Delete chunks from a vector store.""" diff --git 
a/tests/unit/providers/vector_io/conftest.py b/tests/unit/providers/vector_io/conftest.py index 70ace695e..d122f9323 100644 --- a/tests/unit/providers/vector_io/conftest.py +++ b/tests/unit/providers/vector_io/conftest.py @@ -145,10 +145,10 @@ async def sqlite_vec_vec_index(embedding_dimension, tmp_path_factory): @pytest.fixture -async def sqlite_vec_adapter(sqlite_vec_db_path, mock_inference_api, embedding_dimension): +async def sqlite_vec_adapter(sqlite_vec_db_path, unique_kvstore_config, mock_inference_api, embedding_dimension): config = SQLiteVectorIOConfig( db_path=sqlite_vec_db_path, - kvstore=SqliteKVStoreConfig(), + kvstore=unique_kvstore_config, ) adapter = SQLiteVecVectorIOAdapter( config=config, @@ -187,10 +187,10 @@ async def milvus_vec_index(milvus_vec_db_path, embedding_dimension): @pytest.fixture -async def milvus_vec_adapter(milvus_vec_db_path, mock_inference_api): +async def milvus_vec_adapter(milvus_vec_db_path, unique_kvstore_config, mock_inference_api): config = MilvusVectorIOConfig( db_path=milvus_vec_db_path, - kvstore=SqliteKVStoreConfig(), + kvstore=unique_kvstore_config, ) adapter = MilvusVectorIOAdapter( config=config, @@ -264,10 +264,10 @@ async def chroma_vec_index(chroma_vec_db_path, embedding_dimension): @pytest.fixture -async def chroma_vec_adapter(chroma_vec_db_path, mock_inference_api, embedding_dimension): +async def chroma_vec_adapter(chroma_vec_db_path, unique_kvstore_config, mock_inference_api, embedding_dimension): config = ChromaVectorIOConfig( db_path=chroma_vec_db_path, - kvstore=SqliteKVStoreConfig(), + kvstore=unique_kvstore_config, ) adapter = ChromaVectorIOAdapter( config=config, @@ -296,12 +296,12 @@ def qdrant_vec_db_path(tmp_path_factory): @pytest.fixture -async def qdrant_vec_adapter(qdrant_vec_db_path, mock_inference_api, embedding_dimension): +async def qdrant_vec_adapter(qdrant_vec_db_path, unique_kvstore_config, mock_inference_api, embedding_dimension): import uuid config = QdrantVectorIOConfig( db_path=qdrant_vec_db_path, - kvstore=SqliteKVStoreConfig(), + kvstore=unique_kvstore_config, ) adapter = QdrantVectorIOAdapter( config=config, @@ -386,14 +386,14 @@ async def pgvector_vec_index(embedding_dimension, mock_psycopg2_connection): @pytest.fixture -async def pgvector_vec_adapter(mock_inference_api, embedding_dimension): +async def pgvector_vec_adapter(unique_kvstore_config, mock_inference_api, embedding_dimension): config = PGVectorVectorIOConfig( host="localhost", port=5432, db="test_db", user="test_user", password="test_password", - kvstore=SqliteKVStoreConfig(), + kvstore=unique_kvstore_config, ) adapter = PGVectorVectorIOAdapter(config, mock_inference_api, None) @@ -476,7 +476,7 @@ async def weaviate_vec_index(weaviate_vec_db_path): @pytest.fixture -async def weaviate_vec_adapter(weaviate_vec_db_path, mock_inference_api, embedding_dimension): +async def weaviate_vec_adapter(weaviate_vec_db_path, unique_kvstore_config, mock_inference_api, embedding_dimension): import pytest_socket import weaviate @@ -492,7 +492,7 @@ async def weaviate_vec_adapter(weaviate_vec_db_path, mock_inference_api, embeddi config = WeaviateVectorIOConfig( weaviate_cluster_url="localhost:8080", weaviate_api_key=None, - kvstore=SqliteKVStoreConfig(), + kvstore=unique_kvstore_config, ) adapter = WeaviateVectorIOAdapter( config=config, From bc7d4b423b68c28fabc8ac15a0a1aa353270d3fc Mon Sep 17 00:00:00 2001 From: Emilio Garcia Date: Tue, 7 Oct 2025 17:59:53 -0400 Subject: [PATCH 4/6] fix(scripts): select container runtime for telemetry (#3727) # What does this PR 
do? script runs with either docker or podman ## Test Plan passes when run --- scripts/telemetry/setup_telemetry.sh | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/scripts/telemetry/setup_telemetry.sh b/scripts/telemetry/setup_telemetry.sh index e0b57a354..ecdd56175 100755 --- a/scripts/telemetry/setup_telemetry.sh +++ b/scripts/telemetry/setup_telemetry.sh @@ -16,10 +16,19 @@ set -Eeuo pipefail -CONTAINER_RUNTIME=${CONTAINER_RUNTIME:-docker} -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +if command -v podman &> /dev/null; then + CONTAINER_RUNTIME="podman" +elif command -v docker &> /dev/null; then + CONTAINER_RUNTIME="docker" +else + echo "🚨 Neither Podman nor Docker could be found" + echo "Install Docker: https://docs.docker.com/get-docker/ or Podman: https://podman.io/getting-started/installation" + exit 1 +fi -echo "🚀 Setting up telemetry stack for Llama Stack using Podman..." +echo "🚀 Setting up telemetry stack for Llama Stack using $CONTAINER_RUNTIME..." + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" if ! command -v "$CONTAINER_RUNTIME" &> /dev/null; then echo "🚨 $CONTAINER_RUNTIME could not be found" From c940fe79389a930923bd7f30787c467ed6f132c3 Mon Sep 17 00:00:00 2001 From: Kai Wu Date: Tue, 7 Oct 2025 18:23:12 -0700 Subject: [PATCH 5/6] fix: fix nvidia provider (#3716) # What does this PR do? (Used claude to solve #3715, coded with claude but tested by me) ## From claude summary: **Problem**: The `NVIDIAInferenceAdapter` class was missing the `alias_to_provider_id_map` attribute, which caused the error: `ERROR 'NVIDIAInferenceAdapter' object has no attribute 'alias_to_provider_id_map'` **Root Cause**: The `NVIDIAInferenceAdapter` only inherited from `OpenAIMixin`, but some parts of the system expected it to have the `alias_to_provider_id_map` attribute, which is provided by the `ModelRegistryHelper` class. **Solution**: 1. **Added ModelRegistryHelper import**: Imported the `ModelRegistryHelper` class from `llama_stack.providers.utils.inference.model_registry` 2. **Updated inheritance**: Changed the class declaration to inherit from both `OpenAIMixin` and `ModelRegistryHelper` 3. **Added proper initialization**: Added an `__init__` method that properly initializes the `ModelRegistryHelper` with empty model entries (since NVIDIA uses dynamic model discovery) and the allowed models from the configuration **Key Changes**: * Added `from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper` * Changed class declaration from `class NVIDIAInferenceAdapter(OpenAIMixin):` to `class NVIDIAInferenceAdapter(OpenAIMixin, ModelRegistryHelper):` * Added `__init__` method that calls `ModelRegistryHelper.__init__(self, model_entries=[], allowed_models=config.allowed_models)` The inheritance order is important - `OpenAIMixin` comes first to ensure its `check_model_availability()` method takes precedence over the `ModelRegistryHelper` version, as mentioned in the class documentation. This fix ensures that the `NVIDIAInferenceAdapter` has the required `alias_to_provider_id_map` attribute while maintaining all existing functionality. 
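A toy illustration of the two points above (these classes are stand-ins, not the real llama_stack classes): listing `OpenAIMixin` first among the bases makes its `check_model_availability()` win under Python's MRO, while calling `ModelRegistryHelper.__init__` is what gives the instance the `alias_to_provider_id_map` attribute that was previously missing.

```python
# Toy classes only — they mimic the shape of the fix, not the real adapters.
class OpenAIMixinToy:
    async def check_model_availability(self, model: str) -> bool:
        return True  # stands in for the dynamic /v1/models lookup


class ModelRegistryHelperToy:
    def __init__(self, model_entries, allowed_models=None):
        # This is the attribute the adapter was missing before the fix.
        self.alias_to_provider_id_map = {e: e for e in model_entries}

    async def check_model_availability(self, model: str) -> bool:
        return False  # stands in for the static, warn-and-return-False behaviour


class NVIDIAAdapterToy(OpenAIMixinToy, ModelRegistryHelperToy):
    def __init__(self):
        ModelRegistryHelperToy.__init__(self, model_entries=[])


# The MRO puts OpenAIMixinToy ahead of ModelRegistryHelperToy, so its
# check_model_availability is the one that gets called.
print(NVIDIAAdapterToy.__mro__[1].__name__)                      # OpenAIMixinToy
print(hasattr(NVIDIAAdapterToy(), "alias_to_provider_id_map"))   # True
```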
## Test Plan Launching llama-stack server successfully, see logs: ``` NVIDIA_API_KEY=dummy NVIDIA_BASE_URL=http://localhost:8912 llama stack run /home/nvidia/.llama/distributions/starter/starter-run.yaml --image-type venv & [2] 3753042 (venv) nvidia@nv-meta-H100-testing-gpu01:~/kai/llama-stack$ WARNING 2025-10-07 00:29:09,848 root:266 uncategorized: Unknown logging category: openai::conversations. Falling back to default 'root' level: 20 WARNING 2025-10-07 00:29:09,932 root:266 uncategorized: Unknown logging category: cli. Falling back to default 'root' level: 20 INFO 2025-10-07 00:29:09,937 llama_stack.core.utils.config_resolution:45 core: Using file path: /home/nvidia/.llama/distributions/starter/starter-run.yaml INFO 2025-10-07 00:29:09,937 llama_stack.cli.stack.run:136 cli: Using run configuration: /home/nvidia/.llama/distributions/starter/starter-run.yaml Using virtual environment: /home/nvidia/kai/venv Virtual environment already activated + '[' -n /home/nvidia/.llama/distributions/starter/starter-run.yaml ']' + yaml_config_arg=/home/nvidia/.llama/distributions/starter/starter-run.yaml + llama stack run /home/nvidia/.llama/distributions/starter/starter-run.yaml --port 8321 WARNING 2025-10-07 00:29:11,432 root:266 uncategorized: Unknown logging category: openai::conversations. Falling back to default 'root' level: 20 WARNING 2025-10-07 00:29:11,593 root:266 uncategorized: Unknown logging category: cli. Falling back to default 'root' level: 20 INFO 2025-10-07 00:29:11,603 llama_stack.core.utils.config_resolution:45 core: Using file path: /home/nvidia/.llama/distributions/starter/starter-run.yaml INFO 2025-10-07 00:29:11,604 llama_stack.cli.stack.run:136 cli: Using run configuration: /home/nvidia/.llama/distributions/starter/starter-run.yaml INFO 2025-10-07 00:29:11,624 llama_stack.cli.stack.run:155 cli: No image type or image name provided. Assuming environment packages. 
INFO 2025-10-07 00:29:11,625 llama_stack.core.utils.config_resolution:45 core: Using file path: /home/nvidia/.llama/distributions/starter/starter-run.yaml INFO 2025-10-07 00:29:11,644 llama_stack.cli.stack.run:230 cli: HTTPS enabled with certificates: Key: None Cert: None INFO 2025-10-07 00:29:11,645 llama_stack.cli.stack.run:232 cli: Listening on ['::', '0.0.0.0']:8321 INFO 2025-10-07 00:29:11,816 llama_stack.core.utils.config_resolution:45 core: Using file path: /home/nvidia/.llama/distributions/starter/starter-run.yaml INFO 2025-10-07 00:29:11,836 llama_stack.core.server.server:480 core::server: Run configuration: INFO 2025-10-07 00:29:11,845 llama_stack.core.server.server:483 core::server: apis: - agents - batches - datasetio - eval - files - inference - post_training - safety - scoring - telemetry - tool_runtime - vector_io benchmarks: [] datasets: [] image_name: starter inference_store: db_path: /home/nvidia/.llama/distributions/starter/inference_store.db type: sqlite metadata_store: db_path: /home/nvidia/.llama/distributions/starter/registry.db type: sqlite models: [] providers: agents: - config: persistence_store: db_path: /home/nvidia/.llama/distributions/starter/agents_store.db type: sqlite responses_store: db_path: /home/nvidia/.llama/distributions/starter/responses_store.db type: sqlite provider_id: meta-reference provider_type: inline::meta-reference batches: - config: kvstore: db_path: /home/nvidia/.llama/distributions/starter/batches.db type: sqlite provider_id: reference provider_type: inline::reference datasetio: - config: kvstore: db_path: /home/nvidia/.llama/distributions/starter/huggingface_datasetio.db type: sqlite provider_id: huggingface provider_type: remote::huggingface - config: kvstore: db_path: /home/nvidia/.llama/distributions/starter/localfs_datasetio.db type: sqlite provider_id: localfs provider_type: inline::localfs eval: - config: kvstore: db_path: /home/nvidia/.llama/distributions/starter/meta_reference_eval.db type: sqlite provider_id: meta-reference provider_type: inline::meta-reference files: - config: metadata_store: db_path: /home/nvidia/.llama/distributions/starter/files_metadata.db type: sqlite storage_dir: /home/nvidia/.llama/distributions/starter/files provider_id: meta-reference-files provider_type: inline::localfs inference: - config: api_key: '********' url: https://api.fireworks.ai/inference/v1 provider_id: fireworks provider_type: remote::fireworks - config: api_key: '********' url: https://api.together.xyz/v1 provider_id: together provider_type: remote::together - config: {} provider_id: bedrock provider_type: remote::bedrock - config: api_key: '********' append_api_version: true url: http://localhost:8912 provider_id: nvidia provider_type: remote::nvidia - config: api_key: '********' base_url: https://api.openai.com/v1 provider_id: openai provider_type: remote::openai - config: api_key: '********' provider_id: anthropic provider_type: remote::anthropic - config: api_key: '********' provider_id: gemini provider_type: remote::gemini - config: api_key: '********' url: https://api.groq.com provider_id: groq provider_type: remote::groq - config: api_key: '********' url: https://api.sambanova.ai/v1 provider_id: sambanova provider_type: remote::sambanova - config: {} provider_id: sentence-transformers provider_type: inline::sentence-transformers post_training: - config: checkpoint_format: meta provider_id: torchtune-cpu provider_type: inline::torchtune-cpu safety: - config: excluded_categories: [] provider_id: llama-guard provider_type: 
inline::llama-guard - config: {} provider_id: code-scanner provider_type: inline::code-scanner scoring: - config: {} provider_id: basic provider_type: inline::basic - config: {} provider_id: llm-as-judge provider_type: inline::llm-as-judge - config: openai_api_key: '********' provider_id: braintrust provider_type: inline::braintrust telemetry: - config: service_name: "\u200B" sinks: sqlite sqlite_db_path: /home/nvidia/.llama/distributions/starter/trace_store.db provider_id: meta-reference provider_type: inline::meta-reference tool_runtime: - config: api_key: '********' max_results: 3 provider_id: brave-search provider_type: remote::brave-search - config: api_key: '********' max_results: 3 provider_id: tavily-search provider_type: remote::tavily-search - config: {} provider_id: rag-runtime provider_type: inline::rag-runtime - config: {} provider_id: model-context-protocol provider_type: remote::model-context-protocol vector_io: - config: kvstore: db_path: /home/nvidia/.llama/distributions/starter/faiss_store.db type: sqlite provider_id: faiss provider_type: inline::faiss - config: db_path: /home/nvidia/.llama/distributions/starter/sqlite_vec.db kvstore: db_path: /home/nvidia/.llama/distributions/starter/sqlite_vec_registry.db type: sqlite provider_id: sqlite-vec provider_type: inline::sqlite-vec scoring_fns: [] server: port: 8321 shields: [] tool_groups: - provider_id: tavily-search toolgroup_id: builtin::websearch - provider_id: rag-runtime toolgroup_id: builtin::rag vector_dbs: [] version: 2 INFO 2025-10-07 00:29:12,138 llama_stack.providers.remote.inference.nvidia.nvidia:49 inference::nvidia: Initializing NVIDIAInferenceAdapter(http://localhost:8912)... INFO 2025-10-07 00:29:12,921 llama_stack.providers.utils.inference.inference_store:74 inference: Write queue disabled for SQLite to avoid concurrency issues INFO 2025-10-07 00:29:13,524 llama_stack.providers.utils.responses.responses_store:96 openai_responses: Write queue disabled for SQLite to avoid concurrency issues ERROR 2025-10-07 00:29:13,679 llama_stack.providers.utils.inference.openai_mixin:439 providers::utils: FireworksInferenceAdapter.list_provider_model_ids() failed with: API key is not set. Please provide a valid API key in the provider data header, e.g. x-llamastack-provider-data: {"fireworks_api_key": ""}, or in the provider config. WARNING 2025-10-07 00:29:13,681 llama_stack.core.routing_tables.models:36 core::routing_tables: Model refresh failed for provider fireworks: API key is not set. Please provide a valid API key in the provider data header, e.g. x-llamastack-provider-data: {"fireworks_api_key": ""}, or in the provider config. ERROR 2025-10-07 00:29:13,682 llama_stack.providers.utils.inference.openai_mixin:439 providers::utils: TogetherInferenceAdapter.list_provider_model_ids() failed with: Pass Together API Key in the header X-LlamaStack-Provider-Data as { "together_api_key": } WARNING 2025-10-07 00:29:13,684 llama_stack.core.routing_tables.models:36 core::routing_tables: Model refresh failed for provider together: Pass Together API Key in the header X-LlamaStack-Provider-Data as { "together_api_key": } Handling connection for 8912 INFO 2025-10-07 00:29:14,047 llama_stack.providers.utils.inference.openai_mixin:448 providers::utils: NVIDIAInferenceAdapter.list_provider_model_ids() returned 3 models ERROR 2025-10-07 00:29:14,062 llama_stack.providers.utils.inference.openai_mixin:439 providers::utils: OpenAIInferenceAdapter.list_provider_model_ids() failed with: API key is not set. 
Please provide a valid API key in the provider data header, e.g. x-llamastack-provider-data: {"openai_api_key": ""}, or in the provider config. WARNING 2025-10-07 00:29:14,063 llama_stack.core.routing_tables.models:36 core::routing_tables: Model refresh failed for provider openai: API key is not set. Please provide a valid API key in the provider data header, e.g. x-llamastack-provider-data: {"openai_api_key": ""}, or in the provider config. ERROR 2025-10-07 00:29:14,099 llama_stack.providers.utils.inference.openai_mixin:439 providers::utils: AnthropicInferenceAdapter.list_provider_model_ids() failed with: "Could not resolve authentication method. Expected either api_key or auth_token to be set. Or for one of the `X-Api-Key` or `Authorization` headers to be explicitly omitted" WARNING 2025-10-07 00:29:14,100 llama_stack.core.routing_tables.models:36 core::routing_tables: Model refresh failed for provider anthropic: "Could not resolve authentication method. Expected either api_key or auth_token to be set. Or for one of the `X-Api-Key` or `Authorization` headers to be explicitly omitted" ERROR 2025-10-07 00:29:14,102 llama_stack.providers.utils.inference.openai_mixin:439 providers::utils: GeminiInferenceAdapter.list_provider_model_ids() failed with: API key is not set. Please provide a valid API key in the provider data header, e.g. x-llamastack-provider-data: {"gemini_api_key": ""}, or in the provider config. WARNING 2025-10-07 00:29:14,103 llama_stack.core.routing_tables.models:36 core::routing_tables: Model refresh failed for provider gemini: API key is not set. Please provide a valid API key in the provider data header, e.g. x-llamastack-provider-data: {"gemini_api_key": ""}, or in the provider config. ERROR 2025-10-07 00:29:14,105 llama_stack.providers.utils.inference.openai_mixin:439 providers::utils: GroqInferenceAdapter.list_provider_model_ids() failed with: API key is not set. Please provide a valid API key in the provider data header, e.g. x-llamastack-provider-data: {"groq_api_key": ""}, or in the provider config. WARNING 2025-10-07 00:29:14,106 llama_stack.core.routing_tables.models:36 core::routing_tables: Model refresh failed for provider groq: API key is not set. Please provide a valid API key in the provider data header, e.g. x-llamastack-provider-data: {"groq_api_key": ""}, or in the provider config. ERROR 2025-10-07 00:29:14,107 llama_stack.providers.utils.inference.openai_mixin:439 providers::utils: SambaNovaInferenceAdapter.list_provider_model_ids() failed with: API key is not set. Please provide a valid API key in the provider data header, e.g. x-llamastack-provider-data: {"sambanova_api_key": ""}, or in the provider config. WARNING 2025-10-07 00:29:14,109 llama_stack.core.routing_tables.models:36 core::routing_tables: Model refresh failed for provider sambanova: API key is not set. Please provide a valid API key in the provider data header, e.g. x-llamastack-provider-data: {"sambanova_api_key": ""}, or in the provider config. INFO 2025-10-07 00:29:14,454 uvicorn.error:84 uncategorized: Started server process [3753046] INFO 2025-10-07 00:29:14,455 uvicorn.error:48 uncategorized: Waiting for application startup. 
INFO 2025-10-07 00:29:14,457 llama_stack.core.server.server:170 core::server: Starting up INFO 2025-10-07 00:29:14,458 llama_stack.core.stack:415 core: starting registry refresh task ERROR 2025-10-07 00:29:14,459 llama_stack.providers.utils.inference.openai_mixin:439 providers::utils: FireworksInferenceAdapter.list_provider_model_ids() failed with: API key is not set. Please provide a valid API key in the provider data header, e.g. x-llamastack-provider-data: {"fireworks_api_key": ""}, or in the provider config. WARNING 2025-10-07 00:29:14,461 llama_stack.core.routing_tables.models:36 core::routing_tables: Model refresh failed for provider fireworks: API key is not set. Please provide a valid API key in the provider data header, e.g. x-llamastack-provider-data: {"fireworks_api_key": ""}, or in the provider config. ERROR 2025-10-07 00:29:14,462 llama_stack.providers.utils.inference.openai_mixin:439 providers::utils: TogetherInferenceAdapter.list_provider_model_ids() failed with: Pass Together API Key in the header X-LlamaStack-Provider-Data as { "together_api_key": } WARNING 2025-10-07 00:29:14,463 llama_stack.core.routing_tables.models:36 core::routing_tables: Model refresh failed for provider together: Pass Together API Key in the header X-LlamaStack-Provider-Data as { "together_api_key": } ERROR 2025-10-07 00:29:14,465 llama_stack.providers.utils.inference.openai_mixin:439 providers::utils: OpenAIInferenceAdapter.list_provider_model_ids() failed with: API key is not set. Please provide a valid API key in the provider data header, e.g. x-llamastack-provider-data: {"openai_api_key": ""}, or in the provider config. WARNING 2025-10-07 00:29:14,466 llama_stack.core.routing_tables.models:36 core::routing_tables: Model refresh failed for provider openai: API key is not set. Please provide a valid API key in the provider data header, e.g. x-llamastack-provider-data: {"openai_api_key": ""}, or in the provider config. INFO 2025-10-07 00:29:14,500 uvicorn.error:62 uncategorized: Application startup complete. ERROR 2025-10-07 00:29:14,502 llama_stack.providers.utils.inference.openai_mixin:439 providers::utils: AnthropicInferenceAdapter.list_provider_model_ids() failed with: "Could not resolve authentication method. Expected either api_key or auth_token to be set. Or for one of the `X-Api-Key` or `Authorization` headers to be explicitly omitted" WARNING 2025-10-07 00:29:14,503 llama_stack.core.routing_tables.models:36 core::routing_tables: Model refresh failed for provider anthropic: "Could not resolve authentication method. Expected either api_key or auth_token to be set. Or for one of the `X-Api-Key` or `Authorization` headers to be explicitly omitted" ERROR 2025-10-07 00:29:14,504 llama_stack.providers.utils.inference.openai_mixin:439 providers::utils: GeminiInferenceAdapter.list_provider_model_ids() failed with: API key is not set. Please provide a valid API key in the provider data header, e.g. x-llamastack-provider-data: {"gemini_api_key": ""}, or in the provider config. WARNING 2025-10-07 00:29:14,506 llama_stack.core.routing_tables.models:36 core::routing_tables: Model refresh failed for provider gemini: API key is not set. Please provide a valid API key in the provider data header, e.g. x-llamastack-provider-data: {"gemini_api_key": ""}, or in the provider config. ERROR 2025-10-07 00:29:14,507 llama_stack.providers.utils.inference.openai_mixin:439 providers::utils: GroqInferenceAdapter.list_provider_model_ids() failed with: API key is not set. 
Please provide a valid API key in the provider data header, e.g. x-llamastack-provider-data: {"groq_api_key": ""}, or in the provider config. WARNING 2025-10-07 00:29:14,508 llama_stack.core.routing_tables.models:36 core::routing_tables: Model refresh failed for provider groq: API key is not set. Please provide a valid API key in the provider data header, e.g. x-llamastack-provider-data: {"groq_api_key": ""}, or in the provider config. ERROR 2025-10-07 00:29:14,510 llama_stack.providers.utils.inference.openai_mixin:439 providers::utils: SambaNovaInferenceAdapter.list_provider_model_ids() failed with: API key is not set. Please provide a valid API key in the provider data header, e.g. x-llamastack-provider-data: {"sambanova_api_key": ""}, or in the provider config. WARNING 2025-10-07 00:29:14,511 llama_stack.core.routing_tables.models:36 core::routing_tables: Model refresh failed for provider sambanova: API key is not set. Please provide a valid API key in the provider data header, e.g. x-llamastack-provider-data: {"sambanova_api_key": ""}, or in the provider config. INFO 2025-10-07 00:29:14,513 uvicorn.error:216 uncategorized: Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit) ``` tested with curl model, it also works: ``` curl http://localhost:8321/v1/models {"data":[{"identifier":"bedrock/meta.llama3-1-8b-instruct-v1:0","provider_resource_id":"meta.llama3-1-8b-instruct-v1:0","provider_id":"bedrock","type":"model","metadata":{},"model_type":"llm"},{"identifier":"bedrock/meta.llama3-1-70b-instruct-v1:0","provider_resource_id":"meta.llama3-1-70b-instruct-v1:0","provider_id":"bedrock","type":"model","metadata":{},"model_type":"llm"},{"identifier":"bedrock/meta.llama3-1-405b-instruct-v1:0","provider_resource_id":"meta.llama3-1-405b-instruct-v1:0","provider_id":"bedrock","type":"model","metadata":{},"model_type":"llm"},{"identifier":"nvidia/bigcode/starcoder2-7b","provider_resource_id":"bigcode/starcoder2-7b","provider_id":"nvidia","type":"model","metadata":{},"model_type":"llm"},{"identifier":"nvidia/meta/llama-3.3-70b-instruct","provider_resource_id":"meta/llama-3.3-70b-instruct","provider_id":"nvidia","type":"model","metadata":{},"model_type":"llm"},{"identifier":"nvidia/nvidia/llama-3.2-nv-embedqa-1b-v2","provider_resource_id":"nvidia/llama-3.2-nv-embedqa-1b-v2","provider_id":"nvidia","type":"model","metadata":{"embedding_dimension":2048,"context_length":8192},"model_type":"embedding"},{"identifier":"sentence-transformers/all-MiniLM-L6-v2","provider_resource_id":"all-MiniLM-L6-v2","provider_id":"sentence-transformers","type":"model","metadata":{"embedding_dimension":384},"model_type":"embedding"}]}% ``` --------- Co-authored-by: github-actions[bot] --- .../remote/inference/nvidia/nvidia.py | 28 ++++++++++++++----- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/llama_stack/providers/remote/inference/nvidia/nvidia.py b/llama_stack/providers/remote/inference/nvidia/nvidia.py index 7a2697327..b2ad060fb 100644 --- a/llama_stack/providers/remote/inference/nvidia/nvidia.py +++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py @@ -13,6 +13,7 @@ from llama_stack.apis.inference import ( OpenAIEmbeddingUsage, ) from llama_stack.log import get_logger +from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin from . 
import NVIDIAConfig @@ -21,9 +22,7 @@ from .utils import _is_nvidia_hosted logger = get_logger(name=__name__, category="inference::nvidia") -class NVIDIAInferenceAdapter(OpenAIMixin): - config: NVIDIAConfig - +class NVIDIAInferenceAdapter(OpenAIMixin, ModelRegistryHelper): """ NVIDIA Inference Adapter for Llama Stack. @@ -37,12 +36,27 @@ class NVIDIAInferenceAdapter(OpenAIMixin): - ModelRegistryHelper.check_model_availability() just returns False and shows a warning """ + def __init__(self, config: NVIDIAConfig) -> None: + """Initialize the NVIDIA inference adapter with configuration.""" + # Initialize ModelRegistryHelper with empty model entries since NVIDIA uses dynamic model discovery + ModelRegistryHelper.__init__(self, model_entries=[], allowed_models=config.allowed_models) + self.config = config + # source: https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/support-matrix.html embedding_model_metadata: dict[str, dict[str, int]] = { - "nvidia/llama-3.2-nv-embedqa-1b-v2": {"embedding_dimension": 2048, "context_length": 8192}, + "nvidia/llama-3.2-nv-embedqa-1b-v2": { + "embedding_dimension": 2048, + "context_length": 8192, + }, "nvidia/nv-embedqa-e5-v5": {"embedding_dimension": 512, "context_length": 1024}, - "nvidia/nv-embedqa-mistral-7b-v2": {"embedding_dimension": 512, "context_length": 4096}, - "snowflake/arctic-embed-l": {"embedding_dimension": 512, "context_length": 1024}, + "nvidia/nv-embedqa-mistral-7b-v2": { + "embedding_dimension": 512, + "context_length": 4096, + }, + "snowflake/arctic-embed-l": { + "embedding_dimension": 512, + "context_length": 1024, + }, } async def initialize(self) -> None: @@ -95,7 +109,7 @@ class NVIDIAInferenceAdapter(OpenAIMixin): response = await self.client.embeddings.create( model=await self._get_provider_model_id(model), input=input, - encoding_format=encoding_format if encoding_format is not None else NOT_GIVEN, + encoding_format=(encoding_format if encoding_format is not None else NOT_GIVEN), dimensions=dimensions if dimensions is not None else NOT_GIVEN, user=user if user is not None else NOT_GIVEN, extra_body=extra_body, From 9e61a4ab8c4fc68b6895ca24d965c2a120294180 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Tue, 7 Oct 2025 19:07:45 -0700 Subject: [PATCH 6/6] chore: Revert "fix: fix nvidia provider (#3716)" This reverts commit c940fe79389a930923bd7f30787c467ed6f132c3. --- .../remote/inference/nvidia/nvidia.py | 28 +++++-------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/llama_stack/providers/remote/inference/nvidia/nvidia.py b/llama_stack/providers/remote/inference/nvidia/nvidia.py index b2ad060fb..7a2697327 100644 --- a/llama_stack/providers/remote/inference/nvidia/nvidia.py +++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py @@ -13,7 +13,6 @@ from llama_stack.apis.inference import ( OpenAIEmbeddingUsage, ) from llama_stack.log import get_logger -from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin from . import NVIDIAConfig @@ -22,7 +21,9 @@ from .utils import _is_nvidia_hosted logger = get_logger(name=__name__, category="inference::nvidia") -class NVIDIAInferenceAdapter(OpenAIMixin, ModelRegistryHelper): +class NVIDIAInferenceAdapter(OpenAIMixin): + config: NVIDIAConfig + """ NVIDIA Inference Adapter for Llama Stack. 
@@ -36,27 +37,12 @@ class NVIDIAInferenceAdapter(OpenAIMixin, ModelRegistryHelper): - ModelRegistryHelper.check_model_availability() just returns False and shows a warning """ - def __init__(self, config: NVIDIAConfig) -> None: - """Initialize the NVIDIA inference adapter with configuration.""" - # Initialize ModelRegistryHelper with empty model entries since NVIDIA uses dynamic model discovery - ModelRegistryHelper.__init__(self, model_entries=[], allowed_models=config.allowed_models) - self.config = config - # source: https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/support-matrix.html embedding_model_metadata: dict[str, dict[str, int]] = { - "nvidia/llama-3.2-nv-embedqa-1b-v2": { - "embedding_dimension": 2048, - "context_length": 8192, - }, + "nvidia/llama-3.2-nv-embedqa-1b-v2": {"embedding_dimension": 2048, "context_length": 8192}, "nvidia/nv-embedqa-e5-v5": {"embedding_dimension": 512, "context_length": 1024}, - "nvidia/nv-embedqa-mistral-7b-v2": { - "embedding_dimension": 512, - "context_length": 4096, - }, - "snowflake/arctic-embed-l": { - "embedding_dimension": 512, - "context_length": 1024, - }, + "nvidia/nv-embedqa-mistral-7b-v2": {"embedding_dimension": 512, "context_length": 4096}, + "snowflake/arctic-embed-l": {"embedding_dimension": 512, "context_length": 1024}, } async def initialize(self) -> None: @@ -109,7 +95,7 @@ class NVIDIAInferenceAdapter(OpenAIMixin, ModelRegistryHelper): response = await self.client.embeddings.create( model=await self._get_provider_model_id(model), input=input, - encoding_format=(encoding_format if encoding_format is not None else NOT_GIVEN), + encoding_format=encoding_format if encoding_format is not None else NOT_GIVEN, dimensions=dimensions if dimensions is not None else NOT_GIVEN, user=user if user is not None else NOT_GIVEN, extra_body=extra_body,