add everyting for docs

This commit is contained in:
ishaan-jaff 2023-07-29 07:00:13 -07:00
parent de45a738ee
commit 0fe8799f94
1015 changed files with 185353 additions and 0 deletions

View file

@ -0,0 +1,161 @@
# Implement a Custom Retriever
In this walkthrough, you will implement a simple custom retriever in LangChain using a simple dot product distance lookup.
All retrievers inherit from the `BaseRetriever` class and override the following abstract methods:
```python
from abc import ABC, abstractmethod
from typing import Any, List
from langchain.schema import Document
from langchain.callbacks.manager import (
AsyncCallbackManagerForRetrieverRun,
CallbackManagerForRetrieverRun,
)
class BaseRetriever(ABC):
    """Abstract interface that every retriever implements."""

    @abstractmethod
    def _get_relevant_documents(
        self,
        query: str,
        *,
        run_manager: CallbackManagerForRetrieverRun,
    ) -> List[Document]:
        """Return the documents relevant to ``query``.

        Args:
            query: The string to find relevant documents for.
            run_manager: The callbacks handler to use.

        Returns:
            A list of relevant documents.
        """

    @abstractmethod
    async def _aget_relevant_documents(
        self,
        query: str,
        *,
        run_manager: AsyncCallbackManagerForRetrieverRun,
    ) -> List[Document]:
        """Asynchronously return the documents relevant to ``query``.

        Args:
            query: The string to find relevant documents for.
            run_manager: The callbacks handler to use.

        Returns:
            A list of relevant documents.
        """
```
The `_get_relevant_documents` and async `_aget_relevant_documents` methods can be implemented however you see fit. The `run_manager` is useful if your retriever calls other traceable LangChain primitives like LLMs, chains, or tools.
Below, implement an example that fetches the most similar documents from a list of documents using a numpy array of embeddings.
```python
from typing import Any, List, Optional
import numpy as np
from langchain.callbacks.manager import (
AsyncCallbackManagerForRetrieverRun,
CallbackManagerForRetrieverRun,
)
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings.base import Embeddings
from langchain.schema import BaseRetriever, Document
class NumpyRetriever(BaseRetriever):
    """Retrieve the texts whose embeddings best match a query.

    Relevance is the dot product between the query embedding and each
    stored vector; a larger dot product means a more relevant text.
    """

    def __init__(
        self,
        texts: List[str],
        vectors: np.ndarray,
        embeddings: Optional[Embeddings] = None,
        num_to_return: int = 1,
    ) -> None:
        """Store the texts, their vectors, and the query embedding model.

        Args:
            texts: The raw texts; ``texts[i]`` is the content returned for
                ``vectors[i]``.
            vectors: The embeddings of ``texts``, one row per text.
            embeddings: The model used to embed queries. Falls back to
                ``OpenAIEmbeddings()`` when not provided.
            num_to_return: The maximum number of documents to return per
                query.
        """
        super().__init__()
        self.embeddings = embeddings or OpenAIEmbeddings()
        self.texts = texts
        self.vectors = vectors
        self.num_to_return = num_to_return

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embeddings: Optional[Embeddings] = None,
        **kwargs: Any,
    ) -> "NumpyRetriever":
        """Embed ``texts`` and build a retriever over the resulting vectors.

        Args:
            texts: The texts to index.
            embeddings: The model used to embed both the texts and later
                queries. Falls back to ``OpenAIEmbeddings()``.
            **kwargs: Extra constructor arguments such as ``num_to_return``.

        Returns:
            A retriever holding ``texts`` and their embeddings.
        """
        embeddings = embeddings or OpenAIEmbeddings()
        vectors = np.array(embeddings.embed_documents(texts))
        # Forward kwargs so options like num_to_return are not silently lost.
        return cls(texts, vectors, embeddings, **kwargs)

    def _get_relevant_documents_from_query_vector(
        self, vector_query: np.ndarray
    ) -> List[Document]:
        """Return the best-matching documents for an embedded query.

        Args:
            vector_query: The embedding of the query.

        Returns:
            At most ``num_to_return`` documents, most similar first. Each
            document's metadata holds the position of its text under the
            ``"index"`` key.
        """
        # Never ask for more documents than are stored, and bail out early
        # when there is nothing to return (argpartition rejects empty input).
        k = min(self.num_to_return, len(self.vectors))
        if k <= 0:
            return []
        dot_product = np.dot(self.vectors, vector_query)
        # Get the indices of the k documents with the largest dot product
        indices = np.argpartition(dot_product, -k)[-k:]
        # Order them from most to least similar
        indices = indices[np.argsort(dot_product[indices])[::-1]]
        return [
            Document(
                page_content=self.texts[int(idx)],
                # Plain int keeps the metadata free of numpy scalar types.
                metadata={"index": int(idx)},
            )
            for idx in indices
        ]

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Get documents relevant to a query.

        Args:
            query: string to find relevant documents for
            run_manager: The callbacks handler to use

        Returns:
            List of relevant documents
        """
        vector_query = np.array(self.embeddings.embed_query(query))
        return self._get_relevant_documents_from_query_vector(vector_query)

    async def _aget_relevant_documents(
        self,
        query: str,
        *,
        run_manager: AsyncCallbackManagerForRetrieverRun,
    ) -> List[Document]:
        """Asynchronously get documents relevant to a query.

        Args:
            query: string to find relevant documents for
            run_manager: The callbacks handler to use

        Returns:
            List of relevant documents
        """
        query_emb = await self.embeddings.aembed_query(query)
        return self._get_relevant_documents_from_query_vector(np.array(query_emb))
```
The retriever can be instantiated through the class method `from_texts`. It embeds the texts and stores them in a numpy array. To look up documents, it embeds the query and finds the most similar documents using a simple dot product distance.
Once the retriever is implemented, you can use it like any other retriever in LangChain.
```python
retriever = NumpyRetriever.from_texts(texts= ["hello world", "goodbye world"])
```
You can then use the retriever to get relevant documents.
```python
retriever.get_relevant_documents("Hi there!")
# [Document(page_content='hello world', metadata={'index': 0})]
```
```python
retriever.get_relevant_documents("Bye!")
# [Document(page_content='goodbye world', metadata={'index': 1})]
```

View file

@ -0,0 +1,124 @@
```python
import faiss
from datetime import datetime, timedelta
from langchain.docstore import InMemoryDocstore
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers import TimeWeightedVectorStoreRetriever
from langchain.schema import Document
from langchain.vectorstores import FAISS
```
## Low Decay Rate
A low `decay rate` (in this example, to be extreme, we will set it close to 0) means memories will be "remembered" for longer. A `decay rate` of 0 means memories will never be forgotten, making this retriever equivalent to the vector lookup.
```python
# Define your embedding model
embeddings_model = OpenAIEmbeddings()
# Initialize the vectorstore as empty
# NOTE(review): 1536 is presumably the vector size produced by the embedding model above; confirm if you swap models
embedding_size = 1536
index = faiss.IndexFlatL2(embedding_size)
vectorstore = FAISS(embeddings_model.embed_query, index, InMemoryDocstore({}), {})
# A decay rate this close to 0 means memories are "remembered" for longer
retriever = TimeWeightedVectorStoreRetriever(vectorstore=vectorstore, decay_rate=.0000000000000000000000001, k=1)
```
```python
yesterday = datetime.now() - timedelta(days=1)
retriever.add_documents([Document(page_content="hello world", metadata={"last_accessed_at": yesterday})])
retriever.add_documents([Document(page_content="hello foo")])
```
<CodeOutputBlock lang="python">
```
['d7f85756-2371-4bdf-9140-052780a0f9b3']
```
</CodeOutputBlock>
```python
# "Hello World" is returned first because it is most salient, and the decay rate is close to 0, meaning it's still recent enough
retriever.get_relevant_documents("hello world")
```
<CodeOutputBlock lang="python">
```
[Document(page_content='hello world', metadata={'last_accessed_at': datetime.datetime(2023, 5, 13, 21, 0, 27, 678341), 'created_at': datetime.datetime(2023, 5, 13, 21, 0, 27, 279596), 'buffer_idx': 0})]
```
</CodeOutputBlock>
## High Decay Rate
With a high `decay rate` (e.g., several 9's), the `recency score` quickly goes to 0! If you set this all the way to 1, `recency` is 0 for all objects, once again making this equivalent to a vector lookup.
```python
# Define your embedding model
embeddings_model = OpenAIEmbeddings()
# Initialize the vectorstore as empty
# NOTE(review): 1536 is presumably the vector size produced by the embedding model above; confirm if you swap models
embedding_size = 1536
index = faiss.IndexFlatL2(embedding_size)
vectorstore = FAISS(embeddings_model.embed_query, index, InMemoryDocstore({}), {})
# A decay rate this close to 1 makes the recency score go to 0 quickly
retriever = TimeWeightedVectorStoreRetriever(vectorstore=vectorstore, decay_rate=.999, k=1)
```
```python
yesterday = datetime.now() - timedelta(days=1)
retriever.add_documents([Document(page_content="hello world", metadata={"last_accessed_at": yesterday})])
retriever.add_documents([Document(page_content="hello foo")])
```
<CodeOutputBlock lang="python">
```
['40011466-5bbe-4101-bfd1-e22e7f505de2']
```
</CodeOutputBlock>
```python
# "Hello Foo" is returned first because "hello world" is mostly forgotten
retriever.get_relevant_documents("hello world")
```
<CodeOutputBlock lang="python">
```
[Document(page_content='hello foo', metadata={'last_accessed_at': datetime.datetime(2023, 4, 16, 22, 9, 2, 494798), 'created_at': datetime.datetime(2023, 4, 16, 22, 9, 2, 178722), 'buffer_idx': 1})]
```
</CodeOutputBlock>
## Virtual Time
Using some utils in LangChain, you can mock out the time component.
```python
from langchain.utils import mock_now
import datetime
```
```python
# Notice that the returned document's last access time is the mocked date time
with mock_now(datetime.datetime(2011, 2, 3, 10, 11)):
    print(retriever.get_relevant_documents("hello world"))
```
<CodeOutputBlock lang="python">
```
[Document(page_content='hello world', metadata={'last_accessed_at': MockDateTime(2011, 2, 3, 10, 11), 'created_at': datetime.datetime(2023, 5, 13, 21, 0, 27, 279596), 'buffer_idx': 0})]
```
</CodeOutputBlock>

View file

@ -0,0 +1,88 @@
```python
from langchain.document_loaders import TextLoader
loader = TextLoader('../../../state_of_the_union.txt')
```
```python
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)
embeddings = OpenAIEmbeddings()
db = FAISS.from_documents(texts, embeddings)
```
<CodeOutputBlock lang="python">
```
Exiting: Cleaning up .chroma directory
```
</CodeOutputBlock>
```python
retriever = db.as_retriever()
```
```python
docs = retriever.get_relevant_documents("what did he say about ketanji brown jackson")
```
## Maximum Marginal Relevance Retrieval
By default, the vectorstore retriever uses similarity search. If the underlying vectorstore supports maximum marginal relevance search, you can specify that as the search type.
```python
retriever = db.as_retriever(search_type="mmr")
```
```python
docs = retriever.get_relevant_documents("what did he say about ketanji brown jackson")
```
## Similarity Score Threshold Retrieval
You can also use a retrieval method that sets a similarity score threshold and only returns documents with a score above that threshold.
```python
retriever = db.as_retriever(search_type="similarity_score_threshold", search_kwargs={"score_threshold": .5})
```
```python
docs = retriever.get_relevant_documents("what did he say about ketanji brown jackson")
```
## Specifying top k
You can also specify search kwargs like `k` to use when doing retrieval.
```python
retriever = db.as_retriever(search_kwargs={"k": 1})
```
```python
docs = retriever.get_relevant_documents("what did he say about ketanji brown jackson")
```
```python
len(docs)
```
<CodeOutputBlock lang="python">
```
1
```
</CodeOutputBlock>