add everyting for docs

This commit is contained in:
ishaan-jaff 2023-07-29 07:00:13 -07:00
parent de45a738ee
commit 0fe8799f94
1015 changed files with 185353 additions and 0 deletions

View file

@ -0,0 +1 @@
label: 'Post retrieval'

View file

@ -0,0 +1,177 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "fc0db1bc",
"metadata": {},
"source": [
"# Lost in the middle: The problem with long contexts\n",
"\n",
"No matter the architecture of your model, there is a substantial performance degradation when you include 10+ retrieved documents.\n",
"In brief: When models must access relevant information in the middle of long contexts, they tend to ignore the provided documents.\n",
"See: https://arxiv.org/abs/2307.03172\n",
"\n",
"To avoid this performance degradation, you can re-order documents after retrieval."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "49cbcd8e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='This is a document about the Boston Celtics', metadata={}),\n",
" Document(page_content='The Celtics are my favourite team.', metadata={}),\n",
" Document(page_content='L. Kornet is one of the best Celtics players.', metadata={}),\n",
" Document(page_content='The Boston Celtics won the game by 20 points', metadata={}),\n",
" Document(page_content='Larry Bird was an iconic NBA player.', metadata={}),\n",
" Document(page_content='Elden Ring is one of the best games in the last 15 years.', metadata={}),\n",
" Document(page_content='Basquetball is a great sport.', metadata={}),\n",
" Document(page_content='I simply love going to the movies', metadata={}),\n",
" Document(page_content='Fly me to the moon is one of my favourite songs.', metadata={}),\n",
" Document(page_content='This is just a random text.', metadata={})]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import os\n",
"import chromadb\n",
"from langchain.vectorstores import Chroma\n",
"from langchain.embeddings import HuggingFaceEmbeddings\n",
"from langchain.document_transformers import (\n",
" LongContextReorder,\n",
")\n",
"from langchain.chains import StuffDocumentsChain, LLMChain\n",
"from langchain.prompts import PromptTemplate\n",
"from langchain.llms import OpenAI\n",
"\n",
"# Get embeddings.\n",
"embeddings = HuggingFaceEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n",
"\n",
"texts = [\n",
" \"Basquetball is a great sport.\",\n",
" \"Fly me to the moon is one of my favourite songs.\",\n",
" \"The Celtics are my favourite team.\",\n",
" \"This is a document about the Boston Celtics\",\n",
" \"I simply love going to the movies\",\n",
" \"The Boston Celtics won the game by 20 points\",\n",
" \"This is just a random text.\",\n",
" \"Elden Ring is one of the best games in the last 15 years.\",\n",
" \"L. Kornet is one of the best Celtics players.\",\n",
" \"Larry Bird was an iconic NBA player.\",\n",
"]\n",
"\n",
"# Create a retriever\n",
"retriever = Chroma.from_texts(texts, embedding=embeddings).as_retriever(\n",
" search_kwargs={\"k\": 10}\n",
")\n",
"query = \"What can you tell me about the Celtics?\"\n",
"\n",
"# Get relevant documents ordered by relevance score\n",
"docs = retriever.get_relevant_documents(query)\n",
"docs"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "34fb9d6e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='The Celtics are my favourite team.', metadata={}),\n",
" Document(page_content='The Boston Celtics won the game by 20 points', metadata={}),\n",
" Document(page_content='Elden Ring is one of the best games in the last 15 years.', metadata={}),\n",
" Document(page_content='I simply love going to the movies', metadata={}),\n",
" Document(page_content='This is just a random text.', metadata={}),\n",
" Document(page_content='Fly me to the moon is one of my favourite songs.', metadata={}),\n",
" Document(page_content='Basquetball is a great sport.', metadata={}),\n",
" Document(page_content='Larry Bird was an iconic NBA player.', metadata={}),\n",
" Document(page_content='L. Kornet is one of the best Celtics players.', metadata={}),\n",
" Document(page_content='This is a document about the Boston Celtics', metadata={})]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Reorder the documents:\n",
"# Less relevant documents will be in the middle of the list and more\n",
"# relevant documents at the beginning / end.\n",
"reordering = LongContextReorder()\n",
"reordered_docs = reordering.transform_documents(docs)\n",
"\n",
"# Confirm that the 4 relevant documents are at the beginning and end.\n",
"reordered_docs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ceccab87",
"metadata": {},
"outputs": [],
"source": [
"# We prepare and run a custom Stuff chain with reordered docs as context.\n",
"\n",
"# Override prompts\n",
"document_prompt = PromptTemplate(\n",
" input_variables=[\"page_content\"], template=\"{page_content}\"\n",
")\n",
"document_variable_name = \"context\"\n",
"llm = OpenAI()\n",
"stuff_prompt_override = \"\"\"Given this text extracts:\n",
"-----\n",
"{context}\n",
"-----\n",
"Please answer the following question:\n",
"{query}\"\"\"\n",
"prompt = PromptTemplate(\n",
" template=stuff_prompt_override, input_variables=[\"context\", \"query\"]\n",
")\n",
"\n",
"# Instantiate the chain\n",
"llm_chain = LLMChain(llm=llm, prompt=prompt)\n",
"chain = StuffDocumentsChain(\n",
" llm_chain=llm_chain,\n",
" document_prompt=document_prompt,\n",
" document_variable_name=document_variable_name,\n",
")\n",
"chain.run(input_documents=reordered_docs, query=query)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View file

@ -0,0 +1,192 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "70e9b619",
"metadata": {},
"source": [
"# MarkdownHeaderTextSplitter\n",
"\n",
"### Motivation\n",
"\n",
"Many chat or Q+A applications involve chunking input documents prior to embedding and vector storage.\n",
"\n",
"[These notes](https://www.pinecone.io/learn/chunking-strategies/) from Pinecone provide some useful tips:\n",
"\n",
"```\n",
"When a full paragraph or document is embedded, the embedding process considers both the overall context and the relationships between the sentences and phrases within the text. This can result in a more comprehensive vector representation that captures the broader meaning and themes of the text.\n",
"```\n",
" \n",
"As mentioned, chunking often aims to keep text with common context together.\n",
"\n",
"With this in mind, we might want to specifically honor the structure of the document itself.\n",
"\n",
"For example, a markdown file is organized by headers.\n",
"\n",
"Creating chunks within specific header groups is an intuitive idea.\n",
"\n",
"To address this challenge, we can use `MarkdownHeaderTextSplitter`.\n",
"\n",
"This will split a markdown file by a specified set of headers. \n",
"\n",
"For example, if we want to split this markdown:\n",
"```\n",
"md = '# Foo\\n\\n ## Bar\\n\\nHi this is Jim \\nHi this is Joe\\n\\n ## Baz\\n\\n Hi this is Molly' \n",
"```\n",
" \n",
"We can specify the headers to split on:\n",
"```\n",
"[(\"#\", \"Header 1\"),(\"##\", \"Header 2\")]\n",
"```\n",
"\n",
"And content is grouped or split by common headers:\n",
"```\n",
"{'content': 'Hi this is Jim \\nHi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n",
"{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n",
"```\n",
"\n",
"Let's have a look at some examples below."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "ceb3c1fb",
"metadata": {},
"outputs": [],
"source": [
"from langchain.text_splitter import MarkdownHeaderTextSplitter"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "2ae3649b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='Hi this is Jim \\nHi this is Joe', metadata={'Header 1': 'Foo', 'Header 2': 'Bar'}),\n",
" Document(page_content='Hi this is Lance', metadata={'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}),\n",
" Document(page_content='Hi this is Molly', metadata={'Header 1': 'Foo', 'Header 2': 'Baz'})]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"markdown_document = \"# Foo\\n\\n ## Bar\\n\\nHi this is Jim\\n\\nHi this is Joe\\n\\n ### Boo \\n\\n Hi this is Lance \\n\\n ## Baz\\n\\n Hi this is Molly\"\n",
"\n",
"headers_to_split_on = [\n",
" (\"#\", \"Header 1\"),\n",
" (\"##\", \"Header 2\"),\n",
" (\"###\", \"Header 3\"),\n",
"]\n",
"\n",
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n",
"md_header_splits = markdown_splitter.split_text(markdown_document)\n",
"md_header_splits"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "aac1738c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"langchain.schema.Document"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(md_header_splits[0])"
]
},
{
"cell_type": "markdown",
"id": "9bd8977a",
"metadata": {},
"source": [
"Within each markdown group we can then apply any text splitter we want. "
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "480e0e3a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9]', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n",
" Document(page_content='Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files.', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n",
" Document(page_content='As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\nadditional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks. \\n#### Standardization', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n",
" Document(page_content='#### Standardization \\nFrom 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort.', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n",
" Document(page_content='Implementations of Markdown are available for over a dozen programming languages.', metadata={'Header 1': 'Intro', 'Header 2': 'Implementations'})]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"markdown_document = \"# Intro \\n\\n ## History \\n\\n Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9] \\n\\n Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files. \\n\\n ## Rise and divergence \\n\\n As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\n\\n additional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks. \\n\\n #### Standardization \\n\\n From 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort. \\n\\n ## Implementations \\n\\n Implementations of Markdown are available for over a dozen programming languages.\"\n",
"\n",
"headers_to_split_on = [\n",
" (\"#\", \"Header 1\"),\n",
" (\"##\", \"Header 2\"),\n",
"]\n",
"\n",
"# MD splits\n",
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n",
"md_header_splits = markdown_splitter.split_text(markdown_document)\n",
"\n",
"# Char-level splits\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"\n",
"chunk_size = 250\n",
"chunk_overlap = 30\n",
"text_splitter = RecursiveCharacterTextSplitter(\n",
" chunk_size=chunk_size, chunk_overlap=chunk_overlap\n",
")\n",
"\n",
"# Split\n",
"splits = text_splitter.split_documents(md_header_splits)\n",
"splits"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View file

@ -0,0 +1,532 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "a05c860c",
"metadata": {},
"source": [
"# Split by tokens \n",
"\n",
"Language models have a token limit. You should not exceed the token limit. When you split your text into chunks it is therefore a good idea to count the number of tokens. There are many tokenizers. When you count tokens in your text you should use the same tokenizer as used in the language model. "
]
},
{
"cell_type": "markdown",
"id": "7683b36a",
"metadata": {},
"source": [
"## tiktoken\n",
"\n",
">[tiktoken](https://github.com/openai/tiktoken) is a fast `BPE` tokenizer created by `OpenAI`.\n",
"\n",
"\n",
"We can use it to estimate tokens used. It will probably be more accurate for the OpenAI models.\n",
"\n",
"1. How the text is split: by character passed in\n",
"2. How the chunk size is measured: by `tiktoken` tokenizer"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6c4ef83e-f43a-4658-ad1a-3952e0a5bbe7",
"metadata": {},
"outputs": [],
"source": [
"#!pip install tiktoken"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "1ad2d0f2",
"metadata": {},
"outputs": [],
"source": [
"# This is a long document we can split up.\n",
"with open(\"../../../state_of_the_union.txt\") as f:\n",
" state_of_the_union = f.read()\n",
"from langchain.text_splitter import CharacterTextSplitter"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "825f7c0a",
"metadata": {},
"outputs": [],
"source": [
"text_splitter = CharacterTextSplitter.from_tiktoken_encoder(\n",
" chunk_size=100, chunk_overlap=0\n",
")\n",
"texts = text_splitter.split_text(state_of_the_union)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "ae35d165",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. \n",
"\n",
"Last year COVID-19 kept us apart. This year we are finally together again. \n",
"\n",
"Tonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n",
"\n",
"With a duty to one another to the American people to the Constitution.\n"
]
}
],
"source": [
"print(texts[0])"
]
},
{
"cell_type": "markdown",
"id": "de5b6a6e",
"metadata": {},
"source": [
"We can also load a tiktoken splitter directly"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4454c70e",
"metadata": {},
"outputs": [],
"source": [
"from langchain.text_splitter import TokenTextSplitter\n",
"\n",
"text_splitter = TokenTextSplitter(chunk_size=10, chunk_overlap=0)\n",
"\n",
"texts = text_splitter.split_text(state_of_the_union)\n",
"print(texts[0])"
]
},
{
"cell_type": "markdown",
"id": "55f95f06",
"metadata": {},
"source": [
"## spaCy\n",
"\n",
">[spaCy](https://spacy.io/) is an open-source software library for advanced natural language processing, written in the programming languages Python and Cython.\n",
"\n",
"An alternative to `NLTK` (covered below) is to use the [spaCy tokenizer](https://spacy.io/api/tokenizer).\n",
"\n",
"1. How the text is split: by `spaCy` tokenizer\n",
"2. How the chunk size is measured: by number of characters"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d0b9242f-690c-4819-b35a-bb68187281ed",
"metadata": {},
"outputs": [],
"source": [
"#!pip install spacy"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "f1de7767",
"metadata": {},
"outputs": [],
"source": [
"# This is a long document we can split up.\n",
"with open(\"../../../state_of_the_union.txt\") as f:\n",
" state_of_the_union = f.read()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f4ec9b90",
"metadata": {},
"outputs": [],
"source": [
"from langchain.text_splitter import SpacyTextSplitter\n",
"\n",
"text_splitter = SpacyTextSplitter(chunk_size=1000)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "cef2b29e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Madam Speaker, Madam Vice President, our First Lady and Second Gentleman.\n",
"\n",
"Members of Congress and the Cabinet.\n",
"\n",
"Justices of the Supreme Court.\n",
"\n",
"My fellow Americans. \n",
"\n",
"\n",
"\n",
"Last year COVID-19 kept us apart.\n",
"\n",
"This year we are finally together again. \n",
"\n",
"\n",
"\n",
"Tonight, we meet as Democrats Republicans and Independents.\n",
"\n",
"But most importantly as Americans. \n",
"\n",
"\n",
"\n",
"With a duty to one another to the American people to the Constitution. \n",
"\n",
"\n",
"\n",
"And with an unwavering resolve that freedom will always triumph over tyranny. \n",
"\n",
"\n",
"\n",
"Six days ago, Russias Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways.\n",
"\n",
"But he badly miscalculated. \n",
"\n",
"\n",
"\n",
"He thought he could roll into Ukraine and the world would roll over.\n",
"\n",
"Instead he met a wall of strength he never imagined. \n",
"\n",
"\n",
"\n",
"He met the Ukrainian people. \n",
"\n",
"\n",
"\n",
"From President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.\n"
]
}
],
"source": [
"texts = text_splitter.split_text(state_of_the_union)\n",
"print(texts[0])"
]
},
{
"cell_type": "markdown",
"id": "73dbcdb9",
"metadata": {},
"source": [
"## SentenceTransformers\n",
"\n",
"The `SentenceTransformersTokenTextSplitter` is a specialized text splitter for use with the sentence-transformer models. The default behaviour is to split the text into chunks that fit the token window of the sentence transformer model that you would like to use."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "9dd5419e",
"metadata": {},
"outputs": [],
"source": [
"from langchain.text_splitter import SentenceTransformersTokenTextSplitter"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "b43e5d54",
"metadata": {},
"outputs": [],
"source": [
"splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0)\n",
"text = \"Lorem \""
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "1df84cb4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2\n"
]
}
],
"source": [
"count_start_and_stop_tokens = 2\n",
"text_token_count = splitter.count_tokens(text=text) - count_start_and_stop_tokens\n",
"print(text_token_count)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "d7ad2213",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tokens in text to split: 514\n"
]
}
],
"source": [
"token_multiplier = splitter.maximum_tokens_per_chunk // text_token_count + 1\n",
"\n",
"# `text_to_split` does not fit in a single chunk\n",
"text_to_split = text * token_multiplier\n",
"\n",
"print(f\"tokens in text to split: {splitter.count_tokens(text=text_to_split)}\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "818aea04",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"lorem\n"
]
}
],
"source": [
"text_chunks = splitter.split_text(text=text_to_split)\n",
"\n",
"print(text_chunks[1])"
]
},
{
"cell_type": "markdown",
"id": "ea2973ac",
"metadata": {},
"source": [
"## NLTK\n",
"\n",
">[The Natural Language Toolkit](https://en.wikipedia.org/wiki/Natural_Language_Toolkit), or more commonly [NLTK](https://www.nltk.org/), is a suite of libraries and programs for symbolic and statistical natural language processing (NLP) for English written in the Python programming language.\n",
"\n",
"Rather than just splitting on \"\\n\\n\", we can use `NLTK` to split based on [NLTK tokenizers](https://www.nltk.org/api/nltk.tokenize.html).\n",
"\n",
"1. How the text is split: by `NLTK` tokenizer.\n",
"2. How the chunk size is measured: by number of characters"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b6af9886-7d53-4aab-84f6-303c4cce7882",
"metadata": {},
"outputs": [],
"source": [
"# pip install nltk"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "aed17ddf",
"metadata": {},
"outputs": [],
"source": [
"# This is a long document we can split up.\n",
"with open(\"../../../state_of_the_union.txt\") as f:\n",
" state_of_the_union = f.read()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "20fa9c23",
"metadata": {},
"outputs": [],
"source": [
"from langchain.text_splitter import NLTKTextSplitter\n",
"\n",
"text_splitter = NLTKTextSplitter(chunk_size=1000)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "5ea10835",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Madam Speaker, Madam Vice President, our First Lady and Second Gentleman.\n",
"\n",
"Members of Congress and the Cabinet.\n",
"\n",
"Justices of the Supreme Court.\n",
"\n",
"My fellow Americans.\n",
"\n",
"Last year COVID-19 kept us apart.\n",
"\n",
"This year we are finally together again.\n",
"\n",
"Tonight, we meet as Democrats Republicans and Independents.\n",
"\n",
"But most importantly as Americans.\n",
"\n",
"With a duty to one another to the American people to the Constitution.\n",
"\n",
"And with an unwavering resolve that freedom will always triumph over tyranny.\n",
"\n",
"Six days ago, Russias Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways.\n",
"\n",
"But he badly miscalculated.\n",
"\n",
"He thought he could roll into Ukraine and the world would roll over.\n",
"\n",
"Instead he met a wall of strength he never imagined.\n",
"\n",
"He met the Ukrainian people.\n",
"\n",
"From President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.\n",
"\n",
"Groups of citizens blocking tanks with their bodies.\n"
]
}
],
"source": [
"texts = text_splitter.split_text(state_of_the_union)\n",
"print(texts[0])"
]
},
{
"cell_type": "markdown",
"id": "13dc0983",
"metadata": {},
"source": [
"## Hugging Face tokenizer\n",
"\n",
">[Hugging Face](https://huggingface.co/docs/tokenizers/index) has many tokenizers.\n",
"\n",
"We use a Hugging Face tokenizer, [GPT2TokenizerFast](https://huggingface.co/Ransaka/gpt2-tokenizer-fast), to count the text length in tokens.\n",
"\n",
"1. How the text is split: by character passed in\n",
"2. How the chunk size is measured: by number of tokens calculated by the `Hugging Face` tokenizer\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "a8ce51d5",
"metadata": {},
"outputs": [],
"source": [
"from transformers import GPT2TokenizerFast\n",
"\n",
"tokenizer = GPT2TokenizerFast.from_pretrained(\"gpt2\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "388369ed",
"metadata": {},
"outputs": [],
"source": [
"# This is a long document we can split up.\n",
"with open(\"../../../state_of_the_union.txt\") as f:\n",
" state_of_the_union = f.read()\n",
"from langchain.text_splitter import CharacterTextSplitter"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "ca5e72c0",
"metadata": {},
"outputs": [],
"source": [
"text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(\n",
" tokenizer, chunk_size=100, chunk_overlap=0\n",
")\n",
"texts = text_splitter.split_text(state_of_the_union)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "37cdfbeb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. \n",
"\n",
"Last year COVID-19 kept us apart. This year we are finally together again. \n",
"\n",
"Tonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n",
"\n",
"With a duty to one another to the American people to the Constitution.\n"
]
}
],
"source": [
"print(texts[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a43b0fa6",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
},
"vscode": {
"interpreter": {
"hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}