llama-index==0.10.19
Index classes in LlamaIndex have two methods for updating documents:
- update_ref_doc(document: Document): delete and re-insert a single Document
- refresh_ref_docs(documents: Sequence[Document]): refresh the index based on the given set of Documents
Unfortunately, refresh_ref_docs does not remove nodes corresponding to deleted documents.
So we need additional procedures to actually "refresh" the index.
from llama_index.core import (
ListIndex,
SimpleDirectoryReader,
set_global_handler,
Settings,
)
from llama_index.core.callbacks import (
CallbackManager,
LlamaDebugHandler,
CBEventType,
)
# function to construct Document set
def read_docs(path_docs):
    """Load every file under ``path_docs`` as Document objects.

    The reader is configured with ``filename_as_id=True`` so that document
    ids are derived from the source file rather than generated randomly,
    and with ``recursive=True`` so that sub-directories are included.

    Args:
        path_docs: Directory to read documents from.

    Returns:
        The list of loaded documents. The id of each one is printed as a
        side effect so the notebook shows what was picked up.
    """
    reader = SimpleDirectoryReader(
        input_dir=path_docs,
        filename_as_id=True,
        recursive=True,
    )
    documents = reader.load_data()

    # Echo the ids so it is visible which files were loaded.
    for doc in documents:
        print(f'loaded: {doc.get_doc_id()}')

    return documents
Note that we pass `filename_as_id=True` so that each Document object's id is based on its filesystem path instead of a random id.
This lets us match documents to files later, when we update the index.
# function to update index
def update_index(index, path_docs):
    """Bring ``index`` in sync with the files currently under ``path_docs``.

    Two steps are performed:

    1. ``index.refresh_ref_docs()`` inserts new documents and updates
       changed ones; its per-document status list is printed.
    2. Every ref doc still registered in the docstore whose id is not among
       the freshly loaded documents is deleted from the index and the
       docstore, because ``refresh_ref_docs()`` does not do this itself.

    Args:
        index: A LlamaIndex index exposing ``refresh_ref_docs``,
            ``delete_ref_doc`` and ``docstore``.
        path_docs: Directory holding the source documents.

    Returns:
        None. Progress is reported via ``print``.
    """
    documents = read_docs(path_docs)

    # note that refresh_ref_docs() does not remove nodes for deleted files
    updated = index.refresh_ref_docs(documents)
    print("updated with status ", updated)

    # so, remove nodes for deleted files here
    # A set gives O(1) membership tests instead of scanning a list each time.
    current_ids = {doc.get_doc_id() for doc in documents}

    # get_all_ref_doc_info() is typed as Optional, so fall back to an empty
    # mapping; snapshot the keys because entries are deleted inside the loop.
    ref_doc_info = index.docstore.get_all_ref_doc_info() or {}
    for path in list(ref_doc_info):
        if path not in current_ids:
            index.delete_ref_doc(path, delete_from_docstore=True)
            print(f'{path}: removed from index')
Let's try this.
! mkdir -p docs
! echo "This is a document" > docs/test.txt
# create initial index from the files currently under ./docs
documents = read_docs('./docs')
index=ListIndex.from_documents(documents)
# show contents of index docstore: one RefDocInfo entry per source document,
# keyed by the document id
index.docstore.get_all_ref_doc_info()
{'/home/nishio/test/docs/test.txt': RefDocInfo(node_ids=['ba23faf6-90bb-4482-9c42-63a0146134ba'], metadata={'file_path': '/home/nishio/test/docs/test.txt', 'file_name': 'test.txt', 'file_type': 'text/plain', 'file_size': 19, 'creation_date': '2024-03-20', 'last_modified_date': '2024-03-20'})}
# add document
! echo "Test is another document" > docs/test2.txt
update_index(index, './docs')
index.docstore.get_all_ref_doc_info()
loaded: /home/nishio/test/docs/test.txt
loaded: /home/nishio/test/docs/test2.txt
Parsing nodes: 0%| | 0/1 [00:00<?, ?it/s]
updated with status [False, True]
doc /home/nishio/test/docs/test.txt exists? df4c3e3976435630167a36b4f87d0db8549dfab7698df7419bf80e1c23751949
doc /home/nishio/test/docs/test2.txt exists? 321e59a16cc55a60f437ac08aed39eebb3c1654e5a9636d1bf6be6a858192f57
{'/home/nishio/test/docs/test.txt': RefDocInfo(node_ids=['ba23faf6-90bb-4482-9c42-63a0146134ba'], metadata={'file_path': '/home/nishio/test/docs/test.txt', 'file_name': 'test.txt', 'file_type': 'text/plain', 'file_size': 19, 'creation_date': '2024-03-20', 'last_modified_date': '2024-03-20'}),
'/home/nishio/test/docs/test2.txt': RefDocInfo(node_ids=['b05d7a85-ddc3-4d79-ba31-08f08b21a783'], metadata={'file_path': '/home/nishio/test/docs/test2.txt', 'file_name': 'test2.txt', 'file_type': 'text/plain', 'file_size': 25, 'creation_date': '2024-03-20', 'last_modified_date': '2024-03-20'})}
# remove document
! rm docs/test2.txt
! ls docs/
test.txt
# run refresh_ref_docs directly, without the extra cleanup done by update_index(),
# to see how it handles the file that was just deleted
documents = read_docs('./docs')
index.refresh_ref_docs(documents)
index.docstore.get_all_ref_doc_info()
{'/home/nishio/test/docs/test.txt': RefDocInfo(node_ids=['ba23faf6-90bb-4482-9c42-63a0146134ba'], metadata={'file_path': '/home/nishio/test/docs/test.txt', 'file_name': 'test.txt', 'file_type': 'text/plain', 'file_size': 19, 'creation_date': '2024-03-20', 'last_modified_date': '2024-03-20'}),
'/home/nishio/test/docs/test2.txt': RefDocInfo(node_ids=['b05d7a85-ddc3-4d79-ba31-08f08b21a783'], metadata={'file_path': '/home/nishio/test/docs/test2.txt', 'file_name': 'test2.txt', 'file_type': 'text/plain', 'file_size': 25, 'creation_date': '2024-03-20', 'last_modified_date': '2024-03-20'})}
As we can see, refresh_ref_docs() does not remove the node(s) corresponding to the deleted file 'test2.txt'.
Now, let's try update_index() instead.
update_index(index, './docs')
loaded: /home/nishio/test/docs/test.txt
updated with status [False]
doc /home/nishio/test/docs/test.txt exists? df4c3e3976435630167a36b4f87d0db8549dfab7698df7419bf80e1c23751949
/home/nishio/test/docs/test2.txt: removed from index
index.docstore.get_all_ref_doc_info()
{'/home/nishio/test/docs/test.txt': RefDocInfo(node_ids=['ba23faf6-90bb-4482-9c42-63a0146134ba'], metadata={'file_path': '/home/nishio/test/docs/test.txt', 'file_name': 'test.txt', 'file_type': 'text/plain', 'file_size': 19, 'creation_date': '2024-03-20', 'last_modified_date': '2024-03-20'})}
Everything is working as expected.
Fine.