何が起きた?
ネットでchatgptのapiの使い方を調べていたところ、下記のような書き方で一時的にpdfの資料データを持たせて回答させることができるという情報があって早速やってみました。(元情報は一番最後に載せています)
test.ipynb
# Load a local PDF, split it into chunks, embed the chunks into an in-memory
# Chroma vector store, and build a RetrievalQA chain over it (LangChain PDF-QA quick-start).
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import VectorDBQA, RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader, PyPDFLoader
import openai
import os
# NOTE(review): hard-coding an API key in source is unsafe — prefer setting the
# environment variable outside the notebook. "xxx" is a placeholder here.
os.environ["OPENAI_API_KEY"] ="xxx"
# Load the PDF; each page becomes one Document ("xxx.pdf" is a placeholder path).
loader = PyPDFLoader('xxx.pdf')
documents = loader.load()
# Split into ~1000-character chunks with no overlap between adjacent chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)
# Embed every chunk via the OpenAI embeddings API (network call) and index in Chroma.
# This is the line where the quoted traceback below is raised.
embeddings = OpenAIEmbeddings()
vectordb = Chroma.from_documents(texts, embeddings)
# "stuff" chain type: all retrieved chunks are stuffed into a single prompt for the chat model.
qa = RetrievalQA.from_chain_type(llm=ChatOpenAI(model_name="gpt-3.5-turbo"), chain_type="stuff", retriever=vectordb.as_retriever())
# == (continued — see the referenced video for the rest) ==
その結果、最初に動画の通り環境構築した時はうまくいったのに、改めて別PCで環境構築し直したところ下記のようなエラーを吐き動かなくなってしまいました。
エラー文
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[4], line 4
2 texts = text_splitter.split_documents(documents)
3 embeddings = OpenAIEmbeddings()
----> 4 vectordb = Chroma.from_documents(texts, embeddings)
6 qa = RetrievalQA.from_chain_type(llm=ChatOpenAI(model_name="gpt-3.5-turbo"), chain_type="stuff", retriever=vectordb.as_retriever())
File ~/anaconda3/lib/python3.10/site-packages/langchain/vectorstores/chroma.py:338, in Chroma.from_documents(cls, documents, embedding, ids, collection_name, persist_directory, client_settings, **kwargs)
336 texts = [doc.page_content for doc in documents]
337 metadatas = [doc.metadata for doc in documents]
--> 338 return cls.from_texts(
339 texts=texts,
340 embedding=embedding,
341 metadatas=metadatas,
342 ids=ids,
343 collection_name=collection_name,
344 persist_directory=persist_directory,
345 client_settings=client_settings,
346 )
File ~/anaconda3/lib/python3.10/site-packages/langchain/vectorstores/chroma.py:307, in Chroma.from_texts(cls, texts, embedding, metadatas, ids, collection_name, persist_directory, client_settings, **kwargs)
284 """Create a Chroma vectorstore from a raw documents.
285
286 If a persist_directory is specified, the collection will be persisted there.
(...)
299 Chroma: Chroma vectorstore.
300 """
301 chroma_collection = cls(
302 collection_name=collection_name,
303 embedding_function=embedding,
304 persist_directory=persist_directory,
305 client_settings=client_settings,
306 )
--> 307 chroma_collection.add_texts(texts=texts, metadatas=metadatas, ids=ids)
308 return chroma_collection
File ~/anaconda3/lib/python3.10/site-packages/langchain/vectorstores/chroma.py:115, in Chroma.add_texts(self, texts, metadatas, ids, **kwargs)
113 embeddings = None
114 if self._embedding_function is not None:
--> 115 embeddings = self._embedding_function.embed_documents(list(texts))
116 self._collection.add(
117 metadatas=metadatas, embeddings=embeddings, documents=texts, ids=ids
118 )
119 return ids
File ~/anaconda3/lib/python3.10/site-packages/langchain/embeddings/openai.py:275, in OpenAIEmbeddings.embed_documents(self, texts, chunk_size)
273 # handle batches of large input text
274 if self.embedding_ctx_length > 0:
--> 275 return self._get_len_safe_embeddings(texts, engine=self.document_model_name)
276 else:
277 results = []
File ~/anaconda3/lib/python3.10/site-packages/langchain/embeddings/openai.py:206, in OpenAIEmbeddings._get_len_safe_embeddings(self, texts, engine, chunk_size)
204 tokens = []
205 indices = []
--> 206 encoding = tiktoken.model.encoding_for_model(self.document_model_name)
207 for i, text in enumerate(texts):
208 # replace newlines, which can negatively affect performance.
209 text = text.replace("\n", " ")
File ~/anaconda3/lib/python3.10/site-packages/tiktoken/model.py:75, in encoding_for_model(model_name)
69 if encoding_name is None:
70 raise KeyError(
71 f"Could not automatically map {model_name} to a tokeniser. "
72 "Please use `tiktok.get_encoding` to explicitly get the tokeniser you expect."
73 ) from None
---> 75 return get_encoding(encoding_name)
File ~/anaconda3/lib/python3.10/site-packages/tiktoken/registry.py:63, in get_encoding(encoding_name)
60 raise ValueError(f"Unknown encoding {encoding_name}")
62 constructor = ENCODING_CONSTRUCTORS[encoding_name]
---> 63 enc = Encoding(**constructor())
64 ENCODINGS[encoding_name] = enc
65 return enc
File ~/anaconda3/lib/python3.10/site-packages/tiktoken_ext/openai_public.py:64, in cl100k_base()
63 def cl100k_base():
---> 64 mergeable_ranks = load_tiktoken_bpe(
65 "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
66 )
67 special_tokens = {
68 ENDOFTEXT: 100257,
69 FIM_PREFIX: 100258,
(...)
72 ENDOFPROMPT: 100276,
73 }
74 return {
75 "name": "cl100k_base",
76 "pat_str": r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""",
77 "mergeable_ranks": mergeable_ranks,
78 "special_tokens": special_tokens,
79 }
File ~/anaconda3/lib/python3.10/site-packages/tiktoken/load.py:115, in load_tiktoken_bpe(tiktoken_bpe_file)
112 def load_tiktoken_bpe(tiktoken_bpe_file: str) -> dict[bytes, int]:
113 # NB: do not add caching to this function
114 contents = read_file_cached(tiktoken_bpe_file)
--> 115 return {
116 base64.b64decode(token): int(rank)
117 for token, rank in (line.split() for line in contents.splitlines() if line)
118 }
File ~/anaconda3/lib/python3.10/site-packages/tiktoken/load.py:117, in <dictcomp>(.0)
112 def load_tiktoken_bpe(tiktoken_bpe_file: str) -> dict[bytes, int]:
113 # NB: do not add caching to this function
114 contents = read_file_cached(tiktoken_bpe_file)
115 return {
116 base64.b64decode(token): int(rank)
--> 117 for token, rank in (line.split() for line in contents.splitlines() if line)
118 }
ValueError: not enough values to unpack (expected 2, got 1)
原因
使っているライブラリ(langchain・tiktoken など)をすべて最新バージョンにしてしまったことで、(おそらく内部の関数仕様が変わり)ライブラリ同士の互換性が失われていました。
解決方法
手元にある関連ライブラリを全て pip uninstall し、動作実績のあるバージョンを明示的に指定して(例: pip install langchain==x.x.x のように)再度 pip install することで動くようになりました。
参考にさせていただいた資料(動画)
chatgptのapi利用のとっかかりに非常に良い動画だったのでよければご覧になってみてください。
(この方とは面識は全くなく、回しものでもなんでもありませんが、内容がとてもよかったので共有させていただいております。)