# %%
from langchain_community.document_loaders.html import UnstructuredHTMLLoader

# Read the locally saved HTML page and turn it into LangChain documents.
loader = UnstructuredHTMLLoader(
    file_path='/home/onoyu1012/workspace/elasticsearch/疾風伝説 特攻の拓 - Wikipedia.html',
)
data = loader.load()
# Dump the loaded documents so the extracted text can be inspected.
print(data)
# %%
from langchain_text_splitters.character import RecursiveCharacterTextSplitter

# Split the loaded documents into chunks (chunk_size=512, no overlap between
# neighbouring chunks) so each piece can be embedded and indexed separately.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=0)
documents = text_splitter.split_documents(documents=data)

# Report how many chunks were produced, then show each one for inspection.
print(len(documents))
for document in documents:
    # The loop body must be indented; without it the file does not parse.
    print(document)
# %%
import os

# Publish the Hugging Face Hub API token through the process environment.
# Replace the placeholder with a real token before running this cell.
os.environ.update(
    {'HUGGINGFACEHUB_API_TOKEN': '<INPUT YOUR HUGGINGFACEHUB_API_TOKEN>'}
)
# %%
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings

# Embedding model used both for indexing the chunks and for embedding queries.
embedding = HuggingFaceEmbeddings(
    model_name='intfloat/multilingual-e5-large',
)
# %%
# %%
from urllib.parse import quote

# Elasticsearch / watsonx.data credentials; replace the placeholders.
username = '<INPUT ES/WxD_USERNAME>'
password = '<INPUT ES/WxD_PASSWORD>'

# URL layout: https://<user>:<password>@<host>:<port>
_host_template = 'https://%s:%s@<INPUT YOUR ES/WxD_HOST>:<INPUT YOUR ES/WxD_PORT>'

# Percent-encode the userinfo part: characters such as '@', ':' or '/' inside
# a username or password would otherwise be parsed as URL delimiters and
# produce a wrong host or wrong credentials.
_quoted_username = quote(username, safe='')
host = _host_template % (_quoted_username, quote(password, safe=''))

# Print the endpoint for a sanity check, but mask the password so the secret
# does not end up in notebook output or logs.
print(_host_template % (_quoted_username, '***'))
# %%
from elasticsearch import Elasticsearch

# Open a client against the URL built above, verifying TLS with the given
# CA certificate file (replace the placeholder path).
es_connection = Elasticsearch(
    hosts=host,
    ca_certs='<INPUT YOUR ES/ExD_CA_CART_PATH>',
)
# %%
from langchain_elasticsearch.vectorstores import ElasticsearchStore

# Embed every chunk and store it in the 'test' index through the existing
# Elasticsearch connection.
vectorstore = ElasticsearchStore.from_documents(
    documents=documents,
    embedding=embedding,
    index_name='test',
    es_connection=es_connection,
)
# %%
# Run a similarity query; as the last expression of the cell, its result is
# displayed when run interactively.
vectorstore.similarity_search(query='SR400とは?')
# %%
Register as a new user and use Qiita more conveniently
- You get articles that match your needs
- You can efficiently read back useful information
- You can use the dark theme