# %%
from langchain_community.document_loaders.web_base import WebBaseLoader
# Pages on www.chiba-u.ac.jp to fetch; each entry is passed to the loader as-is.
web_paths = [
    'https://www.chiba-u.ac.jp/general/JoureiV5HTMLContents/act/content/content110000048.htm',
    'https://www.chiba-u.ac.jp/general/JoureiV5HTMLContents/act/content/content110000049.htm',
    'https://www.chiba-u.ac.jp/general/JoureiV5HTMLContents/act/content/content110001356.htm',
    'https://www.chiba-u.ac.jp/general/JoureiV5HTMLContents/act/content/content110000050.htm',
    'https://www.chiba-u.ac.jp/general/JoureiV5HTMLContents/act/content/content110000089.htm',
]

# Load the listed pages and report how many documents the loader returned.
loader = WebBaseLoader(web_paths=web_paths)
documents = loader.load()
print(len(documents))
# %%
# Dump every loaded document so its content and metadata can be inspected.
# The print call must be indented to form the loop body; at column 0 the
# file fails to parse with an IndentationError.
for document in documents:
    print(document)
# %%
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
# Split the loaded documents into chunks: chunk_size=512 caps each chunk's
# length and chunk_overlap=0 means neighbouring chunks share no text.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=0,
)
splitted_documents = text_splitter.split_documents(documents)

# Report how many chunks the split produced.
print(len(splitted_documents))
# %%
# Dump every chunk, printing a separator line before each one so chunk
# boundaries are easy to spot in the output.
# Both print calls must be indented to form the loop body; at column 0 the
# file fails to parse with an IndentationError.
for document in splitted_documents:
    print('################################################################################')
    print(document)
# %%
# Register as a new user and use Qiita more conveniently
# - You get articles that match your needs
# - You can efficiently read back useful information
# - You can use dark theme