from langchain.document_loaders import TextLoader
from langchain.embeddings import ModelScopeEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
import chardet
# Load the raw document (a UTF-8 text file from a hard-coded Windows path).
raw_documents_sanguo = TextLoader('D:\\temp\\demo.txt', encoding='utf-8').load()

# Split the document into chunks of at most 100 characters with no overlap
# between consecutive chunks.
text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents_sanguo)
# len() is the idiomatic form of the original documents.__len__() call.
print("documents nums:", len(documents))
print(documents)

# Generate embeddings with a ModelScope Chinese sentence-embedding model
# and index the chunks in an in-memory Chroma vector store.
model_id = "damo/nlp_corom_sentence-embedding_chinese-base"
embeddings = ModelScopeEmbeddings(model_id=model_id)
db = Chroma.from_documents(documents, embedding=embeddings)

# Retrieve the top-2 chunks most similar to the query.
query = "老师说什么"
docs = db.similarity_search(query, k=2)

# Print each hit's metadata and text content.
for doc in docs:
    print("===")
    print("metadata:", doc.metadata)
    print("page_content:", doc.page_content)