from langchain.document_loaders import TextLoader
from langchain.embeddings import ModelScopeEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

# Load the raw document
raw_documents_sanguo = TextLoader('D:\\temp\\demo.txt', encoding='utf-8').load()

# Split the document into chunks
text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents_sanguo)
print("documents nums:", len(documents))
print(documents)

# Generate embeddings with a ModelScope sentence-embedding model
model_id = "damo/nlp_corom_sentence-embedding_chinese-base"
embeddings = ModelScopeEmbeddings(model_id=model_id)
db = Chroma.from_documents(documents, embedding=embeddings)

# Retrieve the two chunks most similar to the query
query = "老师说什么"  # "What did the teacher say?"
docs = db.similarity_search(query, k=2)

# Print the results
for doc in docs:
    print("===")
    print("metadata:", doc.metadata)
    print("page_content:", doc.page_content)
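
# --- Optional extension (a minimal sketch, not part of the original snippet) ---
# similarity_search_with_score is a standard method on LangChain's Chroma
# vectorstore that returns (Document, distance) pairs instead of documents
# alone; a smaller distance means a closer match, which is handy for
# inspecting retrieval quality against the same query.
docs_with_scores = db.similarity_search_with_score(query, k=2)
for doc, score in docs_with_scores:
    print("===")
    print("distance:", score)
    print("page_content:", doc.page_content)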