from langchain.document_loaders import TextLoader
from langchain.embeddings import ModelScopeEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

# Load the raw document
raw_documents_sanguo = TextLoader('D:\\temp\\demo.txt', encoding='utf-8').load()

# Split the document into chunks
text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents_sanguo)
print("documents nums:", len(documents))
print(documents)

# Generate embeddings and build the vector store
model_id = "damo/nlp_corom_sentence-embedding_chinese-base"
embeddings = ModelScopeEmbeddings(model_id=model_id)
db = Chroma.from_documents(documents, embedding=embeddings)

# Retrieve the chunks most similar to the query
query = "老师说什么"  # "What did the teacher say?"
docs = db.similarity_search(query, k=2)

# Print the results
for doc in docs:
    print("===")
    print("metadata:", doc.metadata)
    print("page_content:", doc.page_content)
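If you also want to see how close each hit actually is, or keep the index across runs instead of rebuilding it every time, a minimal sketch is below. It reuses the `documents`, `embeddings`, and `query` objects built above; `similarity_search_with_score` and the `persist_directory` argument belong to LangChain's Chroma wrapper, and the path 'D:\\temp\\chroma_db' is just a hypothetical example location.

# A minimal sketch, assuming the same `documents`, `embeddings`, and `query` as above
db = Chroma.from_documents(
    documents,
    embedding=embeddings,
    persist_directory='D:\\temp\\chroma_db',  # hypothetical path: store the index on disk
)

# Unlike similarity_search, this also returns a distance per hit (smaller = more similar)
for doc, score in db.similarity_search_with_score(query, k=2):
    print(score, doc.page_content)

Persisting the store means later runs can open the existing index directly instead of re-embedding the whole document, which matters once the source text grows beyond a toy file.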