From 41a060173536d343d71dd7f6710bece7fc8d9ec8 Mon Sep 17 00:00:00 2001 From: devMls <12745675+devMls@users.noreply.github.com> Date: Tue, 10 Dec 2024 02:06:52 +0100 Subject: [PATCH] organize chunks by document in the prompt (#3925) ### What problem does this PR solve? This PR organizes the chunks in the prompt by document and indicates the name of each document, in this way: ``` Document: {doc_name} \nContains the following relevant fragments: chunk1 chunk2 chunk3 Document: {doc_name} \nContains the following relevant fragments: chunk4 chunk5 ``` This could serve as a baseline for adding metadata to the documents. In my case, it improves the LLM's context about the origin of the information. ### Type of change - [X] New Feature (non-breaking change which adds functionality) Co-authored-by: Miguel --- api/db/services/dialog_service.py | 55 ++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/api/db/services/dialog_service.py b/api/db/services/dialog_service.py index 6fd490187..36daaddda 100644 --- a/api/db/services/dialog_service.py +++ b/api/db/services/dialog_service.py @@ -195,7 +195,32 @@ def chat(dialog, messages, stream=True, **kwargs): dialog.vector_similarity_weight, doc_ids=attachments, top=dialog.top_k, aggs=False, rerank_mdl=rerank_mdl) - knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]] + + # Group chunks by document ID + doc_chunks = {} + for ck in kbinfos["chunks"]: + doc_id = ck["doc_id"] + if doc_id not in doc_chunks: + doc_chunks[doc_id] = [] + doc_chunks[doc_id].append(ck["content_with_weight"]) + + # Create knowledges list with grouped chunks + knowledges = [] + for doc_id, chunks in doc_chunks.items(): + # Find the corresponding document name + doc_name = next((d["doc_name"] for d in kbinfos.get("doc_aggs", []) if d["doc_id"] == doc_id), doc_id) + + # Create a header for the document + doc_knowledge = f"Document: {doc_name} \nContains the following relevant fragments:\n" + + # Add numbered 
fragments + for i, chunk in enumerate(chunks, 1): + doc_knowledge += f"{i}. {chunk}\n" + + knowledges.append(doc_knowledge) + + + logging.debug( "{}->{}".format(" ".join(questions), "\n->".join(knowledges))) retrieval_tm = timer() @@ -592,12 +617,40 @@ def ask(question, kb_ids, tenant_id): knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]] used_token_count = 0 + chunks_num = 0 for i, c in enumerate(knowledges): used_token_count += num_tokens_from_string(c) if max_tokens * 0.97 < used_token_count: knowledges = knowledges[:i] + chunks_num = chunks_num + 1 break + # Group chunks by document ID + doc_chunks = {} + counter_chunks = 0 + for ck in kbinfos["chunks"]: + if counter_chunks < chunks_num: + counter_chunks = counter_chunks + 1 + doc_id = ck["doc_id"] + if doc_id not in doc_chunks: + doc_chunks[doc_id] = [] + doc_chunks[doc_id].append(ck["content_with_weight"]) + + # Create knowledges list with grouped chunks + knowledges = [] + for doc_id, chunks in doc_chunks.items(): + # Find the corresponding document name + doc_name = next((d["doc_name"] for d in kbinfos.get("doc_aggs", []) if d["doc_id"] == doc_id), doc_id) + + # Create a header for the document + doc_knowledge = f"Document: {doc_name} \nContains the following relevant fragments:\n" + + # Add numbered fragments + for i, chunk in enumerate(chunks, 1): + doc_knowledge += f"{i}. {chunk}\n" + + knowledges.append(doc_knowledge) + prompt = """ Role: You're a smart assistant. Your name is Miss R. Task: Summarize the information from knowledge bases and answer user's question.