Feat: add parent-child chunking method in backend. (#11598)

### What problem does this PR solve?

#7996

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Kevin Hu
2025-11-28 19:25:32 +08:00
committed by GitHub
parent d2915f6984
commit 14616cf845
10 changed files with 216 additions and 130 deletions

View File

@@ -15,13 +15,10 @@
#
import json
import logging
import re
import sys
from functools import partial
import trio
from quart import request, Response, make_response
from agent.component import LLM
from api.db import CanvasCategory, FileType
from api.db import CanvasCategory
from api.db.services.canvas_service import CanvasTemplateService, UserCanvasService, API4ConversationService
from api.db.services.document_service import DocumentService
from api.db.services.file_service import FileService
@@ -38,7 +35,6 @@ from peewee import MySQLDatabase, PostgresqlDatabase
from api.db.db_models import APIToken, Task
import time
from api.utils.file_utils import filename_type, read_potential_broken_pdf
from rag.flow.pipeline import Pipeline
from rag.nlp import search
from rag.utils.redis_conn import REDIS_CONN
@@ -250,71 +246,10 @@ async def upload(canvas_id):
return get_data_error_result(message="canvas not found.")
user_id = cvs["user_id"]
def structured(filename, filetype, blob, content_type):
nonlocal user_id
if filetype == FileType.PDF.value:
blob = read_potential_broken_pdf(blob)
location = get_uuid()
FileService.put_blob(user_id, location, blob)
return {
"id": location,
"name": filename,
"size": sys.getsizeof(blob),
"extension": filename.split(".")[-1].lower(),
"mime_type": content_type,
"created_by": user_id,
"created_at": time.time(),
"preview_url": None
}
if request.args.get("url"):
from crawl4ai import (
AsyncWebCrawler,
BrowserConfig,
CrawlerRunConfig,
DefaultMarkdownGenerator,
PruningContentFilter,
CrawlResult
)
try:
url = request.args.get("url")
filename = re.sub(r"\?.*", "", url.split("/")[-1])
async def adownload():
browser_config = BrowserConfig(
headless=True,
verbose=False,
)
async with AsyncWebCrawler(config=browser_config) as crawler:
crawler_config = CrawlerRunConfig(
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter()
),
pdf=True,
screenshot=False
)
result: CrawlResult = await crawler.arun(
url=url,
config=crawler_config
)
return result
page = trio.run(adownload())
if page.pdf:
if filename.split(".")[-1].lower() != "pdf":
filename += ".pdf"
return get_json_result(data=structured(filename, "pdf", page.pdf, page.response_headers["content-type"]))
return get_json_result(data=structured(filename, "html", str(page.markdown).encode("utf-8"), page.response_headers["content-type"], user_id))
except Exception as e:
return server_error_response(e)
files = await request.files
file = files['file']
file = files['file'] if files and files.get("file") else None
try:
DocumentService.check_doc_health(user_id, file.filename)
return get_json_result(data=structured(file.filename, filename_type(file.filename), file.read(), file.content_type))
return get_json_result(data=FileService.upload_info(user_id, file, request.args.get("url")))
except Exception as e:
return server_error_response(e)

View File

@@ -607,7 +607,7 @@ async def get_image(image_id):
@login_required
@validate_request("conversation_id")
async def upload_and_parse():
files = await request.file
files = await request.files
if "file" not in files:
return get_json_result(data=False, message="No file part!", code=RetCode.ARGUMENT_ERROR)
@@ -705,3 +705,12 @@ async def set_meta():
return get_json_result(data=True)
except Exception as e:
return server_error_response(e)
@manager.route("/upload_info", methods=["POST"]) # noqa: F821
async def upload_info():
files = await request.files
file = files['file'] if files and files.get("file") else None
try:
return get_json_result(data=FileService.upload_info(current_user.id, file, request.args.get("url")))
except Exception as e:
return server_error_response(e)
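
Below is a minimal client-side sketch of the new `/upload_info` endpoint. It assumes the `manager` blueprint shown here is the document API mounted under a prefix such as `/v1/document` and that the usual session authentication applies; the base URL, auth header, and file names are illustrative assumptions, not part of the PR.

```python
import requests

# Assumed prefix and auth; check how the `manager` blueprint is mounted in your deployment.
BASE = "http://localhost:9380/v1/document"
AUTH = {"Authorization": "<your token or session header>"}

# 1) Upload a local file: the backend stores the blob and returns a metadata
#    dict (id, name, size, extension, mime_type, created_by, created_at, ...).
with open("report.pdf", "rb") as f:
    resp = requests.post(
        f"{BASE}/upload_info",
        files={"file": ("report.pdf", f, "application/pdf")},
        headers=AUTH,
    )
print(resp.json())

# 2) Fetch a web page instead: pass ?url=... and omit the file part; the
#    backend crawls the page with crawl4ai and stores it as PDF or HTML/Markdown.
resp = requests.post(
    f"{BASE}/upload_info",
    params={"url": "https://example.com/article"},
    headers=AUTH,
)
print(resp.json())
```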

View File

@@ -25,6 +25,7 @@ import trio
from langfuse import Langfuse
from peewee import fn
from agentic_reasoning import DeepResearcher
from api.db.services.file_service import FileService
from common.constants import LLMType, ParserType, StatusEnum
from api.db.db_models import DB, Dialog
from api.db.services.common_service import CommonService
@@ -380,8 +381,11 @@ def chat(dialog, messages, stream=True, **kwargs):
retriever = settings.retriever
questions = [m["content"] for m in messages if m["role"] == "user"][-3:]
attachments = kwargs["doc_ids"].split(",") if "doc_ids" in kwargs else []
attachments_ = ""
if "doc_ids" in messages[-1]:
attachments = messages[-1]["doc_ids"]
if "files" in messages[-1]:
attachments_ = "\n\n".join(FileService.get_files(messages[-1]["files"]))
prompt_config = dialog.prompt_config
field_map = KnowledgebaseService.get_field_map(dialog.kb_ids)
@@ -451,7 +455,7 @@
),
)
for think in reasoner.thinking(kbinfos, " ".join(questions)):
for think in reasoner.thinking(kbinfos, attachments_ + " ".join(questions)):
if isinstance(think, str):
thought = think
knowledges = [t for t in think.split("\n") if t]
@@ -503,7 +507,7 @@
kwargs["knowledge"] = "\n------\n" + "\n\n------\n\n".join(knowledges)
gen_conf = dialog.llm_setting
msg = [{"role": "system", "content": prompt_config["system"].format(**kwargs)}]
msg = [{"role": "system", "content": prompt_config["system"].format(**kwargs)+attachments_}]
prompt4citation = ""
if knowledges and (prompt_config.get("quote", True) and kwargs.get("quote", True)):
prompt4citation = citation_prompt()
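
For orientation, here is a rough sketch of how the new attachment path in `chat()` gets exercised: the last user message may carry a `files` list holding the metadata dicts produced by `FileService.upload_info`, and their parsed content is folded into both the reasoner input and the system prompt via `attachments_`. The dialog object, message shape, and helper names below are illustrative assumptions, not part of the PR.

```python
# Hypothetical caller-side sketch (not part of the PR).
uploaded = FileService.upload_info(user_id, file)  # dict with id/name/mime_type/created_by/...

messages = [
    {
        "role": "user",
        "content": "Summarize the attached report.",
        # Consumed above via FileService.get_files(messages[-1]["files"]).
        "files": [uploaded],
    },
]

for delta in chat(dialog, messages, stream=True):
    # attachments_ has already been appended to the system prompt and
    # prepended to the question passed to the reasoner.
    handle(delta)  # hypothetical consumer of the streamed answer
```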

View File

@@ -13,10 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import asyncio
import base64
import logging
import re
import sys
import time
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Union
from peewee import fn
@@ -520,7 +525,7 @@ class FileService(CommonService):
if img_base64 and file_type == FileType.VISUAL.value:
return GptV4.image2base64(blob)
cks = FACTORY.get(FileService.get_parser(filename_type(filename), filename, ""), naive).chunk(filename, blob, **kwargs)
return "\n".join([ck["content_with_weight"] for ck in cks])
return f"\n -----------------\nFile: {filename}\nContent as following: \n" + "\n".join([ck["content_with_weight"] for ck in cks])
@staticmethod
def get_parser(doc_type, filename, default):
@@ -588,3 +593,80 @@
errors += str(e)
return errors
@staticmethod
def upload_info(user_id, file, url: str|None=None):
def structured(filename, filetype, blob, content_type):
nonlocal user_id
if filetype == FileType.PDF.value:
blob = read_potential_broken_pdf(blob)
location = get_uuid()
FileService.put_blob(user_id, location, blob)
return {
"id": location,
"name": filename,
"size": sys.getsizeof(blob),
"extension": filename.split(".")[-1].lower(),
"mime_type": content_type,
"created_by": user_id,
"created_at": time.time(),
"preview_url": None
}
if url:
from crawl4ai import (
AsyncWebCrawler,
BrowserConfig,
CrawlerRunConfig,
DefaultMarkdownGenerator,
PruningContentFilter,
CrawlResult
)
filename = re.sub(r"\?.*", "", url.split("/")[-1])
async def adownload():
browser_config = BrowserConfig(
headless=True,
verbose=False,
)
async with AsyncWebCrawler(config=browser_config) as crawler:
crawler_config = CrawlerRunConfig(
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter()
),
pdf=True,
screenshot=False
)
result: CrawlResult = await crawler.arun(
url=url,
config=crawler_config
)
return result
page = asyncio.run(adownload())
if page.pdf:
if filename.split(".")[-1].lower() != "pdf":
filename += ".pdf"
return structured(filename, "pdf", page.pdf, page.response_headers["content-type"])
return structured(filename, "html", str(page.markdown).encode("utf-8"), page.response_headers["content-type"], user_id)
DocumentService.check_doc_health(user_id, file.filename)
return structured(file.filename, filename_type(file.filename), file.read(), file.content_type)
@staticmethod
def get_files(files: Union[None, list[dict]]) -> list[str]:
if not files:
return []
def image_to_base64(file):
return "data:{};base64,{}".format(file["mime_type"],
base64.b64encode(FileService.get_blob(file["created_by"], file["id"])).decode("utf-8"))
exe = ThreadPoolExecutor(max_workers=5)
threads = []
for file in files:
if file["mime_type"].find("image") >=0:
threads.append(exe.submit(image_to_base64, file))
continue
threads.append(exe.submit(FileService.parse, file["name"], FileService.get_blob(file["created_by"], file["id"]), True, file["created_by"]))
return [th.result() for th in threads]
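
Finally, a small usage sketch of `FileService.get_files`: image files come back as base64 data URLs, other files as parsed text prefixed with their file name, both produced in a small thread pool. The file dicts are the ones returned by `upload_info`; the concrete values below are assumptions for illustration.

```python
# Hypothetical input: two previously uploaded files (shapes as returned by upload_info).
files = [
    {"id": "a1b2", "name": "diagram.png", "mime_type": "image/png",
     "created_by": "user-123"},
    {"id": "c3d4", "name": "notes.docx",
     "mime_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
     "created_by": "user-123"},
]

rendered = FileService.get_files(files)
# rendered[0] -> "data:image/png;base64,..."                  (image inlined as a data URL)
# rendered[1] -> "\n -----------------\nFile: notes.docx\n..." (chunked text content)

attachments_ = "\n\n".join(rendered)  # this is what chat() appends to the system prompt
```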