From f04c9e29370e0ff5d8170bff193b2383edbec96a Mon Sep 17 00:00:00 2001 From: Billy Bao Date: Thu, 9 Oct 2025 19:03:12 +0800 Subject: [PATCH] Fix: correctly update parser method & correct vllm pdf parser (#10441) ### What problem does this PR solve? Fix: correctly update parser method ### Type of change - [X] Bug Fix (non-breaking change which fixes an issue) --- api/apps/document_app.py | 4 ++-- deepdoc/parser/pdf_parser.py | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/api/apps/document_app.py b/api/apps/document_app.py index 0b3bfd6ba..b80d59b09 100644 --- a/api/apps/document_app.py +++ b/api/apps/document_app.py @@ -557,8 +557,8 @@ def get(doc_id): @login_required @validate_request("doc_id") def change_parser(): - req = request.json + req = request.json if not DocumentService.accessible(req["doc_id"], current_user.id): return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR) @@ -582,7 +582,7 @@ def change_parser(): settings.docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), doc.kb_id) try: - if "pipeline_id" in req: + if "pipeline_id" in req and req["pipeline_id"] != "": if doc.pipeline_id == req["pipeline_id"]: return get_json_result(data=True) DocumentService.update_by_id(doc.id, {"pipeline_id": req["pipeline_id"]}) diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index ea3a87b14..c73b610ad 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -1274,12 +1274,16 @@ class VisionParser(RAGFlowPdfParser): prompt=vision_llm_describe_prompt(page=pdf_page_num + 1), callback=callback, ) + if kwargs.get("callback"): kwargs["callback"](idx * 1.0 / len(self.page_images), f"Processed: {idx + 1}/{len(self.page_images)}") if text: width, height = self.page_images[idx].size - all_docs.append((text, f"{pdf_page_num + 1} 0 {width / zoomin} 0 {height / zoomin}")) + all_docs.append(( + text, + f"@@{pdf_page_num + 1}\t{0.0:.1f}\t{width / zoomin:.1f}\t{0.0:.1f}\t{height / zoomin:.1f}##" + )) return all_docs, []