From 5256980ffbf3b790c7f4583a76c7d18ce23b7580 Mon Sep 17 00:00:00 2001 From: liuzhenghua <1090179900@qq.com> Date: Wed, 25 Jun 2025 10:25:45 +0800 Subject: [PATCH] Fix: Solve the OOM issue when passing large PDF files while using the QA chunking method. (#8464) ### What problem does this PR solve? Using the QA chunking method with a large PDF (e.g., 300+ pages) may lead to an out-of-memory (OOM) error in the ragflow-worker module. This change adds `from_page` and `to_page` parameters to `chunk` and forwards them to the PDF parser instead of the hard-coded `from_page=0, to_page=10000`. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- rag/app/qa.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rag/app/qa.py b/rag/app/qa.py index 7ce0afabc..803baa102 100644 --- a/rag/app/qa.py +++ b/rag/app/qa.py @@ -310,7 +310,7 @@ def mdQuestionLevel(s): return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s) -def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs): +def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): """ Excel and csv(txt) format files are supported. If the file is in excel format, there should be 2 column question and answer without header. @@ -410,7 +410,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs): callback(0.1, "Start to parse.") pdf_parser = Pdf() qai_list, tbls = pdf_parser(filename if not binary else binary, - from_page=0, to_page=10000, callback=callback) + from_page=from_page, to_page=to_page, callback=callback) for q, a, image, poss in qai_list: res.append(beAdocPdf(deepcopy(doc), q, a, eng, image, poss)) return res @@ -468,4 +468,4 @@ if __name__ == "__main__": def dummy(prog=None, msg=""): pass - chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy) \ No newline at end of file + chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)