From a087d13ccbad0e576f2b9ad3ef92f870dbd528a9 Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Tue, 18 Mar 2025 16:55:11 +0800 Subject: [PATCH] Feat: support position retaining for text files. (#6231) ### What problem does this PR solve? #5832 ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- rag/nlp/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index bf6b25c04..171cb0911 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -258,7 +258,7 @@ def tokenize(d, t, eng): def tokenize_chunks(chunks, doc, eng, pdf_parser=None): res = [] # wrap up as es documents - for ck in chunks: + for ii, ck in enumerate(chunks): if len(ck.strip()) == 0: continue logging.debug("-- {}".format(ck)) @@ -270,6 +270,8 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None): ck = pdf_parser.remove_tag(ck) except NotImplementedError: pass + else: + add_positions(d, [[ii]*5]) tokenize(d, ck, eng) res.append(d) return res