mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 12:32:30 +08:00
Feat: add child parent chunking method in backend. (#11598)
### What problem does this PR solve? #7996 ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -13,10 +13,10 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import random
|
||||
import re
|
||||
from copy import deepcopy
|
||||
from functools import partial
|
||||
|
||||
import trio
|
||||
|
||||
from common.misc_utils import get_uuid
|
||||
from rag.utils.base64_image import id2image, image2id
|
||||
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
|
||||
@ -32,6 +32,7 @@ class SplitterParam(ProcessParamBase):
|
||||
self.chunk_token_size = 512
|
||||
self.delimiters = ["\n"]
|
||||
self.overlapped_percent = 0
|
||||
self.children_delimiters = []
|
||||
|
||||
def check(self):
|
||||
self.check_empty(self.delimiters, "Delimiters.")
|
||||
@ -58,6 +59,14 @@ class Splitter(ProcessBase):
|
||||
deli += f"`{d}`"
|
||||
else:
|
||||
deli += d
|
||||
child_deli = ""
|
||||
for d in self._param.children_delimiters:
|
||||
if len(d) > 1:
|
||||
child_deli += f"`{d}`"
|
||||
else:
|
||||
child_deli += d
|
||||
child_deli = [m.group(1) for m in re.finditer(r"`([^`]+)`", child_deli)]
|
||||
custom_pattern = "|".join(re.escape(t) for t in sorted(set(child_deli), key=len, reverse=True))
|
||||
|
||||
self.set_output("output_format", "chunks")
|
||||
self.callback(random.randint(1, 5) / 100.0, "Start to split into chunks.")
|
||||
@ -78,7 +87,23 @@ class Splitter(ProcessBase):
|
||||
deli,
|
||||
self._param.overlapped_percent,
|
||||
)
|
||||
self.set_output("chunks", [{"text": c.strip()} for c in cks if c.strip()])
|
||||
if custom_pattern:
|
||||
docs = []
|
||||
for c in cks:
|
||||
if not c.strip():
|
||||
continue
|
||||
split_sec = re.split(r"(%s)" % custom_pattern, c, flags=re.DOTALL)
|
||||
if split_sec:
|
||||
for txt in split_sec:
|
||||
docs.append({
|
||||
"text": txt,
|
||||
"mom": c
|
||||
})
|
||||
else:
|
||||
docs.append({"text": c})
|
||||
self.set_output("chunks", docs)
|
||||
else:
|
||||
self.set_output("chunks", [{"text": c.strip()} for c in cks if c.strip()])
|
||||
|
||||
self.callback(1, "Done.")
|
||||
return
|
||||
@ -100,12 +125,27 @@ class Splitter(ProcessBase):
|
||||
{
|
||||
"text": RAGFlowPdfParser.remove_tag(c),
|
||||
"image": img,
|
||||
"positions": [[pos[0][-1]+1, *pos[1:]] for pos in RAGFlowPdfParser.extract_positions(c)],
|
||||
"positions": [[pos[0][-1]+1, *pos[1:]] for pos in RAGFlowPdfParser.extract_positions(c)]
|
||||
}
|
||||
for c, img in zip(chunks, images) if c.strip()
|
||||
]
|
||||
async with trio.open_nursery() as nursery:
|
||||
for d in cks:
|
||||
nursery.start_soon(image2id, d, partial(settings.STORAGE_IMPL.put, tenant_id=self._canvas._tenant_id), get_uuid())
|
||||
self.set_output("chunks", cks)
|
||||
|
||||
if custom_pattern:
|
||||
docs = []
|
||||
for c in cks:
|
||||
split_sec = re.split(r"(%s)" % custom_pattern, c["text"], flags=re.DOTALL)
|
||||
if split_sec:
|
||||
c["mom"] = c["text"]
|
||||
for txt in split_sec:
|
||||
cc = deepcopy(c)
|
||||
cc["text"] = txt
|
||||
docs.append(cc)
|
||||
else:
|
||||
docs.append(c)
|
||||
self.set_output("chunks", docs)
|
||||
else:
|
||||
self.set_output("chunks", cks)
|
||||
self.callback(1, "Done.")
|
||||
|
||||
Reference in New Issue
Block a user