Feat: add child parent chunking method in backend. (#11598)

### What problem does this PR solve?

#7996

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Commit 14616cf845 (parent d2915f6984), authored by Kevin Hu on 2025-11-28 19:25:32 +08:00, committed by GitHub.
10 changed files with 216 additions and 130 deletions.

---

@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import base64
 import json
 import logging
 import re
@@ -25,6 +24,7 @@ from typing import Any, Union, Tuple
 from agent.component import component_class
 from agent.component.base import ComponentBase
+from api.db.services.file_service import FileService
 from api.db.services.task_service import has_canceled
 from common.misc_utils import get_uuid, hash_str2int
 from common.exceptions import TaskCanceledException
@@ -372,7 +372,7 @@ class Canvas(Graph):
         for k in kwargs.keys():
             if k in ["query", "user_id", "files"] and kwargs[k]:
                 if k == "files":
-                    self.globals[f"sys.{k}"] = self.get_files(kwargs[k])
+                    self.globals[f"sys.{k}"] = FileService.get_files(kwargs[k])
                 else:
                     self.globals[f"sys.{k}"] = kwargs[k]
         if not self.globals["sys.conversation_turns"] :
@@ -621,22 +621,6 @@ class Canvas(Graph):
     def get_component_input_elements(self, cpnnm):
         return self.components[cpnnm]["obj"].get_input_elements()

-    def get_files(self, files: Union[None, list[dict]]) -> list[str]:
-        from api.db.services.file_service import FileService
-        if not files:
-            return []
-
-        def image_to_base64(file):
-            return "data:{};base64,{}".format(file["mime_type"],
-                                              base64.b64encode(FileService.get_blob(file["created_by"], file["id"])).decode("utf-8"))
-
-        exe = ThreadPoolExecutor(max_workers=5)
-        threads = []
-        for file in files:
-            if file["mime_type"].find("image") >=0:
-                threads.append(exe.submit(image_to_base64, file))
-                continue
-            threads.append(exe.submit(FileService.parse, file["name"], FileService.get_blob(file["created_by"], file["id"]), True, file["created_by"]))
-        return [th.result() for th in threads]
-
     def tool_use_callback(self, agent_id: str, func_name: str, params: dict, result: Any, elapsed_time=None):
         agent_ids = agent_id.split("-->")
         agent_name = self.get_component_name(agent_ids[0])

---

@@ -14,6 +14,7 @@
 # limitations under the License.
 #
 from agent.component.fillup import UserFillUpParam, UserFillUp
+from api.db.services.file_service import FileService


 class BeginParam(UserFillUpParam):
@@ -48,7 +49,7 @@ class Begin(UserFillUp):
                 if v.get("optional") and v.get("value", None) is None:
                     v = None
                 else:
-                    v = self._canvas.get_files([v["value"]])
+                    v = FileService.get_files([v["value"]])
             else:
                 v = v.get("value")
             self.set_output(k, v)

---

@@ -15,13 +15,10 @@
 #
 import json
 import logging
-import re
-import sys
 from functools import partial
-import trio
 from quart import request, Response, make_response
 from agent.component import LLM
-from api.db import CanvasCategory, FileType
+from api.db import CanvasCategory
 from api.db.services.canvas_service import CanvasTemplateService, UserCanvasService, API4ConversationService
 from api.db.services.document_service import DocumentService
 from api.db.services.file_service import FileService
@@ -38,7 +35,6 @@ from peewee import MySQLDatabase, PostgresqlDatabase
 from api.db.db_models import APIToken, Task
 import time
-from api.utils.file_utils import filename_type, read_potential_broken_pdf
 from rag.flow.pipeline import Pipeline
 from rag.nlp import search
 from rag.utils.redis_conn import REDIS_CONN
@@ -250,71 +246,10 @@ async def upload(canvas_id):
         return get_data_error_result(message="canvas not found.")
     user_id = cvs["user_id"]

-    def structured(filename, filetype, blob, content_type):
-        nonlocal user_id
-        if filetype == FileType.PDF.value:
-            blob = read_potential_broken_pdf(blob)
-        location = get_uuid()
-        FileService.put_blob(user_id, location, blob)
-        return {
-            "id": location,
-            "name": filename,
-            "size": sys.getsizeof(blob),
-            "extension": filename.split(".")[-1].lower(),
-            "mime_type": content_type,
-            "created_by": user_id,
-            "created_at": time.time(),
-            "preview_url": None
-        }
-
-    if request.args.get("url"):
-        from crawl4ai import (
-            AsyncWebCrawler,
-            BrowserConfig,
-            CrawlerRunConfig,
-            DefaultMarkdownGenerator,
-            PruningContentFilter,
-            CrawlResult
-        )
-        try:
-            url = request.args.get("url")
-            filename = re.sub(r"\?.*", "", url.split("/")[-1])
-
-            async def adownload():
-                browser_config = BrowserConfig(
-                    headless=True,
-                    verbose=False,
-                )
-                async with AsyncWebCrawler(config=browser_config) as crawler:
-                    crawler_config = CrawlerRunConfig(
-                        markdown_generator=DefaultMarkdownGenerator(
-                            content_filter=PruningContentFilter()
-                        ),
-                        pdf=True,
-                        screenshot=False
-                    )
-                    result: CrawlResult = await crawler.arun(
-                        url=url,
-                        config=crawler_config
-                    )
-                    return result
-
-            page = trio.run(adownload())
-            if page.pdf:
-                if filename.split(".")[-1].lower() != "pdf":
-                    filename += ".pdf"
-                return get_json_result(data=structured(filename, "pdf", page.pdf, page.response_headers["content-type"]))
-            return get_json_result(data=structured(filename, "html", str(page.markdown).encode("utf-8"), page.response_headers["content-type"], user_id))
-        except Exception as e:
-            return server_error_response(e)
-
     files = await request.files
-    file = files['file']
+    file = files['file'] if files and files.get("file") else None
     try:
-        DocumentService.check_doc_health(user_id, file.filename)
-        return get_json_result(data=structured(file.filename, filename_type(file.filename), file.read(), file.content_type))
+        return get_json_result(data=FileService.upload_info(user_id, file, request.args.get("url")))
     except Exception as e:
         return server_error_response(e)

---

@@ -607,7 +607,7 @@ async def get_image(image_id):
 @login_required
 @validate_request("conversation_id")
 async def upload_and_parse():
-    files = await request.file
+    files = await request.files
     if "file" not in files:
         return get_json_result(data=False, message="No file part!", code=RetCode.ARGUMENT_ERROR)
@@ -705,3 +705,12 @@ async def set_meta():
         return get_json_result(data=True)
     except Exception as e:
         return server_error_response(e)
+
+
+@manager.route("/upload_info", methods=["POST"]) # noqa: F821
+async def upload_info():
+    files = await request.files
+    file = files['file'] if files and files.get("file") else None
+    try:
+        return get_json_result(data=FileService.upload_info(current_user.id, file, request.args.get("url")))
+    except Exception as e:
+        return server_error_response(e)

---

@@ -25,6 +25,7 @@ import trio
 from langfuse import Langfuse
 from peewee import fn
 from agentic_reasoning import DeepResearcher
+from api.db.services.file_service import FileService
 from common.constants import LLMType, ParserType, StatusEnum
 from api.db.db_models import DB, Dialog
 from api.db.services.common_service import CommonService
@@ -380,8 +381,11 @@ def chat(dialog, messages, stream=True, **kwargs):
     retriever = settings.retriever
     questions = [m["content"] for m in messages if m["role"] == "user"][-3:]
     attachments = kwargs["doc_ids"].split(",") if "doc_ids" in kwargs else []
+    attachments_= ""
     if "doc_ids" in messages[-1]:
         attachments = messages[-1]["doc_ids"]
+    if "files" in messages[-1]:
+        attachments_ = "\n\n".join(FileService.get_files(messages[-1]["files"]))

     prompt_config = dialog.prompt_config
     field_map = KnowledgebaseService.get_field_map(dialog.kb_ids)
@@ -451,7 +455,7 @@ def chat(dialog, messages, stream=True, **kwargs):
             ),
         )
-        for think in reasoner.thinking(kbinfos, " ".join(questions)):
+        for think in reasoner.thinking(kbinfos, attachments_ + " ".join(questions)):
             if isinstance(think, str):
                 thought = think
                 knowledges = [t for t in think.split("\n") if t]
@@ -503,7 +507,7 @@ def chat(dialog, messages, stream=True, **kwargs):
         kwargs["knowledge"] = "\n------\n" + "\n\n------\n\n".join(knowledges)
     gen_conf = dialog.llm_setting

-    msg = [{"role": "system", "content": prompt_config["system"].format(**kwargs)}]
+    msg = [{"role": "system", "content": prompt_config["system"].format(**kwargs)+attachments_}]
     prompt4citation = ""
     if knowledges and (prompt_config.get("quote", True) and kwargs.get("quote", True)):
         prompt4citation = citation_prompt()
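
A usage note on the `chat()` change above: the last user message may now carry a `files` list holding the metadata dicts produced by `FileService.upload_info`, and their parsed text is prepended to the reasoner question and appended to the system prompt. The caller-side shape below is an assumption inferred from the keys this diff reads; it is not a documented API, and `dialog`, `user_id`, and `uploaded_file` are placeholders.

```python
# Hypothetical caller-side sketch; names here are assumed to exist in the caller.
file_info = FileService.upload_info(user_id, uploaded_file)  # {"id", "name", "mime_type", "created_by", ...}

messages = [
    {
        "role": "user",
        "content": "Summarize the attached report.",
        # chat() reads this key and builds attachments_ via FileService.get_files(...)
        "files": [file_info],
    }
]

for piece in chat(dialog, messages, stream=True):
    ...  # streamed answer payloads, unchanged by this PR
```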

---

@@ -13,10 +13,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import asyncio
+import base64
 import logging
 import re
+import sys
+import time
 from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
+from typing import Union

 from peewee import fn
@@ -520,7 +525,7 @@ class FileService(CommonService):
         if img_base64 and file_type == FileType.VISUAL.value:
             return GptV4.image2base64(blob)
         cks = FACTORY.get(FileService.get_parser(filename_type(filename), filename, ""), naive).chunk(filename, blob, **kwargs)
-        return "\n".join([ck["content_with_weight"] for ck in cks])
+        return f"\n -----------------\nFile: {filename}\nContent as following: \n" + "\n".join([ck["content_with_weight"] for ck in cks])

     @staticmethod
     def get_parser(doc_type, filename, default):
@@ -588,3 +593,80 @@ class FileService(CommonService):
                 errors += str(e)
         return errors
+
+    @staticmethod
+    def upload_info(user_id, file, url: str|None=None):
+        def structured(filename, filetype, blob, content_type):
+            nonlocal user_id
+            if filetype == FileType.PDF.value:
+                blob = read_potential_broken_pdf(blob)
+            location = get_uuid()
+            FileService.put_blob(user_id, location, blob)
+            return {
+                "id": location,
+                "name": filename,
+                "size": sys.getsizeof(blob),
+                "extension": filename.split(".")[-1].lower(),
+                "mime_type": content_type,
+                "created_by": user_id,
+                "created_at": time.time(),
+                "preview_url": None
+            }
+
+        if url:
+            from crawl4ai import (
+                AsyncWebCrawler,
+                BrowserConfig,
+                CrawlerRunConfig,
+                DefaultMarkdownGenerator,
+                PruningContentFilter,
+                CrawlResult
+            )
+            filename = re.sub(r"\?.*", "", url.split("/")[-1])
+
+            async def adownload():
+                browser_config = BrowserConfig(
+                    headless=True,
+                    verbose=False,
+                )
+                async with AsyncWebCrawler(config=browser_config) as crawler:
+                    crawler_config = CrawlerRunConfig(
+                        markdown_generator=DefaultMarkdownGenerator(
+                            content_filter=PruningContentFilter()
+                        ),
+                        pdf=True,
+                        screenshot=False
+                    )
+                    result: CrawlResult = await crawler.arun(
+                        url=url,
+                        config=crawler_config
+                    )
+                    return result
+
+            page = asyncio.run(adownload())
+            if page.pdf:
+                if filename.split(".")[-1].lower() != "pdf":
+                    filename += ".pdf"
+                return structured(filename, "pdf", page.pdf, page.response_headers["content-type"])
+            return structured(filename, "html", str(page.markdown).encode("utf-8"), page.response_headers["content-type"], user_id)
+
+        DocumentService.check_doc_health(user_id, file.filename)
+        return structured(file.filename, filename_type(file.filename), file.read(), file.content_type)
+
+    @staticmethod
+    def get_files(self, files: Union[None, list[dict]]) -> list[str]:
+        if not files:
+            return []
+
+        def image_to_base64(file):
+            return "data:{};base64,{}".format(file["mime_type"],
+                                              base64.b64encode(FileService.get_blob(file["created_by"], file["id"])).decode("utf-8"))
+
+        exe = ThreadPoolExecutor(max_workers=5)
+        threads = []
+        for file in files:
+            if file["mime_type"].find("image") >=0:
+                threads.append(exe.submit(image_to_base64, file))
+                continue
+            threads.append(exe.submit(FileService.parse, file["name"], FileService.get_blob(file["created_by"], file["id"]), True, file["created_by"]))
+        return [th.result() for th in threads]
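
For reviewers, an illustrative sketch of the round trip these two new static methods set up, based only on the code above: `upload_info` persists the blob (or crawls a URL) and returns a metadata dict, and `get_files` later consumes a list of such dicts, yielding a data URI for images and a `File: <name> ... Content as following:` text block for everything else. The calls below assume a configured storage backend and are not runnable standalone.

```python
# Illustrative flow only (assumes storage and parsers are configured).
info = FileService.upload_info(user_id, file)   # or upload_info(user_id, None, url="https://example.com/page")
# info -> {"id": <blob location>, "name": ..., "size": ..., "extension": ...,
#          "mime_type": ..., "created_by": user_id, "created_at": ..., "preview_url": None}

texts = FileService.get_files([info])
# image files -> "data:<mime_type>;base64,<encoded blob>"
# other files -> "\n -----------------\nFile: <name>\nContent as following: \n<chunk text>..."
```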

---

@@ -39,6 +39,7 @@ from deepdoc.parser.docling_parser import DoclingParser
 from deepdoc.parser.tcadp_parser import TCADPParser
 from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context


 def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
     callback = callback
     binary = binary
@@ -600,8 +601,7 @@ def load_from_xml_v2(baseURI, rels_item_xml):
         srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
     return srels


-def chunk(filename, binary=None, from_page=0, to_page=100000,
-          lang="Chinese", callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     """
     Supported file formats are docx, pdf, excel, txt.
     This method apply the naive ways to chunk files.
@@ -611,14 +611,18 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     urls = set()
     url_res = []
     is_english = lang.lower() == "english"  # is_english(cks)

     parser_config = kwargs.get(
         "parser_config", {
             "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
+    child_deli = re.findall(r"`([^`]+)`", parser_config.get("children_delimiter", ""))
+    child_deli = sorted(set(child_deli), key=lambda x: -len(x))
+    child_deli = "|".join(re.escape(t) for t in child_deli if t)
+    is_markdown = False
     table_context_size = max(0, int(parser_config.get("table_context_size", 0) or 0))
     image_context_size = max(0, int(parser_config.get("image_context_size", 0) or 0))
-    final_sections = False
     doc = {
         "docnm_kwd": filename,
         "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
@@ -679,12 +683,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
                 "chunk_token_num", 128)), parser_config.get(
                 "delimiter", "\n!?。;!?"))
-        if kwargs.get("section_only", False):
-            chunks.extend(embed_res)
-            chunks.extend(url_res)
-            return chunks
-
-        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
+        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images, child_delimiters_pattern=child_deli))
         logging.info("naive_merge({}): {}".format(filename, timer() - st))
         res.extend(embed_res)
         res.extend(url_res)
@@ -780,7 +779,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             return_section_images=True,
         )
-        final_sections = True
+        is_markdown = True

         try:
             vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
@@ -857,7 +856,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             "file type not supported yet(pdf, xlsx, doc, docx, txt supported)")

     st = timer()
-    if final_sections:
+    if is_markdown:
         merged_chunks = []
         merged_images = []
         chunk_limit = max(0, int(parser_config.get("chunk_token_num", 128)))
@@ -900,13 +899,11 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         chunks = merged_chunks
         has_images = merged_images and any(img is not None for img in merged_images)

-        if kwargs.get("section_only", False):
-            chunks.extend(embed_res)
-            return chunks
-
         if has_images:
-            res.extend(tokenize_chunks_with_images(chunks, doc, is_english, merged_images))
+            res.extend(tokenize_chunks_with_images(chunks, doc, is_english, merged_images, child_delimiters_pattern=child_deli))
         else:
-            res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
+            res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser, child_delimiters_pattern=child_deli))
     else:
         if section_images:
             if all(image is None for image in section_images):
@@ -917,21 +914,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
                     int(parser_config.get(
                         "chunk_token_num", 128)), parser_config.get(
                         "delimiter", "\n!?。;!?"))
-                if kwargs.get("section_only", False):
-                    chunks.extend(embed_res)
-                    return chunks
-
-                res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
+                res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images, child_delimiters_pattern=child_deli))
             else:
                 chunks = naive_merge(
                     sections, int(parser_config.get(
                         "chunk_token_num", 128)), parser_config.get(
                         "delimiter", "\n!?。;!?"))
-                if kwargs.get("section_only", False):
-                    chunks.extend(embed_res)
-                    return chunks
-
-                res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
+                res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser, child_delimiters_pattern=child_deli))

     if urls and parser_config.get("analyze_hyperlink", False) and is_root:
         for index, url in enumerate(urls):
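
As a quick reference for the new `children_delimiter` handling in `chunk()`: the setting is a string of backtick-wrapped tokens, which are deduplicated, escaped, sorted longest-first, and joined into a regex alternation; each merged chunk is later re-split with a capturing group so the delimiters themselves are preserved. A minimal, self-contained sketch (the sample config value is made up):

```python
import re

# Hypothetical parser_config entry; the backtick-wrapped format mirrors
# the re.findall(r"`([^`]+)`", ...) call in the diff above.
parser_config = {"children_delimiter": "`## ``. `"}

child_deli = re.findall(r"`([^`]+)`", parser_config.get("children_delimiter", ""))
child_deli = sorted(set(child_deli), key=lambda x: -len(x))   # longest tokens first
child_deli = "|".join(re.escape(t) for t in child_deli if t)  # escaped alternation

chunk_text = "## Intro. Parents keep context. ## Details. Children stay small."
children = re.split(r"(%s)" % child_deli, chunk_text, flags=re.DOTALL)
print(children)  # delimiters are kept as separate items thanks to the capturing group
```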

---

@@ -13,10 +13,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import random
+import re
+from copy import deepcopy
 from functools import partial

 import trio

 from common.misc_utils import get_uuid
 from rag.utils.base64_image import id2image, image2id
 from deepdoc.parser.pdf_parser import RAGFlowPdfParser
@@ -32,6 +32,7 @@ class SplitterParam(ProcessParamBase):
         self.chunk_token_size = 512
         self.delimiters = ["\n"]
         self.overlapped_percent = 0
+        self.children_delimiters = []

     def check(self):
         self.check_empty(self.delimiters, "Delimiters.")
@@ -58,6 +59,14 @@ class Splitter(ProcessBase):
                 deli += f"`{d}`"
             else:
                 deli += d
+        child_deli = ""
+        for d in self._param.children_delimiters:
+            if len(d) > 1:
+                child_deli += f"`{d}`"
+            else:
+                child_deli += d
+        child_deli = [m.group(1) for m in re.finditer(r"`([^`]+)`", child_deli)]
+        custom_pattern = "|".join(re.escape(t) for t in sorted(set(child_deli), key=len, reverse=True))
         self.set_output("output_format", "chunks")
         self.callback(random.randint(1, 5) / 100.0, "Start to split into chunks.")
@@ -78,6 +87,22 @@ class Splitter(ProcessBase):
                 deli,
                 self._param.overlapped_percent,
             )
+            if custom_pattern:
+                docs = []
+                for c in cks:
+                    if not c.strip():
+                        continue
+                    split_sec = re.split(r"(%s)" % custom_pattern, c, flags=re.DOTALL)
+                    if split_sec:
+                        for txt in split_sec:
+                            docs.append({
+                                "text": txt,
+                                "mom": c
+                            })
+                    else:
+                        docs.append({"text": c})
+                self.set_output("chunks", docs)
+            else:
                 self.set_output("chunks", [{"text": c.strip()} for c in cks if c.strip()])
             self.callback(1, "Done.")
@@ -100,12 +125,27 @@ class Splitter(ProcessBase):
                 {
                     "text": RAGFlowPdfParser.remove_tag(c),
                     "image": img,
-                    "positions": [[pos[0][-1]+1, *pos[1:]] for pos in RAGFlowPdfParser.extract_positions(c)],
+                    "positions": [[pos[0][-1]+1, *pos[1:]] for pos in RAGFlowPdfParser.extract_positions(c)]
                 }
                 for c, img in zip(chunks, images) if c.strip()
             ]
             async with trio.open_nursery() as nursery:
                 for d in cks:
                     nursery.start_soon(image2id, d, partial(settings.STORAGE_IMPL.put, tenant_id=self._canvas._tenant_id), get_uuid())
+            if custom_pattern:
+                docs = []
+                for c in cks:
+                    split_sec = re.split(r"(%s)" % custom_pattern, c["text"], flags=re.DOTALL)
+                    if split_sec:
+                        c["mom"] = c["text"]
+                        for txt in split_sec:
+                            cc = deepcopy(c)
+                            cc["text"] = txt
+                            docs.append(cc)
+                    else:
+                        docs.append(c)
+                self.set_output("chunks", docs)
+            else:
                 self.set_output("chunks", cks)
             self.callback(1, "Done.")
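
The `Splitter` change follows the same pattern, but driven by a list-valued `children_delimiters` parameter: multi-character delimiters are backtick-wrapped, collected with `re.finditer`, and turned into an escaped, longest-first alternation, and every chunk is expanded into child records that keep the original chunk text as `mom`. A self-contained sketch with illustrative parameter values:

```python
import re

children_delimiters = ["## ", "---"]  # illustrative SplitterParam.children_delimiters

# Same normalization as the diff: multi-character delimiters get backtick-wrapped
# before the backticked tokens are collected into a regex alternation.
child_deli = ""
for d in children_delimiters:
    child_deli += f"`{d}`" if len(d) > 1 else d
tokens = [m.group(1) for m in re.finditer(r"`([^`]+)`", child_deli)]
custom_pattern = "|".join(re.escape(t) for t in sorted(set(tokens), key=len, reverse=True))

cks = ["## Title\nFirst paragraph --- second paragraph"]
docs = []
for c in cks:
    for txt in re.split(r"(%s)" % custom_pattern, c, flags=re.DOTALL):
        docs.append({"text": txt, "mom": c})  # each child carries its parent chunk
print(docs)
```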

---

@@ -264,14 +264,14 @@ def is_chinese(text):
     return False


-def tokenize(d, t, eng):
-    d["content_with_weight"] = t
-    t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
+def tokenize(d, txt, eng):
+    d["content_with_weight"] = txt
+    t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", txt)
     d["content_ltks"] = rag_tokenizer.tokenize(t)
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])


-def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
+def tokenize_chunks(chunks, doc, eng, pdf_parser=None, child_delimiters_pattern=None):
     res = []
     # wrap up as es documents
     for ii, ck in enumerate(chunks):
@@ -288,12 +288,21 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
                 pass
         else:
             add_positions(d, [[ii]*5])
+        if child_delimiters_pattern:
+            d["mom_with_weight"] = ck
+            for txt in re.split(r"(%s)" % child_delimiters_pattern, ck, flags=re.DOTALL):
+                dd = copy.deepcopy(d)
+                tokenize(dd, txt, eng)
+                res.append(dd)
+            continue
+
         tokenize(d, ck, eng)
         res.append(d)
     return res


-def tokenize_chunks_with_images(chunks, doc, eng, images):
+def tokenize_chunks_with_images(chunks, doc, eng, images, child_delimiters_pattern=None):
     res = []
     # wrap up as es documents
     for ii, (ck, image) in enumerate(zip(chunks, images)):
@@ -303,6 +312,13 @@ def tokenize_chunks_with_images(chunks, doc, eng, images):
         d = copy.deepcopy(doc)
         d["image"] = image
         add_positions(d, [[ii]*5])
+        if child_delimiters_pattern:
+            d["mom_with_weight"] = ck
+            for txt in re.split(r"(%s)" % child_delimiters_pattern, ck, flags=re.DOTALL):
+                dd = copy.deepcopy(d)
+                tokenize(dd, txt, eng)
+                res.append(dd)
+            continue
         tokenize(d, ck, eng)
         res.append(d)
     return res
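
To make the emitted shape concrete: with `child_delimiters_pattern` set, `tokenize_chunks` / `tokenize_chunks_with_images` produce one document per child segment, each holding the child text in `content_with_weight` and the full parent chunk in `mom_with_weight`. A stripped-down, self-contained sketch (the stub below only sets `content_with_weight`; the real `tokenize` also fills the token fields):

```python
import copy
import re

def tokenize_stub(d, txt):
    # Stand-in for rag.nlp.tokenize: the real one also fills
    # content_ltks / content_sm_ltks via rag_tokenizer.
    d["content_with_weight"] = txt

def child_docs(chunk, doc, pattern):
    d = copy.deepcopy(doc)
    d["mom_with_weight"] = chunk  # parent text travels with every child
    res = []
    for txt in re.split(r"(%s)" % pattern, chunk, flags=re.DOTALL):
        dd = copy.deepcopy(d)
        tokenize_stub(dd, txt)
        res.append(dd)
    return res

docs = child_docs("## A\nfirst part\n## B\nsecond part",
                  {"docnm_kwd": "demo.md"}, re.escape("## "))
for d in docs:
    print(repr(d["content_with_weight"]), "<- parent:", repr(d["mom_with_weight"]))
```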

---

@@ -128,9 +128,6 @@ def signal_handler(sig, frame):
     sys.exit(0)


 def set_progress(task_id, from_page=0, to_page=-1, prog=None, msg="Processing..."):
     try:
         if prog is not None and prog < 0:
@@ -720,6 +717,34 @@ async def delete_image(kb_id, chunk_id):


 async def insert_es(task_id, task_tenant_id, task_dataset_id, chunks, progress_callback):
+    mothers = []
+    mother_ids = set([])
+    for ck in chunks:
+        mom = ck.get("mom") or ck.get("mom_with_weight") or ""
+        if not mom:
+            continue
+        id = xxhash.xxh64(mom.encode("utf-8")).hexdigest()
+        if id in mother_ids:
+            continue
+        mother_ids.add(id)
+        ck["mom_id"] = id
+        mom_ck = copy.deepcopy(ck)
+        mom_ck["id"] = id
+        mom_ck["content_with_weight"] = mom
+        mom_ck["available_int"] = 0
+        flds = list(mom_ck.keys())
+        for fld in flds:
+            if fld not in ["id", "content_with_weight", "doc_id", "kb_id", "available_int"]:
+                del mom_ck[fld]
+        mothers.append(mom_ck)
+
+    for b in range(0, len(mothers), settings.DOC_BULK_SIZE):
+        await trio.to_thread.run_sync(lambda: settings.docStoreConn.insert(mothers[b:b + settings.DOC_BULK_SIZE], search.index_name(task_tenant_id), task_dataset_id))
+        task_canceled = has_canceled(task_id)
+        if task_canceled:
+            progress_callback(-1, msg="Task has been canceled.")
+            return False
+
     for b in range(0, len(chunks), settings.DOC_BULK_SIZE):
         doc_store_result = await trio.to_thread.run_sync(lambda: settings.docStoreConn.insert(chunks[b:b + settings.DOC_BULK_SIZE], search.index_name(task_tenant_id), task_dataset_id))
         task_canceled = has_canceled(task_id)
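
Finally, a small sketch of the parent-insertion step added to `insert_es`: any child carrying `mom`/`mom_with_weight` text yields a deterministic parent id (xxh64 of the parent text), children of the same parent collapse onto a single "mother" chunk, and that mother is stored with `available_int = 0` so it is never returned directly by retrieval. For clarity this sketch tags every child with `mom_id`, whereas the diff assigns it only to the first child of each parent before skipping duplicates; the chunk dicts are simplified.

```python
import copy
import xxhash  # same hashing library used in the diff above

chunks = [
    {"id": "c1", "content_with_weight": "first child",  "mom": "FULL PARENT TEXT",
     "doc_id": "d1", "kb_id": "kb1", "q_512_vec": [0.1, 0.2]},
    {"id": "c2", "content_with_weight": "second child", "mom": "FULL PARENT TEXT",
     "doc_id": "d1", "kb_id": "kb1", "q_512_vec": [0.3, 0.4]},
]

mothers, mother_ids = [], set()
for ck in chunks:
    mom = ck.get("mom") or ck.get("mom_with_weight") or ""
    if not mom:
        continue
    mom_id = xxhash.xxh64(mom.encode("utf-8")).hexdigest()
    ck["mom_id"] = mom_id                  # child -> parent link
    if mom_id in mother_ids:
        continue                           # one mother per distinct parent text
    mother_ids.add(mom_id)
    mom_ck = copy.deepcopy(ck)
    mom_ck.update({"id": mom_id, "content_with_weight": mom, "available_int": 0})
    for fld in list(mom_ck.keys()):        # keep only the fields the mother needs
        if fld not in ["id", "content_with_weight", "doc_id", "kb_id", "available_int"]:
            del mom_ck[fld]
    mothers.append(mom_ck)

print(f"{len(mothers)} mother chunk(s) for {len(chunks)} children")
```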