diff --git a/api/db/services/task_service.py b/api/db/services/task_service.py index 028381b44..c8202dd4f 100644 --- a/api/db/services/task_service.py +++ b/api/db/services/task_service.py @@ -529,6 +529,7 @@ def cancel_all_task_of(doc_id): def has_canceled(task_id): try: if REDIS_CONN.get(f"{task_id}-cancel"): + logging.info(f"Task: {task_id} has been canceled") return True except Exception as e: logging.exception(e) diff --git a/deepdoc/parser/figure_parser.py b/deepdoc/parser/figure_parser.py index 86b05690c..caf4f5b7b 100644 --- a/deepdoc/parser/figure_parser.py +++ b/deepdoc/parser/figure_parser.py @@ -25,7 +25,7 @@ from rag.app.picture import vision_llm_chunk as picture_vision_llm_chunk from rag.prompts.generator import vision_llm_figure_describe_prompt, vision_llm_figure_describe_prompt_with_context from rag.nlp import append_context2table_image4pdf - +# need to delete before pr def vision_figure_parser_figure_data_wrapper(figures_data_without_positions): if not figures_data_without_positions: return [] @@ -38,7 +38,6 @@ def vision_figure_parser_figure_data_wrapper(figures_data_without_positions): if isinstance(figure_data[1], Image.Image) ] - def vision_figure_parser_docx_wrapper(sections, tbls, callback=None,**kwargs): if not sections: return tbls @@ -124,8 +123,56 @@ def vision_figure_parser_pdf_wrapper(tbls, callback=None, **kwargs): return tbls -shared_executor = ThreadPoolExecutor(max_workers=10) +def vision_figure_parser_docx_wrapper_naive(chunks, idx_lst, callback=None, **kwargs): + print("\n\n hello here i am \n\n") + + if not chunks: + return [] + try: + vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT) + callback(0.7, "Visual model detected. Attempting to enhance figure extraction...") + print(" \n\n Yes vision model \n\n") + except Exception: + vision_model = None + print(" \n\n No vision model \n\n") + if vision_model: + @timeout(30, 3) + def worker(idx, ck): + context_above = ck.get("context_above", "") + context_below = ck.get("context_below", "") + if context_above or context_below: + prompt = vision_llm_figure_describe_prompt_with_context( + # context_above + caption if any + context_above=ck.get("context_above") + ck.get("text", ""), + context_below=ck.get("context_below"), + ) + logging.info(f"[VisionFigureParser] figure={idx} context_above_len={len(context_above)} context_below_len={len(context_below)} prompt=with_context") + logging.info(f"[VisionFigureParser] figure={idx} context_above_snippet={context_above[:512]}") + logging.info(f"[VisionFigureParser] figure={idx} context_below_snippet={context_below[:512]}") + else: + prompt = vision_llm_figure_describe_prompt() + logging.info(f"[VisionFigureParser] figure={idx} context_len=0 prompt=default") + + description_text = picture_vision_llm_chunk( + binary=ck.get("image"), + vision_model=vision_model, + prompt=prompt, + callback=callback, + ) + return idx, description_text + + with ThreadPoolExecutor(max_workers=10) as executor: + futures = [ + executor.submit(worker, idx, chunks[idx]) + for idx in idx_lst + ] + + for future in as_completed(futures): + idx, description = future.result() + chunks[idx]['text'] += description + +shared_executor = ThreadPoolExecutor(max_workers=10) class VisionFigureParser: def __init__(self, vision_model, figures_data, *args, **kwargs): diff --git a/rag/app/book.py b/rag/app/book.py index 5f093c55b..86763adf2 100644 --- a/rag/app/book.py +++ b/rag/app/book.py @@ -87,10 +87,18 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback(0.1, "Start to parse.") doc_parser = naive.Docx() # TODO: table of contents need to be removed - sections, tbls = doc_parser( + main_sections = doc_parser( filename, binary=binary, from_page=from_page, to_page=to_page) + + sections = [] + tbls = [] + for text, image, html in main_sections: + sections.append((text, image)) + tbls.append(((None, html), "")) + remove_contents_table(sections, eng=is_english( random_choices([t for t, _ in sections], k=200))) + tbls = vision_figure_parser_docx_wrapper(sections=sections, tbls=tbls, callback=callback, **kwargs) # tbls = [((None, lns), None) for lns in tbls] sections = [(item[0], item[1] if item[1] is not None else "") for item in sections if diff --git a/rag/app/naive.py b/rag/app/naive.py index c2e028b34..05d673e4b 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -23,6 +23,8 @@ from timeit import default_timer as timer from docx import Document from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship +from docx.table import Table as DocxTable +from docx.text.paragraph import Paragraph from docx.opc.oxml import parse_xml from markdown import markdown from PIL import Image @@ -33,15 +35,15 @@ from api.db.services.llm_service import LLMBundle from rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, \ PdfParser, TxtParser -from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_docx_wrapper, \ +from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_docx_wrapper_naive, \ vision_figure_parser_pdf_wrapper from deepdoc.parser.pdf_parser import PlainParser, VisionParser from deepdoc.parser.docling_parser import DoclingParser from deepdoc.parser.tcadp_parser import TCADPParser from common.parser_config_utils import normalize_layout_recognizer from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, \ - tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context, append_context2table_image4pdf - + tokenize_chunks, doc_tokenize_chunks_with_images, tokenize_table, append_context2table_image4pdf, tokenize_chunks_with_images, \ + attach_media_context # noqa: F401 def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None, **kwargs): @@ -343,67 +345,116 @@ class Docx(DocxParser): pn = 0 lines = [] last_image = None - for p in self.doc.paragraphs: + table_idx = 0 + + def flush_last_image(): + nonlocal last_image, lines + if last_image is not None: + lines.append({"text": "", "image": last_image, "table": None, "style": "Image"}) + last_image = None + + for block in self.doc._element.body: if pn > to_page: break - if from_page <= pn < to_page: - if p.text.strip(): - if p.style and p.style.name == 'Caption': - former_image = None - if lines and lines[-1][1] and lines[-1][2] != 'Caption': - former_image = lines[-1][1].pop() - elif last_image: - former_image = last_image - last_image = None - lines.append((self.__clean(p.text), [former_image], p.style.name)) + + if block.tag.endswith('p'): + p = Paragraph(block, self.doc) + + if from_page <= pn < to_page: + text = p.text.strip() + style_name = p.style.name if p.style else "" + + if text: + if style_name == "Caption": + former_image = None + + if lines and lines[-1].get("image") and lines[-1].get("style") != "Caption": + former_image = lines[-1].get("image") + lines.pop() + + elif last_image is not None: + former_image = last_image + last_image = None + + lines.append( + { + "text": self.__clean(text), + "image": former_image if former_image else None, + "table": None, + } + ) + + else: + flush_last_image() + lines.append( + { + "text": self.__clean(text), + "image": None, + "table": None, + } + ) + + current_image = self.get_picture(self.doc, p) + if current_image is not None: + lines.append( + { + "text": "", + "image": current_image, + "table": None, + } + ) + else: current_image = self.get_picture(self.doc, p) - image_list = [current_image] - if last_image: - image_list.insert(0, last_image) - last_image = None - lines.append((self.__clean(p.text), image_list, p.style.name if p.style else "")) - else: - if current_image := self.get_picture(self.doc, p): - if lines: - lines[-1][1].append(current_image) - else: + if current_image is not None: last_image = current_image - for run in p.runs: - if 'lastRenderedPageBreak' in run._element.xml: - pn += 1 - continue - if 'w:br' in run._element.xml and 'type="page"' in run._element.xml: - pn += 1 - new_line = [(line[0], reduce(concat_img, line[1]) if line[1] else None) for line in lines] - tbls = [] - for i, tb in enumerate(self.doc.tables): - title = self.__get_nearest_title(i, filename) - html = "" - if title: - html += f"" - for r in tb.rows: - html += "" - i = 0 - try: - while i < len(r.cells): - span = 1 - c = r.cells[i] - for j in range(i + 1, len(r.cells)): - if c.text == r.cells[j].text: - span += 1 - i = j - else: - break - i += 1 - html += f"" if span == 1 else f"" - except Exception as e: - logging.warning(f"Error parsing table, ignore: {e}") - html += "" - html += "
Table Location: {title}
{c.text}{c.text}
" - tbls.append(((None, html), "")) - return new_line, tbls + for run in p.runs: + xml = run._element.xml + if "lastRenderedPageBreak" in xml: + pn += 1 + continue + if "w:br" in xml and 'type="page"' in xml: + pn += 1 + + elif block.tag.endswith('tbl'): + if pn < from_page or pn > to_page: + table_idx += 1 + continue + + flush_last_image() + tb = DocxTable(block, self.doc) + title = self.__get_nearest_title(table_idx, filename) + html = "" + if title: + html += f"" + for r in tb.rows: + html += "" + col_idx = 0 + try: + while col_idx < len(r.cells): + span = 1 + c = r.cells[col_idx] + for j in range(col_idx + 1, len(r.cells)): + if c.text == r.cells[j].text: + span += 1 + col_idx = j + else: + break + col_idx += 1 + html += f"" if span == 1 else f"" + except Exception as e: + logging.warning(f"Error parsing table, ignore: {e}") + html += "" + html += "
Table Location: {title}
{c.text}{c.text}
" + lines.append({"text": "", "image": None, "table": html}) + table_idx += 1 + + flush_last_image() + new_line = [(line.get("text"), line.get("image"), line.get("table")) for line in lines] + + return new_line + def to_markdown(self, filename=None, binary=None, inline_images: bool = True): """ @@ -727,26 +778,26 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca # fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246 _SerializedRelationships.load_from_xml = load_from_xml_v2 - sections, tables = Docx()(filename, binary) - tables = vision_figure_parser_docx_wrapper(sections=sections, tbls=tables, callback=callback, **kwargs) - - res = tokenize_table(tables, doc, is_english) - callback(0.8, "Finish parsing.") - - st = timer() + # sections = (text, image, tables) + sections = Docx()(filename, binary) + # chunks list[dict] + # images list - index of image chunk in chunks chunks, images = naive_merge_docx( sections, int(parser_config.get( "chunk_token_num", 128)), parser_config.get( - "delimiter", "\n!?。;!?")) + "delimiter", "\n!?。;!?"), table_context_size, image_context_size) + + vision_figure_parser_docx_wrapper_naive(chunks=chunks, idx_lst=images, callback=callback, **kwargs) - res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images, child_delimiters_pattern=child_deli)) + callback(0.8, "Finish parsing.") + st = timer() + + res.extend(doc_tokenize_chunks_with_images(chunks, doc, is_english, child_delimiters_pattern=child_deli)) logging.info("naive_merge({}): {}".format(filename, timer() - st)) res.extend(embed_res) res.extend(url_res) - if table_context_size or image_context_size: - attach_media_context(res, table_context_size, image_context_size) return res elif re.search(r"\.pdf$", filename, re.IGNORECASE): @@ -1012,7 +1063,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca res.extend(embed_res) if url_res: res.extend(url_res) - #if table_context_size or image_context_size: + # if table_context_size or image_context_size: # attach_media_context(res, table_context_size, image_context_size) return res diff --git a/rag/app/one.py b/rag/app/one.py index bb9f09f1a..a53d00ea9 100644 --- a/rag/app/one.py +++ b/rag/app/one.py @@ -22,7 +22,7 @@ from deepdoc.parser.utils import get_text from rag.app import naive from rag.nlp import rag_tokenizer, tokenize from deepdoc.parser import PdfParser, ExcelParser, HtmlParser -from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper +from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper_naive from rag.app.naive import by_plaintext, PARSERS from common.parser_config_utils import normalize_layout_recognizer @@ -76,11 +76,26 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, if re.search(r"\.docx$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") - sections, tbls = naive.Docx()(filename, binary) - tbls = vision_figure_parser_docx_wrapper(sections=sections, tbls=tbls, callback=callback, **kwargs) - sections = [s for s, _ in sections if s] - for (_, html), _ in tbls: - sections.append(html) + sections = naive.Docx()(filename, binary) + cks = [] + image_idxs = [] + + for text, image, table in sections: + if table is not None: + text = (text or "") + str(table) + ck_type = "table" + else: + ck_type = "image" if image is not None else "text" + + if ck_type == "image": + image_idxs.append(len(cks)) + + cks.append({"text": text, "image": image, "ck_type": ck_type}) + + vision_figure_parser_docx_wrapper_naive(cks, image_idxs, callback, **kwargs) + for ck in cks: + print(ck) + sections = [ck["text"] for ck in cks if ck.get("text")] callback(0.8, "Finish parsing.") elif re.search(r"\.pdf$", filename, re.IGNORECASE): diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index b41bf7ead..1b0029d2c 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -316,6 +316,32 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None, child_delimiters_pattern= return res +def doc_tokenize_chunks_with_images(chunks, doc, eng, child_delimiters_pattern=None, batch_size=10): + res = [] + for ii, ck in enumerate(chunks): + text = ck.get('context_above', "") + ck.get('text') + ck.get('context_below', "") + if len(text.strip()) == 0: + continue + logging.debug("-- {}".format(ck)) + d = copy.deepcopy(doc) + if ck.get("image"): + d["image"] = ck.get("image") + add_positions(d, [[ii] * 5]) + + if ck.get("ck_type") == "text": + if child_delimiters_pattern: + d["mom_with_weight"] = ck + res.extend(split_with_pattern(d, child_delimiters_pattern, text, eng)) + continue + elif ck.get("ck_type") == "image": + d["doc_type_kwd"] = "image" + elif ck.get("ck_type") == "table": + d["doc_type_kwd"] = "table" + tokenize(d, text, eng) + res.append(d) + return res + + def tokenize_chunks_with_images(chunks, doc, eng, images, child_delimiters_pattern=None): res = [] # wrap up as es documents @@ -789,6 +815,11 @@ def append_context2table_image4pdf(sections: list, tabls: list, table_context_si if len(contexts) < len(res) + 1: contexts.append(("", "")) res.append(((img, tb), poss)) + + print("\n\n") + for c in contexts: + print(c) + print("\n\n") return contexts if return_context else res @@ -1200,57 +1231,181 @@ def concat_img(img1, img2): new_image.paste(img2, (0, height1)) return new_image - -def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"): - if not sections: - return [], [] - +def _build_cks(sections, delimiter): cks = [] + tables = [] images = [] - tk_nums = [] - - def add_chunk(t, image, pos=""): - nonlocal cks, images, tk_nums - tnum = num_tokens_from_string(t) - if tnum < 8: - pos = "" - - if not cks or tk_nums[-1] > chunk_token_num: - # new chunk - if pos and t.find(pos) < 0: - t += pos - cks.append(t) - images.append(image) - tk_nums.append(tnum) - else: - # add to last chunk - if pos and cks[-1].find(pos) < 0: - t += pos - cks[-1] += t - images[-1] = concat_img(images[-1], image) - tk_nums[-1] += tnum custom_delimiters = [m.group(1) for m in re.finditer(r"`([^`]+)`", delimiter)] has_custom = bool(custom_delimiters) + if has_custom: - custom_pattern = "|".join(re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True)) - cks, images, tk_nums = [], [], [] + custom_pattern = "|".join( + re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True) + ) pattern = r"(%s)" % custom_pattern - for sec, image in sections: - split_sec = re.split(pattern, sec) + + for text, image, table in sections: + # normalize text + if not text: + text = "\n" + else: + text = "\n" + str(text) + + if table: + # table ck + ck_text = text + str(table) + idx = len(cks) + cks.append({"text": ck_text, "image": image, "ck_type": "table", "tk_nums": num_tokens_from_string(ck_text)}) + tables.append(idx) + continue + + if image: + # image ck (text can be kept as-is; depends on your downstream) + idx = len(cks) + cks.append({"text": text, "image": image, "ck_type": "image", "tk_nums": num_tokens_from_string(text)}) + images.append(idx) + continue + + # pure text ck(s) + if has_custom: + split_sec = re.split(pattern, text) for sub_sec in split_sec: if not sub_sec or re.fullmatch(custom_pattern, sub_sec): continue - text_seg = "\n" + sub_sec - cks.append(text_seg) - images.append(image) - tk_nums.append(num_tokens_from_string(text_seg)) - return cks, images + seg = "\n" + sub_sec if not sub_sec.startswith("\n") else sub_sec + cks.append({"text": seg, "image": None, "ck_type": "text", "tk_nums": num_tokens_from_string(seg)}) + else: + cks.append({"text": text, "image": None, "ck_type": "text", "tk_nums": num_tokens_from_string(text)}) - for sec, image in sections: - add_chunk("\n" + sec, image, "") + return cks, tables, images - return cks, images + +def _add_context(cks, idx, context_size): + if cks[idx]["ck_type"] not in ("image", "table"): + return + + prev = idx - 1 + after = idx + 1 + remain_above = context_size + remain_below = context_size + + cks[idx]["context_above"] = "" + cks[idx]["context_below"] = "" + + split_pat = r"([。!??;!\n]|\. )" + + picked_above = [] + picked_below = [] + + def take_sentences_from_end(cnt, need_tokens): + txts = re.split(split_pat, cnt, flags=re.DOTALL) + sents = [] + for j in range(0, len(txts), 2): + sents.append(txts[j] + (txts[j + 1] if j + 1 < len(txts) else "")) + acc = "" + for s in reversed(sents): + acc = s + acc + if num_tokens_from_string(acc) >= need_tokens: + break + return acc + + def take_sentences_from_start(cnt, need_tokens): + txts = re.split(split_pat, cnt, flags=re.DOTALL) + acc = "" + for j in range(0, len(txts), 2): + acc += txts[j] + (txts[j + 1] if j + 1 < len(txts) else "") + if num_tokens_from_string(acc) >= need_tokens: + break + return acc + + # above + parts_above = [] + while prev >= 0 and remain_above > 0: + if cks[prev]["ck_type"] == "text": + tk = cks[prev]["tk_nums"] + if tk >= remain_above: + piece = take_sentences_from_end(cks[prev]["text"], remain_above) + parts_above.insert(0, piece) + picked_above.append((prev, "tail", remain_above, tk, piece[:80])) + remain_above = 0 + break + else: + parts_above.insert(0, cks[prev]["text"]) + picked_above.append((prev, "full", remain_above, tk, (cks[prev]["text"] or "")[:80])) + remain_above -= tk + prev -= 1 + + # below + parts_below = [] + while after < len(cks) and remain_below > 0: + if cks[after]["ck_type"] == "text": + tk = cks[after]["tk_nums"] + if tk >= remain_below: + piece = take_sentences_from_start(cks[after]["text"], remain_below) + parts_below.append(piece) + picked_below.append((after, "head", remain_below, tk, piece[:80])) + remain_below = 0 + break + else: + parts_below.append(cks[after]["text"]) + picked_below.append((after, "full", remain_below, tk, (cks[after]["text"] or "")[:80])) + remain_below -= tk + after += 1 + + cks[idx]["context_above"] = "".join(parts_above) if parts_above else "" + cks[idx]["context_below"] = "".join(parts_below) if parts_below else "" + + +def _merge_cks(cks, chunk_token_num): + merged = [] + image_idxs = [] + prev_text_ck = -1 + + for i in range(len(cks)): + ck_type = cks[i]["ck_type"] + + if ck_type != "text": + merged.append(cks[i]) + if ck_type == "image": + image_idxs.append(len(merged) - 1) + continue + + + if prev_text_ck<0 or merged[prev_text_ck]["tk_nums"] >= chunk_token_num: + merged.append(cks[i]) + prev_text_ck = len(merged) - 1 + continue + + merged[prev_text_ck]["text"] = (merged[prev_text_ck].get("text") or "") + (cks[i].get("text") or "") + merged[prev_text_ck]["tk_nums"] = merged[prev_text_ck].get("tk_nums", 0) + cks[i].get("tk_nums", 0) + + return merged, image_idxs + + +def naive_merge_docx( + sections, + chunk_token_num = 128, + delimiter="\n。;!?", + table_context_size=0, + image_context_size=0,): + + if not sections: + return [], [] + + cks, tables, images = _build_cks(sections, delimiter) + + if table_context_size > 0: + for i in tables: + _add_context(cks, i, table_context_size) + + if image_context_size > 0: + for i in images: + _add_context(cks, i, image_context_size) + + merged_cks, merged_image_idx = _merge_cks(cks, chunk_token_num) + + return merged_cks, merged_image_idx def extract_between(text: str, start_tag: str, end_tag: str) -> list[str]: diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py index 360d1c959..d39789841 100644 --- a/rag/svr/task_executor.py +++ b/rag/svr/task_executor.py @@ -1127,7 +1127,7 @@ async def do_handle_task(task): if has_canceled(task_id): try: exists = await asyncio.to_thread( - settings.docStoreConn.indexExist, + settings.docStoreConn.index_exist, search.index_name(task_tenant_id), task_dataset_id, )