diff --git a/deepdoc/parser/markdown_parser.py b/deepdoc/parser/markdown_parser.py
index 0f39b2443..8d0e67022 100644
--- a/deepdoc/parser/markdown_parser.py
+++ b/deepdoc/parser/markdown_parser.py
@@ -17,7 +17,6 @@
 
 import re
 
-import mistune
 from markdown import markdown
 
 
@@ -117,8 +116,6 @@ class MarkdownElementExtractor:
     def __init__(self, markdown_content):
         self.markdown_content = markdown_content
         self.lines = markdown_content.split("\n")
-        self.ast_parser = mistune.create_markdown(renderer="ast")
-        self.ast_nodes = self.ast_parser(markdown_content)
 
     def extract_elements(self):
         """Extract individual elements (headers, code blocks, lists, etc.)"""
diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py
index 74e08e212..8cc56a88c 100644
--- a/deepdoc/parser/pdf_parser.py
+++ b/deepdoc/parser/pdf_parser.py
@@ -15,11 +15,13 @@
 #
 
 import logging
+import math
 import os
 import random
 import re
 import sys
 import threading
+from collections import Counter, defaultdict
 from copy import deepcopy
 from io import BytesIO
 from timeit import default_timer as timer
@@ -349,9 +351,78 @@ class RAGFlowPdfParser:
             self.boxes[i]["top"] += self.page_cum_height[self.boxes[i]["page_number"] - 1]
             self.boxes[i]["bottom"] += self.page_cum_height[self.boxes[i]["page_number"] - 1]
 
-    def _text_merge(self):
+    def _assign_column(self, boxes, zoomin=3):
+        if not boxes:
+            return boxes
+
+        if all("col_id" in b for b in boxes):
+            return boxes
+
+        by_page = defaultdict(list)
+        for b in boxes:
+            by_page[b["page_number"]].append(b)
+
+        page_info = {}  # pg -> dict(page_w, left_edge, cand_cols)
+        counter = Counter()
+
+        for pg, bxs in by_page.items():
+            if not bxs:
+                page_info[pg] = {"page_w": 1.0, "left_edge": 0.0, "cand": 1}
+                counter[1] += 1
+                continue
+
+            if hasattr(self, "page_images") and self.page_images and len(self.page_images) >= pg:
+                page_w = self.page_images[pg - 1].size[0] / max(1, zoomin)
+                left_edge = 0.0
+            else:
+                xs0 = [box["x0"] for box in bxs]
+                xs1 = [box["x1"] for box in bxs]
+                left_edge = float(min(xs0))
+                page_w = max(1.0, float(max(xs1) - left_edge))
+
+            widths = [max(1.0, (box["x1"] - box["x0"])) for box in bxs]
+            median_w = float(np.median(widths)) if widths else 1.0
+
+            raw_cols = int(page_w / max(1.0, median_w))
+
+            # cand = raw_cols if (raw_cols >= 2 and median_w < page_w / raw_cols * 0.8) else 1
+            cand = raw_cols
+
+            page_info[pg] = {"page_w": page_w, "left_edge": left_edge, "cand": cand}
+            counter[cand] += 1
+
+            logging.info(f"[Page {pg}] median_w={median_w:.2f}, page_w={page_w:.2f}, raw_cols={raw_cols}, cand={cand}")
+
+        global_cols = counter.most_common(1)[0][0]
+        logging.info(f"Global column_num decided by majority: {global_cols}")
+
+        for pg, bxs in by_page.items():
+            if not bxs:
+                continue
+
+            page_w = page_info[pg]["page_w"]
+            left_edge = page_info[pg]["left_edge"]
+
+            if global_cols == 1:
+                for box in bxs:
+                    box["col_id"] = 0
+                continue
+
+            for box in bxs:
+                w = box["x1"] - box["x0"]
+                if w >= 0.8 * page_w:
+                    box["col_id"] = 0
+                    continue
+                cx = 0.5 * (box["x0"] + box["x1"])
+                norm_cx = (cx - left_edge) / page_w
+                norm_cx = max(0.0, min(norm_cx, 0.999999))
+                box["col_id"] = int(min(global_cols - 1, norm_cx * global_cols))
+
+        return boxes
+
+    def _text_merge(self, zoomin=3):
         # merge adjusted boxes
-        bxs = self.boxes
+        bxs = self._assign_column(self.boxes, zoomin)
 
         def end_with(b, txt):
             txt = txt.strip()
@@ -367,9 +438,15 @@ class RAGFlowPdfParser:
         while i < len(bxs) - 1:
             b = bxs[i]
             b_ = bxs[i + 1]
+
+            if b["page_number"] != b_["page_number"] or b.get("col_id") != b_.get("col_id"):
+                i += 1
+                continue
+
             if b.get("layoutno", "0") != b_.get("layoutno", "1") or b.get("layout_type", "") in ["table", "figure", "equation"]:
                 i += 1
                 continue
+
             if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 3:
                 # merge
                 bxs[i]["x1"] = b_["x1"]
@@ -379,83 +456,108 @@ class RAGFlowPdfParser:
                 bxs.pop(i + 1)
                 continue
             i += 1
-            continue
-
-            dis_thr = 1
-            dis = b["x1"] - b_["x0"]
-            if b.get("layout_type", "") != "text" or b_.get("layout_type", "") != "text":
-                if end_with(b, ",") or start_with(b_, "(,"):
-                    dis_thr = -8
-                else:
-                    i += 1
-                    continue
-
-            if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 5 and dis >= dis_thr and b["x1"] < b_["x1"]:
-                # merge
-                bxs[i]["x1"] = b_["x1"]
-                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
-                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
-                bxs[i]["text"] += b_["text"]
-                bxs.pop(i + 1)
-                continue
-            i += 1
 
         self.boxes = bxs
 
     def _naive_vertical_merge(self, zoomin=3):
-        import math
-        bxs = Recognizer.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3)
+        bxs = self._assign_column(self.boxes, zoomin)
 
-        column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
-        if not column_width or math.isnan(column_width):
-            column_width = self.mean_width[0]
-        self.column_num = int(self.page_images[0].size[0] / zoomin / column_width)
-        if column_width < self.page_images[0].size[0] / zoomin / self.column_num:
-            logging.info("Multi-column................... {} {}".format(column_width, self.page_images[0].size[0] / zoomin / self.column_num))
-            self.boxes = self.sort_X_by_page(self.boxes, column_width / self.column_num)
+        grouped = defaultdict(list)
+        for b in bxs:
+            grouped[(b["page_number"], b.get("col_id", 0))].append(b)
-        i = 0
-        while i + 1 < len(bxs):
-            b = bxs[i]
-            b_ = bxs[i + 1]
-            if b["page_number"] < b_["page_number"] and re.match(r"[0-9 •一—-]+$", b["text"]):
-                bxs.pop(i)
+        merged_boxes = []
+        for (pg, col), bxs in grouped.items():
+            bxs = sorted(bxs, key=lambda x: (x["top"], x["x0"]))
+            if not bxs:
                 continue
-            if not b["text"].strip():
-                bxs.pop(i)
-                continue
-            concatting_feats = [
-                b["text"].strip()[-1] in ",;:'\",、‘“;:-",
-                len(b["text"].strip()) > 1 and b["text"].strip()[-2] in ",;:'\",‘“、;:",
-                b_["text"].strip() and b_["text"].strip()[0] in "。;?!?”)),,、:",
-            ]
-            # features for not concating
-            feats = [
-                b.get("layoutno", 0) != b_.get("layoutno", 0),
-                b["text"].strip()[-1] in "。?!?",
-                self.is_english and b["text"].strip()[-1] in ".!?",
-                b["page_number"] == b_["page_number"] and b_["top"] - b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
-                b["page_number"] < b_["page_number"] and abs(b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4,
-            ]
-            # split features
-            detach_feats = [b["x1"] < b_["x0"], b["x0"] > b_["x1"]]
-            if (any(feats) and not any(concatting_feats)) or any(detach_feats):
-                logging.debug(
-                    "{} {} {} {}".format(
-                        b["text"],
-                        b_["text"],
-                        any(feats),
-                        any(concatting_feats),
+
+            mh = self.mean_height[pg - 1] if self.mean_height else np.median([b["bottom"] - b["top"] for b in bxs]) or 10
+
+            i = 0
+            while i + 1 < len(bxs):
+                b = bxs[i]
+                b_ = bxs[i + 1]
+
+                if b["page_number"] < b_["page_number"] and re.match(r"[0-9 •一—-]+$", b["text"]):
+                    bxs.pop(i)
+                    continue
+
+                if not b["text"].strip():
+                    bxs.pop(i)
+                    continue
+
+                if not b["text"].strip() or b.get("layoutno") != b_.get("layoutno"):
+                    i += 1
+                    continue
+
+                if b_["top"] - b["bottom"] > mh * 1.5:
+                    i += 1
+                    continue
+
+                overlap = max(0, min(b["x1"], b_["x1"]) - max(b["x0"], b_["x0"]))
+                if overlap / max(1, min(b["x1"] - b["x0"], b_["x1"] - b_["x0"])) < 0.3:
+                    i += 1
+                    continue
+
+                concatting_feats = [
+                    b["text"].strip()[-1] in ",;:'\",、‘“;:-",
+                    len(b["text"].strip()) > 1 and b["text"].strip()[-2] in ",;:'\",‘“、;:",
+                    b_["text"].strip() and b_["text"].strip()[0] in "。;?!?”)),,、:",
+                ]
+                # features for not concating
+                feats = [
+                    b.get("layoutno", 0) != b_.get("layoutno", 0),
+                    b["text"].strip()[-1] in "。?!?",
+                    self.is_english and b["text"].strip()[-1] in ".!?",
+                    b["page_number"] == b_["page_number"] and b_["top"] - b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
+                    b["page_number"] < b_["page_number"] and abs(b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4,
+                ]
+                # split features
+                detach_feats = [b["x1"] < b_["x0"], b["x0"] > b_["x1"]]
+                if (any(feats) and not any(concatting_feats)) or any(detach_feats):
+                    logging.debug(
+                        "{} {} {} {}".format(
+                            b["text"],
+                            b_["text"],
+                            any(feats),
+                            any(concatting_feats),
+                        )
                     )
-                )
-                i += 1
-                continue
-            # merge up and down
-            b["bottom"] = b_["bottom"]
-            b["text"] += b_["text"]
-            b["x0"] = min(b["x0"], b_["x0"])
-            b["x1"] = max(b["x1"], b_["x1"])
-            bxs.pop(i + 1)
-        self.boxes = bxs
+                    i += 1
+                    continue
+
+                b["text"] = (b["text"].rstrip() + " " + b_["text"].lstrip()).strip()
+                b["bottom"] = b_["bottom"]
+                b["x0"] = min(b["x0"], b_["x0"])
+                b["x1"] = max(b["x1"], b_["x1"])
+                bxs.pop(i + 1)
+
+            merged_boxes.extend(bxs)
+
+        self.boxes = sorted(merged_boxes, key=lambda x: (x["page_number"], x.get("col_id", 0), x["top"]))
+
+    def _final_reading_order_merge(self, zoomin=3):
+        if not self.boxes:
+            return
+
+        self.boxes = self._assign_column(self.boxes, zoomin=zoomin)
+
+        pages = defaultdict(lambda: defaultdict(list))
+        for b in self.boxes:
+            pg = b["page_number"]
+            col = b.get("col_id", 0)
+            pages[pg][col].append(b)
+
+        for pg in pages:
+            for col in pages[pg]:
+                pages[pg][col].sort(key=lambda x: (x["top"], x["x0"]))
+
+        new_boxes = []
+        for pg in sorted(pages.keys()):
+            for col in sorted(pages[pg].keys()):
+                new_boxes.extend(pages[pg][col])
+
+        self.boxes = new_boxes
 
     def _concat_downward(self, concat_between_pages=True):
         self.boxes = Recognizer.sort_Y_firstly(self.boxes, 0)
@@ -1074,7 +1176,6 @@ class RAGFlowPdfParser:
 
         def insert_table_figures(tbls_or_figs, layout_type):
             def min_rectangle_distance(rect1, rect2):
-                import math
                 pn1, left1, right1, top1, bottom1 = rect1
                 pn2, left2, right2, top2, bottom2 = rect2
                 if right1 >= left2 and right2 >= left1 and bottom1 >= top2 and bottom2 >= top1:
@@ -1091,27 +1192,39 @@ class RAGFlowPdfParser:
                     dy = top1 - bottom2
                 else:
                     dy = 0
-                return math.sqrt(dx*dx + dy*dy)# + (pn2-pn1)*10000
+                return math.sqrt(dx * dx + dy * dy)  # + (pn2-pn1)*10000
 
             for (img, txt), poss in tbls_or_figs:
                 bboxes = [(i, (b["page_number"], b["x0"], b["x1"], b["top"], b["bottom"])) for i, b in enumerate(self.boxes)]
-                dists = [(min_rectangle_distance((pn, left, right, top+self.page_cum_height[pn], bott+self.page_cum_height[pn]), rect),i) for i, rect in bboxes for pn, left, right, top, bott in poss]
+                dists = [
+                    (min_rectangle_distance((pn, left, right, top + self.page_cum_height[pn], bott + self.page_cum_height[pn]), rect), i) for i, rect in bboxes for pn, left, right, top, bott in poss
+                ]
                 min_i = np.argmin(dists, axis=0)[0]
                 min_i, rect = bboxes[dists[min_i][-1]]
                 if isinstance(txt, list):
                     txt = "\n".join(txt)
                 pn, left, right, top, bott = poss[0]
-                if self.boxes[min_i]["bottom"] < top+self.page_cum_height[pn]:
+                if self.boxes[min_i]["bottom"] < top + self.page_cum_height[pn]:
                     min_i += 1
-                self.boxes.insert(min_i, {
-                    "page_number": pn+1, "x0": left, "x1": right, "top": top+self.page_cum_height[pn], "bottom": bott+self.page_cum_height[pn], "layout_type": layout_type, "text": txt, "image": img,
-                    "positions": [[pn+1, int(left), int(right), int(top), int(bott)]]
-                })
+                self.boxes.insert(
+                    min_i,
+                    {
+                        "page_number": pn + 1,
+                        "x0": left,
+                        "x1": right,
+                        "top": top + self.page_cum_height[pn],
+                        "bottom": bott + self.page_cum_height[pn],
+                        "layout_type": layout_type,
+                        "text": txt,
+                        "image": img,
+                        "positions": [[pn + 1, int(left), int(right), int(top), int(bott)]],
+                    },
+                )
 
         for b in self.boxes:
             b["position_tag"] = self._line_tag(b, zoomin)
             b["image"] = self.crop(b["position_tag"], zoomin)
-            b["positions"] = [[pos[0][-1]+1, *pos[1:]] for pos in RAGFlowPdfParser.extract_positions(b["position_tag"])]
+            b["positions"] = [[pos[0][-1] + 1, *pos[1:]] for pos in RAGFlowPdfParser.extract_positions(b["position_tag"])]
 
         insert_table_figures(tbls, "table")
         insert_table_figures(figs, "figure")
diff --git a/rag/app/naive.py b/rag/app/naive.py
index ca253efe1..370b503ad 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -328,7 +328,7 @@ class Pdf(PdfParser):
             callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))
 
         start = timer()
-        self._text_merge()
+        self._text_merge(zoomin=zoomin)
        callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
 
         if separate_tables_figures:
@@ -340,6 +340,7 @@ class Pdf(PdfParser):
             tbls = self._extract_table_figure(True, zoomin, True, True)
             self._naive_vertical_merge()
             self._concat_downward()
+            self._final_reading_order_merge()
             # self._filter_forpages()
         logging.info("layouts cost: {}s".format(timer() - first_start))
         return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls