mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Perf: ignore concatenation between rows. (#8507)
### What problem does this PR solve?

### Type of change

- [x] Performance Improvement
This commit is contained in:
@ -479,6 +479,9 @@ class RAGFlowPdfParser:
|
|||||||
self.boxes = bxs
|
self.boxes = bxs
|
||||||
|
|
||||||
def _concat_downward(self, concat_between_pages=True):
|
def _concat_downward(self, concat_between_pages=True):
|
||||||
|
self.boxes = Recognizer.sort_Y_firstly(self.boxes, 0)
|
||||||
|
return
|
||||||
|
|
||||||
# count boxes in the same row as a feature
|
# count boxes in the same row as a feature
|
||||||
for i in range(len(self.boxes)):
|
for i in range(len(self.boxes)):
|
||||||
mh = self.mean_height[self.boxes[i]["page_number"] - 1]
|
mh = self.mean_height[self.boxes[i]["page_number"] - 1]
|
||||||
@ -1136,7 +1139,8 @@ class RAGFlowPdfParser:
|
|||||||
need_image, zoomin, return_html, False)
|
need_image, zoomin, return_html, False)
|
||||||
return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls
|
return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls
|
||||||
|
|
||||||
def remove_tag(self, txt):
|
@staticmethod
|
||||||
|
def remove_tag(txt):
|
||||||
return re.sub(r"@@[\t0-9.-]+?##", "", txt)
|
return re.sub(r"@@[\t0-9.-]+?##", "", txt)
|
||||||
|
|
||||||
def crop(self, text, ZM=3, need_position=False):
|
def crop(self, text, ZM=3, need_position=False):
|
||||||
|
|||||||
1639
rag/res/ner.json
1639
rag/res/ner.json
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user