mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-02-10 12:55:06 +08:00
Refactor: improve ppt shape order logic (#13054)
### What problem does this PR solve?

Improve the PPT shape ordering logic.

### Type of change

- [x] Refactoring
This commit is contained in:
@ -22,6 +22,16 @@ from pptx import Presentation
|
||||
class RAGFlowPptParser:
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self._shape_cache = {}
|
||||
|
||||
def __sort_shapes(self, shapes):
|
||||
cache_key = id(shapes)
|
||||
if cache_key not in self._shape_cache:
|
||||
self._shape_cache[cache_key] = sorted(
|
||||
shapes,
|
||||
key=lambda x: ((x.top if x.top is not None else 0) // 10, x.left if x.left is not None else 0)
|
||||
)
|
||||
return self._shape_cache[cache_key]
|
||||
|
||||
def __get_bulleted_text(self, paragraph):
|
||||
is_bulleted = bool(paragraph._p.xpath("./a:pPr/a:buChar")) or bool(paragraph._p.xpath("./a:pPr/a:buAutoNum")) or bool(paragraph._p.xpath("./a:pPr/a:buBlip"))
|
||||
@ -62,7 +72,7 @@ class RAGFlowPptParser:
|
||||
# Handle group shape
|
||||
if shape_type == 6:
|
||||
texts = []
|
||||
for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)):
|
||||
for p in self.__sort_shapes(shape.shapes):
|
||||
t = self.__extract(p)
|
||||
if t:
|
||||
texts.append(t)
|
||||
@ -86,8 +96,7 @@ class RAGFlowPptParser:
|
||||
if i >= to_page:
|
||||
break
|
||||
texts = []
|
||||
for shape in sorted(
|
||||
slide.shapes, key=lambda x: ((x.top if x.top is not None else 0) // 10, x.left if x.left is not None else 0)):
|
||||
for shape in self.__sort_shapes(slide.shapes):
|
||||
txt = self.__extract(shape)
|
||||
if txt:
|
||||
texts.append(txt)
|
||||
|
||||
Reference in New Issue
Block a user