Refactor: improve ppt shape order logic (#13054)

### What problem does this PR solve?

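Improve the PPT shape ordering logic: the sort that was duplicated for slide shapes and group shapes is extracted into a shared `__sort_shapes` helper, so shapes inside a group now get the same `None`-safe handling of `top`/`left` that slide-level shapes already had.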

### Type of change

- [x] Refactoring
This commit is contained in:
Stephen Hu
2026-02-09 11:59:24 +08:00
committed by GitHub
parent 0b55d1e860
commit 2ee39f64fe

View File

@ -22,6 +22,16 @@ from pptx import Presentation
class RAGFlowPptParser: class RAGFlowPptParser:
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self._shape_cache = {}
def __sort_shapes(self, shapes):
    """Return ``shapes`` as a new list in approximate reading order.

    Shapes are ordered top-to-bottom, then left-to-right. The vertical
    position is floor-divided by 10 so shapes whose ``top`` values fall
    in the same 10-unit bucket are treated as one row and ordered by
    ``left``. A ``top`` or ``left`` of ``None`` is treated as 0.

    Args:
        shapes: Iterable of objects exposing ``top`` and ``left``
            attributes (each an int or ``None``).

    Returns:
        A new list with the same elements, sorted as described above.
    """
    # The result is deliberately not memoised on ``id(shapes)``: an id is
    # only unique while the object is alive, so a collection created on the
    # fly (presumably the case for a group's ``.shapes`` -- confirm against
    # python-pptx) can be freed and its id reused by another collection,
    # which would make the cache hand back a different collection's order.
    def reading_order(shape):
        top = shape.top if shape.top is not None else 0
        left = shape.left if shape.left is not None else 0
        return (top // 10, left)

    return sorted(shapes, key=reading_order)
def __get_bulleted_text(self, paragraph): def __get_bulleted_text(self, paragraph):
is_bulleted = bool(paragraph._p.xpath("./a:pPr/a:buChar")) or bool(paragraph._p.xpath("./a:pPr/a:buAutoNum")) or bool(paragraph._p.xpath("./a:pPr/a:buBlip")) is_bulleted = bool(paragraph._p.xpath("./a:pPr/a:buChar")) or bool(paragraph._p.xpath("./a:pPr/a:buAutoNum")) or bool(paragraph._p.xpath("./a:pPr/a:buBlip"))
@ -62,7 +72,7 @@ class RAGFlowPptParser:
# Handle group shape # Handle group shape
if shape_type == 6: if shape_type == 6:
texts = [] texts = []
for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)): for p in self.__sort_shapes(shape.shapes):
t = self.__extract(p) t = self.__extract(p)
if t: if t:
texts.append(t) texts.append(t)
@ -86,8 +96,7 @@ class RAGFlowPptParser:
if i >= to_page: if i >= to_page:
break break
texts = [] texts = []
for shape in sorted( for shape in self.__sort_shapes(slide.shapes):
slide.shapes, key=lambda x: ((x.top if x.top is not None else 0) // 10, x.left if x.left is not None else 0)):
txt = self.__extract(shape) txt = self.__extract(shape)
if txt: if txt:
texts.append(txt) texts.append(txt)