Feat: dataflow supports spreadsheet and word processor documents (#9996)

### What problem does this PR solve?

Dataflow supports spreadsheet and word processor documents.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Yongteng Lei
2025-09-10 13:02:53 +08:00
committed by GitHub
parent e650f0d368
commit 0d9c1f1c3c
9 changed files with 126 additions and 43 deletions

View File

@ -73,11 +73,13 @@ class Chunker(ProcessBase):
def _general(self, from_upstream: ChunkerFromUpstream):
self.callback(random.randint(1, 5) / 100.0, "Start to chunk via `General`.")
if from_upstream.output_format in ["markdown", "text"]:
if from_upstream.output_format in ["markdown", "text", "html"]:
if from_upstream.output_format == "markdown":
payload = from_upstream.markdown_result
else: # == "text"
elif from_upstream.output_format == "text":
payload = from_upstream.text_result
else: # == "html"
payload = from_upstream.html_result
if not payload:
payload = ""
@ -90,6 +92,7 @@ class Chunker(ProcessBase):
)
return [{"text": c} for c in cks]
# json
sections, section_images = [], []
for o in from_upstream.json_result or []:
sections.append((o.get("text", ""), o.get("position_tag", "")))

View File

@ -29,7 +29,7 @@ class ChunkerFromUpstream(BaseModel):
json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
markdown_result: str | None = Field(default=None, alias="markdown")
text_result: str | None = Field(default=None, alias="text")
html_result: str | None = Field(default=None, alias="html")
html_result: list[str] | None = Field(default=None, alias="html")
model_config = ConfigDict(populate_by_name=True, extra="forbid")