Feat: dataflow supports Spreadsheet and Word processor documents (#9996)

### What problem does this PR solve?

Dataflow now supports Spreadsheet and Word processor documents.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Yongteng Lei
2025-09-10 13:02:53 +08:00
committed by GitHub
parent e650f0d368
commit 0d9c1f1c3c
9 changed files with 126 additions and 43 deletions

View File

@@ -73,11 +73,13 @@ class Chunker(ProcessBase):
     def _general(self, from_upstream: ChunkerFromUpstream):
         self.callback(random.randint(1, 5) / 100.0, "Start to chunk via `General`.")
-        if from_upstream.output_format in ["markdown", "text"]:
+        if from_upstream.output_format in ["markdown", "text", "html"]:
             if from_upstream.output_format == "markdown":
                 payload = from_upstream.markdown_result
-            else:  # == "text"
+            elif from_upstream.output_format == "text":
                 payload = from_upstream.text_result
+            else:  # == "html"
+                payload = from_upstream.html_result
             if not payload:
                 payload = ""
@@ -90,6 +92,7 @@ class Chunker(ProcessBase):
             )
             return [{"text": c} for c in cks]

+        # json
         sections, section_images = [], []
         for o in from_upstream.json_result or []:
             sections.append((o.get("text", ""), o.get("position_tag", "")))
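
For `html`, `payload` is a `list[str]` rather than a single string, which is why `naive_merge` (touched at the bottom of this PR) widens its `sections` parameter to `str | list`. A minimal sketch of the two call shapes it now has to accept; the import path and chunk parameters are assumptions, not code from this PR:

```python
from rag.nlp import naive_merge  # import path assumed from this repo's layout

# markdown/text upstream: payload is one string
cks = naive_merge("## Title\nsome text ...", chunk_token_num=128)

# html upstream: payload is a list[str], one HTML fragment per table slice
cks = naive_merge(["<table>...</table>", "<table>...</table>"], chunk_token_num=128)
```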

View File

@@ -29,7 +29,7 @@ class ChunkerFromUpstream(BaseModel):
     json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
     markdown_result: str | None = Field(default=None, alias="markdown")
     text_result: str | None = Field(default=None, alias="text")
-    html_result: str | None = Field(default=None, alias="html")
+    html_result: list[str] | None = Field(default=None, alias="html")

     model_config = ConfigDict(populate_by_name=True, extra="forbid")
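
Because the model sets `populate_by_name=True`, callers can use either the alias or the field name. A self-contained sketch, trimmed to the fields visible in this hunk:

```python
from typing import Any

from pydantic import BaseModel, ConfigDict, Field


class ChunkerFromUpstream(BaseModel):
    # Trimmed to the fields shown in this diff; the real model has more.
    json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
    markdown_result: str | None = Field(default=None, alias="markdown")
    text_result: str | None = Field(default=None, alias="text")
    html_result: list[str] | None = Field(default=None, alias="html")

    model_config = ConfigDict(populate_by_name=True, extra="forbid")


# The alias and the field name are both accepted.
a = ChunkerFromUpstream.model_validate({"html": ["<table>...</table>"]})
b = ChunkerFromUpstream(html_result=["<table>...</table>"])
assert a.html_result == b.html_result
```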

View File

@@ -12,6 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import logging
 import random

 import trio
@@ -29,8 +30,18 @@ class ParserParam(ProcessParamBase):
     def __init__(self):
         super().__init__()
         self.allowed_output_format = {
-            "pdf": ["json", "markdown"],
-            "excel": ["json", "markdown", "html"],
+            "pdf": [
+                "json",
+                "markdown",
+            ],
+            "spreadsheet": [
+                "json",
+                "markdown",
+                "html",
+            ],
+            "word": [
+                "json",
+            ],
             "ppt": [],
             "image": [],
             "email": [],
@@ -44,12 +55,25 @@ class ParserParam(ProcessParamBase):
                 "parse_method": "deepdoc",  # deepdoc/plain_text/vlm
                 "vlm_name": "",
                 "lang": "Chinese",
-                "suffix": ["pdf"],
+                "suffix": [
+                    "pdf",
+                ],
                 "output_format": "json",
             },
-            "excel": {
+            "spreadsheet": {
                 "output_format": "html",
-                "suffix": ["xls", "xlsx", "csv"],
+                "suffix": [
+                    "xls",
+                    "xlsx",
+                    "csv",
+                ],
             },
+            "word": {
+                "suffix": [
+                    "doc",
+                    "docx",
+                ],
+                "output_format": "json",
+            },
             "ppt": {},
             "image": {
@@ -76,10 +100,15 @@ class ParserParam(ProcessParamBase):
         pdf_output_format = pdf_config.get("output_format", "")
         self.check_valid_value(pdf_output_format, "PDF output format abnormal.", self.allowed_output_format["pdf"])

-        excel_config = self.setups.get("excel", "")
-        if excel_config:
-            excel_output_format = excel_config.get("output_format", "")
-            self.check_valid_value(excel_output_format, "Excel output format abnormal.", self.allowed_output_format["excel"])
+        spreadsheet_config = self.setups.get("spreadsheet", "")
+        if spreadsheet_config:
+            spreadsheet_output_format = spreadsheet_config.get("output_format", "")
+            self.check_valid_value(spreadsheet_output_format, "Spreadsheet output format abnormal.", self.allowed_output_format["spreadsheet"])
+
+        word_config = self.setups.get("word", "")
+        if word_config:
+            word_output_format = word_config.get("output_format", "")
+            self.check_valid_value(word_output_format, "Word processor document output format abnormal.", self.allowed_output_format["word"])

         image_config = self.setups.get("image", "")
         if image_config:
@@ -93,10 +122,13 @@
 class Parser(ProcessBase):
     component_name = "Parser"

-    def _pdf(self, blob):
+    def _pdf(self, from_upstream: ParserFromUpstream):
         self.callback(random.randint(1, 5) / 100.0, "Start to work on a PDF.")
+        blob = from_upstream.blob
         conf = self._param.setups["pdf"]
         self.set_output("output_format", conf["output_format"])
+
         if conf.get("parse_method") == "deepdoc":
             bboxes = RAGFlowPdfParser().parse_into_bboxes(blob, callback=self.callback)
         elif conf.get("parse_method") == "plain_text":
@@ -110,6 +142,7 @@
         for t, poss in lines:
             pn, x0, x1, top, bott = poss.split(" ")
             bboxes.append({"page_number": int(pn), "x0": float(x0), "x1": float(x1), "top": float(top), "bottom": float(bott), "text": t})
+
         if conf.get("output_format") == "json":
             self.set_output("json", bboxes)
         if conf.get("output_format") == "markdown":
@@ -123,23 +156,53 @@
                 mkdn += b.get("text", "") + "\n"
             self.set_output("markdown", mkdn)

-    def _excel(self, blob):
-        self.callback(random.randint(1, 5) / 100.0, "Start to work on a Excel.")
-        conf = self._param.setups["excel"]
+    def _spreadsheet(self, from_upstream: ParserFromUpstream):
+        self.callback(random.randint(1, 5) / 100.0, "Start to work on a Spreadsheet.")
+        blob = from_upstream.blob
+        conf = self._param.setups["spreadsheet"]
         self.set_output("output_format", conf["output_format"])
-        excel_parser = ExcelParser()
+        print(f"spreadsheet {conf=}", flush=True)
+        spreadsheet_parser = ExcelParser()
         if conf.get("output_format") == "html":
-            html = excel_parser.html(blob, 1000000000)
+            html = spreadsheet_parser.html(blob, 1000000000)
             self.set_output("html", html)
         elif conf.get("output_format") == "json":
-            self.set_output("json", [{"text": txt} for txt in excel_parser(blob) if txt])
+            self.set_output("json", [{"text": txt} for txt in spreadsheet_parser(blob) if txt])
         elif conf.get("output_format") == "markdown":
-            self.set_output("markdown", excel_parser.markdown(blob))
+            self.set_output("markdown", spreadsheet_parser.markdown(blob))
+
+    def _word(self, from_upstream: ParserFromUpstream):
+        from tika import parser as word_parser
+
+        self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word processor document.")
+        blob = from_upstream.blob
+        name = from_upstream.name
+        conf = self._param.setups["word"]
+        self.set_output("output_format", conf["output_format"])
+        print(f"word {conf=}", flush=True)
+
+        doc_parsed = word_parser.from_buffer(blob)
+        sections = []
+        if doc_parsed.get("content"):
+            sections = doc_parsed["content"].split("\n")
+            sections = [{"text": section} for section in sections if section]
+        else:
+            logging.warning(f"tika.parser got empty content from {name}.")
+
+        # json
+        assert conf.get("output_format") == "json", "Word processor document output format must be json."
+        if conf.get("output_format") == "json":
+            self.set_output("json", sections)

     async def _invoke(self, **kwargs):
         function_map = {
             "pdf": self._pdf,
-            "excel": self._excel,
+            "spreadsheet": self._spreadsheet,
+            "word": self._word,
         }
         try:
             from_upstream = ParserFromUpstream.model_validate(kwargs)
@@ -150,5 +213,5 @@ class Parser(ProcessBase):
         for p_type, conf in self._param.setups.items():
             if from_upstream.name.split(".")[-1].lower() not in conf.get("suffix", []):
                 continue
-            await trio.to_thread.run_sync(function_map[p_type], from_upstream.blob)
+            await trio.to_thread.run_sync(function_map[p_type], from_upstream)
             break
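
`_word` delegates extraction to Apache Tika via the `tika` package, which needs a Java runtime and fetches its server JAR on first use. A standalone sketch of the same path, with a hypothetical file name:

```python
import logging

from tika import parser  # pip install tika; requires a Java runtime


def word_to_sections(blob: bytes, name: str) -> list[dict[str, str]]:
    """Mirror _word's extraction: one {"text": ...} dict per non-empty line."""
    doc_parsed = parser.from_buffer(blob)
    if not doc_parsed.get("content"):
        logging.warning(f"tika.parser got empty content from {name}.")
        return []
    return [{"text": line} for line in doc_parsed["content"].split("\n") if line]


with open("example.docx", "rb") as f:  # hypothetical input file
    sections = word_to_sections(f.read(), "example.docx")
```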

View File

@@ -23,13 +23,20 @@
             ],
             "output_format": "json"
         },
-        "excel": {
-            "output_format": "html",
+        "spreadsheet": {
             "suffix": [
                 "xls",
                 "xlsx",
                 "csv"
-            ]
-        }
+            ],
+            "output_format": "html"
+        },
+        "word": {
+            "suffix": [
+                "doc",
+                "docx"
+            ],
+            "output_format": "json"
+        }
     }
 }
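
Given this config, `Parser._invoke` routes a file to its handler purely by extension. A simplified sketch of that dispatch, with the setups dict abbreviated to the suffix lists above:

```python
setups = {
    "pdf": {"suffix": ["pdf"], "output_format": "json"},
    "spreadsheet": {"suffix": ["xls", "xlsx", "csv"], "output_format": "html"},
    "word": {"suffix": ["doc", "docx"], "output_format": "json"},
}


def route(filename: str) -> str | None:
    """Return the parser type whose suffix list matches the file extension."""
    ext = filename.split(".")[-1].lower()
    for p_type, conf in setups.items():
        if ext in conf.get("suffix", []):
            return p_type
    return None


assert route("report.DOCX") == "word"
assert route("table.csv") == "spreadsheet"
```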

View File

@@ -31,7 +31,7 @@ class TokenizerFromUpstream(BaseModel):
     json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
     markdown_result: str | None = Field(default=None, alias="markdown")
     text_result: str | None = Field(default=None, alias="text")
-    html_result: str | None = Field(default=None, alias="html")
+    html_result: list[str] | None = Field(default=None, alias="html")

     model_config = ConfigDict(populate_by_name=True, extra="forbid")

View File

@@ -117,11 +117,13 @@ class Tokenizer(ProcessBase):
                 ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
                 if i % 100 == 99:
                     self.callback(i * 1.0 / len(chunks) / parts)
-        elif from_upstream.output_format in ["markdown", "text"]:
+        elif from_upstream.output_format in ["markdown", "text", "html"]:
             if from_upstream.output_format == "markdown":
                 payload = from_upstream.markdown_result
-            else:  # == "text"
+            elif from_upstream.output_format == "text":
                 payload = from_upstream.text_result
+            else:  # == "html"
+                payload = from_upstream.html_result
             if not payload:
                 return ""

View File

@@ -751,6 +751,8 @@ class SILICONFLOWEmbed(Base):
         token_count = 0
         for i in range(0, len(texts), batch_size):
             texts_batch = texts[i : i + batch_size]
+            texts_batch = [" " if not text.strip() else text for text in texts_batch]
+
             payload = {
                 "model": self.model_name,
                 "input": texts_batch,
@@ -935,7 +937,7 @@ class GiteeEmbed(SILICONFLOWEmbed):
         if not base_url:
             base_url = "https://ai.gitee.com/v1/embeddings"
         super().__init__(key, model_name, base_url)


 class DeepInfraEmbed(OpenAIEmbed):
     _FACTORY_NAME = "DeepInfra"
@@ -951,4 +953,4 @@ class Ai302Embed(Base):
     def __init__(self, key, model_name, base_url="https://api.302.ai/v1/embeddings"):
         if not base_url:
             base_url = "https://api.302.ai/v1/embeddings"
-        super().__init__(key, model_name, base_url)
\ No newline at end of file
+        super().__init__(key, model_name, base_url)
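
The new list comprehension in `SILICONFLOWEmbed` replaces blank batch entries with a single space, presumably because the embedding endpoint rejects empty input strings (my reading; the PR does not state it). The guard in isolation:

```python
texts_batch = ["hello", "", "   ", "world"]
# Swap whitespace-only entries for a single space so the embedding API
# never receives an empty input string (assumed API constraint).
texts_batch = [" " if not text.strip() else text for text in texts_batch]
assert texts_batch == ["hello", " ", " ", "world"]
```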

View File

@@ -518,7 +518,7 @@ def hierarchical_merge(bull, sections, depth):
     return res


-def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?", overlapped_percent=0):
+def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。;!?", overlapped_percent=0):
     from deepdoc.parser.pdf_parser import RAGFlowPdfParser

     if not sections:
         return []
@@ -534,7 +534,7 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。", overl
             pos = ""
         if tnum < 8:
             pos = ""
-        # Ensure that the length of the merged chunk does not exceed chunk_token_num
+        # Ensure that the length of the merged chunk does not exceed chunk_token_num
         if cks[-1] == "" or tk_nums[-1] > chunk_token_num * (100 - overlapped_percent)/100.:
             if cks:
                 overlapped = RAGFlowPdfParser.remove_tag(cks[-1])
@@ -638,10 +638,10 @@ def concat_img(img1, img2):
         return img2
     if not img1 and not img2:
         return None
+    if img1 is img2:
+        return img1

     if isinstance(img1, Image.Image) and isinstance(img2, Image.Image):
         pixel_data1 = img1.tobytes()
         pixel_data2 = img2.tobytes()
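
The added identity check in `concat_img` returns early when both arguments are the same object, skipping the byte-level comparison below it, which copies full pixel buffers. A rough sketch of the guard order; the leading branches are reconstructed from the visible context, not copied from the file:

```python
from PIL import Image


def concat_img(img1: Image.Image | None, img2: Image.Image | None):
    # Leading guards reconstructed from the context lines above.
    if img1 and not img2:
        return img1
    if not img1 and img2:
        return img2
    if not img1 and not img2:
        return None
    if img1 is img2:  # new fast path: same object, nothing to concatenate
        return img1
    # ...the full version compares pixel bytes and stitches the images...
    return img1
```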