mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 12:32:30 +08:00
Feat: dataflow supports text (#10058)
### What problem does this PR solve? dataflow supports text. ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -45,7 +45,10 @@ class ParserParam(ProcessParamBase):
|
|||||||
"ppt": [],
|
"ppt": [],
|
||||||
"image": [],
|
"image": [],
|
||||||
"email": [],
|
"email": [],
|
||||||
"text": [],
|
"text": [
|
||||||
|
"text",
|
||||||
|
"json"
|
||||||
|
],
|
||||||
"audio": [],
|
"audio": [],
|
||||||
"video": [],
|
"video": [],
|
||||||
}
|
}
|
||||||
@ -84,7 +87,12 @@ class ParserParam(ProcessParamBase):
|
|||||||
"parse_method": "ocr",
|
"parse_method": "ocr",
|
||||||
},
|
},
|
||||||
"email": {},
|
"email": {},
|
||||||
"text": {},
|
"text": {
|
||||||
|
"suffix": [
|
||||||
|
"txt"
|
||||||
|
],
|
||||||
|
"output_format": "json",
|
||||||
|
},
|
||||||
"audio": {},
|
"audio": {},
|
||||||
"video": {},
|
"video": {},
|
||||||
}
|
}
|
||||||
@ -119,6 +127,11 @@ class ParserParam(ProcessParamBase):
|
|||||||
image_parse_method = image_config.get("parse_method", "")
|
image_parse_method = image_config.get("parse_method", "")
|
||||||
self.check_valid_value(image_parse_method.lower(), "Parse method abnormal.", ["ocr"])
|
self.check_valid_value(image_parse_method.lower(), "Parse method abnormal.", ["ocr"])
|
||||||
|
|
||||||
|
text_config = self.setups.get("text", "")
|
||||||
|
if text_config:
|
||||||
|
text_output_format = text_config.get("output_format", "")
|
||||||
|
self.check_valid_value(text_output_format, "Text output format abnormal.", self.allowed_output_format["text"])
|
||||||
|
|
||||||
def get_input_form(self) -> dict[str, dict]:
|
def get_input_form(self) -> dict[str, dict]:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
@ -208,15 +221,13 @@ class Parser(ProcessBase):
|
|||||||
from rag.app.naive import Markdown as naive_markdown_parser
|
from rag.app.naive import Markdown as naive_markdown_parser
|
||||||
from rag.nlp import concat_img
|
from rag.nlp import concat_img
|
||||||
|
|
||||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document")
|
self.callback(random.randint(1, 5) / 100.0, "Start to work on a markdown.")
|
||||||
|
|
||||||
blob = from_upstream.blob
|
blob = from_upstream.blob
|
||||||
name = from_upstream.name
|
name = from_upstream.name
|
||||||
conf = self._param.setups["markdown"]
|
conf = self._param.setups["markdown"]
|
||||||
self.set_output("output_format", conf["output_format"])
|
self.set_output("output_format", conf["output_format"])
|
||||||
|
|
||||||
print("markdown {conf=}", flush=True)
|
|
||||||
|
|
||||||
markdown_parser = naive_markdown_parser()
|
markdown_parser = naive_markdown_parser()
|
||||||
sections, tables = markdown_parser(name, blob, separate_tables=False)
|
sections, tables = markdown_parser(name, blob, separate_tables=False)
|
||||||
|
|
||||||
@ -240,13 +251,33 @@ class Parser(ProcessBase):
|
|||||||
|
|
||||||
self.set_output("json", json_results)
|
self.set_output("json", json_results)
|
||||||
|
|
||||||
|
def _text(self, from_upstream: ParserFromUpstream):
|
||||||
|
from deepdoc.parser.utils import get_text
|
||||||
|
|
||||||
|
self.callback(random.randint(1, 5) / 100.0, "Start to work on a text.")
|
||||||
|
|
||||||
|
blob = from_upstream.blob
|
||||||
|
name = from_upstream.name
|
||||||
|
conf = self._param.setups["text"]
|
||||||
|
self.set_output("output_format", conf["output_format"])
|
||||||
|
|
||||||
|
# parse binary to text
|
||||||
|
text_content = get_text(name, binary=blob)
|
||||||
|
|
||||||
|
if conf.get("output_format") == "json":
|
||||||
|
result = [{"text": text_content}]
|
||||||
|
self.set_output("json", result)
|
||||||
|
else:
|
||||||
|
result = text_content
|
||||||
|
self.set_output("text", result)
|
||||||
|
|
||||||
async def _invoke(self, **kwargs):
|
async def _invoke(self, **kwargs):
|
||||||
function_map = {
|
function_map = {
|
||||||
"pdf": self._pdf,
|
"pdf": self._pdf,
|
||||||
"markdown": self._markdown,
|
"markdown": self._markdown,
|
||||||
"spreadsheet": self._spreadsheet,
|
"spreadsheet": self._spreadsheet,
|
||||||
"word": self._word
|
"word": self._word,
|
||||||
|
"text": self._text,
|
||||||
}
|
}
|
||||||
try:
|
try:
|
||||||
from_upstream = ParserFromUpstream.model_validate(kwargs)
|
from_upstream = ParserFromUpstream.model_validate(kwargs)
|
||||||
|
|||||||
@ -44,9 +44,12 @@
|
|||||||
"markdown"
|
"markdown"
|
||||||
],
|
],
|
||||||
"output_format": "json"
|
"output_format": "json"
|
||||||
|
},
|
||||||
|
"text": {
|
||||||
|
"suffix": ["txt"],
|
||||||
|
"output_format": "json"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"downstream": ["Chunker:0"],
|
"downstream": ["Chunker:0"],
|
||||||
|
|||||||
Reference in New Issue
Block a user