Feat: refine dataflow and initialize dataflow app (#9952)

### What problem does this PR solve?

Refine dataflow and initialize dataflow app.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Yongteng Lei
2025-09-05 18:50:46 +08:00
committed by GitHub
parent 9aa8cfb73a
commit 45f52e85d7
21 changed files with 959 additions and 256 deletions

View File

@ -1,5 +1,5 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -18,12 +18,14 @@ import json
import os
import time
from concurrent.futures import ThreadPoolExecutor
import trio
from api import settings
from rag.flow.pipeline import Pipeline
def print_logs(pipeline):
def print_logs(pipeline: Pipeline):
last_logs = "[]"
while True:
time.sleep(5)
@ -34,16 +36,16 @@ def print_logs(pipeline):
last_logs = logs_str
if __name__ == '__main__':
if __name__ == "__main__":
parser = argparse.ArgumentParser()
dsl_default_path = os.path.join(
os.path.dirname(os.path.realpath(__file__)),
"dsl_examples",
"general_pdf_all.json",
)
parser.add_argument('-s', '--dsl', default=dsl_default_path, help="input dsl", action='store', required=True)
parser.add_argument('-d', '--doc_id', default=False, help="Document ID", action='store', required=True)
parser.add_argument('-t', '--tenant_id', default=False, help="Tenant ID", action='store', required=True)
parser.add_argument("-s", "--dsl", default=dsl_default_path, help="input dsl", action="store", required=False)
parser.add_argument("-d", "--doc_id", default=False, help="Document ID", action="store", required=True)
parser.add_argument("-t", "--tenant_id", default=False, help="Tenant ID", action="store", required=True)
args = parser.parse_args()
settings.init_settings()
@ -53,5 +55,7 @@ if __name__ == '__main__':
exe = ThreadPoolExecutor(max_workers=5)
thr = exe.submit(print_logs, pipeline)
# queue_dataflow(dsl=open(args.dsl, "r").read(), tenant_id=args.tenant_id, doc_id=args.doc_id, task_id="xxxx", flow_id="xxx", priority=0)
trio.run(pipeline.run)
thr.result()
thr.result()

View File

@ -1,15 +1,15 @@
{
"components": {
"begin": {
"File": {
"obj":{
"component_name": "File",
"params": {
}
},
"downstream": ["parser:0"],
"downstream": ["Parser:0"],
"upstream": []
},
"parser:0": {
"Parser:0": {
"obj": {
"component_name": "Parser",
"params": {
@ -22,14 +22,22 @@
"pdf"
],
"output_format": "json"
},
"excel": {
"output_format": "html",
"suffix": [
"xls",
"xlsx",
"csv"
]
}
}
}
},
"downstream": ["chunker:0"],
"upstream": ["begin"]
"downstream": ["Chunker:0"],
"upstream": ["File"]
},
"chunker:0": {
"Chunker:0": {
"obj": {
"component_name": "Chunker",
"params": {
@ -37,18 +45,19 @@
"auto_keywords": 5
}
},
"downstream": ["tokenizer:0"],
"upstream": ["chunker:0"]
"downstream": ["Tokenizer:0"],
"upstream": ["Parser:0"]
},
"tokenizer:0": {
"Tokenizer:0": {
"obj": {
"component_name": "Tokenizer",
"params": {
}
},
"downstream": [],
"upstream": ["chunker:0"]
"upstream": ["Chunker:0"]
}
},
"path": []
}
}