Mirror of https://github.com/infiniflow/ragflow.git, synced 2026-02-05 18:15:06 +08:00

Compare commits: 4 commits, 5b5f19cbc1 ... 8de6b97806

| SHA1 |
|---|
| 8de6b97806 |
| e4e0a88053 |
| 7719fd6350 |
| 15ef6dd72f |
In `class Canvas(Graph)`, the final `message_end` event is now assembled into a single payload: the attachment check switches from `tuple` to `dict`, and the reference is attached only when a citation marker was found in the content.

```diff
@@ -534,10 +534,12 @@ class Canvas(Graph):
                 yield decorate("message", {"content": cpn_obj.output("content")})
                 cite = re.search(r"\[ID:[ 0-9]+\]", cpn_obj.output("content"))

-                if isinstance(cpn_obj.output("attachment"), tuple):
-                    yield decorate("message", {"attachment": cpn_obj.output("attachment")})
-                yield decorate("message_end", {"reference": self.get_reference() if cite else None})
+                message_end = {}
+                if isinstance(cpn_obj.output("attachment"), dict):
+                    message_end["attachment"] = cpn_obj.output("attachment")
+                if cite:
+                    message_end["reference"] = self.get_reference()
+                yield decorate("message_end", message_end)

                 while partials:
                     _cpn_obj = self.get_component_obj(partials[0])
```
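For reference, a minimal sketch of how a consumer might fold these streamed events back into one record. It assumes `decorate(event, payload)` yields dicts shaped like `{"event": ..., "data": ...}`; that envelope and the `assemble_answer` helper are assumptions for illustration, not part of the diff.

```python
# Minimal consumer sketch. The {"event": ..., "data": ...} shape is an
# assumption about what decorate() emits; only the field names
# ("content", "attachment", "reference") come from the diff above.
def assemble_answer(events):
    answer = {"content": "", "attachment": None, "reference": None}
    for ev in events:
        data = ev.get("data", {})
        if ev.get("event") == "message":
            answer["content"] += data.get("content") or ""
        elif ev.get("event") == "message_end":
            # Keys are present only when set upstream (a dict attachment,
            # or a citation marker found in the content).
            answer["attachment"] = data.get("attachment")
            answer["reference"] = data.get("reference")
    return answer
```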
In the `UserFillUp` component, file-typed inputs are now resolved through `FileService` before being written to the component outputs:

```diff
@@ -18,6 +18,7 @@ import re
 from functools import partial

 from agent.component.base import ComponentParamBase, ComponentBase
+from api.db.services.file_service import FileService


 class UserFillUpParam(ComponentParamBase):
@@ -63,6 +64,13 @@ class UserFillUp(ComponentBase):
         for k, v in kwargs.get("inputs", {}).items():
             if self.check_if_canceled("UserFillUp processing"):
                 return
+            if isinstance(v, dict) and v.get("type", "").lower().find("file") >= 0:
+                if v.get("optional") and v.get("value", None) is None:
+                    v = None
+                else:
+                    v = FileService.get_files([v["value"]])
+            else:
+                v = v.get("value")
             self.set_output(k, v)

     def thoughts(self) -> str:
```
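The new branch distinguishes file-typed inputs from plain values. A standalone sketch of that normalization, with a hypothetical `fetch_files` standing in for `FileService.get_files` (note `"file" in s` is equivalent to the diff's `s.find("file") >= 0`):

```python
# Standalone sketch of the input normalization above; fetch_files() is
# a hypothetical stand-in for FileService.get_files().
def fetch_files(ids):
    return [{"id": i, "name": f"{i}.bin"} for i in ids]  # stub

def normalize_input(v):
    if isinstance(v, dict) and "file" in v.get("type", "").lower():
        if v.get("optional") and v.get("value") is None:
            return None                   # optional file slot left empty
        return fetch_files([v["value"]])  # resolve the stored file id
    return v.get("value") if isinstance(v, dict) else v

assert normalize_input({"type": "file", "optional": True, "value": None}) is None
assert normalize_input({"type": "line", "value": "hi"}) == "hi"
```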
In the file API (Quart routes), a `download_attachment` endpoint is added, together with the `asyncio` and `CONTENT_TYPE_MAP` imports it needs:

```diff
@@ -14,7 +14,7 @@
 # limitations under the License.
 #

-
+import asyncio
 import pathlib
 import re
 from quart import request, make_response
@@ -29,6 +29,7 @@ from api.db import FileType
 from api.db.services import duplicate_name
 from api.db.services.file_service import FileService
 from api.utils.file_utils import filename_type
+from api.utils.web_utils import CONTENT_TYPE_MAP
 from common import settings
 from common.constants import RetCode
@@ -629,6 +630,19 @@ async def get(tenant_id, file_id):
     except Exception as e:
         return server_error_response(e)


+@manager.route("/file/download/<attachment_id>", methods=["GET"])  # noqa: F821
+@token_required
+async def download_attachment(tenant_id, attachment_id):
+    try:
+        ext = request.args.get("ext", "markdown")
+        data = await asyncio.to_thread(settings.STORAGE_IMPL.get, tenant_id, attachment_id)
+        response = await make_response(data)
+        response.headers.set("Content-Type", CONTENT_TYPE_MAP.get(ext, f"application/{ext}"))
+
+        return response
+
+    except Exception as e:
+        return server_error_response(e)
+
 @manager.route('/file/mv', methods=['POST'])  # noqa: F821
 @token_required
```
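Assuming token auth works like the other `@token_required` routes, calling the new endpoint could look like this; the base URL, mount prefix, API key, and attachment id are all placeholders:

```python
# Hypothetical client call for the new download route; host, prefix,
# key, and id are placeholders, not values from the diff.
import requests

BASE = "http://127.0.0.1:9380"  # placeholder; prefix depends on blueprint mount
resp = requests.get(
    f"{BASE}/file/download/<attachment_id>",
    params={"ext": "pdf"},  # server falls back to "markdown" when omitted
    headers={"Authorization": "Bearer <YOUR_API_KEY>"},
    timeout=30,
)
resp.raise_for_status()
# Content-Type is CONTENT_TYPE_MAP[ext], or application/<ext> as fallback.
with open("attachment.pdf", "wb") as f:
    f.write(resp.content)
```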
In `RAGFlowConnector`, the document-metadata cache loses its size cap: the `_MAX_DOCUMENT_CACHE` constant and the LRU eviction in `_set_cached_document_metadata_by_dataset` are removed.

```diff
@@ -57,7 +57,6 @@ JSON_RESPONSE = True

 class RAGFlowConnector:
     _MAX_DATASET_CACHE = 32
-    _MAX_DOCUMENT_CACHE = 128
     _CACHE_TTL = 300

     _dataset_metadata_cache: OrderedDict[str, tuple[dict, float | int]] = OrderedDict()  # "dataset_id" -> (metadata, expiry_ts)
@@ -116,8 +115,6 @@ class RAGFlowConnector:
     def _set_cached_document_metadata_by_dataset(self, dataset_id, doc_id_meta_list):
         self._document_metadata_cache[dataset_id] = (doc_id_meta_list, self._get_expiry_timestamp())
         self._document_metadata_cache.move_to_end(dataset_id)
-        if len(self._document_metadata_cache) > self._MAX_DOCUMENT_CACHE:
-            self._document_metadata_cache.popitem(last=False)

     def list_datasets(self, page: int = 1, page_size: int = 1000, orderby: str = "create_time", desc: bool = True, id: str | None = None, name: str | None = None):
         res = self._get("/datasets", {"page": page, "page_size": page_size, "orderby": orderby, "desc": desc, "id": id, "name": name})
```
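For context, the dropped pair of lines was the classic `OrderedDict` LRU eviction: `move_to_end` marks an entry fresh and `popitem(last=False)` drops the stalest one; with them gone, the document cache is bounded only by its TTL. A self-contained sketch of the pattern (generic names, not the connector's code):

```python
import time
from collections import OrderedDict

class TTLLRUCache:
    """OrderedDict-based LRU with per-entry expiry; the eviction in
    set() is the pattern the removed lines implemented."""

    def __init__(self, max_size=128, ttl=300):
        self._data = OrderedDict()          # key -> (value, expiry_ts)
        self._max_size, self._ttl = max_size, ttl

    def set(self, key, value):
        self._data[key] = (value, time.time() + self._ttl)
        self._data.move_to_end(key)         # mark as most recently used
        if len(self._data) > self._max_size:
            self._data.popitem(last=False)  # evict least recently used

    def get(self, key):
        item = self._data.get(key)
        if item is None or item[1] < time.time():
            return None                     # missing or expired
        return item[0]
```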
Document metadata is now fetched page by page instead of in a single request, the commented-out metadata fields are dropped, and cache-building failures are logged rather than silently swallowed:

```diff
@@ -240,46 +237,46 @@ class RAGFlowConnector:

             docs = None if force_refresh else self._get_cached_document_metadata_by_dataset(dataset_id)
             if docs is None:
-                docs_res = self._get(f"/datasets/{dataset_id}/documents")
-                docs_data = docs_res.json()
-                if docs_data.get("code") == 0 and docs_data.get("data", {}).get("docs"):
-                    doc_id_meta_list = []
-                    docs = {}
-                    for doc in docs_data["data"]["docs"]:
-                        doc_id = doc.get("id")
-                        if not doc_id:
-                            continue
-                        doc_meta = {
-                            "document_id": doc_id,
-                            "name": doc.get("name", ""),
-                            "location": doc.get("location", ""),
-                            "type": doc.get("type", ""),
-                            "size": doc.get("size"),
-                            "chunk_count": doc.get("chunk_count"),
-                            # "chunk_method": doc.get("chunk_method", ""),
-                            "create_date": doc.get("create_date", ""),
-                            "update_date": doc.get("update_date", ""),
-                            # "process_begin_at": doc.get("process_begin_at", ""),
-                            # "process_duration": doc.get("process_duration"),
-                            # "progress": doc.get("progress"),
-                            # "progress_msg": doc.get("progress_msg", ""),
-                            # "status": doc.get("status", ""),
-                            # "run": doc.get("run", ""),
-                            "token_count": doc.get("token_count"),
-                            # "source_type": doc.get("source_type", ""),
-                            "thumbnail": doc.get("thumbnail", ""),
-                            "dataset_id": doc.get("dataset_id", dataset_id),
-                            "meta_fields": doc.get("meta_fields", {}),
-                            # "parser_config": doc.get("parser_config", {})
-                        }
-                        doc_id_meta_list.append((doc_id, doc_meta))
-                        docs[doc_id] = doc_meta
+                page = 1
+                page_size = 30
+                doc_id_meta_list = []
+                docs = {}
+                while page:
+                    docs_res = self._get(f"/datasets/{dataset_id}/documents?page={page}")
+                    docs_data = docs_res.json()
+                    if docs_data.get("code") == 0 and docs_data.get("data", {}).get("docs"):
+                        for doc in docs_data["data"]["docs"]:
+                            doc_id = doc.get("id")
+                            if not doc_id:
+                                continue
+                            doc_meta = {
+                                "document_id": doc_id,
+                                "name": doc.get("name", ""),
+                                "location": doc.get("location", ""),
+                                "type": doc.get("type", ""),
+                                "size": doc.get("size"),
+                                "chunk_count": doc.get("chunk_count"),
+                                "create_date": doc.get("create_date", ""),
+                                "update_date": doc.get("update_date", ""),
+                                "token_count": doc.get("token_count"),
+                                "thumbnail": doc.get("thumbnail", ""),
+                                "dataset_id": doc.get("dataset_id", dataset_id),
+                                "meta_fields": doc.get("meta_fields", {}),
+                            }
+                            doc_id_meta_list.append((doc_id, doc_meta))
+                            docs[doc_id] = doc_meta
+
+                    page += 1
+                    if docs_data.get("data", {}).get("total", 0) - page * page_size <= 0:
+                        page = None
+
                 self._set_cached_document_metadata_by_dataset(dataset_id, doc_id_meta_list)
                 if docs:
                     document_cache.update(docs)

-        except Exception:
+        except Exception as e:
             # Gracefully handle metadata cache failures
+            logging.error(f"Problem building the document metadata cache: {str(e)}")
             pass

         return document_cache, dataset_cache
```
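The rewritten loop pages through `/datasets/{dataset_id}/documents` until the reported `total` is exhausted; `page_size = 30` looks like an assumption about the API's default page size, since the request never sends it. The pattern in isolation (here the stop check counts pages already fetched, which at exact page boundaries differs from the diff's post-increment `total - page * page_size` test):

```python
# Paginate-until-total pattern in isolation; fetch_page() is a
# hypothetical stand-in for the authenticated GET in the diff.
def fetch_all_documents(fetch_page, page_size=30):
    docs, page = [], 1
    while True:
        data = fetch_page(page)  # -> {"total": int, "docs": [...]}
        docs.extend(data.get("docs", []))
        if data.get("total", 0) <= page * page_size:
            break                # pages fetched so far cover the total
        page += 1
    return docs
```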
In the `chunk` helper `_normalize_section`, the rewrite makes the normalization total over its inputs: bare strings are wrapped instead of raising, over-long tuples are truncated to three fields, and empty position lists no longer break the page-number fix-up.

```diff
@@ -219,23 +219,27 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     )

     def _normalize_section(section):
-        # pad section to length 3: (txt, sec_id, poss)
-        if len(section) == 1:
+        # Pad/normalize to (txt, layout, positions)
+        if not isinstance(section, (list, tuple)):
+            section = (section, "", [])
+        elif len(section) == 1:
             section = (section[0], "", [])
         elif len(section) == 2:
             section = (section[0], "", section[1])
-        elif len(section) != 3:
-            raise ValueError(f"Unexpected section length: {len(section)} (value={section!r})")
+        else:
+            section = (section[0], section[1], section[2])

         txt, layoutno, poss = section
         if isinstance(poss, str):
             poss = pdf_parser.extract_positions(poss)
-        first = poss[0]  # tuple: ([pn], x1, x2, y1, y2)
-        pn = first[0]
-        if isinstance(pn, list):
-            pn = pn[0]  # [pn] -> pn
-        poss[0] = (pn, *first[1:])
+        if poss:
+            first = poss[0]  # tuple: ([pn], x1, x2, y1, y2)
+            pn = first[0]
+            if isinstance(pn, list) and pn:
+                pn = pn[0]  # [pn] -> pn
+            poss[0] = (pn, *first[1:])
+        if not poss:
+            poss = []

         return (txt, layoutno, poss)
```
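A standalone copy of the normalized helper with a few spot checks; the `pdf_parser.extract_positions` branch is omitted because it needs the parser instance:

```python
# Standalone copy of _normalize_section for illustration; the
# pdf_parser.extract_positions() branch is left out.
def normalize_section(section):
    # Pad/normalize to (txt, layout, positions).
    if not isinstance(section, (list, tuple)):
        section = (section, "", [])
    elif len(section) == 1:
        section = (section[0], "", [])
    elif len(section) == 2:
        section = (section[0], "", section[1])
    else:
        section = (section[0], section[1], section[2])

    txt, layoutno, poss = section
    if poss:
        first = poss[0]                 # ([pn], x1, x2, y1, y2)
        pn = first[0]
        if isinstance(pn, list) and pn:
            pn = pn[0]                  # [pn] -> pn
        poss[0] = (pn, *first[1:])
    return (txt, layoutno, poss or [])

assert normalize_section("plain text") == ("plain text", "", [])
assert normalize_section(("t", "fig-1", [])) == ("t", "fig-1", [])
assert normalize_section(("t", [([3], 0, 1, 0, 1)]))[2] == [(3, 0, 1, 0, 1)]
```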