Mirror of https://github.com/infiniflow/ragflow.git
Synced 2026-01-04 03:25:30 +08:00

Compare commits: 20 commits, v0.22.1 ... 065917bf1c
| SHA1 |
|---|
| 065917bf1c |
| 820934fc77 |
| d3d2ccc76c |
| c8ab9079b3 |
| 0d5589bfda |
| b846a0f547 |
| 69578ebfce |
| 06cef71ba6 |
| d2b1da0e26 |
| 7c6d30f4c8 |
| ea0352ee4a |
| fa5cf10f56 |
| 3fe71ab7dd |
| 9f715d6bc2 |
| 48de3b26ba |
| 273c4bc4d3 |
| 420c97199a |
| ecf0322165 |
| 38234aca53 |
| 1c06ec39ca |
@@ -132,12 +132,12 @@ class Retrieval(ToolBase, ABC):
metas = DocumentService.get_meta_by_kbs(kb_ids)
if self._param.meta_data_filter.get("method") == "auto":
chat_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.CHAT)
filters = gen_meta_filter(chat_mdl, metas, query)
doc_ids.extend(meta_filter(metas, filters))
filters: dict = gen_meta_filter(chat_mdl, metas, query)
doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not doc_ids:
doc_ids = None
elif self._param.meta_data_filter.get("method") == "manual":
filters=self._param.meta_data_filter["manual"]
filters = self._param.meta_data_filter["manual"]
for flt in filters:
pat = re.compile(self.variable_ref_patt)
s = flt["value"]

@@ -165,9 +165,9 @@ class Retrieval(ToolBase, ABC):
out_parts.append(s[last:])
flt["value"] = "".join(out_parts)
doc_ids.extend(meta_filter(metas, filters))
if not doc_ids:
doc_ids = None
doc_ids.extend(meta_filter(metas, filters, self._param.meta_data_filter.get("logic", "and")))
if filters and not doc_ids:
doc_ids = ["-999"]

if self._param.cross_languages:
query = cross_languages(kbs[0].tenant_id, None, query, self._param.cross_languages)
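The updated call sites read the result of `gen_meta_filter` as a dict (`filters["conditions"]`, `filters.get("logic", "and")`) instead of passing it straight to `meta_filter`. A minimal sketch of the assumed shape; only "conditions" and "logic" are actually read by the new code, and the per-condition fields shown here are illustrative, not taken from the source:

```python
# Hypothetical filter object implied by the new call sites.
filters = {
    "logic": "and",  # how per-condition matches are combined ("and" / "or")
    "conditions": [
        {"key": "department", "op": "is", "value": "legal"},   # made-up condition fields
        {"key": "year", "op": ">", "value": "2023"},
    ],
}
# doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
```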
@@ -305,14 +305,14 @@ async def retrieval_test():
metas = DocumentService.get_meta_by_kbs(kb_ids)
if meta_data_filter.get("method") == "auto":
chat_mdl = LLMBundle(current_user.id, LLMType.CHAT, llm_name=search_config.get("chat_id", ""))
filters = gen_meta_filter(chat_mdl, metas, question)
doc_ids.extend(meta_filter(metas, filters))
filters: dict = gen_meta_filter(chat_mdl, metas, question)
doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not doc_ids:
doc_ids = None
elif meta_data_filter.get("method") == "manual":
doc_ids.extend(meta_filter(metas, meta_data_filter["manual"]))
if not doc_ids:
doc_ids = None
doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
if meta_data_filter["manual"] and not doc_ids:
doc_ids = ["-999"]

try:
tenants = UserTenantService.query(user_id=current_user.id)
@@ -159,10 +159,10 @@ async def webhook(tenant_id: str, agent_id: str):
data=False, message=str(e),
code=RetCode.EXCEPTION_ERROR)

def sse():
async def sse():
nonlocal canvas
try:
for ans in canvas.run(query=req.get("query", ""), files=req.get("files", []), user_id=req.get("user_id", tenant_id), webhook_payload=req):
async for ans in canvas.run(query=req.get("query", ""), files=req.get("files", []), user_id=req.get("user_id", tenant_id), webhook_payload=req):
yield "data:" + json.dumps(ans, ensure_ascii=False) + "\n\n"

cvs.dsl = json.loads(str(canvas))
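The inner generator switches from `def`/`for` to `async def`/`async for` because `canvas.run` is now an async generator. A minimal, standalone sketch of the same server-sent-events pattern (names and the error payload are illustrative):

```python
import json

async def sse_stream(run):
    # 'run' is assumed to be an async generator yielding JSON-serializable events,
    # as canvas.run(...) is after this change.
    try:
        async for event in run:
            yield "data:" + json.dumps(event, ensure_ascii=False) + "\n\n"
    except Exception as e:
        yield "data:" + json.dumps({"code": 500, "message": str(e)}, ensure_ascii=False) + "\n\n"
```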
@@ -120,7 +120,7 @@ async def retrieval(tenant_id):
retrieval_setting = req.get("retrieval_setting", {})
similarity_threshold = float(retrieval_setting.get("score_threshold", 0.0))
top = int(retrieval_setting.get("top_k", 1024))
metadata_condition = req.get("metadata_condition", {})
metadata_condition = req.get("metadata_condition", {}) or {}
metas = DocumentService.get_meta_by_kbs([kb_id])

doc_ids = []

@@ -132,7 +132,7 @@ async def retrieval(tenant_id):

embd_mdl = LLMBundle(kb.tenant_id, LLMType.EMBEDDING.value, llm_name=kb.embd_id)
if metadata_condition:
doc_ids.extend(meta_filter(metas, convert_conditions(metadata_condition)))
doc_ids.extend(meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")))
if not doc_ids and metadata_condition:
doc_ids = ["-999"]
ranks = settings.retriever.retrieval(
@@ -1442,9 +1442,11 @@ async def retrieval_test(tenant_id):
if doc_id not in doc_ids_list:
return get_error_data_result(f"The datasets don't own the document {doc_id}")
if not doc_ids:
metadata_condition = req.get("metadata_condition", {})
metadata_condition = req.get("metadata_condition", {}) or {}
metas = DocumentService.get_meta_by_kbs(kb_ids)
doc_ids = meta_filter(metas, convert_conditions(metadata_condition))
doc_ids = meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and"))
if metadata_condition and not doc_ids:
doc_ids = ["-999"]
similarity_threshold = float(req.get("similarity_threshold", 0.2))
vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
top = int(req.get("top_k", 1024))
@@ -428,17 +428,15 @@ async def agents_completion_openai_compatibility(tenant_id, agent_id):
return resp
else:
# For non-streaming, just return the response directly
response = next(
completion_openai(
async for response in completion_openai(
tenant_id,
agent_id,
question,
session_id=req.pop("session_id", req.get("id", "")) or req.get("metadata", {}).get("id", ""),
stream=False,
**req,
)
)
return jsonify(response)
):
return jsonify(response)


@manager.route("/agents/<agent_id>/completions", methods=["POST"]) # noqa: F821
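For the non-streaming branch, `next(...)` no longer works once `completion_openai` is an async generator, which is why the endpoint now iterates with `async for` and returns from inside the loop. The equivalent pattern in isolation, as a hedged sketch (assumes the generator yields exactly one final payload when `stream=False`):

```python
async def first_item(agen):
    # Return the first value produced by an async generator, or None if it yields nothing.
    async for item in agen:
        return item
    return None

# response = await first_item(completion_openai(..., stream=False))  # hypothetical usage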
@@ -448,8 +446,8 @@ async def agent_completions(tenant_id, agent_id):

if req.get("stream", True):

def generate():
for answer in agent_completion(tenant_id=tenant_id, agent_id=agent_id, **req):
async def generate():
async for answer in agent_completion(tenant_id=tenant_id, agent_id=agent_id, **req):
if isinstance(answer, str):
try:
ans = json.loads(answer[5:]) # remove "data:"

@@ -473,7 +471,7 @@ async def agent_completions(tenant_id, agent_id):
full_content = ""
reference = {}
final_ans = ""
for answer in agent_completion(tenant_id=tenant_id, agent_id=agent_id, **req):
async for answer in agent_completion(tenant_id=tenant_id, agent_id=agent_id, **req):
try:
ans = json.loads(answer[5:])
@@ -875,7 +873,7 @@ async def agent_bot_completions(agent_id):
resp.headers.add_header("Content-Type", "text/event-stream; charset=utf-8")
return resp

for answer in agent_completion(objs[0].tenant_id, agent_id, **req):
async for answer in agent_completion(objs[0].tenant_id, agent_id, **req):
return get_result(data=answer)
@@ -977,14 +975,14 @@ async def retrieval_test_embedded():
metas = DocumentService.get_meta_by_kbs(kb_ids)
if meta_data_filter.get("method") == "auto":
chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_name=search_config.get("chat_id", ""))
filters = gen_meta_filter(chat_mdl, metas, question)
doc_ids.extend(meta_filter(metas, filters))
filters: dict = gen_meta_filter(chat_mdl, metas, question)
doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not doc_ids:
doc_ids = None
elif meta_data_filter.get("method") == "manual":
doc_ids.extend(meta_filter(metas, meta_data_filter["manual"]))
if not doc_ids:
doc_ids = None
doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
if meta_data_filter["manual"] and not doc_ids:
doc_ids = ["-999"]

try:
tenants = UserTenantService.query(user_id=tenant_id)
@@ -177,7 +177,7 @@ class UserCanvasService(CommonService):
return True


def completion(tenant_id, agent_id, session_id=None, **kwargs):
async def completion(tenant_id, agent_id, session_id=None, **kwargs):
query = kwargs.get("query", "") or kwargs.get("question", "")
files = kwargs.get("files", [])
inputs = kwargs.get("inputs", {})

@@ -219,10 +219,14 @@ def completion(tenant_id, agent_id, session_id=None, **kwargs):
"id": message_id
})
txt = ""
for ans in canvas.run(query=query, files=files, user_id=user_id, inputs=inputs):
async for ans in canvas.run(query=query, files=files, user_id=user_id, inputs=inputs):
ans["session_id"] = session_id
if ans["event"] == "message":
txt += ans["data"]["content"]
if ans["data"].get("start_to_think", False):
txt += "<think>"
elif ans["data"].get("end_to_think", False):
txt += "</think>"
yield "data:" + json.dumps(ans, ensure_ascii=False) + "\n\n"

conv.message.append({"role": "assistant", "content": txt, "created_at": time.time(), "id": message_id})

@@ -233,7 +237,7 @@ def completion(tenant_id, agent_id, session_id=None, **kwargs):
API4ConversationService.append_message(conv["id"], conv)


def completion_openai(tenant_id, agent_id, question, session_id=None, stream=True, **kwargs):
async def completion_openai(tenant_id, agent_id, question, session_id=None, stream=True, **kwargs):
tiktoken_encoder = tiktoken.get_encoding("cl100k_base")
prompt_tokens = len(tiktoken_encoder.encode(str(question)))
user_id = kwargs.get("user_id", "")

@@ -241,7 +245,7 @@ def completion_openai(tenant_id, agent_id, question, session_id=None, stream=Tru
if stream:
completion_tokens = 0
try:
for ans in completion(
async for ans in completion(
tenant_id=tenant_id,
agent_id=agent_id,
session_id=session_id,

@@ -300,7 +304,7 @@ def completion_openai(tenant_id, agent_id, question, session_id=None, stream=Tru
try:
all_content = ""
reference = {}
for ans in completion(
async for ans in completion(
tenant_id=tenant_id,
agent_id=agent_id,
session_id=session_id,
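Because `completion` now streams events asynchronously, the assistant message is rebuilt on the fly: message chunks are concatenated and reasoning boundaries are marked with `<think>` / `</think>` before the transcript is appended. A reduced sketch of that aggregation step, assuming the same event shape as in the hunk above:

```python
async def collect_answer(events):
    # 'events' is assumed to yield dicts like:
    # {"event": "message", "data": {"content": "...", "start_to_think": bool, "end_to_think": bool}}
    txt = ""
    async for ans in events:
        if ans["event"] != "message":
            continue
        txt += ans["data"]["content"]
        if ans["data"].get("start_to_think", False):
            txt += "<think>"
        elif ans["data"].get("end_to_think", False):
            txt += "</think>"
    return txt
```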
@@ -15,6 +15,7 @@
#
import logging
from datetime import datetime
import os
from typing import Tuple, List

from anthropic import BaseModel

@@ -103,7 +104,8 @@ class SyncLogsService(CommonService):
Knowledgebase.avatar.alias("kb_avatar"),
Connector2Kb.auto_parse,
cls.model.from_beginning.alias("reindex"),
cls.model.status
cls.model.status,
cls.model.update_time
]
if not connector_id:
fields.append(Connector.config)

@@ -116,7 +118,11 @@ class SyncLogsService(CommonService):
if connector_id:
query = query.where(cls.model.connector_id == connector_id)
else:
interval_expr = SQL("INTERVAL `t2`.`refresh_freq` MINUTE")
database_type = os.getenv("DB_TYPE", "mysql")
if "postgres" in database_type.lower():
interval_expr = SQL("make_interval(mins => t2.refresh_freq)")
else:
interval_expr = SQL("INTERVAL `t2`.`refresh_freq` MINUTE")
query = query.where(
Connector.input_type == InputType.POLL,
Connector.status == TaskStatus.SCHEDULE,
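The scheduling query needs a different SQL fragment for the poll interval depending on the backing database, which is what the new `DB_TYPE` branch selects. A minimal sketch of that selection on its own, assuming peewee's `SQL` fragment as used in the hunk above:

```python
import os
from peewee import SQL

def refresh_interval_expr():
    # MySQL/MariaDB accept INTERVAL ... MINUTE; PostgreSQL needs make_interval().
    if "postgres" in os.getenv("DB_TYPE", "mysql").lower():
        return SQL("make_interval(mins => t2.refresh_freq)")
    return SQL("INTERVAL `t2`.`refresh_freq` MINUTE")
```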
@@ -287,7 +287,7 @@ def convert_conditions(metadata_condition):
]


def meta_filter(metas: dict, filters: list[dict]):
def meta_filter(metas: dict, filters: list[dict], logic: str = "and"):
doc_ids = set([])

def filter_out(v2docs, operator, value):

@@ -331,7 +331,10 @@ def meta_filter(metas: dict, filters: list[dict]):
if not doc_ids:
doc_ids = set(ids)
else:
doc_ids = doc_ids & set(ids)
if logic == "and":
doc_ids = doc_ids & set(ids)
else:
doc_ids = doc_ids | set(ids)
if not doc_ids:
return []
return list(doc_ids)
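The new `logic` argument changes only how per-condition matches are merged: "and" keeps intersecting the candidate set, "or" unions it. A reduced sketch of that merge step, with the per-condition matching itself elided:

```python
def combine_doc_ids(per_condition_ids: list[set[str]], logic: str = "and") -> list[str]:
    # per_condition_ids holds one set of matching document ids per filter condition.
    doc_ids: set[str] = set()
    for ids in per_condition_ids:
        if not doc_ids:
            doc_ids = set(ids)          # first condition seeds the candidate set
        elif logic == "and":
            doc_ids &= ids              # intersection: documents must satisfy every condition
        else:
            doc_ids |= ids              # union: documents may satisfy any condition
    return list(doc_ids)
```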
@@ -407,14 +410,15 @@ def chat(dialog, messages, stream=True, **kwargs):
if dialog.meta_data_filter:
metas = DocumentService.get_meta_by_kbs(dialog.kb_ids)
if dialog.meta_data_filter.get("method") == "auto":
filters = gen_meta_filter(chat_mdl, metas, questions[-1])
attachments.extend(meta_filter(metas, filters))
filters: dict = gen_meta_filter(chat_mdl, metas, questions[-1])
attachments.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not attachments:
attachments = None
elif dialog.meta_data_filter.get("method") == "manual":
attachments.extend(meta_filter(metas, dialog.meta_data_filter["manual"]))
if not attachments:
attachments = None
conds = dialog.meta_data_filter["manual"]
attachments.extend(meta_filter(metas, conds, dialog.meta_data_filter.get("logic", "and")))
if conds and not attachments:
attachments = ["-999"]

if prompt_config.get("keyword", False):
questions[-1] += keyword_extraction(chat_mdl, questions[-1])
@@ -778,14 +782,14 @@ def ask(question, kb_ids, tenant_id, chat_llm_name=None, search_config={}):
if meta_data_filter:
metas = DocumentService.get_meta_by_kbs(kb_ids)
if meta_data_filter.get("method") == "auto":
filters = gen_meta_filter(chat_mdl, metas, question)
doc_ids.extend(meta_filter(metas, filters))
filters: dict = gen_meta_filter(chat_mdl, metas, question)
doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not doc_ids:
doc_ids = None
elif meta_data_filter.get("method") == "manual":
doc_ids.extend(meta_filter(metas, meta_data_filter["manual"]))
if not doc_ids:
doc_ids = None
doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
if meta_data_filter["manual"] and not doc_ids:
doc_ids = ["-999"]

kbinfos = retriever.retrieval(
question=question,

@@ -853,14 +857,14 @@ def gen_mindmap(question, kb_ids, tenant_id, search_config={}):
if meta_data_filter:
metas = DocumentService.get_meta_by_kbs(kb_ids)
if meta_data_filter.get("method") == "auto":
filters = gen_meta_filter(chat_mdl, metas, question)
doc_ids.extend(meta_filter(metas, filters))
filters: dict = gen_meta_filter(chat_mdl, metas, question)
doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not doc_ids:
doc_ids = None
elif meta_data_filter.get("method") == "manual":
doc_ids.extend(meta_filter(metas, meta_data_filter["manual"]))
if not doc_ids:
doc_ids = None
doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
if meta_data_filter["manual"] and not doc_ids:
doc_ids = ["-999"]

ranks = settings.retriever.retrieval(
question=question,
@@ -1,38 +1,45 @@
import html
import logging
from collections.abc import Generator
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional
from urllib.parse import urlparse

from retry import retry

from common.data_source.config import (
INDEX_BATCH_SIZE,
DocumentSource, NOTION_CONNECTOR_DISABLE_RECURSIVE_PAGE_LOOKUP
NOTION_CONNECTOR_DISABLE_RECURSIVE_PAGE_LOOKUP,
DocumentSource,
)
from common.data_source.exceptions import (
ConnectorMissingCredentialError,
ConnectorValidationError,
CredentialExpiredError,
InsufficientPermissionsError,
UnexpectedValidationError,
)
from common.data_source.interfaces import (
LoadConnector,
PollConnector,
SecondsSinceUnixEpoch
SecondsSinceUnixEpoch,
)
from common.data_source.models import (
Document,
TextSection, GenerateDocumentsOutput
)
from common.data_source.exceptions import (
ConnectorValidationError,
CredentialExpiredError,
InsufficientPermissionsError,
UnexpectedValidationError, ConnectorMissingCredentialError
)
from common.data_source.models import (
NotionPage,
GenerateDocumentsOutput,
NotionBlock,
NotionSearchResponse
NotionPage,
NotionSearchResponse,
TextSection,
)
from common.data_source.utils import (
rl_requests,
batch_generator,
datetime_from_string,
fetch_notion_data,
filter_pages_by_time,
properties_to_str,
filter_pages_by_time, datetime_from_string
rl_requests,
)
@ -61,11 +68,9 @@ class NotionConnector(LoadConnector, PollConnector):
|
||||
self.recursive_index_enabled = recursive_index_enabled or bool(root_page_id)
|
||||
|
||||
@retry(tries=3, delay=1, backoff=2)
|
||||
def _fetch_child_blocks(
|
||||
self, block_id: str, cursor: Optional[str] = None
|
||||
) -> dict[str, Any] | None:
|
||||
def _fetch_child_blocks(self, block_id: str, cursor: Optional[str] = None) -> dict[str, Any] | None:
|
||||
"""Fetch all child blocks via the Notion API."""
|
||||
logging.debug(f"Fetching children of block with ID '{block_id}'")
|
||||
logging.debug(f"[Notion]: Fetching children of block with ID {block_id}")
|
||||
block_url = f"https://api.notion.com/v1/blocks/{block_id}/children"
|
||||
query_params = {"start_cursor": cursor} if cursor else None
|
||||
|
||||
@ -79,49 +84,42 @@ class NotionConnector(LoadConnector, PollConnector):
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
except Exception as e:
|
||||
if hasattr(e, 'response') and e.response.status_code == 404:
|
||||
logging.error(
|
||||
f"Unable to access block with ID '{block_id}'. "
|
||||
f"This is likely due to the block not being shared with the integration."
|
||||
)
|
||||
if hasattr(e, "response") and e.response.status_code == 404:
|
||||
logging.error(f"[Notion]: Unable to access block with ID {block_id}. This is likely due to the block not being shared with the integration.")
|
||||
return None
|
||||
else:
|
||||
logging.exception(f"Error fetching blocks: {e}")
|
||||
logging.exception(f"[Notion]: Error fetching blocks: {e}")
|
||||
raise
|
||||
|
||||
@retry(tries=3, delay=1, backoff=2)
|
||||
def _fetch_page(self, page_id: str) -> NotionPage:
|
||||
"""Fetch a page from its ID via the Notion API."""
|
||||
logging.debug(f"Fetching page for ID '{page_id}'")
|
||||
logging.debug(f"[Notion]: Fetching page for ID {page_id}")
|
||||
page_url = f"https://api.notion.com/v1/pages/{page_id}"
|
||||
|
||||
try:
|
||||
data = fetch_notion_data(page_url, self.headers, "GET")
|
||||
return NotionPage(**data)
|
||||
except Exception as e:
|
||||
logging.warning(f"Failed to fetch page, trying database for ID '{page_id}': {e}")
|
||||
logging.warning(f"[Notion]: Failed to fetch page, trying database for ID {page_id}: {e}")
|
||||
return self._fetch_database_as_page(page_id)
|
||||
|
||||
@retry(tries=3, delay=1, backoff=2)
|
||||
def _fetch_database_as_page(self, database_id: str) -> NotionPage:
|
||||
"""Attempt to fetch a database as a page."""
|
||||
logging.debug(f"Fetching database for ID '{database_id}' as a page")
|
||||
logging.debug(f"[Notion]: Fetching database for ID {database_id} as a page")
|
||||
database_url = f"https://api.notion.com/v1/databases/{database_id}"
|
||||
|
||||
data = fetch_notion_data(database_url, self.headers, "GET")
|
||||
database_name = data.get("title")
|
||||
database_name = (
|
||||
database_name[0].get("text", {}).get("content") if database_name else None
|
||||
)
|
||||
database_name = database_name[0].get("text", {}).get("content") if database_name else None
|
||||
|
||||
return NotionPage(**data, database_name=database_name)
|
||||
|
||||
@retry(tries=3, delay=1, backoff=2)
|
||||
def _fetch_database(
|
||||
self, database_id: str, cursor: Optional[str] = None
|
||||
) -> dict[str, Any]:
|
||||
def _fetch_database(self, database_id: str, cursor: Optional[str] = None) -> dict[str, Any]:
|
||||
"""Fetch a database from its ID via the Notion API."""
|
||||
logging.debug(f"Fetching database for ID '{database_id}'")
|
||||
logging.debug(f"[Notion]: Fetching database for ID {database_id}")
|
||||
block_url = f"https://api.notion.com/v1/databases/{database_id}/query"
|
||||
body = {"start_cursor": cursor} if cursor else None
|
||||
|
||||
@ -129,17 +127,12 @@ class NotionConnector(LoadConnector, PollConnector):
|
||||
data = fetch_notion_data(block_url, self.headers, "POST", body)
|
||||
return data
|
||||
except Exception as e:
|
||||
if hasattr(e, 'response') and e.response.status_code in [404, 400]:
|
||||
logging.error(
|
||||
f"Unable to access database with ID '{database_id}'. "
|
||||
f"This is likely due to the database not being shared with the integration."
|
||||
)
|
||||
if hasattr(e, "response") and e.response.status_code in [404, 400]:
|
||||
logging.error(f"[Notion]: Unable to access database with ID {database_id}. This is likely due to the database not being shared with the integration.")
|
||||
return {"results": [], "next_cursor": None}
|
||||
raise
|
||||
|
||||
def _read_pages_from_database(
|
||||
self, database_id: str
|
||||
) -> tuple[list[NotionBlock], list[str]]:
|
||||
def _read_pages_from_database(self, database_id: str) -> tuple[list[NotionBlock], list[str]]:
|
||||
"""Returns a list of top level blocks and all page IDs in the database."""
|
||||
result_blocks: list[NotionBlock] = []
|
||||
result_pages: list[str] = []
|
||||
@ -158,10 +151,10 @@ class NotionConnector(LoadConnector, PollConnector):
|
||||
|
||||
if self.recursive_index_enabled:
|
||||
if obj_type == "page":
|
||||
logging.debug(f"Found page with ID '{obj_id}' in database '{database_id}'")
|
||||
logging.debug(f"[Notion]: Found page with ID {obj_id} in database {database_id}")
|
||||
result_pages.append(result["id"])
|
||||
elif obj_type == "database":
|
||||
logging.debug(f"Found database with ID '{obj_id}' in database '{database_id}'")
|
||||
logging.debug(f"[Notion]: Found database with ID {obj_id} in database {database_id}")
|
||||
_, child_pages = self._read_pages_from_database(obj_id)
|
||||
result_pages.extend(child_pages)
|
||||
|
||||
@ -172,44 +165,229 @@ class NotionConnector(LoadConnector, PollConnector):
|
||||
|
||||
return result_blocks, result_pages
|
||||
|
||||
def _read_blocks(self, base_block_id: str) -> tuple[list[NotionBlock], list[str]]:
|
||||
"""Reads all child blocks for the specified block, returns blocks and child page ids."""
|
||||
def _extract_rich_text(self, rich_text_array: list[dict[str, Any]]) -> str:
|
||||
collected_text: list[str] = []
|
||||
for rich_text in rich_text_array:
|
||||
content = ""
|
||||
r_type = rich_text.get("type")
|
||||
|
||||
if r_type == "equation":
|
||||
expr = rich_text.get("equation", {}).get("expression")
|
||||
if expr:
|
||||
content = expr
|
||||
elif r_type == "mention":
|
||||
mention = rich_text.get("mention", {}) or {}
|
||||
mention_type = mention.get("type")
|
||||
mention_value = mention.get(mention_type, {}) if mention_type else {}
|
||||
if mention_type == "date":
|
||||
start = mention_value.get("start")
|
||||
end = mention_value.get("end")
|
||||
if start and end:
|
||||
content = f"{start} - {end}"
|
||||
elif start:
|
||||
content = start
|
||||
elif mention_type in {"page", "database"}:
|
||||
content = mention_value.get("id", rich_text.get("plain_text", ""))
|
||||
elif mention_type == "link_preview":
|
||||
content = mention_value.get("url", rich_text.get("plain_text", ""))
|
||||
else:
|
||||
content = rich_text.get("plain_text", "") or str(mention_value)
|
||||
else:
|
||||
if rich_text.get("plain_text"):
|
||||
content = rich_text["plain_text"]
|
||||
elif "text" in rich_text and rich_text["text"].get("content"):
|
||||
content = rich_text["text"]["content"]
|
||||
|
||||
href = rich_text.get("href")
|
||||
if content and href:
|
||||
content = f"{content} ({href})"
|
||||
|
||||
if content:
|
||||
collected_text.append(content)
|
||||
|
||||
return "".join(collected_text).strip()
|
||||
|
||||
def _build_table_html(self, table_block_id: str) -> str | None:
|
||||
rows: list[str] = []
|
||||
cursor = None
|
||||
while True:
|
||||
data = self._fetch_child_blocks(table_block_id, cursor)
|
||||
if data is None:
|
||||
break
|
||||
|
||||
for result in data["results"]:
|
||||
if result.get("type") != "table_row":
|
||||
continue
|
||||
cells_html: list[str] = []
|
||||
for cell in result["table_row"].get("cells", []):
|
||||
cell_text = self._extract_rich_text(cell)
|
||||
cell_html = html.escape(cell_text) if cell_text else ""
|
||||
cells_html.append(f"<td>{cell_html}</td>")
|
||||
rows.append(f"<tr>{''.join(cells_html)}</tr>")
|
||||
|
||||
if data.get("next_cursor") is None:
|
||||
break
|
||||
cursor = data["next_cursor"]
|
||||
|
||||
if not rows:
|
||||
return None
|
||||
return "<table>\n" + "\n".join(rows) + "\n</table>"
|
||||
|
||||
def _download_file(self, url: str) -> bytes | None:
|
||||
try:
|
||||
response = rl_requests.get(url, timeout=60)
|
||||
response.raise_for_status()
|
||||
return response.content
|
||||
except Exception as exc:
|
||||
logging.warning(f"[Notion]: Failed to download Notion file from {url}: {exc}")
|
||||
return None
|
||||
|
||||
def _extract_file_metadata(self, result_obj: dict[str, Any], block_id: str) -> tuple[str | None, str, str | None]:
|
||||
file_source_type = result_obj.get("type")
|
||||
file_source = result_obj.get(file_source_type, {}) if file_source_type else {}
|
||||
url = file_source.get("url")
|
||||
|
||||
name = result_obj.get("name") or file_source.get("name")
|
||||
if url and not name:
|
||||
parsed_name = Path(urlparse(url).path).name
|
||||
name = parsed_name or f"notion_file_{block_id}"
|
||||
elif not name:
|
||||
name = f"notion_file_{block_id}"
|
||||
|
||||
caption = self._extract_rich_text(result_obj.get("caption", [])) if "caption" in result_obj else None
|
||||
|
||||
return url, name, caption
|
||||
|
||||
def _build_attachment_document(
|
||||
self,
|
||||
block_id: str,
|
||||
url: str,
|
||||
name: str,
|
||||
caption: Optional[str],
|
||||
page_last_edited_time: Optional[str],
|
||||
) -> Document | None:
|
||||
file_bytes = self._download_file(url)
|
||||
if file_bytes is None:
|
||||
return None
|
||||
|
||||
extension = Path(name).suffix or Path(urlparse(url).path).suffix or ".bin"
|
||||
if extension and not extension.startswith("."):
|
||||
extension = f".{extension}"
|
||||
if not extension:
|
||||
extension = ".bin"
|
||||
|
||||
updated_at = datetime_from_string(page_last_edited_time) if page_last_edited_time else datetime.now(timezone.utc)
|
||||
semantic_identifier = caption or name or f"Notion file {block_id}"
|
||||
|
||||
return Document(
|
||||
id=block_id,
|
||||
blob=file_bytes,
|
||||
source=DocumentSource.NOTION,
|
||||
semantic_identifier=semantic_identifier,
|
||||
extension=extension,
|
||||
size_bytes=len(file_bytes),
|
||||
doc_updated_at=updated_at,
|
||||
)
|
||||
|
||||
def _read_blocks(self, base_block_id: str, page_last_edited_time: Optional[str] = None) -> tuple[list[NotionBlock], list[str], list[Document]]:
|
||||
result_blocks: list[NotionBlock] = []
|
||||
child_pages: list[str] = []
|
||||
attachments: list[Document] = []
|
||||
cursor = None
|
||||
|
||||
while True:
|
||||
data = self._fetch_child_blocks(base_block_id, cursor)
|
||||
|
||||
if data is None:
|
||||
return result_blocks, child_pages
|
||||
return result_blocks, child_pages, attachments
|
||||
|
||||
for result in data["results"]:
|
||||
logging.debug(f"Found child block for block with ID '{base_block_id}': {result}")
|
||||
logging.debug(f"[Notion]: Found child block for block with ID {base_block_id}: {result}")
|
||||
result_block_id = result["id"]
|
||||
result_type = result["type"]
|
||||
result_obj = result[result_type]
|
||||
|
||||
if result_type in ["ai_block", "unsupported", "external_object_instance_page"]:
|
||||
logging.warning(f"Skipping unsupported block type '{result_type}'")
|
||||
logging.warning(f"[Notion]: Skipping unsupported block type {result_type}")
|
||||
continue
|
||||
|
||||
if result_type == "table":
|
||||
table_html = self._build_table_html(result_block_id)
|
||||
if table_html:
|
||||
result_blocks.append(
|
||||
NotionBlock(
|
||||
id=result_block_id,
|
||||
text=table_html,
|
||||
prefix="\n\n",
|
||||
)
|
||||
)
|
||||
continue
|
||||
|
||||
if result_type == "equation":
|
||||
expr = result_obj.get("expression")
|
||||
if expr:
|
||||
result_blocks.append(
|
||||
NotionBlock(
|
||||
id=result_block_id,
|
||||
text=expr,
|
||||
prefix="\n",
|
||||
)
|
||||
)
|
||||
continue
|
||||
|
||||
cur_result_text_arr = []
|
||||
if "rich_text" in result_obj:
|
||||
for rich_text in result_obj["rich_text"]:
|
||||
if "text" in rich_text:
|
||||
text = rich_text["text"]["content"]
|
||||
cur_result_text_arr.append(text)
|
||||
text = self._extract_rich_text(result_obj["rich_text"])
|
||||
if text:
|
||||
cur_result_text_arr.append(text)
|
||||
|
||||
if result_type == "bulleted_list_item":
|
||||
if cur_result_text_arr:
|
||||
cur_result_text_arr[0] = f"- {cur_result_text_arr[0]}"
|
||||
else:
|
||||
cur_result_text_arr = ["- "]
|
||||
|
||||
if result_type == "numbered_list_item":
|
||||
if cur_result_text_arr:
|
||||
cur_result_text_arr[0] = f"1. {cur_result_text_arr[0]}"
|
||||
else:
|
||||
cur_result_text_arr = ["1. "]
|
||||
|
||||
if result_type == "to_do":
|
||||
checked = result_obj.get("checked")
|
||||
checkbox_prefix = "[x]" if checked else "[ ]"
|
||||
if cur_result_text_arr:
|
||||
cur_result_text_arr = [f"{checkbox_prefix} {cur_result_text_arr[0]}"] + cur_result_text_arr[1:]
|
||||
else:
|
||||
cur_result_text_arr = [checkbox_prefix]
|
||||
|
||||
if result_type in {"file", "image", "pdf", "video", "audio"}:
|
||||
file_url, file_name, caption = self._extract_file_metadata(result_obj, result_block_id)
|
||||
if file_url:
|
||||
attachment_doc = self._build_attachment_document(
|
||||
block_id=result_block_id,
|
||||
url=file_url,
|
||||
name=file_name,
|
||||
caption=caption,
|
||||
page_last_edited_time=page_last_edited_time,
|
||||
)
|
||||
if attachment_doc:
|
||||
attachments.append(attachment_doc)
|
||||
|
||||
attachment_label = caption or file_name
|
||||
if attachment_label:
|
||||
cur_result_text_arr.append(f"{result_type.capitalize()}: {attachment_label}")
|
||||
|
||||
if result["has_children"]:
|
||||
if result_type == "child_page":
|
||||
child_pages.append(result_block_id)
|
||||
else:
|
||||
logging.debug(f"Entering sub-block: {result_block_id}")
|
||||
subblocks, subblock_child_pages = self._read_blocks(result_block_id)
|
||||
logging.debug(f"Finished sub-block: {result_block_id}")
|
||||
logging.debug(f"[Notion]: Entering sub-block: {result_block_id}")
|
||||
subblocks, subblock_child_pages, subblock_attachments = self._read_blocks(result_block_id, page_last_edited_time)
|
||||
logging.debug(f"[Notion]: Finished sub-block: {result_block_id}")
|
||||
result_blocks.extend(subblocks)
|
||||
child_pages.extend(subblock_child_pages)
|
||||
attachments.extend(subblock_attachments)
|
||||
|
||||
if result_type == "child_database":
|
||||
inner_blocks, inner_child_pages = self._read_pages_from_database(result_block_id)
|
||||
@ -231,7 +409,7 @@ class NotionConnector(LoadConnector, PollConnector):
|
||||
|
||||
cursor = data["next_cursor"]
|
||||
|
||||
return result_blocks, child_pages
|
||||
return result_blocks, child_pages, attachments
|
||||
|
||||
def _read_page_title(self, page: NotionPage) -> Optional[str]:
|
||||
"""Extracts the title from a Notion page."""
|
||||
@ -245,9 +423,7 @@ class NotionConnector(LoadConnector, PollConnector):
|
||||
|
||||
return None
|
||||
|
||||
def _read_pages(
|
||||
self, pages: list[NotionPage]
|
||||
) -> Generator[Document, None, None]:
|
||||
def _read_pages(self, pages: list[NotionPage], start: SecondsSinceUnixEpoch | None = None, end: SecondsSinceUnixEpoch | None = None) -> Generator[Document, None, None]:
|
||||
"""Reads pages for rich text content and generates Documents."""
|
||||
all_child_page_ids: list[str] = []
|
||||
|
||||
@ -255,11 +431,17 @@ class NotionConnector(LoadConnector, PollConnector):
|
||||
if isinstance(page, dict):
|
||||
page = NotionPage(**page)
|
||||
if page.id in self.indexed_pages:
|
||||
logging.debug(f"Already indexed page with ID '{page.id}'. Skipping.")
|
||||
logging.debug(f"[Notion]: Already indexed page with ID {page.id}. Skipping.")
|
||||
continue
|
||||
|
||||
logging.info(f"Reading page with ID '{page.id}', with url {page.url}")
|
||||
page_blocks, child_page_ids = self._read_blocks(page.id)
|
||||
if start is not None and end is not None:
|
||||
page_ts = datetime_from_string(page.last_edited_time).timestamp()
|
||||
if not (page_ts > start and page_ts <= end):
|
||||
logging.debug(f"[Notion]: Skipping page {page.id} outside polling window.")
|
||||
continue
|
||||
|
||||
logging.info(f"[Notion]: Reading page with ID {page.id}, with url {page.url}")
|
||||
page_blocks, child_page_ids, attachment_docs = self._read_blocks(page.id, page.last_edited_time)
|
||||
all_child_page_ids.extend(child_page_ids)
|
||||
self.indexed_pages.add(page.id)
|
||||
|
||||
@ -268,14 +450,12 @@ class NotionConnector(LoadConnector, PollConnector):
|
||||
|
||||
if not page_blocks:
|
||||
if not raw_page_title:
|
||||
logging.warning(f"No blocks OR title found for page with ID '{page.id}'. Skipping.")
|
||||
logging.warning(f"[Notion]: No blocks OR title found for page with ID {page.id}. Skipping.")
|
||||
continue
|
||||
|
||||
text = page_title
|
||||
if page.properties:
|
||||
text += "\n\n" + "\n".join(
|
||||
[f"{key}: {value}" for key, value in page.properties.items()]
|
||||
)
|
||||
text += "\n\n" + "\n".join([f"{key}: {value}" for key, value in page.properties.items()])
|
||||
sections = [TextSection(link=page.url, text=text)]
|
||||
else:
|
||||
sections = [
|
||||
@ -286,45 +466,39 @@ class NotionConnector(LoadConnector, PollConnector):
|
||||
for block in page_blocks
|
||||
]
|
||||
|
||||
blob = ("\n".join([sec.text for sec in sections])).encode("utf-8")
|
||||
joined_text = "\n".join(sec.text for sec in sections)
|
||||
blob = joined_text.encode("utf-8")
|
||||
yield Document(
|
||||
id=page.id,
|
||||
blob=blob,
|
||||
source=DocumentSource.NOTION,
|
||||
semantic_identifier=page_title,
|
||||
extension=".txt",
|
||||
size_bytes=len(blob),
|
||||
doc_updated_at=datetime_from_string(page.last_edited_time)
|
||||
id=page.id, blob=blob, source=DocumentSource.NOTION, semantic_identifier=page_title, extension=".txt", size_bytes=len(blob), doc_updated_at=datetime_from_string(page.last_edited_time)
|
||||
)
|
||||
|
||||
for attachment_doc in attachment_docs:
|
||||
yield attachment_doc
|
||||
|
||||
if self.recursive_index_enabled and all_child_page_ids:
|
||||
for child_page_batch_ids in batch_generator(all_child_page_ids, INDEX_BATCH_SIZE):
|
||||
child_page_batch = [
|
||||
self._fetch_page(page_id)
|
||||
for page_id in child_page_batch_ids
|
||||
if page_id not in self.indexed_pages
|
||||
]
|
||||
yield from self._read_pages(child_page_batch)
|
||||
child_page_batch = [self._fetch_page(page_id) for page_id in child_page_batch_ids if page_id not in self.indexed_pages]
|
||||
yield from self._read_pages(child_page_batch, start, end)
|
||||
|
||||
@retry(tries=3, delay=1, backoff=2)
|
||||
def _search_notion(self, query_dict: dict[str, Any]) -> NotionSearchResponse:
|
||||
"""Search for pages from a Notion database."""
|
||||
logging.debug(f"Searching for pages in Notion with query_dict: {query_dict}")
|
||||
logging.debug(f"[Notion]: Searching for pages in Notion with query_dict: {query_dict}")
|
||||
data = fetch_notion_data("https://api.notion.com/v1/search", self.headers, "POST", query_dict)
|
||||
return NotionSearchResponse(**data)
|
||||
|
||||
def _recursive_load(self) -> Generator[list[Document], None, None]:
|
||||
def _recursive_load(self, start: SecondsSinceUnixEpoch | None = None, end: SecondsSinceUnixEpoch | None = None) -> Generator[list[Document], None, None]:
|
||||
"""Recursively load pages starting from root page ID."""
|
||||
if self.root_page_id is None or not self.recursive_index_enabled:
|
||||
raise RuntimeError("Recursive page lookup is not enabled")
|
||||
|
||||
logging.info(f"Recursively loading pages from Notion based on root page with ID: {self.root_page_id}")
|
||||
logging.info(f"[Notion]: Recursively loading pages from Notion based on root page with ID: {self.root_page_id}")
|
||||
pages = [self._fetch_page(page_id=self.root_page_id)]
|
||||
yield from batch_generator(self._read_pages(pages), self.batch_size)
|
||||
yield from batch_generator(self._read_pages(pages, start, end), self.batch_size)
|
||||
|
||||
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
|
||||
"""Applies integration token to headers."""
|
||||
self.headers["Authorization"] = f'Bearer {credentials["notion_integration_token"]}'
|
||||
self.headers["Authorization"] = f"Bearer {credentials['notion_integration_token']}"
|
||||
return None
|
||||
|
||||
def load_from_state(self) -> GenerateDocumentsOutput:
|
||||
@ -348,12 +522,10 @@ class NotionConnector(LoadConnector, PollConnector):
|
||||
else:
|
||||
break
|
||||
|
||||
def poll_source(
|
||||
self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
|
||||
) -> GenerateDocumentsOutput:
|
||||
def poll_source(self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch) -> GenerateDocumentsOutput:
|
||||
"""Poll Notion for updated pages within a time period."""
|
||||
if self.recursive_index_enabled and self.root_page_id:
|
||||
yield from self._recursive_load()
|
||||
yield from self._recursive_load(start, end)
|
||||
return
|
||||
|
||||
query_dict = {
|
||||
@ -367,7 +539,7 @@ class NotionConnector(LoadConnector, PollConnector):
|
||||
pages = filter_pages_by_time(db_res.results, start, end, "last_edited_time")
|
||||
|
||||
if pages:
|
||||
yield from batch_generator(self._read_pages(pages), self.batch_size)
|
||||
yield from batch_generator(self._read_pages(pages, start, end), self.batch_size)
|
||||
if db_res.has_more:
|
||||
query_dict["start_cursor"] = db_res.next_cursor
|
||||
else:
|
||||
|
||||
@@ -27,6 +27,7 @@ from common.constants import SVR_QUEUE_NAME, Storage
import rag.utils
import rag.utils.es_conn
import rag.utils.infinity_conn
import rag.utils.ob_conn
import rag.utils.opensearch_conn
from rag.utils.azure_sas_conn import RAGFlowAzureSasBlob
from rag.utils.azure_spn_conn import RAGFlowAzureSpnBlob

@@ -103,6 +104,7 @@ INFINITY = {}
AZURE = {}
S3 = {}
MINIO = {}
OB = {}
OSS = {}
OS = {}

@@ -227,7 +229,7 @@ def init_settings():
FEISHU_OAUTH = get_base_config("oauth", {}).get("feishu")
OAUTH_CONFIG = get_base_config("oauth", {})

global DOC_ENGINE, docStoreConn, ES, OS, INFINITY
global DOC_ENGINE, docStoreConn, ES, OB, OS, INFINITY
DOC_ENGINE = os.environ.get("DOC_ENGINE", "elasticsearch")
# DOC_ENGINE = os.environ.get('DOC_ENGINE', "opensearch")
lower_case_doc_engine = DOC_ENGINE.lower()

@@ -240,6 +242,9 @@ def init_settings():
elif lower_case_doc_engine == "opensearch":
OS = get_base_config("os", {})
docStoreConn = rag.utils.opensearch_conn.OSConnection()
elif lower_case_doc_engine == "oceanbase":
OB = get_base_config("oceanbase", {})
docStoreConn = rag.utils.ob_conn.OBConnection()
else:
raise Exception(f"Not supported doc engine: {DOC_ENGINE}")
@@ -35,6 +35,12 @@ def num_tokens_from_string(string: str) -> int:
return 0

def total_token_count_from_response(resp):
"""
Extract token count from LLM response in various formats.

Handles None responses and different response structures from various LLM providers.
Returns 0 if token count cannot be determined.
"""
if resp is None:
return 0

@@ -50,19 +56,19 @@ def total_token_count_from_response(resp):
except Exception:
pass

if 'usage' in resp and 'total_tokens' in resp['usage']:
if isinstance(resp, dict) and 'usage' in resp and 'total_tokens' in resp['usage']:
try:
return resp["usage"]["total_tokens"]
except Exception:
pass

if 'usage' in resp and 'input_tokens' in resp['usage'] and 'output_tokens' in resp['usage']:
if isinstance(resp, dict) and 'usage' in resp and 'input_tokens' in resp['usage'] and 'output_tokens' in resp['usage']:
try:
return resp["usage"]["input_tokens"] + resp["usage"]["output_tokens"]
except Exception:
pass

if 'meta' in resp and 'tokens' in resp['meta'] and 'input_tokens' in resp['meta']['tokens'] and 'output_tokens' in resp['meta']['tokens']:
if isinstance(resp, dict) and 'meta' in resp and 'tokens' in resp['meta'] and 'input_tokens' in resp['meta']['tokens'] and 'output_tokens' in resp['meta']['tokens']:
try:
return resp["meta"]["tokens"]["input_tokens"] + resp["meta"]["tokens"]["output_tokens"]
except Exception:
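The added `isinstance(resp, dict)` guards matter because some providers return response objects rather than dicts; the helper now falls through to the next shape instead of raising on membership tests. A few hand-built usage examples, as a sketch; the dict shapes mirror the branches above, the values are made up:

```python
# usage.total_tokens shape
print(total_token_count_from_response({"usage": {"total_tokens": 42}}))                              # expected 42
# usage.input_tokens / output_tokens shape
print(total_token_count_from_response({"usage": {"input_tokens": 10, "output_tokens": 5}}))          # expected 15
# meta.tokens shape
print(total_token_count_from_response({"meta": {"tokens": {"input_tokens": 7, "output_tokens": 3}}}))  # expected 10
print(total_token_count_from_response(None))                                                         # expected 0
```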
@@ -28,6 +28,14 @@ os:
infinity:
uri: 'localhost:23817'
db_name: 'default_db'
oceanbase:
scheme: 'oceanbase' # set 'mysql' to create connection using mysql config
config:
db_name: 'test'
user: 'root@ragflow'
password: 'infini_rag_flow'
host: 'localhost'
port: 2881
redis:
db: 1
password: 'infini_rag_flow'

@@ -139,5 +147,3 @@ user_default_llm:
# secret_id: 'tencent_secret_id'
# secret_key: 'tencent_secret_key'
# region: 'tencent_region'
# table_result_type: '1'
# markdown_image_response_type: '1'
@ -187,7 +187,7 @@ class DoclingParser(RAGFlowPdfParser):
|
||||
bbox = _BBox(int(pn), bb[0], bb[1], bb[2], bb[3])
|
||||
yield (DoclingContentType.EQUATION.value, text, bbox)
|
||||
|
||||
def _transfer_to_sections(self, doc) -> list[tuple[str, str]]:
|
||||
def _transfer_to_sections(self, doc, parse_method: str) -> list[tuple[str, str]]:
|
||||
sections: list[tuple[str, str]] = []
|
||||
for typ, payload, bbox in self._iter_doc_items(doc):
|
||||
if typ == DoclingContentType.TEXT.value:
|
||||
@ -200,7 +200,12 @@ class DoclingParser(RAGFlowPdfParser):
|
||||
continue
|
||||
|
||||
tag = self._make_line_tag(bbox) if isinstance(bbox,_BBox) else ""
|
||||
sections.append((section, tag))
|
||||
if parse_method == "manual":
|
||||
sections.append((section, typ, tag))
|
||||
elif parse_method == "paper":
|
||||
sections.append((section + tag, typ))
|
||||
else:
|
||||
sections.append((section, tag))
|
||||
return sections
|
||||
|
||||
def cropout_docling_table(self, page_no: int, bbox: tuple[float, float, float, float], zoomin: int = 1):
|
||||
@ -282,7 +287,8 @@ class DoclingParser(RAGFlowPdfParser):
|
||||
output_dir: Optional[str] = None,
|
||||
lang: Optional[str] = None,
|
||||
method: str = "auto",
|
||||
delete_output: bool = True,
|
||||
delete_output: bool = True,
|
||||
parse_method: str = "raw"
|
||||
):
|
||||
|
||||
if not self.check_installation():
|
||||
@ -318,7 +324,7 @@ class DoclingParser(RAGFlowPdfParser):
|
||||
if callback:
|
||||
callback(0.7, f"[Docling] Parsed doc: {getattr(doc, 'num_pages', 'n/a')} pages")
|
||||
|
||||
sections = self._transfer_to_sections(doc)
|
||||
sections = self._transfer_to_sections(doc, parse_method=parse_method)
|
||||
tables = self._transfer_to_tables(doc)
|
||||
|
||||
if callback:
|
||||
|
||||
@ -476,7 +476,7 @@ class MinerUParser(RAGFlowPdfParser):
|
||||
item[key] = str((subdir / item[key]).resolve())
|
||||
return data
|
||||
|
||||
def _transfer_to_sections(self, outputs: list[dict[str, Any]]):
|
||||
def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None):
|
||||
sections = []
|
||||
for output in outputs:
|
||||
match output["type"]:
|
||||
@ -497,7 +497,11 @@ class MinerUParser(RAGFlowPdfParser):
|
||||
case MinerUContentType.DISCARDED:
|
||||
pass
|
||||
|
||||
if section:
|
||||
if section and parse_method == "manual":
|
||||
sections.append((section, output["type"], self._line_tag(output)))
|
||||
elif section and parse_method == "paper":
|
||||
sections.append((section + self._line_tag(output), output["type"]))
|
||||
else:
|
||||
sections.append((section, self._line_tag(output)))
|
||||
return sections
|
||||
|
||||
@ -516,6 +520,7 @@ class MinerUParser(RAGFlowPdfParser):
|
||||
method: str = "auto",
|
||||
server_url: Optional[str] = None,
|
||||
delete_output: bool = True,
|
||||
parse_method: str = "raw"
|
||||
) -> tuple:
|
||||
import shutil
|
||||
|
||||
@ -565,7 +570,8 @@ class MinerUParser(RAGFlowPdfParser):
|
||||
self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
|
||||
if callback:
|
||||
callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
|
||||
return self._transfer_to_sections(outputs), self._transfer_to_tables(outputs)
|
||||
|
||||
return self._transfer_to_sections(outputs, parse_method), self._transfer_to_tables(outputs)
|
||||
finally:
|
||||
if temp_pdf and temp_pdf.exists():
|
||||
try:
|
||||
|
||||
@ -33,6 +33,8 @@ import xgboost as xgb
|
||||
from huggingface_hub import snapshot_download
|
||||
from PIL import Image
|
||||
from pypdf import PdfReader as pdf2_read
|
||||
from sklearn.cluster import KMeans
|
||||
from sklearn.metrics import silhouette_score
|
||||
|
||||
from common.file_utils import get_project_base_directory
|
||||
from common.misc_utils import pip_install_torch
|
||||
@ -353,7 +355,6 @@ class RAGFlowPdfParser:
|
||||
def _assign_column(self, boxes, zoomin=3):
|
||||
if not boxes:
|
||||
return boxes
|
||||
|
||||
if all("col_id" in b for b in boxes):
|
||||
return boxes
|
||||
|
||||
@ -361,61 +362,80 @@ class RAGFlowPdfParser:
|
||||
for b in boxes:
|
||||
by_page[b["page_number"]].append(b)
|
||||
|
||||
page_info = {} # pg -> dict(page_w, left_edge, cand_cols)
|
||||
counter = Counter()
|
||||
page_cols = {}
|
||||
|
||||
for pg, bxs in by_page.items():
|
||||
if not bxs:
|
||||
page_info[pg] = {"page_w": 1.0, "left_edge": 0.0, "cand": 1}
|
||||
counter[1] += 1
|
||||
page_cols[pg] = 1
|
||||
continue
|
||||
|
||||
if hasattr(self, "page_images") and self.page_images and len(self.page_images) >= pg:
|
||||
page_w = self.page_images[pg - 1].size[0] / max(1, zoomin)
|
||||
left_edge = 0.0
|
||||
else:
|
||||
xs0 = [box["x0"] for box in bxs]
|
||||
xs1 = [box["x1"] for box in bxs]
|
||||
left_edge = float(min(xs0))
|
||||
page_w = max(1.0, float(max(xs1) - left_edge))
|
||||
x0s_raw = np.array([b["x0"] for b in bxs], dtype=float)
|
||||
|
||||
widths = [max(1.0, (box["x1"] - box["x0"])) for box in bxs]
|
||||
median_w = float(np.median(widths)) if widths else 1.0
|
||||
min_x0 = np.min(x0s_raw)
|
||||
max_x1 = np.max([b["x1"] for b in bxs])
|
||||
width = max_x1 - min_x0
|
||||
|
||||
raw_cols = int(page_w / max(1.0, median_w))
|
||||
INDENT_TOL = width * 0.12
|
||||
x0s = []
|
||||
for x in x0s_raw:
|
||||
if abs(x - min_x0) < INDENT_TOL:
|
||||
x0s.append([min_x0])
|
||||
else:
|
||||
x0s.append([x])
|
||||
x0s = np.array(x0s, dtype=float)
|
||||
|
||||
max_try = min(4, len(bxs))
|
||||
if max_try < 2:
|
||||
max_try = 1
|
||||
best_k = 1
|
||||
best_score = -1
|
||||
|
||||
# cand = raw_cols if (raw_cols >= 2 and median_w < page_w / raw_cols * 0.8) else 1
|
||||
cand = raw_cols
|
||||
for k in range(1, max_try + 1):
|
||||
km = KMeans(n_clusters=k, n_init="auto")
|
||||
labels = km.fit_predict(x0s)
|
||||
|
||||
page_info[pg] = {"page_w": page_w, "left_edge": left_edge, "cand": cand}
|
||||
counter[cand] += 1
|
||||
centers = np.sort(km.cluster_centers_.flatten())
|
||||
if len(centers) > 1:
|
||||
try:
|
||||
score = silhouette_score(x0s, labels)
|
||||
except ValueError:
|
||||
continue
|
||||
else:
|
||||
score = 0
|
||||
print(f"{k=},{score=}",flush=True)
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best_k = k
|
||||
|
||||
logging.info(f"[Page {pg}] median_w={median_w:.2f}, page_w={page_w:.2f}, raw_cols={raw_cols}, cand={cand}")
|
||||
page_cols[pg] = best_k
|
||||
logging.info(f"[Page {pg}] best_score={best_score:.2f}, best_k={best_k}")
|
||||
|
||||
global_cols = counter.most_common(1)[0][0]
|
||||
|
||||
global_cols = Counter(page_cols.values()).most_common(1)[0][0]
|
||||
logging.info(f"Global column_num decided by majority: {global_cols}")
|
||||
|
||||
|
||||
for pg, bxs in by_page.items():
|
||||
if not bxs:
|
||||
continue
|
||||
k = page_cols[pg]
|
||||
if len(bxs) < k:
|
||||
k = 1
|
||||
x0s = np.array([[b["x0"]] for b in bxs], dtype=float)
|
||||
km = KMeans(n_clusters=k, n_init="auto")
|
||||
labels = km.fit_predict(x0s)
|
||||
|
||||
page_w = page_info[pg]["page_w"]
|
||||
left_edge = page_info[pg]["left_edge"]
|
||||
centers = km.cluster_centers_.flatten()
|
||||
order = np.argsort(centers)
|
||||
|
||||
if global_cols == 1:
|
||||
for box in bxs:
|
||||
box["col_id"] = 0
|
||||
continue
|
||||
remap = {orig: new for new, orig in enumerate(order)}
|
||||
|
||||
for box in bxs:
|
||||
w = box["x1"] - box["x0"]
|
||||
if w >= 0.8 * page_w:
|
||||
box["col_id"] = 0
|
||||
continue
|
||||
cx = 0.5 * (box["x0"] + box["x1"])
|
||||
norm_cx = (cx - left_edge) / page_w
|
||||
norm_cx = max(0.0, min(norm_cx, 0.999999))
|
||||
box["col_id"] = int(min(global_cols - 1, norm_cx * global_cols))
|
||||
for b, lb in zip(bxs, labels):
|
||||
b["col_id"] = remap[lb]
|
||||
|
||||
grouped = defaultdict(list)
|
||||
for b in bxs:
|
||||
grouped[b["col_id"]].append(b)
|
||||
|
||||
return boxes
|
||||
|
||||
@ -1303,7 +1323,10 @@ class RAGFlowPdfParser:
|
||||
|
||||
positions = []
|
||||
for ii, (pns, left, right, top, bottom) in enumerate(poss):
|
||||
right = left + max_width
|
||||
if 0 < ii < len(poss) - 1:
|
||||
right = max(left + 10, right)
|
||||
else:
|
||||
right = left + max_width
|
||||
bottom *= ZM
|
||||
for pn in pns[1:]:
|
||||
if 0 <= pn - 1 < page_count:
|
||||
|
||||
@ -192,12 +192,16 @@ class TencentCloudAPIClient:
|
||||
|
||||
|
||||
class TCADPParser(RAGFlowPdfParser):
|
||||
def __init__(self, secret_id: str = None, secret_key: str = None, region: str = "ap-guangzhou"):
|
||||
def __init__(self, secret_id: str = None, secret_key: str = None, region: str = "ap-guangzhou",
|
||||
table_result_type: str = None, markdown_image_response_type: str = None):
|
||||
super().__init__()
|
||||
|
||||
# First initialize logger
|
||||
self.logger = logging.getLogger(self.__class__.__name__)
|
||||
|
||||
# Log received parameters
|
||||
self.logger.info(f"[TCADP] Initializing with parameters - table_result_type: {table_result_type}, markdown_image_response_type: {markdown_image_response_type}")
|
||||
|
||||
# Priority: read configuration from RAGFlow configuration system (service_conf.yaml)
|
||||
try:
|
||||
tcadp_parser = get_base_config("tcadp_config", {})
|
||||
@ -205,14 +209,30 @@ class TCADPParser(RAGFlowPdfParser):
|
||||
self.secret_id = secret_id or tcadp_parser.get("secret_id")
|
||||
self.secret_key = secret_key or tcadp_parser.get("secret_key")
|
||||
self.region = region or tcadp_parser.get("region", "ap-guangzhou")
|
||||
self.table_result_type = tcadp_parser.get("table_result_type", "1")
|
||||
self.markdown_image_response_type = tcadp_parser.get("markdown_image_response_type", "1")
|
||||
self.logger.info("[TCADP] Configuration read from service_conf.yaml")
|
||||
# Set table_result_type and markdown_image_response_type from config or parameters
|
||||
self.table_result_type = table_result_type if table_result_type is not None else tcadp_parser.get("table_result_type", "1")
|
||||
self.markdown_image_response_type = markdown_image_response_type if markdown_image_response_type is not None else tcadp_parser.get("markdown_image_response_type", "1")
|
||||
|
||||
else:
|
||||
self.logger.error("[TCADP] Please configure tcadp_config in service_conf.yaml first")
|
||||
# If config file is empty, use provided parameters or defaults
|
||||
self.secret_id = secret_id
|
||||
self.secret_key = secret_key
|
||||
self.region = region or "ap-guangzhou"
|
||||
self.table_result_type = table_result_type if table_result_type is not None else "1"
|
||||
self.markdown_image_response_type = markdown_image_response_type if markdown_image_response_type is not None else "1"
|
||||
|
||||
except ImportError:
|
||||
self.logger.info("[TCADP] Configuration module import failed")
|
||||
# If config file is not available, use provided parameters or defaults
|
||||
self.secret_id = secret_id
|
||||
self.secret_key = secret_key
|
||||
self.region = region or "ap-guangzhou"
|
||||
self.table_result_type = table_result_type if table_result_type is not None else "1"
|
||||
self.markdown_image_response_type = markdown_image_response_type if markdown_image_response_type is not None else "1"
|
||||
|
||||
# Log final values
|
||||
self.logger.info(f"[TCADP] Final values - table_result_type: {self.table_result_type}, markdown_image_response_type: {self.markdown_image_response_type}")
|
||||
|
||||
if not self.secret_id or not self.secret_key:
|
||||
raise ValueError("[TCADP] Please set Tencent Cloud API keys, configure tcadp_config in service_conf.yaml")
|
||||
@ -400,6 +420,8 @@ class TCADPParser(RAGFlowPdfParser):
|
||||
"TableResultType": self.table_result_type,
|
||||
"MarkdownImageResponseType": self.markdown_image_response_type
|
||||
}
|
||||
|
||||
self.logger.info(f"[TCADP] API request config - TableResultType: {self.table_result_type}, MarkdownImageResponseType: {self.markdown_image_response_type}")
|
||||
|
||||
result = client.reconstruct_document_sse(
|
||||
file_type=file_type,
|
||||
|
||||
docker/.env

@@ -7,6 +7,7 @@
# Available options:
# - `elasticsearch` (default)
# - `infinity` (https://github.com/infiniflow/infinity)
# - `oceanbase` (https://github.com/oceanbase/oceanbase)
# - `opensearch` (https://github.com/opensearch-project/OpenSearch)
DOC_ENGINE=${DOC_ENGINE:-elasticsearch}

@@ -62,6 +63,27 @@ INFINITY_THRIFT_PORT=23817
INFINITY_HTTP_PORT=23820
INFINITY_PSQL_PORT=5432

# The hostname where the OceanBase service is exposed
OCEANBASE_HOST=oceanbase
# The port used to expose the OceanBase service
OCEANBASE_PORT=2881
# The username for OceanBase
OCEANBASE_USER=root@ragflow
# The password for OceanBase
OCEANBASE_PASSWORD=infini_rag_flow
# The doc database of the OceanBase service to use
OCEANBASE_DOC_DBNAME=ragflow_doc

# OceanBase container configuration
OB_CLUSTER_NAME=${OB_CLUSTER_NAME:-ragflow}
OB_TENANT_NAME=${OB_TENANT_NAME:-ragflow}
OB_SYS_PASSWORD=${OCEANBASE_PASSWORD:-infini_rag_flow}
OB_TENANT_PASSWORD=${OCEANBASE_PASSWORD:-infini_rag_flow}
OB_MEMORY_LIMIT=${OB_MEMORY_LIMIT:-10G}
OB_SYSTEM_MEMORY=${OB_SYSTEM_MEMORY:-2G}
OB_DATAFILE_SIZE=${OB_DATAFILE_SIZE:-20G}
OB_LOG_DISK_SIZE=${OB_LOG_DISK_SIZE:-20G}

# The password for MySQL.
MYSQL_PASSWORD=infini_rag_flow
# The hostname where the MySQL service is exposed

@@ -208,9 +230,16 @@ REGISTER_ENABLED=1
# SANDBOX_MAX_MEMORY=256m # b, k, m, g
# SANDBOX_TIMEOUT=10s # s, m, 1m30s

# Enable DocLing and Mineru
# Enable DocLing
USE_DOCLING=false

# Enable Mineru
USE_MINERU=false
MINERU_EXECUTABLE="$HOME/uv_tools/.venv/bin/mineru"
MINERU_DELETE_OUTPUT=0 # keep output directory
MINERU_BACKEND=pipeline # or another backend you prefer


# pptx support
DOTNET_SYSTEM_GLOBALIZATION_INVARIANT=1
@@ -138,6 +138,15 @@ The [.env](./.env) file contains important environment variables for Docker.
- `password`: The password for MinIO.
- `host`: The MinIO serving IP *and* port inside the Docker container. Defaults to `minio:9000`.

- `oceanbase`
  - `scheme`: The connection scheme. Set to `mysql` to use mysql config, or other values to use config below.
  - `config`:
    - `db_name`: The OceanBase database name.
    - `user`: The username for OceanBase.
    - `password`: The password for OceanBase.
    - `host`: The hostname of the OceanBase service.
    - `port`: The port of OceanBase.

- `oss`
  - `access_key`: The access key ID used to authenticate requests to the OSS service.
  - `secret_key`: The secret access key used to authenticate requests to the OSS service.
@ -72,7 +72,7 @@ services:
|
||||
infinity:
|
||||
profiles:
|
||||
- infinity
|
||||
image: infiniflow/infinity:v0.6.5
|
||||
image: infiniflow/infinity:v0.6.6
|
||||
volumes:
|
||||
- infinity_data:/var/infinity
|
||||
- ./infinity_conf.toml:/infinity_conf.toml
|
||||
@ -96,6 +96,31 @@ services:
|
||||
retries: 120
|
||||
restart: on-failure
|
||||
|
||||
oceanbase:
|
||||
profiles:
|
||||
- oceanbase
|
||||
image: oceanbase/oceanbase-ce:4.4.1.0-100000032025101610
|
||||
volumes:
|
||||
- ./oceanbase/data:/root/ob
|
||||
- ./oceanbase/conf:/root/.obd/cluster
|
||||
- ./oceanbase/init.d:/root/boot/init.d
|
||||
ports:
|
||||
- ${OCEANBASE_PORT:-2881}:2881
|
||||
env_file: .env
|
||||
environment:
|
||||
- MODE=normal
|
||||
- OB_SERVER_IP=127.0.0.1
|
||||
mem_limit: ${MEM_LIMIT}
|
||||
healthcheck:
|
||||
test: [ 'CMD-SHELL', 'obclient -h127.0.0.1 -P2881 -uroot@${OB_TENANT_NAME:-ragflow} -p${OB_TENANT_PASSWORD:-infini_rag_flow} -e "CREATE DATABASE IF NOT EXISTS ${OCEANBASE_DOC_DBNAME:-ragflow_doc};"' ]
|
||||
interval: 10s
|
||||
retries: 30
|
||||
start_period: 30s
|
||||
timeout: 10s
|
||||
networks:
|
||||
- ragflow
|
||||
restart: on-failure
|
||||
|
||||
sandbox-executor-manager:
|
||||
profiles:
|
||||
- sandbox
|
||||
@ -154,7 +179,7 @@ services:
|
||||
|
||||
minio:
|
||||
image: quay.io/minio/minio:RELEASE.2025-06-13T11-33-47Z
|
||||
command: server --console-address ":9001" /data
|
||||
command: ["server", "--console-address", ":9001", "/data"]
|
||||
ports:
|
||||
- ${MINIO_PORT}:9000
|
||||
- ${MINIO_CONSOLE_PORT}:9001
|
||||
@ -176,7 +201,7 @@ services:
|
||||
redis:
|
||||
# swr.cn-north-4.myhuaweicloud.com/ddn-k8s/docker.io/valkey/valkey:8
|
||||
image: valkey/valkey:8
|
||||
command: redis-server --requirepass ${REDIS_PASSWORD} --maxmemory 128mb --maxmemory-policy allkeys-lru
|
||||
command: ["redis-server", "--requirepass", "${REDIS_PASSWORD}", "--maxmemory", "128mb", "--maxmemory-policy", "allkeys-lru"]
|
||||
env_file: .env
|
||||
ports:
|
||||
- ${REDIS_PORT}:6379
|
||||
@ -256,6 +281,8 @@ volumes:
|
||||
driver: local
|
||||
infinity_data:
|
||||
driver: local
|
||||
ob_data:
|
||||
driver: local
|
||||
mysql_data:
|
||||
driver: local
|
||||
minio_data:
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
[general]
|
||||
version = "0.6.5"
|
||||
version = "0.6.6"
|
||||
time_zone = "utc-8"
|
||||
|
||||
[network]
|
||||
|
||||
1
docker/oceanbase/init.d/vec_memory.sql
Normal file
1
docker/oceanbase/init.d/vec_memory.sql
Normal file
@ -0,0 +1 @@
|
||||
ALTER SYSTEM SET ob_vector_memory_limit_percentage = 30;
|
||||
@ -28,6 +28,14 @@ os:
|
||||
infinity:
|
||||
uri: '${INFINITY_HOST:-infinity}:23817'
|
||||
db_name: 'default_db'
|
||||
oceanbase:
|
||||
scheme: 'oceanbase' # set 'mysql' to create connection using mysql config
|
||||
config:
|
||||
db_name: '${OCEANBASE_DOC_DBNAME:-test}'
|
||||
user: '${OCEANBASE_USER:-root@ragflow}'
|
||||
password: '${OCEANBASE_PASSWORD:-infini_rag_flow}'
|
||||
host: '${OCEANBASE_HOST:-oceanbase}'
|
||||
port: ${OCEANBASE_PORT:-2881}
|
||||
redis:
|
||||
db: 1
|
||||
password: '${REDIS_PASSWORD:-infini_rag_flow}'
|
||||
@ -142,5 +150,3 @@ user_default_llm:
|
||||
# secret_id: '${TENCENT_SECRET_ID}'
|
||||
# secret_key: '${TENCENT_SECRET_KEY}'
|
||||
# region: '${TENCENT_REGION}'
|
||||
# table_result_type: '1'
|
||||
# markdown_image_response_type: '1'
|
||||
|
||||
@ -2085,6 +2085,7 @@ curl --request POST \
|
||||
"dataset_ids": ["b2a62730759d11ef987d0242ac120004"],
|
||||
"document_ids": ["77df9ef4759a11ef8bdd0242ac120004"],
|
||||
"metadata_condition": {
|
||||
"logic": "and",
|
||||
"conditions": [
|
||||
{
|
||||
"name": "author",
|
||||
|
||||
@ -96,7 +96,7 @@ ragflow:
|
||||
infinity:
|
||||
image:
|
||||
repository: infiniflow/infinity
|
||||
tag: v0.6.5
|
||||
tag: v0.6.6
|
||||
pullPolicy: IfNotPresent
|
||||
pullSecrets: []
|
||||
storage:
|
||||
|
||||
@ -49,7 +49,7 @@ dependencies = [
|
||||
"html-text==0.6.2",
|
||||
"httpx[socks]>=0.28.1,<0.29.0",
|
||||
"huggingface-hub>=0.25.0,<0.26.0",
|
||||
"infinity-sdk==0.6.5",
|
||||
"infinity-sdk==0.6.6",
|
||||
"infinity-emb>=0.0.66,<0.0.67",
|
||||
"itsdangerous==2.1.2",
|
||||
"json-repair==0.35.0",
|
||||
@ -149,6 +149,7 @@ dependencies = [
|
||||
"captcha>=0.7.1",
|
||||
"pip>=25.2",
|
||||
"pypandoc>=1.16",
|
||||
"pyobvector==0.2.18",
|
||||
]
|
||||
|
||||
[dependency-groups]
|
||||
|
||||
@ -213,6 +213,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
lang = lang,
|
||||
callback = callback,
|
||||
pdf_cls = Pdf,
|
||||
parse_method = "manual",
|
||||
**kwargs
|
||||
)
|
||||
|
||||
@ -225,7 +226,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
elif len(section) != 3:
|
||||
raise ValueError(f"Unexpected section length: {len(section)} (value={section!r})")
|
||||
|
||||
txt, sec_id, poss = section
|
||||
txt, layoutno, poss = section
|
||||
if isinstance(poss, str):
|
||||
poss = pdf_parser.extract_positions(poss)
|
||||
first = poss[0] # tuple: ([pn], x1, x2, y1, y2)
|
||||
@ -235,7 +236,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
pn = pn[0] # [pn] -> pn
|
||||
poss[0] = (pn, *first[1:])
|
||||
|
||||
return (txt, sec_id, poss)
|
||||
return (txt, layoutno, poss)
|
||||
|
||||
|
||||
sections = [_normalize_section(sec) for sec in sections]
|
||||
|
||||
@ -59,6 +59,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
|
||||
mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
|
||||
mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
|
||||
pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api)
|
||||
parse_method = kwargs.get("parse_method", "raw")
|
||||
|
||||
if not pdf_parser.check_installation():
|
||||
callback(-1, "MinerU not found.")
|
||||
@ -72,12 +73,14 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
|
||||
backend=os.environ.get("MINERU_BACKEND", "pipeline"),
|
||||
server_url=os.environ.get("MINERU_SERVER_URL", ""),
|
||||
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
|
||||
parse_method=parse_method
|
||||
)
|
||||
return sections, tables, pdf_parser
|
||||
|
||||
|
||||
def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
|
||||
pdf_parser = DoclingParser()
|
||||
parse_method = kwargs.get("parse_method", "raw")
|
||||
|
||||
if not pdf_parser.check_installation():
|
||||
callback(-1, "Docling not found.")
|
||||
@ -89,6 +92,7 @@ def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
|
||||
callback=callback,
|
||||
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
|
||||
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
|
||||
parse_method=parse_method
|
||||
)
|
||||
return sections, tables, pdf_parser
|
||||
|
||||
@ -116,7 +120,7 @@ def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=No
|
||||
else:
|
||||
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=kwargs.get("layout_recognizer", ""), lang=kwargs.get("lang", "Chinese"))
|
||||
pdf_parser = VisionParser(vision_model=vision_model, **kwargs)
|
||||
|
||||
|
||||
sections, tables = pdf_parser(
|
||||
filename if not binary else binary,
|
||||
from_page=from_page,
|
||||
@ -504,7 +508,7 @@ class Markdown(MarkdownParser):
|
||||
|
||||
return images if images else None
|
||||
|
||||
def __call__(self, filename, binary=None, separate_tables=True,delimiter=None):
|
||||
def __call__(self, filename, binary=None, separate_tables=True, delimiter=None):
|
||||
if binary:
|
||||
encoding = find_codec(binary)
|
||||
txt = binary.decode(encoding, errors="ignore")
|
||||
@ -602,7 +606,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
_SerializedRelationships.load_from_xml = load_from_xml_v2
|
||||
sections, tables = Docx()(filename, binary)
|
||||
|
||||
tables=vision_figure_parser_docx_wrapper(sections=sections,tbls=tables,callback=callback,**kwargs)
|
||||
tables = vision_figure_parser_docx_wrapper(sections=sections, tbls=tables, callback=callback, **kwargs)
|
||||
|
||||
res = tokenize_table(tables, doc, is_english)
|
||||
callback(0.8, "Finish parsing.")
|
||||
@ -653,18 +657,47 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
|
||||
if name in ["tcadp", "docling", "mineru"]:
|
||||
parser_config["chunk_token_num"] = 0
|
||||
|
||||
|
||||
res = tokenize_table(tables, doc, is_english)
|
||||
callback(0.8, "Finish parsing.")
|
||||
|
||||
elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE):
|
||||
callback(0.1, "Start to parse.")
|
||||
excel_parser = ExcelParser()
|
||||
if parser_config.get("html4excel"):
|
||||
sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
|
||||
|
||||
# Check if tcadp_parser is selected for spreadsheet files
|
||||
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
|
||||
if layout_recognizer == "TCADP Parser":
|
||||
table_result_type = parser_config.get("table_result_type", "1")
|
||||
markdown_image_response_type = parser_config.get("markdown_image_response_type", "1")
|
||||
tcadp_parser = TCADPParser(
|
||||
table_result_type=table_result_type,
|
||||
markdown_image_response_type=markdown_image_response_type
|
||||
)
|
||||
if not tcadp_parser.check_installation():
|
||||
callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
|
||||
return res
|
||||
|
||||
# Determine file type based on extension
|
||||
file_type = "XLSX" if re.search(r"\.xlsx?$", filename, re.IGNORECASE) else "CSV"
|
||||
|
||||
sections, tables = tcadp_parser.parse_pdf(
|
||||
filepath=filename,
|
||||
binary=binary,
|
||||
callback=callback,
|
||||
output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
|
||||
file_type=file_type
|
||||
)
|
||||
parser_config["chunk_token_num"] = 0
|
||||
res = tokenize_table(tables, doc, is_english)
|
||||
callback(0.8, "Finish parsing.")
|
||||
else:
|
||||
sections = [(_, "") for _ in excel_parser(binary) if _]
|
||||
parser_config["chunk_token_num"] = 12800
|
||||
# Default DeepDOC parser
|
||||
excel_parser = ExcelParser()
|
||||
if parser_config.get("html4excel"):
|
||||
sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
|
||||
else:
|
||||
sections = [(_, "") for _ in excel_parser(binary) if _]
|
||||
parser_config["chunk_token_num"] = 12800
|
||||
|
||||
elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
|
||||
callback(0.1, "Start to parse.")
|
||||
@ -676,7 +709,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
|
||||
callback(0.1, "Start to parse.")
|
||||
markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
|
||||
sections, tables = markdown_parser(filename, binary, separate_tables=False,delimiter=parser_config.get("delimiter", "\n!?;。;!?"))
|
||||
sections, tables = markdown_parser(filename, binary, separate_tables=False, delimiter=parser_config.get("delimiter", "\n!?;。;!?"))
|
||||
|
||||
try:
|
||||
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
|
||||
|
||||
@ -21,8 +21,10 @@ import re
|
||||
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper
|
||||
from common.constants import ParserType
|
||||
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
|
||||
from deepdoc.parser import PdfParser, PlainParser
|
||||
from deepdoc.parser import PdfParser
|
||||
import numpy as np
|
||||
from rag.app.naive import by_plaintext, PARSERS
|
||||
|
||||
|
||||
class Pdf(PdfParser):
|
||||
def __init__(self):
|
||||
@ -147,19 +149,40 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
"parser_config", {
|
||||
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
|
||||
if re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
|
||||
pdf_parser = PlainParser()
|
||||
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
|
||||
|
||||
if isinstance(layout_recognizer, bool):
|
||||
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
|
||||
|
||||
name = layout_recognizer.strip().lower()
|
||||
pdf_parser = PARSERS.get(name, by_plaintext)
|
||||
callback(0.1, "Start to parse.")
|
||||
|
||||
if name == "deepdoc":
|
||||
pdf_parser = Pdf()
|
||||
paper = pdf_parser(filename if not binary else binary,
|
||||
from_page=from_page, to_page=to_page, callback=callback)
|
||||
else:
|
||||
sections, tables, pdf_parser = pdf_parser(
|
||||
filename=filename,
|
||||
binary=binary,
|
||||
from_page=from_page,
|
||||
to_page=to_page,
|
||||
lang=lang,
|
||||
callback=callback,
|
||||
pdf_cls=Pdf,
|
||||
parse_method="paper",
|
||||
**kwargs
|
||||
)
|
||||
|
||||
paper = {
|
||||
"title": filename,
|
||||
"authors": " ",
|
||||
"abstract": "",
|
||||
"sections": pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page)[0],
|
||||
"tables": []
|
||||
"sections": sections,
|
||||
"tables": tables
|
||||
}
|
||||
else:
|
||||
pdf_parser = Pdf()
|
||||
paper = pdf_parser(filename if not binary else binary,
|
||||
from_page=from_page, to_page=to_page, callback=callback)
|
||||
|
||||
tbls=paper["tables"]
|
||||
tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs)
|
||||
paper["tables"] = tbls
|
||||
|
||||
@ -16,6 +16,7 @@ import io
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
from functools import partial
|
||||
|
||||
import trio
|
||||
@ -83,6 +84,7 @@ class ParserParam(ProcessParamBase):
|
||||
"output_format": "json",
|
||||
},
|
||||
"spreadsheet": {
|
||||
"parse_method": "deepdoc", # deepdoc/tcadp_parser
|
||||
"output_format": "html",
|
||||
"suffix": [
|
||||
"xls",
|
||||
@ -102,8 +104,10 @@ class ParserParam(ProcessParamBase):
|
||||
"output_format": "json",
|
||||
},
|
||||
"slides": {
|
||||
"parse_method": "deepdoc", # deepdoc/tcadp_parser
|
||||
"suffix": [
|
||||
"pptx",
|
||||
"ppt"
|
||||
],
|
||||
"output_format": "json",
|
||||
},
|
||||
@ -245,7 +249,12 @@ class Parser(ProcessBase):
|
||||
bboxes.append(box)
|
||||
elif conf.get("parse_method").lower() == "tcadp parser":
|
||||
# ADP is a document parsing tool using Tencent Cloud API
|
||||
tcadp_parser = TCADPParser()
|
||||
table_result_type = conf.get("table_result_type", "1")
|
||||
markdown_image_response_type = conf.get("markdown_image_response_type", "1")
|
||||
tcadp_parser = TCADPParser(
|
||||
table_result_type=table_result_type,
|
||||
markdown_image_response_type=markdown_image_response_type
|
||||
)
|
||||
sections, _ = tcadp_parser.parse_pdf(
|
||||
filepath=name,
|
||||
binary=blob,
|
||||
@ -301,14 +310,86 @@ class Parser(ProcessBase):
|
||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Spreadsheet.")
|
||||
conf = self._param.setups["spreadsheet"]
|
||||
self.set_output("output_format", conf["output_format"])
|
||||
spreadsheet_parser = ExcelParser()
|
||||
if conf.get("output_format") == "html":
|
||||
htmls = spreadsheet_parser.html(blob, 1000000000)
|
||||
self.set_output("html", htmls[0])
|
||||
elif conf.get("output_format") == "json":
|
||||
self.set_output("json", [{"text": txt} for txt in spreadsheet_parser(blob) if txt])
|
||||
elif conf.get("output_format") == "markdown":
|
||||
self.set_output("markdown", spreadsheet_parser.markdown(blob))
|
||||
|
||||
parse_method = conf.get("parse_method", "deepdoc")
|
||||
|
||||
# Handle TCADP parser
|
||||
if parse_method.lower() == "tcadp parser":
|
||||
table_result_type = conf.get("table_result_type", "1")
|
||||
markdown_image_response_type = conf.get("markdown_image_response_type", "1")
|
||||
tcadp_parser = TCADPParser(
|
||||
table_result_type=table_result_type,
|
||||
markdown_image_response_type=markdown_image_response_type
|
||||
)
|
||||
if not tcadp_parser.check_installation():
|
||||
raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")
|
||||
|
||||
# Determine file type based on extension
|
||||
if re.search(r"\.xlsx?$", name, re.IGNORECASE):
|
||||
file_type = "XLSX"
|
||||
else:
|
||||
file_type = "CSV"
|
||||
|
||||
self.callback(0.2, f"Using TCADP parser for {file_type} file.")
|
||||
sections, tables = tcadp_parser.parse_pdf(
|
||||
filepath=name,
|
||||
binary=blob,
|
||||
callback=self.callback,
|
||||
file_type=file_type,
|
||||
file_start_page=1,
|
||||
file_end_page=1000
|
||||
)
|
||||
|
||||
# Process TCADP parser output based on configured output_format
|
||||
output_format = conf.get("output_format", "html")
|
||||
|
||||
if output_format == "html":
|
||||
# For HTML output, combine sections and tables into HTML
|
||||
html_content = ""
|
||||
for section, position_tag in sections:
|
||||
if section:
|
||||
html_content += section + "\n"
|
||||
for table in tables:
|
||||
if table:
|
||||
html_content += table + "\n"
|
||||
|
||||
self.set_output("html", html_content)
|
||||
|
||||
elif output_format == "json":
|
||||
# For JSON output, create a list of text items
|
||||
result = []
|
||||
# Add sections as text
|
||||
for section, position_tag in sections:
|
||||
if section:
|
||||
result.append({"text": section})
|
||||
# Add tables as text
|
||||
for table in tables:
|
||||
if table:
|
||||
result.append({"text": table})
|
||||
|
||||
self.set_output("json", result)
|
||||
|
||||
elif output_format == "markdown":
|
||||
# For markdown output, combine into markdown
|
||||
md_content = ""
|
||||
for section, position_tag in sections:
|
||||
if section:
|
||||
md_content += section + "\n\n"
|
||||
for table in tables:
|
||||
if table:
|
||||
md_content += table + "\n\n"
|
||||
|
||||
self.set_output("markdown", md_content)
|
||||
else:
|
||||
# Default DeepDOC parser
|
||||
spreadsheet_parser = ExcelParser()
|
||||
if conf.get("output_format") == "html":
|
||||
htmls = spreadsheet_parser.html(blob, 1000000000)
|
||||
self.set_output("html", htmls[0])
|
||||
elif conf.get("output_format") == "json":
|
||||
self.set_output("json", [{"text": txt} for txt in spreadsheet_parser(blob) if txt])
|
||||
elif conf.get("output_format") == "markdown":
|
||||
self.set_output("markdown", spreadsheet_parser.markdown(blob))
|
||||
|
||||
def _word(self, name, blob):
|
||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document")
|
||||
@ -326,22 +407,69 @@ class Parser(ProcessBase):
|
||||
self.set_output("markdown", markdown_text)
|
||||
|
||||
def _slides(self, name, blob):
|
||||
from deepdoc.parser.ppt_parser import RAGFlowPptParser as ppt_parser
|
||||
|
||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on a PowerPoint Document")
|
||||
|
||||
conf = self._param.setups["slides"]
|
||||
self.set_output("output_format", conf["output_format"])
|
||||
|
||||
ppt_parser = ppt_parser()
|
||||
txts = ppt_parser(blob, 0, 100000, None)
|
||||
parse_method = conf.get("parse_method", "deepdoc")
|
||||
|
||||
sections = [{"text": section} for section in txts if section.strip()]
|
||||
# Handle TCADP parser
|
||||
if parse_method.lower() == "tcadp parser":
|
||||
table_result_type = conf.get("table_result_type", "1")
|
||||
markdown_image_response_type = conf.get("markdown_image_response_type", "1")
|
||||
tcadp_parser = TCADPParser(
|
||||
table_result_type=table_result_type,
|
||||
markdown_image_response_type=markdown_image_response_type
|
||||
)
|
||||
if not tcadp_parser.check_installation():
|
||||
raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")
|
||||
|
||||
# json
|
||||
assert conf.get("output_format") == "json", "have to be json for ppt"
|
||||
if conf.get("output_format") == "json":
|
||||
self.set_output("json", sections)
|
||||
# Determine file type based on extension
|
||||
if re.search(r"\.pptx?$", name, re.IGNORECASE):
|
||||
file_type = "PPTX"
|
||||
else:
|
||||
file_type = "PPT"
|
||||
|
||||
self.callback(0.2, f"Using TCADP parser for {file_type} file.")
|
||||
|
||||
sections, tables = tcadp_parser.parse_pdf(
|
||||
filepath=name,
|
||||
binary=blob,
|
||||
callback=self.callback,
|
||||
file_type=file_type,
|
||||
file_start_page=1,
|
||||
file_end_page=1000
|
||||
)
|
||||
|
||||
# Process TCADP parser output - PPT only supports json format
|
||||
output_format = conf.get("output_format", "json")
|
||||
if output_format == "json":
|
||||
# For JSON output, create a list of text items
|
||||
result = []
|
||||
# Add sections as text
|
||||
for section, position_tag in sections:
|
||||
if section:
|
||||
result.append({"text": section})
|
||||
# Add tables as text
|
||||
for table in tables:
|
||||
if table:
|
||||
result.append({"text": table})
|
||||
|
||||
self.set_output("json", result)
|
||||
else:
|
||||
# Default DeepDOC parser (supports .pptx format)
|
||||
from deepdoc.parser.ppt_parser import RAGFlowPptParser as ppt_parser
|
||||
|
||||
ppt_parser = ppt_parser()
|
||||
txts = ppt_parser(blob, 0, 100000, None)
|
||||
|
||||
sections = [{"text": section} for section in txts if section.strip()]
|
||||
|
||||
# json
|
||||
assert conf.get("output_format") == "json", "have to be json for ppt"
|
||||
if conf.get("output_format") == "json":
|
||||
self.set_output("json", sections)
|
||||
|
||||
def _markdown(self, name, blob):
|
||||
from functools import reduce
|
||||
@ -579,6 +707,7 @@ class Parser(ProcessBase):
|
||||
"video": self._video,
|
||||
"email": self._email,
|
||||
}
|
||||
|
||||
try:
|
||||
from_upstream = ParserFromUpstream.model_validate(kwargs)
|
||||
except Exception as e:
|
||||
|
||||
@ -234,7 +234,11 @@ class CoHereRerank(Base):
|
||||
def __init__(self, key, model_name, base_url=None):
|
||||
from cohere import Client
|
||||
|
||||
self.client = Client(api_key=key, base_url=base_url)
|
||||
# Only pass base_url if it's a non-empty string, otherwise use default Cohere API endpoint
|
||||
client_kwargs = {"api_key": key}
|
||||
if base_url and base_url.strip():
|
||||
client_kwargs["base_url"] = base_url
|
||||
self.client = Client(**client_kwargs)
|
||||
self.model_name = model_name.split("___")[0]
|
||||
|
||||
def similarity(self, query: str, texts: list):
|
||||
|
||||
@ -83,6 +83,7 @@ class FulltextQueryer:
|
||||
return txt
|
||||
|
||||
def question(self, txt, tbl="qa", min_match: float = 0.6):
|
||||
original_query = txt
|
||||
txt = FulltextQueryer.add_space_between_eng_zh(txt)
|
||||
txt = re.sub(
|
||||
r"[ :|\r\n\t,,。??/`!!&^%%()\[\]{}<>]+",
|
||||
@ -127,7 +128,7 @@ class FulltextQueryer:
|
||||
q.append(txt)
|
||||
query = " ".join(q)
|
||||
return MatchTextExpr(
|
||||
self.query_fields, query, 100
|
||||
self.query_fields, query, 100, {"original_query": original_query}
|
||||
), keywords
|
||||
|
||||
def need_fine_grained_tokenize(tk):
|
||||
@ -212,7 +213,7 @@ class FulltextQueryer:
|
||||
if not query:
|
||||
query = otxt
|
||||
return MatchTextExpr(
|
||||
self.query_fields, query, 100, {"minimum_should_match": min_match}
|
||||
self.query_fields, query, 100, {"minimum_should_match": min_match, "original_query": original_query}
|
||||
), keywords
|
||||
return None, keywords
|
||||
|
||||
@ -259,6 +260,7 @@ class FulltextQueryer:
|
||||
content_tks = [c.strip() for c in content_tks.strip() if c.strip()]
|
||||
tks_w = self.tw.weights(content_tks, preprocess=False)
|
||||
|
||||
origin_keywords = keywords.copy()
|
||||
keywords = [f'"{k.strip()}"' for k in keywords]
|
||||
for tk, w in sorted(tks_w, key=lambda x: x[1] * -1)[:keywords_topn]:
|
||||
tk_syns = self.syn.lookup(tk)
|
||||
@ -274,4 +276,4 @@ class FulltextQueryer:
|
||||
keywords.append(f"{tk}^{w}")
|
||||
|
||||
return MatchTextExpr(self.query_fields, " ".join(keywords), 100,
|
||||
{"minimum_should_match": min(3, len(keywords) // 10)})
|
||||
{"minimum_should_match": min(3, len(keywords) / 10), "original_query": " ".join(origin_keywords)})
|
||||
|
||||
@ -355,75 +355,102 @@ class Dealer:
|
||||
rag_tokenizer.tokenize(ans).split(),
|
||||
rag_tokenizer.tokenize(inst).split())
|
||||
|
||||
def retrieval(self, question, embd_mdl, tenant_ids, kb_ids, page, page_size, similarity_threshold=0.2,
|
||||
vector_similarity_weight=0.3, top=1024, doc_ids=None, aggs=True,
|
||||
rerank_mdl=None, highlight=False,
|
||||
rank_feature: dict | None = {PAGERANK_FLD: 10}):
|
||||
def retrieval(
|
||||
self,
|
||||
question,
|
||||
embd_mdl,
|
||||
tenant_ids,
|
||||
kb_ids,
|
||||
page,
|
||||
page_size,
|
||||
similarity_threshold=0.2,
|
||||
vector_similarity_weight=0.3,
|
||||
top=1024,
|
||||
doc_ids=None,
|
||||
aggs=True,
|
||||
rerank_mdl=None,
|
||||
highlight=False,
|
||||
rank_feature: dict | None = {PAGERANK_FLD: 10},
|
||||
):
|
||||
ranks = {"total": 0, "chunks": [], "doc_aggs": {}}
|
||||
if not question:
|
||||
return ranks
|
||||
|
||||
# Ensure RERANK_LIMIT is multiple of page_size
|
||||
RERANK_LIMIT = math.ceil(64/page_size) * page_size if page_size>1 else 1
|
||||
req = {"kb_ids": kb_ids, "doc_ids": doc_ids, "page": math.ceil(page_size*page/RERANK_LIMIT), "size": RERANK_LIMIT,
|
||||
"question": question, "vector": True, "topk": top,
|
||||
"similarity": similarity_threshold,
|
||||
"available_int": 1}
|
||||
|
||||
RERANK_LIMIT = math.ceil(64 / page_size) * page_size if page_size > 1 else 1
|
||||
req = {
|
||||
"kb_ids": kb_ids,
|
||||
"doc_ids": doc_ids,
|
||||
"page": math.ceil(page_size * page / RERANK_LIMIT),
|
||||
"size": RERANK_LIMIT,
|
||||
"question": question,
|
||||
"vector": True,
|
||||
"topk": top,
|
||||
"similarity": similarity_threshold,
|
||||
"available_int": 1,
|
||||
}
|
||||
|
||||
if isinstance(tenant_ids, str):
|
||||
tenant_ids = tenant_ids.split(",")
|
||||
|
||||
sres = self.search(req, [index_name(tid) for tid in tenant_ids],
|
||||
kb_ids, embd_mdl, highlight, rank_feature=rank_feature)
|
||||
sres = self.search(req, [index_name(tid) for tid in tenant_ids], kb_ids, embd_mdl, highlight, rank_feature=rank_feature)
|
||||
|
||||
if rerank_mdl and sres.total > 0:
|
||||
sim, tsim, vsim = self.rerank_by_model(rerank_mdl,
|
||||
sres, question, 1 - vector_similarity_weight,
|
||||
vector_similarity_weight,
|
||||
rank_feature=rank_feature)
|
||||
sim, tsim, vsim = self.rerank_by_model(
|
||||
rerank_mdl,
|
||||
sres,
|
||||
question,
|
||||
1 - vector_similarity_weight,
|
||||
vector_similarity_weight,
|
||||
rank_feature=rank_feature,
|
||||
)
|
||||
else:
|
||||
lower_case_doc_engine = os.getenv('DOC_ENGINE', 'elasticsearch')
|
||||
if lower_case_doc_engine in ["elasticsearch","opensearch"]:
|
||||
lower_case_doc_engine = os.getenv("DOC_ENGINE", "elasticsearch")
|
||||
if lower_case_doc_engine in ["elasticsearch", "opensearch"]:
|
||||
# ElasticSearch doesn't normalize each way score before fusion.
|
||||
sim, tsim, vsim = self.rerank(
|
||||
sres, question, 1 - vector_similarity_weight, vector_similarity_weight,
|
||||
rank_feature=rank_feature)
|
||||
sres,
|
||||
question,
|
||||
1 - vector_similarity_weight,
|
||||
vector_similarity_weight,
|
||||
rank_feature=rank_feature,
|
||||
)
|
||||
else:
|
||||
# Don't need rerank here since Infinity normalizes each way score before fusion.
|
||||
sim = [sres.field[id].get("_score", 0.0) for id in sres.ids]
|
||||
sim = [s if s is not None else 0. for s in sim]
|
||||
sim = [s if s is not None else 0.0 for s in sim]
|
||||
tsim = sim
|
||||
vsim = sim
|
||||
# Already paginated in search function
|
||||
max_pages = RERANK_LIMIT // page_size
|
||||
page_index = (page % max_pages) - 1
|
||||
begin = max(page_index * page_size, 0)
|
||||
sim = sim[begin : begin + page_size]
|
||||
|
||||
sim_np = np.array(sim, dtype=np.float64)
|
||||
idx = np.argsort(sim_np * -1)
|
||||
if sim_np.size == 0:
|
||||
return ranks
|
||||
|
||||
sorted_idx = np.argsort(sim_np * -1)
|
||||
|
||||
valid_idx = [int(i) for i in sorted_idx if sim_np[i] >= similarity_threshold]
|
||||
filtered_count = len(valid_idx)
|
||||
ranks["total"] = int(filtered_count)
|
||||
|
||||
if filtered_count == 0:
|
||||
return ranks
|
||||
|
||||
max_pages = max(RERANK_LIMIT // max(page_size, 1), 1)
|
||||
page_index = (page - 1) % max_pages
|
||||
begin = page_index * page_size
|
||||
end = begin + page_size
|
||||
page_idx = valid_idx[begin:end]
|
||||
|
||||
dim = len(sres.query_vector)
|
||||
vector_column = f"q_{dim}_vec"
|
||||
zero_vector = [0.0] * dim
|
||||
filtered_count = (sim_np >= similarity_threshold).sum()
|
||||
ranks["total"] = int(filtered_count) # Convert from np.int64 to Python int otherwise JSON serializable error
|
||||
for i in idx:
|
||||
if np.float64(sim[i]) < similarity_threshold:
|
||||
break
|
||||
|
||||
for i in page_idx:
|
||||
id = sres.ids[i]
|
||||
chunk = sres.field[id]
|
||||
dnm = chunk.get("docnm_kwd", "")
|
||||
did = chunk.get("doc_id", "")
|
||||
|
||||
if len(ranks["chunks"]) >= page_size:
|
||||
if aggs:
|
||||
if dnm not in ranks["doc_aggs"]:
|
||||
ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
|
||||
ranks["doc_aggs"][dnm]["count"] += 1
|
||||
continue
|
||||
break
|
||||
|
||||
position_int = chunk.get("position_int", [])
|
||||
d = {
|
||||
"chunk_id": id,
|
||||
@ -434,12 +461,12 @@ class Dealer:
|
||||
"kb_id": chunk["kb_id"],
|
||||
"important_kwd": chunk.get("important_kwd", []),
|
||||
"image_id": chunk.get("img_id", ""),
|
||||
"similarity": sim[i],
|
||||
"vector_similarity": vsim[i],
|
||||
"term_similarity": tsim[i],
|
||||
"similarity": float(sim_np[i]),
|
||||
"vector_similarity": float(vsim[i]),
|
||||
"term_similarity": float(tsim[i]),
|
||||
"vector": chunk.get(vector_column, zero_vector),
|
||||
"positions": position_int,
|
||||
"doc_type_kwd": chunk.get("doc_type_kwd", "")
|
||||
"doc_type_kwd": chunk.get("doc_type_kwd", ""),
|
||||
}
|
||||
if highlight and sres.highlight:
|
||||
if id in sres.highlight:
|
||||
@ -447,15 +474,30 @@ class Dealer:
|
||||
else:
|
||||
d["highlight"] = d["content_with_weight"]
|
||||
ranks["chunks"].append(d)
|
||||
if dnm not in ranks["doc_aggs"]:
|
||||
ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
|
||||
ranks["doc_aggs"][dnm]["count"] += 1
|
||||
ranks["doc_aggs"] = [{"doc_name": k,
|
||||
"doc_id": v["doc_id"],
|
||||
"count": v["count"]} for k,
|
||||
v in sorted(ranks["doc_aggs"].items(),
|
||||
key=lambda x: x[1]["count"] * -1)]
|
||||
ranks["chunks"] = ranks["chunks"][:page_size]
|
||||
|
||||
if aggs:
|
||||
for i in valid_idx:
|
||||
id = sres.ids[i]
|
||||
chunk = sres.field[id]
|
||||
dnm = chunk.get("docnm_kwd", "")
|
||||
did = chunk.get("doc_id", "")
|
||||
if dnm not in ranks["doc_aggs"]:
|
||||
ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
|
||||
ranks["doc_aggs"][dnm]["count"] += 1
|
||||
|
||||
ranks["doc_aggs"] = [
|
||||
{
|
||||
"doc_name": k,
|
||||
"doc_id": v["doc_id"],
|
||||
"count": v["count"],
|
||||
}
|
||||
for k, v in sorted(
|
||||
ranks["doc_aggs"].items(),
|
||||
key=lambda x: x[1]["count"] * -1,
|
||||
)
|
||||
]
|
||||
else:
|
||||
ranks["doc_aggs"] = []
|
||||
|
||||
return ranks
|
||||
|
||||
@ -564,7 +606,7 @@ class Dealer:
|
||||
ids = relevant_chunks_with_toc(query, toc, chat_mdl, topn*2)
|
||||
if not ids:
|
||||
return chunks
|
||||
|
||||
|
||||
vector_size = 1024
|
||||
id2idx = {ck["chunk_id"]: i for i, ck in enumerate(chunks)}
|
||||
for cid, sim in ids:
|
||||
|
||||
@ -429,7 +429,7 @@ def rank_memories(chat_mdl, goal:str, sub_goal:str, tool_call_summaries: list[st
|
||||
return re.sub(r"^.*</think>", "", ans, flags=re.DOTALL)
|
||||
|
||||
|
||||
def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> list:
|
||||
def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> dict:
|
||||
sys_prompt = PROMPT_JINJA_ENV.from_string(META_FILTER).render(
|
||||
current_date=datetime.datetime.today().strftime('%Y-%m-%d'),
|
||||
metadata_keys=json.dumps(meta_data),
|
||||
@ -440,11 +440,13 @@ def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> list:
|
||||
ans = re.sub(r"(^.*</think>|```json\n|```\n*$)", "", ans, flags=re.DOTALL)
|
||||
try:
|
||||
ans = json_repair.loads(ans)
|
||||
assert isinstance(ans, list), ans
|
||||
assert isinstance(ans, dict), ans
|
||||
assert "conditions" in ans and isinstance(ans["conditions"], list), ans
|
||||
return ans
|
||||
except Exception:
|
||||
logging.exception(f"Loading json failure: {ans}")
|
||||
return []
|
||||
|
||||
return {"conditions": []}
|
||||
|
||||
|
||||
def gen_json(system_prompt:str, user_prompt:str, chat_mdl, gen_conf = None):
|
||||
|
||||
@ -9,11 +9,13 @@ You are a metadata filtering condition generator. Analyze the user's question an
|
||||
}
|
||||
|
||||
2. **Output Requirements**:
|
||||
- Always output a JSON array of filter objects
|
||||
- Each object must have:
|
||||
- Always output a JSON dictionary with only 2 keys: 'conditions'(filter objects) and 'logic' between the conditions ('and' or 'or').
|
||||
- Each filter object in conditions must have:
|
||||
"key": (metadata attribute name),
|
||||
"value": (string value to compare),
|
||||
"op": (operator from allowed list)
|
||||
- Logic between all the conditions: 'and'(Intersection of results for each condition) / 'or' (union of results for all conditions)
|
||||
|
||||
|
||||
3. **Operator Guide**:
|
||||
- Use these operators only: ["contains", "not contains", "start with", "end with", "empty", "not empty", "=", "≠", ">", "<", "≥", "≤"]
|
||||
@ -32,22 +34,97 @@ You are a metadata filtering condition generator. Analyze the user's question an
|
||||
- Attribute doesn't exist in metadata
|
||||
- Value has no match in metadata
|
||||
|
||||
5. **Example**:
|
||||
5. **Example A**:
|
||||
- User query: "上市日期七月份的有哪些商品,不要蓝色的"
|
||||
- Metadata: { "color": {...}, "listing_date": {...} }
|
||||
- Output:
|
||||
[
|
||||
{
|
||||
"logic": "and",
|
||||
"conditions": [
|
||||
{"key": "listing_date", "value": "2025-07-01", "op": "≥"},
|
||||
{"key": "listing_date", "value": "2025-08-01", "op": "<"},
|
||||
{"key": "color", "value": "blue", "op": "≠"}
|
||||
]
|
||||
}
|
||||
|
||||
6. **Final Output**:
|
||||
- ONLY output valid JSON array
|
||||
6. **Example B**:
|
||||
- User query: "Both blue and red are acceptable."
|
||||
- Metadata: { "color": {...}, "listing_date": {...} }
|
||||
- Output:
|
||||
{
|
||||
"logic": "or",
|
||||
"conditions": [
|
||||
{"key": "color", "value": "blue", "op": "="},
|
||||
{"key": "color", "value": "red", "op": "="}
|
||||
]
|
||||
}
|
||||
|
||||
7. **Final Output**:
|
||||
- ONLY output valid JSON dictionary
|
||||
- NO additional text/explanations
|
||||
- Json schema is as following:
|
||||
```json
|
||||
{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"logic": {
|
||||
"type": "string",
|
||||
"description": "Logic relationship between all the conditions, the default is 'and'.",
|
||||
"enum": [
|
||||
"and",
|
||||
"or"
|
||||
]
|
||||
},
|
||||
"conditions": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"key": {
|
||||
"type": "string",
|
||||
"description": "Metadata attribute name."
|
||||
},
|
||||
"value": {
|
||||
"type": "string",
|
||||
"description": "Value to compare."
|
||||
},
|
||||
"op": {
|
||||
"type": "string",
|
||||
"description": "Operator from allowed list.",
|
||||
"enum": [
|
||||
"contains",
|
||||
"not contains",
|
||||
"start with",
|
||||
"end with",
|
||||
"empty",
|
||||
"not empty",
|
||||
"=",
|
||||
"≠",
|
||||
">",
|
||||
"<",
|
||||
"≥",
|
||||
"≤"
|
||||
]
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"key",
|
||||
"value",
|
||||
"op"
|
||||
],
|
||||
"additionalProperties": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"conditions"
|
||||
],
|
||||
"additionalProperties": false
|
||||
}
|
||||
```
|
||||
|
||||
**Current Task**:
|
||||
- Today's date: {{current_date}}
|
||||
- Available metadata keys: {{metadata_keys}}
|
||||
- User query: "{{user_question}}"
|
||||
- Today's date: {{ current_date }}
|
||||
- Available metadata keys: {{ metadata_keys }}
|
||||
- User query: "{{ user_question }}"
|
||||
|
||||
|
||||
1562
rag/utils/ob_conn.py
Normal file
1562
rag/utils/ob_conn.py
Normal file
File diff suppressed because it is too large
Load Diff
@ -69,7 +69,7 @@ class Document(Base):
|
||||
response = res.json()
|
||||
actual_keys = set(response.keys())
|
||||
if actual_keys == error_keys:
|
||||
raise Exception(res.get("message"))
|
||||
raise Exception(response.get("message"))
|
||||
else:
|
||||
return res.content
|
||||
except json.JSONDecodeError:
|
||||
|
||||
@ -80,6 +80,7 @@ class Session(Base):
|
||||
|
||||
|
||||
def _structure_answer(self, json_data):
|
||||
answer = ""
|
||||
if self.__session_type == "agent":
|
||||
answer = json_data["data"]["content"]
|
||||
elif self.__session_type == "chat":
|
||||
|
||||
96
web/package-lock.json
generated
96
web/package-lock.json
generated
@ -66,6 +66,7 @@
|
||||
"input-otp": "^1.4.1",
|
||||
"js-base64": "^3.7.5",
|
||||
"jsencrypt": "^3.3.2",
|
||||
"jsoneditor": "^10.4.2",
|
||||
"lexical": "^0.23.1",
|
||||
"lodash": "^4.17.21",
|
||||
"lucide-react": "^0.546.0",
|
||||
@ -8998,6 +8999,12 @@
|
||||
"@sinonjs/commons": "^3.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@sphinxxxx/color-conversion": {
|
||||
"version": "2.2.2",
|
||||
"resolved": "https://registry.npmmirror.com/@sphinxxxx/color-conversion/-/color-conversion-2.2.2.tgz",
|
||||
"integrity": "sha512-XExJS3cLqgrmNBIP3bBw6+1oQ1ksGjFh0+oClDKFYpCCqx/hlqwWO5KO/S63fzUo67SxI9dMrF0y5T/Ey7h8Zw==",
|
||||
"license": "ISC"
|
||||
},
|
||||
"node_modules/@storybook/addon-docs": {
|
||||
"version": "9.1.4",
|
||||
"resolved": "https://registry.npmmirror.com/@storybook/addon-docs/-/addon-docs-9.1.4.tgz",
|
||||
@ -12962,6 +12969,12 @@
|
||||
"node": ">= 0.6"
|
||||
}
|
||||
},
|
||||
"node_modules/ace-builds": {
|
||||
"version": "1.43.4",
|
||||
"resolved": "https://registry.npmmirror.com/ace-builds/-/ace-builds-1.43.4.tgz",
|
||||
"integrity": "sha512-8hAxVfo2ImICd69BWlZwZlxe9rxDGDjuUhh+WeWgGDvfBCE+r3lkynkQvIovDz4jcMi8O7bsEaFygaDT+h9sBA==",
|
||||
"license": "BSD-3-Clause"
|
||||
},
|
||||
"node_modules/acorn": {
|
||||
"version": "8.15.0",
|
||||
"resolved": "https://registry.npmmirror.com/acorn/-/acorn-8.15.0.tgz",
|
||||
@ -21894,6 +21907,12 @@
|
||||
"@pkgjs/parseargs": "^0.11.0"
|
||||
}
|
||||
},
|
||||
"node_modules/javascript-natural-sort": {
|
||||
"version": "0.7.1",
|
||||
"resolved": "https://registry.npmmirror.com/javascript-natural-sort/-/javascript-natural-sort-0.7.1.tgz",
|
||||
"integrity": "sha512-nO6jcEfZWQXDhOiBtG2KvKyEptz7RVbpGP4vTD2hLBdmNQSsCiicO2Ioinv6UI4y9ukqnBpy+XZ9H6uLNgJTlw==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/javascript-stringify": {
|
||||
"version": "2.1.0",
|
||||
"resolved": "https://registry.npmmirror.com/javascript-stringify/-/javascript-stringify-2.1.0.tgz",
|
||||
@ -24253,6 +24272,15 @@
|
||||
"jiti": "bin/jiti.js"
|
||||
}
|
||||
},
|
||||
"node_modules/jmespath": {
|
||||
"version": "0.16.0",
|
||||
"resolved": "https://registry.npmmirror.com/jmespath/-/jmespath-0.16.0.tgz",
|
||||
"integrity": "sha512-9FzQjJ7MATs1tSpnco1K6ayiYE3figslrXA72G2HQ/n76RzvYlofyi5QM+iX4YRs/pu3yzxlVQSST23+dMDknw==",
|
||||
"license": "Apache-2.0",
|
||||
"engines": {
|
||||
"node": ">= 0.6.0"
|
||||
}
|
||||
},
|
||||
"node_modules/js-base64": {
|
||||
"version": "3.7.5",
|
||||
"resolved": "https://registry.npmmirror.com/js-base64/-/js-base64-3.7.5.tgz",
|
||||
@ -24357,6 +24385,12 @@
|
||||
"integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/json-source-map": {
|
||||
"version": "0.6.1",
|
||||
"resolved": "https://registry.npmmirror.com/json-source-map/-/json-source-map-0.6.1.tgz",
|
||||
"integrity": "sha512-1QoztHPsMQqhDq0hlXY5ZqcEdUzxQEIxgFkKl4WUp2pgShObl+9ovi4kRh2TfvAfxAoHOJ9vIMEqk3k4iex7tg==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/json-stable-stringify-without-jsonify": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmmirror.com/json-stable-stringify-without-jsonify/-/json-stable-stringify-without-jsonify-1.0.1.tgz",
|
||||
@ -24393,6 +24427,44 @@
|
||||
"node": ">=6"
|
||||
}
|
||||
},
|
||||
"node_modules/jsoneditor": {
|
||||
"version": "10.4.2",
|
||||
"resolved": "https://registry.npmmirror.com/jsoneditor/-/jsoneditor-10.4.2.tgz",
|
||||
"integrity": "sha512-SQPCXlanU4PqdVsYuj2X7yfbLiiJYjklbksGfMKPsuwLhAIPxDlG43jYfXieGXvxpuq1fkw08YoRbkKXKabcLA==",
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"ace-builds": "^1.36.2",
|
||||
"ajv": "^6.12.6",
|
||||
"javascript-natural-sort": "^0.7.1",
|
||||
"jmespath": "^0.16.0",
|
||||
"json-source-map": "^0.6.1",
|
||||
"jsonrepair": "^3.8.1",
|
||||
"picomodal": "^3.0.0",
|
||||
"vanilla-picker": "^2.12.3"
|
||||
}
|
||||
},
|
||||
"node_modules/jsoneditor/node_modules/ajv": {
|
||||
"version": "6.12.6",
|
||||
"resolved": "https://registry.npmmirror.com/ajv/-/ajv-6.12.6.tgz",
|
||||
"integrity": "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"fast-deep-equal": "^3.1.1",
|
||||
"fast-json-stable-stringify": "^2.0.0",
|
||||
"json-schema-traverse": "^0.4.1",
|
||||
"uri-js": "^4.2.2"
|
||||
},
|
||||
"funding": {
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/epoberezkin"
|
||||
}
|
||||
},
|
||||
"node_modules/jsoneditor/node_modules/json-schema-traverse": {
|
||||
"version": "0.4.1",
|
||||
"resolved": "https://registry.npmmirror.com/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz",
|
||||
"integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/jsonfile": {
|
||||
"version": "6.1.0",
|
||||
"resolved": "https://registry.npmmirror.com/jsonfile/-/jsonfile-6.1.0.tgz",
|
||||
@ -24404,6 +24476,15 @@
|
||||
"graceful-fs": "^4.1.6"
|
||||
}
|
||||
},
|
||||
"node_modules/jsonrepair": {
|
||||
"version": "3.13.1",
|
||||
"resolved": "https://registry.npmmirror.com/jsonrepair/-/jsonrepair-3.13.1.tgz",
|
||||
"integrity": "sha512-WJeiE0jGfxYmtLwBTEk8+y/mYcaleyLXWaqp5bJu0/ZTSeG0KQq/wWQ8pmnkKenEdN6pdnn6QtcoSUkbqDHWNw==",
|
||||
"license": "ISC",
|
||||
"bin": {
|
||||
"jsonrepair": "bin/cli.js"
|
||||
}
|
||||
},
|
||||
"node_modules/jsx-ast-utils": {
|
||||
"version": "3.3.5",
|
||||
"resolved": "https://registry.npmmirror.com/jsx-ast-utils/-/jsx-ast-utils-3.3.5.tgz",
|
||||
@ -27499,6 +27580,12 @@
|
||||
"node": ">=8.6"
|
||||
}
|
||||
},
|
||||
"node_modules/picomodal": {
|
||||
"version": "3.0.0",
|
||||
"resolved": "https://registry.npmmirror.com/picomodal/-/picomodal-3.0.0.tgz",
|
||||
"integrity": "sha512-FoR3TDfuLlqUvcEeK5ifpKSVVns6B4BQvc8SDF6THVMuadya6LLtji0QgUDSStw0ZR2J7I6UGi5V2V23rnPWTw==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/pidtree": {
|
||||
"version": "0.6.0",
|
||||
"resolved": "https://registry.npmmirror.com/pidtree/-/pidtree-0.6.0.tgz",
|
||||
@ -36235,6 +36322,15 @@
|
||||
"dev": true,
|
||||
"peer": true
|
||||
},
|
||||
"node_modules/vanilla-picker": {
|
||||
"version": "2.12.3",
|
||||
"resolved": "https://registry.npmmirror.com/vanilla-picker/-/vanilla-picker-2.12.3.tgz",
|
||||
"integrity": "sha512-qVkT1E7yMbUsB2mmJNFmaXMWE2hF8ffqzMMwe9zdAikd8u2VfnsVY2HQcOUi2F38bgbxzlJBEdS1UUhOXdF9GQ==",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"@sphinxxxx/color-conversion": "^2.2.2"
|
||||
}
|
||||
},
|
||||
"node_modules/vary": {
|
||||
"version": "1.1.2",
|
||||
"resolved": "https://registry.npmmirror.com/vary/-/vary-1.1.2.tgz",
|
||||
|
||||
@ -79,6 +79,7 @@
|
||||
"input-otp": "^1.4.1",
|
||||
"js-base64": "^3.7.5",
|
||||
"jsencrypt": "^3.3.2",
|
||||
"jsoneditor": "^10.4.2",
|
||||
"lexical": "^0.23.1",
|
||||
"lodash": "^4.17.21",
|
||||
"lucide-react": "^0.546.0",
|
||||
|
||||
132
web/src/components/json-edit/css/cloud9_night.less
Normal file
132
web/src/components/json-edit/css/cloud9_night.less
Normal file
@ -0,0 +1,132 @@
|
||||
.ace-tomorrow-night .ace_gutter {
|
||||
background: var(--bg-card);
|
||||
color: rgb(var(--text-primary));
|
||||
}
|
||||
.ace-tomorrow-night .ace_print-margin {
|
||||
width: 1px;
|
||||
background: #25282c;
|
||||
}
|
||||
|
||||
.ace-tomorrow-night {
|
||||
background: var(--bg-card);
|
||||
color: rgb(var(--text-primary));
|
||||
.ace_editor {
|
||||
background: var(--bg-card);
|
||||
}
|
||||
}
|
||||
|
||||
.ace-tomorrow-night .ace_cursor {
|
||||
color: #aeafad;
|
||||
}
|
||||
|
||||
.ace-tomorrow-night .ace_marker-layer .ace_selection {
|
||||
background: #373b41;
|
||||
}
|
||||
|
||||
.ace-tomorrow-night.ace_multiselect .ace_selection.ace_start {
|
||||
box-shadow: 0 0 3px 0px #1d1f21;
|
||||
}
|
||||
|
||||
.ace-tomorrow-night .ace_marker-layer .ace_step {
|
||||
background: rgb(102, 82, 0);
|
||||
}
|
||||
|
||||
.ace-tomorrow-night .ace_marker-layer .ace_bracket {
|
||||
margin: -1px 0 0 -1px;
|
||||
border: 1px solid #4b4e55;
|
||||
}
|
||||
|
||||
.ace-tomorrow-night .ace_marker-layer .ace_active-line {
|
||||
background: var(--bg-card);
|
||||
}
|
||||
|
||||
.ace-tomorrow-night .ace_gutter-active-line {
|
||||
background-color: var(--bg-card);
|
||||
}
|
||||
|
||||
.ace-tomorrow-night .ace_marker-layer .ace_selected-word {
|
||||
border: 1px solid #373b41;
|
||||
}
|
||||
|
||||
.ace-tomorrow-night .ace_invisible {
|
||||
color: #4b4e55;
|
||||
}
|
||||
|
||||
.ace-tomorrow-night .ace_keyword,
|
||||
.ace-tomorrow-night .ace_meta,
|
||||
.ace-tomorrow-night .ace_storage,
|
||||
.ace-tomorrow-night .ace_storage.ace_type,
|
||||
.ace-tomorrow-night .ace_support.ace_type {
|
||||
color: #b294bb;
|
||||
}
|
||||
|
||||
.ace-tomorrow-night .ace_keyword.ace_operator {
|
||||
color: #8abeb7;
|
||||
}
|
||||
|
||||
.ace-tomorrow-night .ace_constant.ace_character,
|
||||
.ace-tomorrow-night .ace_constant.ace_language,
|
||||
.ace-tomorrow-night .ace_constant.ace_numeric,
|
||||
.ace-tomorrow-night .ace_keyword.ace_other.ace_unit,
|
||||
.ace-tomorrow-night .ace_support.ace_constant,
|
||||
.ace-tomorrow-night .ace_variable.ace_parameter {
|
||||
color: #de935f;
|
||||
}
|
||||
|
||||
.ace-tomorrow-night .ace_constant.ace_other {
|
||||
color: #ced1cf;
|
||||
}
|
||||
|
||||
.ace-tomorrow-night .ace_invalid {
|
||||
color: #ced2cf;
|
||||
background-color: #df5f5f;
|
||||
}
|
||||
|
||||
.ace-tomorrow-night .ace_invalid.ace_deprecated {
|
||||
color: #ced2cf;
|
||||
background-color: #b798bf;
|
||||
}
|
||||
|
||||
.ace-tomorrow-night .ace_fold {
|
||||
background-color: #81a2be;
|
||||
border-color: #c5c8c6;
|
||||
}
|
||||
|
||||
.ace-tomorrow-night .ace_entity.ace_name.ace_function,
|
||||
.ace-tomorrow-night .ace_support.ace_function,
|
||||
.ace-tomorrow-night .ace_variable {
|
||||
color: #81a2be;
|
||||
}
|
||||
|
||||
.ace-tomorrow-night .ace_support.ace_class,
|
||||
.ace-tomorrow-night .ace_support.ace_type {
|
||||
color: #f0c674;
|
||||
}
|
||||
|
||||
.ace-tomorrow-night .ace_heading,
|
||||
.ace-tomorrow-night .ace_markup.ace_heading,
|
||||
.ace-tomorrow-night .ace_string {
|
||||
color: #b5bd68;
|
||||
}
|
||||
|
||||
.ace-tomorrow-night .ace_entity.ace_name.ace_tag,
|
||||
.ace-tomorrow-night .ace_entity.ace_other.ace_attribute-name,
|
||||
.ace-tomorrow-night .ace_meta.ace_tag,
|
||||
.ace-tomorrow-night .ace_string.ace_regexp,
|
||||
.ace-tomorrow-night .ace_variable {
|
||||
color: #cc6666;
|
||||
}
|
||||
|
||||
.ace-tomorrow-night .ace_comment {
|
||||
color: #969896;
|
||||
}
|
||||
|
||||
.ace-tomorrow-night .ace_indent-guide {
|
||||
background: url(data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAACCAYAAACZgbYnAAAAEklEQVQImWNgYGBgYHB3d/8PAAOIAdULw8qMAAAAAElFTkSuQmCC)
|
||||
right repeat-y;
|
||||
}
|
||||
|
||||
.ace-tomorrow-night .ace_indent-guide-active {
|
||||
background: url(data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAACCAYAAACZgbYnAAAAEklEQVQIW2PQ1dX9zzBz5sz/ABCcBFFentLlAAAAAElFTkSuQmCC)
|
||||
right repeat-y;
|
||||
}
|
||||
83
web/src/components/json-edit/css/index.less
Normal file
83
web/src/components/json-edit/css/index.less
Normal file
@ -0,0 +1,83 @@
|
||||
.jsoneditor {
|
||||
border: none;
|
||||
color: rgb(var(--text-primary));
|
||||
overflow: auto;
|
||||
scrollbar-width: none;
|
||||
background-color: var(--bg-base);
|
||||
.jsoneditor-menu {
|
||||
background-color: var(--bg-base);
|
||||
// border-color: var(--border-button);
|
||||
border-bottom: thin solid var(--border-button);
|
||||
}
|
||||
.jsoneditor-navigation-bar {
|
||||
border-bottom: 1px solid var(--border-button);
|
||||
background-color: var(--bg-input);
|
||||
}
|
||||
.jsoneditor-tree {
|
||||
background: var(--bg-base);
|
||||
}
|
||||
.jsoneditor-highlight {
|
||||
background-color: var(--bg-card);
|
||||
}
|
||||
}
|
||||
.jsoneditor-popover,
|
||||
.jsoneditor-schema-error,
|
||||
div.jsoneditor td,
|
||||
div.jsoneditor textarea,
|
||||
div.jsoneditor th,
|
||||
div.jsoneditor-field,
|
||||
div.jsoneditor-value,
|
||||
pre.jsoneditor-preview {
|
||||
font-family: consolas, menlo, monaco, 'Ubuntu Mono', source-code-pro,
|
||||
monospace;
|
||||
font-size: 14px;
|
||||
color: rgb(var(--text-primary));
|
||||
}
|
||||
|
||||
div.jsoneditor-field.jsoneditor-highlight,
|
||||
div.jsoneditor-field[contenteditable='true']:focus,
|
||||
div.jsoneditor-field[contenteditable='true']:hover,
|
||||
div.jsoneditor-value.jsoneditor-highlight,
|
||||
div.jsoneditor-value[contenteditable='true']:focus,
|
||||
div.jsoneditor-value[contenteditable='true']:hover {
|
||||
background-color: var(--bg-input);
|
||||
border: 1px solid var(--border-button);
|
||||
border-radius: 2px;
|
||||
}
|
||||
|
||||
.jsoneditor-selected,
|
||||
.jsoneditor-contextmenu .jsoneditor-menu li ul {
|
||||
background: var(--bg-base);
|
||||
}
|
||||
|
||||
.jsoneditor-contextmenu .jsoneditor-menu button {
|
||||
color: rgb(var(--text-secondary));
|
||||
}
|
||||
.jsoneditor-menu a.jsoneditor-poweredBy {
|
||||
display: none;
|
||||
}
|
||||
.ace-jsoneditor .ace_scroller {
|
||||
background-color: var(--bg-base);
|
||||
}
|
||||
.jsoneditor-statusbar {
|
||||
border-top: 1px solid var(--border-button);
|
||||
background-color: var(--bg-base);
|
||||
color: rgb(var(--text-primary));
|
||||
}
|
||||
.jsoneditor-menu > .jsoneditor-modes > button,
|
||||
.jsoneditor-menu > button {
|
||||
// color: rgb(var(--text-secondary));
|
||||
background-color: var(--text-disabled);
|
||||
}
|
||||
|
||||
.jsoneditor-menu > .jsoneditor-modes > button:active,
|
||||
.jsoneditor-menu > .jsoneditor-modes > button:focus,
|
||||
.jsoneditor-menu > button:active,
|
||||
.jsoneditor-menu > button:focus {
|
||||
background-color: rgb(var(--text-secondary));
|
||||
}
|
||||
.jsoneditor-menu > .jsoneditor-modes > button:hover,
|
||||
.jsoneditor-menu > button:hover {
|
||||
background-color: rgb(var(--text-secondary));
|
||||
border: 1px solid var(--border-button);
|
||||
}
|
||||
142
web/src/components/json-edit/index.tsx
Normal file
142
web/src/components/json-edit/index.tsx
Normal file
@ -0,0 +1,142 @@
|
||||
import React, { useEffect, useRef } from 'react';
|
||||
import { useTranslation } from 'react-i18next';
|
||||
import './css/cloud9_night.less';
|
||||
import './css/index.less';
|
||||
import { JsonEditorOptions, JsonEditorProps } from './interface';
|
||||
const defaultConfig: JsonEditorOptions = {
|
||||
mode: 'code',
|
||||
modes: ['tree', 'code'],
|
||||
history: false,
|
||||
search: false,
|
||||
mainMenuBar: false,
|
||||
navigationBar: false,
|
||||
enableSort: false,
|
||||
enableTransform: false,
|
||||
indentation: 2,
|
||||
};
|
||||
const JsonEditor: React.FC<JsonEditorProps> = ({
|
||||
value,
|
||||
onChange,
|
||||
height = '400px',
|
||||
className = '',
|
||||
options = {},
|
||||
}) => {
|
||||
const containerRef = useRef<HTMLDivElement>(null);
|
||||
const editorRef = useRef<any>(null);
|
||||
const { i18n } = useTranslation();
|
||||
const currentLanguageRef = useRef<string>(i18n.language);
|
||||
|
||||
useEffect(() => {
|
||||
if (typeof window !== 'undefined') {
|
||||
const JSONEditor = require('jsoneditor');
|
||||
import('jsoneditor/dist/jsoneditor.min.css');
|
||||
|
||||
if (containerRef.current) {
|
||||
// Default configuration options
|
||||
const defaultOptions: JsonEditorOptions = {
|
||||
...defaultConfig,
|
||||
language: i18n.language === 'zh' ? 'zh-CN' : 'en',
|
||||
onChange: () => {
|
||||
if (editorRef.current && onChange) {
|
||||
try {
|
||||
const updatedJson = editorRef.current.get();
|
||||
onChange(updatedJson);
|
||||
} catch (err) {
|
||||
// Do not trigger onChange when parsing error occurs
|
||||
console.error(err);
|
||||
}
|
||||
}
|
||||
},
|
||||
...options, // Merge user provided options with defaults
|
||||
};
|
||||
|
||||
editorRef.current = new JSONEditor(
|
||||
containerRef.current,
|
||||
defaultOptions,
|
||||
);
|
||||
|
||||
if (value) {
|
||||
editorRef.current.set(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return () => {
|
||||
if (editorRef.current) {
|
||||
if (typeof editorRef.current.destroy === 'function') {
|
||||
editorRef.current.destroy();
|
||||
}
|
||||
editorRef.current = null;
|
||||
}
|
||||
};
|
||||
}, []);
|
||||
|
||||
useEffect(() => {
|
||||
// Update language when i18n language changes
|
||||
// Since JSONEditor doesn't have a setOptions method, we need to recreate the editor
|
||||
if (editorRef.current && currentLanguageRef.current !== i18n.language) {
|
||||
currentLanguageRef.current = i18n.language;
|
||||
|
||||
// Save current data
|
||||
let currentData;
|
||||
try {
|
||||
currentData = editorRef.current.get();
|
||||
} catch (e) {
|
||||
// If there's an error getting data, use the passed value or empty object
|
||||
currentData = value || {};
|
||||
}
|
||||
|
||||
// Destroy the current editor
|
||||
if (typeof editorRef.current.destroy === 'function') {
|
||||
editorRef.current.destroy();
|
||||
}
|
||||
|
||||
// Recreate the editor with new language
|
||||
const JSONEditor = require('jsoneditor');
|
||||
|
||||
const newOptions: JsonEditorOptions = {
|
||||
...defaultConfig,
|
||||
language: i18n.language === 'zh' ? 'zh-CN' : 'en',
|
||||
onChange: () => {
|
||||
if (editorRef.current && onChange) {
|
||||
try {
|
||||
const updatedJson = editorRef.current.get();
|
||||
onChange(updatedJson);
|
||||
} catch (err) {
|
||||
// Do not trigger onChange when parsing error occurs
|
||||
}
|
||||
}
|
||||
},
|
||||
...options, // Merge user provided options with defaults
|
||||
};
|
||||
|
||||
editorRef.current = new JSONEditor(containerRef.current, newOptions);
|
||||
editorRef.current.set(currentData);
|
||||
}
|
||||
}, [i18n.language, value, onChange, options]);
|
||||
|
||||
useEffect(() => {
|
||||
if (editorRef.current && value !== undefined) {
|
||||
try {
|
||||
// Only update the editor when the value actually changes
|
||||
const currentJson = editorRef.current.get();
|
||||
if (JSON.stringify(currentJson) !== JSON.stringify(value)) {
|
||||
editorRef.current.set(value);
|
||||
}
|
||||
} catch (err) {
|
||||
// Skip update if there is a syntax error in the current editor
|
||||
editorRef.current.set(value);
|
||||
}
|
||||
}
|
||||
}, [value]);
|
||||
|
||||
return (
|
||||
<div
|
||||
ref={containerRef}
|
||||
style={{ height }}
|
||||
className={`ace-tomorrow-night w-full border border-border-button rounded-lg overflow-hidden bg-bg-input ${className} `}
|
||||
/>
|
||||
);
|
||||
};
|
||||
|
||||
export default JsonEditor;
|
||||
339
web/src/components/json-edit/interface.ts
Normal file
339
web/src/components/json-edit/interface.ts
Normal file
@ -0,0 +1,339 @@
|
||||
// JSONEditor configuration options interface see: https://github.com/josdejong/jsoneditor/blob/master/docs/api.md
export interface JsonEditorOptions {
/**
* Editor mode. Available values: 'tree' (default), 'view', 'form', 'text', and 'code'.
*/
mode?: 'tree' | 'view' | 'form' | 'text' | 'code';

/**
* Array of available modes
*/
modes?: Array<'tree' | 'view' | 'form' | 'text' | 'code'>;

/**
* Field name for the root node. Only applicable for modes 'tree', 'view', and 'form'
*/
name?: string;

/**
* Theme for the editor
*/
theme?: string;

/**
* Enable history (undo/redo). True by default. Only applicable for modes 'tree', 'view', and 'form'
*/
history?: boolean;

/**
* Enable search box. True by default. Only applicable for modes 'tree', 'view', and 'form'
*/
search?: boolean;

/**
* Main menu bar visibility
*/
mainMenuBar?: boolean;

/**
* Navigation bar visibility
*/
navigationBar?: boolean;

/**
* Status bar visibility
*/
statusBar?: boolean;

/**
* If true, object keys are sorted before display. false by default.
*/
sortObjectKeys?: boolean;

/**
* Enable transform functionality
*/
enableTransform?: boolean;

/**
* Enable sort functionality
*/
enableSort?: boolean;

/**
* Limit dragging functionality
*/
limitDragging?: boolean;

/**
* A JSON schema object
*/
schema?: any;

/**
* Schemas that are referenced using the `$ref` property from the JSON schema
*/
schemaRefs?: Record<string, any>;

/**
* Array of template objects
*/
templates?: Array<{
text: string;
title?: string;
className?: string;
field?: string;
value: any;
}>;

/**
* Ace editor instance
*/
ace?: any;

/**
* An instance of Ajv JSON schema validator
*/
ajv?: any;

/**
* Switch to enable/disable autocomplete
*/
autocomplete?: {
confirmKey?: string | string[];
caseSensitive?: boolean;
getOptions?: (
text: string,
path: Array<string | number>,
input: string,
editor: any,
) => string[] | Promise<string[]> | null;
};

/**
* Number of indentation spaces. 4 by default. Only applicable for modes 'text' and 'code'
*/
indentation?: number;

/**
* Available languages
*/
languages?: string[];

/**
* Language of the editor
*/
language?: string;

/**
* Callback method, triggered on change of contents. Does not pass the contents itself.
* See also onChangeJSON and onChangeText.
*/
onChange?: () => void;

/**
* Callback method, triggered in modes on change of contents, passing the changed contents as JSON.
* Only applicable for modes 'tree', 'view', and 'form'.
*/
onChangeJSON?: (json: any) => void;

/**
* Callback method, triggered in modes on change of contents, passing the changed contents as stringified JSON.
*/
onChangeText?: (text: string) => void;

/**
* Callback method, triggered when an error occurs
*/
onError?: (error: Error) => void;

/**
* Callback method, triggered when node is expanded
*/
onExpand?: (node: any) => void;

/**
* Callback method, triggered when node is collapsed
*/
onCollapse?: (node: any) => void;

/**
* Callback method, determines if a node is editable
*/
onEditable?: (node: any) => boolean | { field: boolean; value: boolean };

/**
* Callback method, triggered when an event occurs in a JSON field or value.
* Only applicable for modes 'form', 'tree' and 'view'
*/
onEvent?: (node: any, event: Event) => void;

/**
* Callback method, triggered when the editor comes into focus, passing an object {type, target}.
* Applicable for all modes
*/
onFocus?: (node: any) => void;

/**
* Callback method, triggered when the editor goes out of focus, passing an object {type, target}.
* Applicable for all modes
*/
onBlur?: (node: any) => void;

/**
* Callback method, triggered when creating menu items
*/
onCreateMenu?: (menuItems: any[], node: any) => any[];

/**
* Callback method, triggered on node selection change. Only applicable for modes 'tree', 'view', and 'form'
*/
onSelectionChange?: (selection: any) => void;

/**
* Callback method, triggered on text selection change. Only applicable for modes 'text' and 'code'
*/
onTextSelectionChange?: (selection: any) => void;

/**
* Callback method, triggered when a Node DOM is rendered. Function returns a css class name to be set on a node.
* Only applicable for modes 'form', 'tree' and 'view'
*/
onClassName?: (node: any) => string | undefined;

/**
* Callback method, triggered when validating nodes
*/
onValidate?: (
json: any,
) =>
| Array<{ path: Array<string | number>; message: string }>
| Promise<Array<{ path: Array<string | number>; message: string }>>;

/**
* Callback method, triggered when node name is determined
*/
onNodeName?: (parentNode: any, childNode: any, name: string) => string;

/**
* Callback method, triggered when mode changes
*/
onModeChange?: (newMode: string, oldMode: string) => void;

/**
* Color picker options
*/
colorPicker?: boolean;

/**
* Callback method for color picker
*/
onColorPicker?: (
callback: (color: string) => void,
parent: HTMLElement,
) => void;

/**
* If true, shows timestamp tag
*/
timestampTag?: boolean;

/**
* Format for timestamps
*/
timestampFormat?: string;

/**
* If true, unicode characters are escaped. false by default.
*/
escapeUnicode?: boolean;

/**
* Number of children allowed for a node in 'tree', 'view', or 'form' mode before
* the "show more/show all" buttons appear. 100 by default.
*/
maxVisibleChilds?: number;

/**
* Callback method for validation errors
*/
onValidationError?: (
errors: Array<{ path: Array<string | number>; message: string }>,
) => void;

/**
* Callback method for validation warnings
*/
onValidationWarning?: (
warnings: Array<{ path: Array<string | number>; message: string }>,
) => void;

/**
* The anchor element to apply an overlay and display the modals in a centered location. Defaults to document.body
*/
modalAnchor?: HTMLElement | null;

/**
* Anchor element for popups
*/
popupAnchor?: HTMLElement | null;

/**
* Function to create queries
*/
createQuery?: () => void;

/**
* Function to execute queries
*/
executeQuery?: () => void;

/**
* Query description
*/
queryDescription?: string;

/**
* Allow schema suggestions
*/
allowSchemaSuggestions?: boolean;

/**
* Show error table
*/
showErrorTable?: boolean;

/**
* Validate current JSON object against the configured JSON schema
* Must be implemented by tree mode and text mode
*/
validate?: () => Promise<any[]>;

/**
* Refresh the rendered contents
* Can be implemented by tree mode and text mode
*/
refresh?: () => void;

/**
* Callback method triggered when schema changes
*/
_onSchemaChange?: (schema: any, schemaRefs: any) => void;
}

export interface JsonEditorProps {
// JSON data to be displayed in the editor
value?: any;

// Callback function triggered when the JSON data changes
onChange?: (value: any) => void;

// Height of the editor
height?: string;

// Additional CSS class names
className?: string;

// Configuration options for the JSONEditor
options?: JsonEditorOptions;
}
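For reference, a small sketch of an options object that conforms to JsonEditorOptions (values are hypothetical; the import path is assumed from the file location above):

import { JsonEditorOptions } from '@/components/json-edit/interface';

const exampleOptions: JsonEditorOptions = {
  mode: 'code',
  mainMenuBar: true,
  navigationBar: false,
  // onValidate returns { path, message } entries, matching the signature declared above.
  onValidate: (json) =>
    json && typeof json === 'object'
      ? []
      : [{ path: [], message: 'Root value must be an object' }],
};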
@@ -25,6 +25,7 @@ export default {
portugueseBr: 'Portuguese (Brazil)',
chinese: 'Simplified Chinese',
traditionalChinese: 'Traditional Chinese',
russian: 'Russian',
language: 'Language',
languageMessage: 'Please input your language!',
languagePlaceholder: 'select your language',
@@ -1752,6 +1753,8 @@ The variable aggregation node (originally the variable assignment node) is a cru
The Indexer will store the content in the corresponding data structures for the selected methods.`,
// file: 'File',
parserMethod: 'PDF parser',
tableResultType: 'Table Result Type',
markdownImageResponseType: 'Markdown Image Response Type',
// systemPrompt: 'System Prompt',
systemPromptPlaceholder:
'Enter system prompt for image analysis, if empty the system default value will be used',
@@ -1934,6 +1937,7 @@ Important structured information may include: names, dates, locations, events, k
japanese: 'Japanese',
korean: 'Korean',
vietnamese: 'Vietnamese',
russian: 'Russian',
},
pagination: {
total: 'Total {{total}}',
File diff suppressed because it is too large
@@ -1629,6 +1629,8 @@ General:实体和关系提取提示来自 GitHub - microsoft/graphrag:基于
Tokenizer 会根据所选方式将内容存储为对应的数据结构。`,
filenameEmbdWeight: '文件名嵌入权重',
parserMethod: '解析方法',
tableResultType: '表格返回形式',
markdownImageResponseType: '图片返回形式',
systemPromptPlaceholder:
'请输入用于图像分析的系统提示词,若为空则使用系统缺省值',
exportJson: '导出 JSON',

@@ -169,6 +169,7 @@ export const initialParserValues = {
{
fileFormat: FileType.Spreadsheet,
output_format: SpreadsheetOutputFormat.Html,
parse_method: ParseDocumentType.DeepDOC,
},
{
fileFormat: FileType.Image,
@@ -192,6 +193,7 @@ export const initialParserValues = {
{
fileFormat: FileType.PowerPoint,
output_format: PptOutputFormat.Json,
parse_method: ParseDocumentType.DeepDOC,
},
],
};
@@ -243,7 +245,7 @@ export const FileTypeSuffixMap = {
[FileType.Email]: ['eml', 'msg'],
[FileType.TextMarkdown]: ['md', 'markdown', 'mdx', 'txt'],
[FileType.Docx]: ['doc', 'docx'],
[FileType.PowerPoint]: ['pptx'],
[FileType.PowerPoint]: ['pptx', 'ppt'],
[FileType.Video]: ['mp4', 'avi', 'mkv'],
[FileType.Audio]: [
'da',

@@ -22,7 +22,8 @@ import { Switch } from '@/components/ui/switch';
import { LlmModelType } from '@/constants/knowledge';
import { useFindLlmByUuid } from '@/hooks/use-llm-request';
import { zodResolver } from '@hookform/resolvers/zod';
import { memo, useCallback, useEffect, useMemo } from 'react';
import { get } from 'lodash';
import { memo, useEffect, useMemo } from 'react';
import { useForm, useWatch } from 'react-hook-form';
import { useTranslation } from 'react-i18next';
import { z } from 'zod';
@@ -45,7 +46,10 @@ import { AgentTools, Agents } from './agent-tools';
import { StructuredOutputDialog } from './structured-output-dialog';
import { StructuredOutputPanel } from './structured-output-panel';
import { useBuildPromptExtraPromptOptions } from './use-build-prompt-options';
import { useShowStructuredOutputDialog } from './use-show-structured-output-dialog';
import {
useHandleShowStructuredOutput,
useShowStructuredOutputDialog,
} from './use-show-structured-output-dialog';
import { useValues } from './use-values';
import { useWatchFormChange } from './use-watch-change';

@@ -121,22 +125,19 @@ function AgentForm({ node }: INextOperatorForm) {
});

const {
initialStructuredOutput,
showStructuredOutputDialog,
structuredOutputDialogVisible,
hideStructuredOutputDialog,
handleStructuredOutputDialogOk,
} = useShowStructuredOutputDialog(node?.id);

const updateNodeForm = useGraphStore((state) => state.updateNodeForm);
const structuredOutput = get(
node,
`data.form.outputs.${AgentStructuredOutputField}`,
);

const handleShowStructuredOutput = useCallback(
(val: boolean) => {
if (node?.id && val) {
updateNodeForm(node?.id, {}, ['outputs', AgentStructuredOutputField]);
}
},
[node?.id, updateNodeForm],
const { handleShowStructuredOutput } = useHandleShowStructuredOutput(
node?.id,
);

useEffect(() => {
@@ -327,7 +328,7 @@ function AgentForm({ node }: INextOperatorForm) {
</div>

<StructuredOutputPanel
value={initialStructuredOutput}
value={structuredOutput}
></StructuredOutputPanel>
</section>
)}
@@ -337,7 +338,7 @@ function AgentForm({ node }: INextOperatorForm) {
<StructuredOutputDialog
hideModal={hideStructuredOutputDialog}
onOk={handleStructuredOutputDialogOk}
initialValues={initialStructuredOutput}
initialValues={structuredOutput}
></StructuredOutputDialog>
)}
</>

@@ -1,6 +1,8 @@
import { JSONSchema } from '@/components/jsonjoy-builder';
import { AgentStructuredOutputField } from '@/constants/agent';
import { useSetModalState } from '@/hooks/common-hooks';
import { useCallback } from 'react';
import { initialAgentValues } from '../../constant';
import useGraphStore from '../../store';

export function useShowStructuredOutputDialog(nodeId?: string) {
@@ -9,15 +11,13 @@ export function useShowStructuredOutputDialog(nodeId?: string) {
showModal: showStructuredOutputDialog,
hideModal: hideStructuredOutputDialog,
} = useSetModalState();
const { updateNodeForm, getNode } = useGraphStore((state) => state);

const initialStructuredOutput = getNode(nodeId)?.data.form.outputs.structured;
const { updateNodeForm } = useGraphStore((state) => state);

const handleStructuredOutputDialogOk = useCallback(
(values: JSONSchema) => {
// Sync data to canvas
if (nodeId) {
updateNodeForm(nodeId, values, ['outputs', 'structured']);
updateNodeForm(nodeId, values, ['outputs', AgentStructuredOutputField]);
}
hideStructuredOutputDialog();
},
@@ -25,10 +25,30 @@ export function useShowStructuredOutputDialog(nodeId?: string) {
);

return {
initialStructuredOutput,
structuredOutputDialogVisible,
showStructuredOutputDialog,
hideStructuredOutputDialog,
handleStructuredOutputDialogOk,
};
}

export function useHandleShowStructuredOutput(nodeId?: string) {
const updateNodeForm = useGraphStore((state) => state.updateNodeForm);

const handleShowStructuredOutput = useCallback(
(val: boolean) => {
if (nodeId) {
if (val) {
updateNodeForm(nodeId, {}, ['outputs', AgentStructuredOutputField]);
} else {
updateNodeForm(nodeId, initialAgentValues.outputs, ['outputs']);
}
}
},
[nodeId, updateNodeForm],
);

return {
handleShowStructuredOutput,
};
}
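A rough sketch of how the two hooks above are meant to be combined in a consumer such as AgentForm (names follow this diff; the exact wiring is an assumption):

const {
  structuredOutputDialogVisible,
  showStructuredOutputDialog,
  hideStructuredOutputDialog,
  handleStructuredOutputDialogOk,
} = useShowStructuredOutputDialog(node?.id);
const { handleShowStructuredOutput } = useHandleShowStructuredOutput(node?.id);

// Toggling on seeds an empty structured-output slot on the node's outputs;
// toggling off restores initialAgentValues.outputs.
handleShowStructuredOutput(true);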
@@ -6,8 +6,10 @@ import { initialAgentValues } from '../../constant';

// You need to exclude the mcp and tools fields that are not in the form,
// otherwise the form data update will reset the tools or mcp data to an array
// Exclude data that is not in the form to avoid writing this data to the canvas when using useWatch.
// Outputs, tools, and MCP data are directly synchronized to the canvas without going through the form.
function omitToolsAndMcp(values: Record<string, any>) {
return omit(values, ['mcp', 'tools']);
return omit(values, ['mcp', 'tools', 'outputs']);
}

export function useValues(node?: RAGFlowNodeType) {
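To make the exclusion above concrete, a small illustration (sample values are hypothetical):

// Given a form snapshot like this, only the form-managed fields are written back:
omitToolsAndMcp({ prompts: 'hi', cite: true, tools: [], mcp: [], outputs: {} });
// => { prompts: 'hi', cite: true }
// tools, mcp and outputs are synchronized to the canvas separately, not via useWatch.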
@@ -1,7 +1,6 @@
import { omit } from 'lodash';
import { useEffect } from 'react';
import { UseFormReturn, useWatch } from 'react-hook-form';
import { AgentStructuredOutputField, PromptRole } from '../../constant';
import { PromptRole } from '../../constant';
import useGraphStore from '../../store';

export function useWatchFormChange(id?: string, form?: UseFormReturn<any>) {
@@ -17,14 +16,6 @@ export function useWatchFormChange(id?: string, form?: UseFormReturn<any>) {
prompts: [{ role: PromptRole.User, content: values.prompts }],
};

if (!values.showStructuredOutput) {
nextValues = {
...nextValues,
outputs: omit(values.outputs, [AgentStructuredOutputField]),
};
} else {
nextValues = omit(nextValues, 'outputs');
}
updateNodeForm(id, nextValues);
}
}, [form?.formState.isDirty, id, updateNodeForm, values]);

@@ -34,6 +34,8 @@ import { OutputFormatFormField } from './common-form-fields';
import { EmailFormFields } from './email-form-fields';
import { ImageFormFields } from './image-form-fields';
import { PdfFormFields } from './pdf-form-fields';
import { PptFormFields } from './ppt-form-fields';
import { SpreadsheetFormFields } from './spreadsheet-form-fields';
import { buildFieldNameWithPrefix } from './utils';
import { AudioFormFields, VideoFormFields } from './video-form-fields';

@@ -41,6 +43,8 @@ const outputList = buildOutputList(initialParserValues.outputs);

const FileFormatWidgetMap = {
[FileType.PDF]: PdfFormFields,
[FileType.Spreadsheet]: SpreadsheetFormFields,
[FileType.PowerPoint]: PptFormFields,
[FileType.Video]: VideoFormFields,
[FileType.Audio]: AudioFormFields,
[FileType.Email]: EmailFormFields,
@@ -65,6 +69,8 @@ export const FormSchema = z.object({
fields: z.array(z.string()).optional(),
llm_id: z.string().optional(),
system_prompt: z.string().optional(),
table_result_type: z.string().optional(),
markdown_image_response_type: z.string().optional(),
}),
),
});
@@ -184,6 +190,8 @@ const ParserForm = ({ node }: INextOperatorForm) => {
lang: '',
fields: [],
llm_id: '',
table_result_type: '',
markdown_image_response_type: '',
});
}, [append]);


@@ -1,13 +1,30 @@
import { ParseDocumentType } from '@/components/layout-recognize-form-field';
import {
SelectWithSearch,
SelectWithSearchFlagOptionType,
} from '@/components/originui/select-with-search';
import { RAGFlowFormItem } from '@/components/ragflow-form';
import { isEmpty } from 'lodash';
import { useEffect, useMemo } from 'react';
import { useFormContext, useWatch } from 'react-hook-form';
import { useTranslation } from 'react-i18next';
import { LanguageFormField, ParserMethodFormField } from './common-form-fields';
import { CommonProps } from './interface';
import { useSetInitialLanguage } from './use-set-initial-language';
import { buildFieldNameWithPrefix } from './utils';

const tableResultTypeOptions: SelectWithSearchFlagOptionType[] = [
{ label: 'Markdown', value: '0' },
{ label: 'HTML', value: '1' },
];

const markdownImageResponseTypeOptions: SelectWithSearchFlagOptionType[] = [
{ label: 'URL', value: '0' },
{ label: 'Text', value: '1' },
];

export function PdfFormFields({ prefix }: CommonProps) {
const { t } = useTranslation();
const form = useFormContext();

const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix);
@@ -25,6 +42,12 @@ export function PdfFormFields({ prefix }: CommonProps) {
);
}, [parseMethod]);

const tcadpOptionsShown = useMemo(() => {
return (
!isEmpty(parseMethod) && parseMethod === ParseDocumentType.TCADPParser
);
}, [parseMethod]);

useSetInitialLanguage({ prefix, languageShown });

useEffect(() => {
@@ -36,10 +59,68 @@ export function PdfFormFields({ prefix }: CommonProps) {
}
}, [form, parseMethodName]);

// Set default values for TCADP options when TCADP is selected
useEffect(() => {
if (tcadpOptionsShown) {
const tableResultTypeName = buildFieldNameWithPrefix(
'table_result_type',
prefix,
);
const markdownImageResponseTypeName = buildFieldNameWithPrefix(
'markdown_image_response_type',
prefix,
);

if (isEmpty(form.getValues(tableResultTypeName))) {
form.setValue(tableResultTypeName, '1', {
shouldValidate: true,
shouldDirty: true,
});
}
if (isEmpty(form.getValues(markdownImageResponseTypeName))) {
form.setValue(markdownImageResponseTypeName, '1', {
shouldValidate: true,
shouldDirty: true,
});
}
}
}, [tcadpOptionsShown, form, prefix]);

return (
<>
<ParserMethodFormField prefix={prefix}></ParserMethodFormField>
{languageShown && <LanguageFormField prefix={prefix}></LanguageFormField>}
{tcadpOptionsShown && (
<>
<RAGFlowFormItem
name={buildFieldNameWithPrefix('table_result_type', prefix)}
label={t('flow.tableResultType') || '表格返回形式'}
>
{(field) => (
<SelectWithSearch
value={field.value}
onChange={field.onChange}
options={tableResultTypeOptions}
></SelectWithSearch>
)}
</RAGFlowFormItem>
<RAGFlowFormItem
name={buildFieldNameWithPrefix(
'markdown_image_response_type',
prefix,
)}
label={t('flow.markdownImageResponseType') || '图片返回形式'}
>
{(field) => (
<SelectWithSearch
value={field.value}
onChange={field.onChange}
options={markdownImageResponseTypeOptions}
></SelectWithSearch>
)}
</RAGFlowFormItem>
</>
)}
</>
);
}
125  web/src/pages/agent/form/parser-form/ppt-form-fields.tsx  Normal file
@@ -0,0 +1,125 @@
import { ParseDocumentType } from '@/components/layout-recognize-form-field';
import {
SelectWithSearch,
SelectWithSearchFlagOptionType,
} from '@/components/originui/select-with-search';
import { RAGFlowFormItem } from '@/components/ragflow-form';
import { isEmpty } from 'lodash';
import { useEffect, useMemo } from 'react';
import { useFormContext, useWatch } from 'react-hook-form';
import { useTranslation } from 'react-i18next';
import { ParserMethodFormField } from './common-form-fields';
import { CommonProps } from './interface';
import { buildFieldNameWithPrefix } from './utils';

const tableResultTypeOptions: SelectWithSearchFlagOptionType[] = [
{ label: 'Markdown', value: '0' },
{ label: 'HTML', value: '1' },
];

const markdownImageResponseTypeOptions: SelectWithSearchFlagOptionType[] = [
{ label: 'URL', value: '0' },
{ label: 'Text', value: '1' },
];

export function PptFormFields({ prefix }: CommonProps) {
const { t } = useTranslation();
const form = useFormContext();

const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix);

const parseMethod = useWatch({
name: parseMethodName,
});

// PPT only supports DeepDOC and TCADPParser
const optionsWithoutLLM = [
{ label: ParseDocumentType.DeepDOC, value: ParseDocumentType.DeepDOC },
{
label: ParseDocumentType.TCADPParser,
value: ParseDocumentType.TCADPParser,
},
];

const tcadpOptionsShown = useMemo(() => {
return (
!isEmpty(parseMethod) && parseMethod === ParseDocumentType.TCADPParser
);
}, [parseMethod]);

useEffect(() => {
if (isEmpty(form.getValues(parseMethodName))) {
form.setValue(parseMethodName, ParseDocumentType.DeepDOC, {
shouldValidate: true,
shouldDirty: true,
});
}
}, [form, parseMethodName]);

// Set default values for TCADP options when TCADP is selected
useEffect(() => {
if (tcadpOptionsShown) {
const tableResultTypeName = buildFieldNameWithPrefix(
'table_result_type',
prefix,
);
const markdownImageResponseTypeName = buildFieldNameWithPrefix(
'markdown_image_response_type',
prefix,
);

if (isEmpty(form.getValues(tableResultTypeName))) {
form.setValue(tableResultTypeName, '1', {
shouldValidate: true,
shouldDirty: true,
});
}
if (isEmpty(form.getValues(markdownImageResponseTypeName))) {
form.setValue(markdownImageResponseTypeName, '1', {
shouldValidate: true,
shouldDirty: true,
});
}
}
}, [tcadpOptionsShown, form, prefix]);

return (
<>
<ParserMethodFormField
prefix={prefix}
optionsWithoutLLM={optionsWithoutLLM}
></ParserMethodFormField>
{tcadpOptionsShown && (
<>
<RAGFlowFormItem
name={buildFieldNameWithPrefix('table_result_type', prefix)}
label={t('flow.tableResultType') || '表格返回形式'}
>
{(field) => (
<SelectWithSearch
value={field.value}
onChange={field.onChange}
options={tableResultTypeOptions}
></SelectWithSearch>
)}
</RAGFlowFormItem>
<RAGFlowFormItem
name={buildFieldNameWithPrefix(
'markdown_image_response_type',
prefix,
)}
label={t('flow.markdownImageResponseType') || '图片返回形式'}
>
{(field) => (
<SelectWithSearch
value={field.value}
onChange={field.onChange}
options={markdownImageResponseTypeOptions}
></SelectWithSearch>
)}
</RAGFlowFormItem>
</>
)}
</>
);
}
125  web/src/pages/agent/form/parser-form/spreadsheet-form-fields.tsx  Normal file
@@ -0,0 +1,125 @@
import { ParseDocumentType } from '@/components/layout-recognize-form-field';
import {
SelectWithSearch,
SelectWithSearchFlagOptionType,
} from '@/components/originui/select-with-search';
import { RAGFlowFormItem } from '@/components/ragflow-form';
import { isEmpty } from 'lodash';
import { useEffect, useMemo } from 'react';
import { useFormContext, useWatch } from 'react-hook-form';
import { useTranslation } from 'react-i18next';
import { ParserMethodFormField } from './common-form-fields';
import { CommonProps } from './interface';
import { buildFieldNameWithPrefix } from './utils';

const tableResultTypeOptions: SelectWithSearchFlagOptionType[] = [
{ label: 'Markdown', value: '0' },
{ label: 'HTML', value: '1' },
];

const markdownImageResponseTypeOptions: SelectWithSearchFlagOptionType[] = [
{ label: 'URL', value: '0' },
{ label: 'Text', value: '1' },
];

export function SpreadsheetFormFields({ prefix }: CommonProps) {
const { t } = useTranslation();
const form = useFormContext();

const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix);

const parseMethod = useWatch({
name: parseMethodName,
});

// Spreadsheet only supports DeepDOC and TCADPParser
const optionsWithoutLLM = [
{ label: ParseDocumentType.DeepDOC, value: ParseDocumentType.DeepDOC },
{
label: ParseDocumentType.TCADPParser,
value: ParseDocumentType.TCADPParser,
},
];

const tcadpOptionsShown = useMemo(() => {
return (
!isEmpty(parseMethod) && parseMethod === ParseDocumentType.TCADPParser
);
}, [parseMethod]);

useEffect(() => {
if (isEmpty(form.getValues(parseMethodName))) {
form.setValue(parseMethodName, ParseDocumentType.DeepDOC, {
shouldValidate: true,
shouldDirty: true,
});
}
}, [form, parseMethodName]);

// Set default values for TCADP options when TCADP is selected
useEffect(() => {
if (tcadpOptionsShown) {
const tableResultTypeName = buildFieldNameWithPrefix(
'table_result_type',
prefix,
);
const markdownImageResponseTypeName = buildFieldNameWithPrefix(
'markdown_image_response_type',
prefix,
);

if (isEmpty(form.getValues(tableResultTypeName))) {
form.setValue(tableResultTypeName, '1', {
shouldValidate: true,
shouldDirty: true,
});
}
if (isEmpty(form.getValues(markdownImageResponseTypeName))) {
form.setValue(markdownImageResponseTypeName, '1', {
shouldValidate: true,
shouldDirty: true,
});
}
}
}, [tcadpOptionsShown, form, prefix]);

return (
<>
<ParserMethodFormField
prefix={prefix}
optionsWithoutLLM={optionsWithoutLLM}
></ParserMethodFormField>
{tcadpOptionsShown && (
<>
<RAGFlowFormItem
name={buildFieldNameWithPrefix('table_result_type', prefix)}
label={t('flow.tableResultType') || '表格返回形式'}
>
{(field) => (
<SelectWithSearch
value={field.value}
onChange={field.onChange}
options={tableResultTypeOptions}
></SelectWithSearch>
)}
</RAGFlowFormItem>
<RAGFlowFormItem
name={buildFieldNameWithPrefix(
'markdown_image_response_type',
prefix,
)}
label={t('flow.markdownImageResponseType') || '图片返回形式'}
>
{(field) => (
<SelectWithSearch
value={field.value}
onChange={field.onChange}
options={markdownImageResponseTypeOptions}
></SelectWithSearch>
)}
</RAGFlowFormItem>
</>
)}
</>
);
}
@@ -1,7 +1,7 @@
import JsonEditor from '@/components/json-edit';
import { BlockButton, Button } from '@/components/ui/button';
import { Input } from '@/components/ui/input';
import { Segmented } from '@/components/ui/segmented';
import { Editor } from '@monaco-editor/react';
import { t } from 'i18next';
import { Trash2, X } from 'lucide-react';
import { useCallback } from 'react';
@@ -31,32 +31,80 @@ export const useObjectFields = () => {
},
[],
);
const validateKeys = (
obj: any,
path: (string | number)[] = [],
): Array<{ path: (string | number)[]; message: string }> => {
const errors: Array<{ path: (string | number)[]; message: string }> = [];

if (obj !== null && typeof obj === 'object' && !Array.isArray(obj)) {
for (const key in obj) {
if (obj.hasOwnProperty(key)) {
if (!/^[a-zA-Z_]+$/.test(key)) {
errors.push({
path: [...path, key],
message: `Key "${key}" is invalid. Keys can only contain letters and underscores.`,
});
}
const nestedErrors = validateKeys(obj[key], [...path, key]);
errors.push(...nestedErrors);
}
}
} else if (Array.isArray(obj)) {
obj.forEach((item, index) => {
const nestedErrors = validateKeys(item, [...path, index]);
errors.push(...nestedErrors);
});
}

return errors;
};
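// Illustration (not part of the change): validateKeys reports one error per offending key,
// recursing into nested objects and array items while recording the path, e.g.
//   validateKeys({ user_name: 'a', 'bad-key1': { ok: 1 } })
//   => [{ path: ['bad-key1'], message: 'Key "bad-key1" is invalid. Keys can only contain letters and underscores.' }]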
const objectRender = useCallback((field: FieldValues) => {
const fieldValue =
typeof field.value === 'object'
? JSON.stringify(field.value, null, 2)
: JSON.stringify({}, null, 2);
console.log('object-render-field', field, fieldValue);
// const fieldValue =
//   typeof field.value === 'object'
//     ? JSON.stringify(field.value, null, 2)
//     : JSON.stringify({}, null, 2);
// console.log('object-render-field', field, fieldValue);
return (
<Editor
height={200}
defaultLanguage="json"
theme="vs-dark"
value={fieldValue}
// <Editor
//   height={200}
//   defaultLanguage="json"
//   theme="vs-dark"
//   value={fieldValue}
//   onChange={field.onChange}
// />
<JsonEditor
value={field.value}
onChange={field.onChange}
height="400px"
options={{
mode: 'code',
navigationBar: false,
mainMenuBar: true,
history: true,
onValidate: (json) => {
return validateKeys(json);
},
}}
/>
);
}, []);

const objectValidate = useCallback((value: any) => {
try {
if (!JSON.parse(value)) {
throw new Error(t('knowledgeDetails.formatTypeError'));
if (validateKeys(value, [])?.length > 0) {
throw new Error(t('flow.formatTypeError'));
}
if (!z.object({}).safeParse(value).success) {
throw new Error(t('flow.formatTypeError'));
}
if (value && typeof value === 'string' && !JSON.parse(value)) {
throw new Error(t('flow.formatTypeError'));
}
return true;
} catch (e) {
throw new Error(t('knowledgeDetails.formatTypeError'));
console.log('object-render-error', e, value);
throw new Error(t('flow.formatTypeError'));
}
}, []);

@@ -219,6 +267,10 @@ export const useObjectFields = () => {
};
const handleCustomSchema = (value: TypesWithArray) => {
switch (value) {
case TypesWithArray.Object:
return z.object({});
case TypesWithArray.ArrayObject:
return z.array(z.object({}));
case TypesWithArray.ArrayString:
return z.array(z.string());
case TypesWithArray.ArrayNumber:

@@ -214,6 +214,36 @@ function transformParserParams(params: ParserFormSchemaType) {
parse_method: cur.parse_method,
lang: cur.lang,
};
// Only include TCADP parameters if TCADP Parser is selected
if (cur.parse_method?.toLowerCase() === 'tcadp parser') {
filteredSetup.table_result_type = cur.table_result_type;
filteredSetup.markdown_image_response_type =
cur.markdown_image_response_type;
}
break;
case FileType.Spreadsheet:
filteredSetup = {
...filteredSetup,
parse_method: cur.parse_method,
};
// Only include TCADP parameters if TCADP Parser is selected
if (cur.parse_method?.toLowerCase() === 'tcadp parser') {
filteredSetup.table_result_type = cur.table_result_type;
filteredSetup.markdown_image_response_type =
cur.markdown_image_response_type;
}
break;
case FileType.PowerPoint:
filteredSetup = {
...filteredSetup,
parse_method: cur.parse_method,
};
// Only include TCADP parameters if TCADP Parser is selected
if (cur.parse_method?.toLowerCase() === 'tcadp parser') {
filteredSetup.table_result_type = cur.table_result_type;
filteredSetup.markdown_image_response_type =
cur.markdown_image_response_type;
}
break;
case FileType.Image:
filteredSetup = {
0  web/src/pages/data-flow/constant.tsx  Normal file
0  web/src/pages/data-flow/form/parser-form/index.tsx  Normal file
40  web/src/pages/data-flow/form/parser-form/ppt-form-fields.tsx  Normal file
@@ -0,0 +1,40 @@
import { ParseDocumentType } from '@/components/layout-recognize-form-field';
import { isEmpty } from 'lodash';
import { useEffect } from 'react';
import { useFormContext } from 'react-hook-form';
import { ParserMethodFormField } from './common-form-fields';
import { CommonProps } from './interface';
import { buildFieldNameWithPrefix } from './utils';

export function PptFormFields({ prefix }: CommonProps) {
const form = useFormContext();

const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix);

// PPT only supports DeepDOC and TCADPParser
const optionsWithoutLLM = [
{ label: ParseDocumentType.DeepDOC, value: ParseDocumentType.DeepDOC },
{
label: ParseDocumentType.TCADPParser,
value: ParseDocumentType.TCADPParser,
},
];

useEffect(() => {
if (isEmpty(form.getValues(parseMethodName))) {
form.setValue(parseMethodName, ParseDocumentType.DeepDOC, {
shouldValidate: true,
shouldDirty: true,
});
}
}, [form, parseMethodName]);

return (
<>
<ParserMethodFormField
prefix={prefix}
optionsWithoutLLM={optionsWithoutLLM}
></ParserMethodFormField>
</>
);
}
@@ -0,0 +1,40 @@
import { ParseDocumentType } from '@/components/layout-recognize-form-field';
import { isEmpty } from 'lodash';
import { useEffect } from 'react';
import { useFormContext } from 'react-hook-form';
import { ParserMethodFormField } from './common-form-fields';
import { CommonProps } from './interface';
import { buildFieldNameWithPrefix } from './utils';

export function SpreadsheetFormFields({ prefix }: CommonProps) {
const form = useFormContext();

const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix);

// Spreadsheet only supports DeepDOC and TCADPParser
const optionsWithoutLLM = [
{ label: ParseDocumentType.DeepDOC, value: ParseDocumentType.DeepDOC },
{
label: ParseDocumentType.TCADPParser,
value: ParseDocumentType.TCADPParser,
},
];

useEffect(() => {
if (isEmpty(form.getValues(parseMethodName))) {
form.setValue(parseMethodName, ParseDocumentType.DeepDOC, {
shouldValidate: true,
shouldDirty: true,
});
}
}, [form, parseMethodName]);

return (
<>
<ParserMethodFormField
prefix={prefix}
optionsWithoutLLM={optionsWithoutLLM}
></ParserMethodFormField>
</>
);
}
0  web/src/pages/data-flow/utils.ts  Normal file