mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Feat: debug extractor... (#10294)
### What problem does this PR solve? ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -153,6 +153,16 @@ class Graph:
|
|||||||
def get_tenant_id(self):
|
def get_tenant_id(self):
|
||||||
return self._tenant_id
|
return self._tenant_id
|
||||||
|
|
||||||
|
def get_variable_value(self, exp: str) -> Any:
|
||||||
|
exp = exp.strip("{").strip("}").strip(" ").strip("{").strip("}")
|
||||||
|
if exp.find("@") < 0:
|
||||||
|
return self.globals[exp]
|
||||||
|
cpn_id, var_nm = exp.split("@")
|
||||||
|
cpn = self.get_component(cpn_id)
|
||||||
|
if not cpn:
|
||||||
|
raise Exception(f"Can't find variable: '{cpn_id}@{var_nm}'")
|
||||||
|
return cpn["obj"].output(var_nm)
|
||||||
|
|
||||||
|
|
||||||
class Canvas(Graph):
|
class Canvas(Graph):
|
||||||
|
|
||||||
@ -406,16 +416,6 @@ class Canvas(Graph):
|
|||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def get_variable_value(self, exp: str) -> Any:
|
|
||||||
exp = exp.strip("{").strip("}").strip(" ").strip("{").strip("}")
|
|
||||||
if exp.find("@") < 0:
|
|
||||||
return self.globals[exp]
|
|
||||||
cpn_id, var_nm = exp.split("@")
|
|
||||||
cpn = self.get_component(cpn_id)
|
|
||||||
if not cpn:
|
|
||||||
raise Exception(f"Can't find variable: '{cpn_id}@{var_nm}'")
|
|
||||||
return cpn["obj"].output(var_nm)
|
|
||||||
|
|
||||||
def get_history(self, window_size):
|
def get_history(self, window_size):
|
||||||
convs = []
|
convs = []
|
||||||
if window_size <= 0:
|
if window_size <= 0:
|
||||||
|
|||||||
@ -102,6 +102,8 @@ class LLM(ComponentBase):
|
|||||||
|
|
||||||
def get_input_elements(self) -> dict[str, Any]:
|
def get_input_elements(self) -> dict[str, Any]:
|
||||||
res = self.get_input_elements_from_text(self._param.sys_prompt)
|
res = self.get_input_elements_from_text(self._param.sys_prompt)
|
||||||
|
if isinstance(self._param.prompts, str):
|
||||||
|
self._param.prompts = [{"role": "user", "content": self._param.prompts}]
|
||||||
for prompt in self._param.prompts:
|
for prompt in self._param.prompts:
|
||||||
d = self.get_input_elements_from_text(prompt["content"])
|
d = self.get_input_elements_from_text(prompt["content"])
|
||||||
res.update(d)
|
res.update(d)
|
||||||
@ -114,6 +116,8 @@ class LLM(ComponentBase):
|
|||||||
self._param.sys_prompt += txt
|
self._param.sys_prompt += txt
|
||||||
|
|
||||||
def _sys_prompt_and_msg(self, msg, args):
|
def _sys_prompt_and_msg(self, msg, args):
|
||||||
|
if isinstance(self._param.prompts, str):
|
||||||
|
self._param.prompts = [{"role": "user", "content": self._param.prompts}]
|
||||||
for p in self._param.prompts:
|
for p in self._param.prompts:
|
||||||
if msg and msg[-1]["role"] == p["role"]:
|
if msg and msg[-1]["role"] == p["role"]:
|
||||||
continue
|
continue
|
||||||
|
|||||||
@ -14,9 +14,10 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
import random
|
import random
|
||||||
from agent.component.llm import LLMParam, LLM
|
from agent.component.llm import LLMParam, LLM
|
||||||
|
from rag.flow.base import ProcessBase, ProcessParamBase
|
||||||
|
|
||||||
|
|
||||||
class ExtractorParam(LLMParam):
|
class ExtractorParam(ProcessParamBase, LLMParam):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.field_name = ""
|
self.field_name = ""
|
||||||
@ -26,7 +27,7 @@ class ExtractorParam(LLMParam):
|
|||||||
self.check_empty(self.field_name, "Result Destination")
|
self.check_empty(self.field_name, "Result Destination")
|
||||||
|
|
||||||
|
|
||||||
class Extractor(LLM):
|
class Extractor(ProcessBase, LLM):
|
||||||
component_name = "Extractor"
|
component_name = "Extractor"
|
||||||
|
|
||||||
async def _invoke(self, **kwargs):
|
async def _invoke(self, **kwargs):
|
||||||
|
|||||||
@ -30,7 +30,7 @@ class ExtractorFromUpstream(BaseModel):
|
|||||||
json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
|
json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
|
||||||
markdown_result: str | None = Field(default=None, alias="markdown")
|
markdown_result: str | None = Field(default=None, alias="markdown")
|
||||||
text_result: str | None = Field(default=None, alias="text")
|
text_result: str | None = Field(default=None, alias="text")
|
||||||
html_result: list[str] | None = Field(default=None, alias="html")
|
html_result: str | None = Field(default=None, alias="html")
|
||||||
|
|
||||||
model_config = ConfigDict(populate_by_name=True, extra="forbid")
|
model_config = ConfigDict(populate_by_name=True, extra="forbid")
|
||||||
|
|
||||||
|
|||||||
@ -29,7 +29,7 @@ class HierarchicalMergerFromUpstream(BaseModel):
|
|||||||
json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
|
json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
|
||||||
markdown_result: str | None = Field(default=None, alias="markdown")
|
markdown_result: str | None = Field(default=None, alias="markdown")
|
||||||
text_result: str | None = Field(default=None, alias="text")
|
text_result: str | None = Field(default=None, alias="text")
|
||||||
html_result: list[str] | None = Field(default=None, alias="html")
|
html_result: str | None = Field(default=None, alias="html")
|
||||||
|
|
||||||
model_config = ConfigDict(populate_by_name=True, extra="forbid")
|
model_config = ConfigDict(populate_by_name=True, extra="forbid")
|
||||||
|
|
||||||
|
|||||||
@ -143,6 +143,10 @@ class Pipeline(Graph):
|
|||||||
async def invoke():
|
async def invoke():
|
||||||
nonlocal last_cpn, cpn_obj
|
nonlocal last_cpn, cpn_obj
|
||||||
await cpn_obj.invoke(**last_cpn.output())
|
await cpn_obj.invoke(**last_cpn.output())
|
||||||
|
#if inspect.iscoroutinefunction(cpn_obj.invoke):
|
||||||
|
# await cpn_obj.invoke(**last_cpn.output())
|
||||||
|
#else:
|
||||||
|
# cpn_obj.invoke(**last_cpn.output())
|
||||||
|
|
||||||
async with trio.open_nursery() as nursery:
|
async with trio.open_nursery() as nursery:
|
||||||
nursery.start_soon(invoke)
|
nursery.start_soon(invoke)
|
||||||
|
|||||||
@ -30,7 +30,7 @@ class SplitterFromUpstream(BaseModel):
|
|||||||
json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
|
json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
|
||||||
markdown_result: str | None = Field(default=None, alias="markdown")
|
markdown_result: str | None = Field(default=None, alias="markdown")
|
||||||
text_result: str | None = Field(default=None, alias="text")
|
text_result: str | None = Field(default=None, alias="text")
|
||||||
html_result: list[str] | None = Field(default=None, alias="html")
|
html_result: str | None = Field(default=None, alias="html")
|
||||||
|
|
||||||
model_config = ConfigDict(populate_by_name=True, extra="forbid")
|
model_config = ConfigDict(populate_by_name=True, extra="forbid")
|
||||||
|
|
||||||
|
|||||||
@ -31,7 +31,7 @@ class TokenizerFromUpstream(BaseModel):
|
|||||||
json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
|
json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
|
||||||
markdown_result: str | None = Field(default=None, alias="markdown")
|
markdown_result: str | None = Field(default=None, alias="markdown")
|
||||||
text_result: str | None = Field(default=None, alias="text")
|
text_result: str | None = Field(default=None, alias="text")
|
||||||
html_result: list[str] | None = Field(default=None, alias="html")
|
html_result: str | None = Field(default=None, alias="html")
|
||||||
|
|
||||||
model_config = ConfigDict(populate_by_name=True, extra="forbid")
|
model_config = ConfigDict(populate_by_name=True, extra="forbid")
|
||||||
|
|
||||||
|
|||||||
@ -119,7 +119,7 @@ class Tokenizer(ProcessBase):
|
|||||||
if ck.get("questions"):
|
if ck.get("questions"):
|
||||||
ck["question_tks"] = rag_tokenizer.tokenize("\n".join(ck["questions"]))
|
ck["question_tks"] = rag_tokenizer.tokenize("\n".join(ck["questions"]))
|
||||||
if ck.get("keywords"):
|
if ck.get("keywords"):
|
||||||
ck["important_tks"] = rag_tokenizer.tokenize("\n".join(ck["keywords"]))
|
ck["important_tks"] = rag_tokenizer.tokenize(",".join(ck["keywords"]))
|
||||||
if ck.get("summary"):
|
if ck.get("summary"):
|
||||||
ck["content_ltks"] = rag_tokenizer.tokenize(ck["summary"])
|
ck["content_ltks"] = rag_tokenizer.tokenize(ck["summary"])
|
||||||
ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
|
ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
|
||||||
|
|||||||
@ -222,10 +222,9 @@ async def collect():
|
|||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
canceled = False
|
canceled = False
|
||||||
if msg.get("doc_id", "") == GRAPH_RAPTOR_FAKE_DOC_ID:
|
if msg.get("doc_id", "") in [GRAPH_RAPTOR_FAKE_DOC_ID, CANVAS_DEBUG_DOC_ID]:
|
||||||
task = msg
|
task = msg
|
||||||
if task["task_type"] in ["graphrag", "raptor"] and msg.get("doc_ids", []):
|
if task["task_type"] in ["graphrag", "raptor"] and msg.get("doc_ids", []):
|
||||||
print(f"hack {msg['doc_ids']=}=",flush=True)
|
|
||||||
task = TaskService.get_task(msg["id"], msg["doc_ids"])
|
task = TaskService.get_task(msg["id"], msg["doc_ids"])
|
||||||
task["doc_ids"] = msg["doc_ids"]
|
task["doc_ids"] = msg["doc_ids"]
|
||||||
else:
|
else:
|
||||||
|
|||||||
Reference in New Issue
Block a user