Feat: debug extractor... (#10294)

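### What problem does this PR solve?

Fixes found while debugging the `Extractor` pipeline component:

- `ExtractorParam` and `Extractor` now also inherit from `ProcessParamBase` and `ProcessBase` (imported from `rag.flow.base`), in addition to `LLMParam` and `LLM`.
- The `html_result` field (alias `html`) of the `ExtractorFromUpstream`, `HierarchicalMergerFromUpstream`, `SplitterFromUpstream` and `TokenizerFromUpstream` schemas changes from `list[str] | None` to `str | None`.
- `Tokenizer` joins a chunk's `keywords` with `","` instead of `"\n"` before tokenizing them into `important_tks`.
- `Pipeline` gains a commented-out `inspect.iscoroutinefunction` check around `cpn_obj.invoke`; the active call is unchanged.
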
### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Kevin Hu
2025-09-26 10:51:05 +08:00
committed by GitHub
parent ff49454501
commit c7efaab30e
10 changed files with 27 additions and 19 deletions

View File

@@ -14,9 +14,10 @@
# limitations under the License.
import random
from agent.component.llm import LLMParam, LLM
from rag.flow.base import ProcessBase, ProcessParamBase
class ExtractorParam(LLMParam):
class ExtractorParam(ProcessParamBase, LLMParam):
def __init__(self):
super().__init__()
self.field_name = ""
@@ -26,7 +27,7 @@ class ExtractorParam(LLMParam):
self.check_empty(self.field_name, "Result Destination")
class Extractor(LLM):
class Extractor(ProcessBase, LLM):
component_name = "Extractor"
async def _invoke(self, **kwargs):

View File

@@ -30,7 +30,7 @@ class ExtractorFromUpstream(BaseModel):
json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
markdown_result: str | None = Field(default=None, alias="markdown")
text_result: str | None = Field(default=None, alias="text")
html_result: list[str] | None = Field(default=None, alias="html")
html_result: str | None = Field(default=None, alias="html")
model_config = ConfigDict(populate_by_name=True, extra="forbid")

View File

@@ -29,7 +29,7 @@ class HierarchicalMergerFromUpstream(BaseModel):
json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
markdown_result: str | None = Field(default=None, alias="markdown")
text_result: str | None = Field(default=None, alias="text")
html_result: list[str] | None = Field(default=None, alias="html")
html_result: str | None = Field(default=None, alias="html")
model_config = ConfigDict(populate_by_name=True, extra="forbid")

View File

@@ -143,6 +143,10 @@ class Pipeline(Graph):
async def invoke():
nonlocal last_cpn, cpn_obj
await cpn_obj.invoke(**last_cpn.output())
#if inspect.iscoroutinefunction(cpn_obj.invoke):
# await cpn_obj.invoke(**last_cpn.output())
#else:
# cpn_obj.invoke(**last_cpn.output())
async with trio.open_nursery() as nursery:
nursery.start_soon(invoke)

View File

@@ -30,7 +30,7 @@ class SplitterFromUpstream(BaseModel):
json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
markdown_result: str | None = Field(default=None, alias="markdown")
text_result: str | None = Field(default=None, alias="text")
html_result: list[str] | None = Field(default=None, alias="html")
html_result: str | None = Field(default=None, alias="html")
model_config = ConfigDict(populate_by_name=True, extra="forbid")

View File

@@ -31,7 +31,7 @@ class TokenizerFromUpstream(BaseModel):
json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
markdown_result: str | None = Field(default=None, alias="markdown")
text_result: str | None = Field(default=None, alias="text")
html_result: list[str] | None = Field(default=None, alias="html")
html_result: str | None = Field(default=None, alias="html")
model_config = ConfigDict(populate_by_name=True, extra="forbid")

View File

@@ -119,7 +119,7 @@ class Tokenizer(ProcessBase):
if ck.get("questions"):
ck["question_tks"] = rag_tokenizer.tokenize("\n".join(ck["questions"]))
if ck.get("keywords"):
ck["important_tks"] = rag_tokenizer.tokenize("\n".join(ck["keywords"]))
ck["important_tks"] = rag_tokenizer.tokenize(",".join(ck["keywords"]))
if ck.get("summary"):
ck["content_ltks"] = rag_tokenizer.tokenize(ck["summary"])
ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])