mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Add graphrag (#1793)
### What problem does this PR solve? #1594 ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
25
agent/component/__init__.py
Normal file
25
agent/component/__init__.py
Normal file
@ -0,0 +1,25 @@
|
||||
import importlib
|
||||
from .begin import Begin, BeginParam
|
||||
from .generate import Generate, GenerateParam
|
||||
from .retrieval import Retrieval, RetrievalParam
|
||||
from .answer import Answer, AnswerParam
|
||||
from .categorize import Categorize, CategorizeParam
|
||||
from .switch import Switch, SwitchParam
|
||||
from .relevant import Relevant, RelevantParam
|
||||
from .message import Message, MessageParam
|
||||
from .rewrite import RewriteQuestion, RewriteQuestionParam
|
||||
from .keyword import KeywordExtract, KeywordExtractParam
|
||||
from .baidu import Baidu, BaiduParam
|
||||
from .duckduckgo import DuckDuckGo, DuckDuckGoParam
|
||||
from .wikipedia import Wikipedia, WikipediaParam
|
||||
from .pubmed import PubMed, PubMedParam
|
||||
from .arxiv import ArXiv, ArXivParam
|
||||
from .google import Google, GoogleParam
|
||||
from .bing import Bing, BingParam
|
||||
from .googlescholar import GoogleScholar, GoogleScholarParam
|
||||
|
||||
|
||||
def component_class(class_name):
|
||||
m = importlib.import_module("graph.component")
|
||||
c = getattr(m, class_name)
|
||||
return c
|
||||
79
agent/component/answer.py
Normal file
79
agent/component/answer.py
Normal file
@ -0,0 +1,79 @@
|
||||
#
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import random
|
||||
from abc import ABC
|
||||
from functools import partial
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from agent.component.base import ComponentBase, ComponentParamBase
|
||||
|
||||
|
||||
class AnswerParam(ComponentParamBase):
|
||||
|
||||
"""
|
||||
Define the Answer component parameters.
|
||||
"""
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.post_answers = []
|
||||
|
||||
def check(self):
|
||||
return True
|
||||
|
||||
|
||||
class Answer(ComponentBase, ABC):
|
||||
component_name = "Answer"
|
||||
|
||||
def _run(self, history, **kwargs):
|
||||
if kwargs.get("stream"):
|
||||
return partial(self.stream_output)
|
||||
|
||||
ans = self.get_input()
|
||||
if self._param.post_answers:
|
||||
ans = pd.concat([ans, pd.DataFrame([{"content": random.choice(self._param.post_answers)}])], ignore_index=False)
|
||||
return ans
|
||||
|
||||
def stream_output(self):
|
||||
res = None
|
||||
if hasattr(self, "exception") and self.exception:
|
||||
res = {"content": str(self.exception)}
|
||||
self.exception = None
|
||||
yield res
|
||||
self.set_output(res)
|
||||
return
|
||||
|
||||
stream = self.get_stream_input()
|
||||
if isinstance(stream, pd.DataFrame):
|
||||
res = stream
|
||||
answer = ""
|
||||
for ii, row in stream.iterrows():
|
||||
answer += row.to_dict()["content"]
|
||||
yield {"content": answer}
|
||||
else:
|
||||
for st in stream():
|
||||
res = st
|
||||
yield st
|
||||
if self._param.post_answers:
|
||||
res["content"] += random.choice(self._param.post_answers)
|
||||
yield res
|
||||
|
||||
self.set_output(res)
|
||||
|
||||
def set_exception(self, e):
|
||||
self.exception = e
|
||||
|
||||
|
||||
69
agent/component/arxiv.py
Normal file
69
agent/component/arxiv.py
Normal file
@ -0,0 +1,69 @@
|
||||
#
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from abc import ABC
|
||||
import arxiv
|
||||
import pandas as pd
|
||||
from agent.settings import DEBUG
|
||||
from agent.component.base import ComponentBase, ComponentParamBase
|
||||
|
||||
|
||||
class ArXivParam(ComponentParamBase):
|
||||
"""
|
||||
Define the ArXiv component parameters.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.top_n = 6
|
||||
self.sort_by = 'submittedDate'
|
||||
|
||||
def check(self):
|
||||
self.check_positive_integer(self.top_n, "Top N")
|
||||
self.check_valid_value(self.sort_by, "ArXiv Search Sort_by",
|
||||
['submittedDate', 'lastUpdatedDate', 'relevance'])
|
||||
|
||||
|
||||
class ArXiv(ComponentBase, ABC):
|
||||
component_name = "ArXiv"
|
||||
|
||||
def _run(self, history, **kwargs):
|
||||
ans = self.get_input()
|
||||
ans = " - ".join(ans["content"]) if "content" in ans else ""
|
||||
if not ans:
|
||||
return ArXiv.be_output("")
|
||||
|
||||
try:
|
||||
sort_choices = {"relevance": arxiv.SortCriterion.Relevance,
|
||||
"lastUpdatedDate": arxiv.SortCriterion.LastUpdatedDate,
|
||||
'submittedDate': arxiv.SortCriterion.SubmittedDate}
|
||||
arxiv_client = arxiv.Client()
|
||||
search = arxiv.Search(
|
||||
query=ans,
|
||||
max_results=self._param.top_n,
|
||||
sort_by=sort_choices[self._param.sort_by]
|
||||
)
|
||||
arxiv_res = [
|
||||
{"content": 'Title: ' + i.title + '\nPdf_Url: <a href="' + i.pdf_url + '"></a> \nSummary: ' + i.summary} for
|
||||
i in list(arxiv_client.results(search))]
|
||||
except Exception as e:
|
||||
return ArXiv.be_output("**ERROR**: " + str(e))
|
||||
|
||||
if not arxiv_res:
|
||||
return ArXiv.be_output("")
|
||||
|
||||
df = pd.DataFrame(arxiv_res)
|
||||
if DEBUG: print(df, ":::::::::::::::::::::::::::::::::")
|
||||
return df
|
||||
69
agent/component/baidu.py
Normal file
69
agent/component/baidu.py
Normal file
@ -0,0 +1,69 @@
|
||||
#
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import random
|
||||
from abc import ABC
|
||||
from functools import partial
|
||||
import pandas as pd
|
||||
import requests
|
||||
import re
|
||||
from agent.settings import DEBUG
|
||||
from agent.component.base import ComponentBase, ComponentParamBase
|
||||
|
||||
|
||||
class BaiduParam(ComponentParamBase):
|
||||
"""
|
||||
Define the Baidu component parameters.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.top_n = 10
|
||||
|
||||
def check(self):
|
||||
self.check_positive_integer(self.top_n, "Top N")
|
||||
|
||||
|
||||
class Baidu(ComponentBase, ABC):
|
||||
component_name = "Baidu"
|
||||
|
||||
def _run(self, history, **kwargs):
|
||||
ans = self.get_input()
|
||||
ans = " - ".join(ans["content"]) if "content" in ans else ""
|
||||
if not ans:
|
||||
return Baidu.be_output("")
|
||||
|
||||
try:
|
||||
url = 'https://www.baidu.com/s?wd=' + ans + '&rn=' + str(self._param.top_n)
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'}
|
||||
response = requests.get(url=url, headers=headers)
|
||||
|
||||
url_res = re.findall(r"'url': \\\"(.*?)\\\"}", response.text)
|
||||
title_res = re.findall(r"'title': \\\"(.*?)\\\",\\n", response.text)
|
||||
body_res = re.findall(r"\"contentText\":\"(.*?)\"", response.text)
|
||||
baidu_res = [{"content": re.sub('<em>|</em>', '', '<a href="' + url + '">' + title + '</a> ' + body)} for
|
||||
url, title, body in zip(url_res, title_res, body_res)]
|
||||
del body_res, url_res, title_res
|
||||
except Exception as e:
|
||||
return Baidu.be_output("**ERROR**: " + str(e))
|
||||
|
||||
if not baidu_res:
|
||||
return Baidu.be_output("")
|
||||
|
||||
df = pd.DataFrame(baidu_res)
|
||||
if DEBUG: print(df, ":::::::::::::::::::::::::::::::::")
|
||||
return df
|
||||
|
||||
494
agent/component/base.py
Normal file
494
agent/component/base.py
Normal file
@ -0,0 +1,494 @@
|
||||
#
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from abc import ABC
|
||||
import builtins
|
||||
import json
|
||||
import os
|
||||
from copy import deepcopy
|
||||
from functools import partial
|
||||
from typing import List, Dict, Tuple, Union
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from agent import settings
|
||||
from agent.settings import flow_logger, DEBUG
|
||||
|
||||
_FEEDED_DEPRECATED_PARAMS = "_feeded_deprecated_params"
|
||||
_DEPRECATED_PARAMS = "_deprecated_params"
|
||||
_USER_FEEDED_PARAMS = "_user_feeded_params"
|
||||
_IS_RAW_CONF = "_is_raw_conf"
|
||||
|
||||
|
||||
class ComponentParamBase(ABC):
|
||||
def __init__(self):
|
||||
self.output_var_name = "output"
|
||||
self.message_history_window_size = 22
|
||||
|
||||
def set_name(self, name: str):
|
||||
self._name = name
|
||||
return self
|
||||
|
||||
def check(self):
|
||||
raise NotImplementedError("Parameter Object should be checked.")
|
||||
|
||||
@classmethod
|
||||
def _get_or_init_deprecated_params_set(cls):
|
||||
if not hasattr(cls, _DEPRECATED_PARAMS):
|
||||
setattr(cls, _DEPRECATED_PARAMS, set())
|
||||
return getattr(cls, _DEPRECATED_PARAMS)
|
||||
|
||||
def _get_or_init_feeded_deprecated_params_set(self, conf=None):
|
||||
if not hasattr(self, _FEEDED_DEPRECATED_PARAMS):
|
||||
if conf is None:
|
||||
setattr(self, _FEEDED_DEPRECATED_PARAMS, set())
|
||||
else:
|
||||
setattr(
|
||||
self,
|
||||
_FEEDED_DEPRECATED_PARAMS,
|
||||
set(conf[_FEEDED_DEPRECATED_PARAMS]),
|
||||
)
|
||||
return getattr(self, _FEEDED_DEPRECATED_PARAMS)
|
||||
|
||||
def _get_or_init_user_feeded_params_set(self, conf=None):
|
||||
if not hasattr(self, _USER_FEEDED_PARAMS):
|
||||
if conf is None:
|
||||
setattr(self, _USER_FEEDED_PARAMS, set())
|
||||
else:
|
||||
setattr(self, _USER_FEEDED_PARAMS, set(conf[_USER_FEEDED_PARAMS]))
|
||||
return getattr(self, _USER_FEEDED_PARAMS)
|
||||
|
||||
def get_user_feeded(self):
|
||||
return self._get_or_init_user_feeded_params_set()
|
||||
|
||||
def get_feeded_deprecated_params(self):
|
||||
return self._get_or_init_feeded_deprecated_params_set()
|
||||
|
||||
@property
|
||||
def _deprecated_params_set(self):
|
||||
return {name: True for name in self.get_feeded_deprecated_params()}
|
||||
|
||||
def __str__(self):
|
||||
|
||||
return json.dumps(self.as_dict(), ensure_ascii=False)
|
||||
|
||||
def as_dict(self):
|
||||
def _recursive_convert_obj_to_dict(obj):
|
||||
ret_dict = {}
|
||||
for attr_name in list(obj.__dict__):
|
||||
if attr_name in [_FEEDED_DEPRECATED_PARAMS, _DEPRECATED_PARAMS, _USER_FEEDED_PARAMS, _IS_RAW_CONF]:
|
||||
continue
|
||||
# get attr
|
||||
attr = getattr(obj, attr_name)
|
||||
if isinstance(attr, pd.DataFrame):
|
||||
ret_dict[attr_name] = attr.to_dict()
|
||||
continue
|
||||
if attr and type(attr).__name__ not in dir(builtins):
|
||||
ret_dict[attr_name] = _recursive_convert_obj_to_dict(attr)
|
||||
else:
|
||||
ret_dict[attr_name] = attr
|
||||
|
||||
return ret_dict
|
||||
|
||||
return _recursive_convert_obj_to_dict(self)
|
||||
|
||||
def update(self, conf, allow_redundant=False):
|
||||
update_from_raw_conf = conf.get(_IS_RAW_CONF, True)
|
||||
if update_from_raw_conf:
|
||||
deprecated_params_set = self._get_or_init_deprecated_params_set()
|
||||
feeded_deprecated_params_set = (
|
||||
self._get_or_init_feeded_deprecated_params_set()
|
||||
)
|
||||
user_feeded_params_set = self._get_or_init_user_feeded_params_set()
|
||||
setattr(self, _IS_RAW_CONF, False)
|
||||
else:
|
||||
feeded_deprecated_params_set = (
|
||||
self._get_or_init_feeded_deprecated_params_set(conf)
|
||||
)
|
||||
user_feeded_params_set = self._get_or_init_user_feeded_params_set(conf)
|
||||
|
||||
def _recursive_update_param(param, config, depth, prefix):
|
||||
if depth > settings.PARAM_MAXDEPTH:
|
||||
raise ValueError("Param define nesting too deep!!!, can not parse it")
|
||||
|
||||
inst_variables = param.__dict__
|
||||
redundant_attrs = []
|
||||
for config_key, config_value in config.items():
|
||||
# redundant attr
|
||||
if config_key not in inst_variables:
|
||||
if not update_from_raw_conf and config_key.startswith("_"):
|
||||
setattr(param, config_key, config_value)
|
||||
else:
|
||||
setattr(param, config_key, config_value)
|
||||
# redundant_attrs.append(config_key)
|
||||
continue
|
||||
|
||||
full_config_key = f"{prefix}{config_key}"
|
||||
|
||||
if update_from_raw_conf:
|
||||
# add user feeded params
|
||||
user_feeded_params_set.add(full_config_key)
|
||||
|
||||
# update user feeded deprecated param set
|
||||
if full_config_key in deprecated_params_set:
|
||||
feeded_deprecated_params_set.add(full_config_key)
|
||||
|
||||
# supported attr
|
||||
attr = getattr(param, config_key)
|
||||
if type(attr).__name__ in dir(builtins) or attr is None:
|
||||
setattr(param, config_key, config_value)
|
||||
|
||||
else:
|
||||
# recursive set obj attr
|
||||
sub_params = _recursive_update_param(
|
||||
attr, config_value, depth + 1, prefix=f"{prefix}{config_key}."
|
||||
)
|
||||
setattr(param, config_key, sub_params)
|
||||
|
||||
if not allow_redundant and redundant_attrs:
|
||||
raise ValueError(
|
||||
f"cpn `{getattr(self, '_name', type(self))}` has redundant parameters: `{[redundant_attrs]}`"
|
||||
)
|
||||
|
||||
return param
|
||||
|
||||
return _recursive_update_param(param=self, config=conf, depth=0, prefix="")
|
||||
|
||||
def extract_not_builtin(self):
|
||||
def _get_not_builtin_types(obj):
|
||||
ret_dict = {}
|
||||
for variable in obj.__dict__:
|
||||
attr = getattr(obj, variable)
|
||||
if attr and type(attr).__name__ not in dir(builtins):
|
||||
ret_dict[variable] = _get_not_builtin_types(attr)
|
||||
|
||||
return ret_dict
|
||||
|
||||
return _get_not_builtin_types(self)
|
||||
|
||||
def validate(self):
|
||||
self.builtin_types = dir(builtins)
|
||||
self.func = {
|
||||
"ge": self._greater_equal_than,
|
||||
"le": self._less_equal_than,
|
||||
"in": self._in,
|
||||
"not_in": self._not_in,
|
||||
"range": self._range,
|
||||
}
|
||||
home_dir = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
|
||||
param_validation_path_prefix = home_dir + "/param_validation/"
|
||||
|
||||
param_name = type(self).__name__
|
||||
param_validation_path = "/".join(
|
||||
[param_validation_path_prefix, param_name + ".json"]
|
||||
)
|
||||
|
||||
validation_json = None
|
||||
|
||||
try:
|
||||
with open(param_validation_path, "r") as fin:
|
||||
validation_json = json.loads(fin.read())
|
||||
except BaseException:
|
||||
return
|
||||
|
||||
self._validate_param(self, validation_json)
|
||||
|
||||
def _validate_param(self, param_obj, validation_json):
|
||||
default_section = type(param_obj).__name__
|
||||
var_list = param_obj.__dict__
|
||||
|
||||
for variable in var_list:
|
||||
attr = getattr(param_obj, variable)
|
||||
|
||||
if type(attr).__name__ in self.builtin_types or attr is None:
|
||||
if variable not in validation_json:
|
||||
continue
|
||||
|
||||
validation_dict = validation_json[default_section][variable]
|
||||
value = getattr(param_obj, variable)
|
||||
value_legal = False
|
||||
|
||||
for op_type in validation_dict:
|
||||
if self.func[op_type](value, validation_dict[op_type]):
|
||||
value_legal = True
|
||||
break
|
||||
|
||||
if not value_legal:
|
||||
raise ValueError(
|
||||
"Plase check runtime conf, {} = {} does not match user-parameter restriction".format(
|
||||
variable, value
|
||||
)
|
||||
)
|
||||
|
||||
elif variable in validation_json:
|
||||
self._validate_param(attr, validation_json)
|
||||
|
||||
@staticmethod
|
||||
def check_string(param, descr):
|
||||
if type(param).__name__ not in ["str"]:
|
||||
raise ValueError(
|
||||
descr + " {} not supported, should be string type".format(param)
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def check_empty(param, descr):
|
||||
if not param:
|
||||
raise ValueError(
|
||||
descr + " does not support empty value."
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def check_positive_integer(param, descr):
|
||||
if type(param).__name__ not in ["int", "long"] or param <= 0:
|
||||
raise ValueError(
|
||||
descr + " {} not supported, should be positive integer".format(param)
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def check_positive_number(param, descr):
|
||||
if type(param).__name__ not in ["float", "int", "long"] or param <= 0:
|
||||
raise ValueError(
|
||||
descr + " {} not supported, should be positive numeric".format(param)
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def check_nonnegative_number(param, descr):
|
||||
if type(param).__name__ not in ["float", "int", "long"] or param < 0:
|
||||
raise ValueError(
|
||||
descr
|
||||
+ " {} not supported, should be non-negative numeric".format(param)
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def check_decimal_float(param, descr):
|
||||
if type(param).__name__ not in ["float", "int"] or param < 0 or param > 1:
|
||||
raise ValueError(
|
||||
descr
|
||||
+ " {} not supported, should be a float number in range [0, 1]".format(
|
||||
param
|
||||
)
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def check_boolean(param, descr):
|
||||
if type(param).__name__ != "bool":
|
||||
raise ValueError(
|
||||
descr + " {} not supported, should be bool type".format(param)
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def check_open_unit_interval(param, descr):
|
||||
if type(param).__name__ not in ["float"] or param <= 0 or param >= 1:
|
||||
raise ValueError(
|
||||
descr + " should be a numeric number between 0 and 1 exclusively"
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def check_valid_value(param, descr, valid_values):
|
||||
if param not in valid_values:
|
||||
raise ValueError(
|
||||
descr
|
||||
+ " {} is not supported, it should be in {}".format(param, valid_values)
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def check_defined_type(param, descr, types):
|
||||
if type(param).__name__ not in types:
|
||||
raise ValueError(
|
||||
descr + " {} not supported, should be one of {}".format(param, types)
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def check_and_change_lower(param, valid_list, descr=""):
|
||||
if type(param).__name__ != "str":
|
||||
raise ValueError(
|
||||
descr
|
||||
+ " {} not supported, should be one of {}".format(param, valid_list)
|
||||
)
|
||||
|
||||
lower_param = param.lower()
|
||||
if lower_param in valid_list:
|
||||
return lower_param
|
||||
else:
|
||||
raise ValueError(
|
||||
descr
|
||||
+ " {} not supported, should be one of {}".format(param, valid_list)
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _greater_equal_than(value, limit):
|
||||
return value >= limit - settings.FLOAT_ZERO
|
||||
|
||||
@staticmethod
|
||||
def _less_equal_than(value, limit):
|
||||
return value <= limit + settings.FLOAT_ZERO
|
||||
|
||||
@staticmethod
|
||||
def _range(value, ranges):
|
||||
in_range = False
|
||||
for left_limit, right_limit in ranges:
|
||||
if (
|
||||
left_limit - settings.FLOAT_ZERO
|
||||
<= value
|
||||
<= right_limit + settings.FLOAT_ZERO
|
||||
):
|
||||
in_range = True
|
||||
break
|
||||
|
||||
return in_range
|
||||
|
||||
@staticmethod
|
||||
def _in(value, right_value_list):
|
||||
return value in right_value_list
|
||||
|
||||
@staticmethod
|
||||
def _not_in(value, wrong_value_list):
|
||||
return value not in wrong_value_list
|
||||
|
||||
def _warn_deprecated_param(self, param_name, descr):
|
||||
if self._deprecated_params_set.get(param_name):
|
||||
flow_logger.warning(
|
||||
f"{descr} {param_name} is deprecated and ignored in this version."
|
||||
)
|
||||
|
||||
def _warn_to_deprecate_param(self, param_name, descr, new_param):
|
||||
if self._deprecated_params_set.get(param_name):
|
||||
flow_logger.warning(
|
||||
f"{descr} {param_name} will be deprecated in future release; "
|
||||
f"please use {new_param} instead."
|
||||
)
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
class ComponentBase(ABC):
|
||||
component_name: str
|
||||
|
||||
def __str__(self):
|
||||
"""
|
||||
{
|
||||
"component_name": "Begin",
|
||||
"params": {}
|
||||
}
|
||||
"""
|
||||
return """{{
|
||||
"component_name": "{}",
|
||||
"params": {}
|
||||
}}""".format(self.component_name,
|
||||
self._param
|
||||
)
|
||||
|
||||
def __init__(self, canvas, id, param: ComponentParamBase):
|
||||
self._canvas = canvas
|
||||
self._id = id
|
||||
self._param = param
|
||||
self._param.check()
|
||||
|
||||
def run(self, history, **kwargs):
|
||||
flow_logger.info("{}, history: {}, kwargs: {}".format(self, json.dumps(history, ensure_ascii=False),
|
||||
json.dumps(kwargs, ensure_ascii=False)))
|
||||
try:
|
||||
res = self._run(history, **kwargs)
|
||||
self.set_output(res)
|
||||
except Exception as e:
|
||||
self.set_output(pd.DataFrame([{"content": str(e)}]))
|
||||
raise e
|
||||
|
||||
return res
|
||||
|
||||
def _run(self, history, **kwargs):
|
||||
raise NotImplementedError()
|
||||
|
||||
def output(self, allow_partial=True) -> Tuple[str, Union[pd.DataFrame, partial]]:
|
||||
o = getattr(self._param, self._param.output_var_name)
|
||||
if not isinstance(o, partial) and not isinstance(o, pd.DataFrame):
|
||||
if not isinstance(o, list): o = [o]
|
||||
o = pd.DataFrame(o)
|
||||
|
||||
if allow_partial or not isinstance(o, partial):
|
||||
if not isinstance(o, partial) and not isinstance(o, pd.DataFrame):
|
||||
return pd.DataFrame(o if isinstance(o, list) else [o])
|
||||
return self._param.output_var_name, o
|
||||
|
||||
outs = None
|
||||
for oo in o():
|
||||
if not isinstance(oo, pd.DataFrame):
|
||||
outs = pd.DataFrame(oo if isinstance(oo, list) else [oo])
|
||||
else: outs = oo
|
||||
return self._param.output_var_name, outs
|
||||
|
||||
def reset(self):
|
||||
setattr(self._param, self._param.output_var_name, None)
|
||||
|
||||
def set_output(self, v: pd.DataFrame):
|
||||
setattr(self._param, self._param.output_var_name, v)
|
||||
|
||||
def get_input(self):
|
||||
upstream_outs = []
|
||||
reversed_cpnts = []
|
||||
if len(self._canvas.path) > 1:
|
||||
reversed_cpnts.extend(self._canvas.path[-2])
|
||||
reversed_cpnts.extend(self._canvas.path[-1])
|
||||
|
||||
if DEBUG: print(self.component_name, reversed_cpnts[::-1])
|
||||
for u in reversed_cpnts[::-1]:
|
||||
if self.get_component_name(u) in ["switch"]: continue
|
||||
if self.component_name.lower() == "generate" and self.get_component_name(u) == "retrieval":
|
||||
o = self._canvas.get_component(u)["obj"].output(allow_partial=False)[1]
|
||||
if o is not None:
|
||||
upstream_outs.append(o)
|
||||
continue
|
||||
if u not in self._canvas.get_component(self._id)["upstream"]: continue
|
||||
if self.component_name.lower().find("switch") < 0 \
|
||||
and self.get_component_name(u) in ["relevant", "categorize"]:
|
||||
continue
|
||||
if u.lower().find("answer") >= 0:
|
||||
for r, c in self._canvas.history[::-1]:
|
||||
if r == "user":
|
||||
upstream_outs.append(pd.DataFrame([{"content": c}]))
|
||||
break
|
||||
break
|
||||
if self.component_name.lower().find("answer") >= 0:
|
||||
if self.get_component_name(u) in ["relevant"]:
|
||||
continue
|
||||
else:
|
||||
o = self._canvas.get_component(u)["obj"].output(allow_partial=False)[1]
|
||||
if o is not None:
|
||||
upstream_outs.append(o)
|
||||
break
|
||||
|
||||
if upstream_outs:
|
||||
df = pd.concat(upstream_outs, ignore_index=True)
|
||||
if "content" in df:
|
||||
df = df.drop_duplicates(subset=['content']).reset_index(drop=True)
|
||||
return df
|
||||
return pd.DataFrame()
|
||||
|
||||
def get_stream_input(self):
|
||||
reversed_cpnts = []
|
||||
if len(self._canvas.path) > 1:
|
||||
reversed_cpnts.extend(self._canvas.path[-2])
|
||||
reversed_cpnts.extend(self._canvas.path[-1])
|
||||
|
||||
for u in reversed_cpnts[::-1]:
|
||||
if self.get_component_name(u) in ["switch", "answer"]: continue
|
||||
return self._canvas.get_component(u)["obj"].output()[1]
|
||||
|
||||
@staticmethod
|
||||
def be_output(v):
|
||||
return pd.DataFrame([{"content": v}])
|
||||
|
||||
def get_component_name(self, cpn_id):
|
||||
return self._canvas.get_component(cpn_id)["obj"].component_name.lower()
|
||||
48
agent/component/begin.py
Normal file
48
agent/component/begin.py
Normal file
@ -0,0 +1,48 @@
|
||||
#
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from functools import partial
|
||||
import pandas as pd
|
||||
from agent.component.base import ComponentBase, ComponentParamBase
|
||||
|
||||
|
||||
class BeginParam(ComponentParamBase):
|
||||
|
||||
"""
|
||||
Define the Begin component parameters.
|
||||
"""
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.prologue = "Hi! I'm your smart assistant. What can I do for you?"
|
||||
|
||||
def check(self):
|
||||
return True
|
||||
|
||||
|
||||
class Begin(ComponentBase):
|
||||
component_name = "Begin"
|
||||
|
||||
def _run(self, history, **kwargs):
|
||||
if kwargs.get("stream"):
|
||||
return partial(self.stream_output)
|
||||
return pd.DataFrame([{"content": self._param.prologue}])
|
||||
|
||||
def stream_output(self):
|
||||
res = {"content": self._param.prologue}
|
||||
yield res
|
||||
self.set_output(res)
|
||||
|
||||
|
||||
|
||||
85
agent/component/bing.py
Normal file
85
agent/component/bing.py
Normal file
@ -0,0 +1,85 @@
|
||||
#
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from abc import ABC
|
||||
import requests
|
||||
import pandas as pd
|
||||
from agent.settings import DEBUG
|
||||
from agent.component.base import ComponentBase, ComponentParamBase
|
||||
|
||||
|
||||
class BingParam(ComponentParamBase):
|
||||
"""
|
||||
Define the Bing component parameters.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.top_n = 10
|
||||
self.channel = "Webpages"
|
||||
self.api_key = "YOUR_ACCESS_KEY"
|
||||
self.country = "CN"
|
||||
self.language = "en"
|
||||
|
||||
def check(self):
|
||||
self.check_positive_integer(self.top_n, "Top N")
|
||||
self.check_valid_value(self.channel, "Bing Web Search or Bing News", ["Webpages", "News"])
|
||||
self.check_empty(self.api_key, "Bing subscription key")
|
||||
self.check_valid_value(self.country, "Bing Country",
|
||||
['AR', 'AU', 'AT', 'BE', 'BR', 'CA', 'CL', 'DK', 'FI', 'FR', 'DE', 'HK', 'IN', 'ID',
|
||||
'IT', 'JP', 'KR', 'MY', 'MX', 'NL', 'NZ', 'NO', 'CN', 'PL', 'PT', 'PH', 'RU', 'SA',
|
||||
'ZA', 'ES', 'SE', 'CH', 'TW', 'TR', 'GB', 'US'])
|
||||
self.check_valid_value(self.language, "Bing Languages",
|
||||
['ar', 'eu', 'bn', 'bg', 'ca', 'ns', 'nt', 'hr', 'cs', 'da', 'nl', 'en', 'gb', 'et',
|
||||
'fi', 'fr', 'gl', 'de', 'gu', 'he', 'hi', 'hu', 'is', 'it', 'jp', 'kn', 'ko', 'lv',
|
||||
'lt', 'ms', 'ml', 'mr', 'nb', 'pl', 'br', 'pt', 'pa', 'ro', 'ru', 'sr', 'sk', 'sl',
|
||||
'es', 'sv', 'ta', 'te', 'th', 'tr', 'uk', 'vi'])
|
||||
|
||||
|
||||
class Bing(ComponentBase, ABC):
|
||||
component_name = "Bing"
|
||||
|
||||
def _run(self, history, **kwargs):
|
||||
ans = self.get_input()
|
||||
ans = " - ".join(ans["content"]) if "content" in ans else ""
|
||||
if not ans:
|
||||
return Bing.be_output("")
|
||||
|
||||
try:
|
||||
headers = {"Ocp-Apim-Subscription-Key": self._param.api_key, 'Accept-Language': self._param.language}
|
||||
params = {"q": ans, "textDecorations": True, "textFormat": "HTML", "cc": self._param.country,
|
||||
"answerCount": 1, "promote": self._param.channel}
|
||||
if self._param.channel == "Webpages":
|
||||
response = requests.get("https://api.bing.microsoft.com/v7.0/search", headers=headers, params=params)
|
||||
response.raise_for_status()
|
||||
search_results = response.json()
|
||||
bing_res = [{"content": '<a href="' + i["url"] + '">' + i["name"] + '</a> ' + i["snippet"]} for i in
|
||||
search_results["webPages"]["value"]]
|
||||
elif self._param.channel == "News":
|
||||
response = requests.get("https://api.bing.microsoft.com/v7.0/news/search", headers=headers,
|
||||
params=params)
|
||||
response.raise_for_status()
|
||||
search_results = response.json()
|
||||
bing_res = [{"content": '<a href="' + i["url"] + '">' + i["name"] + '</a> ' + i["description"]} for i
|
||||
in search_results['news']['value']]
|
||||
except Exception as e:
|
||||
return Bing.be_output("**ERROR**: " + str(e))
|
||||
|
||||
if not bing_res:
|
||||
return Bing.be_output("")
|
||||
|
||||
df = pd.DataFrame(bing_res)
|
||||
if DEBUG: print(df, ":::::::::::::::::::::::::::::::::")
|
||||
return df
|
||||
87
agent/component/categorize.py
Normal file
87
agent/component/categorize.py
Normal file
@ -0,0 +1,87 @@
|
||||
#
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from abc import ABC
|
||||
from api.db import LLMType
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
from agent.component import GenerateParam, Generate
|
||||
from agent.settings import DEBUG
|
||||
|
||||
|
||||
class CategorizeParam(GenerateParam):
|
||||
|
||||
"""
|
||||
Define the Categorize component parameters.
|
||||
"""
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.category_description = {}
|
||||
self.prompt = ""
|
||||
|
||||
def check(self):
|
||||
super().check()
|
||||
self.check_empty(self.category_description, "[Categorize] Category examples")
|
||||
for k, v in self.category_description.items():
|
||||
if not k: raise ValueError(f"[Categorize] Category name can not be empty!")
|
||||
if not v.get("to"): raise ValueError(f"[Categorize] 'To' of category {k} can not be empty!")
|
||||
|
||||
def get_prompt(self):
|
||||
cate_lines = []
|
||||
for c, desc in self.category_description.items():
|
||||
for l in desc.get("examples", "").split("\n"):
|
||||
if not l: continue
|
||||
cate_lines.append("Question: {}\tCategory: {}".format(l, c))
|
||||
descriptions = []
|
||||
for c, desc in self.category_description.items():
|
||||
if desc.get("description"):
|
||||
descriptions.append(
|
||||
"--------------------\nCategory: {}\nDescription: {}\n".format(c, desc["description"]))
|
||||
|
||||
self.prompt = """
|
||||
You're a text classifier. You need to categorize the user’s questions into {} categories,
|
||||
namely: {}
|
||||
Here's description of each category:
|
||||
{}
|
||||
|
||||
You could learn from the following examples:
|
||||
{}
|
||||
You could learn from the above examples.
|
||||
Just mention the category names, no need for any additional words.
|
||||
""".format(
|
||||
len(self.category_description.keys()),
|
||||
"/".join(list(self.category_description.keys())),
|
||||
"\n".join(descriptions),
|
||||
"- ".join(cate_lines)
|
||||
)
|
||||
return self.prompt
|
||||
|
||||
|
||||
class Categorize(Generate, ABC):
|
||||
component_name = "Categorize"
|
||||
|
||||
def _run(self, history, **kwargs):
|
||||
input = self.get_input()
|
||||
input = "Question: " + ("; ".join(input["content"]) if "content" in input else "") + "Category: "
|
||||
chat_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.CHAT, self._param.llm_id)
|
||||
ans = chat_mdl.chat(self._param.get_prompt(), [{"role": "user", "content": input}],
|
||||
self._param.gen_conf())
|
||||
if DEBUG: print(ans, ":::::::::::::::::::::::::::::::::", input)
|
||||
for c in self._param.category_description.keys():
|
||||
if ans.lower().find(c.lower()) >= 0:
|
||||
return Categorize.be_output(self._param.category_description[c]["to"])
|
||||
|
||||
return Categorize.be_output(self._param.category_description.items()[-1][1]["to"])
|
||||
|
||||
|
||||
75
agent/component/cite.py
Normal file
75
agent/component/cite.py
Normal file
@ -0,0 +1,75 @@
|
||||
#
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from abc import ABC
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from api.db import LLMType
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
from api.settings import retrievaler
|
||||
from agent.component.base import ComponentBase, ComponentParamBase
|
||||
|
||||
|
||||
class CiteParam(ComponentParamBase):
|
||||
|
||||
"""
|
||||
Define the Retrieval component parameters.
|
||||
"""
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.cite_sources = []
|
||||
|
||||
def check(self):
|
||||
self.check_empty(self.cite_source, "Please specify where you want to cite from.")
|
||||
|
||||
|
||||
class Cite(ComponentBase, ABC):
|
||||
component_name = "Cite"
|
||||
|
||||
def _run(self, history, **kwargs):
|
||||
input = "\n- ".join(self.get_input()["content"])
|
||||
sources = [self._canvas.get_component(cpn_id).output()[1] for cpn_id in self._param.cite_source]
|
||||
query = []
|
||||
for role, cnt in history[::-1][:self._param.message_history_window_size]:
|
||||
if role != "user":continue
|
||||
query.append(cnt)
|
||||
query = "\n".join(query)
|
||||
|
||||
kbs = KnowledgebaseService.get_by_ids(self._param.kb_ids)
|
||||
if not kbs:
|
||||
raise ValueError("Can't find knowledgebases by {}".format(self._param.kb_ids))
|
||||
embd_nms = list(set([kb.embd_id for kb in kbs]))
|
||||
assert len(embd_nms) == 1, "Knowledge bases use different embedding models."
|
||||
|
||||
embd_mdl = LLMBundle(kbs[0].tenant_id, LLMType.EMBEDDING, embd_nms[0])
|
||||
|
||||
rerank_mdl = None
|
||||
if self._param.rerank_id:
|
||||
rerank_mdl = LLMBundle(kbs[0].tenant_id, LLMType.RERANK, self._param.rerank_id)
|
||||
|
||||
kbinfos = retrievaler.retrieval(query, embd_mdl, kbs[0].tenant_id, self._param.kb_ids,
|
||||
1, self._param.top_n,
|
||||
self._param.similarity_threshold, 1 - self._param.keywords_similarity_weight,
|
||||
aggs=False, rerank_mdl=rerank_mdl)
|
||||
|
||||
if not kbinfos["chunks"]: return pd.DataFrame()
|
||||
df = pd.DataFrame(kbinfos["chunks"])
|
||||
df["content"] = df["content_with_weight"]
|
||||
del df["content_with_weight"]
|
||||
return df
|
||||
|
||||
|
||||
66
agent/component/duckduckgo.py
Normal file
66
agent/component/duckduckgo.py
Normal file
@ -0,0 +1,66 @@
|
||||
#
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from abc import ABC
|
||||
from duckduckgo_search import DDGS
|
||||
import pandas as pd
|
||||
from agent.settings import DEBUG
|
||||
from agent.component.base import ComponentBase, ComponentParamBase
|
||||
|
||||
|
||||
class DuckDuckGoParam(ComponentParamBase):
|
||||
"""
|
||||
Define the DuckDuckGo component parameters.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.top_n = 10
|
||||
self.channel = "text"
|
||||
|
||||
def check(self):
|
||||
self.check_positive_integer(self.top_n, "Top N")
|
||||
self.check_valid_value(self.channel, "Web Search or News", ["text", "news"])
|
||||
|
||||
|
||||
class DuckDuckGo(ComponentBase, ABC):
|
||||
component_name = "DuckDuckGo"
|
||||
|
||||
def _run(self, history, **kwargs):
|
||||
ans = self.get_input()
|
||||
ans = " - ".join(ans["content"]) if "content" in ans else ""
|
||||
if not ans:
|
||||
return DuckDuckGo.be_output("")
|
||||
|
||||
try:
|
||||
if self._param.channel == "text":
|
||||
with DDGS() as ddgs:
|
||||
# {'title': '', 'href': '', 'body': ''}
|
||||
duck_res = [{"content": '<a href="' + i["href"] + '">' + i["title"] + '</a> ' + i["body"]} for i
|
||||
in ddgs.text(ans, max_results=self._param.top_n)]
|
||||
elif self._param.channel == "news":
|
||||
with DDGS() as ddgs:
|
||||
# {'date': '', 'title': '', 'body': '', 'url': '', 'image': '', 'source': ''}
|
||||
duck_res = [{"content": '<a href="' + i["url"] + '">' + i["title"] + '</a> ' + i["body"]} for i
|
||||
in ddgs.news(ans, max_results=self._param.top_n)]
|
||||
except Exception as e:
|
||||
return DuckDuckGo.be_output("**ERROR**: " + str(e))
|
||||
|
||||
if not duck_res:
|
||||
return DuckDuckGo.be_output("")
|
||||
|
||||
df = pd.DataFrame(duck_res)
|
||||
if DEBUG: print(df, ":::::::::::::::::::::::::::::::::")
|
||||
return df
|
||||
150
agent/component/generate.py
Normal file
150
agent/component/generate.py
Normal file
@ -0,0 +1,150 @@
|
||||
#
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import re
|
||||
from functools import partial
|
||||
import pandas as pd
|
||||
from api.db import LLMType
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
from api.settings import retrievaler
|
||||
from agent.component.base import ComponentBase, ComponentParamBase
|
||||
|
||||
|
||||
class GenerateParam(ComponentParamBase):
|
||||
"""
|
||||
Define the Generate component parameters.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.llm_id = ""
|
||||
self.prompt = ""
|
||||
self.max_tokens = 0
|
||||
self.temperature = 0
|
||||
self.top_p = 0
|
||||
self.presence_penalty = 0
|
||||
self.frequency_penalty = 0
|
||||
self.cite = True
|
||||
self.parameters = []
|
||||
|
||||
def check(self):
|
||||
self.check_decimal_float(self.temperature, "[Generate] Temperature")
|
||||
self.check_decimal_float(self.presence_penalty, "[Generate] Presence penalty")
|
||||
self.check_decimal_float(self.frequency_penalty, "[Generate] Frequency penalty")
|
||||
self.check_nonnegative_number(self.max_tokens, "[Generate] Max tokens")
|
||||
self.check_decimal_float(self.top_p, "[Generate] Top P")
|
||||
self.check_empty(self.llm_id, "[Generate] LLM")
|
||||
# self.check_defined_type(self.parameters, "Parameters", ["list"])
|
||||
|
||||
def gen_conf(self):
|
||||
conf = {}
|
||||
if self.max_tokens > 0: conf["max_tokens"] = self.max_tokens
|
||||
if self.temperature > 0: conf["temperature"] = self.temperature
|
||||
if self.top_p > 0: conf["top_p"] = self.top_p
|
||||
if self.presence_penalty > 0: conf["presence_penalty"] = self.presence_penalty
|
||||
if self.frequency_penalty > 0: conf["frequency_penalty"] = self.frequency_penalty
|
||||
return conf
|
||||
|
||||
|
||||
class Generate(ComponentBase):
|
||||
component_name = "Generate"
|
||||
|
||||
def get_dependent_components(self):
|
||||
cpnts = [para["component_id"] for para in self._param.parameters]
|
||||
return cpnts
|
||||
|
||||
def set_cite(self, retrieval_res, answer):
|
||||
answer, idx = retrievaler.insert_citations(answer, [ck["content_ltks"] for _, ck in retrieval_res.iterrows()],
|
||||
[ck["vector"] for _, ck in retrieval_res.iterrows()],
|
||||
LLMBundle(self._canvas.get_tenant_id(), LLMType.EMBEDDING,
|
||||
self._canvas.get_embedding_model()), tkweight=0.7,
|
||||
vtweight=0.3)
|
||||
doc_ids = set([])
|
||||
recall_docs = []
|
||||
for i in idx:
|
||||
did = retrieval_res.loc[int(i), "doc_id"]
|
||||
if did in doc_ids: continue
|
||||
doc_ids.add(did)
|
||||
recall_docs.append({"doc_id": did, "doc_name": retrieval_res.loc[int(i), "docnm_kwd"]})
|
||||
|
||||
del retrieval_res["vector"]
|
||||
del retrieval_res["content_ltks"]
|
||||
|
||||
reference = {
|
||||
"chunks": [ck.to_dict() for _, ck in retrieval_res.iterrows()],
|
||||
"doc_aggs": recall_docs
|
||||
}
|
||||
|
||||
if answer.lower().find("invalid key") >= 0 or answer.lower().find("invalid api") >= 0:
|
||||
answer += " Please set LLM API-Key in 'User Setting -> Model Providers -> API-Key'"
|
||||
res = {"content": answer, "reference": reference}
|
||||
|
||||
return res
|
||||
|
||||
def _run(self, history, **kwargs):
|
||||
chat_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.CHAT, self._param.llm_id)
|
||||
prompt = self._param.prompt
|
||||
|
||||
retrieval_res = self.get_input()
|
||||
input = (" - " + "\n - ".join(retrieval_res["content"])) if "content" in retrieval_res else ""
|
||||
for para in self._param.parameters:
|
||||
cpn = self._canvas.get_component(para["component_id"])["obj"]
|
||||
_, out = cpn.output(allow_partial=False)
|
||||
if "content" not in out.columns:
|
||||
kwargs[para["key"]] = "Nothing"
|
||||
else:
|
||||
kwargs[para["key"]] = " - " + "\n - ".join(out["content"])
|
||||
|
||||
kwargs["input"] = input
|
||||
for n, v in kwargs.items():
|
||||
# prompt = re.sub(r"\{%s\}"%n, re.escape(str(v)), prompt)
|
||||
prompt = re.sub(r"\{%s\}" % n, str(v), prompt)
|
||||
|
||||
downstreams = self._canvas.get_component(self._id)["downstream"]
|
||||
if kwargs.get("stream") and len(downstreams) == 1 and self._canvas.get_component(downstreams[0])[
|
||||
"obj"].component_name.lower() == "answer":
|
||||
return partial(self.stream_output, chat_mdl, prompt, retrieval_res)
|
||||
|
||||
if "empty_response" in retrieval_res.columns:
|
||||
return Generate.be_output(input)
|
||||
|
||||
ans = chat_mdl.chat(prompt, self._canvas.get_history(self._param.message_history_window_size),
|
||||
self._param.gen_conf())
|
||||
if self._param.cite and "content_ltks" in retrieval_res.columns and "vector" in retrieval_res.columns:
|
||||
df = self.set_cite(retrieval_res, ans)
|
||||
return pd.DataFrame(df)
|
||||
|
||||
return Generate.be_output(ans)
|
||||
|
||||
def stream_output(self, chat_mdl, prompt, retrieval_res):
|
||||
res = None
|
||||
if "empty_response" in retrieval_res.columns and "\n- ".join(retrieval_res["content"]):
|
||||
res = {"content": "\n- ".join(retrieval_res["content"]), "reference": []}
|
||||
yield res
|
||||
self.set_output(res)
|
||||
return
|
||||
|
||||
answer = ""
|
||||
for ans in chat_mdl.chat_streamly(prompt, self._canvas.get_history(self._param.message_history_window_size),
|
||||
self._param.gen_conf()):
|
||||
res = {"content": ans, "reference": []}
|
||||
answer = ans
|
||||
yield res
|
||||
|
||||
if self._param.cite and "content_ltks" in retrieval_res.columns and "vector" in retrieval_res.columns:
|
||||
res = self.set_cite(retrieval_res, answer)
|
||||
yield res
|
||||
|
||||
self.set_output(res)
|
||||
96
agent/component/google.py
Normal file
96
agent/component/google.py
Normal file
@ -0,0 +1,96 @@
|
||||
#
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from abc import ABC
|
||||
from serpapi import GoogleSearch
|
||||
import pandas as pd
|
||||
from agent.settings import DEBUG
|
||||
from agent.component.base import ComponentBase, ComponentParamBase
|
||||
|
||||
|
||||
class GoogleParam(ComponentParamBase):
|
||||
"""
|
||||
Define the Google component parameters.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.top_n = 10
|
||||
self.api_key = "xxx"
|
||||
self.country = "cn"
|
||||
self.language = "en"
|
||||
|
||||
def check(self):
|
||||
self.check_positive_integer(self.top_n, "Top N")
|
||||
self.check_empty(self.api_key, "SerpApi API key")
|
||||
self.check_valid_value(self.country, "Google Country",
|
||||
['af', 'al', 'dz', 'as', 'ad', 'ao', 'ai', 'aq', 'ag', 'ar', 'am', 'aw', 'au', 'at',
|
||||
'az', 'bs', 'bh', 'bd', 'bb', 'by', 'be', 'bz', 'bj', 'bm', 'bt', 'bo', 'ba', 'bw',
|
||||
'bv', 'br', 'io', 'bn', 'bg', 'bf', 'bi', 'kh', 'cm', 'ca', 'cv', 'ky', 'cf', 'td',
|
||||
'cl', 'cn', 'cx', 'cc', 'co', 'km', 'cg', 'cd', 'ck', 'cr', 'ci', 'hr', 'cu', 'cy',
|
||||
'cz', 'dk', 'dj', 'dm', 'do', 'ec', 'eg', 'sv', 'gq', 'er', 'ee', 'et', 'fk', 'fo',
|
||||
'fj', 'fi', 'fr', 'gf', 'pf', 'tf', 'ga', 'gm', 'ge', 'de', 'gh', 'gi', 'gr', 'gl',
|
||||
'gd', 'gp', 'gu', 'gt', 'gn', 'gw', 'gy', 'ht', 'hm', 'va', 'hn', 'hk', 'hu', 'is',
|
||||
'in', 'id', 'ir', 'iq', 'ie', 'il', 'it', 'jm', 'jp', 'jo', 'kz', 'ke', 'ki', 'kp',
|
||||
'kr', 'kw', 'kg', 'la', 'lv', 'lb', 'ls', 'lr', 'ly', 'li', 'lt', 'lu', 'mo', 'mk',
|
||||
'mg', 'mw', 'my', 'mv', 'ml', 'mt', 'mh', 'mq', 'mr', 'mu', 'yt', 'mx', 'fm', 'md',
|
||||
'mc', 'mn', 'ms', 'ma', 'mz', 'mm', 'na', 'nr', 'np', 'nl', 'an', 'nc', 'nz', 'ni',
|
||||
'ne', 'ng', 'nu', 'nf', 'mp', 'no', 'om', 'pk', 'pw', 'ps', 'pa', 'pg', 'py', 'pe',
|
||||
'ph', 'pn', 'pl', 'pt', 'pr', 'qa', 're', 'ro', 'ru', 'rw', 'sh', 'kn', 'lc', 'pm',
|
||||
'vc', 'ws', 'sm', 'st', 'sa', 'sn', 'rs', 'sc', 'sl', 'sg', 'sk', 'si', 'sb', 'so',
|
||||
'za', 'gs', 'es', 'lk', 'sd', 'sr', 'sj', 'sz', 'se', 'ch', 'sy', 'tw', 'tj', 'tz',
|
||||
'th', 'tl', 'tg', 'tk', 'to', 'tt', 'tn', 'tr', 'tm', 'tc', 'tv', 'ug', 'ua', 'ae',
|
||||
'uk', 'gb', 'us', 'um', 'uy', 'uz', 'vu', 've', 'vn', 'vg', 'vi', 'wf', 'eh', 'ye',
|
||||
'zm', 'zw'])
|
||||
self.check_valid_value(self.language, "Google languages",
|
||||
['af', 'ak', 'sq', 'ws', 'am', 'ar', 'hy', 'az', 'eu', 'be', 'bem', 'bn', 'bh',
|
||||
'xx-bork', 'bs', 'br', 'bg', 'bt', 'km', 'ca', 'chr', 'ny', 'zh-cn', 'zh-tw', 'co',
|
||||
'hr', 'cs', 'da', 'nl', 'xx-elmer', 'en', 'eo', 'et', 'ee', 'fo', 'tl', 'fi', 'fr',
|
||||
'fy', 'gaa', 'gl', 'ka', 'de', 'el', 'kl', 'gn', 'gu', 'xx-hacker', 'ht', 'ha', 'haw',
|
||||
'iw', 'hi', 'hu', 'is', 'ig', 'id', 'ia', 'ga', 'it', 'ja', 'jw', 'kn', 'kk', 'rw',
|
||||
'rn', 'xx-klingon', 'kg', 'ko', 'kri', 'ku', 'ckb', 'ky', 'lo', 'la', 'lv', 'ln', 'lt',
|
||||
'loz', 'lg', 'ach', 'mk', 'mg', 'ms', 'ml', 'mt', 'mv', 'mi', 'mr', 'mfe', 'mo', 'mn',
|
||||
'sr-me', 'my', 'ne', 'pcm', 'nso', 'no', 'nn', 'oc', 'or', 'om', 'ps', 'fa',
|
||||
'xx-pirate', 'pl', 'pt', 'pt-br', 'pt-pt', 'pa', 'qu', 'ro', 'rm', 'nyn', 'ru', 'gd',
|
||||
'sr', 'sh', 'st', 'tn', 'crs', 'sn', 'sd', 'si', 'sk', 'sl', 'so', 'es', 'es-419', 'su',
|
||||
'sw', 'sv', 'tg', 'ta', 'tt', 'te', 'th', 'ti', 'to', 'lua', 'tum', 'tr', 'tk', 'tw',
|
||||
'ug', 'uk', 'ur', 'uz', 'vu', 'vi', 'cy', 'wo', 'xh', 'yi', 'yo', 'zu']
|
||||
)
|
||||
|
||||
|
||||
class Google(ComponentBase, ABC):
|
||||
component_name = "Google"
|
||||
|
||||
def _run(self, history, **kwargs):
|
||||
ans = self.get_input()
|
||||
ans = " - ".join(ans["content"]) if "content" in ans else ""
|
||||
if not ans:
|
||||
return Google.be_output("")
|
||||
|
||||
try:
|
||||
client = GoogleSearch(
|
||||
{"engine": "google", "q": ans, "api_key": self._param.api_key, "gl": self._param.country,
|
||||
"hl": self._param.language, "num": self._param.top_n})
|
||||
google_res = [{"content": '<a href="' + i["link"] + '">' + i["title"] + '</a> ' + i["snippet"]} for i in
|
||||
client.get_dict()["organic_results"]]
|
||||
except Exception as e:
|
||||
return Google.be_output("**ERROR**: Existing Unavailable Parameters!")
|
||||
|
||||
if not google_res:
|
||||
return Google.be_output("")
|
||||
|
||||
df = pd.DataFrame(google_res)
|
||||
if DEBUG: print(df, ":::::::::::::::::::::::::::::::::")
|
||||
return df
|
||||
70
agent/component/googlescholar.py
Normal file
70
agent/component/googlescholar.py
Normal file
@ -0,0 +1,70 @@
|
||||
#
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from abc import ABC
|
||||
import pandas as pd
|
||||
from agent.settings import DEBUG
|
||||
from agent.component.base import ComponentBase, ComponentParamBase
|
||||
from scholarly import scholarly
|
||||
|
||||
|
||||
class GoogleScholarParam(ComponentParamBase):
|
||||
"""
|
||||
Define the GoogleScholar component parameters.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.top_n = 6
|
||||
self.sort_by = 'relevance'
|
||||
self.year_low = None
|
||||
self.year_high = None
|
||||
self.patents = True
|
||||
|
||||
def check(self):
|
||||
self.check_positive_integer(self.top_n, "Top N")
|
||||
self.check_valid_value(self.sort_by, "GoogleScholar Sort_by", ['date', 'relevance'])
|
||||
self.check_boolean(self.patents, "Whether or not to include patents, defaults to True")
|
||||
|
||||
|
||||
class GoogleScholar(ComponentBase, ABC):
|
||||
component_name = "GoogleScholar"
|
||||
|
||||
def _run(self, history, **kwargs):
|
||||
ans = self.get_input()
|
||||
ans = " - ".join(ans["content"]) if "content" in ans else ""
|
||||
if not ans:
|
||||
return GoogleScholar.be_output("")
|
||||
|
||||
scholar_client = scholarly.search_pubs(ans, patents=self._param.patents, year_low=self._param.year_low,
|
||||
year_high=self._param.year_high, sort_by=self._param.sort_by)
|
||||
scholar_res = []
|
||||
for i in range(self._param.top_n):
|
||||
try:
|
||||
pub = next(scholar_client)
|
||||
scholar_res.append({"content": 'Title: ' + pub['bib']['title'] + '\n_Url: <a href="' + pub[
|
||||
'pub_url'] + '"></a> ' + "\n author: " + ",".join(pub['bib']['author']) + '\n Abstract: ' + pub[
|
||||
'bib'].get('abstract', 'no abstract')})
|
||||
|
||||
except StopIteration or Exception as e:
|
||||
print("**ERROR** " + str(e))
|
||||
break
|
||||
|
||||
if not scholar_res:
|
||||
return GoogleScholar.be_output("")
|
||||
|
||||
df = pd.DataFrame(scholar_res)
|
||||
if DEBUG: print(df, ":::::::::::::::::::::::::::::::::")
|
||||
return df
|
||||
65
agent/component/keyword.py
Normal file
65
agent/component/keyword.py
Normal file
@ -0,0 +1,65 @@
|
||||
#
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import re
|
||||
from abc import ABC
|
||||
from api.db import LLMType
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
from agent.component import GenerateParam, Generate
|
||||
from agent.settings import DEBUG
|
||||
|
||||
|
||||
class KeywordExtractParam(GenerateParam):
|
||||
"""
|
||||
Define the KeywordExtract component parameters.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.top_n = 1
|
||||
|
||||
def check(self):
|
||||
super().check()
|
||||
self.check_positive_integer(self.top_n, "Top N")
|
||||
|
||||
def get_prompt(self):
|
||||
self.prompt = """
|
||||
- Role: You're a question analyzer.
|
||||
- Requirements:
|
||||
- Summarize user's question, and give top %s important keyword/phrase.
|
||||
- Use comma as a delimiter to separate keywords/phrases.
|
||||
- Answer format: (in language of user's question)
|
||||
- keyword:
|
||||
""" % self.top_n
|
||||
return self.prompt
|
||||
|
||||
|
||||
class KeywordExtract(Generate, ABC):
|
||||
component_name = "KeywordExtract"
|
||||
|
||||
def _run(self, history, **kwargs):
|
||||
q = ""
|
||||
for r, c in self._canvas.history[::-1]:
|
||||
if r == "user":
|
||||
q += c
|
||||
break
|
||||
|
||||
chat_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.CHAT, self._param.llm_id)
|
||||
ans = chat_mdl.chat(self._param.get_prompt(), [{"role": "user", "content": q}],
|
||||
self._param.gen_conf())
|
||||
|
||||
ans = re.sub(r".*keyword:", "", ans).strip()
|
||||
if DEBUG: print(ans, ":::::::::::::::::::::::::::::::::")
|
||||
return KeywordExtract.be_output(ans)
|
||||
53
agent/component/message.py
Normal file
53
agent/component/message.py
Normal file
@ -0,0 +1,53 @@
|
||||
#
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import random
|
||||
from abc import ABC
|
||||
from functools import partial
|
||||
from agent.component.base import ComponentBase, ComponentParamBase
|
||||
|
||||
|
||||
class MessageParam(ComponentParamBase):
|
||||
|
||||
"""
|
||||
Define the Message component parameters.
|
||||
"""
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.messages = []
|
||||
|
||||
def check(self):
|
||||
self.check_empty(self.messages, "[Message]")
|
||||
return True
|
||||
|
||||
|
||||
class Message(ComponentBase, ABC):
|
||||
component_name = "Message"
|
||||
|
||||
def _run(self, history, **kwargs):
|
||||
if kwargs.get("stream"):
|
||||
return partial(self.stream_output)
|
||||
|
||||
return Message.be_output(random.choice(self._param.messages))
|
||||
|
||||
def stream_output(self):
|
||||
res = None
|
||||
if self._param.messages:
|
||||
res = {"content": random.choice(self._param.messages)}
|
||||
yield res
|
||||
|
||||
self.set_output(res)
|
||||
|
||||
|
||||
65
agent/component/pubmed.py
Normal file
65
agent/component/pubmed.py
Normal file
@ -0,0 +1,65 @@
|
||||
#
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from abc import ABC
|
||||
from Bio import Entrez
|
||||
import pandas as pd
|
||||
import xml.etree.ElementTree as ET
|
||||
from agent.settings import DEBUG
|
||||
from agent.component.base import ComponentBase, ComponentParamBase
|
||||
|
||||
|
||||
class PubMedParam(ComponentParamBase):
|
||||
"""
|
||||
Define the PubMed component parameters.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.top_n = 5
|
||||
self.email = "A.N.Other@example.com"
|
||||
|
||||
def check(self):
|
||||
self.check_positive_integer(self.top_n, "Top N")
|
||||
|
||||
|
||||
class PubMed(ComponentBase, ABC):
|
||||
component_name = "PubMed"
|
||||
|
||||
def _run(self, history, **kwargs):
|
||||
ans = self.get_input()
|
||||
ans = " - ".join(ans["content"]) if "content" in ans else ""
|
||||
if not ans:
|
||||
return PubMed.be_output("")
|
||||
|
||||
try:
|
||||
Entrez.email = self._param.email
|
||||
pubmedids = Entrez.read(Entrez.esearch(db='pubmed', retmax=self._param.top_n, term=ans))['IdList']
|
||||
pubmedcnt = ET.fromstring(
|
||||
Entrez.efetch(db='pubmed', id=",".join(pubmedids), retmode="xml").read().decode("utf-8"))
|
||||
pubmed_res = [{"content": 'Title:' + child.find("MedlineCitation").find("Article").find(
|
||||
"ArticleTitle").text + '\nUrl:<a href=" https://pubmed.ncbi.nlm.nih.gov/' + child.find(
|
||||
"MedlineCitation").find("PMID").text + '">' + '</a>\n' + 'Abstract:' + child.find(
|
||||
"MedlineCitation").find("Article").find("Abstract").find("AbstractText").text} for child in
|
||||
pubmedcnt.findall("PubmedArticle")]
|
||||
except Exception as e:
|
||||
return PubMed.be_output("**ERROR**: " + str(e))
|
||||
|
||||
if not pubmed_res:
|
||||
return PubMed.be_output("")
|
||||
|
||||
df = pd.DataFrame(pubmed_res)
|
||||
if DEBUG: print(df, ":::::::::::::::::::::::::::::::::")
|
||||
return df
|
||||
80
agent/component/relevant.py
Normal file
80
agent/component/relevant.py
Normal file
@ -0,0 +1,80 @@
|
||||
#
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from abc import ABC
|
||||
from api.db import LLMType
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
from agent.component import GenerateParam, Generate
|
||||
from rag.utils import num_tokens_from_string, encoder
|
||||
|
||||
|
||||
class RelevantParam(GenerateParam):
|
||||
|
||||
"""
|
||||
Define the Relevant component parameters.
|
||||
"""
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.prompt = ""
|
||||
self.yes = ""
|
||||
self.no = ""
|
||||
|
||||
def check(self):
|
||||
super().check()
|
||||
self.check_empty(self.yes, "[Relevant] 'Yes'")
|
||||
self.check_empty(self.no, "[Relevant] 'No'")
|
||||
|
||||
def get_prompt(self):
|
||||
self.prompt = """
|
||||
You are a grader assessing relevance of a retrieved document to a user question.
|
||||
It does not need to be a stringent test. The goal is to filter out erroneous retrievals.
|
||||
If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant.
|
||||
Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question.
|
||||
No other words needed except 'yes' or 'no'.
|
||||
"""
|
||||
return self.prompt
|
||||
|
||||
|
||||
class Relevant(Generate, ABC):
|
||||
component_name = "Relevant"
|
||||
|
||||
def _run(self, history, **kwargs):
|
||||
q = ""
|
||||
for r, c in self._canvas.history[::-1]:
|
||||
if r == "user":
|
||||
q = c
|
||||
break
|
||||
ans = self.get_input()
|
||||
ans = " - ".join(ans["content"]) if "content" in ans else ""
|
||||
if not ans:
|
||||
return Relevant.be_output(self._param.no)
|
||||
ans = "Documents: \n" + ans
|
||||
ans = f"Question: {q}\n" + ans
|
||||
chat_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.CHAT, self._param.llm_id)
|
||||
|
||||
if num_tokens_from_string(ans) >= chat_mdl.max_length - 4:
|
||||
ans = encoder.decode(encoder.encode(ans)[:chat_mdl.max_length - 4])
|
||||
|
||||
ans = chat_mdl.chat(self._param.get_prompt(), [{"role": "user", "content": ans}],
|
||||
self._param.gen_conf())
|
||||
|
||||
print(ans, ":::::::::::::::::::::::::::::::::")
|
||||
if ans.lower().find("yes") >= 0:
|
||||
return Relevant.be_output(self._param.yes)
|
||||
if ans.lower().find("no") >= 0:
|
||||
return Relevant.be_output(self._param.no)
|
||||
assert False, f"Relevant component got: {ans}"
|
||||
|
||||
|
||||
88
agent/component/retrieval.py
Normal file
88
agent/component/retrieval.py
Normal file
@ -0,0 +1,88 @@
|
||||
#
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from abc import ABC
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from api.db import LLMType
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
from api.settings import retrievaler
|
||||
from agent.component.base import ComponentBase, ComponentParamBase
|
||||
|
||||
|
||||
class RetrievalParam(ComponentParamBase):
|
||||
|
||||
"""
|
||||
Define the Retrieval component parameters.
|
||||
"""
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.similarity_threshold = 0.2
|
||||
self.keywords_similarity_weight = 0.5
|
||||
self.top_n = 8
|
||||
self.top_k = 1024
|
||||
self.kb_ids = []
|
||||
self.rerank_id = ""
|
||||
self.empty_response = ""
|
||||
|
||||
def check(self):
|
||||
self.check_decimal_float(self.similarity_threshold, "[Retrieval] Similarity threshold")
|
||||
self.check_decimal_float(self.keywords_similarity_weight, "[Retrieval] Keywords similarity weight")
|
||||
self.check_positive_number(self.top_n, "[Retrieval] Top N")
|
||||
self.check_empty(self.kb_ids, "[Retrieval] Knowledge bases")
|
||||
|
||||
|
||||
class Retrieval(ComponentBase, ABC):
|
||||
component_name = "Retrieval"
|
||||
|
||||
def _run(self, history, **kwargs):
|
||||
query = []
|
||||
for role, cnt in history[::-1][:self._param.message_history_window_size]:
|
||||
if role != "user":continue
|
||||
query.append(cnt)
|
||||
query = "\n".join(query)
|
||||
|
||||
kbs = KnowledgebaseService.get_by_ids(self._param.kb_ids)
|
||||
if not kbs:
|
||||
raise ValueError("Can't find knowledgebases by {}".format(self._param.kb_ids))
|
||||
embd_nms = list(set([kb.embd_id for kb in kbs]))
|
||||
assert len(embd_nms) == 1, "Knowledge bases use different embedding models."
|
||||
|
||||
embd_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.EMBEDDING, embd_nms[0])
|
||||
self._canvas.set_embedding_model(embd_nms[0])
|
||||
|
||||
rerank_mdl = None
|
||||
if self._param.rerank_id:
|
||||
rerank_mdl = LLMBundle(kbs[0].tenant_id, LLMType.RERANK, self._param.rerank_id)
|
||||
|
||||
kbinfos = retrievaler.retrieval(query, embd_mdl, kbs[0].tenant_id, self._param.kb_ids,
|
||||
1, self._param.top_n,
|
||||
self._param.similarity_threshold, 1 - self._param.keywords_similarity_weight,
|
||||
aggs=False, rerank_mdl=rerank_mdl)
|
||||
|
||||
if not kbinfos["chunks"]:
|
||||
df = Retrieval.be_output(self._param.empty_response)
|
||||
df["empty_response"] = True
|
||||
return df
|
||||
|
||||
df = pd.DataFrame(kbinfos["chunks"])
|
||||
df["content"] = df["content_with_weight"]
|
||||
del df["content_with_weight"]
|
||||
print(">>>>>>>>>>>>>>>>>>>>>>>>>>\n", query, df)
|
||||
return df
|
||||
|
||||
|
||||
72
agent/component/rewrite.py
Normal file
72
agent/component/rewrite.py
Normal file
@ -0,0 +1,72 @@
|
||||
#
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from abc import ABC
|
||||
from api.db import LLMType
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
from agent.component import GenerateParam, Generate
|
||||
|
||||
|
||||
class RewriteQuestionParam(GenerateParam):
|
||||
|
||||
"""
|
||||
Define the QuestionRewrite component parameters.
|
||||
"""
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.temperature = 0.9
|
||||
self.prompt = ""
|
||||
self.loop = 1
|
||||
|
||||
def check(self):
|
||||
super().check()
|
||||
|
||||
def get_prompt(self):
|
||||
self.prompt = """
|
||||
You are an expert at query expansion to generate a paraphrasing of a question.
|
||||
I can't retrieval relevant information from the knowledge base by using user's question directly.
|
||||
You need to expand or paraphrase user's question by multiple ways such as using synonyms words/phrase,
|
||||
writing the abbreviation in its entirety, adding some extra descriptions or explanations,
|
||||
changing the way of expression, translating the original question into another language (English/Chinese), etc.
|
||||
And return 5 versions of question and one is from translation.
|
||||
Just list the question. No other words are needed.
|
||||
"""
|
||||
return self.prompt
|
||||
|
||||
|
||||
class RewriteQuestion(Generate, ABC):
|
||||
component_name = "RewriteQuestion"
|
||||
|
||||
def _run(self, history, **kwargs):
|
||||
if not hasattr(self, "_loop"):
|
||||
setattr(self, "_loop", 0)
|
||||
if self._loop >= self._param.loop:
|
||||
self._loop = 0
|
||||
raise Exception("Maximum loop time exceeds. Can't find relevant information.")
|
||||
self._loop += 1
|
||||
q = "Question: "
|
||||
for r, c in self._canvas.history[::-1]:
|
||||
if r == "user":
|
||||
q += c
|
||||
break
|
||||
|
||||
chat_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.CHAT, self._param.llm_id)
|
||||
ans = chat_mdl.chat(self._param.get_prompt(), [{"role": "user", "content": q}],
|
||||
self._param.gen_conf())
|
||||
|
||||
print(ans, ":::::::::::::::::::::::::::::::::")
|
||||
return RewriteQuestion.be_output(ans)
|
||||
|
||||
|
||||
77
agent/component/switch.py
Normal file
77
agent/component/switch.py
Normal file
@ -0,0 +1,77 @@
|
||||
#
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from abc import ABC
|
||||
|
||||
import pandas as pd
|
||||
from agent.component.base import ComponentBase, ComponentParamBase
|
||||
|
||||
|
||||
class SwitchParam(ComponentParamBase):
|
||||
|
||||
"""
|
||||
Define the Switch component parameters.
|
||||
"""
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
"""
|
||||
{
|
||||
"cpn_id": "categorize:0",
|
||||
"not": False,
|
||||
"operator": "gt/gte/lt/lte/eq/in",
|
||||
"value": "",
|
||||
"to": ""
|
||||
}
|
||||
"""
|
||||
self.conditions = []
|
||||
self.default = ""
|
||||
|
||||
def check(self):
|
||||
self.check_empty(self.conditions, "[Switch] conditions")
|
||||
self.check_empty(self.default, "[Switch] Default path")
|
||||
for cond in self.conditions:
|
||||
if not cond["to"]: raise ValueError(f"[Switch] 'To' can not be empty!")
|
||||
|
||||
def operators(self, field, op, value):
|
||||
if op == "gt":
|
||||
return float(field) > float(value)
|
||||
if op == "gte":
|
||||
return float(field) >= float(value)
|
||||
if op == "lt":
|
||||
return float(field) < float(value)
|
||||
if op == "lte":
|
||||
return float(field) <= float(value)
|
||||
if op == "eq":
|
||||
return str(field) == str(value)
|
||||
if op == "in":
|
||||
return str(field).find(str(value)) >= 0
|
||||
return False
|
||||
|
||||
|
||||
class Switch(ComponentBase, ABC):
|
||||
component_name = "Switch"
|
||||
|
||||
def _run(self, history, **kwargs):
|
||||
for cond in self._param.conditions:
|
||||
input = self._canvas.get_component(cond["cpn_id"])["obj"].output()[1]
|
||||
if self._param.operators(input.iloc[0, 0], cond["operator"], cond["value"]):
|
||||
if not cond["not"]:
|
||||
return pd.DataFrame([{"content": cond["to"]}])
|
||||
|
||||
return pd.DataFrame([{"content": self._param.default}])
|
||||
|
||||
|
||||
|
||||
|
||||
69
agent/component/wikipedia.py
Normal file
69
agent/component/wikipedia.py
Normal file
@ -0,0 +1,69 @@
|
||||
#
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import random
|
||||
from abc import ABC
|
||||
from functools import partial
|
||||
import wikipedia
|
||||
import pandas as pd
|
||||
from agent.settings import DEBUG
|
||||
from agent.component.base import ComponentBase, ComponentParamBase
|
||||
|
||||
|
||||
class WikipediaParam(ComponentParamBase):
|
||||
"""
|
||||
Define the Wikipedia component parameters.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.top_n = 10
|
||||
self.language = "en"
|
||||
|
||||
def check(self):
|
||||
self.check_positive_integer(self.top_n, "Top N")
|
||||
self.check_valid_value(self.language, "Wikipedia languages",
|
||||
['af', 'pl', 'ar', 'ast', 'az', 'bg', 'nan', 'bn', 'be', 'ca', 'cs', 'cy', 'da', 'de',
|
||||
'et', 'el', 'en', 'es', 'eo', 'eu', 'fa', 'fr', 'gl', 'ko', 'hy', 'hi', 'hr', 'id',
|
||||
'it', 'he', 'ka', 'lld', 'la', 'lv', 'lt', 'hu', 'mk', 'arz', 'ms', 'min', 'my', 'nl',
|
||||
'ja', 'nb', 'nn', 'ce', 'uz', 'pt', 'kk', 'ro', 'ru', 'ceb', 'sk', 'sl', 'sr', 'sh',
|
||||
'fi', 'sv', 'ta', 'tt', 'th', 'tg', 'azb', 'tr', 'uk', 'ur', 'vi', 'war', 'zh', 'yue'])
|
||||
|
||||
|
||||
class Wikipedia(ComponentBase, ABC):
|
||||
component_name = "Wikipedia"
|
||||
|
||||
def _run(self, history, **kwargs):
|
||||
ans = self.get_input()
|
||||
ans = " - ".join(ans["content"]) if "content" in ans else ""
|
||||
if not ans:
|
||||
return Wikipedia.be_output("")
|
||||
|
||||
try:
|
||||
wiki_res = []
|
||||
wikipedia.set_lang(self._param.language)
|
||||
wiki_engine = wikipedia
|
||||
for wiki_key in wiki_engine.search(ans, results=self._param.top_n):
|
||||
page = wiki_engine.page(title=wiki_key, auto_suggest=False)
|
||||
wiki_res.append({"content": '<a href="' + page.url + '">' + page.title + '</a> ' + page.summary})
|
||||
except Exception as e:
|
||||
return Wikipedia.be_output("**ERROR**: " + str(e))
|
||||
|
||||
if not wiki_res:
|
||||
return Wikipedia.be_output("")
|
||||
|
||||
df = pd.DataFrame(wiki_res)
|
||||
if DEBUG: print(df, ":::::::::::::::::::::::::::::::::")
|
||||
return df
|
||||
Reference in New Issue
Block a user