add function: upload and parse (#1889)

### What problem does this PR solve?

#1880
### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Kevin Hu
2024-08-09 16:20:02 +08:00
committed by GitHub
parent 6529c764c9
commit e3cf14a3c9
8 changed files with 255 additions and 89 deletions

View File

@ -30,24 +30,6 @@ from rag.nlp import rag_tokenizer
from rag.utils import num_tokens_from_string
def be_children(obj: dict, keyset: set):
    """Convert a parsed mind-map subtree into a list of tree nodes.

    Args:
        obj: The subtree to convert. Annotated ``dict`` for backward
            compatibility, but a ``str`` (one leaf), a ``list`` (several
            leaves), ``None`` (no children) or another scalar (stringified
            into one leaf) is accepted too.
        keyset: Labels already used as node ids. Mutated in place: every
            label emitted here is added, and dict keys whose cleaned label
            is already present are skipped.

    Returns:
        A list of ``{"id": label, "children": [...]}`` dicts. Labels have
        every run of ``*`` removed; entries whose label is empty after
        that are dropped.
    """
    if obj is None:
        return []

    if isinstance(obj, dict):
        nodes = []
        for k, v in obj.items():
            k = re.sub(r"\*+", "", str(k))
            if not k or k in keyset:
                continue
            # Register before recursing so descendants cannot reuse the id.
            keyset.add(k)
            nodes.append({
                "id": k,
                "children": be_children(v, keyset)
            })
        return nodes

    if not isinstance(obj, list):
        obj = [obj]

    leaves = []
    for item in obj:
        if isinstance(item, (dict, list)):
            # A container nested inside a list has no label of its own;
            # lift its nodes into the current level instead of crashing.
            leaves.extend(be_children(item, keyset))
            continue
        label = "" if item is None else re.sub(r"\*+", "", str(item))
        if not label:
            # Same rule as for dict keys: never emit a node with an empty id.
            continue
        # Store the cleaned label: dict keys are looked up in cleaned form,
        # so storing the raw text would let duplicates slip through.
        keyset.add(label)
        leaves.append({"id": label, "children": []})
    return leaves
def graph_merge(g1, g2):
g = g2.copy()
for n, attr in g1.nodes(data=True):
@ -153,16 +135,10 @@ def build_knowlege_graph_chunks(tenant_id: str, chunks: List[str], callback, ent
mg = mindmap(_chunks).output
if not len(mg.keys()): return chunks
if len(mg.keys()) > 1:
keyset = set([re.sub(r"\*+", "", k) for k,v in mg.items() if isinstance(v, dict) and re.sub(r"\*+", "", k)])
md_map = {"id": "root", "children": [{"id": re.sub(r"\*+", "", k), "children": be_children(v, keyset)} for k,v in mg.items() if isinstance(v, dict) and re.sub(r"\*+", "", k)]}
else:
k = re.sub(r"\*+", "", list(mg.keys())[0])
md_map = {"id": k, "children": be_children(list(mg.items())[0][1], set([k]))}
print(json.dumps(md_map, ensure_ascii=False, indent=2))
print(json.dumps(mg, ensure_ascii=False, indent=2))
chunks.append(
{
"content_with_weight": json.dumps(md_map, ensure_ascii=False, indent=2),
"content_with_weight": json.dumps(mg, ensure_ascii=False, indent=2),
"knowledge_graph_kwd": "mind_map"
})

View File

@ -57,6 +57,26 @@ class MindMapExtractor:
self._mind_map_prompt = prompt or MIND_MAP_EXTRACTION_PROMPT
self._on_error = on_error or (lambda _e, _s, _d: None)
def _key(self, k):
return re.sub(r"\*+", "", k)
def _be_children(self, obj: dict, keyset: set):
if isinstance(obj, str):
obj = [obj]
if isinstance(obj, list):
for i in obj: keyset.add(i)
return [{"id": re.sub(r"\*+", "", i), "children": []} for i in obj]
arr = []
for k, v in obj.items():
k = self._key(k)
if not k or k in keyset: continue
keyset.add(k)
arr.append({
"id": k,
"children": self._be_children(v, keyset)
})
return arr
def __call__(
self, sections: list[str], prompt_variables: dict[str, Any] | None = None
) -> MindMapResult:
@ -86,13 +106,23 @@ class MindMapExtractor:
res.append(_.result())
merge_json = reduce(self._merge, res)
merge_json = self._list_to_kv(merge_json)
# Reshape the merged {title: subtree} mapping into one {"id", "children"} tree.
if len(merge_json.keys()) > 1:
    # Several top-level titles: hang them under a synthetic "root" node.
    # Only dict-valued entries with a non-empty cleaned title are kept.
    top_level = [(self._key(k), v) for k, v in merge_json.items()
                 if isinstance(v, dict) and self._key(k)]
    # Seed the dedup set with every top-level title so no descendant reuses one.
    keyset = set(title for title, _ in top_level)
    merge_json = {"id": "root",
                  "children": [{"id": title, "children": self._be_children(subtree, keyset)}
                               for title, subtree in top_level]}
else:
    # Single top-level title: it becomes the root itself.
    # The key must come from merge_json; the previous code called .keys() on
    # the bound method self._be_children, which always raised AttributeError
    # and turned every single-root mind map into an {"error": ...} result.
    root_title, root_subtree = list(merge_json.items())[0]
    k = self._key(root_title)
    merge_json = {"id": k, "children": self._be_children(root_subtree, set([k]))}
except Exception as e:
logging.exception("error mind graph")
self._on_error(
e,
traceback.format_exc(), None
)
merge_json = {"error": str(e)}
return MindMapResult(output=merge_json)

View File

@ -23,6 +23,7 @@ MIND_MAP_EXTRACTION_PROMPT = """
4. Add a shot content summary of the bottom level section.
- Output requirement:
- Generate at least 4 levels.
- Always try to maximize the number of sub-sections.
- In language of 'Text'
- MUST IN FORMAT OF MARKDOWN