Add graphrag (#1793)

### What problem does this PR solve?

#1594

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Kevin Hu
2024-08-02 18:51:14 +08:00
committed by GitHub
parent 80032b1fc0
commit 152072f900
74 changed files with 2522 additions and 105 deletions

View File

@@ -0,0 +1,30 @@
import re

from graphrag.index import build_knowlege_graph_chunks
from rag.app import naive
from rag.nlp import rag_tokenizer, tokenize_chunks


def chunk(filename, binary, tenant_id, from_page=0, to_page=100000,
          lang="Chinese", callback=None, **kwargs):
    parser_config = kwargs.get(
        "parser_config", {
            "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": False})
    eng = lang.lower() == "english"

    # Split the document into plain-text sections; layout recognition is not
    # needed for graph extraction.
    parser_config["layout_recognize"] = False
    sections = naive.chunk(filename, binary, from_page=from_page, to_page=to_page,
                           section_only=True, parser_config=parser_config)

    # Build knowledge-graph chunks (entities/relations) from the sections.
    chunks = build_knowlege_graph_chunks(tenant_id, sections, callback,
                                         parser_config.get("entity_types",
                                                           ["organization", "person", "location", "event", "time"]))
    for c in chunks:
        c["docnm_kwd"] = filename

    # Also index the raw text sections as ordinary chunks, tagged as plain text.
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
        "knowledge_graph_kwd": "text"
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
    chunks.extend(tokenize_chunks(sections, doc, eng))

    return chunks
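
For orientation, a rough sketch of how this new chunker might be invoked in isolation. The file name, tenant id, progress callback, and parser_config values below are made up for illustration and are not part of the diff:

# Hypothetical caller of the knowledge-graph chunker above.
def progress(prog=None, msg=""):
    # Placeholder progress callback; in practice the task runner supplies one.
    print(prog, msg)

cks = chunk("annual_report.docx", binary=None, tenant_id="tenant-0",
            lang="English", callback=progress,
            parser_config={"chunk_token_num": 512,
                           "entity_types": ["organization", "person"]})
# cks mixes the graph chunks produced by build_knowlege_graph_chunks with the
# plain text sections tagged knowledge_graph_kwd="text".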

View File

@@ -273,6 +273,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        raise NotImplementedError(
            "file type not supported yet(pdf, xlsx, doc, docx, txt supported)")

+    if kwargs.get("section_only", False):
+        return [t for t, _ in sections]
+
    st = timer()
    chunks = naive_merge(
        sections, int(parser_config.get(
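
The new section_only flag short-circuits naive.chunk before the merge-and-tokenize step, which is what the knowledge-graph chunker above relies on. A small hedged sketch of the effect; the file name, content, and callback are illustrative only:

# Hypothetical: fetching raw section texts from naive.chunk with the new flag.
def noop(prog=None, msg=""):
    pass

raw_sections = naive.chunk("notes.txt", binary=b"first paragraph\n\nsecond paragraph",
                           callback=noop, section_only=True)
# -> roughly a list of plain strings such as ["first paragraph", "second paragraph"],
#    instead of the fully tokenized chunk dicts the function normally returns.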

View File

@@ -228,7 +228,7 @@ def tokenize(d, t, eng):
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])


-def tokenize_chunks(chunks, doc, eng, pdf_parser):
+def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
    res = []
    # wrap up as es documents
    for ck in chunks:

View File

@@ -64,24 +64,25 @@ class Dealer:
            "query_vector": [float(v) for v in qv]
        }

+    def _add_filters(self, bqry, req):
+        if req.get("kb_ids"):
+            bqry.filter.append(Q("terms", kb_id=req["kb_ids"]))
+        if req.get("doc_ids"):
+            bqry.filter.append(Q("terms", doc_id=req["doc_ids"]))
+        if req.get("knowledge_graph_kwd"):
+            bqry.filter.append(Q("terms", knowledge_graph_kwd=req["knowledge_graph_kwd"]))
+        if "available_int" in req:
+            if req["available_int"] == 0:
+                bqry.filter.append(Q("range", available_int={"lt": 1}))
+            else:
+                bqry.filter.append(
+                    Q("bool", must_not=Q("range", available_int={"lt": 1})))
+        return bqry
+
    def search(self, req, idxnm, emb_mdl=None):
        qst = req.get("question", "")
        bqry, keywords = self.qryr.question(qst)

-        def add_filters(bqry):
-            nonlocal req
-            if req.get("kb_ids"):
-                bqry.filter.append(Q("terms", kb_id=req["kb_ids"]))
-            if req.get("doc_ids"):
-                bqry.filter.append(Q("terms", doc_id=req["doc_ids"]))
-            if "available_int" in req:
-                if req["available_int"] == 0:
-                    bqry.filter.append(Q("range", available_int={"lt": 1}))
-                else:
-                    bqry.filter.append(
-                        Q("bool", must_not=Q("range", available_int={"lt": 1})))
-            return bqry
-        bqry = add_filters(bqry)
+        bqry = self._add_filters(bqry, req)
        bqry.boost = 0.05

        s = Search()
@@ -89,7 +90,7 @@ class Dealer:
        topk = int(req.get("topk", 1024))
        ps = int(req.get("size", topk))
        src = req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd",
-                                 "image_id", "doc_id", "q_512_vec", "q_768_vec", "position_int",
+                                 "image_id", "doc_id", "q_512_vec", "q_768_vec", "position_int", "knowledge_graph_kwd",
                                 "q_1024_vec", "q_1536_vec", "available_int", "content_with_weight"])

        s = s.query(bqry)[pg * ps:(pg + 1) * ps]
@@ -137,7 +138,7 @@ class Dealer:
        es_logger.info("TOTAL: {}".format(self.es.getTotal(res)))
        if self.es.getTotal(res) == 0 and "knn" in s:
            bqry, _ = self.qryr.question(qst, min_match="10%")
-            bqry = add_filters(bqry)
+            bqry = self._add_filters(bqry, req)
            s["query"] = bqry.to_dict()
            s["knn"]["filter"] = bqry.to_dict()
            s["knn"]["similarity"] = 0.17

View File

@@ -45,7 +45,7 @@ from rag.nlp import search, rag_tokenizer
from io import BytesIO
import pandas as pd

-from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio
+from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, knowledge_graph
from api.db import LLMType, ParserType
from api.db.services.document_service import DocumentService

@@ -68,7 +68,8 @@ FACTORY = {
    ParserType.RESUME.value: resume,
    ParserType.PICTURE.value: picture,
    ParserType.ONE.value: one,
-    ParserType.AUDIO.value: audio
+    ParserType.AUDIO.value: audio,
+    ParserType.KG.value: knowledge_graph
}
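
Finally, a small sketch of the dispatch this table presumably enables in the task executor; the document name, tenant id, fallback, and callback below are made up:

# Hypothetical dispatch: a document whose parser id is ParserType.KG now resolves
# to the new knowledge_graph chunker rather than an existing one.
chunker = FACTORY.get(ParserType.KG.value, naive)
cks = chunker.chunk("annual_report.docx", binary=None, tenant_id="tenant-0",
                    lang="English", callback=lambda prog=None, msg="": None)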