Mirror of https://github.com/infiniflow/ragflow.git (synced 2025-12-08 20:42:30 +08:00)
Add graphrag (#1793)
### What problem does this PR solve?

#1594

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
rag/app/knowledge_graph.py (new file, +30 lines)

@@ -0,0 +1,30 @@
+import re
+
+from graphrag.index import build_knowlege_graph_chunks
+from rag.app import naive
+from rag.nlp import rag_tokenizer, tokenize_chunks
+
+
+def chunk(filename, binary, tenant_id, from_page=0, to_page=100000,
+          lang="Chinese", callback=None, **kwargs):
+    parser_config = kwargs.get(
+        "parser_config", {
+            "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": False})
+    eng = lang.lower() == "english"
+
+    parser_config["layout_recognize"] = False
+    sections = naive.chunk(filename, binary, from_page=from_page, to_page=to_page, section_only=True, parser_config=parser_config)
+    chunks = build_knowlege_graph_chunks(tenant_id, sections, callback,
+                                         parser_config.get("entity_types", ["organization", "person", "location", "event", "time"])
+                                         )
+    for c in chunks: c["docnm_kwd"] = filename
+
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
+        "knowledge_graph_kwd": "text"
+    }
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+    chunks.extend(tokenize_chunks(sections, doc, eng))
+
+    return chunks
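For orientation, a minimal sketch of how this new entry point might be invoked; the file name, tenant id, and progress callback below are illustrative placeholders, not part of the commit:

    from rag.app import knowledge_graph

    def progress(prog=None, msg=""):
        # hypothetical progress hook; real callers report back to the task queue
        print(prog, msg)

    with open("report.txt", "rb") as f:              # placeholder input document
        chunks = knowledge_graph.chunk(
            "report.txt", f.read(), "tenant-0",      # "tenant-0" is a made-up tenant id
            lang="English", callback=progress,
            parser_config={"chunk_token_num": 512,
                           "entity_types": ["organization", "person", "event"]})
    # `chunks` holds the knowledge-graph chunks plus the plain tokenized sections.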
@@ -273,6 +273,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         raise NotImplementedError(
             "file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
 
+    if kwargs.get("section_only", False):
+        return [t for t, _ in sections]
+
     st = timer()
     chunks = naive_merge(
         sections, int(parser_config.get(
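The new `section_only` switch lets knowledge_graph.py reuse naive's file parsing while skipping the merge into token-budgeted chunks. A sketch of the two call shapes, assuming a placeholder file and a no-op callback (both assumptions, not shown in this diff):

    from rag.app import naive

    noop = lambda prog=None, msg="": None            # no-op progress hook (assumption)
    with open("doc.txt", "rb") as f:                 # placeholder file
        data = f.read()

    merged = naive.chunk("doc.txt", data, callback=noop)    # merged, tokenized chunks
    sections = naive.chunk("doc.txt", data, callback=noop,
                           section_only=True)               # raw section texts, no merging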
@@ -228,7 +228,7 @@ def tokenize(d, t, eng):
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
 
 
-def tokenize_chunks(chunks, doc, eng, pdf_parser):
+def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
     res = []
     # wrap up as es documents
     for ck in chunks:
@@ -64,24 +64,25 @@ class Dealer:
             "query_vector": [float(v) for v in qv]
         }
 
+    def _add_filters(self, bqry, req):
+        if req.get("kb_ids"):
+            bqry.filter.append(Q("terms", kb_id=req["kb_ids"]))
+        if req.get("doc_ids"):
+            bqry.filter.append(Q("terms", doc_id=req["doc_ids"]))
+        if req.get("knowledge_graph_kwd"):
+            bqry.filter.append(Q("terms", knowledge_graph_kwd=req["knowledge_graph_kwd"]))
+        if "available_int" in req:
+            if req["available_int"] == 0:
+                bqry.filter.append(Q("range", available_int={"lt": 1}))
+            else:
+                bqry.filter.append(
+                    Q("bool", must_not=Q("range", available_int={"lt": 1})))
+        return bqry
+
     def search(self, req, idxnm, emb_mdl=None):
         qst = req.get("question", "")
         bqry, keywords = self.qryr.question(qst)
-        def add_filters(bqry):
-            nonlocal req
-            if req.get("kb_ids"):
-                bqry.filter.append(Q("terms", kb_id=req["kb_ids"]))
-            if req.get("doc_ids"):
-                bqry.filter.append(Q("terms", doc_id=req["doc_ids"]))
-            if "available_int" in req:
-                if req["available_int"] == 0:
-                    bqry.filter.append(Q("range", available_int={"lt": 1}))
-                else:
-                    bqry.filter.append(
-                        Q("bool", must_not=Q("range", available_int={"lt": 1})))
-            return bqry
-
-        bqry = add_filters(bqry)
+        bqry = self._add_filters(bqry, req)
         bqry.boost = 0.05
 
         s = Search()
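To illustrate what `_add_filters` produces, a minimal sketch building the same filters directly with elasticsearch_dsl; the request values are invented:

    from elasticsearch_dsl import Q

    req = {"kb_ids": ["kb-1"], "knowledge_graph_kwd": ["entity", "relation"],
           "available_int": 0}                                  # made-up request
    bqry = Q("bool", must=[Q("match", content_ltks="graph")])   # stand-in base query
    bqry.filter.append(Q("terms", kb_id=req["kb_ids"]))
    bqry.filter.append(Q("terms", knowledge_graph_kwd=req["knowledge_graph_kwd"]))
    bqry.filter.append(Q("range", available_int={"lt": 1}))     # available_int == 0 branch
    print(bqry.to_dict())   # bool query with the terms/range filters attached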
@@ -89,7 +90,7 @@ class Dealer:
         topk = int(req.get("topk", 1024))
         ps = int(req.get("size", topk))
         src = req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd",
-                                 "image_id", "doc_id", "q_512_vec", "q_768_vec", "position_int",
+                                 "image_id", "doc_id", "q_512_vec", "q_768_vec", "position_int", "knowledge_graph_kwd",
                                  "q_1024_vec", "q_1536_vec", "available_int", "content_with_weight"])
 
         s = s.query(bqry)[pg * ps:(pg + 1) * ps]
@@ -137,7 +138,7 @@ class Dealer:
         es_logger.info("TOTAL: {}".format(self.es.getTotal(res)))
         if self.es.getTotal(res) == 0 and "knn" in s:
             bqry, _ = self.qryr.question(qst, min_match="10%")
-            bqry = add_filters(bqry)
+            bqry = self._add_filters(bqry, req)
             s["query"] = bqry.to_dict()
             s["knn"]["filter"] = bqry.to_dict()
             s["knn"]["similarity"] = 0.17
@@ -45,7 +45,7 @@ from rag.nlp import search, rag_tokenizer
 from io import BytesIO
 import pandas as pd
 
-from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio
+from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, knowledge_graph
 
 from api.db import LLMType, ParserType
 from api.db.services.document_service import DocumentService
@@ -68,7 +68,8 @@ FACTORY = {
     ParserType.RESUME.value: resume,
     ParserType.PICTURE.value: picture,
     ParserType.ONE.value: one,
-    ParserType.AUDIO.value: audio
+    ParserType.AUDIO.value: audio,
+    ParserType.KG.value: knowledge_graph
 }
 
 
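With the new FACTORY entry, the task executor can route knowledge-graph tasks to the new parser. A minimal sketch of that dispatch, assuming a task payload with a "parser_id" field and a fallback to naive (both assumptions, not shown in this diff):

    task = {"parser_id": ParserType.KG.value}                 # hypothetical task payload
    chunker = FACTORY.get(task["parser_id"].lower(), naive)   # fallback is an assumption
    # chunker is now rag.app.knowledge_graph; its chunk() runs the graphrag pipeline:
    # chunker.chunk(filename, binary, tenant_id, lang="English", callback=progress)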