Fix bug in fetching the knowledge graph (#3394)

### What problem does this PR solve?


### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Kevin Hu
2024-11-14 12:29:15 +08:00
committed by GitHub
parent 400fc3f5e9
commit 4caf932808
6 changed files with 120 additions and 62 deletions

View File

@ -10,6 +10,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import re
from deepdoc.parser.utils import get_text
from rag.nlp import num_tokens_from_string
@ -29,8 +31,6 @@ class RAGFlowTxtParser:
def add_chunk(t):
nonlocal cks, tk_nums, delimiter
tnum = num_tokens_from_string(t)
if tnum < 8:
pos = ""
if tk_nums[-1] > chunk_token_num:
cks.append(t)
tk_nums.append(tnum)
@ -38,15 +38,19 @@ class RAGFlowTxtParser:
cks[-1] += t
tk_nums[-1] += tnum
s, e = 0, 1
while e < len(txt):
if txt[e] in delimiter:
add_chunk(txt[s: e + 1])
s = e + 1
e = s + 1
else:
e += 1
if s < e:
add_chunk(txt[s: e + 1])
# Build the list of delimiter tokens from `delimiter`. A run wrapped in
# backticks is taken as one multi-character delimiter; every character
# outside a backtick pair is a single-character delimiter.
dels = []
s = 0
for m in re.finditer(r"`([^`]+)`", delimiter, re.I):
    f, t = m.span()
    dels.append(m.group(1))
    # Plain characters between the previous backtick group and this one.
    dels.extend(list(delimiter[s: f]))
    s = t
if s < len(delimiter):
    # Plain characters after the last backtick group.
    dels.extend(list(delimiter[s:]))
# Escape the parsed tokens, not the raw `delimiter` string: iterating over
# `delimiter` here would throw away the multi-character tokens collected
# above and turn the backtick characters themselves into delimiters.
dels = [re.escape(d) for d in dels if d]
dels = "|".join(dels)
# The capturing group makes re.split return the matched delimiters as
# separate sections, so no text is lost when the sections are re-joined.
secs = re.split(r"(%s)" % dels, txt)
for sec in secs:
    add_chunk(sec)
return [[c,""] for c in cks]
return [[c, ""] for c in cks]