mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Tagging (#4426)
### What problem does this PR solve?
Closes #4367.
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -26,6 +26,7 @@ from docx import Document
|
||||
from PIL import Image
|
||||
from markdown import markdown
|
||||
|
||||
|
||||
class Excel(ExcelParser):
|
||||
def __call__(self, fnm, binary=None, callback=None):
|
||||
if not binary:
|
||||
@ -58,11 +59,11 @@ class Excel(ExcelParser):
|
||||
if len(res) % 999 == 0:
|
||||
callback(len(res) *
|
||||
0.6 /
|
||||
total, ("Extract Q&A: {}".format(len(res)) +
|
||||
total, ("Extract pairs: {}".format(len(res)) +
|
||||
(f"{len(fails)} failure, line: %s..." %
|
||||
(",".join(fails[:3])) if fails else "")))
|
||||
|
||||
callback(0.6, ("Extract Q&A: {}. ".format(len(res)) + (
|
||||
callback(0.6, ("Extract pairs: {}. ".format(len(res)) + (
|
||||
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
|
||||
self.is_english = is_english(
|
||||
[rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1])
|
||||
@ -269,7 +270,7 @@ def beAdocPdf(d, q, a, eng, image, poss):
|
||||
return d
|
||||
|
||||
|
||||
def beAdocDocx(d, q, a, eng, image):
|
||||
def beAdocDocx(d, q, a, eng, image, row_num=-1):
|
||||
qprefix = "Question: " if eng else "问题:"
|
||||
aprefix = "Answer: " if eng else "回答:"
|
||||
d["content_with_weight"] = "\t".join(
|
||||
@ -277,16 +278,20 @@ def beAdocDocx(d, q, a, eng, image):
|
||||
d["content_ltks"] = rag_tokenizer.tokenize(q)
|
||||
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
|
||||
d["image"] = image
|
||||
if row_num >= 0:
|
||||
d["top_int"] = [row_num]
|
||||
return d
|
||||
|
||||
|
||||
def beAdoc(d, q, a, eng, row_num=-1):
    """Populate chunk dict *d* with a formatted question/answer pair.

    The question and answer are joined with a TAB, each prefixed by a
    language-appropriate label (English or Chinese).  Tokenizations of the
    question are stored for retrieval.  When ``row_num`` is non-negative it
    is recorded under ``"top_int"`` so the chunk keeps its source-row order.
    Returns *d* for convenient chaining.
    """
    if eng:
        qprefix, aprefix = "Question: ", "Answer: "
    else:
        qprefix, aprefix = "问题:", "回答:"
    d["content_with_weight"] = f"{qprefix}{rmPrefix(q)}\t{aprefix}{rmPrefix(a)}"
    tokens = rag_tokenizer.tokenize(q)
    d["content_ltks"] = tokens
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(tokens)
    if row_num >= 0:
        d["top_int"] = [row_num]
    return d
|
||||
|
||||
|
||||
@ -316,8 +321,8 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
||||
if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
|
||||
callback(0.1, "Start to parse.")
|
||||
excel_parser = Excel()
|
||||
for q, a in excel_parser(filename, binary, callback):
|
||||
res.append(beAdoc(deepcopy(doc), q, a, eng))
|
||||
for ii, (q, a) in enumerate(excel_parser(filename, binary, callback)):
|
||||
res.append(beAdoc(deepcopy(doc), q, a, eng, ii))
|
||||
return res
|
||||
|
||||
elif re.search(r"\.(txt)$", filename, re.IGNORECASE):
|
||||
@ -344,7 +349,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
||||
fails.append(str(i+1))
|
||||
elif len(arr) == 2:
|
||||
if question and answer:
|
||||
res.append(beAdoc(deepcopy(doc), question, answer, eng))
|
||||
res.append(beAdoc(deepcopy(doc), question, answer, eng, i))
|
||||
question, answer = arr
|
||||
i += 1
|
||||
if len(res) % 999 == 0:
|
||||
@ -352,7 +357,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
||||
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
|
||||
|
||||
if question:
|
||||
res.append(beAdoc(deepcopy(doc), question, answer, eng))
|
||||
res.append(beAdoc(deepcopy(doc), question, answer, eng, len(lines)))
|
||||
|
||||
callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
|
||||
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
|
||||
@ -378,14 +383,14 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
||||
fails.append(str(i + 1))
|
||||
elif len(row) == 2:
|
||||
if question and answer:
|
||||
res.append(beAdoc(deepcopy(doc), question, answer, eng))
|
||||
res.append(beAdoc(deepcopy(doc), question, answer, eng, i))
|
||||
question, answer = row
|
||||
if len(res) % 999 == 0:
|
||||
callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
|
||||
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
|
||||
|
||||
if question:
|
||||
res.append(beAdoc(deepcopy(doc), question, answer, eng))
|
||||
res.append(beAdoc(deepcopy(doc), question, answer, eng, len(reader)))
|
||||
|
||||
callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
|
||||
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
|
||||
@ -420,7 +425,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
||||
if last_answer.strip():
|
||||
sum_question = '\n'.join(question_stack)
|
||||
if sum_question:
|
||||
res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng))
|
||||
res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng, index))
|
||||
last_answer = ''
|
||||
|
||||
i = question_level
|
||||
@ -432,7 +437,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
||||
if last_answer.strip():
|
||||
sum_question = '\n'.join(question_stack)
|
||||
if sum_question:
|
||||
res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng))
|
||||
res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng, index))
|
||||
return res
|
||||
|
||||
elif re.search(r"\.docx$", filename, re.IGNORECASE):
|
||||
@ -440,8 +445,8 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
||||
qai_list, tbls = docx_parser(filename, binary,
|
||||
from_page=0, to_page=10000, callback=callback)
|
||||
res = tokenize_table(tbls, doc, eng)
|
||||
for q, a, image in qai_list:
|
||||
res.append(beAdocDocx(deepcopy(doc), q, a, eng, image))
|
||||
for i, (q, a, image) in enumerate(qai_list):
|
||||
res.append(beAdocDocx(deepcopy(doc), q, a, eng, image, i))
|
||||
return res
|
||||
|
||||
raise NotImplementedError(
|
||||
|
||||
125
rag/app/tag.py
Normal file
125
rag/app/tag.py
Normal file
@ -0,0 +1,125 @@
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import re
|
||||
import csv
|
||||
from copy import deepcopy
|
||||
|
||||
from deepdoc.parser.utils import get_text
|
||||
from rag.app.qa import Excel
|
||||
from rag.nlp import rag_tokenizer
|
||||
|
||||
|
||||
def beAdoc(d, q, a, eng, row_num=-1):
    """Populate chunk dict *d* with content *q* and its tag string *a*.

    *a* is a comma-separated list of tags; empty entries are dropped and
    the rest are stripped and stored under ``"tag_kwd"``.  Tokenizations of
    the content are stored for retrieval.  When ``row_num`` is non-negative
    it is recorded under ``"top_int"`` so the chunk keeps its source-row
    order.  Returns *d* for convenient chaining.
    """
    d["content_with_weight"] = q
    tokens = rag_tokenizer.tokenize(q)
    d["content_ltks"] = tokens
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(tokens)
    # str.strip() of an empty/whitespace entry is falsy, so filter drops it.
    d["tag_kwd"] = list(filter(None, (tag.strip() for tag in a.split(","))))
    if row_num >= 0:
        d["top_int"] = [row_num]
    return d
|
||||
|
||||
|
||||
def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
    """
    Excel and csv(txt) format files are supported.
    If the file is in excel format, there should be 2 column content and tags without header.
    And content column is ahead of tags column.
    And it's O.K if it has multiple sheets as long as the columns are rightly composed.

    If it's in csv format, it should be UTF-8 encoded. Use TAB as delimiter to separate content and tags.

    All the deformed lines will be ignored.
    Every pair will be treated as a chunk.
    """
    eng = lang.lower() == "english"
    res = []
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
    if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        excel_parser = Excel()
        # Row index doubles as the chunk's ordering key ("top_int").
        for ii, (q, a) in enumerate(excel_parser(filename, binary, callback)):
            res.append(beAdoc(deepcopy(doc), q, a, eng, ii))
        return res

    elif re.search(r"\.(txt)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        txt = get_text(filename, binary)
        lines = txt.split("\n")
        # Guess the delimiter: whichever of TAB / comma splits more lines into
        # exactly two fields wins (TAB on a tie).
        comma, tab = 0, 0
        for line in lines:
            if len(line.split(",")) == 2:
                comma += 1
            if len(line.split("\t")) == 2:
                tab += 1
        delimiter = "\t" if tab >= comma else ","

        # Deformed lines are folded into the next chunk's content rather than
        # failed, so `fails` stays empty; kept for the message format shared
        # with the other parsers.
        fails = []
        content = ""
        i = 0
        while i < len(lines):
            arr = lines[i].split(delimiter)
            if len(arr) != 2:
                # Continuation line: accumulate into the pending content.
                # NOTE(review): content keeps a leading "\n" per accumulated
                # line — preserved as upstream behavior.
                content += "\n" + lines[i]
            else:
                content += "\n" + arr[0]
                res.append(beAdoc(deepcopy(doc), content, arr[1], eng, i))
                content = ""
            i += 1
            # Guard on `res` so the callback isn't fired on every iteration
            # while res is still empty (0 % 999 == 0).
            if res and len(res) % 999 == 0:
                callback(len(res) * 0.6 / len(lines), ("Extract TAG: {}".format(len(res)) + (
                    f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))

        callback(0.6, ("Extract TAG: {}".format(len(res)) + (
            f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))

        return res

    elif re.search(r"\.(csv)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        txt = get_text(filename, binary)
        lines = txt.split("\n")
        delimiter = "\t" if any("\t" in line for line in lines) else ","

        fails = []
        content = ""
        reader = csv.reader(lines, delimiter=delimiter)

        for i, row in enumerate(reader):
            if len(row) != 2:
                # NOTE(review): assumes one csv record per input line; a quoted
                # field spanning lines would desynchronize i and lines[i] —
                # confirm against real inputs.
                content += "\n" + lines[i]
            else:
                content += "\n" + row[0]
                res.append(beAdoc(deepcopy(doc), content, row[1], eng, i))
                content = ""
            # Guard on `res` so the callback isn't fired while res is empty.
            if res and len(res) % 999 == 0:
                callback(len(res) * 0.6 / len(lines), ("Extract TAG: {}".format(len(res)) + (
                    f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))

        callback(0.6, ("Extract TAG: {}".format(len(res)) + (
            f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
        return res

    raise NotImplementedError(
        "Excel, csv(txt) format files are supported.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import sys

    def _noop_progress(prog=None, msg=""):
        """Discard progress reports when running from the command line."""
        pass

    chunk(sys.argv[1], from_page=0, to_page=10, callback=_noop_progress)
|
||||
Reference in New Issue
Block a user