fix docker compose issue (#238)

### What problem does this PR solve?

Fixes the docker compose issue reported in the linked issue below.

Issue link: [#226](https://github.com/infiniflow/ragflow/issues/226)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
Author: KevinHuSh
Committed: 2024-04-07 09:04:32 +08:00 (via GitHub)
Parent: b4abbe5d93
Commit: 23b448cf96
14 changed files with 195 additions and 234 deletions


@@ -10,14 +10,59 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
from io import BytesIO
from docx import Document
import re
from deepdoc.parser.pdf_parser import PlainParser
from rag.app import laws
from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions, tokenize_chunks
from deepdoc.parser import PdfParser, ExcelParser
from deepdoc.parser import PdfParser, ExcelParser, DocxParser
from rag.settings import cron_logger


class Docx(DocxParser):
    def __init__(self):
        pass

    def __clean(self, line):
        line = re.sub(r"\u3000", " ", line).strip()
        return line

    def __call__(self, filename, binary=None, from_page=0, to_page=100000):
        self.doc = Document(
            filename) if not binary else Document(BytesIO(binary))
        pn = 0
        lines = []
        for p in self.doc.paragraphs:
            if pn > to_page:
                break
            if from_page <= pn < to_page and p.text.strip():
                lines.append(self.__clean(p.text))
            for run in p.runs:
                if 'lastRenderedPageBreak' in run._element.xml:
                    pn += 1
                    continue
                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
                    pn += 1

        tbls = []
        for tb in self.doc.tables:
            html = "<table>"
            for r in tb.rows:
                html += "<tr>"
                i = 0
                while i < len(r.cells):
                    span = 1
                    c = r.cells[i]
                    for j in range(i + 1, len(r.cells)):
                        if c.text == r.cells[j].text:
                            span += 1
                            i = j
                    i += 1
                    html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>"
                html += "</tr>"
            html += "</table>"
            tbls.append(((None, html), ""))
        return [(l, "") for l in lines if l], tbls


class Pdf(PdfParser):
    def __call__(self, filename, binary=None, from_page=0,
@@ -75,8 +120,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
    sections = []
    if re.search(r"\.docx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        for txt in laws.Docx()(filename, binary):
            sections.append((txt, ""))
        sections, tbls = Docx()(filename, binary)
        res = tokenize_table(tbls, doc, eng)
        callback(0.8, "Finish parsing.")
    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
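
For reviewers, a hedged usage sketch of the new `Docx` parser added above. The import path and the sample file name are assumptions (the scraped diff does not show the file name); only the call signature and return shapes visible in the hunk are used: a list of `(text, "")` section tuples plus a list of `((None, html), "")` table tuples, which `chunk()` now passes to `tokenize_table`.

```python
# Sketch only: exercising the new Docx parser the way chunk() now does.
# ASSUMPTIONS: the class lives in rag/app/naive.py and "sample.docx" exists.
from rag.app.naive import Docx

sections, tbls = Docx()("sample.docx", binary=None, from_page=0, to_page=100000)

for text, _ in sections:        # paragraphs come back as (text, "") tuples
    print(text[:80])

for (img, html), _ in tbls:     # tables come back as ((None, html), "")
    assert img is None          # the DOCX path never renders an image
    print(html[:120])           # merged cells appear as <td colspan='N'>
```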


@@ -223,8 +223,8 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
                continue
            if not str(row[clmns[j]]):
                continue
            if pd.isna(row[clmns[j]]):
                continue
            #if pd.isna(row[clmns[j]]):
            #    continue
            fld = clmns_map[j][0]
            d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else huqie.qie(
                row[clmns[j]])
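
Context for the guard commented out above: `pd.isna()` treats `NaN` and `None` cells as missing, while an empty string is not, so it filtered a different set of rows than the `not str(...)` check two lines earlier. A standalone illustration, not part of the patch:

```python
# Standalone illustration of what the pd.isna() guard used to skip.
import numpy as np
import pandas as pd

print(pd.isna(np.nan))  # True  -> such cells were dropped before this change
print(pd.isna(None))    # True
print(pd.isna(""))      # False -> empty strings were already caught by `not str(...)`
```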


@@ -170,3 +170,4 @@ class LocalLLM(Base):
            return ans, num_tokens_from_string(ans)
        except Exception as e:
            return "**ERROR**: " + str(e), 0


@@ -68,6 +68,7 @@ def bullets_category(sections):
def is_english(texts):
    eng = 0
    if not texts: return False
    for t in texts:
        if re.match(r"[a-zA-Z]{2,}", t.strip()):
            eng += 1
@@ -112,8 +113,8 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
            d = copy.deepcopy(doc)
            tokenize(d, rows, eng)
            d["content_with_weight"] = rows
            d["image"] = img
            add_positions(d, poss)
            if img: d["image"] = img
            if poss: add_positions(d, poss)
            res.append(d)
            continue
        de = "; " if eng else " "


@@ -46,7 +46,7 @@ class Dealer:
            "k": topk,
            "similarity": sim,
            "num_candidates": topk * 2,
            "query_vector": qv
            "query_vector": list(qv)
        }

    def search(self, req, idxnm, emb_mdl=None):
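
On the `list(qv)` change: the query vector returned by an embedding model is usually a NumPy array, and the JSON body of an Elasticsearch kNN query cannot carry an `ndarray` directly, which is the likely reason for the cast. A standalone illustration, assuming a float64 vector:

```python
# Standalone illustration: ndarray vs. plain list in a JSON request body.
import json
import numpy as np

qv = np.array([0.1, 0.2, 0.3])                 # stand-in for an embedding vector
# json.dumps({"query_vector": qv})             # TypeError: Object of type ndarray is not JSON serializable
print(json.dumps({"query_vector": list(qv)}))  # fine once converted to a plain list
```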