fix docker compose issue (#238)

### What problem does this PR solve?

Fixes the docker compose issue reported in the linked issue below.

Issue link: [#226](https://github.com/infiniflow/ragflow/issues/226)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
Author: KevinHuSh
Committed: 2024-04-07 09:04:32 +08:00 (via GitHub)
Parent: b4abbe5d93
Commit: 23b448cf96
14 changed files with 195 additions and 234 deletions


@@ -10,14 +10,59 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
from io import BytesIO
from docx import Document
import re
from deepdoc.parser.pdf_parser import PlainParser
from rag.app import laws
from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions, tokenize_chunks
from deepdoc.parser import PdfParser, ExcelParser
from deepdoc.parser import PdfParser, ExcelParser, DocxParser
from rag.settings import cron_logger


class Docx(DocxParser):
    def __init__(self):
        pass

    def __clean(self, line):
        line = re.sub(r"\u3000", " ", line).strip()
        return line

    def __call__(self, filename, binary=None, from_page=0, to_page=100000):
        self.doc = Document(
            filename) if not binary else Document(BytesIO(binary))
        pn = 0
        lines = []
        for p in self.doc.paragraphs:
            if pn > to_page:
                break
            if from_page <= pn < to_page and p.text.strip():
                lines.append(self.__clean(p.text))
            for run in p.runs:
                if 'lastRenderedPageBreak' in run._element.xml:
                    pn += 1
                    continue
                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
                    pn += 1

        tbls = []
        for tb in self.doc.tables:
            html = "<table>"
            for r in tb.rows:
                html += "<tr>"
                i = 0
                while i < len(r.cells):
                    span = 1
                    c = r.cells[i]
                    for j in range(i + 1, len(r.cells)):
                        if c.text == r.cells[j].text:
                            span += 1
                            i = j
                    i += 1
                    html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>"
                html += "</tr>"
            html += "</table>"
            tbls.append(((None, html), ""))
        return [(l, "") for l in lines if l], tbls


class Pdf(PdfParser):
    def __call__(self, filename, binary=None, from_page=0,
@@ -75,8 +120,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
    sections = []
    if re.search(r"\.docx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        for txt in laws.Docx()(filename, binary):
            sections.append((txt, ""))
        sections, tbls = Docx()(filename, binary)
        res = tokenize_table(tbls, doc, eng)
        callback(0.8, "Finish parsing.")
    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
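
For reviewers, a hedged usage sketch of the new `Docx` parser added above. The import path and the sample file name are assumptions (the scraped diff does not show the file name); only the call signature and return shapes visible in the hunk are used: a list of `(text, "")` section tuples plus a list of `((None, html), "")` table tuples, which `chunk()` now passes to `tokenize_table`.

```python
# Sketch only: exercising the new Docx parser the way chunk() now does.
# ASSUMPTIONS: the class lives in rag/app/naive.py and "sample.docx" exists.
from rag.app.naive import Docx

sections, tbls = Docx()("sample.docx", binary=None, from_page=0, to_page=100000)

for text, _ in sections:        # paragraphs come back as (text, "") tuples
    print(text[:80])

for (img, html), _ in tbls:     # tables come back as ((None, html), "")
    assert img is None          # the DOCX path never renders an image
    print(html[:120])           # merged cells appear as <td colspan='N'>
```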


@@ -223,8 +223,8 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
                continue
            if not str(row[clmns[j]]):
                continue
            if pd.isna(row[clmns[j]]):
                continue
            #if pd.isna(row[clmns[j]]):
            #    continue
            fld = clmns_map[j][0]
            d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else huqie.qie(
                row[clmns[j]])
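
Context for the guard commented out above: `pd.isna()` treats `NaN` and `None` cells as missing, while an empty string is not, so it filtered a different set of rows than the `not str(...)` check two lines earlier. A standalone illustration, not part of the patch:

```python
# Standalone illustration of what the pd.isna() guard used to skip.
import numpy as np
import pandas as pd

print(pd.isna(np.nan))  # True  -> such cells were dropped before this change
print(pd.isna(None))    # True
print(pd.isna(""))      # False -> empty strings were already caught by `not str(...)`
```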


@@ -170,3 +170,4 @@ class LocalLLM(Base):
            return ans, num_tokens_from_string(ans)
        except Exception as e:
            return "**ERROR**: " + str(e), 0


@@ -68,6 +68,7 @@ def bullets_category(sections):
def is_english(texts):
    eng = 0
    if not texts: return False
    for t in texts:
        if re.match(r"[a-zA-Z]{2,}", t.strip()):
            eng += 1
@@ -112,8 +113,8 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
            d = copy.deepcopy(doc)
            tokenize(d, rows, eng)
            d["content_with_weight"] = rows
            d["image"] = img
            add_positions(d, poss)
            if img: d["image"] = img
            if poss: add_positions(d, poss)
            res.append(d)
            continue
        de = "; " if eng else " "


@@ -46,7 +46,7 @@ class Dealer:
            "k": topk,
            "similarity": sim,
            "num_candidates": topk * 2,
            "query_vector": qv
            "query_vector": list(qv)
        }

    def search(self, req, idxnm, emb_mdl=None):
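
On the `list(qv)` change: the query vector returned by an embedding model is usually a NumPy array, and the JSON body of an Elasticsearch kNN query cannot carry an `ndarray` directly, which is the likely reason for the cast. A standalone illustration, assuming a float64 vector:

```python
# Standalone illustration: ndarray vs. plain list in a JSON request body.
import json
import numpy as np

qv = np.array([0.1, 0.2, 0.3])                 # stand-in for an embedding vector
# json.dumps({"query_vector": qv})             # TypeError: Object of type ndarray is not JSON serializable
print(json.dumps({"query_vector": list(qv)}))  # fine once converted to a plain list
```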