Fix Docker Compose issue (#238)

### What problem does this PR solve?

_Briefly describe what this PR aims to solve. Include background context that will help reviewers understand the purpose of the PR._

Issue link: [#226](https://github.com/infiniflow/ragflow/issues/226)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2026-01-30 23:26:36 +08:00 · 2024-04-07 09:04:32 +08:00
parent b4abbe5d93
commit 23b448cf96
14 changed files with 195 additions and 234 deletions
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@ -10,14 +10,59 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-import copy
+from io import BytesIO
+from docx import Document
 import re
 from deepdoc.parser.pdf_parser import PlainParser
 from rag.app import laws
 from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions, tokenize_chunks
-from deepdoc.parser import PdfParser, ExcelParser
+from deepdoc.parser import PdfParser, ExcelParser, DocxParser
 from rag.settings import cron_logger

+class Docx(DocxParser):
+    """Extract paragraph text and HTML-rendered tables from a .docx file."""
+
+    def __init__(self):
+        # NOTE(review): the base-class initialiser is not invoked here --
+        # confirm DocxParser needs no setup of its own.
+        pass
+
+    def __clean(self, line):
+        """Replace ideographic spaces (U+3000) with ASCII spaces and strip."""
+        return line.replace("\u3000", " ").strip()
+
+    def __row_to_html(self, row):
+        """Render one table row as ``<tr>`` markup.
+
+        A run of adjacent cells with identical text becomes a single
+        ``<td>`` carrying a ``colspan``. Only *adjacent* duplicates are
+        merged, so a cell that merely repeats an earlier, non-neighbouring
+        text is kept and nothing between them is dropped.
+        """
+        # Presumably python-docx reports a horizontally merged cell once per
+        # grid column, which is why duplicates are collapsed -- verify.
+        cells = row.cells  # read once; reused for every index below
+        parts = ["<tr>"]
+        i = 0
+        while i < len(cells):
+            text = cells[i].text
+            j = i + 1
+            while j < len(cells) and cells[j].text == text:
+                j += 1
+            span = j - i
+            if span == 1:
+                parts.append(f"<td>{text}</td>")
+            else:
+                parts.append(f"<td colspan='{span}'>{text}</td>")
+            i = j
+        parts.append("</tr>")
+        return "".join(parts)
+
+    def __call__(self, filename, binary=None, from_page=0, to_page=100000):
+        """Parse a document into text sections and tables.
+
+        Args:
+            filename: Path handed to ``Document`` when ``binary`` is falsy.
+            binary: Raw file content; when given it is parsed instead of
+                ``filename``.
+            from_page: First page index (0-based) whose paragraphs are kept.
+            to_page: Page index at which paragraph collection stops
+                (exclusive).
+
+        Returns:
+            A ``(sections, tables)`` tuple. ``sections`` is a list of
+            ``(text, "")`` pairs, one per non-empty paragraph inside the
+            page range. ``tables`` is a list of ``((None, html), "")``
+            entries, one per table in the document (empty when the
+            document has no tables).
+        """
+        self.doc = Document(
+            filename) if not binary else Document(BytesIO(binary))
+        pn = 0
+        lines = []
+        for p in self.doc.paragraphs:
+            if pn > to_page:
+                break
+            if from_page <= pn < to_page and p.text.strip():
+                lines.append(self.__clean(p.text))
+            # The page counter advances on page-break markers in the run XML;
+            # a run is counted at most once even if it matches both checks.
+            for run in p.runs:
+                xml = run._element.xml
+                if 'lastRenderedPageBreak' in xml:
+                    pn += 1
+                elif 'w:br' in xml and 'type="page"' in xml:
+                    pn += 1
+        tbls = []
+        for tb in self.doc.tables:
+            rows = "".join(self.__row_to_html(r) for r in tb.rows)
+            # Appended per table: every table is kept, and a document
+            # without tables simply yields an empty list.
+            tbls.append(((None, f"<table>{rows}</table>"), ""))
+        return [(text, "") for text in lines if text], tbls
+

 class Pdf(PdfParser):
    def __call__(self, filename, binary=None, from_page=0,
@ -75,8 +120,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
    sections = []
    if re.search(r"\.docx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
-        for txt in laws.Docx()(filename, binary):
-            sections.append((txt, ""))
+        sections, tbls = Docx()(filename, binary)
+        res = tokenize_table(tbls, doc, eng)
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
--- a/rag/app/table.py
+++ b/rag/app/table.py
@ -223,8 +223,8 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
                    continue
                if not str(row[clmns[j]]):
                    continue
-                if pd.isna(row[clmns[j]]):
-                    continue
+                #if pd.isna(row[clmns[j]]):
+                #    continue
                fld = clmns_map[j][0]
                d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else huqie.qie(
                    row[clmns[j]])