Mirror of https://github.com/infiniflow/ragflow.git, synced 2025-12-08 20:42:30 +08:00
Convert source file format from Windows/DOS to Unix (#1949)
### What problem does this PR solve?

The related source file is in Windows/DOS format; it is converted to Unix format.

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
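For reference, the conversion itself amounts to rewriting CRLF line endings as LF. Below is a minimal sketch of such a conversion in Python; it is illustrative only and not the tool actually used for this PR (a utility such as dos2unix achieves the same result).

    # Minimal sketch: rewrite CRLF (Windows/DOS) line endings as LF (Unix) in place.
    # Illustrative only; not the tool used in this PR.
    import sys

    def to_unix(path: str) -> None:
        with open(path, "rb") as f:   # read raw bytes to avoid newline translation
            data = f.read()
        with open(path, "wb") as f:
            f.write(data.replace(b"\r\n", b"\n"))

    if __name__ == "__main__":
        for p in sys.argv[1:]:        # e.g. python to_unix.py rag/app/naive.py
            to_unix(p)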
rag/app/naive.py: 564 changed lines (282 additions, 282 deletions)
@@ -1,282 +1,282 @@
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
from tika import parser
from io import BytesIO
from docx import Document
from timeit import default_timer as timer
import re
from deepdoc.parser.pdf_parser import PlainParser
from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, naive_merge_docx, tokenize_chunks_docx
from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser, TxtParser
from rag.settings import cron_logger
from rag.utils import num_tokens_from_string
from PIL import Image
from functools import reduce
from markdown import markdown
from docx.image.exceptions import UnrecognizedImageError


class Docx(DocxParser):
    def __init__(self):
        pass

    def get_picture(self, document, paragraph):
        # Grab the first embedded image in this paragraph, if any.
        img = paragraph._element.xpath('.//pic:pic')
        if not img:
            return None
        img = img[0]
        embed = img.xpath('.//a:blip/@r:embed')[0]
        related_part = document.part.related_parts[embed]
        try:
            image_blob = related_part.image.blob
        except UnrecognizedImageError:
            print("Unrecognized image format. Skipping image.")
            return None
        try:
            image = Image.open(BytesIO(image_blob)).convert('RGB')
            return image
        except Exception:
            return None

    def __clean(self, line):
        line = re.sub(r"\u3000", " ", line).strip()
        return line

    def __call__(self, filename, binary=None, from_page=0, to_page=100000):
        self.doc = Document(
            filename) if not binary else Document(BytesIO(binary))
        pn = 0
        lines = []
        last_image = None
        for p in self.doc.paragraphs:
            if pn > to_page:
                break
            if from_page <= pn < to_page:
                if p.text.strip():
                    if p.style and p.style.name == 'Caption':
                        former_image = None
                        if lines and lines[-1][1] and lines[-1][2] != 'Caption':
                            former_image = lines[-1][1].pop()
                        elif last_image:
                            former_image = last_image
                            last_image = None
                        lines.append((self.__clean(p.text), [former_image], p.style.name))
                    else:
                        current_image = self.get_picture(self.doc, p)
                        image_list = [current_image]
                        if last_image:
                            image_list.insert(0, last_image)
                            last_image = None
                        lines.append((self.__clean(p.text), image_list, p.style.name))
                else:
                    if current_image := self.get_picture(self.doc, p):
                        if lines:
                            lines[-1][1].append(current_image)
                        else:
                            last_image = current_image
            # A rendered or explicit page break advances the page counter.
            for run in p.runs:
                if 'lastRenderedPageBreak' in run._element.xml:
                    pn += 1
                    continue
                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
                    pn += 1
        new_line = [(line[0], reduce(concat_img, line[1]) if line[1] else None) for line in lines]

        # Serialize tables to HTML, merging runs of identical cells into colspans.
        tbls = []
        for tb in self.doc.tables:
            html = "<table>"
            for r in tb.rows:
                html += "<tr>"
                i = 0
                while i < len(r.cells):
                    span = 1
                    c = r.cells[i]
                    for j in range(i + 1, len(r.cells)):
                        if c.text == r.cells[j].text:
                            span += 1
                            i = j
                    i += 1
                    html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>"
                html += "</tr>"
            html += "</table>"
            tbls.append(((None, html), ""))
        return new_line, tbls


class Pdf(PdfParser):
    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, zoomin=3, callback=None):
        start = timer()
        callback(msg="OCR is running...")
        self.__images__(
            filename if not binary else binary,
            zoomin,
            from_page,
            to_page,
            callback
        )
        callback(msg="OCR finished")
        cron_logger.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start))

        start = timer()
        self._layouts_rec(zoomin)
        callback(0.63, "Layout analysis finished.")
        self._table_transformer_job(zoomin)
        callback(0.65, "Table analysis finished.")
        self._text_merge()
        callback(0.67, "Text merging finished")
        tbls = self._extract_table_figure(True, zoomin, True, True)
        #self._naive_vertical_merge()
        self._concat_downward()
        #self._filter_forpages()

        cron_logger.info("layouts: {}".format(timer() - start))
        return [(b["text"], self._line_tag(b, zoomin))
                for b in self.boxes], tbls


class Markdown(MarkdownParser):
    def __call__(self, filename, binary=None):
        txt = ""
        tbls = []
        if binary:
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
        else:
            with open(filename, "r") as f:
                txt = f.read()
        remainder, tables = self.extract_tables_and_remainder(f'{txt}\n')
        sections = []
        tbls = []
        # Split extremely long lines in half so no section dwarfs the chunk size.
        for sec in remainder.split("\n"):
            if num_tokens_from_string(sec) > 10 * self.chunk_token_num:
                sections.append((sec[:int(len(sec) / 2)], ""))
                sections.append((sec[int(len(sec) / 2):], ""))
            else:
                sections.append((sec, ""))
        print(tables)
        for table in tables:
            tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
        return sections, tbls


def chunk(filename, binary=None, from_page=0, to_page=100000,
          lang="Chinese", callback=None, **kwargs):
    """
    Supported file formats are docx, pdf, excel, txt.
    This method applies a naive way to chunk files.
    Successive text will be sliced into pieces using 'delimiter'.
    Next, these successive pieces are merged into chunks whose token number is no more than 'Max token number'.
    """

    eng = lang.lower() == "english"  # is_english(cks)
    parser_config = kwargs.get(
        "parser_config", {
            "chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True})
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
    res = []
    pdf_parser = None
    sections = []
    # Dispatch to a parser based on the file extension.
    if re.search(r"\.docx$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        sections, tbls = Docx()(filename, binary)
        res = tokenize_table(tbls, doc, eng)  # just for table

        callback(0.8, "Finish parsing.")
        st = timer()

        chunks, images = naive_merge_docx(
            sections, int(parser_config.get(
                "chunk_token_num", 128)), parser_config.get(
                "delimiter", "\n!?。;!?"))

        if kwargs.get("section_only", False):
            return chunks

        res.extend(tokenize_chunks_docx(chunks, doc, eng, images))
        cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
        return res

    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf(
        ) if parser_config.get("layout_recognize", True) else PlainParser()
        sections, tbls = pdf_parser(filename if not binary else binary,
                                    from_page=from_page, to_page=to_page, callback=callback)
        res = tokenize_table(tbls, doc, eng)

    elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        excel_parser = ExcelParser()
        sections = [(l, "") for l in excel_parser.html(binary) if l]

    elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        sections = TxtParser()(filename, binary,
                               parser_config.get("chunk_token_num", 128),
                               parser_config.get("delimiter", "\n!?;。;!?"))
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        sections, tbls = Markdown(int(parser_config.get("chunk_token_num", 128)))(filename, binary)
        res = tokenize_table(tbls, doc, eng)
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        sections = HtmlParser()(filename, binary)
        sections = [(l, "") for l in sections if l]
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.json$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        sections = JsonParser(int(parser_config.get("chunk_token_num", 128)))(binary)
        sections = [(l, "") for l in sections if l]
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.doc$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        binary = BytesIO(binary)
        doc_parsed = parser.from_buffer(binary)
        sections = doc_parsed['content'].split('\n')
        sections = [(l, "") for l in sections if l]
        callback(0.8, "Finish parsing.")

    else:
        raise NotImplementedError(
            "file type not supported yet (pdf, xlsx, doc, docx, txt supported)")

    st = timer()
    chunks = naive_merge(
        sections, int(parser_config.get(
            "chunk_token_num", 128)), parser_config.get(
            "delimiter", "\n!?。;!?"))
    if kwargs.get("section_only", False):
        return chunks

    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
    cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
    return res


if __name__ == "__main__":
    import sys

    def dummy(prog=None, msg=""):
        pass

    chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
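For context, `chunk` is the module's entry point: it reports progress through the optional `callback(prog, msg)` hook and reads its knobs from `parser_config`. Below is a minimal usage sketch; the file path and config values are placeholders rather than project defaults, and it assumes the module's dependencies are installed.

    # Minimal usage sketch for chunk(); "my.pdf" and the config values here
    # are hypothetical placeholders, not defaults mandated by the project.
    def report(prog=None, msg=""):
        print(f"progress={prog} {msg}")

    sections = chunk(
        "my.pdf",
        from_page=0,
        to_page=10,
        lang="English",
        callback=report,
        parser_config={"chunk_token_num": 256,
                       "delimiter": "\n!?。;!?",
                       "layout_recognize": True},
        section_only=True,  # return merged sections instead of tokenized chunks
    )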