Fix: excel default optimization. (#11519)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
Fix: incorrect image merging for naive markdown parser (#11520)
2026-02-02 08:35:08 +08:00 · 2025-11-25 19:54:20 +08:00 · 2025-11-25 19:54:06 +08:00 · 2025-11-25 19:13:00 +08:00
42 changed files with 454 additions and 1327 deletions
--- a/deepdoc/parser/markdown_parser.py
+++ b/deepdoc/parser/markdown_parser.py
@ -72,9 +72,8 @@ class RAGFlowMarkdownParser:

        # Replace any TAGS e.g. <table ...> to <table>
        TAGS = ["table", "td", "tr", "th", "tbody", "thead", "div"]
-        table_with_attributes_pattern = re.compile(
-            rf"<(?:{'|'.join(TAGS)})[^>]*>", re.IGNORECASE
-        )
+        table_with_attributes_pattern = re.compile(rf"<(?:{'|'.join(TAGS)})[^>]*>", re.IGNORECASE)
+
        def replace_tag(m):
            tag_name = re.match(r"<(\w+)", m.group()).group(1)
            return "<{}>".format(tag_name)
@ -128,23 +127,48 @@ class MarkdownElementExtractor:
        self.markdown_content = markdown_content
        self.lines = markdown_content.split("\n")

-    def get_delimiters(self,delimiters):
+    def get_delimiters(self, delimiters):
        toks = re.findall(r"`([^`]+)`", delimiters)
        toks = sorted(set(toks), key=lambda x: -len(x))
        return "|".join(re.escape(t) for t in toks if t)
-    
-    def extract_elements(self,delimiter=None):
+
+    def extract_elements(self, delimiter=None, include_meta=False):
        """Extract individual elements (headers, code blocks, lists, etc.)"""
        sections = []

        i = 0
-        dels=""
+        dels = ""
        if delimiter:
            dels = self.get_delimiters(delimiter)
        if len(dels) > 0:
            text = "\n".join(self.lines)
-            parts = re.split(dels, text)
-            sections = [p.strip() for p in parts if p and p.strip()]
+            if include_meta:
+                pattern = re.compile(dels)
+                last_end = 0
+                for m in pattern.finditer(text):
+                    part = text[last_end : m.start()]
+                    if part and part.strip():
+                        sections.append(
+                            {
+                                "content": part.strip(),
+                                "start_line": text.count("\n", 0, last_end),
+                                "end_line": text.count("\n", 0, m.start()),
+                            }
+                        )
+                    last_end = m.end()
+
+                part = text[last_end:]
+                if part and part.strip():
+                    sections.append(
+                        {
+                            "content": part.strip(),
+                            "start_line": text.count("\n", 0, last_end),
+                            "end_line": text.count("\n", 0, len(text)),
+                        }
+                    )
+            else:
+                parts = re.split(dels, text)
+                sections = [p.strip() for p in parts if p and p.strip()]
            return sections
        while i < len(self.lines):
            line = self.lines[i]
@ -152,32 +176,35 @@ class MarkdownElementExtractor:
            if re.match(r"^#{1,6}\s+.*$", line):
                # header
                element = self._extract_header(i)
-                sections.append(element["content"])
+                sections.append(element if include_meta else element["content"])
                i = element["end_line"] + 1
            elif line.strip().startswith("```"):
                # code block
                element = self._extract_code_block(i)
-                sections.append(element["content"])
+                sections.append(element if include_meta else element["content"])
                i = element["end_line"] + 1
            elif re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line):
                # list block
                element = self._extract_list_block(i)
-                sections.append(element["content"])
+                sections.append(element if include_meta else element["content"])
                i = element["end_line"] + 1
            elif line.strip().startswith(">"):
                # blockquote
                element = self._extract_blockquote(i)
-                sections.append(element["content"])
+                sections.append(element if include_meta else element["content"])
                i = element["end_line"] + 1
            elif line.strip():
                # text block (paragraphs and inline elements until next block element)
                element = self._extract_text_block(i)
-                sections.append(element["content"])
+                sections.append(element if include_meta else element["content"])
                i = element["end_line"] + 1
            else:
                i += 1

-        sections = [section for section in sections if section.strip()]
+        if include_meta:
+            sections = [section for section in sections if section["content"].strip()]
+        else:
+            sections = [section for section in sections if section.strip()]
        return sections

    def _extract_header(self, start_pos):
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@ -26,6 +26,7 @@ from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
 from docx.opc.oxml import parse_xml
 from markdown import markdown
 from PIL import Image
+from common.token_utils import num_tokens_from_string

 from common.constants import LLMType
 from api.db.services.llm_service import LLMBundle
@ -464,51 +465,88 @@ class Markdown(MarkdownParser):
        html_content = markdown(text)
        soup = BeautifulSoup(html_content, 'html.parser')
        return soup
-    
-    def get_picture_urls(self, soup):
-        if soup:
-            return [img.get('src') for img in soup.find_all('img') if img.get('src')]
-        return []

    def get_hyperlink_urls(self, soup):
        if soup:
            return set([a.get('href') for a in soup.find_all('a') if a.get('href')])
        return []
-    
-    def get_pictures(self, text):
-        """Download and open all images from markdown text."""
+
+    def extract_image_urls_with_lines(self, text):
+        md_img_re = re.compile(r"!\[[^\]]*\]\(([^)\s]+)")
+        html_img_re = re.compile(r'src=["\\\']([^"\\\'>\\s]+)', re.IGNORECASE)
+        urls = []
+        seen = set()
+        lines = text.splitlines()
+        for idx, line in enumerate(lines):
+            for url in md_img_re.findall(line):
+                if (url, idx) not in seen:
+                    urls.append({"url": url, "line": idx})
+                    seen.add((url, idx))
+            for url in html_img_re.findall(line):
+                if (url, idx) not in seen:
+                    urls.append({"url": url, "line": idx})
+                    seen.add((url, idx))
+
+        # cross-line
+        try:
+            from bs4 import BeautifulSoup
+
+            soup = BeautifulSoup(text, 'html.parser')
+            newline_offsets = [m.start() for m in re.finditer(r"\n", text)] + [len(text)]
+            for img_tag in soup.find_all('img'):
+                src = img_tag.get('src')
+                if not src:
+                    continue
+
+                tag_str = str(img_tag)
+                pos = text.find(tag_str)
+                if pos == -1:
+                    # fallback
+                    pos = max(text.find(src), 0)
+                line_no = 0
+                for i, off in enumerate(newline_offsets):
+                    if pos <= off:
+                        line_no = i
+                        break
+                if (src, line_no) not in seen:
+                    urls.append({"url": src, "line": line_no})
+                    seen.add((src, line_no))
+        except Exception:
+            pass
+
+        return urls
+
+    def load_images_from_urls(self, urls, cache=None):
        import requests
-        soup = self.md_to_html(text)
-        image_urls = self.get_picture_urls(soup)
+        from pathlib import Path
+
+        cache = cache or {}
        images = []
-        # Find all image URLs in text
-        for url in image_urls:
-            if not url:
+        for url in urls:
+            if url in cache:
+                if cache[url]:
+                    images.append(cache[url])
                continue
+            img_obj = None
            try:
-                # check if the url is a local file or a remote URL
                if url.startswith(('http://', 'https://')):
-                    # For remote URLs, download the image
                    response = requests.get(url, stream=True, timeout=30)
-                    if response.status_code == 200 and response.headers['Content-Type'] and response.headers['Content-Type'].startswith('image/'):
-                        img = Image.open(BytesIO(response.content)).convert('RGB')
-                        images.append(img)
+                    if response.status_code == 200 and response.headers.get('Content-Type', '').startswith('image/'):
+                        img_obj = Image.open(BytesIO(response.content)).convert('RGB')
                else:
-                    # For local file paths, open the image directly
-                    from pathlib import Path
                    local_path = Path(url)
-                    if not local_path.exists():
+                    if local_path.exists():
+                        img_obj = Image.open(url).convert('RGB')
+                    else:
                        logging.warning(f"Local image file not found: {url}")
-                        continue
-                    img = Image.open(url).convert('RGB')
-                    images.append(img)
            except Exception as e:
                logging.error(f"Failed to download/open image from {url}: {e}")
-                continue
+            cache[url] = img_obj
+            if img_obj:
+                images.append(img_obj)
+        return images, cache

-        return images if images else None
-
-    def __call__(self, filename, binary=None, separate_tables=True, delimiter=None):
+    def __call__(self, filename, binary=None, separate_tables=True, delimiter=None, return_section_images=False):
        if binary:
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
@ -520,11 +558,31 @@ class Markdown(MarkdownParser):
        # To eliminate duplicate tables in chunking result, uncomment code below and set separate_tables to True in line 410.
        # extractor = MarkdownElementExtractor(remainder)
        extractor = MarkdownElementExtractor(txt)
-        element_sections = extractor.extract_elements(delimiter)
-        sections = [(element, "") for element in element_sections]
+        image_refs = self.extract_image_urls_with_lines(txt)
+        element_sections = extractor.extract_elements(delimiter, include_meta=True)
+
+        sections = []
+        section_images = []
+        image_cache = {}
+        for element in element_sections:
+            content = element["content"]
+            start_line = element["start_line"]
+            end_line = element["end_line"]
+            urls_in_section = [ref["url"] for ref in image_refs if start_line <= ref["line"] <= end_line]
+            imgs = []
+            if urls_in_section:
+                imgs, image_cache = self.load_images_from_urls(urls_in_section, image_cache)
+            combined_image = None
+            if imgs:
+                combined_image = reduce(concat_img, imgs) if len(imgs) > 1 else imgs[0]
+            sections.append((content, ""))
+            section_images.append(combined_image)
+
        tbls = []
        for table in tables:
            tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
+        if return_section_images:
+            return sections, tbls, section_images
        return sections, tbls

 def load_from_xml_v2(baseURI, rels_item_xml):
@ -558,6 +616,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
    parser_config = kwargs.get(
        "parser_config", {
            "chunk_token_num": 512, "delimiter": "\n!?。；！？", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
+    final_sections = False
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
@ -695,9 +754,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
            excel_parser = ExcelParser()
            if parser_config.get("html4excel"):
                sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
+                parser_config["chunk_token_num"] = 0
            else:
                sections = [(_, "") for _ in excel_parser(binary) if _]
-            parser_config["chunk_token_num"] = 12800

    elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
@ -709,7 +768,15 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
    elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
-        sections, tables = markdown_parser(filename, binary, separate_tables=False, delimiter=parser_config.get("delimiter", "\n!?;。；！？"))
+        sections, tables, section_images = markdown_parser(
+            filename,
+            binary,
+            separate_tables=False,
+            delimiter=parser_config.get("delimiter", "\n!?;。；！？"),
+            return_section_images=True,
+        )
+
+        final_sections = True

        try:
            vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
@ -719,19 +786,22 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,

        if vision_model:
            # Process images for each section
-            section_images = []
            for idx, (section_text, _) in enumerate(sections):
-                images = markdown_parser.get_pictures(section_text) if section_text else None
+                images = []
+                if section_images and len(section_images) > idx and section_images[idx] is not None:
+                    images.append(section_images[idx])

-                if images:
+                if images and len(images) > 0:
                    # If multiple images found, combine them using concat_img
                    combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
-                    section_images.append(combined_image)
+                    if section_images:
+                        section_images[idx] = combined_image
+                    else:
+                        section_images = [None] * len(sections)
+                        section_images[idx] = combined_image
                    markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data= [((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs)
                    boosted_figures = markdown_vision_parser(callback=callback)
                    sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]), sections[idx][1])
-                else:
-                    section_images.append(None)

        else:
            logging.warning("No visual model detected. Skipping figure parsing enhancement.")
@ -783,31 +853,81 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
            "file type not supported yet(pdf, xlsx, doc, docx, txt supported)")

    st = timer()
-    if section_images:
-        # if all images are None, set section_images to None
-        if all(image is None for image in section_images):
-            section_images = None
+    if final_sections:
+        merged_chunks = []
+        merged_images = []
+        chunk_limit = max(0, int(parser_config.get("chunk_token_num", 128)))
+        overlapped_percent = int(parser_config.get("overlapped_percent", 0))
+        overlapped_percent = max(0, min(overlapped_percent, 90))

-    if section_images:
-        chunks, images = naive_merge_with_images(sections, section_images,
-                                        int(parser_config.get(
-                                            "chunk_token_num", 128)), parser_config.get(
-                                            "delimiter", "\n!?。；！？"))
+        current_text = ""
+        current_tokens = 0
+        current_image = None
+
+        for idx, sec in enumerate(sections):
+            text = sec[0] if isinstance(sec, tuple) else sec
+            sec_tokens = num_tokens_from_string(text)
+            sec_image = section_images[idx] if section_images and idx < len(section_images) else None
+
+            if current_text and current_tokens + sec_tokens > chunk_limit:
+                merged_chunks.append(current_text)
+                merged_images.append(current_image)
+                overlap_part = ""
+                if overlapped_percent > 0:
+                    overlap_len = int(len(current_text) * overlapped_percent / 100)
+                    if overlap_len > 0:
+                        overlap_part = current_text[-overlap_len:]
+                current_text = overlap_part
+                current_tokens = num_tokens_from_string(current_text)
+                current_image = current_image if overlap_part else None
+
+            if current_text:
+                current_text += "\n" + text
+            else:
+                current_text = text
+            current_tokens += sec_tokens
+
+            if sec_image:
+                current_image = concat_img(current_image, sec_image) if current_image else sec_image
+
+        if current_text:
+            merged_chunks.append(current_text)
+            merged_images.append(current_image)
+
+        chunks = merged_chunks
+        has_images = merged_images and any(img is not None for img in merged_images)
        if kwargs.get("section_only", False):
            chunks.extend(embed_res)
            return chunks
-
-        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
+        if has_images:
+            res.extend(tokenize_chunks_with_images(chunks, doc, is_english, merged_images))
+        else:
+            res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
    else:
-        chunks = naive_merge(
-            sections, int(parser_config.get(
-                "chunk_token_num", 128)), parser_config.get(
-                "delimiter", "\n!?。；！？"))
-        if kwargs.get("section_only", False):
-            chunks.extend(embed_res)
-            return chunks
+        if section_images:
+            if all(image is None for image in section_images):
+                section_images = None

-        res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
+        if section_images:
+            chunks, images = naive_merge_with_images(sections, section_images,
+                                            int(parser_config.get(
+                                                "chunk_token_num", 128)), parser_config.get(
+                                                "delimiter", "\n!?。；！？"))
+            if kwargs.get("section_only", False):
+                chunks.extend(embed_res)
+                return chunks
+
+            res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
+        else:
+            chunks = naive_merge(
+                sections, int(parser_config.get(
+                    "chunk_token_num", 128)), parser_config.get(
+                    "delimiter", "\n!?。；！？"))
+            if kwargs.get("section_only", False):
+                chunks.extend(embed_res)
+                return chunks
+
+            res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))

    if urls and parser_config.get("analyze_hyperlink", False) and is_root:
        for index, url in enumerate(urls):
@ -820,9 +940,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
                logging.info(f"Failed to chunk url in registered file type {url}: {e}")
                sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
            url_res.extend(sub_url_res)
-        
+
    logging.info("naive_merge({}): {}".format(filename, timer() - st))
-    
+
    if embed_res:
        res.extend(embed_res)
    if url_res:
--- a/rag/flow/parser/parser.py
+++ b/rag/flow/parser/parser.py
@ -482,17 +482,25 @@ class Parser(ProcessBase):
        self.set_output("output_format", conf["output_format"])

        markdown_parser = naive_markdown_parser()
-        sections, tables = markdown_parser(name, blob, separate_tables=False)
+        sections, tables, section_images = markdown_parser(
+            name,
+            blob,
+            separate_tables=False,
+            delimiter=conf.get("delimiter"),
+            return_section_images=True,
+        )

        if conf.get("output_format") == "json":
            json_results = []

-            for section_text, _ in sections:
+            for idx, (section_text, _) in enumerate(sections):
                json_result = {
                    "text": section_text,
                }

-                images = markdown_parser.get_pictures(section_text) if section_text else None
+                images = []
+                if section_images and len(section_images) > idx and section_images[idx] is not None:
+                    images.append(section_images[idx])
                if images:
                    # If multiple images found, combine them using concat_img
                    combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
--- a/rag/nlp/init.py
+++ b/rag/nlp/init.py
@ -628,16 +628,8 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。；
                tk_nums.append(num_tokens_from_string(text))
        return cks

-    dels = get_delimiters(delimiter)
    for sec, pos in sections:
-        if num_tokens_from_string(sec) < chunk_token_num:
-            add_chunk("\n"+sec, pos)
-            continue
-        split_sec = re.split(r"(%s)" % dels, sec, flags=re.DOTALL)
-        for sub_sec in split_sec:
-            if re.match(f"^{dels}$", sub_sec):
-                continue
-            add_chunk("\n"+sub_sec, pos)
+        add_chunk("\n"+sec, pos)

    return cks

@ -700,26 +692,18 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
                tk_nums.append(num_tokens_from_string(text_seg))
        return cks, result_images

-    dels = get_delimiters(delimiter)
    for text, image in zip(texts, images):
        # if text is tuple, unpack it
        if isinstance(text, tuple):
            text_str = text[0]
            text_pos = text[1] if len(text) > 1 else ""
-            split_sec = re.split(r"(%s)" % dels, text_str)
-            for sub_sec in split_sec:
-                if re.match(f"^{dels}$", sub_sec):
-                    continue
-                add_chunk("\n"+sub_sec, image, text_pos)
+            add_chunk("\n"+text_str, image, text_pos)
        else:
-            split_sec = re.split(r"(%s)" % dels, text)
-            for sub_sec in split_sec:
-                if re.match(f"^{dels}$", sub_sec):
-                    continue
-                add_chunk("\n"+sub_sec, image)
+            add_chunk("\n"+text, image)

    return cks, result_images

+
 def docx_question_level(p, bull=-1):
    txt = re.sub(r"\u3000", " ", p.text).strip()
    if p.style.name.startswith('Heading'):
@ -808,15 +792,8 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。；！？"):
                tk_nums.append(num_tokens_from_string(text_seg))
        return cks, images

-    dels = get_delimiters(delimiter)
-    pattern = r"(%s)" % dels
-
    for sec, image in sections:
-        split_sec = re.split(pattern, sec)
-        for sub_sec in split_sec:
-            if not sub_sec or re.match(f"^{dels}$", sub_sec):
-                continue
-            add_chunk("\n" + sub_sec, image, "")
+        add_chunk("\n" + sec, image, "")

    return cks, images

@ -844,6 +821,7 @@ def get_delimiters(delimiters: str):

    return dels_pattern

+
 class Node:
    def __init__(self, level, depth=-1, texts=None):
        self.level = level
--- a/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/document-preview/csv-preview.tsx
+++ b/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/document-preview/csv-preview.tsx
--- a/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/document-preview/doc-preview.tsx
+++ b/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/document-preview/doc-preview.tsx
@ -1,5 +1,7 @@
 import message from '@/components/ui/message';
 import { Spin } from '@/components/ui/spin';
+import { Authorization } from '@/constants/authorization';
+import { getAuthorization } from '@/utils/authorization-util';
 import request from '@/utils/request';
 import classNames from 'classnames';
 import mammoth from 'mammoth';
@ -22,6 +24,7 @@ export const DocPreviewer: React.FC<DocPreviewerProps> = ({
    const res = await request(url, {
      method: 'GET',
      responseType: 'blob',
+      headers: { [Authorization]: getAuthorization() },
      onError: () => {
        message.error('Document parsing failed');
        console.error('Error loading document:', url);
--- a/web/src/pages/dataflow-result/components/document-preview/document-header.tsx
+++ b/web/src/pages/dataflow-result/components/document-preview/document-header.tsx
--- a/web/src/pages/dataflow-result/components/document-preview/excel-preview.tsx
+++ b/web/src/pages/dataflow-result/components/document-preview/excel-preview.tsx
@ -1,5 +1,6 @@
-import { useFetchExcel } from '@/pages/document-viewer/hooks';
+// import { useFetchExcel } from '@/pages/document-viewer/hooks';
 import classNames from 'classnames';
+import { useFetchExcel } from './hooks';

 interface ExcelCsvPreviewerProps {
  className?: string;
--- a/web/src/components/document-preview/hooks.ts
+++ b/web/src/components/document-preview/hooks.ts
@ -1,9 +1,67 @@
 import { Authorization } from '@/constants/authorization';
+import { useGetKnowledgeSearchParams } from '@/hooks/route-hook';
+import { useGetPipelineResultSearchParams } from '@/pages/dataflow-result/hooks';
+import api, { api_host } from '@/utils/api';
 import { getAuthorization } from '@/utils/authorization-util';
 import jsPreviewExcel from '@js-preview/excel';
+import { useSize } from 'ahooks';
 import axios from 'axios';
 import mammoth from 'mammoth';
-import { useCallback, useEffect, useRef, useState } from 'react';
+import { useCallback, useEffect, useMemo, useRef, useState } from 'react';
+
+export const useDocumentResizeObserver = () => {
+  const [containerWidth, setContainerWidth] = useState<number>();
+  const [containerRef, setContainerRef] = useState<HTMLElement | null>(null);
+  const size = useSize(containerRef);
+
+  const onResize = useCallback((width?: number) => {
+    if (width) {
+      setContainerWidth(width);
+    }
+  }, []);
+
+  useEffect(() => {
+    onResize(size?.width);
+  }, [size?.width, onResize]);
+
+  return { containerWidth, setContainerRef };
+};
+
+function highlightPattern(text: string, pattern: string, pageNumber: number) {
+  if (pageNumber === 2) {
+    return `<mark>${text}</mark>`;
+  }
+  if (text.trim() !== '' && pattern.match(text)) {
+    // return pattern.replace(text, (value) => `<mark>${value}</mark>`);
+    return `<mark>${text}</mark>`;
+  }
+  return text.replace(pattern, (value) => `<mark>${value}</mark>`);
+}
+
+export const useHighlightText = (searchText: string = '') => {
+  const textRenderer = useCallback(
+    (textItem: any) => {
+      return highlightPattern(textItem.str, searchText, textItem.pageNumber);
+    },
+    [searchText],
+  );
+
+  return textRenderer;
+};
+
+export const useGetDocumentUrl = (isAgent: boolean) => {
+  const { documentId } = useGetKnowledgeSearchParams();
+  const { createdBy, documentId: id } = useGetPipelineResultSearchParams();
+
+  const url = useMemo(() => {
+    if (isAgent) {
+      return api.downloadFile + `?id=${id}&created_by=${createdBy}`;
+    }
+    return `${api_host}/document/get/${documentId}`;
+  }, [createdBy, documentId, id, isAgent]);
+
+  return url;
+};

 export const useCatchError = (api: string) => {
  const [error, setError] = useState('');
--- a/web/src/pages/dataflow-result/components/document-preview/image-preview.tsx
+++ b/web/src/pages/dataflow-result/components/document-preview/image-preview.tsx
@ -1,5 +1,7 @@
 import message from '@/components/ui/message';
 import { Spin } from '@/components/ui/spin';
+import { Authorization } from '@/constants/authorization';
+import { getAuthorization } from '@/utils/authorization-util';
 import request from '@/utils/request';
 import classNames from 'classnames';
 import { useEffect, useState } from 'react';
@ -22,6 +24,7 @@ export const ImagePreviewer: React.FC<ImagePreviewerProps> = ({
    const res = await request(url, {
      method: 'GET',
      responseType: 'blob',
+      headers: { [Authorization]: getAuthorization() },
      onError: () => {
        message.error('Failed to load image');
        setIsLoading(false);
--- a/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/document-preview/index.less
+++ b/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/document-preview/index.less
--- a/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/document-preview/index.tsx
+++ b/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/document-preview/index.tsx
@ -4,7 +4,7 @@ import CSVFileViewer from './csv-preview';
 import { DocPreviewer } from './doc-preview';
 import { ExcelCsvPreviewer } from './excel-preview';
 import { ImagePreviewer } from './image-preview';
-import styles from './index.less';
+import { Md } from './md';
 import PdfPreviewer, { IProps } from './pdf-preview';
 import { PptPreviewer } from './ppt-preview';
 import { TxtPreviewer } from './txt-preview';
@ -25,7 +25,7 @@ const Preview = ({
  return (
    <>
      {fileType === 'pdf' && highlights && setWidthAndHeight && (
-        <section className={styles.documentPreview}>
+        <section>
          <PdfPreviewer
            highlights={highlights}
            setWidthAndHeight={setWidthAndHeight}
@ -38,7 +38,7 @@ const Preview = ({
          <DocPreviewer className={className} url={url} />
        </section>
      )}
-      {['txt', 'md'].indexOf(fileType) > -1 && (
+      {['txt'].indexOf(fileType) > -1 && (
        <section>
          <TxtPreviewer className={className} url={url} />
        </section>
@ -82,6 +82,11 @@ const Preview = ({
          <CSVFileViewer className={className} url={url} />
        </section>
      )}
+      {['md'].indexOf(fileType) > -1 && (
+        <section>
+          <Md className={className} url={url} />
+        </section>
+      )}
    </>
  );
 };
--- a/web/src/components/document-preview/md/index.tsx
+++ b/web/src/components/document-preview/md/index.tsx
@ -1,31 +1,53 @
+import { Authorization } from '@/constants/authorization';
+import { cn } from '@/lib/utils';
+import FileError from '@/pages/document-viewer/file-error';
+import { getAuthorization } from '@/utils/authorization-util';
 import React, { useEffect, useState } from 'react';
 import ReactMarkdown from 'react-markdown';
 import remarkGfm from 'remark-gfm';
-import FileError from '../file-error';

 interface MdProps {
-  filePath: string;
+  // Extra class names applied to the scroll container.
+  className?: string;
+  // Address the markdown text is fetched from.
+  url: string;
 }

-const Md: React.FC<MdProps> = ({ filePath }) => {
+// Fetches the markdown at `url` (sending the authorization header) and renders
+// it with the remark-gfm plugin; renders FileError when the request fails.
+export const Md: React.FC<MdProps> = ({ url, className }) => {
   const [content, setContent] = useState<string>('');
   const [error, setError] = useState<string | null>(null);

   useEffect(() => {
+    // Abort on url change/unmount so a stale response cannot overwrite state.
+    const controller = new AbortController();
     setError(null);
-    fetch(filePath)
+    fetch(url, {
+      headers: { [Authorization]: getAuthorization() },
+      signal: controller.signal,
+    })
       .then((res) => {
         if (!res.ok) throw new Error('Failed to fetch markdown file');
         return res.text();
       })
       .then((text) => setContent(text))
-      .catch((err) => setError(err.message));
-  }, [filePath]);
+      .catch((err) => {
+        // An aborted request is expected cleanup, not a load failure.
+        if (err.name !== 'AbortError') setError(err.message);
+      });
+    return () => controller.abort();
+  }, [url]);

   if (error) return <FileError>{error}</FileError>;

   return (
-    <div style={{ padding: 24, height: '100vh', overflow: 'scroll' }}>
+    <div
+      style={{ padding: 4, overflow: 'scroll' }}
+      // Tailwind arbitrary values cannot contain spaces; `_` stands in for them.
+      // className is passed last; presumably cn() lets later classes win.
+      className={cn('markdown-body h-[calc(100vh_-_200px)]', className)}
+    >
       <ReactMarkdown remarkPlugins={[remarkGfm]}>{content}</ReactMarkdown>
     </div>
   );
--- a/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/document-preview/pdf-preview.tsx
+++ b/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/document-preview/pdf-preview.tsx
@ -10,13 +10,21 @@ import {

 import { useCatchDocumentError } from '@/components/pdf-previewer/hooks';
 import { Spin } from '@/components/ui/spin';
+// import FileError from '@/pages/document-viewer/file-error';
+import { Authorization } from '@/constants/authorization';
 import FileError from '@/pages/document-viewer/file-error';
+import { getAuthorization } from '@/utils/authorization-util';
 import styles from './index.less';
+type PdfLoaderProps = React.ComponentProps<typeof PdfLoader> & {
+  httpHeaders?: Record<string, string>;
+};

+const Loader = PdfLoader as React.ComponentType<PdfLoaderProps>;
 export interface IProps {
-  highlights: IHighlight[];
-  setWidthAndHeight: (width: number, height: number) => void;
+  highlights?: IHighlight[];
+  setWidthAndHeight?: (width: number, height: number) => void;
  url: string;
+  className?: string;
 }
 const HighlightPopup = ({
  comment,
@ -30,7 +38,12 @@ const HighlightPopup = ({
  ) : null;

 // TODO: merge with DocumentPreviewer
-const PdfPreview = ({ highlights: state, setWidthAndHeight, url }: IProps) => {
+const PdfPreview = ({
+  highlights: state,
+  setWidthAndHeight,
+  url,
+  className,
+}: IProps) => {
  // const url = useGetDocumentUrl();

  const ref = useRef<(highlight: IHighlight) => void>(() => {});
@ -39,17 +52,22 @@ const PdfPreview = ({ highlights: state, setWidthAndHeight, url }: IProps) => {
  const resetHash = () => {};

  useEffect(() => {
-    if (state.length > 0) {
+    if (state?.length && state?.length > 0) {
      ref?.current(state[0]);
    }
  }, [state]);

+  const httpHeaders = {
+    [Authorization]: getAuthorization(),
+  };
+
  return (
    <div
-      className={`${styles.documentContainer} rounded-[10px] overflow-hidden	`}
+      className={`${styles.documentContainer} rounded-[10px] overflow-hidden	${className}`}
    >
-      <PdfLoader
+      <Loader
        url={url}
+        httpHeaders={httpHeaders}
        beforeLoad={
          <div className="absolute inset-0 flex items-center justify-center">
            <Spin />
@ -63,7 +81,7 @@ const PdfPreview = ({ highlights: state, setWidthAndHeight, url }: IProps) => {
            const viewport = page.getViewport({ scale: 1 });
            const width = viewport.width;
            const height = viewport.height;
-            setWidthAndHeight(width, height);
+            setWidthAndHeight?.(width, height);
          });

          return (
@ -115,11 +133,11 @@ const PdfPreview = ({ highlights: state, setWidthAndHeight, url }: IProps) => {
                  </Popup>
                );
              }}
-              highlights={state}
+              highlights={state || []}
            />
          );
        }}
-      </PdfLoader>
+      </Loader>
    </div>
  );
 };
--- a/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/document-preview/ppt-preview.tsx
+++ b/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/document-preview/ppt-preview.tsx
--- a/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/document-preview/txt-preview.tsx
+++ b/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/document-preview/txt-preview.tsx
--- a/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/document-preview/video-preview.tsx
+++ b/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/document-preview/video-preview.tsx
--- a/web/src/constants/common.ts
+++ b/web/src/constants/common.ts
@ -148,7 +148,7 @@ export const Images = [
 ];

 // Without FileViewer
-export const ExceptiveType = ['xlsx', 'xls', 'pdf', 'docx', ...Images];
+export const ExceptiveType = ['xlsx', 'xls', 'pdf', 'docx', 'md', ...Images];

 export const SupportedPreviewDocumentTypes = [...ExceptiveType];
 //#endregion
--- a/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/chunk-result-bar/index.tsx
+++ b/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/chunk-result-bar/index.tsx
@ -1,14 +1,13 @@
-import { Input } from '@/components/originui/input';
 import { Button } from '@/components/ui/button';
+import { SearchInput } from '@/components/ui/input';
 import {
  Popover,
  PopoverContent,
  PopoverTrigger,
 } from '@/components/ui/popover';
 import { Radio } from '@/components/ui/radio';
+import { Segmented } from '@/components/ui/segmented';
 import { useTranslate } from '@/hooks/common-hooks';
-import { cn } from '@/lib/utils';
-import { SearchOutlined } from '@ant-design/icons';
 import { ListFilter, Plus } from 'lucide-react';
 import { useState } from 'react';
 import { ChunkTextMode } from '../../constant';
@ -61,46 +60,43 @@ export default ({
  };
  return (
    <div className="flex pr-[25px]">
-      <div className="flex items-center gap-4 bg-bg-card text-muted-foreground w-fit h-[35px] rounded-md px-4 py-2">
-        {textSelectOptions.map((option) => (
-          <div
-            key={option.value}
-            className={cn('flex items-center cursor-pointer', {
-              'text-primary': option.value === textSelectValue,
-            })}
-            onClick={() => changeTextSelectValue(option.value)}
-          >
-            {option.label}
-          </div>
-        ))}
-      </div>
-      <div className="ml-auto"></div>
-      <Input
-        className="bg-bg-card text-muted-foreground"
-        style={{ width: 200 }}
-        placeholder={t('search')}
-        icon={<SearchOutlined />}
-        onChange={handleInputChange}
-        value={searchString}
+      <Segmented
+        options={textSelectOptions}
+        value={textSelectValue}
+        onChange={changeTextSelectValue}
      />
-      <div className="w-[20px]"></div>
-      <Popover>
-        <PopoverTrigger asChild>
-          <Button className="bg-bg-card text-muted-foreground hover:bg-card">
-            <ListFilter />
-          </Button>
-        </PopoverTrigger>
-        <PopoverContent className="p-0 w-[200px]">
-          {filterContent}
-        </PopoverContent>
-      </Popover>
-      <div className="w-[20px]"></div>
-      <Button
-        onClick={() => createChunk()}
-        className="bg-bg-card text-primary hover:bg-card"
-      >
-        <Plus size={44} />
-      </Button>
+      <div className="ml-auto"></div>
+      <div className="h-8 flex items-center gap-5">
+        <SearchInput
+          // style={{ width: 200 }}
+          placeholder={t('search')}
+          // icon={<SearchOutlined />}
+          onChange={handleInputChange}
+          value={searchString}
+        />
+        <Popover>
+          <PopoverTrigger asChild>
+            <Button
+              variant={'ghost'}
+              // className="bg-bg-card text-text-secondary hover:bg-card"
+            >
+              <ListFilter />
+            </Button>
+          </PopoverTrigger>
+          <PopoverContent className="p-0 w-[200px]">
+            {filterContent}
+          </PopoverContent>
+        </Popover>
+        <Button
+          variant={'ghost'}
+          onClick={() => createChunk()}
+          // className="bg-bg-card text-primary hover:bg-card"
+        >
+          <Plus size={44} />
+        </Button>
+      </div>
+      {/* <div className="w-[20px]"></div>
+      <div className="w-[20px]"></div> */}
    </div>
  );
 };
--- a/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/document-preview/document-header.tsx
+++ b/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/document-preview/document-header.tsx
@ -1,21 +0,0 @@
-import { formatDate } from '@/utils/date';
-import { formatBytes } from '@/utils/file-util';
-
-type Props = {
-  size: number;
-  name: string;
-  create_date: string;
-};
-
-export default ({ size, name, create_date }: Props) => {
-  const sizeName = formatBytes(size);
-  const dateStr = formatDate(create_date);
-  return (
-    <div>
-      <h2 className="text-[24px]">{name}</h2>
-      <div className="text-[#979AAB] pt-[5px]">
-        Size：{sizeName} Uploaded Time：{dateStr}
-      </div>
-    </div>
-  );
-};
--- a/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/document-preview/excel-preview.tsx
+++ b/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/document-preview/excel-preview.tsx
@ -1,25 +0,0 @@
-import { useFetchExcel } from '@/pages/document-viewer/hooks';
-import classNames from 'classnames';
-
-interface ExcelCsvPreviewerProps {
-  className?: string;
-  url: string;
-}
-
-export const ExcelCsvPreviewer: React.FC<ExcelCsvPreviewerProps> = ({
-  className,
-  url,
-}) => {
-  // const url = useGetDocumentUrl();
-  const { containerRef } = useFetchExcel(url);
-
-  return (
-    <div
-      ref={containerRef}
-      className={classNames(
-        'relative w-full h-full p-4 bg-background-paper border border-border-normal rounded-md excel-csv-previewer',
-        className,
-      )}
-    ></div>
-  );
-};
--- a/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/document-preview/hooks.ts
+++ b/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/document-preview/hooks.ts
@ -1,55 +0,0 @@
-import { useGetKnowledgeSearchParams } from '@/hooks/route-hook';
-import { api_host } from '@/utils/api';
-import { useSize } from 'ahooks';
-import { CustomTextRenderer } from 'node_modules/react-pdf/dist/esm/shared/types';
-import { useCallback, useEffect, useMemo, useState } from 'react';
-
-export const useDocumentResizeObserver = () => {
-  const [containerWidth, setContainerWidth] = useState<number>();
-  const [containerRef, setContainerRef] = useState<HTMLElement | null>(null);
-  const size = useSize(containerRef);
-
-  const onResize = useCallback((width?: number) => {
-    if (width) {
-      setContainerWidth(width);
-    }
-  }, []);
-
-  useEffect(() => {
-    onResize(size?.width);
-  }, [size?.width, onResize]);
-
-  return { containerWidth, setContainerRef };
-};
-
-function highlightPattern(text: string, pattern: string, pageNumber: number) {
-  if (pageNumber === 2) {
-    return `<mark>${text}</mark>`;
-  }
-  if (text.trim() !== '' && pattern.match(text)) {
-    // return pattern.replace(text, (value) => `<mark>${value}</mark>`);
-    return `<mark>${text}</mark>`;
-  }
-  return text.replace(pattern, (value) => `<mark>${value}</mark>`);
-}
-
-export const useHighlightText = (searchText: string = '') => {
-  const textRenderer: CustomTextRenderer = useCallback(
-    (textItem) => {
-      return highlightPattern(textItem.str, searchText, textItem.pageNumber);
-    },
-    [searchText],
-  );
-
-  return textRenderer;
-};
-
-export const useGetDocumentUrl = () => {
-  const { documentId } = useGetKnowledgeSearchParams();
-
-  const url = useMemo(() => {
-    return `${api_host}/document/get/${documentId}`;
-  }, [documentId]);
-
-  return url;
-};
--- a/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/document-preview/image-preview.tsx
+++ b/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/document-preview/image-preview.tsx
@ -1,74 +0,0 @@
-import message from '@/components/ui/message';
-import { Spin } from '@/components/ui/spin';
-import request from '@/utils/request';
-import classNames from 'classnames';
-import { useCallback, useEffect, useState } from 'react';
-
-interface ImagePreviewerProps {
-  className?: string;
-  url: string;
-}
-
-export const ImagePreviewer: React.FC<ImagePreviewerProps> = ({
-  className,
-  url,
-}) => {
-  // const url = useGetDocumentUrl();
-  const [imageSrc, setImageSrc] = useState<string | null>(null);
-  const [isLoading, setIsLoading] = useState<boolean>(true);
-
-  const fetchImage = useCallback(async () => {
-    setIsLoading(true);
-    const res = await request(url, {
-      method: 'GET',
-      responseType: 'blob',
-      onError: () => {
-        message.error('Failed to load image');
-        setIsLoading(false);
-      },
-    });
-    const objectUrl = URL.createObjectURL(res.data);
-    setImageSrc(objectUrl);
-    setIsLoading(false);
-  }, [url]);
-
-  useEffect(() => {
-    if (url) {
-      fetchImage();
-    }
-  }, [url, fetchImage]);
-
-  useEffect(() => {
-    return () => {
-      if (imageSrc) {
-        URL.revokeObjectURL(imageSrc);
-      }
-    };
-  }, [imageSrc]);
-
-  return (
-    <div
-      className={classNames(
-        'relative w-full h-full p-4 bg-background-paper border border-border-normal rounded-md image-previewer',
-        className,
-      )}
-    >
-      {isLoading && (
-        <div className="absolute inset-0 flex items-center justify-center">
-          <Spin />
-        </div>
-      )}
-
-      {!isLoading && imageSrc && (
-        <div className="max-h-[80vh] overflow-auto p-2">
-          <img
-            src={imageSrc}
-            alt={'image'}
-            className="w-full h-auto max-w-full object-contain"
-            onLoad={() => URL.revokeObjectURL(imageSrc!)}
-          />
-        </div>
-      )}
-    </div>
-  );
-};
--- a/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/index.tsx
+++ b/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/index.tsx
@ -7,7 +7,6 @@ import { useCallback, useEffect, useMemo, useState } from 'react';
 import { useTranslation } from 'react-i18next';
 import ChunkCard from './components/chunk-card';
 import CreatingModal from './components/chunk-creating-modal';
-import DocumentPreview from './components/document-preview';
 import {
  useChangeChunkTextMode,
  useDeleteChunkByIds,
@ -18,8 +17,11 @@ import {

 import ChunkResultBar from './components/chunk-result-bar';
 import CheckboxSets from './components/chunk-result-bar/checkbox-sets';
-import DocumentHeader from './components/document-preview/document-header';
+// import DocumentHeader from './components/document-preview/document-header';

+import DocumentPreview from '@/components/document-preview';
+import DocumentHeader from '@/components/document-preview/document-header';
+import { useGetDocumentUrl } from '@/components/document-preview/hooks';
 import { PageHeader } from '@/components/page-header';
 import {
  Breadcrumb,
@ -40,7 +42,6 @@ import {
  useNavigatePage,
 } from '@/hooks/logic-hooks/navigate-hooks';
 import { useFetchKnowledgeBaseConfiguration } from '@/hooks/use-knowledge-request';
-import { useGetDocumentUrl } from './components/document-preview/hooks';
 import styles from './index.less';

 const Chunk = () => {
@ -74,7 +75,7 @@ const Chunk = () => {
  } = useUpdateChunk();
  const { navigateToDataFile, getQueryString, navigateToDatasetList } =
    useNavigatePage();
-  const fileUrl = useGetDocumentUrl();
+  const fileUrl = useGetDocumentUrl(false);
  useEffect(() => {
    setChunkList(data);
  }, [data]);
--- a/web/src/pages/dataflow-result/components/document-preview/csv-preview.tsx
+++ b/web/src/pages/dataflow-result/components/document-preview/csv-preview.tsx
@ -1,114 +0,0 @@
-import message from '@/components/ui/message';
-import { Spin } from '@/components/ui/spin';
-import request from '@/utils/request';
-import classNames from 'classnames';
-import React, { useEffect, useRef, useState } from 'react';
-
-interface CSVData {
-  rows: string[][];
-  headers: string[];
-}
-
-interface FileViewerProps {
-  className?: string;
-  url: string;
-}
-
-const CSVFileViewer: React.FC<FileViewerProps> = ({ url }) => {
-  const [data, setData] = useState<CSVData | null>(null);
-  const [isLoading, setIsLoading] = useState<boolean>(true);
-  const containerRef = useRef<HTMLDivElement>(null);
-  // const url = useGetDocumentUrl();
-  const parseCSV = (csvText: string): CSVData => {
-    console.log('Parsing CSV data:', csvText);
-    const lines = csvText.split('\n');
-    const headers = lines[0].split(',').map((header) => header.trim());
-    const rows = lines
-      .slice(1)
-      .map((line) => line.split(',').map((cell) => cell.trim()));
-
-    return { headers, rows };
-  };
-
-  useEffect(() => {
-    const loadCSV = async () => {
-      try {
-        const res = await request(url, {
-          method: 'GET',
-          responseType: 'blob',
-          onError: () => {
-            message.error('file load failed');
-            setIsLoading(false);
-          },
-        });
-
-        // parse CSV file
-        const reader = new FileReader();
-        reader.readAsText(res.data);
-        reader.onload = () => {
-          const parsedData = parseCSV(reader.result as string);
-          console.log('file loaded successfully', reader.result);
-          setData(parsedData);
-        };
-      } catch (error) {
-        message.error('CSV file parse failed');
-        console.error('Error loading CSV file:', error);
-      } finally {
-        setIsLoading(false);
-      }
-    };
-
-    loadCSV();
-
-    return () => {
-      setData(null);
-    };
-  }, [url]);
-
-  return (
-    <div
-      ref={containerRef}
-      className={classNames(
-        'relative w-full h-full p-4 bg-background-paper border border-border-normal rounded-md',
-        'overflow-auto max-h-[80vh] p-2',
-      )}
-    >
-      {isLoading ? (
-        <div className="absolute inset-0 flex items-center justify-center">
-          <Spin />
-        </div>
-      ) : data ? (
-        <table className="min-w-full divide-y divide-border-normal">
-          <thead className="bg-background-header-bar">
-            <tr>
-              {data.headers.map((header, index) => (
-                <th
-                  key={`header-${index}`}
-                  className="px-6 py-3 text-left text-sm font-medium text-text-primary"
-                >
-                  {header}
-                </th>
-              ))}
-            </tr>
-          </thead>
-          <tbody className="bg-background-paper divide-y divide-border-normal">
-            {data.rows.map((row, rowIndex) => (
-              <tr key={`row-${rowIndex}`}>
-                {row.map((cell, cellIndex) => (
-                  <td
-                    key={`cell-${rowIndex}-${cellIndex}`}
-                    className="px-6 py-4 whitespace-nowrap text-sm text-text-secondary"
-                  >
-                    {cell || '-'}
-                  </td>
-                ))}
-              </tr>
-            ))}
-          </tbody>
-        </table>
-      ) : null}
-    </div>
-  );
-};
-
-export default CSVFileViewer;
--- a/web/src/pages/dataflow-result/components/document-preview/doc-preview.tsx
+++ b/web/src/pages/dataflow-result/components/document-preview/doc-preview.tsx
@ -1,70 +0,0 @@
-import message from '@/components/ui/message';
-import { Spin } from '@/components/ui/spin';
-import request from '@/utils/request';
-import classNames from 'classnames';
-import mammoth from 'mammoth';
-import { useEffect, useState } from 'react';
-
-interface DocPreviewerProps {
-  className?: string;
-  url: string;
-}
-
-export const DocPreviewer: React.FC<DocPreviewerProps> = ({
-  className,
-  url,
-}) => {
-  // const url = useGetDocumentUrl();
-  const [htmlContent, setHtmlContent] = useState<string>('');
-  const [loading, setLoading] = useState(false);
-  const fetchDocument = async () => {
-    setLoading(true);
-    const res = await request(url, {
-      method: 'GET',
-      responseType: 'blob',
-      onError: () => {
-        message.error('Document parsing failed');
-        console.error('Error loading document:', url);
-      },
-    });
-    try {
-      const arrayBuffer = await res.data.arrayBuffer();
-      const result = await mammoth.convertToHtml(
-        { arrayBuffer },
-        { includeDefaultStyleMap: true },
-      );
-
-      const styledContent = result.value
-        .replace(/<p>/g, '<p class="mb-2">')
-        .replace(/<h(\d)>/g, '<h$1 class="font-semibold mt-4 mb-2">');
-
-      setHtmlContent(styledContent);
-    } catch (err) {
-      message.error('Document parsing failed');
-      console.error('Error parsing document:', err);
-    }
-    setLoading(false);
-  };
-
-  useEffect(() => {
-    if (url) {
-      fetchDocument();
-    }
-  }, [url]);
-  return (
-    <div
-      className={classNames(
-        'relative w-full h-full p-4 bg-background-paper border border-border-normal rounded-md',
-        className,
-      )}
-    >
-      {loading && (
-        <div className="absolute inset-0 flex items-center justify-center">
-          <Spin />
-        </div>
-      )}
-
-      {!loading && <div dangerouslySetInnerHTML={{ __html: htmlContent }} />}
-    </div>
-  );
-};
--- a/web/src/pages/dataflow-result/components/document-preview/hooks.ts
+++ b/web/src/pages/dataflow-result/components/document-preview/hooks.ts
@ -1,60 +0,0 @@
-import { useGetKnowledgeSearchParams } from '@/hooks/route-hook';
-import api, { api_host } from '@/utils/api';
-import { useSize } from 'ahooks';
-import { CustomTextRenderer } from 'node_modules/react-pdf/dist/esm/shared/types';
-import { useCallback, useEffect, useMemo, useState } from 'react';
-import { useGetPipelineResultSearchParams } from '../../hooks';
-
-export const useDocumentResizeObserver = () => {
-  const [containerWidth, setContainerWidth] = useState<number>();
-  const [containerRef, setContainerRef] = useState<HTMLElement | null>(null);
-  const size = useSize(containerRef);
-
-  const onResize = useCallback((width?: number) => {
-    if (width) {
-      setContainerWidth(width);
-    }
-  }, []);
-
-  useEffect(() => {
-    onResize(size?.width);
-  }, [size?.width, onResize]);
-
-  return { containerWidth, setContainerRef };
-};
-
-function highlightPattern(text: string, pattern: string, pageNumber: number) {
-  if (pageNumber === 2) {
-    return `<mark>${text}</mark>`;
-  }
-  if (text.trim() !== '' && pattern.match(text)) {
-    // return pattern.replace(text, (value) => `<mark>${value}</mark>`);
-    return `<mark>${text}</mark>`;
-  }
-  return text.replace(pattern, (value) => `<mark>${value}</mark>`);
-}
-
-export const useHighlightText = (searchText: string = '') => {
-  const textRenderer: CustomTextRenderer = useCallback(
-    (textItem) => {
-      return highlightPattern(textItem.str, searchText, textItem.pageNumber);
-    },
-    [searchText],
-  );
-
-  return textRenderer;
-};
-
-export const useGetDocumentUrl = (isAgent: boolean) => {
-  const { documentId } = useGetKnowledgeSearchParams();
-  const { createdBy, documentId: id } = useGetPipelineResultSearchParams();
-
-  const url = useMemo(() => {
-    if (isAgent) {
-      return api.downloadFile + `?id=${id}&created_by=${createdBy}`;
-    }
-    return `${api_host}/document/get/${documentId}`;
-  }, [createdBy, documentId, id, isAgent]);
-
-  return url;
-};
--- a/web/src/pages/dataflow-result/components/document-preview/index.less
+++ b/web/src/pages/dataflow-result/components/document-preview/index.less
@ -1,13 +0,0 @@
-.documentContainer {
-  width: 100%;
-  // height: calc(100vh - 284px);
-  height: calc(100vh - 180px);
-  position: relative;
-  :global(.PdfHighlighter) {
-    overflow-x: hidden;
-  }
-  :global(.Highlight--scrolledTo .Highlight__part) {
-    overflow-x: hidden;
-    background-color: rgba(255, 226, 143, 1);
-  }
-}
--- a/web/src/pages/dataflow-result/components/document-preview/index.tsx
+++ b/web/src/pages/dataflow-result/components/document-preview/index.tsx
@ -1,67 +0,0 @@
-import { memo } from 'react';
-
-import CSVFileViewer from './csv-preview';
-import { DocPreviewer } from './doc-preview';
-import { ExcelCsvPreviewer } from './excel-preview';
-import { ImagePreviewer } from './image-preview';
-import PdfPreviewer, { IProps } from './pdf-preview';
-import { PptPreviewer } from './ppt-preview';
-import { TxtPreviewer } from './txt-preview';
-
-type PreviewProps = {
-  fileType: string;
-  className?: string;
-  url: string;
-};
-const Preview = ({
-  fileType,
-  className,
-  highlights,
-  setWidthAndHeight,
-  url,
-}: PreviewProps & Partial<IProps>) => {
-  return (
-    <>
-      {fileType === 'pdf' && highlights && setWidthAndHeight && (
-        <section>
-          <PdfPreviewer
-            highlights={highlights}
-            setWidthAndHeight={setWidthAndHeight}
-            url={url}
-          ></PdfPreviewer>
-        </section>
-      )}
-      {['doc', 'docx'].indexOf(fileType) > -1 && (
-        <section>
-          <DocPreviewer className={className} url={url} />
-        </section>
-      )}
-      {['txt', 'md'].indexOf(fileType) > -1 && (
-        <section>
-          <TxtPreviewer className={className} url={url} />
-        </section>
-      )}
-      {['visual'].indexOf(fileType) > -1 && (
-        <section>
-          <ImagePreviewer className={className} url={url} />
-        </section>
-      )}
-      {['pptx'].indexOf(fileType) > -1 && (
-        <section>
-          <PptPreviewer className={className} url={url} />
-        </section>
-      )}
-      {['xlsx'].indexOf(fileType) > -1 && (
-        <section>
-          <ExcelCsvPreviewer className={className} url={url} />
-        </section>
-      )}
-      {['csv'].indexOf(fileType) > -1 && (
-        <section>
-          <CSVFileViewer className={className} url={url} />
-        </section>
-      )}
-    </>
-  );
-};
-export default memo(Preview);
--- a/web/src/pages/dataflow-result/components/document-preview/pdf-preview.tsx
+++ b/web/src/pages/dataflow-result/components/document-preview/pdf-preview.tsx
@ -1,127 +0,0 @@
-import { memo, useEffect, useRef } from 'react';
-import {
-  AreaHighlight,
-  Highlight,
-  IHighlight,
-  PdfHighlighter,
-  PdfLoader,
-  Popup,
-} from 'react-pdf-highlighter';
-
-import { useCatchDocumentError } from '@/components/pdf-previewer/hooks';
-import { Spin } from '@/components/ui/spin';
-import FileError from '@/pages/document-viewer/file-error';
-import styles from './index.less';
-
-export interface IProps {
-  highlights: IHighlight[];
-  setWidthAndHeight: (width: number, height: number) => void;
-  url: string;
-}
-const HighlightPopup = ({
-  comment,
-}: {
-  comment: { text: string; emoji: string };
-}) =>
-  comment.text ? (
-    <div className="Highlight__popup">
-      {comment.emoji} {comment.text}
-    </div>
-  ) : null;
-
-// TODO: merge with DocumentPreviewer
-const PdfPreview = ({ highlights: state, setWidthAndHeight, url }: IProps) => {
-  // const url = useGetDocumentUrl();
-
-  const ref = useRef<(highlight: IHighlight) => void>(() => {});
-  const error = useCatchDocumentError(url);
-
-  const resetHash = () => {};
-
-  useEffect(() => {
-    if (state.length > 0) {
-      ref?.current(state[0]);
-    }
-  }, [state]);
-
-  return (
-    <div
-      className={`${styles.documentContainer} rounded-[10px] overflow-hidden	`}
-    >
-      <PdfLoader
-        url={url}
-        beforeLoad={
-          <div className="absolute inset-0 flex items-center justify-center">
-            <Spin />
-          </div>
-        }
-        workerSrc="/pdfjs-dist/pdf.worker.min.js"
-        errorMessage={<FileError>{error}</FileError>}
-      >
-        {(pdfDocument) => {
-          pdfDocument.getPage(1).then((page) => {
-            const viewport = page.getViewport({ scale: 1 });
-            const width = viewport.width;
-            const height = viewport.height;
-            setWidthAndHeight(width, height);
-          });
-
-          return (
-            <PdfHighlighter
-              pdfDocument={pdfDocument}
-              enableAreaSelection={(event) => event.altKey}
-              onScrollChange={resetHash}
-              scrollRef={(scrollTo) => {
-                ref.current = scrollTo;
-              }}
-              onSelectionFinished={() => null}
-              highlightTransform={(
-                highlight,
-                index,
-                setTip,
-                hideTip,
-                viewportToScaled,
-                screenshot,
-                isScrolledTo,
-              ) => {
-                const isTextHighlight = !Boolean(
-                  highlight.content && highlight.content.image,
-                );
-
-                const component = isTextHighlight ? (
-                  <Highlight
-                    isScrolledTo={isScrolledTo}
-                    position={highlight.position}
-                    comment={highlight.comment}
-                  />
-                ) : (
-                  <AreaHighlight
-                    isScrolledTo={isScrolledTo}
-                    highlight={highlight}
-                    onChange={() => {}}
-                  />
-                );
-
-                return (
-                  <Popup
-                    popupContent={<HighlightPopup {...highlight} />}
-                    onMouseOver={(popupContent) =>
-                      setTip(highlight, () => popupContent)
-                    }
-                    onMouseOut={hideTip}
-                    key={index}
-                  >
-                    {component}
-                  </Popup>
-                );
-              }}
-              highlights={state}
-            />
-          );
-        }}
-      </PdfLoader>
-    </div>
-  );
-};
-
-export default memo(PdfPreview);
--- a/web/src/pages/dataflow-result/components/document-preview/ppt-preview.tsx
+++ b/web/src/pages/dataflow-result/components/document-preview/ppt-preview.tsx
@ -1,70 +0,0 @@
-import message from '@/components/ui/message';
-import request from '@/utils/request';
-import classNames from 'classnames';
-import { init } from 'pptx-preview';
-import { useEffect, useRef } from 'react';
-interface PptPreviewerProps {
-  className?: string;
-  url: string;
-}
-
-export const PptPreviewer: React.FC<PptPreviewerProps> = ({
-  className,
-  url,
-}) => {
-  // const url = useGetDocumentUrl();
-  const wrapper = useRef<HTMLDivElement>(null);
-  const containerRef = useRef<HTMLDivElement>(null);
-  const fetchDocument = async () => {
-    const res = await request(url, {
-      method: 'GET',
-      responseType: 'blob',
-      onError: () => {
-        message.error('Document parsing failed');
-        console.error('Error loading document:', url);
-      },
-    });
-    console.log(res);
-    try {
-      const arrayBuffer = await res.data.arrayBuffer();
-
-      if (containerRef.current) {
-        let width = 500;
-        let height = 900;
-        if (containerRef.current) {
-          width = containerRef.current.clientWidth - 50;
-          height = containerRef.current.clientHeight - 50;
-        }
-        let pptxPrviewer = init(containerRef.current, {
-          width: width,
-          height: height,
-        });
-        pptxPrviewer.preview(arrayBuffer);
-      }
-    } catch (err) {
-      message.error('ppt parse failed');
-    }
-  };
-
-  useEffect(() => {
-    if (url) {
-      fetchDocument();
-    }
-  }, [url]);
-
-  return (
-    <div
-      ref={containerRef}
-      className={classNames(
-        'relative w-full h-full p-4 bg-background-paper border border-border-normal rounded-md ppt-previewer',
-        className,
-      )}
-    >
-      <div className="overflow-auto p-2">
-        <div className="flex flex-col gap-4">
-          <div ref={wrapper} />
-        </div>
-      </div>
-    </div>
-  );
-};
--- a/web/src/pages/dataflow-result/components/document-preview/txt-preview.tsx
+++ b/web/src/pages/dataflow-result/components/document-preview/txt-preview.tsx
@ -1,56 +0,0 @@
-import message from '@/components/ui/message';
-import { Spin } from '@/components/ui/spin';
-import request from '@/utils/request';
-import classNames from 'classnames';
-import { useEffect, useState } from 'react';
-
-type TxtPreviewerProps = { className?: string; url: string };
-export const TxtPreviewer = ({ className, url }: TxtPreviewerProps) => {
-  // const url = useGetDocumentUrl();
-  const [loading, setLoading] = useState(false);
-  const [data, setData] = useState<string>('');
-  const fetchTxt = async () => {
-    setLoading(true);
-    const res = await request(url, {
-      method: 'GET',
-      responseType: 'blob',
-      onError: (err: any) => {
-        message.error('Failed to load file');
-        console.error('Error loading file:', err);
-      },
-    });
-    // blob to string
-    const reader = new FileReader();
-    reader.readAsText(res.data);
-    reader.onload = () => {
-      setData(reader.result as string);
-      setLoading(false);
-      console.log('file loaded successfully', reader.result);
-    };
-    console.log('file data:', res);
-  };
-  useEffect(() => {
-    if (url) {
-      fetchTxt();
-    } else {
-      setLoading(false);
-      setData('');
-    }
-  }, [url]);
-  return (
-    <div
-      className={classNames(
-        'relative w-full h-full p-4 bg-background-paper border border-border-normal rounded-md',
-        className,
-      )}
-    >
-      {loading && (
-        <div className="absolute inset-0 flex items-center justify-center">
-          <Spin />
-        </div>
-      )}
-
-      {!loading && <pre className="whitespace-pre-wrap p-2 ">{data}</pre>}
-    </div>
-  );
-};
--- a/web/src/pages/dataflow-result/index.tsx
+++ b/web/src/pages/dataflow-result/index.tsx
@ -1,7 +1,7 @@
+import DocumentPreview from '@/components/document-preview';
 import { useFetchNextChunkList } from '@/hooks/use-chunk-request';
 import { useMemo, useState } from 'react';
 import { useTranslation } from 'react-i18next';
-import DocumentPreview from './components/document-preview';
 import {
  useFetchPipelineFileLogDetail,
  useFetchPipelineResult,
@ -13,8 +13,9 @@ import {
  useTimelineDataFlow,
 } from './hooks';

-import DocumentHeader from './components/document-preview/document-header';
+import DocumentHeader from '@/components/document-preview/document-header';

+import { useGetDocumentUrl } from '@/components/document-preview/hooks';
 import { TimelineNode } from '@/components/originui/timeline';
 import { PageHeader } from '@/components/page-header';
 import Spotlight from '@/components/spotlight';
@ -32,7 +33,6 @@ import { AgentCategory } from '@/constants/agent';
 import { Images } from '@/constants/common';
 import { useNavigatePage } from '@/hooks/logic-hooks/navigate-hooks';
 import { useGetKnowledgeSearchParams } from '@/hooks/route-hook';
-import { useGetDocumentUrl } from './components/document-preview/hooks';
 import TimelineDataFlow from './components/time-line';
 import { TimelineNodeType } from './constant';
 import styles from './index.less';
@ -76,13 +76,14 @@ const Chunk = () => {
  const fileType = useMemo(() => {
    if (isAgent) {
      return Images.some((x) => x === documentExtension)
-        ? 'visual'
+        ? documentInfo?.name.split('.').pop() || 'visual'
        : documentExtension;
    }
    switch (documentInfo?.type) {
      case 'doc':
        return documentInfo?.name.split('.').pop() || 'doc';
      case 'visual':
+        return documentInfo?.name.split('.').pop() || 'visual';
      case 'docx':
      case 'txt':
      case 'md':
--- a/web/src/pages/document-viewer/docx/index.less
+++ b/web/src/pages/document-viewer/docx/index.less
@ -1,282 +0,0 @@
-// Copyright (c) 2017 PlanGrid, Inc.
-
-.docxViewerWrapper {
-  overflow-y: scroll;
-  height: 100%;
-  width: 100%;
-
-  .box {
-    width: 100%;
-    height: 100%;
-  }
-
-  :global(.document-container) {
-    padding: 30px;
-    width: 700px;
-    background: rgba(255, 255, 255, 0.1);
-
-    margin: auto;
-  }
-
-  html,
-  bodyaddress,
-  blockquote,
-  body,
-  dd,
-  div,
-  dl,
-  dt,
-  fieldset,
-  form,
-  frame,
-  frameset,
-  h1,
-  h2,
-  h3,
-  h4,
-  h5,
-  h6,
-  noframes,
-  ol,
-  p,
-  ul,
-  center,
-  dir,
-  hr,
-  menu,
-  pre {
-    display: block;
-    unicode-bidi: embed;
-  }
-  li {
-    display: list-item;
-    list-style-type: disc;
-  }
-  head {
-    display: none;
-  }
-  table {
-    display: table;
-  }
-  img {
-    width: 100%;
-  }
-  tr {
-    display: table-row;
-  }
-  thead {
-    display: table-header-group;
-  }
-  tbody {
-    display: table-row-group;
-  }
-  tfoot {
-    display: table-footer-group;
-  }
-  col {
-    display: table-column;
-  }
-  colgroup {
-    display: table-column-group;
-  }
-  th {
-    display: table-cell;
-  }
-  td {
-    display: table-cell;
-    border-bottom: 1px solid #ccc;
-    border-right: 1px solid #ccc;
-    padding: 0.2em 0.5em;
-  }
-  caption {
-    display: table-caption;
-  }
-  th {
-    font-weight: bolder;
-    text-align: center;
-  }
-  caption {
-    text-align: center;
-  }
-  body {
-    margin: 8px;
-  }
-  h1 {
-    font-size: 2em;
-    margin: 0.67em 0;
-  }
-  h2 {
-    font-size: 1.5em;
-    margin: 0.75em 0;
-  }
-  h3 {
-    font-size: 1.17em;
-    margin: 0.83em 0;
-  }
-  h4,
-  p,
-  blockquote,
-  ul,
-  fieldset,
-  form,
-  ol,
-  dl,
-  dir,
-  menu {
-    margin: 1.12em 0;
-  }
-  h5 {
-    font-size: 0.83em;
-    margin: 1.5em 0;
-  }
-  h6 {
-    font-size: 0.75em;
-    margin: 1.67em 0;
-  }
-  h1,
-  h2,
-  h3,
-  h4,
-  h5,
-  h6,
-  b,
-  strong {
-    font-weight: bolder;
-  }
-  blockquote {
-    margin-left: 40px;
-    margin-right: 40px;
-  }
-  i,
-  cite,
-  em,
-  var,
-  address {
-    font-style: italic;
-  }
-  pre,
-  tt,
-  code,
-  kbd,
-  samp {
-    font-family: monospace;
-  }
-  pre {
-    white-space: pre;
-  }
-  button,
-  textarea,
-  input,
-  select {
-    display: inline-block;
-  }
-  big {
-    font-size: 1.17em;
-  }
-  small,
-  sub,
-  sup {
-    font-size: 0.83em;
-  }
-  sub {
-    vertical-align: sub;
-  }
-  sup {
-    vertical-align: super;
-  }
-  table {
-    border-spacing: 2px;
-  }
-  thead,
-  tbody,
-  tfoot {
-    vertical-align: middle;
-  }
-  td,
-  th,
-  tr {
-    vertical-align: inherit;
-  }
-  s,
-  strike,
-  del {
-    text-decoration: line-through;
-  }
-  hr {
-    border: 1px inset;
-  }
-  ol,
-  ul,
-  dir,
-  menu,
-  dd {
-    margin-left: 40px;
-  }
-  ol {
-    list-style-type: decimal;
-  }
-  ol ul,
-  ol ul,
-  ul ol,
-  ul ol,
-  ul ul,
-  ul ul,
-  ol ol,
-  ol ol {
-    margin-top: 0;
-    margin-bottom: 0;
-  }
-  u,
-  ins {
-    text-decoration: underline;
-  }
-  br:before {
-    content: '\A';
-    white-space: pre-line;
-  }
-  center {
-    text-align: center;
-  }
-  :link,
-  :visited {
-    text-decoration: underline;
-  }
-  :focus {
-    outline: thin dotted invert;
-  }
-  /* Begin bidirectionality settings (do not change) */
-  BDO[DIR='ltr'] {
-    direction: ltr;
-    unicode-bidi: bidi-override;
-  }
-  BDO[DIR='rtl'] {
-    direction: rtl;
-    unicode-bidi: bidi-override;
-  }
-  *[DIR='ltr'] {
-    direction: ltr;
-    unicode-bidi: embed;
-  }
-  *[DIR='rtl'] {
-    direction: rtl;
-    unicode-bidi: embed;
-  }
-  @media print {
-    h1 {
-      page-break-before: always;
-    }
-    h1,
-    h2,
-    h3,
-    h4,
-    h5,
-    h6 {
-      page-break-after: avoid;
-    }
-    ul,
-    ol,
-    dl {
-      page-break-before: avoid;
-    }
-  }
-}
--- a/web/src/pages/document-viewer/docx/index.tsx
+++ b/web/src/pages/document-viewer/docx/index.tsx
@ -1,25 +0,0 @@
-import { Spin } from 'antd';
-import FileError from '../file-error';
-
-import { useFetchDocx } from '../hooks';
-import styles from './index.less';
-
-const Docx = ({ filePath }: { filePath: string }) => {
-  const { succeed, containerRef, error } = useFetchDocx(filePath);
-
-  return (
-    <>
-      {succeed ? (
-        <section className={styles.docxViewerWrapper}>
-          <div id="docx" ref={containerRef} className={styles.box}>
-            <Spin />
-          </div>
-        </section>
-      ) : (
-        <FileError>{error}</FileError>
-      )}
-    </>
-  );
-};
-
-export default Docx;
--- a/web/src/pages/document-viewer/excel/index.tsx
+++ b/web/src/pages/document-viewer/excel/index.tsx
@ -1,19 +0,0 @@
-import '@js-preview/excel/lib/index.css';
-import FileError from '../file-error';
-import { useFetchExcel } from '../hooks';
-
-const Excel = ({ filePath }: { filePath: string }) => {
-  const { status, containerRef, error } = useFetchExcel(filePath);
-
-  return (
-    <div
-      id="excel"
-      ref={containerRef}
-      style={{ height: '100%', width: '100%' }}
-    >
-      {status || <FileError>{error}</FileError>}
-    </div>
-  );
-};
-
-export default Excel;
--- a/web/src/pages/document-viewer/file-error/index.less
+++ b/web/src/pages/document-viewer/file-error/index.less
@ -1,4 +0,0 @@
-.errorWrapper {
-  width: 100%;
-  height: 100%;
-}
--- a/web/src/pages/document-viewer/file-error/index.tsx
+++ b/web/src/pages/document-viewer/file-error/index.tsx
@ -1,18 +1,18 @@
-import { Alert, Flex } from 'antd';
-
 import { useTranslate } from '@/hooks/common-hooks';
 import React from 'react';
-import styles from './index.less';

 const FileError = ({ children }: React.PropsWithChildren) => {
  const { t } = useTranslate('fileManager');
  return (
-    <Flex align="center" justify="center" className={styles.errorWrapper}>
-      <Alert
-        type="error"
-        message={<h2>{children || t('fileError')}</h2>}
-      ></Alert>
-    </Flex>
+    <div className="flex items-center justify-center min-h-screen">
+      <div className="bg-state-error-5 border border-state-error rounded-lg p-4 shadow-sm">
+        <div className="flex ml-3">
+          <div className="text-white font-medium">
+            {children || t('fileError')}
+          </div>
+        </div>
+      </div>
+    </div>
  );
 };

--- a/web/src/pages/document-viewer/index.tsx
+++ b/web/src/pages/document-viewer/index.tsx
@ -1,16 +1,22 @@
 import { Images } from '@/constants/common';
 import { api_host } from '@/utils/api';
-import { Flex } from 'antd';
+// import { Flex } from 'antd';
 import { useParams, useSearchParams } from 'umi';
-import Docx from './docx';
-import Excel from './excel';
-import Image from './image';
-import Md from './md';
-import Pdf from './pdf';
-import Text from './text';
+// import Docx from './docx';
+// import Excel from './excel';
+// import Image from './image';
+// import Md from './md';
+// import Pdf from './pdf';
+// import Text from './text';

+import { DocPreviewer } from '@/components/document-preview/doc-preview';
+import { ExcelCsvPreviewer } from '@/components/document-preview/excel-preview';
+import { ImagePreviewer } from '@/components/document-preview/image-preview';
+import Md from '@/components/document-preview/md';
+import PdfPreview from '@/components/document-preview/pdf-preview';
+import { TxtPreviewer } from '@/components/document-preview/txt-preview';
 import { previewHtmlFile } from '@/utils/file-util';
-import styles from './index.less';
+// import styles from './index.less';

 // TODO: The interface returns an incorrect content-type for the SVG.

@ -20,6 +26,7 @@ const DocumentViewer = () => {
  const ext = currentQueryParameters.get('ext');
  const prefix = currentQueryParameters.get('prefix');
  const api = `${api_host}/${prefix || 'file'}/get/${documentId}`;
+  // request.head

  if (ext === 'html' && documentId) {
    previewHtmlFile(documentId);
@ -27,19 +34,24 @@ const DocumentViewer = () => {
  }

  return (
-    <section className={styles.viewerWrapper}>
+    <section className="w-full h-full">
      {Images.includes(ext!) && (
-        <Flex className={styles.image} align="center" justify="center">
-          <Image src={api} preview={false}></Image>
-        </Flex>
+        <div className="flex w-full h-full items-center justify-center">
+          {/* <Image src={api} preview={false}></Image> */}
+          <ImagePreviewer className="w-full !h-dvh p-5" url={api} />
+        </div>
      )}
-      {ext === 'md' && <Md filePath={api}></Md>}
-      {ext === 'txt' && <Text filePath={api}></Text>}
+      {ext === 'md' && <Md url={api} className="!h-dvh p-5"></Md>}
+      {ext === 'txt' && <TxtPreviewer url={api}></TxtPreviewer>}

-      {ext === 'pdf' && <Pdf url={api}></Pdf>}
-      {(ext === 'xlsx' || ext === 'xls') && <Excel filePath={api}></Excel>}
+      {ext === 'pdf' && (
+        <PdfPreview url={api} className="!h-dvh p-5"></PdfPreview>
+      )}
+      {(ext === 'xlsx' || ext === 'xls') && (
+        <ExcelCsvPreviewer url={api}></ExcelCsvPreviewer>
+      )}

-      {ext === 'docx' && <Docx filePath={api}></Docx>}
+      {ext === 'docx' && <DocPreviewer url={api}></DocPreviewer>}
    </section>
  );
 };
--- a/web/src/pages/document-viewer/text/index.tsx
+++ b/web/src/pages/document-viewer/text/index.tsx
@ -1,32 +0,0 @@
-import React, { useEffect, useState } from 'react';
-import FileError from '../file-error';
-
-interface TxtProps {
-  filePath: string;
-}
-
-const Md: React.FC<TxtProps> = ({ filePath }) => {
-  const [content, setContent] = useState<string>('');
-  const [error, setError] = useState<string | null>(null);
-
-  useEffect(() => {
-    setError(null);
-    fetch(filePath)
-      .then((res) => {
-        if (!res.ok) throw new Error('Failed to fetch text file');
-        return res.text();
-      })
-      .then((text) => setContent(text))
-      .catch((err) => setError(err.message));
-  }, [filePath]);
-
-  if (error) return <FileError>{error}</FileError>;
-
-  return (
-    <div style={{ padding: 24, height: '100vh', overflow: 'scroll' }}>
-      {content}
-    </div>
-  );
-};
-
-export default Md;
--- a/web/src/pages/next-search/document-preview-modal/index.tsx
+++ b/web/src/pages/next-search/document-preview-modal/index.tsx
@ -1,3 +1,4 @@
+import DocumentPreview from '@/components/document-preview';
 import { FileIcon } from '@/components/icon-font';
 import { Modal } from '@/components/ui/modal/modal';
 import {
@ -7,7 +8,6 @@ import {
 import { IModalProps } from '@/interfaces/common';
 import { IReferenceChunk } from '@/interfaces/database/chat';
 import { IChunk } from '@/interfaces/database/knowledge';
-import DocumentPreview from '@/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/document-preview';
 import { useEffect, useState } from 'react';

 interface IProps extends IModalProps<any> {
--- a/web/src/pages/user-setting/data-source/hooks.ts
+++ b/web/src/pages/user-setting/data-source/hooks.ts
@ -45,21 +45,23 @@ export const useListDataSource = () => {

  const updatedDataSourceTemplates = useMemo(() => {
    const categorizedData = categorizeDataBySource(list || []);
-    let sourcelist: Array<IDataSorceInfo & { list: Array<IDataSourceBase> }> =
+    let sourceList: Array<IDataSorceInfo & { list: Array<IDataSourceBase> }> =
      [];
    Object.keys(categorizedData).forEach((key: string) => {
      const k = key as DataSourceKey;
-      sourcelist.push({
-        id: k,
-        name: DataSourceInfo[k].name,
-        description: DataSourceInfo[k].description,
-        icon: DataSourceInfo[k].icon,
-        list: categorizedData[k] || [],
-      });
+      if (DataSourceInfo[k]) {
+        sourceList.push({
+          id: k,
+          name: DataSourceInfo[k].name,
+          description: DataSourceInfo[k].description,
+          icon: DataSourceInfo[k].icon,
+          list: categorizedData[k] || [],
+        });
+      }
    });

-    console.log('🚀 ~ useListDataSource ~ sourcelist:', sourcelist);
-    return sourcelist;
+    console.log('🚀 ~ useListDataSource ~ sourceList:', sourceList);
+    return sourceList;
  }, [list]);

  return { list, categorizedList: updatedDataSourceTemplates, isFetching };