Compare commits

..

3 Commits

Author SHA1 Message Date
74e0b58d89 Fix: excel default optimization. (#11519)
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2025-11-25 19:54:20 +08:00
7c20c964b4 Fix: incorrect image merging for naive markdown parser (#11520)
### What problem does this PR solve?

Fix incorrect image merging for naive markdown parser. #9349 


[ragflow_readme.webm](https://github.com/user-attachments/assets/ca3f1e18-72b6-4a4c-80db-d03da9adf8dc)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2025-11-25 19:54:06 +08:00
5d0981d046 Refactoring: Integrating the file preview component (#11523)
### What problem does this PR solve?

Refactoring: Integrating the file preview component

### Type of change

- [x] Refactoring
2025-11-25 19:13:00 +08:00
42 changed files with 454 additions and 1327 deletions

View File

@ -72,9 +72,8 @@ class RAGFlowMarkdownParser:
# Replace any TAGS e.g. <table ...> to <table>
TAGS = ["table", "td", "tr", "th", "tbody", "thead", "div"]
table_with_attributes_pattern = re.compile(
rf"<(?:{'|'.join(TAGS)})[^>]*>", re.IGNORECASE
)
table_with_attributes_pattern = re.compile(rf"<(?:{'|'.join(TAGS)})[^>]*>", re.IGNORECASE)
def replace_tag(m):
tag_name = re.match(r"<(\w+)", m.group()).group(1)
return "<{}>".format(tag_name)
@ -128,23 +127,48 @@ class MarkdownElementExtractor:
self.markdown_content = markdown_content
self.lines = markdown_content.split("\n")
def get_delimiters(self,delimiters):
def get_delimiters(self, delimiters):
    """Turn the backtick-quoted tokens of ``delimiters`` into a regex alternation.

    Every `` `token` `` found in the string is de-duplicated, regex-escaped
    and joined with ``|``. Longer tokens are placed first so that a shorter
    token cannot pre-empt a longer one that starts the same way. Returns an
    empty string when no backtick-quoted token is present.
    """
    unique_tokens = set(re.findall(r"`([^`]+)`", delimiters))
    # A stable descending sort by length keeps ties in set-iteration order.
    longest_first = sorted(unique_tokens, key=len, reverse=True)
    escaped = [re.escape(token) for token in longest_first if token]
    return "|".join(escaped)
def extract_elements(self,delimiter=None):
def extract_elements(self, delimiter=None, include_meta=False):
"""Extract individual elements (headers, code blocks, lists, etc.)"""
sections = []
i = 0
dels=""
dels = ""
if delimiter:
dels = self.get_delimiters(delimiter)
if len(dels) > 0:
text = "\n".join(self.lines)
parts = re.split(dels, text)
sections = [p.strip() for p in parts if p and p.strip()]
if include_meta:
pattern = re.compile(dels)
last_end = 0
for m in pattern.finditer(text):
part = text[last_end : m.start()]
if part and part.strip():
sections.append(
{
"content": part.strip(),
"start_line": text.count("\n", 0, last_end),
"end_line": text.count("\n", 0, m.start()),
}
)
last_end = m.end()
part = text[last_end:]
if part and part.strip():
sections.append(
{
"content": part.strip(),
"start_line": text.count("\n", 0, last_end),
"end_line": text.count("\n", 0, len(text)),
}
)
else:
parts = re.split(dels, text)
sections = [p.strip() for p in parts if p and p.strip()]
return sections
while i < len(self.lines):
line = self.lines[i]
@ -152,32 +176,35 @@ class MarkdownElementExtractor:
if re.match(r"^#{1,6}\s+.*$", line):
# header
element = self._extract_header(i)
sections.append(element["content"])
sections.append(element if include_meta else element["content"])
i = element["end_line"] + 1
elif line.strip().startswith("```"):
# code block
element = self._extract_code_block(i)
sections.append(element["content"])
sections.append(element if include_meta else element["content"])
i = element["end_line"] + 1
elif re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line):
# list block
element = self._extract_list_block(i)
sections.append(element["content"])
sections.append(element if include_meta else element["content"])
i = element["end_line"] + 1
elif line.strip().startswith(">"):
# blockquote
element = self._extract_blockquote(i)
sections.append(element["content"])
sections.append(element if include_meta else element["content"])
i = element["end_line"] + 1
elif line.strip():
# text block (paragraphs and inline elements until next block element)
element = self._extract_text_block(i)
sections.append(element["content"])
sections.append(element if include_meta else element["content"])
i = element["end_line"] + 1
else:
i += 1
sections = [section for section in sections if section.strip()]
if include_meta:
sections = [section for section in sections if section["content"].strip()]
else:
sections = [section for section in sections if section.strip()]
return sections
def _extract_header(self, start_pos):

View File

@ -26,6 +26,7 @@ from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
from markdown import markdown
from PIL import Image
from common.token_utils import num_tokens_from_string
from common.constants import LLMType
from api.db.services.llm_service import LLMBundle
@ -464,51 +465,88 @@ class Markdown(MarkdownParser):
html_content = markdown(text)
soup = BeautifulSoup(html_content, 'html.parser')
return soup
def get_picture_urls(self, soup):
if soup:
return [img.get('src') for img in soup.find_all('img') if img.get('src')]
return []
def get_hyperlink_urls(self, soup):
if soup:
return set([a.get('href') for a in soup.find_all('a') if a.get('href')])
return []
def get_pictures(self, text):
"""Download and open all images from markdown text."""
def extract_image_urls_with_lines(self, text):
    """Find image references in ``text`` together with their 0-based line.

    Two passes are made:

    1. A per-line scan for markdown images (``![alt](url)``) and for
       quoted ``src="..."`` / ``src='...'`` attributes.
    2. A best-effort BeautifulSoup pass over the whole text for ``<img>``
       tags, which also covers tags whose attributes span several lines.

    Lines are delimited by ``"\n"`` only, so that the indices agree with
    the cross-line pass below and with any consumer that splits the same
    text on ``"\n"``.

    Returns a list of ``{"url": str, "line": int}`` dicts in discovery
    order, de-duplicated on the ``(url, line)`` pair.
    """
    if not text:
        return []

    md_img_re = re.compile(r"!\[[^\]]*\]\(([^)\s]+)")
    # Raw string: single backslashes only. The URL runs until a quote, '>'
    # or whitespace. (Doubling the backslashes here would exclude the
    # letter "s" instead of whitespace and truncate "https://..." to "http".)
    html_img_re = re.compile(r"""src=["']([^"'>\s]+)""", re.IGNORECASE)

    urls = []
    seen = set()

    def remember(url, line_no):
        # Record each (url, line) pair once, preserving discovery order.
        key = (url, line_no)
        if key not in seen:
            seen.add(key)
            urls.append({"url": url, "line": line_no})

    for idx, line in enumerate(text.split("\n")):
        for url in md_img_re.findall(line):
            remember(url, idx)
        for url in html_img_re.findall(line):
            remember(url, idx)

    # Cross-line pass: optional, never allowed to break parsing.
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        return urls

    try:
        soup = BeautifulSoup(text, "html.parser")
        for img_tag in soup.find_all("img"):
            src = img_tag.get("src")
            if not src:
                continue
            pos = text.find(str(img_tag))
            if pos == -1:
                # The re-serialized tag rarely matches the raw text exactly;
                # fall back to the first occurrence of the URL itself, or
                # to the start of the text when even that is not found.
                pos = max(text.find(src), 0)
            remember(src, text.count("\n", 0, pos))
    except Exception as e:
        logging.warning(f"Cross-line <img> scan failed, keeping per-line results: {e}")
    return urls
def load_images_from_urls(self, urls, cache=None):
import requests
soup = self.md_to_html(text)
image_urls = self.get_picture_urls(soup)
from pathlib import Path
cache = cache or {}
images = []
# Find all image URLs in text
for url in image_urls:
if not url:
for url in urls:
if url in cache:
if cache[url]:
images.append(cache[url])
continue
img_obj = None
try:
# check if the url is a local file or a remote URL
if url.startswith(('http://', 'https://')):
# For remote URLs, download the image
response = requests.get(url, stream=True, timeout=30)
if response.status_code == 200 and response.headers['Content-Type'] and response.headers['Content-Type'].startswith('image/'):
img = Image.open(BytesIO(response.content)).convert('RGB')
images.append(img)
if response.status_code == 200 and response.headers.get('Content-Type', '').startswith('image/'):
img_obj = Image.open(BytesIO(response.content)).convert('RGB')
else:
# For local file paths, open the image directly
from pathlib import Path
local_path = Path(url)
if not local_path.exists():
if local_path.exists():
img_obj = Image.open(url).convert('RGB')
else:
logging.warning(f"Local image file not found: {url}")
continue
img = Image.open(url).convert('RGB')
images.append(img)
except Exception as e:
logging.error(f"Failed to download/open image from {url}: {e}")
continue
cache[url] = img_obj
if img_obj:
images.append(img_obj)
return images, cache
return images if images else None
def __call__(self, filename, binary=None, separate_tables=True, delimiter=None):
def __call__(self, filename, binary=None, separate_tables=True, delimiter=None, return_section_images=False):
if binary:
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
@ -520,11 +558,31 @@ class Markdown(MarkdownParser):
# To eliminate duplicate tables in chunking result, uncomment code below and set separate_tables to True in line 410.
# extractor = MarkdownElementExtractor(remainder)
extractor = MarkdownElementExtractor(txt)
element_sections = extractor.extract_elements(delimiter)
sections = [(element, "") for element in element_sections]
image_refs = self.extract_image_urls_with_lines(txt)
element_sections = extractor.extract_elements(delimiter, include_meta=True)
sections = []
section_images = []
image_cache = {}
for element in element_sections:
content = element["content"]
start_line = element["start_line"]
end_line = element["end_line"]
urls_in_section = [ref["url"] for ref in image_refs if start_line <= ref["line"] <= end_line]
imgs = []
if urls_in_section:
imgs, image_cache = self.load_images_from_urls(urls_in_section, image_cache)
combined_image = None
if imgs:
combined_image = reduce(concat_img, imgs) if len(imgs) > 1 else imgs[0]
sections.append((content, ""))
section_images.append(combined_image)
tbls = []
for table in tables:
tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
if return_section_images:
return sections, tbls, section_images
return sections, tbls
def load_from_xml_v2(baseURI, rels_item_xml):
@ -558,6 +616,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
final_sections = False
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
@ -695,9 +754,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
excel_parser = ExcelParser()
if parser_config.get("html4excel"):
sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
parser_config["chunk_token_num"] = 0
else:
sections = [(_, "") for _ in excel_parser(binary) if _]
parser_config["chunk_token_num"] = 12800
elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
@ -709,7 +768,15 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
sections, tables = markdown_parser(filename, binary, separate_tables=False, delimiter=parser_config.get("delimiter", "\n!?;。;!?"))
sections, tables, section_images = markdown_parser(
filename,
binary,
separate_tables=False,
delimiter=parser_config.get("delimiter", "\n!?;。;!?"),
return_section_images=True,
)
final_sections = True
try:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
@ -719,19 +786,22 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if vision_model:
# Process images for each section
section_images = []
for idx, (section_text, _) in enumerate(sections):
images = markdown_parser.get_pictures(section_text) if section_text else None
images = []
if section_images and len(section_images) > idx and section_images[idx] is not None:
images.append(section_images[idx])
if images:
if images and len(images) > 0:
# If multiple images found, combine them using concat_img
combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
section_images.append(combined_image)
if section_images:
section_images[idx] = combined_image
else:
section_images = [None] * len(sections)
section_images[idx] = combined_image
markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data= [((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs)
boosted_figures = markdown_vision_parser(callback=callback)
sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]), sections[idx][1])
else:
section_images.append(None)
else:
logging.warning("No visual model detected. Skipping figure parsing enhancement.")
@ -783,31 +853,81 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
"file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
st = timer()
if section_images:
# if all images are None, set section_images to None
if all(image is None for image in section_images):
section_images = None
if final_sections:
merged_chunks = []
merged_images = []
chunk_limit = max(0, int(parser_config.get("chunk_token_num", 128)))
overlapped_percent = int(parser_config.get("overlapped_percent", 0))
overlapped_percent = max(0, min(overlapped_percent, 90))
if section_images:
chunks, images = naive_merge_with_images(sections, section_images,
int(parser_config.get(
"chunk_token_num", 128)), parser_config.get(
"delimiter", "\n!?。;!?"))
current_text = ""
current_tokens = 0
current_image = None
for idx, sec in enumerate(sections):
text = sec[0] if isinstance(sec, tuple) else sec
sec_tokens = num_tokens_from_string(text)
sec_image = section_images[idx] if section_images and idx < len(section_images) else None
if current_text and current_tokens + sec_tokens > chunk_limit:
merged_chunks.append(current_text)
merged_images.append(current_image)
overlap_part = ""
if overlapped_percent > 0:
overlap_len = int(len(current_text) * overlapped_percent / 100)
if overlap_len > 0:
overlap_part = current_text[-overlap_len:]
current_text = overlap_part
current_tokens = num_tokens_from_string(current_text)
current_image = current_image if overlap_part else None
if current_text:
current_text += "\n" + text
else:
current_text = text
current_tokens += sec_tokens
if sec_image:
current_image = concat_img(current_image, sec_image) if current_image else sec_image
if current_text:
merged_chunks.append(current_text)
merged_images.append(current_image)
chunks = merged_chunks
has_images = merged_images and any(img is not None for img in merged_images)
if kwargs.get("section_only", False):
chunks.extend(embed_res)
return chunks
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
if has_images:
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, merged_images))
else:
res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
else:
chunks = naive_merge(
sections, int(parser_config.get(
"chunk_token_num", 128)), parser_config.get(
"delimiter", "\n!?。;!?"))
if kwargs.get("section_only", False):
chunks.extend(embed_res)
return chunks
if section_images:
if all(image is None for image in section_images):
section_images = None
res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
if section_images:
chunks, images = naive_merge_with_images(sections, section_images,
int(parser_config.get(
"chunk_token_num", 128)), parser_config.get(
"delimiter", "\n!?。;!?"))
if kwargs.get("section_only", False):
chunks.extend(embed_res)
return chunks
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
else:
chunks = naive_merge(
sections, int(parser_config.get(
"chunk_token_num", 128)), parser_config.get(
"delimiter", "\n!?。;!?"))
if kwargs.get("section_only", False):
chunks.extend(embed_res)
return chunks
res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
if urls and parser_config.get("analyze_hyperlink", False) and is_root:
for index, url in enumerate(urls):
@ -820,9 +940,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
logging.info(f"Failed to chunk url in registered file type {url}: {e}")
sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
url_res.extend(sub_url_res)
logging.info("naive_merge({}): {}".format(filename, timer() - st))
if embed_res:
res.extend(embed_res)
if url_res:

View File

@ -482,17 +482,25 @@ class Parser(ProcessBase):
self.set_output("output_format", conf["output_format"])
markdown_parser = naive_markdown_parser()
sections, tables = markdown_parser(name, blob, separate_tables=False)
sections, tables, section_images = markdown_parser(
name,
blob,
separate_tables=False,
delimiter=conf.get("delimiter"),
return_section_images=True,
)
if conf.get("output_format") == "json":
json_results = []
for section_text, _ in sections:
for idx, (section_text, _) in enumerate(sections):
json_result = {
"text": section_text,
}
images = markdown_parser.get_pictures(section_text) if section_text else None
images = []
if section_images and len(section_images) > idx and section_images[idx] is not None:
images.append(section_images[idx])
if images:
# If multiple images found, combine them using concat_img
combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]

View File

@ -628,16 +628,8 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。
tk_nums.append(num_tokens_from_string(text))
return cks
dels = get_delimiters(delimiter)
for sec, pos in sections:
if num_tokens_from_string(sec) < chunk_token_num:
add_chunk("\n"+sec, pos)
continue
split_sec = re.split(r"(%s)" % dels, sec, flags=re.DOTALL)
for sub_sec in split_sec:
if re.match(f"^{dels}$", sub_sec):
continue
add_chunk("\n"+sub_sec, pos)
add_chunk("\n"+sec, pos)
return cks
@ -700,26 +692,18 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
tk_nums.append(num_tokens_from_string(text_seg))
return cks, result_images
dels = get_delimiters(delimiter)
for text, image in zip(texts, images):
# if text is tuple, unpack it
if isinstance(text, tuple):
text_str = text[0]
text_pos = text[1] if len(text) > 1 else ""
split_sec = re.split(r"(%s)" % dels, text_str)
for sub_sec in split_sec:
if re.match(f"^{dels}$", sub_sec):
continue
add_chunk("\n"+sub_sec, image, text_pos)
add_chunk("\n"+text_str, image, text_pos)
else:
split_sec = re.split(r"(%s)" % dels, text)
for sub_sec in split_sec:
if re.match(f"^{dels}$", sub_sec):
continue
add_chunk("\n"+sub_sec, image)
add_chunk("\n"+text, image)
return cks, result_images
def docx_question_level(p, bull=-1):
txt = re.sub(r"\u3000", " ", p.text).strip()
if p.style.name.startswith('Heading'):
@ -808,15 +792,8 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。"):
tk_nums.append(num_tokens_from_string(text_seg))
return cks, images
dels = get_delimiters(delimiter)
pattern = r"(%s)" % dels
for sec, image in sections:
split_sec = re.split(pattern, sec)
for sub_sec in split_sec:
if not sub_sec or re.match(f"^{dels}$", sub_sec):
continue
add_chunk("\n" + sub_sec, image, "")
add_chunk("\n" + sec, image, "")
return cks, images
@ -844,6 +821,7 @@ def get_delimiters(delimiters: str):
return dels_pattern
class Node:
def __init__(self, level, depth=-1, texts=None):
self.level = level

View File

@ -1,5 +1,7 @@
import message from '@/components/ui/message';
import { Spin } from '@/components/ui/spin';
import { Authorization } from '@/constants/authorization';
import { getAuthorization } from '@/utils/authorization-util';
import request from '@/utils/request';
import classNames from 'classnames';
import mammoth from 'mammoth';
@ -22,6 +24,7 @@ export const DocPreviewer: React.FC<DocPreviewerProps> = ({
const res = await request(url, {
method: 'GET',
responseType: 'blob',
headers: { [Authorization]: getAuthorization() },
onError: () => {
message.error('Document parsing failed');
console.error('Error loading document:', url);

View File

@ -1,5 +1,6 @@
import { useFetchExcel } from '@/pages/document-viewer/hooks';
// import { useFetchExcel } from '@/pages/document-viewer/hooks';
import classNames from 'classnames';
import { useFetchExcel } from './hooks';
interface ExcelCsvPreviewerProps {
className?: string;

View File

@ -1,9 +1,67 @@
import { Authorization } from '@/constants/authorization';
import { useGetKnowledgeSearchParams } from '@/hooks/route-hook';
import { useGetPipelineResultSearchParams } from '@/pages/dataflow-result/hooks';
import api, { api_host } from '@/utils/api';
import { getAuthorization } from '@/utils/authorization-util';
import jsPreviewExcel from '@js-preview/excel';
import { useSize } from 'ahooks';
import axios from 'axios';
import mammoth from 'mammoth';
import { useCallback, useEffect, useRef, useState } from 'react';
import { useCallback, useEffect, useMemo, useRef, useState } from 'react';
/**
 * Tracks the width of a container element.
 *
 * Callers attach `setContainerRef` to the element they want measured and
 * read the latest width from `containerWidth`. A falsy measurement
 * (undefined or 0) is ignored, so the last known width is kept.
 */
export const useDocumentResizeObserver = () => {
  const [containerWidth, setContainerWidth] = useState<number>();
  const [containerElement, setContainerRef] = useState<HTMLElement | null>(
    null,
  );
  const measuredWidth = useSize(containerElement)?.width;

  const applyWidth = useCallback((nextWidth?: number) => {
    if (!nextWidth) {
      return;
    }
    setContainerWidth(nextWidth);
  }, []);

  useEffect(() => {
    applyWidth(measuredWidth);
  }, [measuredWidth, applyWidth]);

  return { containerWidth, setContainerRef };
};
/**
 * Wraps the parts of `text` that relate to the search `pattern` in <mark>.
 *
 * - If the whole text item is contained in the pattern, the entire item
 *   is marked.
 * - Otherwise every literal occurrence of the pattern inside the item is
 *   marked.
 * - An empty pattern leaves the text untouched.
 *
 * Both strings are compared literally; neither is interpreted as a
 * regular expression, so text such as "c++" or "(" cannot throw.
 */
function highlightPattern(text: string, pattern: string, pageNumber: number) {
  // NOTE(review): items on page 2 are always marked in full. This looks
  // like a leftover from debugging; kept so existing behaviour is unchanged.
  if (pageNumber === 2) {
    return `<mark>${text}</mark>`;
  }
  // No search term: nothing to highlight (previously an empty
  // <mark></mark> was prepended to every item).
  if (!pattern) {
    return text;
  }
  if (text.trim() !== '' && pattern.includes(text)) {
    return `<mark>${text}</mark>`;
  }
  // split/join marks every occurrence, not just the first one.
  return text.split(pattern).join(`<mark>${pattern}</mark>`);
}
/**
 * Returns a memoized text renderer that highlights `searchText` in a text
 * item. The renderer reads the item's `str` and `pageNumber` fields and is
 * re-created only when `searchText` changes.
 */
export const useHighlightText = (searchText: string = '') => {
  return useCallback(
    (textItem: any) => {
      const content = textItem.str;
      const page = textItem.pageNumber;
      return highlightPattern(content, searchText, page);
    },
    [searchText],
  );
};
/**
 * Resolves the URL a document preview should be loaded from.
 *
 * When `isAgent` is true the URL is built from `api.downloadFile` with
 * the `id` and `created_by` values taken from the pipeline-result search
 * params; otherwise it is the `/document/get/<documentId>` endpoint under
 * `api_host`, using the knowledge search params.
 */
export const useGetDocumentUrl = (isAgent: boolean) => {
  const knowledgeParams = useGetKnowledgeSearchParams();
  const pipelineParams = useGetPipelineResultSearchParams();

  const documentId = knowledgeParams.documentId;
  const createdBy = pipelineParams.createdBy;
  const id = pipelineParams.documentId;

  return useMemo(() => {
    if (!isAgent) {
      return `${api_host}/document/get/${documentId}`;
    }
    return api.downloadFile + `?id=${id}&created_by=${createdBy}`;
  }, [createdBy, documentId, id, isAgent]);
};
export const useCatchError = (api: string) => {
const [error, setError] = useState('');

View File

@ -1,5 +1,7 @@
import message from '@/components/ui/message';
import { Spin } from '@/components/ui/spin';
import { Authorization } from '@/constants/authorization';
import { getAuthorization } from '@/utils/authorization-util';
import request from '@/utils/request';
import classNames from 'classnames';
import { useEffect, useState } from 'react';
@ -22,6 +24,7 @@ export const ImagePreviewer: React.FC<ImagePreviewerProps> = ({
const res = await request(url, {
method: 'GET',
responseType: 'blob',
headers: { [Authorization]: getAuthorization() },
onError: () => {
message.error('Failed to load image');
setIsLoading(false);

View File

@ -4,7 +4,7 @@ import CSVFileViewer from './csv-preview';
import { DocPreviewer } from './doc-preview';
import { ExcelCsvPreviewer } from './excel-preview';
import { ImagePreviewer } from './image-preview';
import styles from './index.less';
import { Md } from './md';
import PdfPreviewer, { IProps } from './pdf-preview';
import { PptPreviewer } from './ppt-preview';
import { TxtPreviewer } from './txt-preview';
@ -25,7 +25,7 @@ const Preview = ({
return (
<>
{fileType === 'pdf' && highlights && setWidthAndHeight && (
<section className={styles.documentPreview}>
<section>
<PdfPreviewer
highlights={highlights}
setWidthAndHeight={setWidthAndHeight}
@ -38,7 +38,7 @@ const Preview = ({
<DocPreviewer className={className} url={url} />
</section>
)}
{['txt', 'md'].indexOf(fileType) > -1 && (
{['txt'].indexOf(fileType) > -1 && (
<section>
<TxtPreviewer className={className} url={url} />
</section>
@ -82,6 +82,11 @@ const Preview = ({
<CSVFileViewer className={className} url={url} />
</section>
)}
{['md'].indexOf(fileType) > -1 && (
<section>
<Md className={className} url={url} />
</section>
)}
</>
);
};

View File

@ -1,31 +1,39 @@
import { Authorization } from '@/constants/authorization';
import { cn } from '@/lib/utils';
import FileError from '@/pages/document-viewer/file-error';
import { getAuthorization } from '@/utils/authorization-util';
import React, { useEffect, useState } from 'react';
import ReactMarkdown from 'react-markdown';
import remarkGfm from 'remark-gfm';
import FileError from '../file-error';
interface MdProps {
filePath: string;
// filePath: string;
className?: string;
url: string;
}
const Md: React.FC<MdProps> = ({ filePath }) => {
export const Md: React.FC<MdProps> = ({ url, className }) => {
const [content, setContent] = useState<string>('');
const [error, setError] = useState<string | null>(null);
useEffect(() => {
setError(null);
fetch(filePath)
fetch(url, { headers: { [Authorization]: getAuthorization() } })
.then((res) => {
if (!res.ok) throw new Error('Failed to fetch markdown file');
return res.text();
})
.then((text) => setContent(text))
.catch((err) => setError(err.message));
}, [filePath]);
}, [url]);
if (error) return <FileError>{error}</FileError>;
return (
<div style={{ padding: 24, height: '100vh', overflow: 'scroll' }}>
<div
style={{ padding: 4, overflow: 'scroll' }}
className={cn(className, 'markdown-body h-[calc(100vh - 200px)]')}
>
<ReactMarkdown remarkPlugins={[remarkGfm]}>{content}</ReactMarkdown>
</div>
);

View File

@ -10,13 +10,21 @@ import {
import { useCatchDocumentError } from '@/components/pdf-previewer/hooks';
import { Spin } from '@/components/ui/spin';
// import FileError from '@/pages/document-viewer/file-error';
import { Authorization } from '@/constants/authorization';
import FileError from '@/pages/document-viewer/file-error';
import { getAuthorization } from '@/utils/authorization-util';
import styles from './index.less';
type PdfLoaderProps = React.ComponentProps<typeof PdfLoader> & {
httpHeaders?: Record<string, string>;
};
const Loader = PdfLoader as React.ComponentType<PdfLoaderProps>;
export interface IProps {
highlights: IHighlight[];
setWidthAndHeight: (width: number, height: number) => void;
highlights?: IHighlight[];
setWidthAndHeight?: (width: number, height: number) => void;
url: string;
className?: string;
}
const HighlightPopup = ({
comment,
@ -30,7 +38,12 @@ const HighlightPopup = ({
) : null;
// TODO: merge with DocumentPreviewer
const PdfPreview = ({ highlights: state, setWidthAndHeight, url }: IProps) => {
const PdfPreview = ({
highlights: state,
setWidthAndHeight,
url,
className,
}: IProps) => {
// const url = useGetDocumentUrl();
const ref = useRef<(highlight: IHighlight) => void>(() => {});
@ -39,17 +52,22 @@ const PdfPreview = ({ highlights: state, setWidthAndHeight, url }: IProps) => {
const resetHash = () => {};
useEffect(() => {
if (state.length > 0) {
if (state?.length && state?.length > 0) {
ref?.current(state[0]);
}
}, [state]);
const httpHeaders = {
[Authorization]: getAuthorization(),
};
return (
<div
className={`${styles.documentContainer} rounded-[10px] overflow-hidden `}
className={`${styles.documentContainer} rounded-[10px] overflow-hidden ${className}`}
>
<PdfLoader
<Loader
url={url}
httpHeaders={httpHeaders}
beforeLoad={
<div className="absolute inset-0 flex items-center justify-center">
<Spin />
@ -63,7 +81,7 @@ const PdfPreview = ({ highlights: state, setWidthAndHeight, url }: IProps) => {
const viewport = page.getViewport({ scale: 1 });
const width = viewport.width;
const height = viewport.height;
setWidthAndHeight(width, height);
setWidthAndHeight?.(width, height);
});
return (
@ -115,11 +133,11 @@ const PdfPreview = ({ highlights: state, setWidthAndHeight, url }: IProps) => {
</Popup>
);
}}
highlights={state}
highlights={state || []}
/>
);
}}
</PdfLoader>
</Loader>
</div>
);
};

View File

@ -148,7 +148,7 @@ export const Images = [
];
// Without FileViewer
export const ExceptiveType = ['xlsx', 'xls', 'pdf', 'docx', ...Images];
export const ExceptiveType = ['xlsx', 'xls', 'pdf', 'docx', 'md', ...Images];
export const SupportedPreviewDocumentTypes = [...ExceptiveType];
//#endregion

View File

@ -1,14 +1,13 @@
import { Input } from '@/components/originui/input';
import { Button } from '@/components/ui/button';
import { SearchInput } from '@/components/ui/input';
import {
Popover,
PopoverContent,
PopoverTrigger,
} from '@/components/ui/popover';
import { Radio } from '@/components/ui/radio';
import { Segmented } from '@/components/ui/segmented';
import { useTranslate } from '@/hooks/common-hooks';
import { cn } from '@/lib/utils';
import { SearchOutlined } from '@ant-design/icons';
import { ListFilter, Plus } from 'lucide-react';
import { useState } from 'react';
import { ChunkTextMode } from '../../constant';
@ -61,46 +60,43 @@ export default ({
};
return (
<div className="flex pr-[25px]">
<div className="flex items-center gap-4 bg-bg-card text-muted-foreground w-fit h-[35px] rounded-md px-4 py-2">
{textSelectOptions.map((option) => (
<div
key={option.value}
className={cn('flex items-center cursor-pointer', {
'text-primary': option.value === textSelectValue,
})}
onClick={() => changeTextSelectValue(option.value)}
>
{option.label}
</div>
))}
</div>
<div className="ml-auto"></div>
<Input
className="bg-bg-card text-muted-foreground"
style={{ width: 200 }}
placeholder={t('search')}
icon={<SearchOutlined />}
onChange={handleInputChange}
value={searchString}
<Segmented
options={textSelectOptions}
value={textSelectValue}
onChange={changeTextSelectValue}
/>
<div className="w-[20px]"></div>
<Popover>
<PopoverTrigger asChild>
<Button className="bg-bg-card text-muted-foreground hover:bg-card">
<ListFilter />
</Button>
</PopoverTrigger>
<PopoverContent className="p-0 w-[200px]">
{filterContent}
</PopoverContent>
</Popover>
<div className="w-[20px]"></div>
<Button
onClick={() => createChunk()}
className="bg-bg-card text-primary hover:bg-card"
>
<Plus size={44} />
</Button>
<div className="ml-auto"></div>
<div className="h-8 flex items-center gap-5">
<SearchInput
// style={{ width: 200 }}
placeholder={t('search')}
// icon={<SearchOutlined />}
onChange={handleInputChange}
value={searchString}
/>
<Popover>
<PopoverTrigger asChild>
<Button
variant={'ghost'}
// className="bg-bg-card text-text-secondary hover:bg-card"
>
<ListFilter />
</Button>
</PopoverTrigger>
<PopoverContent className="p-0 w-[200px]">
{filterContent}
</PopoverContent>
</Popover>
<Button
variant={'ghost'}
onClick={() => createChunk()}
// className="bg-bg-card text-primary hover:bg-card"
>
<Plus size={44} />
</Button>
</div>
{/* <div className="w-[20px]"></div>
<div className="w-[20px]"></div> */}
</div>
);
};

View File

@ -1,21 +0,0 @@
import { formatDate } from '@/utils/date';
import { formatBytes } from '@/utils/file-util';
type Props = {
size: number;
name: string;
create_date: string;
};
export default ({ size, name, create_date }: Props) => {
const sizeName = formatBytes(size);
const dateStr = formatDate(create_date);
return (
<div>
<h2 className="text-[24px]">{name}</h2>
<div className="text-[#979AAB] pt-[5px]">
Size{sizeName} Uploaded Time{dateStr}
</div>
</div>
);
};

View File

@ -1,25 +0,0 @@
import { useFetchExcel } from '@/pages/document-viewer/hooks';
import classNames from 'classnames';
interface ExcelCsvPreviewerProps {
className?: string;
url: string;
}
export const ExcelCsvPreviewer: React.FC<ExcelCsvPreviewerProps> = ({
className,
url,
}) => {
// const url = useGetDocumentUrl();
const { containerRef } = useFetchExcel(url);
return (
<div
ref={containerRef}
className={classNames(
'relative w-full h-full p-4 bg-background-paper border border-border-normal rounded-md excel-csv-previewer',
className,
)}
></div>
);
};

View File

@ -1,55 +0,0 @@
import { useGetKnowledgeSearchParams } from '@/hooks/route-hook';
import { api_host } from '@/utils/api';
import { useSize } from 'ahooks';
import { CustomTextRenderer } from 'node_modules/react-pdf/dist/esm/shared/types';
import { useCallback, useEffect, useMemo, useState } from 'react';
export const useDocumentResizeObserver = () => {
const [containerWidth, setContainerWidth] = useState<number>();
const [containerRef, setContainerRef] = useState<HTMLElement | null>(null);
const size = useSize(containerRef);
const onResize = useCallback((width?: number) => {
if (width) {
setContainerWidth(width);
}
}, []);
useEffect(() => {
onResize(size?.width);
}, [size?.width, onResize]);
return { containerWidth, setContainerRef };
};
function highlightPattern(text: string, pattern: string, pageNumber: number) {
if (pageNumber === 2) {
return `<mark>${text}</mark>`;
}
if (text.trim() !== '' && pattern.match(text)) {
// return pattern.replace(text, (value) => `<mark>${value}</mark>`);
return `<mark>${text}</mark>`;
}
return text.replace(pattern, (value) => `<mark>${value}</mark>`);
}
/**
 * Returns a memoized text renderer that highlights `searchText` in each text
 * item by delegating to `highlightPattern`. The renderer identity only
 * changes when `searchText` changes.
 */
export const useHighlightText = (searchText: string = '') => {
  return useCallback<CustomTextRenderer>(
    ({ str, pageNumber }) => highlightPattern(str, searchText, pageNumber),
    [searchText],
  );
};
/**
 * Builds the download URL of the document whose id is read from the
 * knowledge search params. Recomputed only when the id changes.
 */
export const useGetDocumentUrl = () => {
  const { documentId } = useGetKnowledgeSearchParams();

  return useMemo(
    () => `${api_host}/document/get/${documentId}`,
    [documentId],
  );
};

View File

@ -1,74 +0,0 @@
import message from '@/components/ui/message';
import { Spin } from '@/components/ui/spin';
import request from '@/utils/request';
import classNames from 'classnames';
import { useCallback, useEffect, useState } from 'react';
/** Props accepted by {@link ImagePreviewer}. */
interface ImagePreviewerProps {
  className?: string;
  url: string;
}

/**
 * Downloads the image at `url` as a blob and displays it through an object
 * URL, showing a spinner while the download is in flight.
 */
export const ImagePreviewer: React.FC<ImagePreviewerProps> = ({
  className,
  url,
}) => {
  const [imageSrc, setImageSrc] = useState<string | null>(null);
  const [isLoading, setIsLoading] = useState<boolean>(true);

  useEffect(() => {
    if (!url) {
      return;
    }

    // Set by the cleanup so a response for an outdated `url` (or one that
    // arrives after unmount) never touches state.
    let cancelled = false;

    const fetchImage = async () => {
      setIsLoading(true);
      try {
        const res = await request(url, {
          method: 'GET',
          responseType: 'blob',
          onError: () => {
            message.error('Failed to load image');
          },
        });
        if (cancelled) {
          return;
        }
        // A failed request may resolve without a blob payload; creating an
        // object URL from anything else would throw.
        setImageSrc(
          res?.data instanceof Blob ? URL.createObjectURL(res.data) : null,
        );
      } catch (err) {
        if (!cancelled) {
          message.error('Failed to load image');
          setImageSrc(null);
        }
        console.error('Error loading image:', err);
      } finally {
        // Always clear the spinner, including on failure.
        if (!cancelled) {
          setIsLoading(false);
        }
      }
    };

    fetchImage();

    return () => {
      cancelled = true;
    };
  }, [url]);

  // Release the previous object URL whenever it is replaced or on unmount.
  useEffect(() => {
    return () => {
      if (imageSrc) {
        URL.revokeObjectURL(imageSrc);
      }
    };
  }, [imageSrc]);

  return (
    <div
      className={classNames(
        'relative w-full h-full p-4 bg-background-paper border border-border-normal rounded-md image-previewer',
        className,
      )}
    >
      {isLoading && (
        <div className="absolute inset-0 flex items-center justify-center">
          <Spin />
        </div>
      )}

      {!isLoading && imageSrc && (
        <div className="max-h-[80vh] overflow-auto p-2">
          <img
            src={imageSrc}
            alt={'image'}
            className="w-full h-auto max-w-full object-contain"
            // The decoded image no longer needs the blob URL once loaded.
            onLoad={() => URL.revokeObjectURL(imageSrc!)}
          />
        </div>
      )}
    </div>
  );
};

View File

@ -7,7 +7,6 @@ import { useCallback, useEffect, useMemo, useState } from 'react';
import { useTranslation } from 'react-i18next';
import ChunkCard from './components/chunk-card';
import CreatingModal from './components/chunk-creating-modal';
import DocumentPreview from './components/document-preview';
import {
useChangeChunkTextMode,
useDeleteChunkByIds,
@ -18,8 +17,11 @@ import {
import ChunkResultBar from './components/chunk-result-bar';
import CheckboxSets from './components/chunk-result-bar/checkbox-sets';
import DocumentHeader from './components/document-preview/document-header';
// import DocumentHeader from './components/document-preview/document-header';
import DocumentPreview from '@/components/document-preview';
import DocumentHeader from '@/components/document-preview/document-header';
import { useGetDocumentUrl } from '@/components/document-preview/hooks';
import { PageHeader } from '@/components/page-header';
import {
Breadcrumb,
@ -40,7 +42,6 @@ import {
useNavigatePage,
} from '@/hooks/logic-hooks/navigate-hooks';
import { useFetchKnowledgeBaseConfiguration } from '@/hooks/use-knowledge-request';
import { useGetDocumentUrl } from './components/document-preview/hooks';
import styles from './index.less';
const Chunk = () => {
@ -74,7 +75,7 @@ const Chunk = () => {
} = useUpdateChunk();
const { navigateToDataFile, getQueryString, navigateToDatasetList } =
useNavigatePage();
const fileUrl = useGetDocumentUrl();
const fileUrl = useGetDocumentUrl(false);
useEffect(() => {
setChunkList(data);
}, [data]);

View File

@ -1,114 +0,0 @@
import message from '@/components/ui/message';
import { Spin } from '@/components/ui/spin';
import request from '@/utils/request';
import classNames from 'classnames';
import React, { useEffect, useRef, useState } from 'react';
/** Parsed CSV content: the first line as headers, the rest as rows. */
interface CSVData {
  rows: string[][];
  headers: string[];
}

/** Props accepted by {@link CSVFileViewer}. */
interface FileViewerProps {
  className?: string;
  url: string;
}

/**
 * Splits CSV text into trimmed header and row cells.
 *
 * Blank lines (including the one produced by a trailing newline) are skipped
 * so they do not render as empty rows.
 * NOTE(review): cells are split on every comma, so quoted fields that
 * contain commas or newlines are not supported.
 */
const parseCSV = (csvText: string): CSVData => {
  const splitCells = (line: string) =>
    line.split(',').map((cell) => cell.trim());
  const [headerLine = '', ...bodyLines] = csvText
    .split(/\r?\n/)
    .filter((line) => line.trim() !== '');

  return { headers: splitCells(headerLine), rows: bodyLines.map(splitCells) };
};

/**
 * Downloads the CSV file at `url` and renders it as an HTML table, showing a
 * spinner until the file has been downloaded and parsed.
 */
const CSVFileViewer: React.FC<FileViewerProps> = ({ className, url }) => {
  const [data, setData] = useState<CSVData | null>(null);
  const [isLoading, setIsLoading] = useState<boolean>(true);
  const containerRef = useRef<HTMLDivElement>(null);

  useEffect(() => {
    // Set by the cleanup so an outdated response never touches state.
    let cancelled = false;

    const loadCSV = async () => {
      setIsLoading(true);
      try {
        const res = await request(url, {
          method: 'GET',
          responseType: 'blob',
          onError: () => {
            message.error('file load failed');
          },
        });
        // A failed request may resolve without a blob payload.
        if (cancelled || !res?.data) {
          return;
        }
        // Awaiting the text keeps the spinner up until parsing is done and
        // routes read failures to the catch below.
        const csvText: string = await res.data.text();
        if (!cancelled) {
          setData(parseCSV(csvText));
        }
      } catch (error) {
        if (!cancelled) {
          message.error('CSV file parse failed');
        }
        console.error('Error loading CSV file:', error);
      } finally {
        if (!cancelled) {
          setIsLoading(false);
        }
      }
    };

    loadCSV();

    return () => {
      cancelled = true;
      setData(null);
    };
  }, [url]);

  return (
    <div
      ref={containerRef}
      className={classNames(
        'relative w-full h-full p-4 bg-background-paper border border-border-normal rounded-md',
        'overflow-auto max-h-[80vh] p-2',
        className,
      )}
    >
      {isLoading ? (
        <div className="absolute inset-0 flex items-center justify-center">
          <Spin />
        </div>
      ) : data ? (
        <table className="min-w-full divide-y divide-border-normal">
          <thead className="bg-background-header-bar">
            <tr>
              {data.headers.map((header, index) => (
                <th
                  key={`header-${index}`}
                  className="px-6 py-3 text-left text-sm font-medium text-text-primary"
                >
                  {header}
                </th>
              ))}
            </tr>
          </thead>
          <tbody className="bg-background-paper divide-y divide-border-normal">
            {data.rows.map((row, rowIndex) => (
              <tr key={`row-${rowIndex}`}>
                {row.map((cell, cellIndex) => (
                  <td
                    key={`cell-${rowIndex}-${cellIndex}`}
                    className="px-6 py-4 whitespace-nowrap text-sm text-text-secondary"
                  >
                    {cell || '-'}
                  </td>
                ))}
              </tr>
            ))}
          </tbody>
        </table>
      ) : null}
    </div>
  );
};

export default CSVFileViewer;

View File

@ -1,70 +0,0 @@
import message from '@/components/ui/message';
import { Spin } from '@/components/ui/spin';
import request from '@/utils/request';
import classNames from 'classnames';
import mammoth from 'mammoth';
import { useEffect, useState } from 'react';
/** Props accepted by {@link DocPreviewer}. */
interface DocPreviewerProps {
  className?: string;
  url: string;
}

/**
 * Downloads the Word document at `url`, converts it to HTML with mammoth and
 * renders the result, showing a spinner while loading.
 */
export const DocPreviewer: React.FC<DocPreviewerProps> = ({
  className,
  url,
}) => {
  const [htmlContent, setHtmlContent] = useState<string>('');
  const [loading, setLoading] = useState(false);

  useEffect(() => {
    if (!url) {
      return;
    }

    // Set by the cleanup so an outdated response never touches state.
    let cancelled = false;

    const fetchDocument = async () => {
      setLoading(true);
      try {
        const res = await request(url, {
          method: 'GET',
          responseType: 'blob',
          onError: () => {
            message.error('Document parsing failed');
            console.error('Error loading document:', url);
          },
        });
        // A failed request may resolve without a blob payload; presumably
        // `onError` has already reported it, so avoid a second toast.
        if (cancelled || !res?.data) {
          return;
        }

        const arrayBuffer = await res.data.arrayBuffer();
        const result = await mammoth.convertToHtml(
          { arrayBuffer },
          { includeDefaultStyleMap: true },
        );
        if (cancelled) {
          return;
        }

        // Add spacing/weight classes to the bare tags mammoth emits.
        const styledContent = result.value
          .replace(/<p>/g, '<p class="mb-2">')
          .replace(/<h(\d)>/g, '<h$1 class="font-semibold mt-4 mb-2">');
        setHtmlContent(styledContent);
      } catch (err) {
        if (!cancelled) {
          message.error('Document parsing failed');
        }
        console.error('Error parsing document:', err);
      } finally {
        // Always clear the spinner, including when the request itself throws.
        if (!cancelled) {
          setLoading(false);
        }
      }
    };

    fetchDocument();

    return () => {
      cancelled = true;
    };
  }, [url]);

  return (
    <div
      className={classNames(
        'relative w-full h-full p-4 bg-background-paper border border-border-normal rounded-md',
        className,
      )}
    >
      {loading && (
        <div className="absolute inset-0 flex items-center justify-center">
          <Spin />
        </div>
      )}

      {/* NOTE(review): the converted HTML is injected without sanitisation;
          confirm the documents are trusted or sanitise before rendering. */}
      {!loading && <div dangerouslySetInnerHTML={{ __html: htmlContent }} />}
    </div>
  );
};

View File

@ -1,60 +0,0 @@
import { useGetKnowledgeSearchParams } from '@/hooks/route-hook';
import api, { api_host } from '@/utils/api';
import { useSize } from 'ahooks';
import { CustomTextRenderer } from 'node_modules/react-pdf/dist/esm/shared/types';
import { useCallback, useEffect, useMemo, useState } from 'react';
import { useGetPipelineResultSearchParams } from '../../hooks';
/**
 * Tracks the width of a container element.
 *
 * Returns `setContainerRef` (to be attached to the element being measured)
 * and `containerWidth`, the last truthy width reported by `useSize`.
 * Undefined or zero widths are ignored, so the previous value is kept.
 */
export const useDocumentResizeObserver = () => {
  const [containerWidth, setContainerWidth] = useState<number>();
  const [containerRef, setContainerRef] = useState<HTMLElement | null>(null);
  const measuredWidth = useSize(containerRef)?.width;

  useEffect(() => {
    if (measuredWidth) {
      setContainerWidth(measuredWidth);
    }
  }, [measuredWidth]);

  return { containerWidth, setContainerRef };
};
/**
 * Wraps the parts of a PDF text item that relate to the search pattern in
 * `<mark>` tags and returns the resulting HTML string.
 *
 * @param text - Text of a single PDF text-layer item.
 * @param pattern - The search text to highlight.
 * @param pageNumber - Page the text item belongs to.
 * @returns `text`, with the matching portion wrapped in `<mark>…</mark>`.
 */
function highlightPattern(text: string, pattern: string, pageNumber: number) {
  // NOTE(review): every item on page 2 is highlighted unconditionally. This
  // looks like a debugging leftover; kept to preserve behavior — confirm.
  if (pageNumber === 2) {
    return `<mark>${text}</mark>`;
  }

  // The text item is a fragment of the search text: highlight it entirely.
  // `includes` does a literal comparison; the previous `pattern.match(text)`
  // compiled `text` as a regular expression, which throws on input such as
  // "(" and produces false matches for characters like ".".
  if (text.trim() !== '' && pattern.includes(text)) {
    return `<mark>${text}</mark>`;
  }

  // Nothing to search for: replacing '' would prepend an empty <mark></mark>.
  if (pattern === '') {
    return text;
  }

  // The search text is contained in the item: highlight its first occurrence.
  return text.replace(pattern, (value) => `<mark>${value}</mark>`);
}
/**
 * Returns a memoized text renderer that highlights `searchText` in each text
 * item by delegating to `highlightPattern`. The renderer identity only
 * changes when `searchText` changes.
 */
export const useHighlightText = (searchText: string = '') => {
  return useCallback<CustomTextRenderer>(
    ({ str, pageNumber }) => highlightPattern(str, searchText, pageNumber),
    [searchText],
  );
};
/**
 * Builds the URL used to download the current document.
 *
 * When `isAgent` is true the URL is `api.downloadFile` with the id and
 * creator taken from the pipeline-result search params; otherwise it is the
 * `/document/get/<id>` endpoint using the knowledge search params.
 */
export const useGetDocumentUrl = (isAgent: boolean) => {
  const { documentId } = useGetKnowledgeSearchParams();
  const { createdBy, documentId: id } = useGetPipelineResultSearchParams();

  return useMemo(
    () =>
      isAgent
        ? api.downloadFile + `?id=${id}&created_by=${createdBy}`
        : `${api_host}/document/get/${documentId}`,
    [createdBy, documentId, id, isAgent],
  );
};

View File

@ -1,13 +0,0 @@
// Container for the PDF preview: full width, with a height derived from the
// viewport, and positioned so absolutely-placed children anchor to it.
.documentContainer {
width: 100%;
// height: calc(100vh - 284px);
// Viewport height minus a fixed 180px offset.
height: calc(100vh - 180px);
position: relative;
// Un-scoped selector: hides horizontal overflow of the `.PdfHighlighter`
// element (presumably rendered by react-pdf-highlighter — confirm).
:global(.PdfHighlighter) {
overflow-x: hidden;
}
// Background colour applied to the parts of the highlight that is currently
// scrolled to.
:global(.Highlight--scrolledTo .Highlight__part) {
overflow-x: hidden;
background-color: rgba(255, 226, 143, 1);
}
}

View File

@ -1,67 +0,0 @@
import { memo } from 'react';
import CSVFileViewer from './csv-preview';
import { DocPreviewer } from './doc-preview';
import { ExcelCsvPreviewer } from './excel-preview';
import { ImagePreviewer } from './image-preview';
import PdfPreviewer, { IProps } from './pdf-preview';
import { PptPreviewer } from './ppt-preview';
import { TxtPreviewer } from './txt-preview';
/** Props shared by every previewer; PDF-specific props come from `IProps`. */
type PreviewProps = {
  fileType: string;
  className?: string;
  url: string;
};

/**
 * Picks the previewer matching `fileType` and renders it inside a
 * `<section>`. The PDF previewer is only rendered when both `highlights`
 * and `setWidthAndHeight` are provided. Unknown file types render nothing.
 */
const Preview = ({
  fileType,
  className,
  highlights,
  setWidthAndHeight,
  url,
}: PreviewProps & Partial<IProps>) => {
  const isOneOf = (...types: string[]) => types.includes(fileType);

  return (
    <>
      {fileType === 'pdf' && highlights && setWidthAndHeight && (
        <section>
          <PdfPreviewer
            highlights={highlights}
            setWidthAndHeight={setWidthAndHeight}
            url={url}
          ></PdfPreviewer>
        </section>
      )}

      {isOneOf('doc', 'docx') && (
        <section>
          <DocPreviewer className={className} url={url} />
        </section>
      )}

      {isOneOf('txt', 'md') && (
        <section>
          <TxtPreviewer className={className} url={url} />
        </section>
      )}

      {isOneOf('visual') && (
        <section>
          <ImagePreviewer className={className} url={url} />
        </section>
      )}

      {isOneOf('pptx') && (
        <section>
          <PptPreviewer className={className} url={url} />
        </section>
      )}

      {isOneOf('xlsx') && (
        <section>
          <ExcelCsvPreviewer className={className} url={url} />
        </section>
      )}

      {isOneOf('csv') && (
        <section>
          <CSVFileViewer className={className} url={url} />
        </section>
      )}
    </>
  );
};

export default memo(Preview);

View File

@ -1,127 +0,0 @@
import { memo, useEffect, useRef } from 'react';
import {
AreaHighlight,
Highlight,
IHighlight,
PdfHighlighter,
PdfLoader,
Popup,
} from 'react-pdf-highlighter';
import { useCatchDocumentError } from '@/components/pdf-previewer/hooks';
import { Spin } from '@/components/ui/spin';
import FileError from '@/pages/document-viewer/file-error';
import styles from './index.less';
/** Props of the PDF previewer. */
export interface IProps {
/** Highlights to draw; the viewer scrolls to the first one when the list changes. */
highlights: IHighlight[];
/** Called with the width and height of page 1's viewport at scale 1. */
setWidthAndHeight: (width: number, height: number) => void;
/** Location of the PDF file, passed to `PdfLoader`. */
url: string;
}
/**
 * Popup content for a highlight: its emoji followed by its comment text.
 * Renders nothing when the comment has no text.
 */
const HighlightPopup = ({
  comment,
}: {
  comment: { text: string; emoji: string };
}) => {
  if (!comment.text) {
    return null;
  }

  return (
    <div className="Highlight__popup">
      {comment.emoji} {comment.text}
    </div>
  );
};
// TODO: merge with DocumentPreviewer
/**
 * Loads the PDF at `url` and renders it with the given highlights.
 *
 * Whenever the highlight list changes and is non-empty, the viewer scrolls
 * to its first entry. Each time the loaded document is rendered, the size of
 * page 1 (at scale 1) is reported through `setWidthAndHeight`.
 */
const PdfPreview = ({ highlights, setWidthAndHeight, url }: IProps) => {
  // Holds the scroll function handed over by PdfHighlighter; no-op until then.
  const scrollToRef = useRef<(highlight: IHighlight) => void>(() => {});
  const error = useCatchDocumentError(url);

  const handleScrollChange = () => {};

  useEffect(() => {
    if (highlights.length > 0) {
      scrollToRef.current(highlights[0]);
    }
  }, [highlights]);

  const loadingIndicator = (
    <div className="absolute inset-0 flex items-center justify-center">
      <Spin />
    </div>
  );

  return (
    <div
      className={`${styles.documentContainer} rounded-[10px] overflow-hidden `}
    >
      <PdfLoader
        url={url}
        beforeLoad={loadingIndicator}
        workerSrc="/pdfjs-dist/pdf.worker.min.js"
        errorMessage={<FileError>{error}</FileError>}
      >
        {(pdfDocument) => {
          pdfDocument.getPage(1).then((firstPage) => {
            const { width, height } = firstPage.getViewport({ scale: 1 });
            setWidthAndHeight(width, height);
          });

          return (
            <PdfHighlighter
              pdfDocument={pdfDocument}
              enableAreaSelection={(event) => event.altKey}
              onScrollChange={handleScrollChange}
              scrollRef={(scrollTo) => {
                scrollToRef.current = scrollTo;
              }}
              onSelectionFinished={() => null}
              highlightTransform={(
                highlight,
                index,
                setTip,
                hideTip,
                _viewportToScaled,
                _screenshot,
                isScrolledTo,
              ) => {
                // Highlights carrying an image are area highlights; all
                // others are rendered as text highlights.
                const hasImage = Boolean(highlight.content?.image);

                return (
                  <Popup
                    popupContent={<HighlightPopup {...highlight} />}
                    onMouseOver={(popupContent) =>
                      setTip(highlight, () => popupContent)
                    }
                    onMouseOut={hideTip}
                    key={index}
                  >
                    {hasImage ? (
                      <AreaHighlight
                        isScrolledTo={isScrolledTo}
                        highlight={highlight}
                        onChange={() => {}}
                      />
                    ) : (
                      <Highlight
                        isScrolledTo={isScrolledTo}
                        position={highlight.position}
                        comment={highlight.comment}
                      />
                    )}
                  </Popup>
                );
              }}
              highlights={highlights}
            />
          );
        }}
      </PdfLoader>
    </div>
  );
};

export default memo(PdfPreview);

View File

@ -1,70 +0,0 @@
import message from '@/components/ui/message';
import request from '@/utils/request';
import classNames from 'classnames';
import { init } from 'pptx-preview';
import { useEffect, useRef } from 'react';
/** Props accepted by {@link PptPreviewer}. */
interface PptPreviewerProps {
  className?: string;
  url: string;
}

/**
 * Downloads the presentation at `url` and renders it into the component's
 * container with `pptx-preview`, sized 50px smaller than the container in
 * both dimensions.
 */
export const PptPreviewer: React.FC<PptPreviewerProps> = ({
  className,
  url,
}) => {
  const wrapper = useRef<HTMLDivElement>(null);
  const containerRef = useRef<HTMLDivElement>(null);

  useEffect(() => {
    if (!url) {
      return;
    }

    // Set by the cleanup so an outdated response is never rendered.
    let cancelled = false;

    const fetchDocument = async () => {
      try {
        const res = await request(url, {
          method: 'GET',
          responseType: 'blob',
          onError: () => {
            message.error('Document parsing failed');
            console.error('Error loading document:', url);
          },
        });
        const container = containerRef.current;
        // A failed request may resolve without a blob payload; presumably
        // `onError` has already reported it.
        if (cancelled || !res?.data || !container) {
          return;
        }

        const arrayBuffer = await res.data.arrayBuffer();
        if (cancelled) {
          return;
        }

        const pptxPreviewer = init(container, {
          width: container.clientWidth - 50,
          height: container.clientHeight - 50,
        });
        // Awaited so that a failing preview reaches the catch below
        // (harmless if `preview` turns out to be synchronous).
        await pptxPreviewer.preview(arrayBuffer);
      } catch (err) {
        message.error('ppt parse failed');
        console.error('Error parsing ppt:', err);
      }
    };

    fetchDocument();

    return () => {
      cancelled = true;
    };
  }, [url]);

  return (
    <div
      ref={containerRef}
      className={classNames(
        'relative w-full h-full p-4 bg-background-paper border border-border-normal rounded-md ppt-previewer',
        className,
      )}
    >
      <div className="overflow-auto p-2">
        <div className="flex flex-col gap-4">
          <div ref={wrapper} />
        </div>
      </div>
    </div>
  );
};

View File

@ -1,56 +0,0 @@
import message from '@/components/ui/message';
import { Spin } from '@/components/ui/spin';
import request from '@/utils/request';
import classNames from 'classnames';
import { useEffect, useState } from 'react';
/** Props accepted by {@link TxtPreviewer}. */
type TxtPreviewerProps = { className?: string; url: string };

/**
 * Downloads the text file at `url` and renders its content in a `<pre>`,
 * showing a spinner while loading. An empty `url` clears the content.
 */
export const TxtPreviewer = ({ className, url }: TxtPreviewerProps) => {
  const [loading, setLoading] = useState(false);
  const [data, setData] = useState<string>('');

  useEffect(() => {
    if (!url) {
      setLoading(false);
      setData('');
      return;
    }

    // Set by the cleanup so an outdated response never touches state.
    let cancelled = false;

    const fetchTxt = async () => {
      setLoading(true);
      try {
        const res = await request(url, {
          method: 'GET',
          responseType: 'blob',
          onError: (err: any) => {
            message.error('Failed to load file');
            console.error('Error loading file:', err);
          },
        });
        // A failed request may resolve without a blob payload; reading it
        // would throw and leave the spinner up forever.
        if (cancelled || !res?.data) {
          return;
        }

        // blob to string
        const text: string = await res.data.text();
        if (!cancelled) {
          setData(text);
        }
      } catch (err) {
        if (!cancelled) {
          message.error('Failed to load file');
        }
        console.error('Error loading file:', err);
      } finally {
        if (!cancelled) {
          setLoading(false);
        }
      }
    };

    fetchTxt();

    return () => {
      cancelled = true;
    };
  }, [url]);

  return (
    <div
      className={classNames(
        'relative w-full h-full p-4 bg-background-paper border border-border-normal rounded-md',
        className,
      )}
    >
      {loading && (
        <div className="absolute inset-0 flex items-center justify-center">
          <Spin />
        </div>
      )}

      {!loading && <pre className="whitespace-pre-wrap p-2 ">{data}</pre>}
    </div>
  );
};

View File

@ -1,7 +1,7 @@
import DocumentPreview from '@/components/document-preview';
import { useFetchNextChunkList } from '@/hooks/use-chunk-request';
import { useMemo, useState } from 'react';
import { useTranslation } from 'react-i18next';
import DocumentPreview from './components/document-preview';
import {
useFetchPipelineFileLogDetail,
useFetchPipelineResult,
@ -13,8 +13,9 @@ import {
useTimelineDataFlow,
} from './hooks';
import DocumentHeader from './components/document-preview/document-header';
import DocumentHeader from '@/components/document-preview/document-header';
import { useGetDocumentUrl } from '@/components/document-preview/hooks';
import { TimelineNode } from '@/components/originui/timeline';
import { PageHeader } from '@/components/page-header';
import Spotlight from '@/components/spotlight';
@ -32,7 +33,6 @@ import { AgentCategory } from '@/constants/agent';
import { Images } from '@/constants/common';
import { useNavigatePage } from '@/hooks/logic-hooks/navigate-hooks';
import { useGetKnowledgeSearchParams } from '@/hooks/route-hook';
import { useGetDocumentUrl } from './components/document-preview/hooks';
import TimelineDataFlow from './components/time-line';
import { TimelineNodeType } from './constant';
import styles from './index.less';
@ -76,13 +76,14 @@ const Chunk = () => {
const fileType = useMemo(() => {
if (isAgent) {
return Images.some((x) => x === documentExtension)
? 'visual'
? documentInfo?.name.split('.').pop() || 'visual'
: documentExtension;
}
switch (documentInfo?.type) {
case 'doc':
return documentInfo?.name.split('.').pop() || 'doc';
case 'visual':
return documentInfo?.name.split('.').pop() || 'visual';
case 'docx':
case 'txt':
case 'md':

View File

@ -1,282 +0,0 @@
// Copyright (c) 2017 PlanGrid, Inc.
// Scrollable wrapper for the rendered .docx content. The nested rules give
// the converted document browser-default-like styles for common HTML
// elements, scoped to this wrapper.
.docxViewerWrapper {
  overflow-y: scroll;
  height: 100%;
  width: 100%;

  .box {
    width: 100%;
    height: 100%;
  }

  :global(.document-container) {
    padding: 30px;
    width: 700px;
    background: rgba(255, 255, 255, 0.1);
    margin: auto;
  }

  // Block-level elements. `address` was previously fused into the
  // non-existent selector `bodyaddress`, so it never received these rules.
  html,
  address,
  blockquote,
  body,
  dd,
  div,
  dl,
  dt,
  fieldset,
  form,
  frame,
  frameset,
  h1,
  h2,
  h3,
  h4,
  h5,
  h6,
  noframes,
  ol,
  p,
  ul,
  center,
  dir,
  hr,
  menu,
  pre {
    display: block;
    unicode-bidi: embed;
  }
  li {
    display: list-item;
    list-style-type: disc;
  }
  head {
    display: none;
  }
  table {
    display: table;
  }
  img {
    width: 100%;
  }
  tr {
    display: table-row;
  }
  thead {
    display: table-header-group;
  }
  tbody {
    display: table-row-group;
  }
  tfoot {
    display: table-footer-group;
  }
  col {
    display: table-column;
  }
  colgroup {
    display: table-column-group;
  }
  th {
    display: table-cell;
  }
  td {
    display: table-cell;
    border-bottom: 1px solid #ccc;
    border-right: 1px solid #ccc;
    padding: 0.2em 0.5em;
  }
  caption {
    display: table-caption;
  }
  th {
    font-weight: bolder;
    text-align: center;
  }
  caption {
    text-align: center;
  }
  body {
    margin: 8px;
  }
  h1 {
    font-size: 2em;
    margin: 0.67em 0;
  }
  h2 {
    font-size: 1.5em;
    margin: 0.75em 0;
  }
  h3 {
    font-size: 1.17em;
    margin: 0.83em 0;
  }
  h4,
  p,
  blockquote,
  ul,
  fieldset,
  form,
  ol,
  dl,
  dir,
  menu {
    margin: 1.12em 0;
  }
  h5 {
    font-size: 0.83em;
    margin: 1.5em 0;
  }
  h6 {
    font-size: 0.75em;
    margin: 1.67em 0;
  }
  h1,
  h2,
  h3,
  h4,
  h5,
  h6,
  b,
  strong {
    font-weight: bolder;
  }
  blockquote {
    margin-left: 40px;
    margin-right: 40px;
  }
  i,
  cite,
  em,
  var,
  address {
    font-style: italic;
  }
  pre,
  tt,
  code,
  kbd,
  samp {
    font-family: monospace;
  }
  pre {
    white-space: pre;
  }
  button,
  textarea,
  input,
  select {
    display: inline-block;
  }
  big {
    font-size: 1.17em;
  }
  small,
  sub,
  sup {
    font-size: 0.83em;
  }
  sub {
    vertical-align: sub;
  }
  sup {
    vertical-align: super;
  }
  table {
    border-spacing: 2px;
  }
  thead,
  tbody,
  tfoot {
    vertical-align: middle;
  }
  td,
  th,
  tr {
    vertical-align: inherit;
  }
  s,
  strike,
  del {
    text-decoration: line-through;
  }
  hr {
    border: 1px inset;
  }
  ol,
  ul,
  dir,
  menu,
  dd {
    margin-left: 40px;
  }
  ol {
    list-style-type: decimal;
  }
  // Nested lists carry no extra vertical margin (each selector was
  // previously listed twice).
  ol ul,
  ul ol,
  ul ul,
  ol ol {
    margin-top: 0;
    margin-bottom: 0;
  }
  u,
  ins {
    text-decoration: underline;
  }
  br:before {
    content: '\A';
    white-space: pre-line;
  }
  center {
    text-align: center;
  }
  :link,
  :visited {
    text-decoration: underline;
  }
  :focus {
    outline: thin dotted invert;
  }

  /* Begin bidirectionality settings (do not change) */
  BDO[DIR='ltr'] {
    direction: ltr;
    unicode-bidi: bidi-override;
  }
  BDO[DIR='rtl'] {
    direction: rtl;
    unicode-bidi: bidi-override;
  }

  *[DIR='ltr'] {
    direction: ltr;
    unicode-bidi: embed;
  }
  *[DIR='rtl'] {
    direction: rtl;
    unicode-bidi: embed;
  }

  @media print {
    h1 {
      page-break-before: always;
    }
    h1,
    h2,
    h3,
    h4,
    h5,
    h6 {
      page-break-after: avoid;
    }
    ul,
    ol,
    dl {
      page-break-before: avoid;
    }
  }
}

View File

@ -1,25 +0,0 @@
import { Spin } from 'antd';
import FileError from '../file-error';
import { useFetchDocx } from '../hooks';
import styles from './index.less';
/**
 * Previews the .docx file at `filePath`.
 *
 * `useFetchDocx` supplies the container ref, a success flag and an error.
 * While `succeed` is truthy the container (initially holding a spinner) is
 * rendered; otherwise the error is shown through `FileError`.
 */
const Docx = ({ filePath }: { filePath: string }) => {
  const { succeed, containerRef, error } = useFetchDocx(filePath);

  if (!succeed) {
    return <FileError>{error}</FileError>;
  }

  return (
    <section className={styles.docxViewerWrapper}>
      <div id="docx" ref={containerRef} className={styles.box}>
        <Spin />
      </div>
    </section>
  );
};

export default Docx;

View File

@ -1,19 +0,0 @@
import '@js-preview/excel/lib/index.css';
import FileError from '../file-error';
import { useFetchExcel } from '../hooks';
/**
 * Previews the spreadsheet at `filePath` inside a full-size container whose
 * ref is handed to `useFetchExcel`. The container's child is `status` when
 * it is truthy, otherwise the error wrapped in `FileError`.
 */
const Excel = ({ filePath }: { filePath: string }) => {
  const { status, containerRef, error } = useFetchExcel(filePath);
  const fullSize = { height: '100%', width: '100%' };

  return (
    <div id="excel" ref={containerRef} style={fullSize}>
      {status ? status : <FileError>{error}</FileError>}
    </div>
  );
};

export default Excel;

View File

@ -1,4 +0,0 @@
// Makes the file-error wrapper fill its parent in both dimensions.
.errorWrapper {
width: 100%;
height: 100%;
}

View File

@ -1,18 +1,18 @@
import { Alert, Flex } from 'antd';
import { useTranslate } from '@/hooks/common-hooks';
import React from 'react';
import styles from './index.less';
const FileError = ({ children }: React.PropsWithChildren) => {
const { t } = useTranslate('fileManager');
return (
<Flex align="center" justify="center" className={styles.errorWrapper}>
<Alert
type="error"
message={<h2>{children || t('fileError')}</h2>}
></Alert>
</Flex>
<div className="flex items-center justify-center min-h-screen">
<div className="bg-state-error-5 border border-state-error rounded-lg p-4 shadow-sm">
<div className="flex ml-3">
<div className="text-white font-medium">
{children || t('fileError')}
</div>
</div>
</div>
</div>
);
};

View File

@ -1,16 +1,22 @@
import { Images } from '@/constants/common';
import { api_host } from '@/utils/api';
import { Flex } from 'antd';
// import { Flex } from 'antd';
import { useParams, useSearchParams } from 'umi';
import Docx from './docx';
import Excel from './excel';
import Image from './image';
import Md from './md';
import Pdf from './pdf';
import Text from './text';
// import Docx from './docx';
// import Excel from './excel';
// import Image from './image';
// import Md from './md';
// import Pdf from './pdf';
// import Text from './text';
import { DocPreviewer } from '@/components/document-preview/doc-preview';
import { ExcelCsvPreviewer } from '@/components/document-preview/excel-preview';
import { ImagePreviewer } from '@/components/document-preview/image-preview';
import Md from '@/components/document-preview/md';
import PdfPreview from '@/components/document-preview/pdf-preview';
import { TxtPreviewer } from '@/components/document-preview/txt-preview';
import { previewHtmlFile } from '@/utils/file-util';
import styles from './index.less';
// import styles from './index.less';
// TODO: The interface returns an incorrect content-type for the SVG.
@ -20,6 +26,7 @@ const DocumentViewer = () => {
const ext = currentQueryParameters.get('ext');
const prefix = currentQueryParameters.get('prefix');
const api = `${api_host}/${prefix || 'file'}/get/${documentId}`;
// request.head
if (ext === 'html' && documentId) {
previewHtmlFile(documentId);
@ -27,19 +34,24 @@ const DocumentViewer = () => {
}
return (
<section className={styles.viewerWrapper}>
<section className="w-full h-full">
{Images.includes(ext!) && (
<Flex className={styles.image} align="center" justify="center">
<Image src={api} preview={false}></Image>
</Flex>
<div className="flex w-full h-full items-center justify-center">
{/* <Image src={api} preview={false}></Image> */}
<ImagePreviewer className="w-full !h-dvh p-5" url={api} />
</div>
)}
{ext === 'md' && <Md filePath={api}></Md>}
{ext === 'txt' && <Text filePath={api}></Text>}
{ext === 'md' && <Md url={api} className="!h-dvh p-5"></Md>}
{ext === 'txt' && <TxtPreviewer url={api}></TxtPreviewer>}
{ext === 'pdf' && <Pdf url={api}></Pdf>}
{(ext === 'xlsx' || ext === 'xls') && <Excel filePath={api}></Excel>}
{ext === 'pdf' && (
<PdfPreview url={api} className="!h-dvh p-5"></PdfPreview>
)}
{(ext === 'xlsx' || ext === 'xls') && (
<ExcelCsvPreviewer url={api}></ExcelCsvPreviewer>
)}
{ext === 'docx' && <Docx filePath={api}></Docx>}
{ext === 'docx' && <DocPreviewer url={api}></DocPreviewer>}
</section>
);
};

View File

@ -1,32 +0,0 @@
import React, { useEffect, useState } from 'react';
import FileError from '../file-error';
/** Props accepted by {@link Md}. */
interface TxtProps {
  filePath: string;
}

/**
 * Fetches the file at `filePath` and renders its raw text in a scrollable,
 * full-height container. Shows `FileError` when the fetch fails.
 */
const Md: React.FC<TxtProps> = ({ filePath }) => {
  const [content, setContent] = useState<string>('');
  const [error, setError] = useState<string | null>(null);

  useEffect(() => {
    // Set by the cleanup so that a slower response for a previous `filePath`
    // (or one arriving after unmount) cannot overwrite the current state.
    let cancelled = false;

    setError(null);
    fetch(filePath)
      .then((res) => {
        if (!res.ok) throw new Error('Failed to fetch text file');
        return res.text();
      })
      .then((text) => {
        if (!cancelled) setContent(text);
      })
      .catch((err) => {
        if (!cancelled) {
          setError(err instanceof Error ? err.message : String(err));
        }
      });

    return () => {
      cancelled = true;
    };
  }, [filePath]);

  if (error) return <FileError>{error}</FileError>;

  return (
    <div style={{ padding: 24, height: '100vh', overflow: 'scroll' }}>
      {content}
    </div>
  );
};

export default Md;

View File

@ -1,3 +1,4 @@
import DocumentPreview from '@/components/document-preview';
import { FileIcon } from '@/components/icon-font';
import { Modal } from '@/components/ui/modal/modal';
import {
@ -7,7 +8,6 @@ import {
import { IModalProps } from '@/interfaces/common';
import { IReferenceChunk } from '@/interfaces/database/chat';
import { IChunk } from '@/interfaces/database/knowledge';
import DocumentPreview from '@/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/document-preview';
import { useEffect, useState } from 'react';
interface IProps extends IModalProps<any> {

View File

@ -45,21 +45,23 @@ export const useListDataSource = () => {
const updatedDataSourceTemplates = useMemo(() => {
const categorizedData = categorizeDataBySource(list || []);
let sourcelist: Array<IDataSorceInfo & { list: Array<IDataSourceBase> }> =
let sourceList: Array<IDataSorceInfo & { list: Array<IDataSourceBase> }> =
[];
Object.keys(categorizedData).forEach((key: string) => {
const k = key as DataSourceKey;
sourcelist.push({
id: k,
name: DataSourceInfo[k].name,
description: DataSourceInfo[k].description,
icon: DataSourceInfo[k].icon,
list: categorizedData[k] || [],
});
if (DataSourceInfo[k]) {
sourceList.push({
id: k,
name: DataSourceInfo[k].name,
description: DataSourceInfo[k].description,
icon: DataSourceInfo[k].icon,
list: categorizedData[k] || [],
});
}
});
console.log('🚀 ~ useListDataSource ~ sourcelist:', sourcelist);
return sourcelist;
console.log('🚀 ~ useListDataSource ~ sourceList:', sourceList);
return sourceList;
}, [list]);
return { list, categorizedList: updatedDataSourceTemplates, isFetching };