Mirror of https://github.com/infiniflow/ragflow.git, synced 2026-02-05 10:05:05 +08:00

Compare commits: a793dd2ea8 ... 74e0b58d89 (3 commits)

| SHA1 |
|---|
| 74e0b58d89 |
| 7c20c964b4 |
| 5d0981d046 |
````diff
@@ -72,9 +72,8 @@ class RAGFlowMarkdownParser:
         # Replace any TAGS e.g. <table ...> to <table>
         TAGS = ["table", "td", "tr", "th", "tbody", "thead", "div"]
-        table_with_attributes_pattern = re.compile(
-            rf"<(?:{'|'.join(TAGS)})[^>]*>", re.IGNORECASE
-        )
+        table_with_attributes_pattern = re.compile(rf"<(?:{'|'.join(TAGS)})[^>]*>", re.IGNORECASE)

         def replace_tag(m):
             tag_name = re.match(r"<(\w+)", m.group()).group(1)
             return "<{}>".format(tag_name)
@@ -128,23 +127,48 @@ class MarkdownElementExtractor:
         self.markdown_content = markdown_content
         self.lines = markdown_content.split("\n")

-    def get_delimiters(self,delimiters):
+    def get_delimiters(self, delimiters):
         toks = re.findall(r"`([^`]+)`", delimiters)
         toks = sorted(set(toks), key=lambda x: -len(x))
         return "|".join(re.escape(t) for t in toks if t)

-    def extract_elements(self,delimiter=None):
+    def extract_elements(self, delimiter=None, include_meta=False):
         """Extract individual elements (headers, code blocks, lists, etc.)"""
         sections = []

         i = 0
-        dels=""
+        dels = ""
         if delimiter:
             dels = self.get_delimiters(delimiter)
         if len(dels) > 0:
             text = "\n".join(self.lines)
-            parts = re.split(dels, text)
-            sections = [p.strip() for p in parts if p and p.strip()]
+            if include_meta:
+                pattern = re.compile(dels)
+                last_end = 0
+                for m in pattern.finditer(text):
+                    part = text[last_end : m.start()]
+                    if part and part.strip():
+                        sections.append(
+                            {
+                                "content": part.strip(),
+                                "start_line": text.count("\n", 0, last_end),
+                                "end_line": text.count("\n", 0, m.start()),
+                            }
+                        )
+                    last_end = m.end()
+
+                part = text[last_end:]
+                if part and part.strip():
+                    sections.append(
+                        {
+                            "content": part.strip(),
+                            "start_line": text.count("\n", 0, last_end),
+                            "end_line": text.count("\n", 0, len(text)),
+                        }
+                    )
+            else:
+                parts = re.split(dels, text)
+                sections = [p.strip() for p in parts if p and p.strip()]
             return sections

         while i < len(self.lines):
             line = self.lines[i]
@@ -152,32 +176,35 @@ class MarkdownElementExtractor:
             if re.match(r"^#{1,6}\s+.*$", line):
                 # header
                 element = self._extract_header(i)
-                sections.append(element["content"])
+                sections.append(element if include_meta else element["content"])
                 i = element["end_line"] + 1
             elif line.strip().startswith("```"):
                 # code block
                 element = self._extract_code_block(i)
-                sections.append(element["content"])
+                sections.append(element if include_meta else element["content"])
                 i = element["end_line"] + 1
             elif re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line):
                 # list block
                 element = self._extract_list_block(i)
-                sections.append(element["content"])
+                sections.append(element if include_meta else element["content"])
                 i = element["end_line"] + 1
             elif line.strip().startswith(">"):
                 # blockquote
                 element = self._extract_blockquote(i)
-                sections.append(element["content"])
+                sections.append(element if include_meta else element["content"])
                 i = element["end_line"] + 1
             elif line.strip():
                 # text block (paragraphs and inline elements until next block element)
                 element = self._extract_text_block(i)
-                sections.append(element["content"])
+                sections.append(element if include_meta else element["content"])
                 i = element["end_line"] + 1
             else:
                 i += 1

-        sections = [section for section in sections if section.strip()]
+        if include_meta:
+            sections = [section for section in sections if section["content"].strip()]
+        else:
+            sections = [section for section in sections if section.strip()]
         return sections

     def _extract_header(self, start_pos):
````
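For orientation, a minimal sketch of what the new `include_meta` flag changes (the sample markdown is invented; the dict keys mirror the diff above). With `include_meta=True`, `extract_elements` returns dicts carrying 0-based line spans instead of bare strings, which is what lets later parts of this change set map image references back onto sections:

```python
# Hypothetical usage; the sample text is invented, keys mirror the diff.
extractor = MarkdownElementExtractor("# Title\n\nFirst paragraph.")

plain = extractor.extract_elements()
# ["# Title", "First paragraph."]

meta = extractor.extract_elements(include_meta=True)
# Approximately:
# [{"content": "# Title", "start_line": 0, "end_line": 0},
#  {"content": "First paragraph.", "start_line": 2, "end_line": 2}]
```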
rag/app/naive.py (232 lines changed)
```diff
@@ -26,6 +26,7 @@ from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
 from docx.opc.oxml import parse_xml
 from markdown import markdown
 from PIL import Image
+from common.token_utils import num_tokens_from_string

 from common.constants import LLMType
 from api.db.services.llm_service import LLMBundle
@@ -465,50 +466,87 @@ class Markdown(MarkdownParser):
         soup = BeautifulSoup(html_content, 'html.parser')
         return soup

-    def get_picture_urls(self, soup):
-        if soup:
-            return [img.get('src') for img in soup.find_all('img') if img.get('src')]
-        return []
-
     def get_hyperlink_urls(self, soup):
         if soup:
             return set([a.get('href') for a in soup.find_all('a') if a.get('href')])
         return []

-    def get_pictures(self, text):
-        """Download and open all images from markdown text."""
+    def extract_image_urls_with_lines(self, text):
+        md_img_re = re.compile(r"!\[[^\]]*\]\(([^)\s]+)")
+        html_img_re = re.compile(r'src=["\']([^"\'>\s]+)', re.IGNORECASE)
+        urls = []
+        seen = set()
+        lines = text.splitlines()
+        for idx, line in enumerate(lines):
+            for url in md_img_re.findall(line):
+                if (url, idx) not in seen:
+                    urls.append({"url": url, "line": idx})
+                    seen.add((url, idx))
+            for url in html_img_re.findall(line):
+                if (url, idx) not in seen:
+                    urls.append({"url": url, "line": idx})
+                    seen.add((url, idx))
+
+        # cross-line
+        try:
+            from bs4 import BeautifulSoup
+
+            soup = BeautifulSoup(text, 'html.parser')
+            newline_offsets = [m.start() for m in re.finditer(r"\n", text)] + [len(text)]
+            for img_tag in soup.find_all('img'):
+                src = img_tag.get('src')
+                if not src:
+                    continue
+
+                tag_str = str(img_tag)
+                pos = text.find(tag_str)
+                if pos == -1:
+                    # fallback
+                    pos = max(text.find(src), 0)
+                line_no = 0
+                for i, off in enumerate(newline_offsets):
+                    if pos <= off:
+                        line_no = i
+                        break
+                if (src, line_no) not in seen:
+                    urls.append({"url": src, "line": line_no})
+                    seen.add((src, line_no))
+        except Exception:
+            pass
+
+        return urls
+
+    def load_images_from_urls(self, urls, cache=None):
         import requests
-        soup = self.md_to_html(text)
-        image_urls = self.get_picture_urls(soup)
+        from pathlib import Path
+
+        cache = cache or {}
         images = []
-        # Find all image URLs in text
-        for url in image_urls:
-            if not url:
+        for url in urls:
+            if url in cache:
+                if cache[url]:
+                    images.append(cache[url])
                 continue
+            img_obj = None
             try:
-                # check if the url is a local file or a remote URL
                 if url.startswith(('http://', 'https://')):
-                    # For remote URLs, download the image
                     response = requests.get(url, stream=True, timeout=30)
-                    if response.status_code == 200 and response.headers['Content-Type'] and response.headers['Content-Type'].startswith('image/'):
-                        img = Image.open(BytesIO(response.content)).convert('RGB')
-                        images.append(img)
+                    if response.status_code == 200 and response.headers.get('Content-Type', '').startswith('image/'):
+                        img_obj = Image.open(BytesIO(response.content)).convert('RGB')
                 else:
-                    # For local file paths, open the image directly
-                    from pathlib import Path
                     local_path = Path(url)
-                    if not local_path.exists():
+                    if local_path.exists():
+                        img_obj = Image.open(url).convert('RGB')
+                    else:
                         logging.warning(f"Local image file not found: {url}")
-                        continue
-                    img = Image.open(url).convert('RGB')
-                    images.append(img)
             except Exception as e:
                 logging.error(f"Failed to download/open image from {url}: {e}")
-                continue
+            cache[url] = img_obj
+            if img_obj:
+                images.append(img_obj)
+        return images, cache

-        return images if images else None
-
-    def __call__(self, filename, binary=None, separate_tables=True, delimiter=None):
+    def __call__(self, filename, binary=None, separate_tables=True, delimiter=None, return_section_images=False):
         if binary:
             encoding = find_codec(binary)
             txt = binary.decode(encoding, errors="ignore")
```
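The two new helpers split responsibilities: `extract_image_urls_with_lines` only finds references (recording the line each one sits on), while `load_images_from_urls` fetches them through a URL-keyed cache, so an image referenced from several sections is downloaded once and a failed URL (cached as `None`) is not retried. A hedged sketch, assuming a parser instance `md`; the text and URLs are invented:

```python
# Illustrative only: the markdown text and URLs are made up.
text = "intro\n![logo](https://example.com/logo.png)\n<img src='pic.png'>"

refs = md.extract_image_urls_with_lines(text)
# [{"url": "https://example.com/logo.png", "line": 1},
#  {"url": "pic.png", "line": 2}]

imgs, cache = md.load_images_from_urls([r["url"] for r in refs], cache={})
# A second call with the same URLs is answered from `cache`,
# whether the first attempt produced an image or stored None.
imgs2, cache = md.load_images_from_urls([r["url"] for r in refs], cache)
```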
```diff
@@ -520,11 +558,31 @@ class Markdown(MarkdownParser):
         # To eliminate duplicate tables in chunking result, uncomment code below and set separate_tables to True in line 410.
         # extractor = MarkdownElementExtractor(remainder)
         extractor = MarkdownElementExtractor(txt)
-        element_sections = extractor.extract_elements(delimiter)
-        sections = [(element, "") for element in element_sections]
+        image_refs = self.extract_image_urls_with_lines(txt)
+        element_sections = extractor.extract_elements(delimiter, include_meta=True)
+
+        sections = []
+        section_images = []
+        image_cache = {}
+        for element in element_sections:
+            content = element["content"]
+            start_line = element["start_line"]
+            end_line = element["end_line"]
+            urls_in_section = [ref["url"] for ref in image_refs if start_line <= ref["line"] <= end_line]
+            imgs = []
+            if urls_in_section:
+                imgs, image_cache = self.load_images_from_urls(urls_in_section, image_cache)
+            combined_image = None
+            if imgs:
+                combined_image = reduce(concat_img, imgs) if len(imgs) > 1 else imgs[0]
+            sections.append((content, ""))
+            section_images.append(combined_image)

         tbls = []
         for table in tables:
             tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
+        if return_section_images:
+            return sections, tbls, section_images
         return sections, tbls


 def load_from_xml_v2(baseURI, rels_item_xml):
```
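`__call__` now pairs each section with its images up front: with `return_section_images=True` it returns a third list, index-aligned with `sections`, whose entries are either `None` or the (possibly concatenated) image for that section. A sketch under the assumption that `Markdown` takes the chunk token budget positionally, as at the call sites below; the file name is a placeholder:

```python
# Sketch; "doc.md" is a placeholder path, not from the diff.
parser = Markdown(128)
with open("doc.md", "rb") as f:
    sections, tbls, section_images = parser(
        "doc.md", f.read(), separate_tables=False, return_section_images=True
    )
assert len(section_images) == len(sections)  # aligned by construction
```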
```diff
@@ -558,6 +616,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     parser_config = kwargs.get(
         "parser_config", {
             "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
+    final_sections = False
     doc = {
         "docnm_kwd": filename,
         "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
@@ -695,9 +754,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         excel_parser = ExcelParser()
         if parser_config.get("html4excel"):
             sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
+            parser_config["chunk_token_num"] = 0
         else:
             sections = [(_, "") for _ in excel_parser(binary) if _]
-            parser_config["chunk_token_num"] = 12800

     elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
@@ -709,7 +768,15 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
-        sections, tables = markdown_parser(filename, binary, separate_tables=False, delimiter=parser_config.get("delimiter", "\n!?;。;!?"))
+        sections, tables, section_images = markdown_parser(
+            filename,
+            binary,
+            separate_tables=False,
+            delimiter=parser_config.get("delimiter", "\n!?;。;!?"),
+            return_section_images=True,
+        )
+
+        final_sections = True

         try:
             vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
@@ -719,19 +786,22 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,

         if vision_model:
             # Process images for each section
-            section_images = []
             for idx, (section_text, _) in enumerate(sections):
-                images = markdown_parser.get_pictures(section_text) if section_text else None
-                if images:
+                images = []
+                if section_images and len(section_images) > idx and section_images[idx] is not None:
+                    images.append(section_images[idx])
+
+                if images and len(images) > 0:
                     # If multiple images found, combine them using concat_img
                     combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
-                    section_images.append(combined_image)
+                    if section_images:
+                        section_images[idx] = combined_image
+                    else:
+                        section_images = [None] * len(sections)
+                        section_images[idx] = combined_image
                     markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data= [((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs)
                     boosted_figures = markdown_vision_parser(callback=callback)
                     sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]), sections[idx][1])
-                else:
-                    section_images.append(None)
         else:
             logging.warning("No visual model detected. Skipping figure parsing enhancement.")
@@ -783,31 +853,81 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             "file type not supported yet(pdf, xlsx, doc, docx, txt supported)")

     st = timer()
-    if section_images:
-        # if all images are None, set section_images to None
-        if all(image is None for image in section_images):
-            section_images = None
-
-    if section_images:
-        chunks, images = naive_merge_with_images(sections, section_images,
-            int(parser_config.get(
-                "chunk_token_num", 128)), parser_config.get(
-                "delimiter", "\n!?。;!?"))
+    if final_sections:
+        merged_chunks = []
+        merged_images = []
+        chunk_limit = max(0, int(parser_config.get("chunk_token_num", 128)))
+        overlapped_percent = int(parser_config.get("overlapped_percent", 0))
+        overlapped_percent = max(0, min(overlapped_percent, 90))
+
+        current_text = ""
+        current_tokens = 0
+        current_image = None
+
+        for idx, sec in enumerate(sections):
+            text = sec[0] if isinstance(sec, tuple) else sec
+            sec_tokens = num_tokens_from_string(text)
+            sec_image = section_images[idx] if section_images and idx < len(section_images) else None
+
+            if current_text and current_tokens + sec_tokens > chunk_limit:
+                merged_chunks.append(current_text)
+                merged_images.append(current_image)
+                overlap_part = ""
+                if overlapped_percent > 0:
+                    overlap_len = int(len(current_text) * overlapped_percent / 100)
+                    if overlap_len > 0:
+                        overlap_part = current_text[-overlap_len:]
+                current_text = overlap_part
+                current_tokens = num_tokens_from_string(current_text)
+                current_image = current_image if overlap_part else None
+
+            if current_text:
+                current_text += "\n" + text
+            else:
+                current_text = text
+            current_tokens += sec_tokens
+
+            if sec_image:
+                current_image = concat_img(current_image, sec_image) if current_image else sec_image
+
+        if current_text:
+            merged_chunks.append(current_text)
+            merged_images.append(current_image)
+
+        chunks = merged_chunks
+        has_images = merged_images and any(img is not None for img in merged_images)
         if kwargs.get("section_only", False):
             chunks.extend(embed_res)
             return chunks
-
-        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
+        if has_images:
+            res.extend(tokenize_chunks_with_images(chunks, doc, is_english, merged_images))
+        else:
+            res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
     else:
-        chunks = naive_merge(
-            sections, int(parser_config.get(
-                "chunk_token_num", 128)), parser_config.get(
-                "delimiter", "\n!?。;!?"))
-        if kwargs.get("section_only", False):
-            chunks.extend(embed_res)
-            return chunks
-
-        res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
+        if section_images:
+            if all(image is None for image in section_images):
+                section_images = None
+
+        if section_images:
+            chunks, images = naive_merge_with_images(sections, section_images,
+                int(parser_config.get(
+                    "chunk_token_num", 128)), parser_config.get(
+                    "delimiter", "\n!?。;!?"))
+            if kwargs.get("section_only", False):
+                chunks.extend(embed_res)
+                return chunks
+
+            res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
+        else:
+            chunks = naive_merge(
+                sections, int(parser_config.get(
+                    "chunk_token_num", 128)), parser_config.get(
+                    "delimiter", "\n!?。;!?"))
+            if kwargs.get("section_only", False):
+                chunks.extend(embed_res)
+                return chunks
+
+            res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))

     if urls and parser_config.get("analyze_hyperlink", False) and is_root:
         for index, url in enumerate(urls):
```
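The new `final_sections` branch trades delimiter re-splitting for a token-budget merge with an optional character-level overlap between consecutive chunks (`overlapped_percent` is clamped to 0-90). The overlap arithmetic, with made-up numbers:

```python
# Made-up numbers; mirrors the overlap computation in the diff above.
current_text = "x" * 400  # chunk being closed out
overlapped_percent = 20   # from parser_config, clamped to [0, 90]

overlap_len = int(len(current_text) * overlapped_percent / 100)  # 80
overlap_part = current_text[-overlap_len:]  # trailing 80 chars
# overlap_part seeds the next chunk, and its token count is re-measured
# with num_tokens_from_string before new sections are appended to it.
```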
```diff
@@ -482,17 +482,25 @@ class Parser(ProcessBase):
         self.set_output("output_format", conf["output_format"])

         markdown_parser = naive_markdown_parser()
-        sections, tables = markdown_parser(name, blob, separate_tables=False)
+        sections, tables, section_images = markdown_parser(
+            name,
+            blob,
+            separate_tables=False,
+            delimiter=conf.get("delimiter"),
+            return_section_images=True,
+        )

         if conf.get("output_format") == "json":
             json_results = []

-            for section_text, _ in sections:
+            for idx, (section_text, _) in enumerate(sections):
                 json_result = {
                     "text": section_text,
                 }

-                images = markdown_parser.get_pictures(section_text) if section_text else None
+                images = []
+                if section_images and len(section_images) > idx and section_images[idx] is not None:
+                    images.append(section_images[idx])
                 if images:
                     # If multiple images found, combine them using concat_img
                     combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
```
```diff
@@ -628,16 +628,8 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。;!?"):
         tk_nums.append(num_tokens_from_string(text))
         return cks

-    dels = get_delimiters(delimiter)
     for sec, pos in sections:
-        if num_tokens_from_string(sec) < chunk_token_num:
-            add_chunk("\n"+sec, pos)
-            continue
-        split_sec = re.split(r"(%s)" % dels, sec, flags=re.DOTALL)
-        for sub_sec in split_sec:
-            if re.match(f"^{dels}$", sub_sec):
-                continue
-            add_chunk("\n"+sub_sec, pos)
+        add_chunk("\n"+sec, pos)

     return cks

@@ -700,26 +692,18 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。;!?"):
         tk_nums.append(num_tokens_from_string(text_seg))
         return cks, result_images

-    dels = get_delimiters(delimiter)
     for text, image in zip(texts, images):
         # if text is tuple, unpack it
         if isinstance(text, tuple):
             text_str = text[0]
             text_pos = text[1] if len(text) > 1 else ""
-            split_sec = re.split(r"(%s)" % dels, text_str)
-            for sub_sec in split_sec:
-                if re.match(f"^{dels}$", sub_sec):
-                    continue
-                add_chunk("\n"+sub_sec, image, text_pos)
+            add_chunk("\n"+text_str, image, text_pos)
         else:
-            split_sec = re.split(r"(%s)" % dels, text)
-            for sub_sec in split_sec:
-                if re.match(f"^{dels}$", sub_sec):
-                    continue
-                add_chunk("\n"+sub_sec, image)
+            add_chunk("\n"+text, image)

     return cks, result_images


 def docx_question_level(p, bull=-1):
     txt = re.sub(r"\u3000", " ", p.text).strip()
     if p.style.name.startswith('Heading'):
@@ -808,15 +792,8 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"):
         tk_nums.append(num_tokens_from_string(text_seg))
         return cks, images

-    dels = get_delimiters(delimiter)
-    pattern = r"(%s)" % dels
-
     for sec, image in sections:
-        split_sec = re.split(pattern, sec)
-        for sub_sec in split_sec:
-            if not sub_sec or re.match(f"^{dels}$", sub_sec):
-                continue
-            add_chunk("\n" + sub_sec, image, "")
+        add_chunk("\n" + sec, image, "")

     return cks, images
```
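The three merge helpers above (`naive_merge`, `naive_merge_with_images`, `naive_merge_docx`) share one simplification: per-section delimiter re-splitting is gone, since splitting now happens once, upstream, in `MarkdownElementExtractor.extract_elements`. Each loop reduces to plain accumulation, roughly:

```python
# Simplified common shape of the new loops (not verbatim from the diff);
# add_chunk is the closure each helper defines above its loop.
for sec, pos in sections:
    add_chunk("\n" + sec, pos)  # no re.split over delimiters anymore
```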
```diff
@@ -844,6 +821,7 @@ def get_delimiters(delimiters: str):

     return dels_pattern

+
 class Node:
     def __init__(self, level, depth=-1, texts=None):
         self.level = level
```
```diff
@@ -1,5 +1,7 @@
 import message from '@/components/ui/message';
 import { Spin } from '@/components/ui/spin';
+import { Authorization } from '@/constants/authorization';
+import { getAuthorization } from '@/utils/authorization-util';
 import request from '@/utils/request';
 import classNames from 'classnames';
 import mammoth from 'mammoth';
@@ -22,6 +24,7 @@ export const DocPreviewer: React.FC<DocPreviewerProps> = ({
     const res = await request(url, {
       method: 'GET',
       responseType: 'blob',
+      headers: { [Authorization]: getAuthorization() },
       onError: () => {
         message.error('Document parsing failed');
         console.error('Error loading document:', url);
```
```diff
@@ -1,5 +1,6 @@
-import { useFetchExcel } from '@/pages/document-viewer/hooks';
+// import { useFetchExcel } from '@/pages/document-viewer/hooks';
 import classNames from 'classnames';
+import { useFetchExcel } from './hooks';

 interface ExcelCsvPreviewerProps {
   className?: string;
```
```diff
@@ -1,9 +1,67 @@
 import { Authorization } from '@/constants/authorization';
+import { useGetKnowledgeSearchParams } from '@/hooks/route-hook';
+import { useGetPipelineResultSearchParams } from '@/pages/dataflow-result/hooks';
+import api, { api_host } from '@/utils/api';
 import { getAuthorization } from '@/utils/authorization-util';
 import jsPreviewExcel from '@js-preview/excel';
+import { useSize } from 'ahooks';
 import axios from 'axios';
 import mammoth from 'mammoth';
-import { useCallback, useEffect, useRef, useState } from 'react';
+import { useCallback, useEffect, useMemo, useRef, useState } from 'react';
+
+export const useDocumentResizeObserver = () => {
+  const [containerWidth, setContainerWidth] = useState<number>();
+  const [containerRef, setContainerRef] = useState<HTMLElement | null>(null);
+  const size = useSize(containerRef);
+
+  const onResize = useCallback((width?: number) => {
+    if (width) {
+      setContainerWidth(width);
+    }
+  }, []);
+
+  useEffect(() => {
+    onResize(size?.width);
+  }, [size?.width, onResize]);
+
+  return { containerWidth, setContainerRef };
+};
+
+function highlightPattern(text: string, pattern: string, pageNumber: number) {
+  if (pageNumber === 2) {
+    return `<mark>${text}</mark>`;
+  }
+  if (text.trim() !== '' && pattern.match(text)) {
+    // return pattern.replace(text, (value) => `<mark>${value}</mark>`);
+    return `<mark>${text}</mark>`;
+  }
+  return text.replace(pattern, (value) => `<mark>${value}</mark>`);
+}
+
+export const useHighlightText = (searchText: string = '') => {
+  const textRenderer = useCallback(
+    (textItem: any) => {
+      return highlightPattern(textItem.str, searchText, textItem.pageNumber);
+    },
+    [searchText],
+  );
+
+  return textRenderer;
+};
+
+export const useGetDocumentUrl = (isAgent: boolean) => {
+  const { documentId } = useGetKnowledgeSearchParams();
+  const { createdBy, documentId: id } = useGetPipelineResultSearchParams();
+
+  const url = useMemo(() => {
+    if (isAgent) {
+      return api.downloadFile + `?id=${id}&created_by=${createdBy}`;
+    }
+    return `${api_host}/document/get/${documentId}`;
+  }, [createdBy, documentId, id, isAgent]);
+
+  return url;
+};
+
 export const useCatchError = (api: string) => {
   const [error, setError] = useState('');
```
```diff
@@ -1,5 +1,7 @@
 import message from '@/components/ui/message';
 import { Spin } from '@/components/ui/spin';
+import { Authorization } from '@/constants/authorization';
+import { getAuthorization } from '@/utils/authorization-util';
 import request from '@/utils/request';
 import classNames from 'classnames';
 import { useEffect, useState } from 'react';
@@ -22,6 +24,7 @@ export const ImagePreviewer: React.FC<ImagePreviewerProps> = ({
     const res = await request(url, {
       method: 'GET',
       responseType: 'blob',
+      headers: { [Authorization]: getAuthorization() },
       onError: () => {
         message.error('Failed to load image');
         setIsLoading(false);
```
```diff
@@ -4,7 +4,7 @@ import CSVFileViewer from './csv-preview';
 import { DocPreviewer } from './doc-preview';
 import { ExcelCsvPreviewer } from './excel-preview';
 import { ImagePreviewer } from './image-preview';
-import styles from './index.less';
+import { Md } from './md';
 import PdfPreviewer, { IProps } from './pdf-preview';
 import { PptPreviewer } from './ppt-preview';
 import { TxtPreviewer } from './txt-preview';
@@ -25,7 +25,7 @@ const Preview = ({
   return (
     <>
       {fileType === 'pdf' && highlights && setWidthAndHeight && (
-        <section className={styles.documentPreview}>
+        <section>
           <PdfPreviewer
             highlights={highlights}
             setWidthAndHeight={setWidthAndHeight}
@@ -38,7 +38,7 @@ const Preview = ({
           <DocPreviewer className={className} url={url} />
         </section>
       )}
-      {['txt', 'md'].indexOf(fileType) > -1 && (
+      {['txt'].indexOf(fileType) > -1 && (
         <section>
           <TxtPreviewer className={className} url={url} />
         </section>
@@ -82,6 +82,11 @@ const Preview = ({
           <CSVFileViewer className={className} url={url} />
         </section>
       )}
+      {['md'].indexOf(fileType) > -1 && (
+        <section>
+          <Md className={className} url={url} />
+        </section>
+      )}
     </>
   );
 };
```
```diff
@@ -1,31 +1,39 @@
+import { Authorization } from '@/constants/authorization';
+import { cn } from '@/lib/utils';
+import FileError from '@/pages/document-viewer/file-error';
+import { getAuthorization } from '@/utils/authorization-util';
 import React, { useEffect, useState } from 'react';
 import ReactMarkdown from 'react-markdown';
 import remarkGfm from 'remark-gfm';
-import FileError from '../file-error';

 interface MdProps {
-  filePath: string;
+  // filePath: string;
+  className?: string;
+  url: string;
 }

-const Md: React.FC<MdProps> = ({ filePath }) => {
+export const Md: React.FC<MdProps> = ({ url, className }) => {
   const [content, setContent] = useState<string>('');
   const [error, setError] = useState<string | null>(null);

   useEffect(() => {
     setError(null);
-    fetch(filePath)
+    fetch(url, { headers: { [Authorization]: getAuthorization() } })
       .then((res) => {
         if (!res.ok) throw new Error('Failed to fetch markdown file');
         return res.text();
       })
       .then((text) => setContent(text))
       .catch((err) => setError(err.message));
-  }, [filePath]);
+  }, [url]);

   if (error) return <FileError>{error}</FileError>;

   return (
-    <div style={{ padding: 24, height: '100vh', overflow: 'scroll' }}>
+    <div
+      style={{ padding: 4, overflow: 'scroll' }}
+      className={cn(className, 'markdown-body h-[calc(100vh - 200px)]')}
+    >
       <ReactMarkdown remarkPlugins={[remarkGfm]}>{content}</ReactMarkdown>
     </div>
   );
```
```diff
@@ -10,13 +10,21 @@ import {

 import { useCatchDocumentError } from '@/components/pdf-previewer/hooks';
 import { Spin } from '@/components/ui/spin';
+// import FileError from '@/pages/document-viewer/file-error';
+import { Authorization } from '@/constants/authorization';
 import FileError from '@/pages/document-viewer/file-error';
+import { getAuthorization } from '@/utils/authorization-util';
 import styles from './index.less';
+
+type PdfLoaderProps = React.ComponentProps<typeof PdfLoader> & {
+  httpHeaders?: Record<string, string>;
+};
+
+const Loader = PdfLoader as React.ComponentType<PdfLoaderProps>;
 export interface IProps {
-  highlights: IHighlight[];
-  setWidthAndHeight: (width: number, height: number) => void;
+  highlights?: IHighlight[];
+  setWidthAndHeight?: (width: number, height: number) => void;
   url: string;
+  className?: string;
 }
 const HighlightPopup = ({
   comment,
@@ -30,7 +38,12 @@ const HighlightPopup = ({
 ) : null;

 // TODO: merge with DocumentPreviewer
-const PdfPreview = ({ highlights: state, setWidthAndHeight, url }: IProps) => {
+const PdfPreview = ({
+  highlights: state,
+  setWidthAndHeight,
+  url,
+  className,
+}: IProps) => {
   // const url = useGetDocumentUrl();

   const ref = useRef<(highlight: IHighlight) => void>(() => {});
@@ -39,17 +52,22 @@ const PdfPreview = ({ highlights: state, setWidthAndHeight, url }: IProps) => {
   const resetHash = () => {};

   useEffect(() => {
-    if (state.length > 0) {
+    if (state?.length && state?.length > 0) {
       ref?.current(state[0]);
     }
   }, [state]);

+  const httpHeaders = {
+    [Authorization]: getAuthorization(),
+  };
+
   return (
     <div
-      className={`${styles.documentContainer} rounded-[10px] overflow-hidden `}
+      className={`${styles.documentContainer} rounded-[10px] overflow-hidden ${className}`}
     >
-      <PdfLoader
+      <Loader
         url={url}
+        httpHeaders={httpHeaders}
         beforeLoad={
           <div className="absolute inset-0 flex items-center justify-center">
             <Spin />
@@ -63,7 +81,7 @@ const PdfPreview = ({ highlights: state, setWidthAndHeight, url }: IProps) => {
           const viewport = page.getViewport({ scale: 1 });
           const width = viewport.width;
           const height = viewport.height;
-          setWidthAndHeight(width, height);
+          setWidthAndHeight?.(width, height);
         });

         return (
@@ -115,11 +133,11 @@ const PdfPreview = ({ highlights: state, setWidthAndHeight, url }: IProps) => {
               </Popup>
             );
           }}
-          highlights={state}
+          highlights={state || []}
         />
       );
     }}
-      </PdfLoader>
+      </Loader>
     </div>
   );
 };
```
```diff
@@ -148,7 +148,7 @@ export const Images = [
 ];

 // Without FileViewer
-export const ExceptiveType = ['xlsx', 'xls', 'pdf', 'docx', ...Images];
+export const ExceptiveType = ['xlsx', 'xls', 'pdf', 'docx', 'md', ...Images];

 export const SupportedPreviewDocumentTypes = [...ExceptiveType];
 //#endregion
```
```diff
@@ -1,14 +1,13 @@
-import { Input } from '@/components/originui/input';
 import { Button } from '@/components/ui/button';
+import { SearchInput } from '@/components/ui/input';
 import {
   Popover,
   PopoverContent,
   PopoverTrigger,
 } from '@/components/ui/popover';
 import { Radio } from '@/components/ui/radio';
+import { Segmented } from '@/components/ui/segmented';
 import { useTranslate } from '@/hooks/common-hooks';
-import { cn } from '@/lib/utils';
-import { SearchOutlined } from '@ant-design/icons';
 import { ListFilter, Plus } from 'lucide-react';
 import { useState } from 'react';
 import { ChunkTextMode } from '../../constant';
@@ -61,46 +60,43 @@ export default ({
   };
   return (
     <div className="flex pr-[25px]">
-      <div className="flex items-center gap-4 bg-bg-card text-muted-foreground w-fit h-[35px] rounded-md px-4 py-2">
-        {textSelectOptions.map((option) => (
-          <div
-            key={option.value}
-            className={cn('flex items-center cursor-pointer', {
-              'text-primary': option.value === textSelectValue,
-            })}
-            onClick={() => changeTextSelectValue(option.value)}
-          >
-            {option.label}
-          </div>
-        ))}
-      </div>
-      <div className="ml-auto"></div>
-      <Input
-        className="bg-bg-card text-muted-foreground"
-        style={{ width: 200 }}
-        placeholder={t('search')}
-        icon={<SearchOutlined />}
-        onChange={handleInputChange}
-        value={searchString}
-      />
-      <div className="w-[20px]"></div>
-      <Popover>
-        <PopoverTrigger asChild>
-          <Button className="bg-bg-card text-muted-foreground hover:bg-card">
-            <ListFilter />
-          </Button>
-        </PopoverTrigger>
-        <PopoverContent className="p-0 w-[200px]">
-          {filterContent}
-        </PopoverContent>
-      </Popover>
-      <div className="w-[20px]"></div>
-      <Button
-        onClick={() => createChunk()}
-        className="bg-bg-card text-primary hover:bg-card"
-      >
-        <Plus size={44} />
-      </Button>
+      <Segmented
+        options={textSelectOptions}
+        value={textSelectValue}
+        onChange={changeTextSelectValue}
+      />
+      <div className="ml-auto"></div>
+      <div className="h-8 flex items-center gap-5">
+        <SearchInput
+          // style={{ width: 200 }}
+          placeholder={t('search')}
+          // icon={<SearchOutlined />}
+          onChange={handleInputChange}
+          value={searchString}
+        />
+        <Popover>
+          <PopoverTrigger asChild>
+            <Button
+              variant={'ghost'}
+              // className="bg-bg-card text-text-secondary hover:bg-card"
+            >
+              <ListFilter />
+            </Button>
+          </PopoverTrigger>
+          <PopoverContent className="p-0 w-[200px]">
+            {filterContent}
+          </PopoverContent>
+        </Popover>
+        <Button
+          variant={'ghost'}
+          onClick={() => createChunk()}
+          // className="bg-bg-card text-primary hover:bg-card"
+        >
+          <Plus size={44} />
+        </Button>
+      </div>
+      {/* <div className="w-[20px]"></div>
+      <div className="w-[20px]"></div> */}
     </div>
   );
 };
```
```diff
@@ -1,21 +0,0 @@
-import { formatDate } from '@/utils/date';
-import { formatBytes } from '@/utils/file-util';
-
-type Props = {
-  size: number;
-  name: string;
-  create_date: string;
-};
-
-export default ({ size, name, create_date }: Props) => {
-  const sizeName = formatBytes(size);
-  const dateStr = formatDate(create_date);
-  return (
-    <div>
-      <h2 className="text-[24px]">{name}</h2>
-      <div className="text-[#979AAB] pt-[5px]">
-        Size:{sizeName} Uploaded Time:{dateStr}
-      </div>
-    </div>
-  );
-};
```
```diff
@@ -1,25 +0,0 @@
-import { useFetchExcel } from '@/pages/document-viewer/hooks';
-import classNames from 'classnames';
-
-interface ExcelCsvPreviewerProps {
-  className?: string;
-  url: string;
-}
-
-export const ExcelCsvPreviewer: React.FC<ExcelCsvPreviewerProps> = ({
-  className,
-  url,
-}) => {
-  // const url = useGetDocumentUrl();
-  const { containerRef } = useFetchExcel(url);
-
-  return (
-    <div
-      ref={containerRef}
-      className={classNames(
-        'relative w-full h-full p-4 bg-background-paper border border-border-normal rounded-md excel-csv-previewer',
-        className,
-      )}
-    ></div>
-  );
-};
```
```diff
@@ -1,55 +0,0 @@
-import { useGetKnowledgeSearchParams } from '@/hooks/route-hook';
-import { api_host } from '@/utils/api';
-import { useSize } from 'ahooks';
-import { CustomTextRenderer } from 'node_modules/react-pdf/dist/esm/shared/types';
-import { useCallback, useEffect, useMemo, useState } from 'react';
-
-export const useDocumentResizeObserver = () => {
-  const [containerWidth, setContainerWidth] = useState<number>();
-  const [containerRef, setContainerRef] = useState<HTMLElement | null>(null);
-  const size = useSize(containerRef);
-
-  const onResize = useCallback((width?: number) => {
-    if (width) {
-      setContainerWidth(width);
-    }
-  }, []);
-
-  useEffect(() => {
-    onResize(size?.width);
-  }, [size?.width, onResize]);
-
-  return { containerWidth, setContainerRef };
-};
-
-function highlightPattern(text: string, pattern: string, pageNumber: number) {
-  if (pageNumber === 2) {
-    return `<mark>${text}</mark>`;
-  }
-  if (text.trim() !== '' && pattern.match(text)) {
-    // return pattern.replace(text, (value) => `<mark>${value}</mark>`);
-    return `<mark>${text}</mark>`;
-  }
-  return text.replace(pattern, (value) => `<mark>${value}</mark>`);
-}
-
-export const useHighlightText = (searchText: string = '') => {
-  const textRenderer: CustomTextRenderer = useCallback(
-    (textItem) => {
-      return highlightPattern(textItem.str, searchText, textItem.pageNumber);
-    },
-    [searchText],
-  );
-
-  return textRenderer;
-};
-
-export const useGetDocumentUrl = () => {
-  const { documentId } = useGetKnowledgeSearchParams();
-
-  const url = useMemo(() => {
-    return `${api_host}/document/get/${documentId}`;
-  }, [documentId]);
-
-  return url;
-};
```
```diff
@@ -1,74 +0,0 @@
-import message from '@/components/ui/message';
-import { Spin } from '@/components/ui/spin';
-import request from '@/utils/request';
-import classNames from 'classnames';
-import { useCallback, useEffect, useState } from 'react';
-
-interface ImagePreviewerProps {
-  className?: string;
-  url: string;
-}
-
-export const ImagePreviewer: React.FC<ImagePreviewerProps> = ({
-  className,
-  url,
-}) => {
-  // const url = useGetDocumentUrl();
-  const [imageSrc, setImageSrc] = useState<string | null>(null);
-  const [isLoading, setIsLoading] = useState<boolean>(true);
-
-  const fetchImage = useCallback(async () => {
-    setIsLoading(true);
-    const res = await request(url, {
-      method: 'GET',
-      responseType: 'blob',
-      onError: () => {
-        message.error('Failed to load image');
-        setIsLoading(false);
-      },
-    });
-    const objectUrl = URL.createObjectURL(res.data);
-    setImageSrc(objectUrl);
-    setIsLoading(false);
-  }, [url]);
-
-  useEffect(() => {
-    if (url) {
-      fetchImage();
-    }
-  }, [url, fetchImage]);
-
-  useEffect(() => {
-    return () => {
-      if (imageSrc) {
-        URL.revokeObjectURL(imageSrc);
-      }
-    };
-  }, [imageSrc]);
-
-  return (
-    <div
-      className={classNames(
-        'relative w-full h-full p-4 bg-background-paper border border-border-normal rounded-md image-previewer',
-        className,
-      )}
-    >
-      {isLoading && (
-        <div className="absolute inset-0 flex items-center justify-center">
-          <Spin />
-        </div>
-      )}
-
-      {!isLoading && imageSrc && (
-        <div className="max-h-[80vh] overflow-auto p-2">
-          <img
-            src={imageSrc}
-            alt={'image'}
-            className="w-full h-auto max-w-full object-contain"
-            onLoad={() => URL.revokeObjectURL(imageSrc!)}
-          />
-        </div>
-      )}
-    </div>
-  );
-};
```
```diff
@@ -7,7 +7,6 @@ import { useCallback, useEffect, useMemo, useState } from 'react';
 import { useTranslation } from 'react-i18next';
 import ChunkCard from './components/chunk-card';
 import CreatingModal from './components/chunk-creating-modal';
-import DocumentPreview from './components/document-preview';
 import {
   useChangeChunkTextMode,
   useDeleteChunkByIds,
@@ -18,8 +17,11 @@ import {
 import ChunkResultBar from './components/chunk-result-bar';
 import CheckboxSets from './components/chunk-result-bar/checkbox-sets';
-import DocumentHeader from './components/document-preview/document-header';
+// import DocumentHeader from './components/document-preview/document-header';
+
+import DocumentPreview from '@/components/document-preview';
+import DocumentHeader from '@/components/document-preview/document-header';
+import { useGetDocumentUrl } from '@/components/document-preview/hooks';
 import { PageHeader } from '@/components/page-header';
 import {
   Breadcrumb,
@@ -40,7 +42,6 @@ import {
   useNavigatePage,
 } from '@/hooks/logic-hooks/navigate-hooks';
 import { useFetchKnowledgeBaseConfiguration } from '@/hooks/use-knowledge-request';
-import { useGetDocumentUrl } from './components/document-preview/hooks';
 import styles from './index.less';

 const Chunk = () => {
@@ -74,7 +75,7 @@ const Chunk = () => {
   } = useUpdateChunk();
   const { navigateToDataFile, getQueryString, navigateToDatasetList } =
     useNavigatePage();
-  const fileUrl = useGetDocumentUrl();
+  const fileUrl = useGetDocumentUrl(false);
   useEffect(() => {
     setChunkList(data);
   }, [data]);
```
@@ -1,114 +0,0 @@
-import message from '@/components/ui/message';
-import { Spin } from '@/components/ui/spin';
-import request from '@/utils/request';
-import classNames from 'classnames';
-import React, { useEffect, useRef, useState } from 'react';
-
-interface CSVData {
-  rows: string[][];
-  headers: string[];
-}
-
-interface FileViewerProps {
-  className?: string;
-  url: string;
-}
-
-const CSVFileViewer: React.FC<FileViewerProps> = ({ url }) => {
-  const [data, setData] = useState<CSVData | null>(null);
-  const [isLoading, setIsLoading] = useState<boolean>(true);
-  const containerRef = useRef<HTMLDivElement>(null);
-  // const url = useGetDocumentUrl();
-  const parseCSV = (csvText: string): CSVData => {
-    console.log('Parsing CSV data:', csvText);
-    const lines = csvText.split('\n');
-    const headers = lines[0].split(',').map((header) => header.trim());
-    const rows = lines
-      .slice(1)
-      .map((line) => line.split(',').map((cell) => cell.trim()));
-
-    return { headers, rows };
-  };
-
-  useEffect(() => {
-    const loadCSV = async () => {
-      try {
-        const res = await request(url, {
-          method: 'GET',
-          responseType: 'blob',
-          onError: () => {
-            message.error('file load failed');
-            setIsLoading(false);
-          },
-        });
-
-        // parse CSV file
-        const reader = new FileReader();
-        reader.readAsText(res.data);
-        reader.onload = () => {
-          const parsedData = parseCSV(reader.result as string);
-          console.log('file loaded successfully', reader.result);
-          setData(parsedData);
-        };
-      } catch (error) {
-        message.error('CSV file parse failed');
-        console.error('Error loading CSV file:', error);
-      } finally {
-        setIsLoading(false);
-      }
-    };
-
-    loadCSV();
-
-    return () => {
-      setData(null);
-    };
-  }, [url]);
-
-  return (
-    <div
-      ref={containerRef}
-      className={classNames(
-        'relative w-full h-full p-4 bg-background-paper border border-border-normal rounded-md',
-        'overflow-auto max-h-[80vh] p-2',
-      )}
-    >
-      {isLoading ? (
-        <div className="absolute inset-0 flex items-center justify-center">
-          <Spin />
-        </div>
-      ) : data ? (
-        <table className="min-w-full divide-y divide-border-normal">
-          <thead className="bg-background-header-bar">
-            <tr>
-              {data.headers.map((header, index) => (
-                <th
-                  key={`header-${index}`}
-                  className="px-6 py-3 text-left text-sm font-medium text-text-primary"
-                >
-                  {header}
-                </th>
-              ))}
-            </tr>
-          </thead>
-          <tbody className="bg-background-paper divide-y divide-border-normal">
-            {data.rows.map((row, rowIndex) => (
-              <tr key={`row-${rowIndex}`}>
-                {row.map((cell, cellIndex) => (
-                  <td
-                    key={`cell-${rowIndex}-${cellIndex}`}
-                    className="px-6 py-4 whitespace-nowrap text-sm text-text-secondary"
-                  >
-                    {cell || '-'}
-                  </td>
-                ))}
-              </tr>
-            ))}
-          </tbody>
-        </table>
-      ) : null}
-    </div>
-  );
-};
-
-export default CSVFileViewer;

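Note on the viewer deleted above: its parseCSV splits on raw '\n' and ',', which handles simple files but silently breaks RFC 4180 quoting. A quick illustration with a hypothetical input (not from the commit):

    // Illustrative only: the naive split used by the deleted parseCSV.
    const line = '"Doe, Jane",ok';
    const cells = line.split(',').map((cell) => cell.trim());
    // -> ['"Doe', 'Jane"', 'ok'] -- the quoted field "Doe, Jane" is broken in two
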
@@ -1,70 +0,0 @@
-import message from '@/components/ui/message';
-import { Spin } from '@/components/ui/spin';
-import request from '@/utils/request';
-import classNames from 'classnames';
-import mammoth from 'mammoth';
-import { useEffect, useState } from 'react';
-
-interface DocPreviewerProps {
-  className?: string;
-  url: string;
-}
-
-export const DocPreviewer: React.FC<DocPreviewerProps> = ({
-  className,
-  url,
-}) => {
-  // const url = useGetDocumentUrl();
-  const [htmlContent, setHtmlContent] = useState<string>('');
-  const [loading, setLoading] = useState(false);
-  const fetchDocument = async () => {
-    setLoading(true);
-    const res = await request(url, {
-      method: 'GET',
-      responseType: 'blob',
-      onError: () => {
-        message.error('Document parsing failed');
-        console.error('Error loading document:', url);
-      },
-    });
-    try {
-      const arrayBuffer = await res.data.arrayBuffer();
-      const result = await mammoth.convertToHtml(
-        { arrayBuffer },
-        { includeDefaultStyleMap: true },
-      );
-
-      const styledContent = result.value
-        .replace(/<p>/g, '<p class="mb-2">')
-        .replace(/<h(\d)>/g, '<h$1 class="font-semibold mt-4 mb-2">');
-
-      setHtmlContent(styledContent);
-    } catch (err) {
-      message.error('Document parsing failed');
-      console.error('Error parsing document:', err);
-    }
-    setLoading(false);
-  };
-
-  useEffect(() => {
-    if (url) {
-      fetchDocument();
-    }
-  }, [url]);
-  return (
-    <div
-      className={classNames(
-        'relative w-full h-full p-4 bg-background-paper border border-border-normal rounded-md',
-        className,
-      )}
-    >
-      {loading && (
-        <div className="absolute inset-0 flex items-center justify-center">
-          <Spin />
-        </div>
-      )}
-
-      {!loading && <div dangerouslySetInnerHTML={{ __html: htmlContent }} />}
-    </div>
-  );
-};

@@ -1,60 +0,0 @@
-import { useGetKnowledgeSearchParams } from '@/hooks/route-hook';
-import api, { api_host } from '@/utils/api';
-import { useSize } from 'ahooks';
-import { CustomTextRenderer } from 'node_modules/react-pdf/dist/esm/shared/types';
-import { useCallback, useEffect, useMemo, useState } from 'react';
-import { useGetPipelineResultSearchParams } from '../../hooks';
-
-export const useDocumentResizeObserver = () => {
-  const [containerWidth, setContainerWidth] = useState<number>();
-  const [containerRef, setContainerRef] = useState<HTMLElement | null>(null);
-  const size = useSize(containerRef);
-
-  const onResize = useCallback((width?: number) => {
-    if (width) {
-      setContainerWidth(width);
-    }
-  }, []);
-
-  useEffect(() => {
-    onResize(size?.width);
-  }, [size?.width, onResize]);
-
-  return { containerWidth, setContainerRef };
-};
-
-function highlightPattern(text: string, pattern: string, pageNumber: number) {
-  if (pageNumber === 2) {
-    return `<mark>${text}</mark>`;
-  }
-  if (text.trim() !== '' && pattern.match(text)) {
-    // return pattern.replace(text, (value) => `<mark>${value}</mark>`);
-    return `<mark>${text}</mark>`;
-  }
-  return text.replace(pattern, (value) => `<mark>${value}</mark>`);
-}
-
-export const useHighlightText = (searchText: string = '') => {
-  const textRenderer: CustomTextRenderer = useCallback(
-    (textItem) => {
-      return highlightPattern(textItem.str, searchText, textItem.pageNumber);
-    },
-    [searchText],
-  );
-
-  return textRenderer;
-};
-
-export const useGetDocumentUrl = (isAgent: boolean) => {
-  const { documentId } = useGetKnowledgeSearchParams();
-  const { createdBy, documentId: id } = useGetPipelineResultSearchParams();
-
-  const url = useMemo(() => {
-    if (isAgent) {
-      return api.downloadFile + `?id=${id}&created_by=${createdBy}`;
-    }
-    return `${api_host}/document/get/${documentId}`;
-  }, [createdBy, documentId, id, isAgent]);
-
-  return url;
-};

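Note on the hooks deleted above: they are not dropped but relocated -- the pages in this commit now import them from '@/components/document-preview/hooks'. A minimal usage sketch, assuming the relocated useGetDocumentUrl keeps the (isAgent: boolean) signature shown in the deleted source; the component name and markup below are illustrative only:

    // Illustrative sketch, not code from the commit.
    import { useGetDocumentUrl } from '@/components/document-preview/hooks';

    function ChunkDocumentFrame() {
      // false -> knowledge-base route `${api_host}/document/get/${documentId}`;
      // true  -> agent route built from api.downloadFile with id and created_by.
      const fileUrl = useGetDocumentUrl(false);
      return <iframe src={fileUrl} title="document" className="w-full h-full" />;
    }
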
@@ -1,13 +0,0 @@
-.documentContainer {
-  width: 100%;
-  // height: calc(100vh - 284px);
-  height: calc(100vh - 180px);
-  position: relative;
-  :global(.PdfHighlighter) {
-    overflow-x: hidden;
-  }
-  :global(.Highlight--scrolledTo .Highlight__part) {
-    overflow-x: hidden;
-    background-color: rgba(255, 226, 143, 1);
-  }
-}

@@ -1,67 +0,0 @@
-import { memo } from 'react';
-
-import CSVFileViewer from './csv-preview';
-import { DocPreviewer } from './doc-preview';
-import { ExcelCsvPreviewer } from './excel-preview';
-import { ImagePreviewer } from './image-preview';
-import PdfPreviewer, { IProps } from './pdf-preview';
-import { PptPreviewer } from './ppt-preview';
-import { TxtPreviewer } from './txt-preview';
-
-type PreviewProps = {
-  fileType: string;
-  className?: string;
-  url: string;
-};
-const Preview = ({
-  fileType,
-  className,
-  highlights,
-  setWidthAndHeight,
-  url,
-}: PreviewProps & Partial<IProps>) => {
-  return (
-    <>
-      {fileType === 'pdf' && highlights && setWidthAndHeight && (
-        <section>
-          <PdfPreviewer
-            highlights={highlights}
-            setWidthAndHeight={setWidthAndHeight}
-            url={url}
-          ></PdfPreviewer>
-        </section>
-      )}
-      {['doc', 'docx'].indexOf(fileType) > -1 && (
-        <section>
-          <DocPreviewer className={className} url={url} />
-        </section>
-      )}
-      {['txt', 'md'].indexOf(fileType) > -1 && (
-        <section>
-          <TxtPreviewer className={className} url={url} />
-        </section>
-      )}
-      {['visual'].indexOf(fileType) > -1 && (
-        <section>
-          <ImagePreviewer className={className} url={url} />
-        </section>
-      )}
-      {['pptx'].indexOf(fileType) > -1 && (
-        <section>
-          <PptPreviewer className={className} url={url} />
-        </section>
-      )}
-      {['xlsx'].indexOf(fileType) > -1 && (
-        <section>
-          <ExcelCsvPreviewer className={className} url={url} />
-        </section>
-      )}
-      {['csv'].indexOf(fileType) > -1 && (
-        <section>
-          <CSVFileViewer className={className} url={url} />
-        </section>
-      )}
-    </>
-  );
-};
-export default memo(Preview);

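Note on the dispatcher deleted above: the same component now lives at '@/components/document-preview' (the default import used by the pages in this commit). A minimal usage sketch, assuming the relocated component keeps the PreviewProps & Partial<IProps> contract from the deleted source; the wrapper component is illustrative only:

    // Illustrative sketch, not code from the commit.
    import DocumentPreview from '@/components/document-preview';

    function Example({ url }: { url: string }) {
      // Non-PDF types need only fileType and url; 'pdf' additionally requires
      // highlights and setWidthAndHeight, otherwise its branch renders nothing.
      return <DocumentPreview fileType="docx" url={url} className="h-full" />;
    }
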
@@ -1,127 +0,0 @@
-import { memo, useEffect, useRef } from 'react';
-import {
-  AreaHighlight,
-  Highlight,
-  IHighlight,
-  PdfHighlighter,
-  PdfLoader,
-  Popup,
-} from 'react-pdf-highlighter';
-
-import { useCatchDocumentError } from '@/components/pdf-previewer/hooks';
-import { Spin } from '@/components/ui/spin';
-import FileError from '@/pages/document-viewer/file-error';
-import styles from './index.less';
-
-export interface IProps {
-  highlights: IHighlight[];
-  setWidthAndHeight: (width: number, height: number) => void;
-  url: string;
-}
-const HighlightPopup = ({
-  comment,
-}: {
-  comment: { text: string; emoji: string };
-}) =>
-  comment.text ? (
-    <div className="Highlight__popup">
-      {comment.emoji} {comment.text}
-    </div>
-  ) : null;
-
-// TODO: merge with DocumentPreviewer
-const PdfPreview = ({ highlights: state, setWidthAndHeight, url }: IProps) => {
-  // const url = useGetDocumentUrl();
-
-  const ref = useRef<(highlight: IHighlight) => void>(() => {});
-  const error = useCatchDocumentError(url);
-
-  const resetHash = () => {};
-
-  useEffect(() => {
-    if (state.length > 0) {
-      ref?.current(state[0]);
-    }
-  }, [state]);
-
-  return (
-    <div
-      className={`${styles.documentContainer} rounded-[10px] overflow-hidden `}
-    >
-      <PdfLoader
-        url={url}
-        beforeLoad={
-          <div className="absolute inset-0 flex items-center justify-center">
-            <Spin />
-          </div>
-        }
-        workerSrc="/pdfjs-dist/pdf.worker.min.js"
-        errorMessage={<FileError>{error}</FileError>}
-      >
-        {(pdfDocument) => {
-          pdfDocument.getPage(1).then((page) => {
-            const viewport = page.getViewport({ scale: 1 });
-            const width = viewport.width;
-            const height = viewport.height;
-            setWidthAndHeight(width, height);
-          });
-
-          return (
-            <PdfHighlighter
-              pdfDocument={pdfDocument}
-              enableAreaSelection={(event) => event.altKey}
-              onScrollChange={resetHash}
-              scrollRef={(scrollTo) => {
-                ref.current = scrollTo;
-              }}
-              onSelectionFinished={() => null}
-              highlightTransform={(
-                highlight,
-                index,
-                setTip,
-                hideTip,
-                viewportToScaled,
-                screenshot,
-                isScrolledTo,
-              ) => {
-                const isTextHighlight = !Boolean(
-                  highlight.content && highlight.content.image,
-                );
-
-                const component = isTextHighlight ? (
-                  <Highlight
-                    isScrolledTo={isScrolledTo}
-                    position={highlight.position}
-                    comment={highlight.comment}
-                  />
-                ) : (
-                  <AreaHighlight
-                    isScrolledTo={isScrolledTo}
-                    highlight={highlight}
-                    onChange={() => {}}
-                  />
-                );
-
-                return (
-                  <Popup
-                    popupContent={<HighlightPopup {...highlight} />}
-                    onMouseOver={(popupContent) =>
-                      setTip(highlight, () => popupContent)
-                    }
-                    onMouseOut={hideTip}
-                    key={index}
-                  >
-                    {component}
-                  </Popup>
-                );
-              }}
-              highlights={state}
-            />
-          );
-        }}
-      </PdfLoader>
-    </div>
-  );
-};
-
-export default memo(PdfPreview);

@@ -1,70 +0,0 @@
-import message from '@/components/ui/message';
-import request from '@/utils/request';
-import classNames from 'classnames';
-import { init } from 'pptx-preview';
-import { useEffect, useRef } from 'react';
-interface PptPreviewerProps {
-  className?: string;
-  url: string;
-}
-
-export const PptPreviewer: React.FC<PptPreviewerProps> = ({
-  className,
-  url,
-}) => {
-  // const url = useGetDocumentUrl();
-  const wrapper = useRef<HTMLDivElement>(null);
-  const containerRef = useRef<HTMLDivElement>(null);
-  const fetchDocument = async () => {
-    const res = await request(url, {
-      method: 'GET',
-      responseType: 'blob',
-      onError: () => {
-        message.error('Document parsing failed');
-        console.error('Error loading document:', url);
-      },
-    });
-    console.log(res);
-    try {
-      const arrayBuffer = await res.data.arrayBuffer();
-
-      if (containerRef.current) {
-        let width = 500;
-        let height = 900;
-        if (containerRef.current) {
-          width = containerRef.current.clientWidth - 50;
-          height = containerRef.current.clientHeight - 50;
-        }
-        let pptxPrviewer = init(containerRef.current, {
-          width: width,
-          height: height,
-        });
-        pptxPrviewer.preview(arrayBuffer);
-      }
-    } catch (err) {
-      message.error('ppt parse failed');
-    }
-  };
-
-  useEffect(() => {
-    if (url) {
-      fetchDocument();
-    }
-  }, [url]);
-
-  return (
-    <div
-      ref={containerRef}
-      className={classNames(
-        'relative w-full h-full p-4 bg-background-paper border border-border-normal rounded-md ppt-previewer',
-        className,
-      )}
-    >
-      <div className="overflow-auto p-2">
-        <div className="flex flex-col gap-4">
-          <div ref={wrapper} />
-        </div>
-      </div>
-    </div>
-  );
-};

@@ -1,56 +0,0 @@
-import message from '@/components/ui/message';
-import { Spin } from '@/components/ui/spin';
-import request from '@/utils/request';
-import classNames from 'classnames';
-import { useEffect, useState } from 'react';
-
-type TxtPreviewerProps = { className?: string; url: string };
-export const TxtPreviewer = ({ className, url }: TxtPreviewerProps) => {
-  // const url = useGetDocumentUrl();
-  const [loading, setLoading] = useState(false);
-  const [data, setData] = useState<string>('');
-  const fetchTxt = async () => {
-    setLoading(true);
-    const res = await request(url, {
-      method: 'GET',
-      responseType: 'blob',
-      onError: (err: any) => {
-        message.error('Failed to load file');
-        console.error('Error loading file:', err);
-      },
-    });
-    // blob to string
-    const reader = new FileReader();
-    reader.readAsText(res.data);
-    reader.onload = () => {
-      setData(reader.result as string);
-      setLoading(false);
-      console.log('file loaded successfully', reader.result);
-    };
-    console.log('file data:', res);
-  };
-  useEffect(() => {
-    if (url) {
-      fetchTxt();
-    } else {
-      setLoading(false);
-      setData('');
-    }
-  }, [url]);
-  return (
-    <div
-      className={classNames(
-        'relative w-full h-full p-4 bg-background-paper border border-border-normal rounded-md',
-        className,
-      )}
-    >
-      {loading && (
-        <div className="absolute inset-0 flex items-center justify-center">
-          <Spin />
-        </div>
-      )}
-
-      {!loading && <pre className="whitespace-pre-wrap p-2 ">{data}</pre>}
-    </div>
-  );
-};

@@ -1,7 +1,7 @@
+import DocumentPreview from '@/components/document-preview';
 import { useFetchNextChunkList } from '@/hooks/use-chunk-request';
 import { useMemo, useState } from 'react';
 import { useTranslation } from 'react-i18next';
-import DocumentPreview from './components/document-preview';
 import {
   useFetchPipelineFileLogDetail,
   useFetchPipelineResult,
@@ -13,8 +13,9 @@ import {
   useTimelineDataFlow,
 } from './hooks';

-import DocumentHeader from './components/document-preview/document-header';
+import DocumentHeader from '@/components/document-preview/document-header';

+import { useGetDocumentUrl } from '@/components/document-preview/hooks';
 import { TimelineNode } from '@/components/originui/timeline';
 import { PageHeader } from '@/components/page-header';
 import Spotlight from '@/components/spotlight';
@@ -32,7 +33,6 @@ import { AgentCategory } from '@/constants/agent';
 import { Images } from '@/constants/common';
 import { useNavigatePage } from '@/hooks/logic-hooks/navigate-hooks';
 import { useGetKnowledgeSearchParams } from '@/hooks/route-hook';
-import { useGetDocumentUrl } from './components/document-preview/hooks';
 import TimelineDataFlow from './components/time-line';
 import { TimelineNodeType } from './constant';
 import styles from './index.less';
@@ -76,13 +76,14 @@ const Chunk = () => {
   const fileType = useMemo(() => {
     if (isAgent) {
       return Images.some((x) => x === documentExtension)
-        ? 'visual'
+        ? documentInfo?.name.split('.').pop() || 'visual'
         : documentExtension;
     }
     switch (documentInfo?.type) {
       case 'doc':
         return documentInfo?.name.split('.').pop() || 'doc';
       case 'visual':
+        return documentInfo?.name.split('.').pop() || 'visual';
       case 'docx':
       case 'txt':
       case 'md':

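Note on the fileType change above: for agent sources and for 'visual' documents, the memo now derives the concrete extension from the file name instead of collapsing to the generic 'visual' bucket, so the preview dispatcher can pick an exact viewer. With hypothetical file names:

    // Hypothetical inputs, for illustration of the new branch:
    'scan.png'.split('.').pop();            // -> 'png' (previously bucketed as 'visual')
    'slides.final.pptx'.split('.').pop();   // -> 'pptx' (the last dot segment wins)
    ''.split('.').pop() || 'visual';        // -> 'visual' (empty string falls back)
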
@@ -1,282 +0,0 @@
-// Copyright (c) 2017 PlanGrid, Inc.
-
-.docxViewerWrapper {
-  overflow-y: scroll;
-  height: 100%;
-  width: 100%;
-
-  .box {
-    width: 100%;
-    height: 100%;
-  }
-
-  :global(.document-container) {
-    padding: 30px;
-    width: 700px;
-    background: rgba(255, 255, 255, 0.1);
-
-    margin: auto;
-  }
-
-  html,
-  address,
-  blockquote,
-  body,
-  dd,
-  div,
-  dl,
-  dt,
-  fieldset,
-  form,
-  frame,
-  frameset,
-  h1,
-  h2,
-  h3,
-  h4,
-  h5,
-  h6,
-  noframes,
-  ol,
-  p,
-  ul,
-  center,
-  dir,
-  hr,
-  menu,
-  pre {
-    display: block;
-    unicode-bidi: embed;
-  }
-  li {
-    display: list-item;
-    list-style-type: disc;
-  }
-  head {
-    display: none;
-  }
-  table {
-    display: table;
-  }
-  img {
-    width: 100%;
-  }
-  tr {
-    display: table-row;
-  }
-  thead {
-    display: table-header-group;
-  }
-  tbody {
-    display: table-row-group;
-  }
-  tfoot {
-    display: table-footer-group;
-  }
-  col {
-    display: table-column;
-  }
-  colgroup {
-    display: table-column-group;
-  }
-  th {
-    display: table-cell;
-  }
-  td {
-    display: table-cell;
-    border-bottom: 1px solid #ccc;
-    border-right: 1px solid #ccc;
-    padding: 0.2em 0.5em;
-  }
-  caption {
-    display: table-caption;
-  }
-  th {
-    font-weight: bolder;
-    text-align: center;
-  }
-  caption {
-    text-align: center;
-  }
-  body {
-    margin: 8px;
-  }
-  h1 {
-    font-size: 2em;
-    margin: 0.67em 0;
-  }
-  h2 {
-    font-size: 1.5em;
-    margin: 0.75em 0;
-  }
-  h3 {
-    font-size: 1.17em;
-    margin: 0.83em 0;
-  }
-  h4,
-  p,
-  blockquote,
-  ul,
-  fieldset,
-  form,
-  ol,
-  dl,
-  dir,
-  menu {
-    margin: 1.12em 0;
-  }
-  h5 {
-    font-size: 0.83em;
-    margin: 1.5em 0;
-  }
-  h6 {
-    font-size: 0.75em;
-    margin: 1.67em 0;
-  }
-  h1,
-  h2,
-  h3,
-  h4,
-  h5,
-  h6,
-  b,
-  strong {
-    font-weight: bolder;
-  }
-  blockquote {
-    margin-left: 40px;
-    margin-right: 40px;
-  }
-  i,
-  cite,
-  em,
-  var,
-  address {
-    font-style: italic;
-  }
-  pre,
-  tt,
-  code,
-  kbd,
-  samp {
-    font-family: monospace;
-  }
-  pre {
-    white-space: pre;
-  }
-  button,
-  textarea,
-  input,
-  select {
-    display: inline-block;
-  }
-  big {
-    font-size: 1.17em;
-  }
-  small,
-  sub,
-  sup {
-    font-size: 0.83em;
-  }
-  sub {
-    vertical-align: sub;
-  }
-  sup {
-    vertical-align: super;
-  }
-  table {
-    border-spacing: 2px;
-  }
-  thead,
-  tbody,
-  tfoot {
-    vertical-align: middle;
-  }
-  td,
-  th,
-  tr {
-    vertical-align: inherit;
-  }
-  s,
-  strike,
-  del {
-    text-decoration: line-through;
-  }
-  hr {
-    border: 1px inset;
-  }
-  ol,
-  ul,
-  dir,
-  menu,
-  dd {
-    margin-left: 40px;
-  }
-  ol {
-    list-style-type: decimal;
-  }
-  ol ul,
-  ol ul,
-  ul ol,
-  ul ol,
-  ul ul,
-  ul ul,
-  ol ol,
-  ol ol {
-    margin-top: 0;
-    margin-bottom: 0;
-  }
-  u,
-  ins {
-    text-decoration: underline;
-  }
-  br:before {
-    content: '\A';
-    white-space: pre-line;
-  }
-  center {
-    text-align: center;
-  }
-  :link,
-  :visited {
-    text-decoration: underline;
-  }
-  :focus {
-    outline: thin dotted invert;
-  }
-  /* Begin bidirectionality settings (do not change) */
-  BDO[DIR='ltr'] {
-    direction: ltr;
-    unicode-bidi: bidi-override;
-  }
-  BDO[DIR='rtl'] {
-    direction: rtl;
-    unicode-bidi: bidi-override;
-  }
-  *[DIR='ltr'] {
-    direction: ltr;
-    unicode-bidi: embed;
-  }
-  *[DIR='rtl'] {
-    direction: rtl;
-    unicode-bidi: embed;
-  }
-  @media print {
-    h1 {
-      page-break-before: always;
-    }
-    h1,
-    h2,
-    h3,
-    h4,
-    h5,
-    h6 {
-      page-break-after: avoid;
-    }
-    ul,
-    ol,
-    dl {
-      page-break-before: avoid;
-    }
-  }
-}

@@ -1,25 +0,0 @@
-import { Spin } from 'antd';
-import FileError from '../file-error';
-
-import { useFetchDocx } from '../hooks';
-import styles from './index.less';
-
-const Docx = ({ filePath }: { filePath: string }) => {
-  const { succeed, containerRef, error } = useFetchDocx(filePath);
-
-  return (
-    <>
-      {succeed ? (
-        <section className={styles.docxViewerWrapper}>
-          <div id="docx" ref={containerRef} className={styles.box}>
-            <Spin />
-          </div>
-        </section>
-      ) : (
-        <FileError>{error}</FileError>
-      )}
-    </>
-  );
-};
-
-export default Docx;

@@ -1,19 +0,0 @@
-import '@js-preview/excel/lib/index.css';
-import FileError from '../file-error';
-import { useFetchExcel } from '../hooks';
-
-const Excel = ({ filePath }: { filePath: string }) => {
-  const { status, containerRef, error } = useFetchExcel(filePath);
-
-  return (
-    <div
-      id="excel"
-      ref={containerRef}
-      style={{ height: '100%', width: '100%' }}
-    >
-      {status || <FileError>{error}</FileError>}
-    </div>
-  );
-};
-
-export default Excel;

@@ -1,4 +0,0 @@
-.errorWrapper {
-  width: 100%;
-  height: 100%;
-}

@@ -1,18 +1,18 @@
-import { Alert, Flex } from 'antd';
-
 import { useTranslate } from '@/hooks/common-hooks';
 import React from 'react';
-import styles from './index.less';

 const FileError = ({ children }: React.PropsWithChildren) => {
   const { t } = useTranslate('fileManager');
   return (
-    <Flex align="center" justify="center" className={styles.errorWrapper}>
-      <Alert
-        type="error"
-        message={<h2>{children || t('fileError')}</h2>}
-      ></Alert>
-    </Flex>
+    <div className="flex items-center justify-center min-h-screen">
+      <div className="bg-state-error-5 border border-state-error rounded-lg p-4 shadow-sm">
+        <div className="flex ml-3">
+          <div className="text-white font-medium">
+            {children || t('fileError')}
+          </div>
+        </div>
+      </div>
+    </div>
   );
 };

@@ -1,16 +1,22 @@
 import { Images } from '@/constants/common';
 import { api_host } from '@/utils/api';
-import { Flex } from 'antd';
+// import { Flex } from 'antd';
 import { useParams, useSearchParams } from 'umi';
-import Docx from './docx';
-import Excel from './excel';
-import Image from './image';
-import Md from './md';
-import Pdf from './pdf';
-import Text from './text';
+// import Docx from './docx';
+// import Excel from './excel';
+// import Image from './image';
+// import Md from './md';
+// import Pdf from './pdf';
+// import Text from './text';

+import { DocPreviewer } from '@/components/document-preview/doc-preview';
+import { ExcelCsvPreviewer } from '@/components/document-preview/excel-preview';
+import { ImagePreviewer } from '@/components/document-preview/image-preview';
+import Md from '@/components/document-preview/md';
+import PdfPreview from '@/components/document-preview/pdf-preview';
+import { TxtPreviewer } from '@/components/document-preview/txt-preview';
 import { previewHtmlFile } from '@/utils/file-util';
-import styles from './index.less';
+// import styles from './index.less';

 // TODO: The interface returns an incorrect content-type for the SVG.
@@ -20,6 +26,7 @@ const DocumentViewer = () => {
   const ext = currentQueryParameters.get('ext');
   const prefix = currentQueryParameters.get('prefix');
   const api = `${api_host}/${prefix || 'file'}/get/${documentId}`;
+  // request.head

   if (ext === 'html' && documentId) {
     previewHtmlFile(documentId);
@@ -27,19 +34,24 @@ const DocumentViewer = () => {
   }

   return (
-    <section className={styles.viewerWrapper}>
+    <section className="w-full h-full">
       {Images.includes(ext!) && (
-        <Flex className={styles.image} align="center" justify="center">
-          <Image src={api} preview={false}></Image>
-        </Flex>
+        <div className="flex w-full h-full items-center justify-center">
+          {/* <Image src={api} preview={false}></Image> */}
+          <ImagePreviewer className="w-full !h-dvh p-5" url={api} />
+        </div>
       )}
-      {ext === 'md' && <Md filePath={api}></Md>}
-      {ext === 'txt' && <Text filePath={api}></Text>}
+      {ext === 'md' && <Md url={api} className="!h-dvh p-5"></Md>}
+      {ext === 'txt' && <TxtPreviewer url={api}></TxtPreviewer>}

-      {ext === 'pdf' && <Pdf url={api}></Pdf>}
-      {(ext === 'xlsx' || ext === 'xls') && <Excel filePath={api}></Excel>}
+      {ext === 'pdf' && (
+        <PdfPreview url={api} className="!h-dvh p-5"></PdfPreview>
+      )}
+      {(ext === 'xlsx' || ext === 'xls') && (
+        <ExcelCsvPreviewer url={api}></ExcelCsvPreviewer>
+      )}

-      {ext === 'docx' && <Docx filePath={api}></Docx>}
+      {ext === 'docx' && <DocPreviewer url={api}></DocPreviewer>}
     </section>
   );
 };

@@ -1,32 +0,0 @@
-import React, { useEffect, useState } from 'react';
-import FileError from '../file-error';
-
-interface TxtProps {
-  filePath: string;
-}
-
-const Md: React.FC<TxtProps> = ({ filePath }) => {
-  const [content, setContent] = useState<string>('');
-  const [error, setError] = useState<string | null>(null);
-
-  useEffect(() => {
-    setError(null);
-    fetch(filePath)
-      .then((res) => {
-        if (!res.ok) throw new Error('Failed to fetch text file');
-        return res.text();
-      })
-      .then((text) => setContent(text))
-      .catch((err) => setError(err.message));
-  }, [filePath]);
-
-  if (error) return <FileError>{error}</FileError>;
-
-  return (
-    <div style={{ padding: 24, height: '100vh', overflow: 'scroll' }}>
-      {content}
-    </div>
-  );
-};
-
-export default Md;

@@ -1,3 +1,4 @@
+import DocumentPreview from '@/components/document-preview';
 import { FileIcon } from '@/components/icon-font';
 import { Modal } from '@/components/ui/modal/modal';
 import {
@@ -7,7 +8,6 @@ import {
 import { IModalProps } from '@/interfaces/common';
 import { IReferenceChunk } from '@/interfaces/database/chat';
 import { IChunk } from '@/interfaces/database/knowledge';
-import DocumentPreview from '@/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/document-preview';
 import { useEffect, useState } from 'react';

 interface IProps extends IModalProps<any> {

@@ -45,21 +45,23 @@ export const useListDataSource = () => {

   const updatedDataSourceTemplates = useMemo(() => {
     const categorizedData = categorizeDataBySource(list || []);
-    let sourcelist: Array<IDataSorceInfo & { list: Array<IDataSourceBase> }> =
+    let sourceList: Array<IDataSorceInfo & { list: Array<IDataSourceBase> }> =
       [];
     Object.keys(categorizedData).forEach((key: string) => {
       const k = key as DataSourceKey;
-      sourcelist.push({
-        id: k,
-        name: DataSourceInfo[k].name,
-        description: DataSourceInfo[k].description,
-        icon: DataSourceInfo[k].icon,
-        list: categorizedData[k] || [],
-      });
+      if (DataSourceInfo[k]) {
+        sourceList.push({
+          id: k,
+          name: DataSourceInfo[k].name,
+          description: DataSourceInfo[k].description,
+          icon: DataSourceInfo[k].icon,
+          list: categorizedData[k] || [],
+        });
+      }
     });

-    console.log('🚀 ~ useListDataSource ~ sourcelist:', sourcelist);
-    return sourcelist;
+    console.log('🚀 ~ useListDataSource ~ sourceList:', sourceList);
+    return sourceList;
   }, [list]);

   return { list, categorizedList: updatedDataSourceTemplates, isFetching };

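Note on the guard added above: Object.keys(categorizedData) can yield categories with no entry in DataSourceInfo, and the old code would then read .name of undefined and throw; the new if (DataSourceInfo[k]) check skips unknown keys instead. A self-contained sketch with hypothetical minimal shapes, not the real types:

    // Illustrative sketch, not code from the commit.
    const DataSourceInfo: Record<string, { name: string }> = { s3: { name: 'S3' } };
    const categorizedData: Record<string, unknown[]> = { s3: [], webdav: [] };

    const sourceList = Object.keys(categorizedData)
      .filter((k) => Boolean(DataSourceInfo[k])) // the guard: unknown keys are skipped
      .map((k) => ({ id: k, name: DataSourceInfo[k].name }));
    // -> [{ id: 's3', name: 'S3' }]; without the filter, 'webdav' would throw a TypeError.
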