mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 12:32:30 +08:00
Feat: detect docx support via header-byte inspection (#11731)
## What problem does this PR solve? Feat: detect docx support via header-byte inspection, a further optimization based on #11684. Not all files with a .doc extension are truly in the legacy .doc format; some are internally valid .docx documents. The previous implementation relied on URL suffix checks, which misclassified these cases and was therefore unreliable. A .doc file that can be previewed: [en2zh.doc](https://github.com/user-attachments/files/23921131/en2zh.doc) A .doc file that cannot be previewed: [file-sample_100kB.doc](https://github.com/user-attachments/files/23921134/file-sample_100kB.doc) ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@ -10,6 +10,10 @@ interface DocPreviewerProps {
|
|||||||
url: string;
|
url: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Word document preview component. Behavior:
|
||||||
|
// 1) Fetches the document as a Blob.
|
||||||
|
// 2) Detects .docx input via a ZIP header probe.
|
||||||
|
// 3) Renders .docx using Mammoth; presents a controlled "unsupported" notice for non-ZIP payloads.
|
||||||
export const DocPreviewer: React.FC<DocPreviewerProps> = ({
|
export const DocPreviewer: React.FC<DocPreviewerProps> = ({
|
||||||
className,
|
className,
|
||||||
url,
|
url,
|
||||||
@ -17,6 +21,33 @@ export const DocPreviewer: React.FC<DocPreviewerProps> = ({
|
|||||||
const [htmlContent, setHtmlContent] = useState<string>('');
|
const [htmlContent, setHtmlContent] = useState<string>('');
|
||||||
const [loading, setLoading] = useState(false);
|
const [loading, setLoading] = useState(false);
|
||||||
|
|
||||||
|
/**
 * Determines whether the Blob looks like a .docx document by inspecting its
 * leading bytes rather than trusting the filename or reported extension.
 *
 * A valid .docx file is a ZIP container and begins with the ZIP local file
 * header signature:
 *   50 4B 03 04 ("PK\x03\x04")
 *
 * Legacy .doc files use the CFBF binary format, commonly starting with:
 *   D0 CF 11 E0 A1 B1 1A E1
 *
 * Some files distributed with a ".doc" extension are internally .docx
 * documents (e.g., renamed files or files exported as .docx content under a
 * .doc filename). They still carry the ZIP signature and are therefore
 * treated as supported .docx payloads.
 *
 * All four signature bytes are compared. Matching only "PK" would also accept
 * payloads that cannot be a .docx package, such as an empty archive
 * ("PK\x05\x06") or arbitrary data that merely starts with the letters "PK".
 *
 * @param blob - The fetched document payload to probe.
 * @returns `true` when the first four bytes equal the ZIP local file header
 *   signature; `false` otherwise, including when the header cannot be read.
 */
const isZipLikeBlob = async (blob: Blob): Promise<boolean> => {
  // ZIP local file header signature: "PK\x03\x04".
  const zipLocalFileHeader = [0x50, 0x4b, 0x03, 0x04];

  try {
    // Read only the bytes needed for the signature, not the whole payload.
    const headerSlice = blob.slice(0, zipLocalFileHeader.length);
    const bytes = new Uint8Array(await headerSlice.arrayBuffer());

    // A shorter payload (e.g. an empty Blob) cannot contain the signature.
    if (bytes.length < zipLocalFileHeader.length) {
      return false;
    }
    return zipLocalFileHeader.every((expected, i) => bytes[i] === expected);
  } catch (e) {
    // Best effort: an unreadable header is reported as "not a ZIP" so the
    // caller can fall back to its unsupported-format path.
    console.error('Failed to inspect blob header', e);
    return false;
  }
};
|
||||||
|
|
||||||
const fetchDocument = async () => {
|
const fetchDocument = async () => {
|
||||||
if (!url) return;
|
if (!url) return;
|
||||||
|
|
||||||
@ -36,24 +67,21 @@ export const DocPreviewer: React.FC<DocPreviewerProps> = ({
|
|||||||
const contentType: string =
|
const contentType: string =
|
||||||
blob.type || (res as any).headers?.['content-type'] || '';
|
blob.type || (res as any).headers?.['content-type'] || '';
|
||||||
|
|
||||||
// ---- Detect legacy .doc via MIME or URL ----
|
// Execution path selection: ZIP-like payloads are treated as .docx and rendered via Mammoth;
|
||||||
const cleanUrl = url.split(/[?#]/)[0].toLowerCase();
|
// non-ZIP payloads receive an explicit unsupported notice.
|
||||||
const isDocMime = /application\/msword/i.test(contentType);
|
const looksLikeZip = await isZipLikeBlob(blob);
|
||||||
const isLegacyDocByUrl =
|
|
||||||
cleanUrl.endsWith('.doc') && !cleanUrl.endsWith('.docx');
|
|
||||||
const isLegacyDoc = isDocMime || isLegacyDocByUrl;
|
|
||||||
|
|
||||||
if (isLegacyDoc) {
|
if (!looksLikeZip) {
|
||||||
// Do not call mammoth and do not throw an error; instead, show a note in the preview area
|
// Non-ZIP payload (likely legacy .doc or another format): skip Mammoth processing.
|
||||||
setHtmlContent(`
|
setHtmlContent(`
|
||||||
<div class="flex h-full items-center justify-center">
|
<div class="flex h-full items-center justify-center">
|
||||||
<div class="border border-dashed border-border-normal rounded-xl p-8 max-w-2xl text-center">
|
<div class="border border-dashed border-border-normal rounded-xl p-8 max-w-2xl text-center">
|
||||||
<p class="text-2xl font-bold mb-4">
|
<p class="text-2xl font-bold mb-4">
|
||||||
Preview not available for .doc files
|
Preview is not available for this Word document
|
||||||
</p>
|
</p>
|
||||||
<p class="italic text-sm text-muted-foreground leading-relaxed">
|
<p class="italic text-sm text-muted-foreground leading-relaxed">
|
||||||
Mammoth does not support <code>.doc</code> documents.<br/>
|
Mammoth supports modern <code>.docx</code> files only.<br/>
|
||||||
Inline preview is unavailable.
|
The file header does not indicate a <code>.docx</code> ZIP archive.
|
||||||
</p>
|
</p>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@ -61,7 +89,7 @@ export const DocPreviewer: React.FC<DocPreviewerProps> = ({
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---- Standard .docx preview path ----
|
// ZIP-like payload: parse as .docx with Mammoth
|
||||||
const arrayBuffer = await blob.arrayBuffer();
|
const arrayBuffer = await blob.arrayBuffer();
|
||||||
const result = await mammoth.convertToHtml(
|
const result = await mammoth.convertToHtml(
|
||||||
{ arrayBuffer },
|
{ arrayBuffer },
|
||||||
@ -74,8 +102,7 @@ export const DocPreviewer: React.FC<DocPreviewerProps> = ({
|
|||||||
|
|
||||||
setHtmlContent(styledContent);
|
setHtmlContent(styledContent);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
// Only errors from the mammoth conversion path should surface here
|
message.error('Failed to parse document.');
|
||||||
message.error('Document parsing failed');
|
|
||||||
console.error('Error parsing document:', err);
|
console.error('Error parsing document:', err);
|
||||||
} finally {
|
} finally {
|
||||||
setLoading(false);
|
setLoading(false);
|
||||||
|
|||||||
Reference in New Issue
Block a user