Synchronize with enterprise version (#4325)

### Type of change

- [x] Refactoring
This commit is contained in:
Yingfeng
2025-01-02 13:44:44 +08:00
committed by GitHub
parent 564277736a
commit 50f209204e
6 changed files with 94 additions and 69 deletions

View File

@@ -256,7 +256,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
return res
elif re.search(r"\.docx$", filename, re.IGNORECASE):
elif re.search(r"\.docx?$", filename, re.IGNORECASE):
docx_parser = Docx()
ti_list, tbls = docx_parser(filename, binary,
from_page=0, to_page=10000, callback=callback)
@@ -279,4 +279,4 @@ if __name__ == "__main__":
pass
chunk(sys.argv[1], callback=dummy)
chunk(sys.argv[1], callback=dummy)