init README of deepdoc, add picture processor. (#71)

* init README of deepdoc, add picture processor. * add resume parsing
2026-02-02 08:35:08 +08:00 · 2024-02-23 18:28:12 +08:00
parent d32322c081
commit 7fd1eca582
42 changed files with 58319 additions and 350 deletions
--- a/rag/app/paper.py
+++ b/rag/app/paper.py
@ -15,8 +15,7 @@ import re
 from collections import Counter

 from api.db import ParserType
-from deepdoc.parser import tokenize
-from rag.nlp import huqie
+from rag.nlp import huqie, tokenize
 from deepdoc.parser import PdfParser
 import numpy as np
 from rag.utils import num_tokens_from_string
@ -140,7 +139,7 @@ class Pdf(PdfParser):
        }


-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
    """
        Only pdf is supported.
        The abstract of the paper will be sliced as an entire chunk, and will not be sliced partly.
@ -156,7 +155,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
    doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
    # is it English
-    eng = pdf_parser.is_english
+    eng = lang.lower() == "english"#pdf_parser.is_english
    print("It's English.....", eng)

    res = []