Add get_txt function (#2639)

### What problem does this PR solve?

Add a `get_txt` function to remove duplicated code.

### Type of change

- [x] Refactoring

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
This commit is contained in:
yqkcn
2024-09-29 10:29:56 +08:00
committed by GitHub
parent 57237634f1
commit aea553c3a8
8 changed files with 52 additions and 88 deletions

View File

@@ -10,28 +10,18 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from deepdoc.parser.utils import get_txt
from rag.nlp import num_tokens_from_string
from rag.nlp import find_codec,num_tokens_from_string
import re
class RAGFlowTxtParser:
def __call__(self, fnm, binary=None, chunk_token_num=128, delimiter="\n!?;。;!?"):
txt = ""
if binary:
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
else:
with open(fnm, "r") as f:
while True:
l = f.readline()
if not l:
break
txt += l
txt = get_txt(fnm, binary)
return self.parser_txt(txt, chunk_token_num, delimiter)
@classmethod
def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"):
if type(txt) != str:
if not isinstance(txt, str):
raise TypeError("txt type should be str!")
cks = [""]
tk_nums = [0]