Refine some document APIs. (#53)

Add naive chunking method to RAG
This commit is contained in:
KevinHuSh
2024-02-02 19:21:37 +08:00
committed by GitHub
parent 7b71fb2db6
commit 51482f3e2a
13 changed files with 447 additions and 268 deletions

View File

@@ -1,7 +1,7 @@
import copy
import re
from collections import Counter
from rag.app import tokenize
from rag.parser import tokenize
from rag.nlp import huqie
from rag.parser.pdf_parser import HuParser
import numpy as np
@@ -113,7 +113,7 @@ class Pdf(HuParser):
}
def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
pdf_parser = None
paper = {}
@@ -232,5 +232,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
if __name__ == "__main__":
import sys
chunk(sys.argv[1])
def dummy(a, b):
pass
chunk(sys.argv[1], callback=dummy)