mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Feat/parse img (#10112)
### What problem does this PR solve? Adds support for parsing images with either OCR or a VLM (vision-language model). ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -12,10 +12,13 @@
|
|||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
import io
|
||||||
import logging
|
import logging
|
||||||
import random
|
import random
|
||||||
|
|
||||||
import trio
|
import trio
|
||||||
|
import numpy as np
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
from api.db import LLMType
|
from api.db import LLMType
|
||||||
from api.db.services.llm_service import LLMBundle
|
from api.db.services.llm_service import LLMBundle
|
||||||
@ -43,7 +46,9 @@ class ParserParam(ProcessParamBase):
|
|||||||
"json",
|
"json",
|
||||||
],
|
],
|
||||||
"ppt": [],
|
"ppt": [],
|
||||||
"image": [],
|
"image": [
|
||||||
|
"text"
|
||||||
|
],
|
||||||
"email": [],
|
"email": [],
|
||||||
"text": [
|
"text": [
|
||||||
"text",
|
"text",
|
||||||
@ -56,7 +61,7 @@ class ParserParam(ProcessParamBase):
|
|||||||
self.setups = {
|
self.setups = {
|
||||||
"pdf": {
|
"pdf": {
|
||||||
"parse_method": "deepdoc", # deepdoc/plain_text/vlm
|
"parse_method": "deepdoc", # deepdoc/plain_text/vlm
|
||||||
"vlm_name": "",
|
"llm_id": "",
|
||||||
"lang": "Chinese",
|
"lang": "Chinese",
|
||||||
"suffix": [
|
"suffix": [
|
||||||
"pdf",
|
"pdf",
|
||||||
@ -84,7 +89,11 @@ class ParserParam(ProcessParamBase):
|
|||||||
},
|
},
|
||||||
"ppt": {},
|
"ppt": {},
|
||||||
"image": {
|
"image": {
|
||||||
"parse_method": "ocr",
|
"parse_method": ["ocr", "vlm"],
|
||||||
|
"llm_id": "",
|
||||||
|
"lang": "Chinese",
|
||||||
|
"suffix": ["jpg", "jpeg", "png", "gif"],
|
||||||
|
"output_format": "json",
|
||||||
},
|
},
|
||||||
"email": {},
|
"email": {},
|
||||||
"text": {
|
"text": {
|
||||||
@ -104,7 +113,7 @@ class ParserParam(ProcessParamBase):
|
|||||||
self.check_valid_value(pdf_parse_method.lower(), "Parse method abnormal.", ["deepdoc", "plain_text", "vlm"])
|
self.check_valid_value(pdf_parse_method.lower(), "Parse method abnormal.", ["deepdoc", "plain_text", "vlm"])
|
||||||
|
|
||||||
if pdf_parse_method not in ["deepdoc", "plain_text"]:
|
if pdf_parse_method not in ["deepdoc", "plain_text"]:
|
||||||
self.check_empty(pdf_config.get("vlm_name"), "VLM")
|
self.check_empty(pdf_config.get("llm_id"), "VLM")
|
||||||
|
|
||||||
pdf_language = pdf_config.get("lang", "")
|
pdf_language = pdf_config.get("lang", "")
|
||||||
self.check_empty(pdf_language, "Language")
|
self.check_empty(pdf_language, "Language")
|
||||||
@ -125,7 +134,12 @@ class ParserParam(ProcessParamBase):
|
|||||||
image_config = self.setups.get("image", "")
|
image_config = self.setups.get("image", "")
|
||||||
if image_config:
|
if image_config:
|
||||||
image_parse_method = image_config.get("parse_method", "")
|
image_parse_method = image_config.get("parse_method", "")
|
||||||
self.check_valid_value(image_parse_method.lower(), "Parse method abnormal.", ["ocr"])
|
self.check_valid_value(image_parse_method.lower(), "Parse method abnormal.", ["ocr", "vlm"])
|
||||||
|
if image_parse_method not in ["ocr"]:
|
||||||
|
self.check_empty(image_config.get("llm_id"), "VLM")
|
||||||
|
|
||||||
|
image_language = image_config.get("lang", "")
|
||||||
|
self.check_empty(image_language, "Language")
|
||||||
|
|
||||||
text_config = self.setups.get("text", "")
|
text_config = self.setups.get("text", "")
|
||||||
if text_config:
|
if text_config:
|
||||||
@ -152,8 +166,8 @@ class Parser(ProcessBase):
|
|||||||
lines, _ = PlainParser()(blob)
|
lines, _ = PlainParser()(blob)
|
||||||
bboxes = [{"text": t} for t, _ in lines]
|
bboxes = [{"text": t} for t, _ in lines]
|
||||||
else:
|
else:
|
||||||
assert conf.get("vlm_name")
|
assert conf.get("llm_id")
|
||||||
vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("vlm_name"), lang=self._param.setups["pdf"].get("lang"))
|
vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("llm_id"), lang=self._param.setups["pdf"].get("lang"))
|
||||||
lines, _ = VisionParser(vision_model=vision_model)(blob, callback=self.callback)
|
lines, _ = VisionParser(vision_model=vision_model)(blob, callback=self.callback)
|
||||||
bboxes = []
|
bboxes = []
|
||||||
for t, poss in lines:
|
for t, poss in lines:
|
||||||
@ -271,6 +285,34 @@ class Parser(ProcessBase):
|
|||||||
result = text_content
|
result = text_content
|
||||||
self.set_output("text", result)
|
self.set_output("text", result)
|
||||||
|
|
||||||
|
def _image(self, from_upstream: ParserFromUpstream):
    """Extract text from an image blob and publish it as the "text" output.

    The image is decoded from ``from_upstream.blob`` and converted to RGB.
    Depending on ``setups["image"]["parse_method"]`` the text comes from:

    * ``"ocr"`` -- deepdoc's OCR; the non-empty recognised strings are
      joined with newlines.
    * anything else -- the IMAGE2TEXT model named by ``llm_id`` is asked to
      describe a JPEG re-encoding of the picture.

    Args:
        from_upstream: Upstream payload; only its ``blob`` attribute (the raw
            image bytes) is read here.

    Side effects:
        Sets the "output_format" and "text" outputs and reports progress
        through ``self.callback``.
    """
    # Imported lazily, as in the original code, so the OCR stack is only
    # loaded when an image is actually parsed.
    from deepdoc.vision import OCR

    self.callback(random.randint(1, 5) / 100.0, "Start to work on an image.")

    blob = from_upstream.blob
    conf = self._param.setups["image"]
    self.set_output("output_format", conf["output_format"])

    # convert() returns an independent image, so the decoder handle opened by
    # Image.open() can be released as soon as the conversion is done.
    with Image.open(io.BytesIO(blob)) as raw_img:
        img = raw_img.convert("RGB")
    lang = conf["lang"]

    # Parameter validation lowercases the parse method before checking it, so
    # compare case-insensitively here too; otherwise a value such as "OCR"
    # passes validation but silently falls through to the VLM branch.
    parse_method = str(conf["parse_method"]).lower()

    if parse_method == "ocr":
        # use ocr, recognize chars only
        ocr = OCR()
        bxs = ocr(np.array(img))  # return boxes and recognize result
        txt = "\n".join([t[0] for _, t in bxs if t[0]])
    else:
        # use VLM to describe the picture
        cv_model = LLMBundle(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT, llm_name=conf["llm_id"], lang=lang)
        img_binary = io.BytesIO()
        img.save(img_binary, format="JPEG")
        # getvalue() returns the whole buffer regardless of the stream position.
        txt = cv_model.describe(img_binary.getvalue())

    self.set_output("text", txt)
|
||||||
|
|
||||||
async def _invoke(self, **kwargs):
|
async def _invoke(self, **kwargs):
|
||||||
function_map = {
|
function_map = {
|
||||||
"pdf": self._pdf,
|
"pdf": self._pdf,
|
||||||
@ -278,6 +320,7 @@ class Parser(ProcessBase):
|
|||||||
"spreadsheet": self._spreadsheet,
|
"spreadsheet": self._spreadsheet,
|
||||||
"word": self._word,
|
"word": self._word,
|
||||||
"text": self._text,
|
"text": self._text,
|
||||||
|
"image": self._image,
|
||||||
}
|
}
|
||||||
try:
|
try:
|
||||||
from_upstream = ParserFromUpstream.model_validate(kwargs)
|
from_upstream = ParserFromUpstream.model_validate(kwargs)
|
||||||
|
|||||||
@ -48,6 +48,18 @@
|
|||||||
"text": {
|
"text": {
|
||||||
"suffix": ["txt"],
|
"suffix": ["txt"],
|
||||||
"output_format": "json"
|
"output_format": "json"
|
||||||
|
},
|
||||||
|
"image": {
|
||||||
|
"parse_method": "vlm",
|
||||||
|
"llm_id":"glm-4.5v",
|
||||||
|
"lang": "Chinese",
|
||||||
|
"suffix": [
|
||||||
|
"jpg",
|
||||||
|
"jpeg",
|
||||||
|
"png",
|
||||||
|
"gif"
|
||||||
|
],
|
||||||
|
"output_format": "text"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user