Feat/parse img (#10112)

### What problem does this PR solve?

Support parsing images with OCR or a VLM (vision-language model).

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Lynn
2025-09-16 17:53:37 +08:00
committed by GitHub
parent 86f6da2f74
commit 152111fd9d
2 changed files with 62 additions and 7 deletions

View File

@ -12,10 +12,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import io
import logging import logging
import random import random
import trio import trio
import numpy as np
from PIL import Image
from api.db import LLMType from api.db import LLMType
from api.db.services.llm_service import LLMBundle from api.db.services.llm_service import LLMBundle
@ -43,7 +46,9 @@ class ParserParam(ProcessParamBase):
"json", "json",
], ],
"ppt": [], "ppt": [],
"image": [], "image": [
"text"
],
"email": [], "email": [],
"text": [ "text": [
"text", "text",
@ -56,7 +61,7 @@ class ParserParam(ProcessParamBase):
self.setups = { self.setups = {
"pdf": { "pdf": {
"parse_method": "deepdoc", # deepdoc/plain_text/vlm "parse_method": "deepdoc", # deepdoc/plain_text/vlm
"vlm_name": "", "llm_id": "",
"lang": "Chinese", "lang": "Chinese",
"suffix": [ "suffix": [
"pdf", "pdf",
@ -84,7 +89,11 @@ class ParserParam(ProcessParamBase):
}, },
"ppt": {}, "ppt": {},
"image": { "image": {
"parse_method": "ocr", "parse_method": ["ocr", "vlm"],
"llm_id": "",
"lang": "Chinese",
"suffix": ["jpg", "jpeg", "png", "gif"],
"output_format": "json",
}, },
"email": {}, "email": {},
"text": { "text": {
@ -104,7 +113,7 @@ class ParserParam(ProcessParamBase):
self.check_valid_value(pdf_parse_method.lower(), "Parse method abnormal.", ["deepdoc", "plain_text", "vlm"]) self.check_valid_value(pdf_parse_method.lower(), "Parse method abnormal.", ["deepdoc", "plain_text", "vlm"])
if pdf_parse_method not in ["deepdoc", "plain_text"]: if pdf_parse_method not in ["deepdoc", "plain_text"]:
self.check_empty(pdf_config.get("vlm_name"), "VLM") self.check_empty(pdf_config.get("llm_id"), "VLM")
pdf_language = pdf_config.get("lang", "") pdf_language = pdf_config.get("lang", "")
self.check_empty(pdf_language, "Language") self.check_empty(pdf_language, "Language")
@ -125,7 +134,12 @@ class ParserParam(ProcessParamBase):
image_config = self.setups.get("image", "") image_config = self.setups.get("image", "")
if image_config: if image_config:
image_parse_method = image_config.get("parse_method", "") image_parse_method = image_config.get("parse_method", "")
self.check_valid_value(image_parse_method.lower(), "Parse method abnormal.", ["ocr"]) self.check_valid_value(image_parse_method.lower(), "Parse method abnormal.", ["ocr", "vlm"])
if image_parse_method not in ["ocr"]:
self.check_empty(image_config.get("llm_id"), "VLM")
image_language = image_config.get("lang", "")
self.check_empty(image_language, "Language")
text_config = self.setups.get("text", "") text_config = self.setups.get("text", "")
if text_config: if text_config:
@ -152,8 +166,8 @@ class Parser(ProcessBase):
lines, _ = PlainParser()(blob) lines, _ = PlainParser()(blob)
bboxes = [{"text": t} for t, _ in lines] bboxes = [{"text": t} for t, _ in lines]
else: else:
assert conf.get("vlm_name") assert conf.get("llm_id")
vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("vlm_name"), lang=self._param.setups["pdf"].get("lang")) vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("llm_id"), lang=self._param.setups["pdf"].get("lang"))
lines, _ = VisionParser(vision_model=vision_model)(blob, callback=self.callback) lines, _ = VisionParser(vision_model=vision_model)(blob, callback=self.callback)
bboxes = [] bboxes = []
for t, poss in lines: for t, poss in lines:
@ -271,6 +285,34 @@ class Parser(ProcessBase):
result = text_content result = text_content
self.set_output("text", result) self.set_output("text", result)
def _image(self, from_upstream: ParserFromUpstream):
    """Extract text from an image using OCR or a vision-language model.

    The raw bytes in ``from_upstream.blob`` are decoded with Pillow and
    converted to RGB. Depending on the ``parse_method`` configured under
    ``setups["image"]``, the picture is either passed to OCR (recognised
    strings are joined with newlines) or re-encoded as JPEG and handed to
    the configured IMAGE2TEXT model for a description. The resulting string
    is published as the ``"text"`` output, and the configured
    ``output_format`` is published as the ``"output_format"`` output.

    Args:
        from_upstream: Upstream payload; only its ``blob`` attribute is read.
    """
    from deepdoc.vision import OCR

    self.callback(random.randint(1, 5) / 100.0, "Start to work on an image.")
    blob = from_upstream.blob
    conf = self._param.setups["image"]
    self.set_output("output_format", conf["output_format"])

    img = Image.open(io.BytesIO(blob)).convert("RGB")
    lang = conf["lang"]

    # Parameter validation lower-cases parse_method before checking it, so a
    # value such as "OCR" is accepted there. Normalise it the same way here;
    # otherwise it would silently fall through to the VLM branch.
    parse_method = conf["parse_method"]
    if isinstance(parse_method, str):
        parse_method = parse_method.lower()

    if parse_method == "ocr":
        # OCR path: recognise characters only.
        # NOTE(review): each result item is unpacked as (box, rec) with the
        # text at rec[0] -- confirm against deepdoc.vision.OCR.
        boxes = OCR()(np.array(img))
        txt = "\n".join(rec[0] for _, rec in boxes if rec[0])
    else:
        # VLM path: ask the image-to-text model to describe the picture.
        cv_model = LLMBundle(
            self._canvas.get_tenant_id(),
            LLMType.IMAGE2TEXT,
            llm_name=conf["llm_id"],
            lang=lang,
        )
        jpeg_buffer = io.BytesIO()
        img.save(jpeg_buffer, format="JPEG")
        # getvalue() returns the whole buffer regardless of stream position.
        txt = cv_model.describe(jpeg_buffer.getvalue())

    self.set_output("text", txt)
async def _invoke(self, **kwargs): async def _invoke(self, **kwargs):
function_map = { function_map = {
"pdf": self._pdf, "pdf": self._pdf,
@ -278,6 +320,7 @@ class Parser(ProcessBase):
"spreadsheet": self._spreadsheet, "spreadsheet": self._spreadsheet,
"word": self._word, "word": self._word,
"text": self._text, "text": self._text,
"image": self._image,
} }
try: try:
from_upstream = ParserFromUpstream.model_validate(kwargs) from_upstream = ParserFromUpstream.model_validate(kwargs)

View File

@ -48,6 +48,18 @@
"text": { "text": {
"suffix": ["txt"], "suffix": ["txt"],
"output_format": "json" "output_format": "json"
},
"image": {
"parse_method": "vlm",
"llm_id":"glm-4.5v",
"lang": "Chinese",
"suffix": [
"jpg",
"jpeg",
"png",
"gif"
],
"output_format": "text"
} }
} }
} }