init README of deepdoc, add picture processor. (#71)

* init README of deepdoc, add picture processor.

* add resume parsing
This commit is contained in:
KevinHuSh
2024-02-23 18:28:12 +08:00
committed by GitHub
parent d32322c081
commit 7fd1eca582
42 changed files with 58319 additions and 350 deletions

View File

@@ -13,12 +13,18 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import io
from abc import ABC
from PIL import Image
from openai import OpenAI
import os
import base64
from io import BytesIO
from api.utils import get_uuid
from api.utils.file_utils import get_project_base_directory
class Base(ABC):
def __init__(self, key, model_name):
@@ -44,25 +50,26 @@ class Base(ABC):
{
"role": "user",
"content": [
{
"type": "text",
"text": "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等。",
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{b64}"
},
},
{
"text": "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等,如果有数据请提取出数据。" if self.lang.lower() == "chinese" else \
"Please describe the content of this picture, like where, when, who, what happen. If it has number data, please extract them out.",
},
],
}
]
class GptV4(Base):
def __init__(self, key, model_name="gpt-4-vision-preview"):
def __init__(self, key, model_name="gpt-4-vision-preview", lang="Chinese"):
self.client = OpenAI(api_key=key)
self.model_name = model_name
self.lang = lang
def describe(self, image, max_tokens=300):
b64 = self.image2base64(image)
@@ -76,18 +83,40 @@ class GptV4(Base):
class QWenCV(Base):
    """Image captioner backed by Alibaba DashScope's Qwen-VL multimodal chat models.

    Produces a natural-language description of an image, in Chinese or
    English depending on ``lang`` (presumably set from user/tenant config
    by the caller — confirm against the factory that instantiates this).
    """

    def __init__(self, key, model_name="qwen-vl-chat-v1", lang="Chinese"):
        # Imported lazily so this module stays importable when the
        # optional dashscope SDK is not installed.
        import dashscope
        dashscope.api_key = key
        self.model_name = model_name
        # Selects the prompt language in prompt(): "chinese" (case-insensitive)
        # yields the Chinese instruction, anything else the English one.
        self.lang = lang

    def prompt(self, binary):
        """Build the DashScope multimodal message list for raw image bytes.

        DashScope's MultiModalConversation API expects a local ``file://``
        URL rather than inline bytes, so the image is first written to a
        temp file under the project tmp directory.  (Original author's
        note: "stupid as hell".)

        NOTE(review): the temp file is never deleted — this leaks one
        .jpg per call; consider cleanup in describe() after the API call.
        """
        tmp_dir = get_project_base_directory("tmp")
        # makedirs(exist_ok=True) avoids the exists()/mkdir() TOCTOU race
        # when several workers hit this concurrently.
        os.makedirs(tmp_dir, exist_ok=True)
        path = os.path.join(tmp_dir, "%s.jpg" % get_uuid())
        Image.open(io.BytesIO(binary)).save(path)
        return [
            {
                "role": "user",
                "content": [
                    {
                        "image": f"file://{path}"
                    },
                    {
                        "text": "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等,如果有数据请提取出数据。" if self.lang.lower() == "chinese" else
                        "Please describe the content of this picture, like where, when, who, what happen. If it has number data, please extract them out.",
                    },
                ],
            }
        ]

    def describe(self, image, max_tokens=300):
        """Describe ``image`` (raw bytes) via Qwen-VL.

        Returns a ``(text, token_count)`` tuple on success, or
        ``(error_message, 0)`` when the API call fails.  ``max_tokens``
        is accepted for interface parity with the other vision models
        but is not forwarded to DashScope.
        """
        from http import HTTPStatus
        from dashscope import MultiModalConversation
        # prompt() takes the raw bytes directly — no base64 round-trip,
        # since DashScope reads the image from a file:// URL.
        response = MultiModalConversation.call(model=self.model_name,
                                               messages=self.prompt(image))
        if response.status_code == HTTPStatus.OK:
            # content is a list of parts; the first part holds the text.
            return response.output.choices[0]['message']['content'][0]["text"], response.usage.output_tokens
        return response.message, 0
@@ -95,9 +124,10 @@ from zhipuai import ZhipuAI
class Zhipu4V(Base):
def __init__(self, key, model_name="glm-4v"):
def __init__(self, key, model_name="glm-4v", lang="Chinese"):
self.client = ZhipuAI(api_key=key)
self.model_name = model_name
self.lang = lang
def describe(self, image, max_tokens=1024):
b64 = self.image2base64(image)