Specify img2text model by tag (#5063)

### What problem does this PR solve?

The current design is not well suited to multimodal models, because each model can be configured for only a single purpose: either chat or Img2txt. To work around this limitation, we use model aliases such as `gpt-4o-mini` and `gpt-4o-mini-2024-07-18`. To fix this, this PR allows the Img2txt model to be specified by tag instead of by `model_type`.

### Type of change

- [x] Refactoring
2026-01-04 03:25:30 +08:00 · 2025-02-18 11:14:48 +08:00
parent 224c5472c8
commit 4694604836
2 changed files with 38 additions and 15 deletions
--- a/conf/llm_factories.json
+++ b/conf/llm_factories.json
@@ -8,13 +8,13 @@
            "llm": [
                {
                    "llm_name": "gpt-4o-mini",
-                    "tags": "LLM,CHAT,128K",
+                    "tags": "LLM,CHAT,128K,IMAGE2TEXT",
                    "max_tokens": 128000,
                    "model_type": "chat"
                },
                {
                    "llm_name": "gpt-4o",
-                    "tags": "LLM,CHAT,128K",
+                    "tags": "LLM,CHAT,128K,IMAGE2TEXT",
                    "max_tokens": 128000,
                    "model_type": "chat"
                },
@@ -72,18 +72,6 @@
                    "max_tokens": 32768,
                    "model_type": "chat"
                },
-                {
-                    "llm_name": "gpt-4o-2024-08-06",
-                    "tags": "LLM,CHAT,IMAGE2TEXT",
-                    "max_tokens": 128000,
-                    "model_type": "image2text"
-                },
-                 {
-                    "llm_name": "gpt-4o-mini-2024-07-18",
-                    "tags": "LLM,CHAT,IMAGE2TEXT",
-                    "max_tokens": 128000,
-                    "model_type": "image2text"
-                },
                {
                    "llm_name": "tts-1",
                    "tags": "TTS",