feat: support image

2025-12-28 05:11:24 +08:00 · 2025-07-07 14:27:03 +08:00 · 2025-07-07 14:27:03 +08:00 · c72156b2da
commit c72156b2da
parent 9e252d7eb0
5 changed files with 144 additions and 41 deletions
--- a/packages/aiCore/src/index.ts
+++ b/packages/aiCore/src/index.ts
@ -60,6 +60,10 @@ export type {
  LanguageModelUsage, // AI SDK 4.0 中 TokenUsage 改名为 LanguageModelUsage
  // 消息相关类型
  ModelMessage,
+  TextPart,
+  FilePart,
+  ImagePart,
+  ToolCallPart,
  // 错误类型
  NoSuchToolError,
  StreamTextResult,
@ -75,6 +79,7 @@ export type {
  ToolSet,
  UserModelMessage
 } from 'ai'
+export type { ReasoningPart } from '@ai-sdk/provider-utils'
 export {
  defaultSettingsMiddleware,
  extractReasoningMiddleware,
--- a/src/renderer/src/aiCore/transformParameters.ts
+++ b/src/renderer/src/aiCore/transformParameters.ts
@ -3,7 +3,19 @@
 * 统一管理从各个 apiClient 提取的参数处理和转换功能
 */

-import { type ModelMessage, stepCountIs, type StreamTextParams } from '@cherrystudio/ai-core'
+import {
+  AssistantModelMessage,
+  FilePart,
+  ImagePart,
+  ModelMessage,
+  ReasoningPart,
+  stepCountIs,
+  type StreamTextParams,
+  TextPart,
+  ToolCallPart,
+  ToolResultPart,
+  UserModelMessage
+} from '@cherrystudio/ai-core'
 import { DEFAULT_MAX_TOKENS } from '@renderer/config/constant'
 import {
  isGenerateImageModel,
@ -14,18 +26,25 @@ import {
  isSupportedFlexServiceTier,
  isSupportedReasoningEffortModel,
  isSupportedThinkingTokenModel,
+  isVisionModel,
  isWebSearchModel
 } from '@renderer/config/models'
 import { getAssistantSettings, getDefaultModel } from '@renderer/services/AssistantService'
 import type { Assistant, MCPTool, Message, Model } from '@renderer/types'
 import { FileTypes } from '@renderer/types'
-import { findFileBlocks, findImageBlocks, getMainTextContent } from '@renderer/utils/messageUtils/find'
+import {
+  findFileBlocks,
+  findImageBlocks,
+  findThinkingBlocks,
+  getMainTextContent
+} from '@renderer/utils/messageUtils/find'
 import { buildSystemPrompt } from '@renderer/utils/prompt'
 import { defaultTimeout } from '@shared/config/constant'

 // import { jsonSchemaToZod } from 'json-schema-to-zod'
 import { setupToolsConfig } from './utils/mcp'
 import { buildProviderOptions } from './utils/options'
+import { FileMessageBlock, ImageMessageBlock, ThinkingMessageBlock } from '@renderer/types/newMessage'

 /**
 * 获取温度参数
@ -94,22 +113,25 @@ export async function extractFileContent(message: Message): Promise<string> {
 * 转换消息为 AI SDK 参数格式
 * 基于 OpenAI 格式的通用转换，支持文本、图片和文件
 */
-export async function convertMessageToSdkParam(message: Message, isVisionModel = false): Promise<any> {
+export async function convertMessageToSdkParam(message: Message, isVisionModel = false): Promise<ModelMessage> {
  const content = getMainTextContent(message)
  const fileBlocks = findFileBlocks(message)
  const imageBlocks = findImageBlocks(message)
-
-  // 简单消息（无文件无图片）
-  if (fileBlocks.length === 0 && imageBlocks.length === 0) {
-    return {
-      role: message.role === 'system' ? 'user' : message.role,
-      content
-    }
+  const reasoningBlocks = findThinkingBlocks(message)
+  if (message.role === 'user' || message.role === 'system') {
+    return convertMessageToUserModelMessage(content, fileBlocks, imageBlocks, isVisionModel)
+  } else {
+    return convertMessageToAssistantModelMessage(content, fileBlocks, reasoningBlocks)
  }
+}

-  // 复杂消息（包含文件或图片）
-  const parts: any[] = []
-
+async function convertMessageToUserModelMessage(
+  content: string,
+  fileBlocks: FileMessageBlock[],
+  imageBlocks: ImageMessageBlock[],
+  isVisionModel = false
+): Promise<UserModelMessage> {
+  const parts: Array<TextPart | FilePart | ImagePart> = []
  if (content) {
    parts.push({ type: 'text', text: content })
  }
@ -121,16 +143,17 @@ export async function convertMessageToSdkParam(message: Message, isVisionModel =
        try {
          const image = await window.api.file.base64Image(imageBlock.file.id + imageBlock.file.ext)
          parts.push({
-            type: 'image_url',
-            image_url: { url: image.data }
+            type: 'image',
+            image: image.base64,
+            mediaType: image.mime
          })
        } catch (error) {
          console.warn('Failed to load image:', error)
        }
-      } else if (imageBlock.url && imageBlock.url.startsWith('data:')) {
+      } else if (imageBlock.url) {
        parts.push({
-          type: 'image_url',
-          image_url: { url: imageBlock.url }
+          type: 'image',
+          image: imageBlock.url
        })
      }
    }
@ -138,28 +161,63 @@ export async function convertMessageToSdkParam(message: Message, isVisionModel =

  // 处理文件
  for (const fileBlock of fileBlocks) {
-    const file = fileBlock.file
-    if (!file) continue
-
-    if ([FileTypes.TEXT, FileTypes.DOCUMENT].includes(file.type)) {
-      try {
-        const fileContent = await window.api.file.read(file.id + file.ext)
-        parts.push({
-          type: 'text',
-          text: `${file.origin_name}\n${fileContent.trim()}`
-        })
-      } catch (error) {
-        console.warn('Failed to read file:', error)
-      }
+    const textPart = await convertFileBlockToTextPart(fileBlock)
+    if (textPart) {
+      parts.push(textPart)
    }
  }

  return {
-    role: message.role === 'system' ? 'user' : message.role,
-    content: parts.length === 1 && parts[0].type === 'text' ? parts[0].text : parts
+    role: 'user',
+    content: parts
  }
 }

+async function convertMessageToAssistantModelMessage(
+  content: string,
+  fileBlocks: FileMessageBlock[],
+  thinkingBlocks: ThinkingMessageBlock[]
+): Promise<AssistantModelMessage> {
+  const parts: Array<TextPart | FilePart> = []
+  if (content) {
+    parts.push({ type: 'text', text: content })
+  }
+
+  for (const thinkingBlock of thinkingBlocks) {
+    parts.push({ type: 'text', text: thinkingBlock.content })
+  }
+
+  for (const fileBlock of fileBlocks) {
+    const textPart = await convertFileBlockToTextPart(fileBlock)
+    if (textPart) {
+      parts.push(textPart)
+    }
+  }
+
+  return {
+    role: 'assistant',
+    content: parts
+  }
+}
+
+async function convertFileBlockToTextPart(fileBlock: FileMessageBlock): Promise<TextPart | null> {
+  const file = fileBlock.file
+
+  if ([FileTypes.TEXT, FileTypes.DOCUMENT].includes(file.type)) {
+    try {
+      const fileContent = await window.api.file.read(file.id + file.ext)
+      return {
+        type: 'text',
+        text: `${file.origin_name}\n${fileContent.trim()}`
+      }
+    } catch (error) {
+      console.warn('Failed to read file:', error)
+    }
+  }
+
+  return null
+}
+
 /**
 * 转换 Cherry Studio 消息数组为 AI SDK 消息数组
 */
@ -168,7 +226,7 @@ export async function convertMessagesToSdkMessages(
  model: Model
 ): Promise<StreamTextParams['messages']> {
  const sdkMessages: StreamTextParams['messages'] = []
-  const isVision = model.id.includes('vision') || model.id.includes('gpt-4') // 简单的视觉模型检测
+  const isVision = isVisionModel(model)

  for (const message of messages) {
    const sdkMessage = await convertMessageToSdkParam(message, isVision)
--- a/src/renderer/src/aiCore/utils/options.ts
+++ b/src/renderer/src/aiCore/utils/options.ts
@ -30,6 +30,9 @@ export function buildProviderOptions(
  // 构建 provider 特定的选项
  let providerSpecificOptions: Record<string, any> = {}

+  console.log('buildProviderOptions', providerId)
+  console.log('buildProviderOptions', provider)
+
  // 根据 provider 类型分离构建逻辑
  switch (provider.type) {
    case 'openai-response':
--- a/src/renderer/src/aiCore/utils/reasoning.ts
+++ b/src/renderer/src/aiCore/utils/reasoning.ts
@ -42,7 +42,7 @@ export function getReasoningEffort(assistant: Assistant, model: Model): Reasonin

    if (isSupportedThinkingTokenGeminiModel(model)) {
      if (GEMINI_FLASH_MODEL_REGEX.test(model.id)) {
-        return { reasoningEffort: 'none' }
+        return { reasoning_effort: 'none' }
      }
      return {}
    }
@ -71,6 +71,12 @@ export function getReasoningEffort(assistant: Assistant, model: Model): Reasonin
  }

  if (!reasoningEffort) {
+    if (model.provider === 'openrouter') {
+      if (isSupportedThinkingTokenGeminiModel(model) && !GEMINI_FLASH_MODEL_REGEX.test(model.id)) {
+        return {}
+      }
+      return { reasoning: { enabled: false, exclude: true } }
+    }
    if (isSupportedThinkingTokenQwenModel(model)) {
      return { enable_thinking: false }
    }
@ -80,12 +86,16 @@ export function getReasoningEffort(assistant: Assistant, model: Model): Reasonin
    }

    if (isSupportedThinkingTokenGeminiModel(model)) {
-      // openrouter没有提供一个不推理的选项，先隐藏
-      if (provider.id === 'openrouter') {
-        return { reasoning: { max_tokens: 0, exclude: true } }
-      }
      if (GEMINI_FLASH_MODEL_REGEX.test(model.id)) {
-        return { reasoningEffort: 'none' }
+        return {
+          extra_body: {
+            google: {
+              thinking_config: {
+                thinking_budget: 0
+              }
+            }
+          }
+        }
      }
      return {}
    }
@ -128,12 +138,37 @@ export function getReasoningEffort(assistant: Assistant, model: Model): Reasonin
  }

  // OpenAI models
-  if (isSupportedReasoningEffortOpenAIModel(model) || isSupportedThinkingTokenGeminiModel(model)) {
+  if (isSupportedReasoningEffortOpenAIModel(model)) {
    return {
      reasoningEffort: reasoningEffort
    }
  }

+  if (isSupportedThinkingTokenGeminiModel(model)) {
+    if (reasoningEffort === 'auto') {
+      return {
+        extra_body: {
+          google: {
+            thinking_config: {
+              thinking_budget: -1,
+              include_thoughts: true
+            }
+          }
+        }
+      }
+    }
+    return {
+      extra_body: {
+        google: {
+          thinking_config: {
+            thinking_budget: budgetTokens,
+            include_thoughts: true
+          }
+        }
+      }
+    }
+  }
+
  // Claude models
  if (isSupportedThinkingTokenClaudeModel(model)) {
    const maxTokens = assistant.settings?.maxTokens
--- a/src/renderer/src/types/sdk.ts
+++ b/src/renderer/src/types/sdk.ts
@ -50,9 +50,11 @@ export type ReasoningEffortOptionalParams = {
  thinking?: { type: 'disabled' | 'enabled' | 'auto'; budget_tokens?: number }
  reasoning?: { max_tokens?: number; exclude?: boolean; effort?: string; enabled?: boolean } | OpenAI.Reasoning
  reasoningEffort?: OpenAI.Chat.Completions.ChatCompletionCreateParams['reasoning_effort'] | 'none' | 'auto'
+  reasoning_effort?: OpenAI.Chat.Completions.ChatCompletionCreateParams['reasoning_effort'] | 'none' | 'auto'
  enable_thinking?: boolean
  thinking_budget?: number
  enable_reasoning?: boolean
+  extra_body?: Record<string, any>
  // Add any other potential reasoning-related keys here if they exist
 }