diff --git a/app/api/chat/route.ts b/app/api/chat/route.ts index c8fb0f0..55dbbde 100644 --- a/app/api/chat/route.ts +++ b/app/api/chat/route.ts @@ -12,7 +12,11 @@ import fs from "fs/promises" import { jsonrepair } from "jsonrepair" import path from "path" import { z } from "zod" -import { getAIModel, supportsPromptCaching } from "@/lib/ai-providers" +import { + getAIModel, + supportsImageInput, + supportsPromptCaching, +} from "@/lib/ai-providers" import { findCachedResponse } from "@/lib/cached-responses" import { checkAndIncrementRequest, @@ -295,6 +299,17 @@ async function handleChatRequest(req: Request): Promise { lastUserMessage?.parts?.filter((part: any) => part.type === "file") || [] + // Check if user is sending images to a model that doesn't support them + // AI SDK silently drops unsupported parts, so we need to catch this early + if (fileParts.length > 0 && !supportsImageInput(modelId)) { + return Response.json( + { + error: `The model "${modelId}" does not support image input. Please use a vision-capable model (e.g., GPT-4o, Claude, Gemini) or remove the image.`, + }, + { status: 400 }, + ) + } + // User input only - XML is now in a separate cached system message const formattedUserInput = `User input: """md diff --git a/lib/ai-providers.ts b/lib/ai-providers.ts index 2ef585b..b482143 100644 --- a/lib/ai-providers.ts +++ b/lib/ai-providers.ts @@ -906,3 +906,34 @@ export function supportsPromptCaching(modelId: string): boolean { modelId.startsWith("eu.anthropic") ) } + +/** + * Check if a model supports image/vision input. + * Some models silently drop image parts without error (AI SDK warning only). 
/**
 * Check whether a model accepts image/vision input.
 *
 * Model ids belonging to the Kimi, DeepSeek or Qwen families are treated as
 * text-only unless the id also carries a vision marker ("vision", "vl" or
 * "vlm"). Every other model id is assumed to support images.
 *
 * The marker has to appear as its own token: it must sit at the start of the
 * id, follow a non-letter (e.g. "-", "/", ".", a digit), or be glued directly
 * to the family name, and it must not be followed by another letter. A plain
 * substring test would misclassify ids such as "deepseek-r1-vllm" (contains
 * "vl") or "qwen-max-revision-2" (contains "vision") as vision-capable.
 *
 * @param modelId - Model identifier; compared case-insensitively.
 * @returns `false` for Kimi/DeepSeek/Qwen ids without a vision marker,
 *          `true` otherwise.
 */
export function supportsImageInput(modelId: string): boolean {
    const lowerModelId = modelId.toLowerCase()

    // Families whose default (non-vision) variants do not accept images.
    const textOnlyFamilies = ["kimi", "deepseek", "qwen"]
    const isTextOnlyFamily = textOnlyFamilies.some((family) =>
        lowerModelId.includes(family),
    )

    // Default: assume model supports images
    if (!isTextOnlyFamily) {
        return true
    }

    // Vision marker as a standalone token, e.g. "qwen-vl-max", "deepseek-vl2",
    // "qwen2.5vl:7b", "qwenvl-7b", "kimi-vision" - but not "vllm" or "revision".
    const visionMarker =
        /(?:^|[^a-z]|kimi|deepseek|qwen)(?:vision|vlm?)(?![a-z])/

    return visionMarker.test(lowerModelId)
}