inputSchema
{
"type": "object",
"required": [
"text",
"apiKeys"
],
"properties": {
"text": {
"type": "string",
"maxLength": 5000,
"minLength": 10,
"description": "Source text to convert into video (200-2000 chars recommended for ~2min)"
},
"style": {
"enum": [
"cinematic",
"cartoon",
"documentary",
"minimalist",
"vlog"
],
"type": "string",
"default": "cinematic",
"description": "Visual style for scene prompts"
},
"title": {
"type": "string",
"maxLength": 100,
"description": "Video title"
},
"voice": {
"type": "object",
"properties": {
"speed": {
"type": "number",
"default": 1,
"maximum": 2,
"minimum": 0.5,
"description": "Narration speed"
},
"language": {
"enum": [
"zh-CN",
"en-US",
"ja-JP",
"ko-KR"
],
"type": "string",
"default": "zh-CN",
"description": "Narration language"
}
},
"description": "Voiceover configuration"
},
"apiKeys": {
"type": "object",
"required": [
"llm"
],
"properties": {
"llm": {
"type": "object",
"required": [
"apiKey"
],
"properties": {
"model": {
"type": "string",
"default": "deepseek-chat",
"description": "Model name (default: deepseek-chat)"
},
"apiKey": {
"type": "string",
"description": "API key for the LLM provider"
},
"baseUrl": {
"type": "string",
"description": "Custom API base URL (for proxies or self-hosted)"
},
"provider": {
"enum": [
"deepseek",
"openai",
"anthropic"
],
"type": "string",
"default": "deepseek",
"description": "LLM provider"
}
},
"description": "LLM provider for scene decomposition (default: DeepSeek)"
},
"tts": {
"type": "object",
"properties": {
"voice": {
"type": "string",
"default": "alloy",
"description": "Voice ID (alloy/echo/shimmer for OpenAI)"
},
"apiKey": {
"type": "string",
"description": "API key for TTS"
},
"baseUrl": {
"type": "string",
"description": "Custom API base URL"
},
"provider": {
"enum": [
"openai",
"elevenlabs"
],
"type": "string",
"default": "openai",
"description": "TTS provider"
}
},
"description": "Optional. TTS provider. If omitted, scenes output narration text only (no audio)."
},
"image": {
"type": "object",
"properties": {
"model": {
"type": "string",
"default": "dall-e-3",
"description": "Model (dall-e-3, black-forest-labs/FLUX.1-schnell, etc.)"
},
"apiKey": {
"type": "string",
"description": "API key for image generation"
},
"baseUrl": {
"type": "string",
"description": "Custom API base URL"
},
"provider": {
"enum": [
"openai",
"stability",
"siliconflow"
],
"type": "string",
"default": "openai",
"description": "Image gen provider. siliconflow supports FLUX models."
}
},
"description": "Optional. Image generation provider. If omitted, scenes output visual prompts only (no images)."
}
},
"description": "API keys for AI services. Only llm is required; image and tts are optional."
},
"resolution": {
"enum": [
"1024x1024",
"1024x1792",
"1792x1024"
],
"type": "string",
"default": "1792x1024",
"description": "Image resolution per scene"
}
}
}
outputSchema
{
"type": "object",
"required": [
"scenes",
"duration",
"sceneCount",
"mode"
],
"properties": {
"mode": {
"enum": [
"full",
"script-only",
"script+images",
"script+audio"
],
"type": "string",
"description": "full = image & tts keys provided (images + audio + ffmpegHint); script-only = neither key (narration & visual prompts only); script+images = image key only (no audio); script+audio = tts key only (no images)"
},
"title": {
"type": "string"
},
"scenes": {
"type": "array",
"items": {
"type": "object",
"properties": {
"index": {
"type": "integer"
},
"endTime": {
"type": "number"
},
"narration": {
"type": "string"
},
"startTime": {
"type": "number"
},
"audioBase64": {
"type": "string",
"description": "Base64 MP3 (empty if no tts key)"
},
"imageBase64": {
"type": "string",
"description": "Base64 PNG (empty if no image key)"
},
"visualPrompt": {
"type": "string"
}
}
}
},
"duration": {
"type": "number",
"description": "Estimated total duration in seconds"
},
"ffmpegHint": {
"type": "string",
"description": "FFmpeg command (only in full mode)"
},
"sceneCount": {
"type": "integer"
}
}
}