← Back to Gene Catalog

text-to-video

Hybrid media.video

Transform text into a ~2-minute short video script with optional image/audio generation. Only requires an LLM API key (defaults to DeepSeek). Optionally provide image generation (DALL-E/SiliconFlow FLUX) and TTS (OpenAI/ElevenLabs) keys for full asset pipeline.

README

No documentation yet.

Gene authors can add a README when publishing.

Phenotype

Input

Property Type Req Description
text string ✓ Source text to convert into video (200-2000 chars recommended for ~2min)
style cinematic | cartoon | documentary | minimalist | vlog = cinematic Visual style for scene prompts
title string Video title
voice object Voiceover configuration
apiKeys object ✓ API keys for AI services. Only llm is required; image and tts are optional.
resolution 1024x1024 | 1024x1792 | 1792x1024 = 1792x1024 Image resolution per scene

Output

Property Type Req Description
mode full | script-only | script+images | script+audio ✓ full = all assets; script-only = only narration & visual prompts (no image/tts keys)
title string
scenes array ✓
duration number ✓ Estimated total duration in seconds
ffmpegHint string FFmpeg command (only in full mode)
sceneCount integer ✓
Raw JSON Schema

inputSchema

{
  "type": "object",
  "required": [
    "text",
    "apiKeys"
  ],
  "properties": {
    "text": {
      "type": "string",
      "maxLength": 5000,
      "minLength": 10,
      "description": "Source text to convert into video (200-2000 chars recommended for ~2min)"
    },
    "style": {
      "enum": [
        "cinematic",
        "cartoon",
        "documentary",
        "minimalist",
        "vlog"
      ],
      "type": "string",
      "default": "cinematic",
      "description": "Visual style for scene prompts"
    },
    "title": {
      "type": "string",
      "maxLength": 100,
      "description": "Video title"
    },
    "voice": {
      "type": "object",
      "properties": {
        "speed": {
          "type": "number",
          "default": 1,
          "maximum": 2,
          "minimum": 0.5,
          "description": "Narration speed"
        },
        "language": {
          "enum": [
            "zh-CN",
            "en-US",
            "ja-JP",
            "ko-KR"
          ],
          "type": "string",
          "default": "zh-CN",
          "description": "Narration language"
        }
      },
      "description": "Voiceover configuration"
    },
    "apiKeys": {
      "type": "object",
      "required": [
        "llm"
      ],
      "properties": {
        "llm": {
          "type": "object",
          "required": [
            "apiKey"
          ],
          "properties": {
            "model": {
              "type": "string",
              "default": "deepseek-chat",
              "description": "Model name (default: deepseek-chat)"
            },
            "apiKey": {
              "type": "string",
              "description": "API key for the LLM provider"
            },
            "baseUrl": {
              "type": "string",
              "description": "Custom API base URL (for proxies or self-hosted)"
            },
            "provider": {
              "enum": [
                "deepseek",
                "openai",
                "anthropic"
              ],
              "type": "string",
              "default": "deepseek",
              "description": "LLM provider"
            }
          },
          "description": "LLM provider for scene decomposition (default: DeepSeek)"
        },
        "tts": {
          "type": "object",
          "properties": {
            "voice": {
              "type": "string",
              "default": "alloy",
              "description": "Voice ID (alloy/echo/shimmer for OpenAI)"
            },
            "apiKey": {
              "type": "string",
              "description": "API key for TTS"
            },
            "baseUrl": {
              "type": "string",
              "description": "Custom API base URL"
            },
            "provider": {
              "enum": [
                "openai",
                "elevenlabs"
              ],
              "type": "string",
              "default": "openai",
              "description": "TTS provider"
            }
          },
          "description": "Optional. TTS provider. If omitted, scenes output narration text only (no audio)."
        },
        "image": {
          "type": "object",
          "properties": {
            "model": {
              "type": "string",
              "default": "dall-e-3",
              "description": "Model (dall-e-3, black-forest-labs/FLUX.1-schnell, etc.)"
            },
            "apiKey": {
              "type": "string",
              "description": "API key for image generation"
            },
            "baseUrl": {
              "type": "string",
              "description": "Custom API base URL"
            },
            "provider": {
              "enum": [
                "openai",
                "stability",
                "siliconflow"
              ],
              "type": "string",
              "default": "openai",
              "description": "Image gen provider. siliconflow supports FLUX models."
            }
          },
          "description": "Optional. Image generation provider. If omitted, scenes output visual prompts only (no images)."
        }
      },
      "description": "API keys for AI services. Only llm is required; image and tts are optional."
    },
    "resolution": {
      "enum": [
        "1024x1024",
        "1024x1792",
        "1792x1024"
      ],
      "type": "string",
      "default": "1792x1024",
      "description": "Image resolution per scene"
    }
  }
}

outputSchema

{
  "type": "object",
  "required": [
    "scenes",
    "duration",
    "sceneCount",
    "mode"
  ],
  "properties": {
    "mode": {
      "enum": [
        "full",
        "script-only",
        "script+images",
        "script+audio"
      ],
      "type": "string",
      "description": "full = all assets; script-only = only narration & visual prompts (no image/tts keys)"
    },
    "title": {
      "type": "string"
    },
    "scenes": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "index": {
            "type": "integer"
          },
          "endTime": {
            "type": "number"
          },
          "narration": {
            "type": "string"
          },
          "startTime": {
            "type": "number"
          },
          "audioBase64": {
            "type": "string",
            "description": "Base64 MP3 (empty if no tts key)"
          },
          "imageBase64": {
            "type": "string",
            "description": "Base64 PNG (empty if no image key)"
          },
          "visualPrompt": {
            "type": "string"
          }
        }
      }
    },
    "duration": {
      "type": "number",
      "description": "Estimated total duration in seconds"
    },
    "ffmpegHint": {
      "type": "string",
      "description": "FFmpeg command (only in full mode)"
    },
    "sceneCount": {
      "type": "integer"
    }
  }
}

Arena History

Date Fitness Safety Calls
Mar 19 0.5000 1.00 1