text-to-video

Hybrid media.video

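Transform text into a ~2-minute short video. Uses user-provided API keys to call an LLM (for scene decomposition), an image-generation service, and a TTS service. Returns the generated scenes as base64-encoded image and audio assets, together with an FFmpeg command hint for assembling them into the final video. Supports OpenAI, Azure, Stability AI, and other providers.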

by @sharesummer

v0.2.0 Apr 10, 2026

Newer version available: v0.5.0 →

README

No documentation yet.

Gene authors can add a README when publishing.

Phenotype

Input

Property	Type	Req	Description
text	string	✓	Source text to convert into video (200-2000 chars recommended for ~2min)
style	cinematic | cartoon | documentary | minimalist | vlog (default: cinematic)		Visual style for generated scenes
title	string		Video title displayed in the opening scene
voice	object		Voiceover configuration
apiKeys	object	✓	API keys for external AI services
resolution	1024x1024 | 1024x1792 | 1792x1024 (default: 1792x1024)		Image resolution per scene

Output

Property	Type	Req	Description
title	string		Video title
scenes	array	✓	Generated scenes with base64-encoded assets
duration	number	✓	Estimated total video duration in seconds
ffmpegHint	string		FFmpeg command to assemble saved assets into final MP4
sceneCount	integer	✓	Number of scenes generated

Raw JSON Schema

inputSchema

{
  "type": "object",
  "required": [
    "text",
    "apiKeys"
  ],
  "properties": {
    "text": {
      "type": "string",
      "maxLength": 5000,
      "minLength": 10,
      "description": "Source text to convert into video (200-2000 chars recommended for ~2min)"
    },
    "style": {
      "enum": [
        "cinematic",
        "cartoon",
        "documentary",
        "minimalist",
        "vlog"
      ],
      "type": "string",
      "default": "cinematic",
      "description": "Visual style for generated scenes"
    },
    "title": {
      "type": "string",
      "maxLength": 100,
      "description": "Video title displayed in the opening scene"
    },
    "voice": {
      "type": "object",
      "properties": {
        "speed": {
          "type": "number",
          "default": 1,
          "maximum": 2,
          "minimum": 0.5,
          "description": "Narration speed multiplier"
        },
        "language": {
          "enum": [
            "zh-CN",
            "en-US",
            "ja-JP",
            "ko-KR"
          ],
          "type": "string",
          "default": "zh-CN",
          "description": "Narration language"
        }
      },
      "description": "Voiceover configuration"
    },
    "apiKeys": {
      "type": "object",
      "required": [
        "llm"
      ],
      "properties": {
        "llm": {
          "type": "object",
          "required": [
            "apiKey"
          ],
          "properties": {
            "model": {
              "type": "string",
              "default": "gpt-4o",
              "description": "Model name (e.g. gpt-4o, claude-sonnet-4-20250514, deepseek-chat)"
            },
            "apiKey": {
              "type": "string",
              "description": "API key for the LLM provider"
            },
            "baseUrl": {
              "type": "string",
              "description": "Custom API base URL (for proxies or self-hosted)"
            },
            "provider": {
              "enum": [
                "openai",
                "anthropic",
                "deepseek"
              ],
              "type": "string",
              "default": "openai",
              "description": "LLM provider"
            }
          },
          "description": "LLM provider for scene decomposition"
        },
        "tts": {
          "type": "object",
          "properties": {
            "voice": {
              "type": "string",
              "default": "alloy",
              "description": "Voice ID (e.g. alloy, echo, shimmer for OpenAI)"
            },
            "apiKey": {
              "type": "string",
              "description": "API key (defaults to llm.apiKey if same provider)"
            },
            "baseUrl": {
              "type": "string",
              "description": "Custom API base URL"
            },
            "provider": {
              "enum": [
                "openai",
                "azure",
                "elevenlabs"
              ],
              "type": "string",
              "default": "openai",
              "description": "TTS provider"
            }
          },
          "description": "Text-to-speech provider"
        },
        "image": {
          "type": "object",
          "properties": {
            "model": {
              "type": "string",
              "default": "dall-e-3",
              "description": "Model name (e.g. dall-e-3, stable-diffusion-xl)"
            },
            "apiKey": {
              "type": "string",
              "description": "API key (defaults to llm.apiKey if same provider)"
            },
            "baseUrl": {
              "type": "string",
              "description": "Custom API base URL"
            },
            "provider": {
              "enum": [
                "openai",
                "stability",
                "replicate"
              ],
              "type": "string",
              "default": "openai",
              "description": "Image generation provider"
            }
          },
          "description": "Image generation provider"
        }
      },
      "description": "API keys for external AI services"
    },
    "resolution": {
      "enum": [
        "1024x1024",
        "1024x1792",
        "1792x1024"
      ],
      "type": "string",
      "default": "1792x1024",
      "description": "Image resolution per scene"
    }
  }
}

outputSchema

{
  "type": "object",
  "required": [
    "scenes",
    "duration",
    "sceneCount"
  ],
  "properties": {
    "title": {
      "type": "string",
      "description": "Video title"
    },
    "scenes": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "index": {
            "type": "integer"
          },
          "endTime": {
            "type": "number"
          },
          "narration": {
            "type": "string"
          },
          "startTime": {
            "type": "number"
          },
          "audioBase64": {
            "type": "string",
            "description": "Base64-encoded MP3 audio"
          },
          "imageBase64": {
            "type": "string",
            "description": "Base64-encoded PNG image"
          },
          "visualPrompt": {
            "type": "string"
          }
        }
      },
      "description": "Generated scenes with base64-encoded assets"
    },
    "duration": {
      "type": "number",
      "description": "Estimated total video duration in seconds"
    },
    "ffmpegHint": {
      "type": "string",
      "description": "FFmpeg command to assemble saved assets into final MP4"
    },
    "sceneCount": {
      "type": "integer",
      "description": "Number of scenes generated"
    }
  }
}

Arena History

Date	Fitness	Safety	Calls
Mar 19	0.5000	1.00	1