// stemedb/cmd/pitch-voiceover/main.go

package main

import (
	"context"
	"flag"
	"fmt"
	"os"
	"path/filepath"
	"sort"
	"strings"
	"time"

	"pitch-voiceover/pkg/elevenlabs"
)

// SpeakerConfig describes one selectable voice variant.
//
// Name is the voice name: findOrCreateVoice compares it against the names of
// the voices the client lists and uses it when saving a newly designed voice.
// Description is the text passed as the voice description when a voice is
// designed and saved. Dir is the subdirectory of outputBase that main writes
// this speaker's audio files into.
type SpeakerConfig struct {
	Name        string
	Description string
	Dir         string
}

// speakers holds the available voice variants, keyed by the value accepted by
// the -speaker flag.
var speakers = map[string]SpeakerConfig{
	"1": {
		Name:        "StemeDB Presenter",
		Description: "A warm, authoritative American male in his early 40s. Confident but approachable, like a trusted industry expert giving a presentation. Clear enunciation, measured pace, professional but not stiff. The voice of someone who has seen the problems firsthand and has a solution.",
		Dir:         "speaker-1",
	},
	"2": {
		Name:        "StemeDB Presenter v2",
		Description: "A deep, resonant American male voice with broadcast-quality clarity. Crystal clear diction, zero background noise, studio-perfect audio. The voice of a seasoned executive presenter - warm yet authoritative, with the polished delivery of a Fortune 500 keynote speaker. Natural pacing with confident pauses. Think: NPR host meets tech CEO.",
		Dir:         "speaker-2",
	},
	"3": {
		Name:        "StemeDB Presenter v3",
		Description: "American male, early 40s, deep smooth voice. Studio-quality recording with perfect audio clarity. Warm and authoritative tone. Clear precise diction with natural pacing. Professional narrator delivery, confident and measured. No breathiness, crystal clear pronunciation.",
		Dir:         "speaker-3",
	},
	"4": {
		Name:        "StemeDB Presenter v4",
		Description: "Male baritone voice, age 45, rich and full tone. Speaks slowly and deliberately like a documentary narrator. Strong consonants, open vowels. American midwest accent, neutral and professional. Clean studio recording quality.",
		Dir:         "speaker-4",
	},
	"5": {
		Name:        "StemeDB Presenter v5",
		Description: "Older British male, late 50s, distinguished and gravelly. Speaks with quiet authority like a veteran BBC journalist. Measured, thoughtful pacing. Slight rasp, very warm. Perfect studio audio quality.",
		Dir:         "speaker-5",
	},
	// NOTE(review): this Description is a placeholder, not a voice-design
	// prompt, yet findOrCreateVoice would submit it as one if no voice with
	// this Name exists. Presumably this variant is meant to be used with
	// -voice-id or a pre-existing voice — confirm.
	"6": {
		Name:        "StemeDB Presenter v6",
		Description: "User-selected voice",
		Dir:         "speaker-6",
	},
}

// outputBase is the root directory for generated audio; main joins it with a
// speaker's Dir to get that speaker's output directory. The path is relative,
// so it is resolved against the working directory the program is run from.
const outputBase = "../../applications/pitch/audio"

func main() {
	list := flag.Bool("list", false, "List all speaking blocks")
	voiceOnly := flag.Bool("voice-only", false, "Create or find voice only, don't generate audio")
	single := flag.Int("single", 0, "Generate only a single block by number")
	voiceID := flag.String("voice-id", "", "Use specific voice ID instead of finding/creating")
	speaker := flag.String("speaker", "1", "Speaker variant (1, 2, 3, or 4)")
	flag.Parse()

	if *list {
		listBlocks()
		return
	}

	cfg, ok := speakers[*speaker]
	if !ok {
		fmt.Fprintf(os.Stderr, "Error: unknown speaker '%s' (use 1, 2, 3, or 4)\n", *speaker)
		os.Exit(1)
	}

	apiKey := os.Getenv("ELEVENLABS_API_KEY")
	if apiKey == "" {
		fmt.Fprintln(os.Stderr, "Error: ELEVENLABS_API_KEY environment variable is required")
		os.Exit(1)
	}

	client, err := elevenlabs.NewClient(elevenlabs.Config{
		APIKey:  apiKey,
		Timeout: 120 * time.Second,
	})
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error creating client: %v\n", err)
		os.Exit(1)
	}

	ctx := context.Background()

	// Health check
	if err := client.Health(ctx); err != nil {
		fmt.Fprintf(os.Stderr, "Error connecting to ElevenLabs: %v\n", err)
		os.Exit(1)
	}
	fmt.Println("✓ Connected to ElevenLabs API")

	// Find or use provided voice ID
	vid := *voiceID
	if vid == "" {
		vid, err = findOrCreateVoice(ctx, client, cfg)
		if err != nil {
			fmt.Fprintf(os.Stderr, "Error with voice: %v\n", err)
			os.Exit(1)
		}
	}
	fmt.Printf("✓ Using voice: %s\n", vid)

	if *voiceOnly {
		return
	}

	// Create output directory
	outputDir := filepath.Join(outputBase, cfg.Dir)
	if err := os.MkdirAll(outputDir, 0755); err != nil {
		fmt.Fprintf(os.Stderr, "Error creating output directory: %v\n", err)
		os.Exit(1)
	}

	// Generate audio
	blocks := Script
	if *single > 0 {
		for _, b := range Script {
			if b.Number == *single {
				blocks = []SpeakingBlock{b}
				break
			}
		}
		if len(blocks) == len(Script) {
			fmt.Fprintf(os.Stderr, "Error: block %d not found\n", *single)
			os.Exit(1)
		}
	}

	// DON'T override VoiceSettings - let the designed voice use its natural characteristics
	// Previously we used high Stability (0.80) + low Style (0.05) which normalized all voices
	var settings *elevenlabs.VoiceSettings // nil = use voice's default settings

	for _, block := range blocks {
		filename := fmt.Sprintf("%02d-%s.mp3", block.Number, block.Slug)
		outPath := filepath.Join(outputDir, filename)

		fmt.Printf("Generating %s...\n", filename)

		audio, err := client.TextToSpeechWithFormat(ctx, vid, elevenlabs.TextToSpeechRequest{
			Text:          block.Text,
			ModelID:       elevenlabs.ModelMultilingualV2,
			VoiceSettings: settings,
		}, elevenlabs.FormatMP3_44100_128)
		if err != nil {
			fmt.Fprintf(os.Stderr, "  Error: %v\n", err)
			continue
		}

		if err := os.WriteFile(outPath, audio, 0644); err != nil {
			fmt.Fprintf(os.Stderr, "  Error writing file: %v\n", err)
			continue
		}

		fmt.Printf("  ✓ %s (%d bytes)\n", filename, len(audio))
	}

	fmt.Printf("\n✓ Done! Audio files in %s\n", outputDir)
}

// listBlocks writes a table of the script to stdout: a header row, a dashed
// rule, one row per block showing its number, slug and step, and finally the
// total number of blocks.
func listBlocks() {
	const ruleWidth = 60
	out := os.Stdout

	fmt.Fprintf(out, "%-4s %-20s %s\n", "#", "Slug", "Step")
	fmt.Fprintln(out, strings.Repeat("-", ruleWidth))

	total := len(Script)
	for i := 0; i < total; i++ {
		entry := Script[i]
		fmt.Fprintf(out, "%-4d %-20s %s\n", entry.Number, entry.Slug, entry.Step)
	}

	fmt.Fprintf(out, "\nTotal: %d blocks\n", total)
}

// findOrCreateVoice resolves the voice ID to use for cfg.
//
// It asks the client for its voices and returns the ID of the first one whose
// name equals cfg.Name. If none matches, it designs a voice from
// cfg.Description, saves the first generated preview under cfg.Name, and
// returns the ID of the saved voice. Progress messages go to stdout.
//
// An error is returned when listing, designing or saving fails (wrapped with
// the name of the failing step), or when the design call yields no previews.
func findOrCreateVoice(ctx context.Context, client *elevenlabs.Client, cfg SpeakerConfig) (string, error) {
	// Reuse a voice that already carries the configured name.
	existing, err := client.ListVoices(ctx)
	if err != nil {
		return "", fmt.Errorf("list voices: %w", err)
	}
	for _, voice := range existing {
		if voice.Name != cfg.Name {
			continue
		}
		fmt.Printf("✓ Found existing voice: %s\n", voice.VoiceID)
		return voice.VoiceID, nil
	}

	// Nothing to reuse: design a fresh voice from the description.
	fmt.Printf("Creating new voice '%s'...\n", cfg.Name)
	designReq := elevenlabs.VoiceDesignRequest{
		VoiceDescription: cfg.Description,
		AutoGenerateText: true,
		GuidanceScale:    3.0,
	}
	designed, err := client.DesignVoice(ctx, designReq)
	if err != nil {
		return "", fmt.Errorf("design voice: %w", err)
	}

	previews := designed.Previews
	if len(previews) == 0 {
		return "", fmt.Errorf("no voice previews generated")
	}

	// Persist the first preview so later runs find it by name.
	saveReq := elevenlabs.SaveVoiceRequest{
		VoiceName:        cfg.Name,
		VoiceDescription: cfg.Description,
		GeneratedVoiceID: previews[0].GeneratedVoiceID,
	}
	saved, err := client.SaveDesignedVoice(ctx, saveReq)
	if err != nil {
		return "", fmt.Errorf("save voice: %w", err)
	}

	newID := saved.VoiceID
	fmt.Printf("✓ Created new voice: %s\n", newID)
	return newID, nil
}