diff --git a/internal/adapter/templates/templates/components/worker/cmd/worker/main.go.tmpl b/internal/adapter/templates/templates/components/worker/cmd/worker/main.go.tmpl index 498978b..669aa37 100644 --- a/internal/adapter/templates/templates/components/worker/cmd/worker/main.go.tmpl +++ b/internal/adapter/templates/templates/components/worker/cmd/worker/main.go.tmpl @@ -16,6 +16,7 @@ import ( "{{GO_MODULE}}/pkg/logging" "{{GO_MODULE}}/pkg/mediagen" mediagenAdapters "{{GO_MODULE}}/pkg/mediagen/adapters" + "{{GO_MODULE}}/pkg/personagen" "{{GO_MODULE}}/pkg/queue" "{{GO_MODULE}}/pkg/realtime" "{{GO_MODULE}}/pkg/storage" @@ -216,6 +217,10 @@ func main() { handler.RegisterHandler("generate_text", handlers.TextHandler(textgenManager, ssePub, logger)) handler.RegisterHandler("ai_chat_response", handlers.ChatResponseHandler(textgenManager, ssePub, logger)) } + // Persona generation requires both textgen (5-stage LLM pipeline) and mediagen (20 images + 4 videos). + if textgenManager != nil && mediagenManager != nil { + handler.RegisterHandler("persona_generate", personagen.QueueHandler(textgenManager, mediagenManager, mediaStore, ssePub, logger.Logger)) + } // Setup signal handling sigCh := make(chan os.Signal, 1) diff --git a/internal/adapter/templates/templates/skeleton/.claude/guides/personagen.md b/internal/adapter/templates/templates/skeleton/.claude/guides/personagen.md index 7433807..a57edd8 100644 --- a/internal/adapter/templates/templates/skeleton/.claude/guides/personagen.md +++ b/internal/adapter/templates/templates/skeleton/.claude/guides/personagen.md @@ -76,6 +76,7 @@ Subscribe to `user:` channel before calling the generate endpoint: {"type": "persona_image_complete", "jobId": "...", "progress": 100, "result": {"personaId": "..."}} {"type": "persona_video_started", "jobId": "...", "result": {"motionType": "smile_reveal"}} {"type": "persona_video_complete", "jobId": "...", "result": {"motionType": "smile_reveal", "url": "..."}} +{"type": "persona_video_failed", "jobId": "...", "error": "smile_reveal video failed: ...", "result": {"motionType": "smile_reveal"}} {"type": "persona_failed", "jobId": "...", "error": "Spec generation failed: ..."} ``` diff --git a/internal/adapter/templates/templates/skeleton/pkg/personagen/imagegen.go.tmpl b/internal/adapter/templates/templates/skeleton/pkg/personagen/imagegen.go.tmpl index d80bba7..280ab4c 100644 --- a/internal/adapter/templates/templates/skeleton/pkg/personagen/imagegen.go.tmpl +++ b/internal/adapter/templates/templates/skeleton/pkg/personagen/imagegen.go.tmpl @@ -77,6 +77,8 @@ func buildHEIAPrompt(spec *persona.PersonaSpec, imgSpec *persona.ImageSpec) stri // buildIdentitySection creates the [IDENTITY] section. // Example: "[IDENTITY] 26-year-old Korean woman, 5'4" (163cm), slender-athletic build." +// For mixed-race personas with a resolved heritage breakdown, produces e.g. +// "[IDENTITY] 26-year-old East Asian and Latina/Hispanic heritage woman, ..." func buildIdentitySection(spec *persona.PersonaSpec) string { if spec.DNA == nil { return "" @@ -84,11 +86,20 @@ func buildIdentitySection(spec *persona.PersonaSpec) string { id := spec.DNA.Identity body := spec.DNA.Body + ethnicityDesc := ethnicitToAdj(id.Ethnicity) + if id.SecondaryHeritage != nil { + ethnicityDesc = fmt.Sprintf( + "%s and %s heritage", + ethnicitToAdj(id.PrimaryHeritage), + ethnicitToAdj(*id.SecondaryHeritage), + ) + } + heightFt := cmToFeet(body.HeightCM) return fmt.Sprintf( "[IDENTITY] %d-year-old %s %s, %s (%dcm), %s build.", id.Age, - ethnicitToAdj(id.Ethnicity), + ethnicityDesc, strings.ToLower(string(id.Gender)), heightFt, body.HeightCM, diff --git a/internal/adapter/templates/templates/skeleton/pkg/personagen/service.go.tmpl b/internal/adapter/templates/templates/skeleton/pkg/personagen/service.go.tmpl index e76d541..51851d3 100644 --- a/internal/adapter/templates/templates/skeleton/pkg/personagen/service.go.tmpl +++ b/internal/adapter/templates/templates/skeleton/pkg/personagen/service.go.tmpl @@ -19,6 +19,7 @@ import ( "errors" "fmt" "log/slog" + "strings" "time" "{{GO_MODULE}}/pkg/mediagen" @@ -122,14 +123,27 @@ func (s *Service) GenerateImages(ctx context.Context, spec *persona.PersonaSpec, return nil } -// GenerateVideo generates a video for the given motion type. +// GenerateVideo generates a video for the given motion type and uploads it to storage. // Requires SetAnchor() to have been called first (or GenerateImages() for position 1). // Returns ErrAnchorNotSet if no anchor is available. func (s *Service) GenerateVideo(ctx context.Context, spec *persona.PersonaSpec, motionType persona.MotionType) (*persona.VideoSpec, error) { if s.anchor == nil { return nil, ErrAnchorNotSet } - return generateVideo(ctx, s.mediagen, spec, motionType, s.anchor, s.logger) + + videoSpec, videoData, err := generateVideo(ctx, s.mediagen, spec, motionType, s.anchor, s.logger) + if err != nil { + return nil, err + } + + storagePath := fmt.Sprintf("personas/%s/videos/%s.mp4", spec.ID, string(motionType)) + url, err := s.store.Upload(ctx, storagePath, videoData, "video/mp4") + if err != nil { + videoSpec.Status = persona.VideoStatusFailed + return nil, fmt.Errorf("storing video %s: %w", motionType, err) + } + videoSpec.URL = url + return videoSpec, nil } // GenerateAvatar generates a square profile picture (close-up face, 1:1 crop). @@ -266,9 +280,31 @@ func QueueHandler(tg *textgen.Manager, mg *mediagen.Manager, store storage.Store Result: map[string]any{"personaId": spec.ID}, }) + // Build an ordered position list — position 1 (anchor) must always be generated first. + // generatePosition() mutates the spec.ImageMatrix entry in place (URL, Status), + // so we keep a pointer to each entry to read the URL after generation. + type posEntry struct { + pos int + imgSpec *persona.ImageSpec + } + orderedPositions := make([]posEntry, 0, len(spec.ImageMatrix)) + for i := range spec.ImageMatrix { + orderedPositions = append(orderedPositions, posEntry{ + pos: spec.ImageMatrix[i].Position, + imgSpec: &spec.ImageMatrix[i], + }) + } + // Swap position 1 to front if it isn't already. + for i, e := range orderedPositions { + if e.pos == 1 && i != 0 { + orderedPositions[0], orderedPositions[i] = orderedPositions[i], orderedPositions[0] + break + } + } + // Generate all 20 image positions, publishing progress events. - for _, imgSpec := range spec.ImageMatrix { - pos := imgSpec.Position + for _, entry := range orderedPositions { + pos := entry.pos sendEvent(&realtime.SSEEvent{ Type: "persona_image_started", JobID: job.ID, @@ -287,18 +323,11 @@ func QueueHandler(tg *textgen.Manager, mg *mediagen.Manager, store storage.Store } progress := (pos * 100) / 20 - url := "" - for _, is := range spec.ImageMatrix { - if is.Position == pos { - url = is.URL - break - } - } sendEvent(&realtime.SSEEvent{ Type: "persona_image_progress", JobID: job.ID, Progress: progress, - Result: map[string]any{"position": pos, "url": url}, + Result: map[string]any{"position": pos, "url": entry.imgSpec.URL}, }) } @@ -310,7 +339,8 @@ func QueueHandler(tg *textgen.Manager, mg *mediagen.Manager, store storage.Store Result: map[string]any{"personaId": spec.ID}, }) - // Generate 4 videos. + // Generate 4 videos. Videos are best-effort — a failed video does not abort the job, + // but a persona_video_failed event is sent so the frontend can reflect partial completion. for _, vs := range spec.Videos { sendEvent(&realtime.SSEEvent{ Type: "persona_video_started", @@ -322,7 +352,12 @@ func QueueHandler(tg *textgen.Manager, mg *mediagen.Manager, store storage.Store videoSpec, err := svc.GenerateVideo(ctx, spec, vs.MotionType) if err != nil { logger.Warn("persona video generation failed (non-fatal)", "error", err, "motion", vs.MotionType, "job_id", job.ID) - // Videos are best-effort; don't fail the entire job. + sendEvent(&realtime.SSEEvent{ + Type: "persona_video_failed", + JobID: job.ID, + Error: fmt.Sprintf("%s video failed: %s", vs.MotionType, err.Error()), + Result: map[string]any{"motionType": string(vs.MotionType)}, + }) continue } @@ -366,15 +401,16 @@ func buildBannerPrompt(spec *persona.PersonaSpec, style string) string { ) } -// inferGenerationTier infers a generation tier from the description keywords. +// inferGenerationTier infers a generation tier from the description keywords (case-insensitive). func inferGenerationTier(description string) persona.GenerationTier { + lower := strings.ToLower(description) for _, kw := range []string{"supermodel", "model", "editorial", "high fashion"} { - if contains(description, kw) { + if strings.Contains(lower, kw) { return persona.GenerationTierSupermodel } } for _, kw := range []string{"influencer", "content creator", "blogger", "social media"} { - if contains(description, kw) { + if strings.Contains(lower, kw) { return persona.GenerationTierInfluencer } } @@ -393,40 +429,5 @@ func inferAttractiveness(tier persona.GenerationTier) persona.AttractivenessTier } } -// contains checks if a string contains a substring (case-insensitive). -func contains(s, substr string) bool { - return len(s) >= len(substr) && - len(s) > 0 && - (s == substr || len(s) > 0 && stringContainsFold(s, substr)) -} - -func stringContainsFold(s, substr string) bool { - for i := 0; i <= len(s)-len(substr); i++ { - if equalFold(s[i:i+len(substr)], substr) { - return true - } - } - return false -} - -func equalFold(a, b string) bool { - if len(a) != len(b) { - return false - } - for i := 0; i < len(a); i++ { - ca, cb := a[i], b[i] - if ca >= 'A' && ca <= 'Z' { - ca += 'a' - 'A' - } - if cb >= 'A' && cb <= 'Z' { - cb += 'a' - 'A' - } - if ca != cb { - return false - } - } - return true -} - // now returns the current time. Useful for overriding in tests. var now = func() time.Time { return time.Now() } diff --git a/internal/adapter/templates/templates/skeleton/pkg/personagen/specgen.go.tmpl b/internal/adapter/templates/templates/skeleton/pkg/personagen/specgen.go.tmpl index acf66a1..237e53d 100644 --- a/internal/adapter/templates/templates/skeleton/pkg/personagen/specgen.go.tmpl +++ b/internal/adapter/templates/templates/skeleton/pkg/personagen/specgen.go.tmpl @@ -152,6 +152,10 @@ type dnaLLMResponse struct { TorsoLength string `json:"torso_length"` BustSize string `json:"bust_size,omitempty"` Posture string `json:"posture"` + // Heritage breakdown (only populated for mixed-ethnicity personas) + PrimaryHeritage string `json:"primary_heritage,omitempty"` + SecondaryHeritage string `json:"secondary_heritage,omitempty"` + MixPercentage int `json:"mix_percentage,omitempty"` // Voice Pitch string `json:"pitch"` PitchRange string `json:"pitch_range"` @@ -430,6 +434,15 @@ Return ONLY a JSON object (all fields required): identity.Name.First, identity.Age, identity.Gender, identity.Ethnicity, identity.Nationality, fashionCtx.Name, fashionCtx.Description) + if identity.Ethnicity == persona.EthnicityMixed { + prompt += ` + +IMPORTANT — this persona is mixed-race. Append these 3 fields to the JSON response: + "primary_heritage": one of ["east_asian","south_asian","southeast_asian","african","hispanic","middle_eastern","caucasian"], + "secondary_heritage": one of the same list (different from primary_heritage), + "mix_percentage": number (50-80, percentage that is primary heritage)` + } + resp, err := tg.GenerateText(ctx, textgen.TextRequest{ Prompt: prompt, MaxTokens: 900, @@ -445,13 +458,13 @@ Return ONLY a JSON object (all fields required): return nil, fmt.Errorf("parsing DNA response: %w", err) } - return &persona.DNA{ + dna := &persona.DNA{ Identity: persona.IdentityDNA{ Ethnicity: identity.Ethnicity, Age: identity.Age, Gender: identity.Gender, Nationality: identity.Nationality, - PrimaryHeritage: identity.Ethnicity, + PrimaryHeritage: identity.Ethnicity, // matches Ethnicity for non-mixed personas }, Face: persona.FaceDNA{ FaceShape: persona.FaceShapeCategory(r.FaceShape), @@ -505,7 +518,19 @@ Return ONLY a JSON object (all fields required): Clarity: persona.ClarityCategory(r.Clarity), Expressiveness: persona.ExpressivenessCategory(r.Expressiveness), }, - }, nil + } + + // Populate mixed-heritage breakdown when the LLM returned heritage fields. + if r.PrimaryHeritage != "" { + dna.Identity.PrimaryHeritage = persona.EthnicityCode(r.PrimaryHeritage) + } + if r.SecondaryHeritage != "" { + sec := persona.EthnicityCode(r.SecondaryHeritage) + dna.Identity.SecondaryHeritage = &sec + dna.Identity.MixPercentage = r.MixPercentage + } + + return dna, nil } // populateImageMatrix assigns outfit and fashion context details to each image spec diff --git a/internal/adapter/templates/templates/skeleton/pkg/personagen/videogen.go.tmpl b/internal/adapter/templates/templates/skeleton/pkg/personagen/videogen.go.tmpl index 16e17fd..677da58 100644 --- a/internal/adapter/templates/templates/skeleton/pkg/personagen/videogen.go.tmpl +++ b/internal/adapter/templates/templates/skeleton/pkg/personagen/videogen.go.tmpl @@ -12,6 +12,7 @@ import ( // generateVideo builds a Veo prompt for the given motion type and calls the mediagen provider. // Requires anchor bytes (position 1 image) as the reference frame for identity consistency. +// Returns the VideoSpec and the raw video bytes (to be uploaded by the caller). func generateVideo( ctx context.Context, mg *mediagen.Manager, @@ -19,9 +20,9 @@ func generateVideo( motionType persona.MotionType, anchor []byte, logger *slog.Logger, -) (*persona.VideoSpec, error) { +) (*persona.VideoSpec, []byte, error) { if mg == nil { - return nil, fmt.Errorf("mediagen not configured") + return nil, nil, fmt.Errorf("mediagen not configured") } // Find the matching VideoSpec in the spec's Videos slice. @@ -43,7 +44,7 @@ func generateVideo( } } if videoSpec == nil { - return nil, fmt.Errorf("unsupported motion type: %s", motionType) + return nil, nil, fmt.Errorf("unsupported motion type: %s", motionType) } prompt := buildVeoPrompt(spec, motionType) @@ -62,17 +63,32 @@ func generateVideo( }) if err != nil { videoSpec.Status = persona.VideoStatusFailed - return nil, fmt.Errorf("video provider error: %w", err) + return nil, nil, fmt.Errorf("video provider error: %w", err) } if len(resp.Videos) == 0 { videoSpec.Status = persona.VideoStatusFailed - return nil, fmt.Errorf("no videos returned from provider for motion type %s", motionType) + return nil, nil, fmt.Errorf("no videos returned from provider for motion type %s", motionType) } - // URL will be set by the caller after uploading to storage. videoSpec.Status = persona.VideoStatusComplete - return videoSpec, nil + return videoSpec, resp.Videos[0].Data, nil +} + +// pronounSet holds subject and object pronouns for a persona. +type pronounSet struct{ subject, object string } + +// genderPronouns returns appropriate pronouns based on the persona's gender identity. +func genderPronouns(spec *persona.PersonaSpec) pronounSet { + if spec.DNA != nil { + switch spec.DNA.Identity.Gender { + case persona.GenderMan: + return pronounSet{"He", "him"} + case persona.GenderNonBinary: + return pronounSet{"They", "them"} + } + } + return pronounSet{"She", "her"} } // buildVeoPrompt constructs a Veo video generation prompt for the given motion type. @@ -80,60 +96,59 @@ func generateVideo( func buildVeoPrompt(spec *persona.PersonaSpec, motionType persona.MotionType) string { identity := buildIdentityLine(spec) audio := buildAudioDescriptor(spec) + pronouns := genderPronouns(spec) switch motionType { case persona.MotionSmileReveal: - return buildSmileRevealPrompt(identity, audio) + return buildSmileRevealPrompt(identity, audio, pronouns) case persona.MotionPersonality: - return buildPersonalityPrompt(spec, identity, audio) + return buildPersonalityPrompt(spec, identity, audio, pronouns) case persona.MotionLifestyle: - return buildLifestylePrompt(spec, identity, audio) + return buildLifestylePrompt(spec, identity, audio, pronouns) case persona.MotionInvitation: - return buildInvitationPrompt(spec, identity, audio) + return buildInvitationPrompt(spec, identity, audio, pronouns) default: return fmt.Sprintf("%s Natural, candid moment, warm natural lighting. %s", identity, audio) } } // buildSmileRevealPrompt creates a warm, genuine smile reveal video prompt. -func buildSmileRevealPrompt(identity, audio string) string { +func buildSmileRevealPrompt(identity, audio string, p pronounSet) string { return fmt.Sprintf( - "%s She looks slightly away, then turns directly to camera with a warm, genuine smile — "+ + "%s %s looks slightly away, then turns directly to camera with a warm, genuine smile — "+ "eyes lighting up, expression full of warmth and personality. "+ "Soft natural lighting, close-up framing, shallow depth of field. "+ "Slow motion for the smile reveal moment. %s", - identity, audio, + identity, p.subject, audio, ) } // buildPersonalityPrompt creates an expressive personality showcase video prompt. -func buildPersonalityPrompt(spec *persona.PersonaSpec, identity, audio string) string { - extraversion := "moderate" +func buildPersonalityPrompt(spec *persona.PersonaSpec, identity, audio string, p pronounSet) string { + expressStyle := "warm and natural" if spec.DNA != nil { - // We don't have HEXACO in DNA; use voice expressiveness as a proxy. + // Use voice expressiveness as a proxy for personality energy. switch spec.DNA.Voice.Expressiveness { case persona.ExpressivenessAnimated: - extraversion = "highly expressive and animated" + expressStyle = "highly expressive and animated" case persona.ExpressivenessExpressive: - extraversion = "expressive and engaging" - default: - extraversion = "warm and natural" + expressStyle = "expressive and engaging" } } return fmt.Sprintf( - "%s A candid personality moment — she is %s, laughing or reacting naturally, "+ + "%s A candid personality moment — %s is %s, laughing or reacting naturally, "+ "full of charisma. Dynamic handheld camera movement. "+ "Golden hour or warm studio lighting. "+ "Cut between close-up and mid-shot for rhythm. %s", - identity, extraversion, audio, + identity, strings.ToLower(p.subject), expressStyle, audio, ) } // buildLifestylePrompt creates a contextual lifestyle video prompt. -func buildLifestylePrompt(spec *persona.PersonaSpec, identity, audio string) string { +func buildLifestylePrompt(spec *persona.PersonaSpec, identity, audio string, p pronounSet) string { scene := "stylish urban environment" - activity := "going about her day" + activity := "going about their day" if spec.Lifestyle.VacationStyle.Primary != "" { switch spec.Lifestyle.VacationStyle.Primary { @@ -151,7 +166,7 @@ func buildLifestylePrompt(spec *persona.PersonaSpec, identity, audio string) str activity = "enjoying a refined moment" case "cultural": scene = "culturally rich environment" - activity = "immersed in her surroundings" + activity = "immersed in their surroundings" } } @@ -160,24 +175,24 @@ func buildLifestylePrompt(spec *persona.PersonaSpec, identity, audio string) str } return fmt.Sprintf( - "%s A natural lifestyle moment — she is %s in a %s. "+ + "%s A natural lifestyle moment — %s is %s in a %s. "+ "Wide establishing shot transitioning to mid-shot. "+ "Cinematic 16:9 composition, natural movement, vibrant color grading. %s", - identity, activity, scene, audio, + identity, strings.ToLower(p.subject), activity, scene, audio, ) } // buildInvitationPrompt creates a direct-address invitation video prompt. -func buildInvitationPrompt(spec *persona.PersonaSpec, identity, audio string) string { +func buildInvitationPrompt(spec *persona.PersonaSpec, identity, audio string, p pronounSet) string { name := spec.Name.First return fmt.Sprintf( - "%s She looks directly into the camera with a warm, confident expression. "+ + "%s %s looks directly into the camera with a warm, confident expression. "+ "%s gestures naturally as if personally inviting the viewer, "+ "making direct eye contact, with a knowing smile. "+ "Close-up to mid-shot. Clean, aspirational background. "+ "Cinematic vertical 9:16 framing. %s", - identity, name, audio, + identity, p.subject, name, audio, ) }