feat: wire mixed-heritage through Stage 4 and fix pronoun support
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful

- specgen: extend dnaLLMResponse with heritage fields; conditionally extend
  Stage 4 prompt for EthnicityMixed to ask LLM for primary_heritage,
  secondary_heritage, and mix_percentage; populate IdentityDNA fields from
  response so mixed personas get a real heritage breakdown
- imagegen: buildIdentitySection() produces "East Asian and Latina/Hispanic
  heritage" description for mixed personas instead of generic "mixed-race"
- videogen: add genderPronouns() helper; replace hardcoded she/her with
  pronoun set across all 4 video prompts; generateVideo() returns raw bytes
  so caller can upload to storage
- service: GenerateVideo() uploads video to storage and sets VideoSpec.URL;
  anchor ordering ensures position 1 is generated first; emit
  persona_video_failed SSE event on non-fatal video failures; replace manual
  fold helpers with strings.ToLower + strings.Contains
- worker/main: register persona_generate handler when both AI managers ready
- docs: add persona_video_failed to SSE events reference in personagen.md

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
jordan 2026-02-23 01:21:59 -07:00
parent 002c32aedb
commit 3979ef2d08
6 changed files with 145 additions and 87 deletions

View File

@ -16,6 +16,7 @@ import (
"{{GO_MODULE}}/pkg/logging"
"{{GO_MODULE}}/pkg/mediagen"
mediagenAdapters "{{GO_MODULE}}/pkg/mediagen/adapters"
"{{GO_MODULE}}/pkg/personagen"
"{{GO_MODULE}}/pkg/queue"
"{{GO_MODULE}}/pkg/realtime"
"{{GO_MODULE}}/pkg/storage"
@ -216,6 +217,10 @@ func main() {
handler.RegisterHandler("generate_text", handlers.TextHandler(textgenManager, ssePub, logger))
handler.RegisterHandler("ai_chat_response", handlers.ChatResponseHandler(textgenManager, ssePub, logger))
}
// Persona generation requires both textgen (5-stage LLM pipeline) and mediagen (20 images + 4 videos).
if textgenManager != nil && mediagenManager != nil {
handler.RegisterHandler("persona_generate", personagen.QueueHandler(textgenManager, mediagenManager, mediaStore, ssePub, logger.Logger))
}
// Setup signal handling
sigCh := make(chan os.Signal, 1)

View File

@ -76,6 +76,7 @@ Subscribe to `user:<userId>` channel before calling the generate endpoint:
{"type": "persona_image_complete", "jobId": "...", "progress": 100, "result": {"personaId": "..."}}
{"type": "persona_video_started", "jobId": "...", "result": {"motionType": "smile_reveal"}}
{"type": "persona_video_complete", "jobId": "...", "result": {"motionType": "smile_reveal", "url": "..."}}
{"type": "persona_video_failed", "jobId": "...", "error": "smile_reveal video failed: ...", "result": {"motionType": "smile_reveal"}}
{"type": "persona_failed", "jobId": "...", "error": "Spec generation failed: ..."}
```

View File

@ -77,6 +77,8 @@ func buildHEIAPrompt(spec *persona.PersonaSpec, imgSpec *persona.ImageSpec) stri
// buildIdentitySection creates the [IDENTITY] section.
// Example: "[IDENTITY] 26-year-old Korean woman, 5'4" (163cm), slender-athletic build."
// For mixed-race personas with a resolved heritage breakdown, produces e.g.
// "[IDENTITY] 26-year-old East Asian and Latina/Hispanic heritage woman, ..."
func buildIdentitySection(spec *persona.PersonaSpec) string {
if spec.DNA == nil {
return ""
@ -84,11 +86,20 @@ func buildIdentitySection(spec *persona.PersonaSpec) string {
id := spec.DNA.Identity
body := spec.DNA.Body
ethnicityDesc := ethnicitToAdj(id.Ethnicity)
if id.SecondaryHeritage != nil {
ethnicityDesc = fmt.Sprintf(
"%s and %s heritage",
ethnicitToAdj(id.PrimaryHeritage),
ethnicitToAdj(*id.SecondaryHeritage),
)
}
heightFt := cmToFeet(body.HeightCM)
return fmt.Sprintf(
"[IDENTITY] %d-year-old %s %s, %s (%dcm), %s build.",
id.Age,
ethnicitToAdj(id.Ethnicity),
ethnicityDesc,
strings.ToLower(string(id.Gender)),
heightFt,
body.HeightCM,

View File

@ -19,6 +19,7 @@ import (
"errors"
"fmt"
"log/slog"
"strings"
"time"
"{{GO_MODULE}}/pkg/mediagen"
@ -122,14 +123,27 @@ func (s *Service) GenerateImages(ctx context.Context, spec *persona.PersonaSpec,
return nil
}
// GenerateVideo generates a video for the given motion type.
// GenerateVideo generates a video for the given motion type and uploads it to storage.
// Requires SetAnchor() to have been called first (or GenerateImages() for position 1).
// Returns ErrAnchorNotSet if no anchor is available.
func (s *Service) GenerateVideo(ctx context.Context, spec *persona.PersonaSpec, motionType persona.MotionType) (*persona.VideoSpec, error) {
if s.anchor == nil {
return nil, ErrAnchorNotSet
}
return generateVideo(ctx, s.mediagen, spec, motionType, s.anchor, s.logger)
videoSpec, videoData, err := generateVideo(ctx, s.mediagen, spec, motionType, s.anchor, s.logger)
if err != nil {
return nil, err
}
storagePath := fmt.Sprintf("personas/%s/videos/%s.mp4", spec.ID, string(motionType))
url, err := s.store.Upload(ctx, storagePath, videoData, "video/mp4")
if err != nil {
videoSpec.Status = persona.VideoStatusFailed
return nil, fmt.Errorf("storing video %s: %w", motionType, err)
}
videoSpec.URL = url
return videoSpec, nil
}
// GenerateAvatar generates a square profile picture (close-up face, 1:1 crop).
@ -266,9 +280,31 @@ func QueueHandler(tg *textgen.Manager, mg *mediagen.Manager, store storage.Store
Result: map[string]any{"personaId": spec.ID},
})
// Build an ordered position list — position 1 (anchor) must always be generated first.
// generatePosition() mutates the spec.ImageMatrix entry in place (URL, Status),
// so we keep a pointer to each entry to read the URL after generation.
type posEntry struct {
pos int
imgSpec *persona.ImageSpec
}
orderedPositions := make([]posEntry, 0, len(spec.ImageMatrix))
for i := range spec.ImageMatrix {
orderedPositions = append(orderedPositions, posEntry{
pos: spec.ImageMatrix[i].Position,
imgSpec: &spec.ImageMatrix[i],
})
}
// Swap position 1 to front if it isn't already.
for i, e := range orderedPositions {
if e.pos == 1 && i != 0 {
orderedPositions[0], orderedPositions[i] = orderedPositions[i], orderedPositions[0]
break
}
}
// Generate all 20 image positions, publishing progress events.
for _, imgSpec := range spec.ImageMatrix {
pos := imgSpec.Position
for _, entry := range orderedPositions {
pos := entry.pos
sendEvent(&realtime.SSEEvent{
Type: "persona_image_started",
JobID: job.ID,
@ -287,18 +323,11 @@ func QueueHandler(tg *textgen.Manager, mg *mediagen.Manager, store storage.Store
}
progress := (pos * 100) / 20
url := ""
for _, is := range spec.ImageMatrix {
if is.Position == pos {
url = is.URL
break
}
}
sendEvent(&realtime.SSEEvent{
Type: "persona_image_progress",
JobID: job.ID,
Progress: progress,
Result: map[string]any{"position": pos, "url": url},
Result: map[string]any{"position": pos, "url": entry.imgSpec.URL},
})
}
@ -310,7 +339,8 @@ func QueueHandler(tg *textgen.Manager, mg *mediagen.Manager, store storage.Store
Result: map[string]any{"personaId": spec.ID},
})
// Generate 4 videos.
// Generate 4 videos. Videos are best-effort — a failed video does not abort the job,
// but a persona_video_failed event is sent so the frontend can reflect partial completion.
for _, vs := range spec.Videos {
sendEvent(&realtime.SSEEvent{
Type: "persona_video_started",
@ -322,7 +352,12 @@ func QueueHandler(tg *textgen.Manager, mg *mediagen.Manager, store storage.Store
videoSpec, err := svc.GenerateVideo(ctx, spec, vs.MotionType)
if err != nil {
logger.Warn("persona video generation failed (non-fatal)", "error", err, "motion", vs.MotionType, "job_id", job.ID)
// Videos are best-effort; don't fail the entire job.
sendEvent(&realtime.SSEEvent{
Type: "persona_video_failed",
JobID: job.ID,
Error: fmt.Sprintf("%s video failed: %s", vs.MotionType, err.Error()),
Result: map[string]any{"motionType": string(vs.MotionType)},
})
continue
}
@ -366,15 +401,16 @@ func buildBannerPrompt(spec *persona.PersonaSpec, style string) string {
)
}
// inferGenerationTier infers a generation tier from the description keywords.
// inferGenerationTier infers a generation tier from the description keywords (case-insensitive).
func inferGenerationTier(description string) persona.GenerationTier {
lower := strings.ToLower(description)
for _, kw := range []string{"supermodel", "model", "editorial", "high fashion"} {
if contains(description, kw) {
if strings.Contains(lower, kw) {
return persona.GenerationTierSupermodel
}
}
for _, kw := range []string{"influencer", "content creator", "blogger", "social media"} {
if contains(description, kw) {
if strings.Contains(lower, kw) {
return persona.GenerationTierInfluencer
}
}
@ -393,40 +429,5 @@ func inferAttractiveness(tier persona.GenerationTier) persona.AttractivenessTier
}
}
// contains reports whether substr occurs within s, ignoring ASCII letter case.
// An empty s never matches, not even against an empty substr.
func contains(s, substr string) bool {
	if s == "" || len(substr) > len(s) {
		return false
	}
	// An exact match is just the window at offset 0, so the fold scan covers it.
	return stringContainsFold(s, substr)
}
// stringContainsFold slides a window of len(substr) bytes across s and reports
// whether any window matches substr under equalFold. When substr is longer
// than s there is no window to test and the result is false.
func stringContainsFold(s, substr string) bool {
	width := len(substr)
	for end := width; end <= len(s); end++ {
		if equalFold(s[end-width:end], substr) {
			return true
		}
	}
	return false
}
// equalFold reports whether a and b have the same length and match byte for
// byte once the ASCII letters A-Z are folded to lowercase. All other bytes,
// including those of multi-byte UTF-8 sequences, are compared exactly.
func equalFold(a, b string) bool {
	if len(a) != len(b) {
		return false
	}
	// toLower folds one ASCII uppercase byte; every other byte passes through.
	toLower := func(c byte) byte {
		if c < 'A' || c > 'Z' {
			return c
		}
		return c + ('a' - 'A')
	}
	// Index by byte on purpose: ranging over the string would step by rune.
	for i := 0; i < len(a); i++ {
		if toLower(a[i]) != toLower(b[i]) {
			return false
		}
	}
	return true
}
// now yields the current time. It is a package-level variable rather than a
// direct time.Now() call so tests can swap in their own clock.
var now = time.Now

View File

@ -152,6 +152,10 @@ type dnaLLMResponse struct {
TorsoLength string `json:"torso_length"`
BustSize string `json:"bust_size,omitempty"`
Posture string `json:"posture"`
// Heritage breakdown (only populated for mixed-ethnicity personas)
PrimaryHeritage string `json:"primary_heritage,omitempty"`
SecondaryHeritage string `json:"secondary_heritage,omitempty"`
MixPercentage int `json:"mix_percentage,omitempty"`
// Voice
Pitch string `json:"pitch"`
PitchRange string `json:"pitch_range"`
@ -430,6 +434,15 @@ Return ONLY a JSON object (all fields required):
identity.Name.First, identity.Age, identity.Gender, identity.Ethnicity, identity.Nationality,
fashionCtx.Name, fashionCtx.Description)
if identity.Ethnicity == persona.EthnicityMixed {
prompt += `
IMPORTANT — this persona is mixed-race. Append these 3 fields to the JSON response:
"primary_heritage": one of ["east_asian","south_asian","southeast_asian","african","hispanic","middle_eastern","caucasian"],
"secondary_heritage": one of the same list (different from primary_heritage),
"mix_percentage": number (50-80, percentage that is primary heritage)`
}
resp, err := tg.GenerateText(ctx, textgen.TextRequest{
Prompt: prompt,
MaxTokens: 900,
@ -445,13 +458,13 @@ Return ONLY a JSON object (all fields required):
return nil, fmt.Errorf("parsing DNA response: %w", err)
}
return &persona.DNA{
dna := &persona.DNA{
Identity: persona.IdentityDNA{
Ethnicity: identity.Ethnicity,
Age: identity.Age,
Gender: identity.Gender,
Nationality: identity.Nationality,
PrimaryHeritage: identity.Ethnicity,
PrimaryHeritage: identity.Ethnicity, // matches Ethnicity for non-mixed personas
},
Face: persona.FaceDNA{
FaceShape: persona.FaceShapeCategory(r.FaceShape),
@ -505,7 +518,19 @@ Return ONLY a JSON object (all fields required):
Clarity: persona.ClarityCategory(r.Clarity),
Expressiveness: persona.ExpressivenessCategory(r.Expressiveness),
},
}, nil
}
// Populate mixed-heritage breakdown when the LLM returned heritage fields.
if r.PrimaryHeritage != "" {
dna.Identity.PrimaryHeritage = persona.EthnicityCode(r.PrimaryHeritage)
}
if r.SecondaryHeritage != "" {
sec := persona.EthnicityCode(r.SecondaryHeritage)
dna.Identity.SecondaryHeritage = &sec
dna.Identity.MixPercentage = r.MixPercentage
}
return dna, nil
}
// populateImageMatrix assigns outfit and fashion context details to each image spec

View File

@ -12,6 +12,7 @@ import (
// generateVideo builds a Veo prompt for the given motion type and calls the mediagen provider.
// Requires anchor bytes (position 1 image) as the reference frame for identity consistency.
// Returns the VideoSpec and the raw video bytes (to be uploaded by the caller).
func generateVideo(
ctx context.Context,
mg *mediagen.Manager,
@ -19,9 +20,9 @@ func generateVideo(
motionType persona.MotionType,
anchor []byte,
logger *slog.Logger,
) (*persona.VideoSpec, error) {
) (*persona.VideoSpec, []byte, error) {
if mg == nil {
return nil, fmt.Errorf("mediagen not configured")
return nil, nil, fmt.Errorf("mediagen not configured")
}
// Find the matching VideoSpec in the spec's Videos slice.
@ -43,7 +44,7 @@ func generateVideo(
}
}
if videoSpec == nil {
return nil, fmt.Errorf("unsupported motion type: %s", motionType)
return nil, nil, fmt.Errorf("unsupported motion type: %s", motionType)
}
prompt := buildVeoPrompt(spec, motionType)
@ -62,17 +63,32 @@ func generateVideo(
})
if err != nil {
videoSpec.Status = persona.VideoStatusFailed
return nil, fmt.Errorf("video provider error: %w", err)
return nil, nil, fmt.Errorf("video provider error: %w", err)
}
if len(resp.Videos) == 0 {
videoSpec.Status = persona.VideoStatusFailed
return nil, fmt.Errorf("no videos returned from provider for motion type %s", motionType)
return nil, nil, fmt.Errorf("no videos returned from provider for motion type %s", motionType)
}
// URL will be set by the caller after uploading to storage.
videoSpec.Status = persona.VideoStatusComplete
return videoSpec, nil
return videoSpec, resp.Videos[0].Data, nil
}
// pronounSet groups the subject and object pronoun forms used for a persona.
type pronounSet struct {
	subject string // subject form, e.g. "She"
	object  string // object form, e.g. "her"
}
// genderPronouns picks the pronoun set matching the gender recorded in the
// persona's DNA: He/him for GenderMan, They/them for GenderNonBinary.
// It falls back to She/her when the spec has no DNA or the gender is any
// other value.
func genderPronouns(spec *persona.PersonaSpec) pronounSet {
	fallback := pronounSet{subject: "She", object: "her"}
	if spec.DNA == nil {
		return fallback
	}

	gender := spec.DNA.Identity.Gender
	if gender == persona.GenderMan {
		return pronounSet{subject: "He", object: "him"}
	}
	if gender == persona.GenderNonBinary {
		return pronounSet{subject: "They", object: "them"}
	}
	return fallback
}
// buildVeoPrompt constructs a Veo video generation prompt for the given motion type.
@ -80,60 +96,59 @@ func generateVideo(
func buildVeoPrompt(spec *persona.PersonaSpec, motionType persona.MotionType) string {
identity := buildIdentityLine(spec)
audio := buildAudioDescriptor(spec)
pronouns := genderPronouns(spec)
switch motionType {
case persona.MotionSmileReveal:
return buildSmileRevealPrompt(identity, audio)
return buildSmileRevealPrompt(identity, audio, pronouns)
case persona.MotionPersonality:
return buildPersonalityPrompt(spec, identity, audio)
return buildPersonalityPrompt(spec, identity, audio, pronouns)
case persona.MotionLifestyle:
return buildLifestylePrompt(spec, identity, audio)
return buildLifestylePrompt(spec, identity, audio, pronouns)
case persona.MotionInvitation:
return buildInvitationPrompt(spec, identity, audio)
return buildInvitationPrompt(spec, identity, audio, pronouns)
default:
return fmt.Sprintf("%s Natural, candid moment, warm natural lighting. %s", identity, audio)
}
}
// buildSmileRevealPrompt creates a warm, genuine smile reveal video prompt.
func buildSmileRevealPrompt(identity, audio string) string {
func buildSmileRevealPrompt(identity, audio string, p pronounSet) string {
return fmt.Sprintf(
"%s She looks slightly away, then turns directly to camera with a warm, genuine smile — "+
"%s %s looks slightly away, then turns directly to camera with a warm, genuine smile — "+
"eyes lighting up, expression full of warmth and personality. "+
"Soft natural lighting, close-up framing, shallow depth of field. "+
"Slow motion for the smile reveal moment. %s",
identity, audio,
identity, p.subject, audio,
)
}
// buildPersonalityPrompt creates an expressive personality showcase video prompt.
func buildPersonalityPrompt(spec *persona.PersonaSpec, identity, audio string) string {
extraversion := "moderate"
func buildPersonalityPrompt(spec *persona.PersonaSpec, identity, audio string, p pronounSet) string {
expressStyle := "warm and natural"
if spec.DNA != nil {
// We don't have HEXACO in DNA; use voice expressiveness as a proxy.
// Use voice expressiveness as a proxy for personality energy.
switch spec.DNA.Voice.Expressiveness {
case persona.ExpressivenessAnimated:
extraversion = "highly expressive and animated"
expressStyle = "highly expressive and animated"
case persona.ExpressivenessExpressive:
extraversion = "expressive and engaging"
default:
extraversion = "warm and natural"
expressStyle = "expressive and engaging"
}
}
return fmt.Sprintf(
"%s A candid personality moment — she is %s, laughing or reacting naturally, "+
"%s A candid personality moment — %s is %s, laughing or reacting naturally, "+
"full of charisma. Dynamic handheld camera movement. "+
"Golden hour or warm studio lighting. "+
"Cut between close-up and mid-shot for rhythm. %s",
identity, extraversion, audio,
identity, strings.ToLower(p.subject), expressStyle, audio,
)
}
// buildLifestylePrompt creates a contextual lifestyle video prompt.
func buildLifestylePrompt(spec *persona.PersonaSpec, identity, audio string) string {
func buildLifestylePrompt(spec *persona.PersonaSpec, identity, audio string, p pronounSet) string {
scene := "stylish urban environment"
activity := "going about her day"
activity := "going about their day"
if spec.Lifestyle.VacationStyle.Primary != "" {
switch spec.Lifestyle.VacationStyle.Primary {
@ -151,7 +166,7 @@ func buildLifestylePrompt(spec *persona.PersonaSpec, identity, audio string) str
activity = "enjoying a refined moment"
case "cultural":
scene = "culturally rich environment"
activity = "immersed in her surroundings"
activity = "immersed in their surroundings"
}
}
@ -160,24 +175,24 @@ func buildLifestylePrompt(spec *persona.PersonaSpec, identity, audio string) str
}
return fmt.Sprintf(
"%s A natural lifestyle moment — she is %s in a %s. "+
"%s A natural lifestyle moment — %s is %s in a %s. "+
"Wide establishing shot transitioning to mid-shot. "+
"Cinematic 16:9 composition, natural movement, vibrant color grading. %s",
identity, activity, scene, audio,
identity, strings.ToLower(p.subject), activity, scene, audio,
)
}
// buildInvitationPrompt creates a direct-address invitation video prompt.
func buildInvitationPrompt(spec *persona.PersonaSpec, identity, audio string) string {
func buildInvitationPrompt(spec *persona.PersonaSpec, identity, audio string, p pronounSet) string {
name := spec.Name.First
return fmt.Sprintf(
"%s She looks directly into the camera with a warm, confident expression. "+
"%s %s looks directly into the camera with a warm, confident expression. "+
"%s gestures naturally as if personally inviting the viewer, "+
"making direct eye contact, with a knowing smile. "+
"Close-up to mid-shot. Clean, aspirational background. "+
"Cinematic vertical 9:16 framing. %s",
identity, name, audio,
identity, p.subject, name, audio,
)
}