persona-community-5/pkg/laozhang/video.go

package laozhang

import (
	"context"
	"encoding/json"
	"fmt"
	"regexp"
	"strings"
)

// Defaults that GenerateVideo applies when the request leaves a field unset.
const (
	defaultVideoModel = "veo-3.1" // used when VideoRequest.Model is empty
	defaultVideoCount = 1         // used when VideoRequest.N is zero
)

// VideoRequest is the caller-facing input to Client.GenerateVideo.
//
// Only Prompt is mandatory. GenerateVideo substitutes the default model when
// Model is empty and the default count when N is zero.
type VideoRequest struct {
	Model           string   `json:"model"`                      // Model to use (default: "veo-3.1")
	Prompt          string   `json:"prompt"`                     // Required: text description of the desired video (GenerateVideo rejects an empty prompt even when reference images are given)
	N               int      `json:"n,omitempty"`                // Number of videos to generate (1-4, default: 1)
	ReferenceImages []string `json:"reference_images,omitempty"` // Optional: base64 or URLs for image-to-video; each entry is forwarded unchanged as an image_url content part
}

// VideoResponse is the result returned by Client.GenerateVideo.
//
// ID and Created are copied from the underlying chat completion response;
// Data holds one entry per video URL extracted from the response choices.
type VideoResponse struct {
	ID      string      `json:"id"`      // Response ID
	Created int64       `json:"created"` // Unix timestamp of when the video was created
	Data    []VideoData `json:"data"`    // List of generated videos
}

// VideoData represents a single generated video, identified by the URL
// extracted from the API response.
type VideoData struct {
	URL string `json:"url"` // Video URL
}

// videoChatMessage is the request-side chat message sent for video generation.
// Its content is always an array of parts (text and/or image_url); the
// response side uses videoChatResponseMessage because the reply content may
// also be a plain string.
type videoChatMessage struct {
	Role    string                 `json:"role"`    // Message role; GenerateVideo sends "user"
	Content []videoChatContentPart `json:"content"` // Ordered content parts: prompt text first, then reference images
}

// videoChatResponseMessage is the response-side chat message. Content is kept
// as raw JSON so the caller can decode it either as a string or as an array
// of content parts, depending on what the API returned.
type videoChatResponseMessage struct {
	Role    string          `json:"role"`    // Message role as reported by the API
	Content json.RawMessage `json:"content"` // Can be string or []videoChatContentPart
}

// videoChatContentPart is one element of a message's content array. Type
// selects which of the optional fields is meaningful: Text for "text" parts,
// ImageURL for "image_url" parts. Unused fields are omitted from the JSON.
type videoChatContentPart struct {
	Type     string             `json:"type"`                // "text" or "image_url"
	Text     string             `json:"text,omitempty"`      // Text content
	ImageURL *videoChatImageURL `json:"image_url,omitempty"` // Image URL content
}

// videoChatImageURL wraps the location of a reference image inside an
// "image_url" content part.
type videoChatImageURL struct {
	URL string `json:"url"` // Base64 data URL or HTTP(S) URL
}

// videoChatRequest is the chat completion payload that GenerateVideo posts to
// "/chat/completions". Stream and N carry no omitempty tag, so both are always
// serialized, including Stream's false value.
type videoChatRequest struct {
	Model    string             `json:"model"`    // Model name, after defaulting
	Messages []videoChatMessage `json:"messages"` // Conversation; GenerateVideo sends a single user message
	Stream   bool               `json:"stream"`   // GenerateVideo always sends false
	N        int                `json:"n"`        // Number of videos requested, after defaulting and validation
}

// videoChatResponse is the subset of the chat completion response from the
// Veo API that GenerateVideo decodes with json.Unmarshal; response fields not
// listed here are ignored during decoding.
type videoChatResponse struct {
	ID      string            `json:"id"`      // Response ID, copied into VideoResponse.ID
	Created int64             `json:"created"` // Creation timestamp, copied into VideoResponse.Created
	Choices []videoChatChoice `json:"choices"` // Choices whose message content is scanned for video URLs
}

// videoChatChoice represents a single choice in the chat response. Only the
// message is decoded; its content is where the video URL is looked up.
type videoChatChoice struct {
	Message videoChatResponseMessage `json:"message"` // Reply message holding the video URL in its content
}

// GenerateVideo generates videos based on the provided prompt and optional reference images
// using the Veo 3.1 models via the chat completions API format.
//
// For text-to-video, only the Prompt field is required.
// For image-to-video (first/last frame interpolation), use models ending in "-fl" and provide ReferenceImages.
//
// Supported models:
//   - veo-3.1 (standard, $0.25/gen)
//   - veo-3.1-fast ($0.15/gen)
//   - veo-3.1-fl (first/last frame interpolation)
//   - veo-3.1-fast-fl (fast with interpolation)
//   - Add "-landscape" suffix for landscape variants (e.g., "veo-3.1-landscape")
//
// An empty Model defaults to "veo-3.1" and a zero N defaults to 1.
//
// It returns an error wrapping ErrInvalidConfig when the prompt is empty or
// whitespace-only, or when N is outside 1-4. Errors from the HTTP request are
// returned unchanged. An error is also returned when the response body cannot
// be decoded or when no http(s) video URL can be found in any choice; reply
// text that is not a URL (for example an error sentence) is never reported as
// a video.
func (c *Client) GenerateVideo(ctx context.Context, req VideoRequest) (*VideoResponse, error) {
	// Validate before issuing a slow generation request; a whitespace-only
	// prompt carries no description and is treated like an empty one.
	if strings.TrimSpace(req.Prompt) == "" {
		return nil, fmt.Errorf("%w: prompt is required", ErrInvalidConfig)
	}

	// Apply defaults on the local copy of the request.
	if req.Model == "" {
		req.Model = defaultVideoModel
	}
	if req.N == 0 {
		req.N = defaultVideoCount
	}

	// Validate N after defaulting so that a zero value is accepted as "unset".
	if req.N < 1 || req.N > 4 {
		return nil, fmt.Errorf("%w: n must be between 1 and 4, got %d", ErrInvalidConfig, req.N)
	}

	// Video generation takes minutes, so the request goes through the
	// dedicated video request path rather than the regular one.
	respBody, err := c.doRequestVideo(ctx, "POST", "/chat/completions", newVideoChatRequest(req))
	if err != nil {
		return nil, err
	}

	var chatResp videoChatResponse
	if err := json.Unmarshal(respBody, &chatResp); err != nil {
		return nil, fmt.Errorf("unmarshal response: %w", err)
	}

	videoResp := &VideoResponse{
		ID:      chatResp.ID,
		Created: chatResp.Created,
		Data:    make([]VideoData, 0, len(chatResp.Choices)),
	}
	for _, choice := range chatResp.Choices {
		for _, u := range videoURLsFromContent(choice.Message.Content) {
			videoResp.Data = append(videoResp.Data, VideoData{URL: u})
		}
	}

	if len(videoResp.Data) == 0 {
		return nil, fmt.Errorf("no video URLs in response (id=%s, choices=%d)", chatResp.ID, len(chatResp.Choices))
	}

	return videoResp, nil
}

// newVideoChatRequest converts an already validated and defaulted VideoRequest
// into the chat completion payload: a single user message whose first content
// part is the prompt text, followed by one image_url part per reference image.
func newVideoChatRequest(req VideoRequest) videoChatRequest {
	parts := make([]videoChatContentPart, 0, 1+len(req.ReferenceImages))
	parts = append(parts, videoChatContentPart{Type: "text", Text: req.Prompt})
	for _, ref := range req.ReferenceImages {
		parts = append(parts, videoChatContentPart{
			Type:     "image_url",
			ImageURL: &videoChatImageURL{URL: ref},
		})
	}

	return videoChatRequest{
		Model:    req.Model,
		Messages: []videoChatMessage{{Role: "user", Content: parts}},
		Stream:   false,
		N:        req.N,
	}
}

// videoURLsFromContent returns the http(s) video URLs found in one choice's
// raw message content. The content may be a JSON string (a bare URL or a
// markdown link) or a JSON array of content parts, of which only non-empty
// "text" parts are inspected. Content that decodes as neither form yields nil.
func videoURLsFromContent(content json.RawMessage) []string {
	// Try the plain-string form first; when it decodes, the array form is
	// not attempted.
	var text string
	if err := json.Unmarshal(content, &text); err == nil {
		if u := extractVideoURL(text); isHTTPVideoURL(u) {
			return []string{u}
		}
		return nil
	}

	var parts []videoChatContentPart
	if err := json.Unmarshal(content, &parts); err != nil {
		return nil
	}

	var urls []string
	for _, part := range parts {
		if part.Type != "text" || part.Text == "" {
			continue
		}
		if u := extractVideoURL(part.Text); isHTTPVideoURL(u) {
			urls = append(urls, u)
		}
	}
	return urls
}

// isHTTPVideoURL reports whether s starts with an http:// or https:// scheme
// (case-insensitive). It filters out the case where extractVideoURL hands back
// unrecognized text unchanged, so such text is not mistaken for a video URL.
func isHTTPVideoURL(s string) bool {
	lower := strings.ToLower(s)
	return strings.HasPrefix(lower, "http://") || strings.HasPrefix(lower, "https://")
}

// Patterns used by extractVideoURL, compiled once at package initialization
// rather than on every call.
var (
	// videoMarkdownLinkRe matches a markdown link "[text](url)" and captures
	// the http(s) URL inside the parentheses.
	videoMarkdownLinkRe = regexp.MustCompile(`\[.*?\]\((https?://[^\s\)]+)\)`)
	// videoBareURLRe captures the first http(s) URL, up to the next whitespace.
	videoBareURLRe = regexp.MustCompile(`(https?://[^\s]+)`)
)

// videoURLTrailingPunct lists characters that are treated as surrounding
// prose or markup, not part of the URL, when they end a bare URL match.
const videoURLTrailingPunct = `.,;:!?"'>`

// extractVideoURL extracts a clean URL from various response formats.
// LaoZhang sometimes returns URLs wrapped in markdown: [download video](https://...)
// This function handles both plain URLs and markdown-wrapped URLs.
//
// Resolution order, after trimming surrounding whitespace:
//  1. the URL of the first markdown link, if any;
//  2. the first bare http(s) URL, with trailing sentence punctuation and
//     unbalanced closing parentheses removed (so "(see https://x/v.mp4)." yields
//     "https://x/v.mp4");
//  3. otherwise the trimmed input itself, which may not be a URL at all;
//     callers that need a real URL must check the result.
func extractVideoURL(raw string) string {
	raw = strings.TrimSpace(raw)

	if m := videoMarkdownLinkRe.FindStringSubmatch(raw); len(m) > 1 {
		return m[1]
	}

	if m := videoBareURLRe.FindStringSubmatch(raw); len(m) > 1 {
		return trimVideoURLTail(m[1])
	}

	// No pattern matched: return the trimmed input as-is.
	return raw
}

// trimVideoURLTail strips characters from the end of a bare URL match that
// belong to the surrounding text: sentence punctuation, quotes, a closing
// angle bracket, and any ")" that has no matching "(" inside the URL.
func trimVideoURLTail(u string) string {
	for len(u) > 0 {
		last := u[len(u)-1]
		// A ")" is kept when it is balanced, e.g. ".../clip_(final)".
		unbalancedParen := last == ')' && strings.Count(u, ")") > strings.Count(u, "(")
		if !unbalancedParen && strings.IndexByte(videoURLTrailingPunct, last) < 0 {
			break
		}
		u = u[:len(u)-1]
	}
	return u
}