224 lines
6.9 KiB
Go
224 lines
6.9 KiB
Go
package laozhang
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"regexp"
|
|
"strings"
|
|
)
|
|
|
|
// Defaults applied by GenerateVideo when the caller leaves the
// corresponding VideoRequest field at its zero value.
const (
	// defaultVideoModel is used when VideoRequest.Model is empty.
	defaultVideoModel = "veo-3.1"

	// defaultVideoCount is used when VideoRequest.N is zero.
	defaultVideoCount = 1
)
|
|
|
|
// VideoRequest describes a single video generation call made through
// Client.GenerateVideo.
type VideoRequest struct {
	// Model selects the generation model. When empty, GenerateVideo falls
	// back to "veo-3.1".
	Model string `json:"model"`

	// Prompt is the text description of the desired video. GenerateVideo
	// rejects requests where it is empty.
	Prompt string `json:"prompt"`

	// N is the number of videos to generate. Zero means "use the default"
	// (1); any other value must be in the range 1-4.
	N int `json:"n,omitempty"`

	// ReferenceImages optionally carries base64 data or URLs of images used
	// for image-to-video generation.
	ReferenceImages []string `json:"reference_images,omitempty"`
}
|
|
|
|
// VideoResponse represents a video generation response
|
|
type VideoResponse struct {
|
|
ID string `json:"id"` // Response ID
|
|
Created int64 `json:"created"` // Unix timestamp of when the video was created
|
|
Data []VideoData `json:"data"` // List of generated videos
|
|
}
|
|
|
|
// VideoData describes one generated video.
type VideoData struct {
	// URL is the location of the generated video.
	URL string `json:"url"`
}
|
|
|
|
// videoChatMessage represents the internal chat message format for video generation (request)
|
|
type videoChatMessage struct {
|
|
Role string `json:"role"`
|
|
Content []videoChatContentPart `json:"content"`
|
|
}
|
|
|
|
// videoChatResponseMessage is the response-side chat message. Its content is
// kept as raw JSON because it may arrive either as a plain string or as an
// array of content parts.
type videoChatResponseMessage struct {
	// Role is the chat role reported for the message.
	Role string `json:"role"`

	// Content is the undecoded message content: a JSON string or a JSON
	// array of videoChatContentPart values.
	Content json.RawMessage `json:"content"`
}
|
|
|
|
// videoChatContentPart represents a part of the message content (text or image)
|
|
type videoChatContentPart struct {
|
|
Type string `json:"type"` // "text" or "image_url"
|
|
Text string `json:"text,omitempty"` // Text content
|
|
ImageURL *videoChatImageURL `json:"image_url,omitempty"` // Image URL content
|
|
}
|
|
|
|
// videoChatImageURL wraps the location of an image attached to a chat
// message.
type videoChatImageURL struct {
	// URL is either a base64 data URL or an HTTP(S) URL.
	URL string `json:"url"`
}
|
|
|
|
// videoChatRequest represents the internal chat completion request for video generation
|
|
type videoChatRequest struct {
|
|
Model string `json:"model"`
|
|
Messages []videoChatMessage `json:"messages"`
|
|
Stream bool `json:"stream"`
|
|
N int `json:"n"`
|
|
}
|
|
|
|
// videoChatResponse represents the internal chat completion response from Veo API
|
|
type videoChatResponse struct {
|
|
ID string `json:"id"`
|
|
Created int64 `json:"created"`
|
|
Choices []videoChatChoice `json:"choices"`
|
|
}
|
|
|
|
// videoChatChoice represents a single choice in the chat response
|
|
type videoChatChoice struct {
|
|
Message videoChatResponseMessage `json:"message"`
|
|
}
|
|
|
|
// GenerateVideo generates videos based on the provided prompt and optional reference images
|
|
// using the Veo 3.1 models via the chat completions API format.
|
|
//
|
|
// For text-to-video, only the Prompt field is required.
|
|
// For image-to-video (first/last frame interpolation), use models ending in "-fl" and provide ReferenceImages.
|
|
//
|
|
// Supported models:
|
|
// - veo-3.1 (standard, $0.25/gen)
|
|
// - veo-3.1-fast ($0.15/gen)
|
|
// - veo-3.1-fl (first/last frame interpolation)
|
|
// - veo-3.1-fast-fl (fast with interpolation)
|
|
// - Add "-landscape" suffix for landscape variants (e.g., "veo-3.1-landscape")
|
|
func (c *Client) GenerateVideo(ctx context.Context, req VideoRequest) (*VideoResponse, error) {
|
|
// Validate required fields
|
|
if req.Prompt == "" {
|
|
return nil, fmt.Errorf("%w: prompt is required", ErrInvalidConfig)
|
|
}
|
|
|
|
// Set defaults
|
|
if req.Model == "" {
|
|
req.Model = defaultVideoModel
|
|
}
|
|
if req.N == 0 {
|
|
req.N = defaultVideoCount
|
|
}
|
|
|
|
// Validate N is in valid range
|
|
if req.N < 1 || req.N > 4 {
|
|
return nil, fmt.Errorf("%w: n must be between 1 and 4, got %d", ErrInvalidConfig, req.N)
|
|
}
|
|
|
|
// Build message content
|
|
content := []videoChatContentPart{
|
|
{
|
|
Type: "text",
|
|
Text: req.Prompt,
|
|
},
|
|
}
|
|
|
|
// Add reference images if provided (for image-to-video)
|
|
for _, imageURL := range req.ReferenceImages {
|
|
content = append(content, videoChatContentPart{
|
|
Type: "image_url",
|
|
ImageURL: &videoChatImageURL{
|
|
URL: imageURL,
|
|
},
|
|
})
|
|
}
|
|
|
|
// Build chat completion request
|
|
chatReq := videoChatRequest{
|
|
Model: req.Model,
|
|
Messages: []videoChatMessage{
|
|
{
|
|
Role: "user",
|
|
Content: content,
|
|
},
|
|
},
|
|
Stream: false,
|
|
N: req.N,
|
|
}
|
|
|
|
// Make request with video client (5m timeout) - video generation takes 2-5 minutes
|
|
respBody, err := c.doRequestVideo(ctx, "POST", "/chat/completions", chatReq)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Unmarshal chat response
|
|
var chatResp videoChatResponse
|
|
if err := json.Unmarshal(respBody, &chatResp); err != nil {
|
|
return nil, fmt.Errorf("unmarshal response: %w", err)
|
|
}
|
|
|
|
// Extract video URLs from chat response
|
|
videoResp := &VideoResponse{
|
|
ID: chatResp.ID,
|
|
Created: chatResp.Created,
|
|
Data: make([]VideoData, 0, len(chatResp.Choices)),
|
|
}
|
|
|
|
for _, choice := range chatResp.Choices {
|
|
// The video URL can be returned as either:
|
|
// 1. A plain string (the URL directly)
|
|
// 2. An array of content parts with type "text"
|
|
// 3. Markdown link format: [download video](url)
|
|
content := choice.Message.Content
|
|
|
|
// Try to unmarshal as string first
|
|
var contentStr string
|
|
if err := json.Unmarshal(content, &contentStr); err == nil {
|
|
// Content is a plain string (may be URL or markdown link)
|
|
if url := extractVideoURL(contentStr); url != "" {
|
|
videoResp.Data = append(videoResp.Data, VideoData{
|
|
URL: url,
|
|
})
|
|
}
|
|
continue
|
|
}
|
|
|
|
// Try to unmarshal as array of content parts
|
|
var contentParts []videoChatContentPart
|
|
if err := json.Unmarshal(content, &contentParts); err == nil {
|
|
for _, contentPart := range contentParts {
|
|
if contentPart.Type == "text" && contentPart.Text != "" {
|
|
if url := extractVideoURL(contentPart.Text); url != "" {
|
|
videoResp.Data = append(videoResp.Data, VideoData{
|
|
URL: url,
|
|
})
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if len(videoResp.Data) == 0 {
|
|
return nil, fmt.Errorf("no video URLs in response (id=%s, choices=%d)", chatResp.ID, len(chatResp.Choices))
|
|
}
|
|
|
|
return videoResp, nil
|
|
}
|
|
|
|
// Patterns used by extractVideoURL. They are compiled once at package
// initialization instead of on every call.
var (
	// videoMarkdownLinkRe matches a markdown link "[text](url)" and captures
	// the http(s) URL inside the parentheses.
	videoMarkdownLinkRe = regexp.MustCompile(`\[.*?\]\((https?://[^\s\)]+)\)`)

	// videoBareURLRe matches an http(s) URL running up to the next
	// whitespace character.
	videoBareURLRe = regexp.MustCompile(`https?://\S+`)
)

// videoURLTrailingPunct lists characters that are stripped from the end of a
// bare URL because they almost always belong to the surrounding prose (for
// example "…/v.mp4." or "<https://…/v.mp4>") rather than to the URL itself.
const videoURLTrailingPunct = ".,;:!?\"'>"

// extractVideoURL extracts a clean URL from the text returned by the API.
//
// LaoZhang sometimes returns URLs wrapped in markdown, e.g.
// "[download video](https://...)", and sometimes as plain text. The lookup
// order is:
//  1. the target of the first markdown link with an http(s) URL;
//  2. the first bare http(s) URL, with trailing sentence punctuation removed;
//  3. otherwise the whitespace-trimmed input itself.
//
// The result is therefore empty only when raw is empty or all whitespace.
func extractVideoURL(raw string) string {
	raw = strings.TrimSpace(raw)

	if m := videoMarkdownLinkRe.FindStringSubmatch(raw); len(m) > 1 {
		return m[1]
	}

	if link := videoBareURLRe.FindString(raw); link != "" {
		// \S+ is greedy and would otherwise keep punctuation that directly
		// follows the URL in running text.
		return strings.TrimRight(link, videoURLTrailingPunct)
	}

	// No recognizable URL: hand back the trimmed text unchanged so callers
	// keep the previous fallback behavior.
	return raw
}
|