rdev/internal/adapter/notify/provisioner.go
jordan ee1c214b7e
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
fix: correct Resend DNS record type, name, and MX priority
Three bugs in the notify provisioner DNS record upsert:

1. rec.Record ("DKIM"/"SPF") was used as the DNS record type — Cloudflare
   doesn't know those labels. Fix: use rec.DNSType ("TXT"/"MX") from the
   resendDNSRecord.type JSON field, which is the actual DNS record type.

2. rec.Name from Resend is already relative to the zone apex
   (e.g., "resend._domainkey.mail.project-name"), not relative to the
   registered domain. Code was doing rec.Name + "." + host which produced
   a doubled subdomain. Fix: pass rec.Name directly — Cloudflare's
   normalizeName appends ".baseDomain" to build the correct FQDN.

3. MX records have priority 10 in Resend's response but DNSRecord had no
   Priority field and Cloudflare CreateRecord/UpdateRecord didn't send it.
   Fix: add Priority int to domain.DNSRecord and include it in the body
   for both Create and Update when non-zero.

These bugs caused DKIM/SPF DNS records to never be created for any project.
Re-provision affected projects using POST /projects/{id}/notify/provision
after clearing NOTIFY_RESEND_DOMAIN_ID from the credential store.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-23 19:52:11 -07:00

460 lines
15 KiB
Go

package notify
import (
"context"
"fmt"
"log/slog"
"time"
"github.com/orchard9/rdev/internal/domain"
"github.com/orchard9/rdev/internal/port"
)
// Provisioner implements port.NotifyProvisioner using the notify admin API.
// Each project gets an isolated sending host (mail.{slug}.{baseDomain}),
// a Resend domain with DKIM/SPF DNS records, and a dedicated send key.
//
// Build it with NewProvisioner. The resend and dns fields are optional: the
// methods check them for nil and skip the steps that need them.
type Provisioner struct {
// client performs the notify admin API calls (hosts, providers,
// from-addresses, accounts, send keys, host grants).
client notifyAdminAPI
resend resendAPI // nil when ResendAPIKey not configured
resendAPIKey string // passed to createProvider; kept separate from resend for interface compatibility
dns port.DNSProvider // nil when Cloudflare not configured
baseDomain string // e.g., "threesix.ai"
// logger receives all provisioning, cleanup, and verification log output.
logger *slog.Logger
}
// Config holds configuration for the notify provisioner.
//
// NewProvisioner does not validate these fields: BaseURL and AdminKey are
// handed to the admin client as-is, an empty ResendAPIKey disables all
// Resend-dependent steps, and an empty BaseDomain falls back to the default.
type Config struct {
BaseURL string // Required: notify service URL (e.g., "https://notify.orchard9.ai")
AdminKey string // Required: admin API key (notify_admin_...)
ResendAPIKey string // Optional: Resend API key for per-project domain provisioning
BaseDomain string // Base domain for per-project hosts (default: "threesix.ai")
}
// NewProvisioner creates a new notify provisioner from cfg.
//
// dns may be nil, in which case DNS record creation and cleanup are skipped.
// logger may be nil, in which case slog.Default() is used; the provisioner
// logs unconditionally, so storing a nil logger would panic on first use.
// An empty cfg.BaseDomain falls back to "threesix.ai". The Resend client is
// only created when cfg.ResendAPIKey is set; otherwise every Resend-dependent
// step is skipped by the methods.
func NewProvisioner(cfg Config, dns port.DNSProvider, logger *slog.Logger) *Provisioner {
	if logger == nil {
		logger = slog.Default()
	}

	baseDomain := cfg.BaseDomain
	if baseDomain == "" {
		baseDomain = "threesix.ai"
	}

	p := &Provisioner{
		client:     newAdminClient(cfg.BaseURL, cfg.AdminKey),
		dns:        dns,
		baseDomain: baseDomain,
		logger:     logger,
	}

	// resend and resendAPIKey are set together so that "resend != nil"
	// always implies a usable API key for createProvider.
	if cfg.ResendAPIKey != "" {
		p.resend = newResendClient(cfg.ResendAPIKey)
		p.resendAPIKey = cfg.ResendAPIKey
	}
	return p
}
// CreateProjectNotify provisions a per-project notify host, Resend domain, DNS records,
// and notify account with send key.
//
// Steps:
//  1. Create notify host mail.{slug}.{baseDomain}
//  2. Add Resend provider to the host (skipped if ResendAPIKey not configured)
//  3. Register from-address noreply@mail.{slug}.{baseDomain}
//  4. Create notify account "project-{projectID}"
//  5. Create send key for the account
//  6. Grant the account access to the host (non-fatal)
//  7. Create Resend domain (non-fatal — skipped if ResendAPIKey not configured)
//  8. Add DNS records via Cloudflare (non-fatal — skipped if DNS not configured)
//  9. Fire-and-forget async domain verification
//
// A failure in steps 1-5 aborts provisioning and returns a wrapped error after
// best-effort removal of what was already created (the host from step 2 on,
// and the account as well when step 5 fails). Failures in steps 6-8 are only
// logged. The returned credentials carry an empty ResendDomainID when step 7
// was skipped or failed.
func (p *Provisioner) CreateProjectNotify(ctx context.Context, projectID, slug string) (*domain.NotifyCredentials, error) {
	host := "mail." + slug + "." + p.baseDomain
	from := "noreply@" + host
	accountName := "project-" + projectID

	// 1. Create notify host. Nothing exists yet, so there is nothing to roll back.
	// NOTE(review): the "failover" argument is passed through as-is; its meaning
	// is defined by createHost — confirm there.
	if err := p.client.createHost(ctx, host, "failover"); err != nil {
		return nil, fmt.Errorf("notify: create host %s for project %s: %w", host, projectID, err)
	}

	// 2. Add Resend provider to the host (only when Resend is configured).
	// NOTE(review): the positional values 1, 3, 1000 are forwarded unchanged;
	// their meaning is defined by createProvider — confirm there.
	if p.resend != nil {
		if err := p.client.createProvider(ctx, host, "resend", map[string]string{"api_key": p.resendAPIKey}, 1, 3, 1000); err != nil {
			p.bestEffortDeleteHost(ctx, host, projectID)
			return nil, fmt.Errorf("notify: create provider on host %s for project %s: %w", host, projectID, err)
		}
	}

	// 3. Register from-address.
	if err := p.client.createFromAddress(ctx, host, from, slug); err != nil {
		p.bestEffortDeleteHost(ctx, host, projectID)
		return nil, fmt.Errorf("notify: create from-address %s for project %s: %w", from, projectID, err)
	}

	// 4. Create account.
	acct, err := p.client.createAccount(ctx, accountName)
	if err != nil {
		p.bestEffortDeleteHost(ctx, host, projectID)
		return nil, fmt.Errorf("notify: create account for project %s: %w", projectID, err)
	}

	// 5. Create send key. From here on both the account and the host must be
	// rolled back on failure.
	key, err := p.client.createSendKey(ctx, acct.ID, accountName+"-send")
	if err != nil {
		p.bestEffortDeleteAccount(ctx, acct.ID, projectID)
		p.bestEffortDeleteHost(ctx, host, projectID)
		return nil, fmt.Errorf("notify: create send key for project %s: %w", projectID, err)
	}

	// 6. Grant host access (non-fatal — log warn and continue).
	if err := p.client.grantHostAccess(ctx, host, acct.ID); err != nil {
		p.logger.Warn("failed to grant notify host access",
			"host", host,
			"account_id", acct.ID,
			"project_id", projectID,
			"error", err,
		)
	}

	// 7. Create Resend domain (non-fatal — project still usable, email won't send until fixed).
	// The results are only published to the outer variables on success, so a
	// failed call can never feed partial data into steps 8 and 9.
	var resendDomainID string
	var dnsRecords []resendDNSRecord
	if p.resend != nil {
		id, records, resendErr := p.resend.createDomain(ctx, host, "us-east-1")
		if resendErr != nil {
			p.logger.Warn("failed to create resend domain — email delivery will not work until resolved",
				"host", host,
				"project_id", projectID,
				"error", resendErr,
			)
		} else {
			resendDomainID, dnsRecords = id, records
			p.logger.Info("resend domain created", "host", host, "domain_id", resendDomainID, "project_id", projectID)
		}
	}

	// 8. Add DNS records for DKIM/SPF (non-fatal).
	// rec.Name is relative to the zone apex (e.g., "resend._domainkey.mail.slug").
	// Cloudflare's normalizeName appends ".baseDomain" to build the FQDN.
	if p.dns != nil && len(dnsRecords) > 0 {
		for _, rec := range dnsRecords {
			dnsRec := domain.DNSRecord{
				Type:     rec.DNSType,
				Name:     rec.Name,
				Content:  rec.Value,
				TTL:      1,
				Priority: rec.Priority,
			}
			// One failed record must not stop the remaining ones.
			if _, upsertErr := p.dns.UpsertRecord(ctx, dnsRec); upsertErr != nil {
				p.logger.Warn("failed to upsert notify DNS record",
					"name", rec.Name,
					"type", rec.DNSType,
					"project_id", projectID,
					"error", upsertErr,
				)
			}
		}
	}

	// 9. Fire-and-forget async domain verification.
	// Waits 60 seconds for DNS propagation, then retries verification up to 5 times with 30s backoff.
	// The goroutine runs on a context detached from ctx's cancellation so it
	// outlives the request that triggered provisioning.
	if p.resend != nil && resendDomainID != "" {
		go func() {
			verifyCtx := context.WithoutCancel(ctx)
			p.verifyWithRetry(verifyCtx, resendDomainID, host, projectID)
		}()
	}

	p.logger.Info("notify provisioned",
		"project_id", projectID,
		"host", host,
		"resend_domain_id", resendDomainID,
	)

	return &domain.NotifyCredentials{
		ProjectID:      projectID,
		AccountID:      acct.ID,
		APIKey:         key.Key,
		Host:           host,
		From:           from,
		ResendDomainID: resendDomainID,
		CreatedAt:      time.Now(),
	}, nil
}
// verifyWithRetry gives DNS time to propagate and then asks Resend to verify
// the domain, retrying on failure. It is meant to run in its own goroutine:
// nothing is returned, every outcome is logged. It stops early whenever ctx
// is done while it is waiting.
func (p *Provisioner) verifyWithRetry(ctx context.Context, resendDomainID, host, projectID string) {
	const (
		initialDelay  = 60 * time.Second
		retryInterval = 30 * time.Second
		maxAttempts   = 5
	)

	// pause blocks for d and reports false when ctx finished first.
	pause := func(d time.Duration) bool {
		select {
		case <-ctx.Done():
			return false
		case <-time.After(d):
			return true
		}
	}

	// Give the freshly written DNS records time to propagate.
	if !pause(initialDelay) {
		return
	}

	for attempt := 1; attempt <= maxAttempts; attempt++ {
		err := p.resend.verifyDomain(ctx, resendDomainID)
		if err == nil {
			p.logger.Info("resend domain verified",
				"domain_id", resendDomainID,
				"host", host,
				"project_id", projectID,
				"attempt", attempt,
			)
			return
		}

		p.logger.Warn("resend domain verification attempt failed",
			"attempt", attempt,
			"max_attempts", maxAttempts,
			"domain_id", resendDomainID,
			"host", host,
			"project_id", projectID,
			"error", err,
		)

		// No point in sleeping after the final attempt.
		if attempt == maxAttempts {
			break
		}
		if !pause(retryInterval) {
			return
		}
	}

	p.logger.Warn("resend domain verification exhausted all attempts — re-verify manually via API",
		"domain_id", resendDomainID,
		"host", host,
		"project_id", projectID,
	)
}
// DeleteProjectNotify tears down every notify resource that belongs to a
// project: the notify account, the notify host, the Resend domain and the
// DKIM/SPF DNS records. Each step is best-effort — a failure is logged as a
// warning and the remaining steps still run — so the returned error is
// always nil.
func (p *Provisioner) DeleteProjectNotify(ctx context.Context, projectID, slug, resendDomainID string) error {
	host := "mail." + slug + "." + p.baseDomain

	// 1. Notify account (cascades keys + host grants).
	acct, lookupErr := p.findAccountByProject(ctx, projectID)
	switch {
	case lookupErr != nil:
		p.logger.Warn("failed to find notify account during deletion",
			"project_id", projectID,
			"error", lookupErr,
		)
	case acct != nil:
		if delErr := p.client.deleteAccount(ctx, acct.ID); delErr != nil {
			p.logger.Warn("failed to delete notify account",
				"account_id", acct.ID,
				"project_id", projectID,
				"error", delErr,
			)
		}
	}

	// 2. Notify host.
	if delErr := p.client.deleteHost(ctx, host); delErr != nil {
		p.logger.Warn("failed to delete notify host",
			"host", host,
			"project_id", projectID,
			"error", delErr,
		)
	}

	// 3. Resend domain — only possible when Resend is configured and the ID is known.
	if p.resend != nil && resendDomainID != "" {
		if delErr := p.resend.deleteDomain(ctx, resendDomainID); delErr != nil {
			p.logger.Warn("failed to delete resend domain",
				"domain_id", resendDomainID,
				"project_id", projectID,
				"error", delErr,
			)
		}
	}

	// 4. Cloudflare DNS records for DKIM/SPF.
	// The names are rebuilt from Resend's standard layout rather than looked up:
	//   DKIM:    resend._domainkey.{host}
	//   SPF MX:  send.{host}
	//   SPF TXT: send.{host}
	// Should Resend ever change that layout, leftovers need manual cleanup.
	if p.dns != nil {
		// warnIfFailed logs a DNS deletion failure and does nothing on success.
		warnIfFailed := func(delErr error, msg, name string) {
			if delErr == nil {
				return
			}
			p.logger.Warn(msg,
				"name", name,
				"project_id", projectID,
				"error", delErr,
			)
		}

		dkimName := "resend._domainkey." + host
		spfSendName := "send." + host

		warnIfFailed(p.dns.DeleteRecordByName(ctx, "TXT", dkimName), "failed to delete DKIM DNS record", dkimName)
		warnIfFailed(p.dns.DeleteRecordByName(ctx, "MX", spfSendName), "failed to delete SPF MX DNS record", spfSendName)
		warnIfFailed(p.dns.DeleteRecordByName(ctx, "TXT", spfSendName), "failed to delete SPF TXT DNS record", spfSendName)
	}

	p.logger.Info("notify resources deleted", "project_id", projectID, "host", host)
	return nil
}
// VerifyProjectNotify asks Resend to verify the domain identified by
// resendDomainID. DNS records need time to propagate first (~60 seconds
// minimum), so call it only after that window.
//
// It returns an error when Resend is not configured, when resendDomainID is
// empty, or when the verification call itself fails (wrapped).
func (p *Provisioner) VerifyProjectNotify(ctx context.Context, projectID, resendDomainID string) error {
	switch {
	case p.resend == nil:
		return fmt.Errorf("notify: resend not configured")
	case resendDomainID == "":
		return fmt.Errorf("notify: resend domain ID not available for project %s", projectID)
	}

	verifyErr := p.resend.verifyDomain(ctx, resendDomainID)
	if verifyErr == nil {
		return nil
	}
	return fmt.Errorf("notify: verify domain for project %s: %w", projectID, verifyErr)
}
// ProvisionNotifyDomain creates the Resend domain for an existing notify host, adds DKIM/SPF DNS
// records via Cloudflare, and kicks off async verification. Use this to repair projects where
// Resend domain creation failed during initial provisioning (steps 7-9 of CreateProjectNotify).
// Returns the Resend domain ID which must be stored as NOTIFY_RESEND_DOMAIN_ID by the caller.
//
// It fails when Resend is not configured, when host is empty, when the Resend
// call fails, or when Resend reports success without a domain ID. DNS upsert
// failures are only logged; DNS is skipped entirely when no DNS provider is
// configured.
func (p *Provisioner) ProvisionNotifyDomain(ctx context.Context, projectID, host string) (string, error) {
	if p.resend == nil {
		return "", fmt.Errorf("notify: resend not configured")
	}
	if host == "" {
		return "", fmt.Errorf("notify: host is required")
	}

	// Step 7: Create Resend domain for the existing notify host.
	resendDomainID, dnsRecords, err := p.resend.createDomain(ctx, host, "us-east-1")
	if err != nil {
		return "", fmt.Errorf("notify: create resend domain for %s: %w", host, err)
	}
	// An empty ID cannot be stored or verified. Report it instead of handing
	// the caller a "successful" empty value and verifying against nothing.
	if resendDomainID == "" {
		return "", fmt.Errorf("notify: create resend domain for %s: empty domain ID in response", host)
	}
	p.logger.Info("resend domain created", "host", host, "domain_id", resendDomainID, "project_id", projectID)

	// Step 8: Add DKIM/SPF DNS records (non-fatal).
	// rec.Name is relative to the zone apex (e.g., "resend._domainkey.mail.slug").
	// Cloudflare's normalizeName appends ".baseDomain" to build the FQDN.
	if p.dns != nil && len(dnsRecords) > 0 {
		for _, rec := range dnsRecords {
			dnsRec := domain.DNSRecord{
				Type:     rec.DNSType,
				Name:     rec.Name,
				Content:  rec.Value,
				TTL:      1,
				Priority: rec.Priority,
			}
			// One failed record must not stop the remaining ones.
			if _, upsertErr := p.dns.UpsertRecord(ctx, dnsRec); upsertErr != nil {
				p.logger.Warn("failed to upsert notify DNS record",
					"name", rec.Name,
					"type", rec.DNSType,
					"project_id", projectID,
					"error", upsertErr,
				)
			}
		}
	}

	// Step 9: Fire-and-forget async verification with DNS propagation wait.
	// The goroutine runs on a context detached from ctx's cancellation so it
	// outlives the request that triggered the repair.
	go func() {
		verifyCtx := context.WithoutCancel(ctx)
		p.verifyWithRetry(verifyCtx, resendDomainID, host, projectID)
	}()

	return resendDomainID, nil
}
// GetNotifyDomainStatus reports the Resend verification status of a project's
// email domain. When Resend is not configured or no domain ID is known, the
// status is "not_configured" and Resend is not contacted. A failing status
// lookup is returned as a wrapped error with a nil result.
func (p *Provisioner) GetNotifyDomainStatus(ctx context.Context, host, resendDomainID string) (*domain.NotifyDomainStatus, error) {
	// Start from the "nothing to query" answer and refine it below.
	result := &domain.NotifyDomainStatus{
		Host:           host,
		ResendDomainID: resendDomainID,
		Status:         "not_configured",
	}

	if p.resend != nil && resendDomainID != "" {
		status, err := p.resend.getDomainStatus(ctx, resendDomainID)
		if err != nil {
			return nil, fmt.Errorf("notify: get domain status for %s: %w", host, err)
		}
		result.Status = status
	}

	return result, nil
}
// GetProjectNotify looks up the notify account of a project and returns its
// credentials, or (nil, nil) when the project has not been provisioned.
//
// Only ProjectID, AccountID and CreatedAt are filled in — APIKey, Host and
// From cannot be recovered after provisioning. Treat the result purely as an
// "already provisioned?" probe: non-nil means yes.
func (p *Provisioner) GetProjectNotify(ctx context.Context, projectID string) (*domain.NotifyCredentials, error) {
	acct, err := p.findAccountByProject(ctx, projectID)
	switch {
	case err != nil:
		return nil, fmt.Errorf("notify: find account for project %s: %w", projectID, err)
	case acct == nil:
		// No matching account: not provisioned, and not an error.
		return nil, nil
	}

	creds := domain.NotifyCredentials{
		ProjectID: projectID,
		AccountID: acct.ID,
		CreatedAt: acct.CreatedAt,
	}
	return &creds, nil
}
// TestConnection checks that the notify admin API can be reached by listing
// accounts; the listing itself is discarded. Any failure is returned wrapped.
func (p *Provisioner) TestConnection(ctx context.Context) error {
	if _, err := p.client.listAccounts(ctx); err != nil {
		return fmt.Errorf("notify admin API unreachable: %w", err)
	}
	return nil
}
// findAccountByProject scans all notify accounts for the one named
// "project-{projectID}". It returns (nil, nil) when no account matches and
// passes a listing error through unchanged.
func (p *Provisioner) findAccountByProject(ctx context.Context, projectID string) (*accountResponse, error) {
	all, err := p.client.listAccounts(ctx)
	if err != nil {
		return nil, err
	}

	want := "project-" + projectID
	for idx := 0; idx < len(all); idx++ {
		// Point into the slice so the caller gets the listed element itself.
		if candidate := &all[idx]; candidate.Name == want {
			return candidate, nil
		}
	}
	return nil, nil
}
// bestEffortDeleteHost deletes the notify host as rollback after a failed
// provisioning step, logging on failure.
//
// The delete runs on a context detached from ctx's cancellation and deadline
// (values are kept). Provisioning often fails precisely because ctx was
// cancelled or timed out, and reusing it would make the cleanup fail at once
// and leak the host. A fixed timeout keeps the detached call bounded.
func (p *Provisioner) bestEffortDeleteHost(ctx context.Context, host, projectID string) {
	const cleanupTimeout = 30 * time.Second

	cleanupCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), cleanupTimeout)
	defer cancel()

	if err := p.client.deleteHost(cleanupCtx, host); err != nil {
		p.logger.Warn("failed to clean up notify host after provisioning failure",
			"host", host,
			"project_id", projectID,
			"error", err,
		)
	}
}
// bestEffortDeleteAccount deletes the notify account as rollback after a
// failed provisioning step, logging on failure.
//
// The delete runs on a context detached from ctx's cancellation and deadline
// (values are kept). Provisioning often fails precisely because ctx was
// cancelled or timed out, and reusing it would make the cleanup fail at once
// and leak the account. A fixed timeout keeps the detached call bounded.
func (p *Provisioner) bestEffortDeleteAccount(ctx context.Context, accountID, projectID string) {
	const cleanupTimeout = 30 * time.Second

	cleanupCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), cleanupTimeout)
	defer cancel()

	if err := p.client.deleteAccount(cleanupCtx, accountID); err != nil {
		p.logger.Warn("failed to clean up notify account after provisioning failure",
			"account_id", accountID,
			"project_id", projectID,
			"error", err,
		)
	}
}