#!/usr/bin/env bash
# =============================================================================
# compose_short.sh — Ai4Managers flagship Short compositor (9:16, ~40s)
# -----------------------------------------------------------------------------
# Layers (bottom -> top):
#   1. Avatar (HeyGen 9:16 VO source) normalized to 1080x1920 @30fps
#   2. B-roll cutaways (full-screen, only during each beat window)
#   3. Carteles (Playfair Display, champagne, upper third)
#   4. Karaoke captions of the VO (Space Grotesk, safe zone)
# Audio: avatar VO, normalized to ~-20dB via volumedetect (NOT loudnorm).
#
# Usage:
#   compose_short.sh --avatar <path> --work <dir> --out <path> [--lang es]
#
# The script is idempotent and logs every step. The avatar file is only needed
# to RUN the script, not to parse or lint it. Missing b-rolls are skipped, never fatal.
# =============================================================================
set -euo pipefail

# ---------------------------------------------------------------------------
# Brand palette (obsidian + champagne) — RRGGBB hex, no leading '#'
# ---------------------------------------------------------------------------
BG='0a0a0a'
CHAMPAGNE='d4af37'       # accent color used for the carteles (title cards)
CHAMPAGNE_DIM='9a7f2e'
RUST='c0392b'
TEXT_LIGHT='ece6da'
CAPTION_FILL='ffffff'    # karaoke: active word
CAPTION_BASE='ece6da'    # karaoke: inactive words

# Fonts for carteles and captions (family names, resolved via fontconfig)
CARTEL_FONT='Playfair Display'
CAPTION_FONT='Space Grotesk'

# Output canvas: 9:16 portrait
W=1080
H=1920
FPS=30

# Karaoke safe zone (keeps captions clear of the YouTube Shorts UI)
CAP_MARGIN_V=420         # px measured from the bottom edge
CAP_MARGIN_LR=90         # left/right margin in px
CAP_FONTSIZE=84

# Cartel placement (upper third, outside the caption zone)
CARTEL_FONTSIZE=96
CARTEL_MARGIN_V_TOP=300  # px measured from the top edge (Alignment 8 = top-center)

# ---------------------------------------------------------------------------
# Logging helpers
# ---------------------------------------------------------------------------
# _compose_emit COLOR TAG MESSAGE...
# Shared emitter for the helpers below: writes "[TAG] message" to stderr with
# TAG wrapped in the given ANSI color code. Message words are joined with "$*".
_compose_emit() {
  local color=$1 tag=$2
  shift 2
  printf '\033[%sm[%s]\033[0m %s\n' "$color" "$tag" "$*" >&2
}

# Progress message (cyan).
log() {
  _compose_emit '0;36' 'compose' "$@"
}

# Step-completed message (green).
ok() {
  _compose_emit '0;32' ' ok ' "$@"
}

# Non-fatal warning (yellow).
warn() {
  _compose_emit '0;33' 'warn' "$@"
}

# Fatal error (red): prints the message and terminates with exit status 1.
die() {
  _compose_emit '0;31' 'FAIL' "$@"
  exit 1
}

# ---------------------------------------------------------------------------
# Arg parsing
# ---------------------------------------------------------------------------
AVATAR=""
WORK="/home/clawd/playgrounds/ai4m-fear-shorts-prod"
OUT=""
LANG_CODE="es"

# parse_args ARGS...
# Parses the command line into the globals AVATAR, WORK, OUT and LANG_CODE.
#   --avatar/--work/--out/--lang VALUE  store VALUE in the matching global
#   -h | --help                         print the header comment block of this
#                                       file to stderr and exit 0
# Any other argument, or a value flag given without a value, ends the script
# through die. The explicit arity check matters: `shift 2` with a single
# remaining argument fails, and under `set -e` that would abort the script
# without printing any message.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --avatar|--work|--out|--lang)
        [[ $# -ge 2 ]] || die "Falta el valor para $1"
        case "$1" in
          --avatar) AVATAR="$2" ;;
          --work)   WORK="$2" ;;
          --out)    OUT="$2" ;;
          --lang)   LANG_CODE="$2" ;;
        esac
        shift 2 ;;
      -h|--help)
        # Print only the leading comment block: skip the shebang, strip the
        # "# " prefix, and stop at the first line that is not a comment.
        awk 'NR == 1 { next } /^#/ { sub(/^# ?/, ""); print; next } { exit }' "$0" >&2
        exit 0 ;;
      *) die "Unknown argument: $1" ;;
    esac
  done
}
parse_args "$@"

# --work may have been overridden with an empty string; OUT defaults under WORK.
[[ -n "$WORK" ]] || die "--work es obligatorio"
[[ -n "$OUT"  ]] || OUT="$WORK/out/equipo-hibrido-short.mp4"
BEATS="$WORK/beats.json"

# ---------------------------------------------------------------------------
# Dependency + input checks
# ---------------------------------------------------------------------------
# Required external tools: stop immediately if any of them is not on PATH.
for required_tool in ffmpeg ffprobe jq; do
  if ! command -v "$required_tool" >/dev/null 2>&1; then
    die "$required_tool no encontrado en PATH"
  fi
done

if [[ ! -f "$BEATS" ]]; then
  die "No existe beats.json en: $BEATS"
fi

# The avatar must be given and must point at an existing regular file.
[[ -n "$AVATAR" ]] \
  || die "--avatar es obligatorio para EJECUTAR (no para escribir el script)."
[[ -f "$AVATAR" ]] \
  || die "Avatar no encontrado: $AVATAR — pasa la ruta al MP4 del avatar HeyGen 9:16."

# Working directories (mkdir -p makes re-runs harmless)
CAP_DIR="$WORK/captions"
TMP_DIR="$WORK/tmp"
OUT_DIR="$(dirname "$OUT")"
mkdir -p "$CAP_DIR" "$TMP_DIR" "$OUT_DIR" "$WORK/broll"

# Echo the effective configuration for this run.
log "Avatar : $AVATAR"
log "Work   : $WORK"
log "Out    : $OUT"
log "Lang   : $LANG_CODE"

# ===========================================================================
# STEP A — Normalizar avatar a 1080x1920 @30fps (cover + center crop), VO intacto
# ===========================================================================
NORM="$TMP_DIR/avatar_norm.mp4"
log "STEP A — normalizando avatar a ${W}x${H} @${FPS}fps (cover + crop centrado)"

# Scale up until the frame covers the canvas ("increase"), then center-crop to
# exactly WxH. The explicit `|| die` is needed: under `set -e` a failing ffmpeg
# would abort the script before the check below could log a [FAIL] line.
ffmpeg -y -hide_banner -loglevel error -i "$AVATAR" \
  -vf "scale=${W}:${H}:force_original_aspect_ratio=increase,crop=${W}:${H},fps=${FPS},format=yuv420p" \
  -c:v libx264 -preset medium -crf 18 \
  -c:a aac -b:a 192k -ar 48000 \
  "$NORM" \
  || die "STEP A — fallo al normalizar avatar"
# -s instead of -f: a zero-byte output is also a failed normalization.
[[ -s "$NORM" ]] || die "STEP A — fallo al normalizar avatar"
ok "STEP A — avatar normalizado: $NORM"

# Check that the normalized file carries an audio stream (the VO).
# ffprobe prints one "audio" line per audio stream; a probe failure is treated
# as "no audio" rather than as a fatal error.
HAS_AUDIO="$(ffprobe -v error -select_streams a -show_entries stream=codec_type -of csv=p=0 "$NORM" || true)"
[[ "$HAS_AUDIO" == "audio" ]] || warn "STEP A — avatar normalizado SIN pista de audio (VO). Captions usaran timings vacios."

# Container duration in seconds; falls back to 0 if ffprobe fails.
# It is used later as the -t limit of the final render.
DUR="$(ffprobe -v error -show_entries format=duration -of default=nw=1:nk=1 "$NORM" || echo 0)"
log "STEP A — duracion avatar normalizado: ${DUR}s"

# ===========================================================================
# STEP B — Captions karaoke del VO (hyperframes transcribe, fallback whisper)
# ===========================================================================
WORDS_JSON="$CAP_DIR/words.json"
CAP_ASS="$CAP_DIR/captions.ass"
log "STEP B — obteniendo word-timings del VO"

# First choice: the `hyperframes` CLI, if installed. Its output is accepted
# only when it is valid JSON that contains at least one word in one of the
# shapes the flattening step understands: [...], {words:[...]} or
# {segments:[{words:[...]}]}.
# A plain `jq length` is not enough for this check: on an object it returns
# the number of keys, so {"words": []} would count as 1 and pass as non-empty.
transcribe_ok=0
if command -v hyperframes >/dev/null 2>&1; then
  log "STEP B — intentando hyperframes transcribe (-l $LANG_CODE)"
  if hyperframes transcribe "$NORM" -l "$LANG_CODE" --json > "$WORDS_JSON" 2>"$TMP_DIR/hf_transcribe.log"; then
    # jq -e exits non-zero on invalid JSON, on no output, and on a false result.
    if jq -e '
         ( if type == "array" then .
           elif (.words? | type) == "array" then .words
           elif (.segments? | type) == "array" then [.segments[] | (.words // [])[]]
           else [] end )
         | length > 0
       ' "$WORDS_JSON" >/dev/null 2>&1; then
      transcribe_ok=1
      ok "STEP B — hyperframes transcribe OK"
    else
      warn "STEP B — hyperframes devolvio JSON vacio/invalido, usando fallback"
    fi
  else
    warn "STEP B — hyperframes transcribe fallo (ver $TMP_DIR/hf_transcribe.log), usando fallback"
  fi
else
  warn "STEP B — hyperframes no esta en PATH, usando fallback whisper"
fi

# Fallback: faster-whisper from the project venv, writing {"words":[...]}.
if [[ "$transcribe_ok" -ne 1 ]]; then
  VENV_PY="/home/clawd/agents-claude-env/bin/python"
  [[ -x "$VENV_PY" ]] || die "STEP B — fallback whisper: no existe $VENV_PY"
  log "STEP B — fallback faster-whisper ($VENV_PY)"
  AUDIO_WAV="$TMP_DIR/vo.wav"
  # Mono 16 kHz WAV extracted from the normalized avatar as whisper input.
  ffmpeg -y -hide_banner -loglevel error -i "$NORM" -vn -ac 1 -ar 16000 "$AUDIO_WAV" || die "STEP B — no se pudo extraer audio del avatar"
  # The output file is opened explicitly as UTF-8 and closed by the `with`
  # block: with ensure_ascii=False the accented words are written raw, so the
  # encoding must not depend on the process locale.
  "$VENV_PY" - "$AUDIO_WAV" "$WORDS_JSON" "$LANG_CODE" <<'PYEOF' || die "STEP B — fallback whisper fallo"
import sys, json
from faster_whisper import WhisperModel
audio, out, lang = sys.argv[1], sys.argv[2], sys.argv[3]
model = WhisperModel("small", device="cpu", compute_type="int8")
segments, _ = model.transcribe(audio, language=lang, word_timestamps=True, vad_filter=True)
words = []
for seg in segments:
    for w in (seg.words or []):
        words.append({"word": w.word.strip(), "start": round(w.start, 3), "end": round(w.end, 3)})
with open(out, "w", encoding="utf-8") as fh:
    json.dump({"words": words}, fh, ensure_ascii=False, indent=2)
print(f"[whisper] {len(words)} words -> {out}", file=sys.stderr)
PYEOF
  ok "STEP B — fallback whisper OK"
fi

# --- Normalize words.json into a flat array [{word,start,end}] ---
# Accepted input shapes: [...], {words:[...]} or {segments:[{words:[...]}]}.
# Alternative key names are mapped onto word/start/end; entries whose word is
# the empty string are dropped.
FLAT_WORDS="$CAP_DIR/words_flat.json"
flatten_words_filter='
  if type == "array" then .
  elif (.words? | type) == "array" then .words
  elif (.segments? | type) == "array" then [.segments[] | (.words // [])[]]
  else [] end
  | map({
      word:  (.word // .text // .value // ""),
      start: (.start // .t_start // .from // 0),
      end:   (.end   // .t_end   // .to   // 0)
    })
  | map(select(.word != ""))
'
# If jq cannot process the file, fall back to an empty word list.
if ! jq "$flatten_words_filter" "$WORDS_JSON" > "$FLAT_WORDS" 2>/dev/null; then
  echo "[]" > "$FLAT_WORDS"
fi

NWORDS="$(jq 'length' "$FLAT_WORDS" 2>/dev/null || echo 0)"
log "STEP B — palabras detectadas: $NWORDS"

# --- Build the karaoke captions.ass (a few words per line, one \k per word) ---
log "STEP B — generando ASS karaoke -> $CAP_ASS"

# ass_time SECONDS
# Converts a time in seconds (may be fractional) to the ASS timestamp format
# H:MM:SS.cs and prints it without a trailing newline.
# Negative input is clamped to 0; non-numeric input evaluates to 0 in awk.
# The value is rounded to whole centiseconds BEFORE being split into fields,
# so a rounding carry propagates through seconds, minutes and hours
# (59.999 -> 0:01:00.00 rather than 0:00:60.00).
ass_time() {
  awk -v t="$1" 'BEGIN{
    if (t<0) t=0;
    total=int(t*100+0.5);          # whole centiseconds, rounded half up
    cs=total%100; total=int(total/100);
    s=total%60;   total=int(total/60);
    m=total%60;   h=int(total/60);
    printf "%d:%02d:%02d.%02d", h, m, s, cs;
  }'
}

# ass_escape TEXT
# Prints TEXT with every "{" replaced by "(" and every "}" by ")", so the text
# cannot open or close an ASS override block. No trailing newline is added.
ass_escape() {
  local text=$1
  text=${text//\{/(}
  text=${text//\}/)}
  printf '%s' "$text"
}

# Write captions.ass: a fixed header with the "Karaoke" style, followed by one
# Dialogue event per group of words taken from $FLAT_WORDS.
{
  cat <<ASSHDR
[Script Info]
ScriptType: v4.00+
PlayResX: ${W}
PlayResY: ${H}
WrapStyle: 2
ScaledBorderAndShadow: yes
YCbCr Matrix: TV.709

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Karaoke, ${CAPTION_FONT}, ${CAP_FONTSIZE}, &H00FFFFFF, &H00${CHAMPAGNE:4:2}${CHAMPAGNE:2:2}${CHAMPAGNE:0:2}, &H00141210, &H96000000, -1, 0, 0, 0, 100, 100, 1, 0, 1, 6, 4, 2, ${CAP_MARGIN_LR}, ${CAP_MARGIN_LR}, ${CAP_MARGIN_V}, 1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
ASSHDR

  if [[ "$NWORDS" -gt 0 ]]; then
    # Load every word with ONE jq call (three output lines per word: start,
    # end, text) instead of spawning jq for each field of each word. Newlines
    # inside a word are replaced by spaces so the 3-line framing cannot break.
    cap_starts=()
    cap_ends=()
    cap_words=()
    while IFS= read -r cap_s && IFS= read -r cap_e && IFS= read -r cap_w; do
      cap_starts+=("$cap_s")
      cap_ends+=("$cap_e")
      cap_words+=("$cap_w")
    done < <(jq -r '.[] | .start, .end, (.word | tostring | split("\n") | join(" "))' "$FLAT_WORDS")
    # A jq failure inside the process substitution is invisible to `set -e`,
    # so verify that everything that was counted has actually been read.
    [[ "${#cap_words[@]}" -eq "$NWORDS" ]] \
      || die "STEP B — no se pudieron leer las $NWORDS palabras de $FLAT_WORDS"

    # Lines of GROUP words each (the last line may be shorter).
    GROUP=3
    i=0
    while [[ $i -lt $NWORDS ]]; do
      end_i=$(( i + GROUP ))
      if [[ $end_i -gt $NWORDS ]]; then
        end_i=$NWORDS
      fi
      line_start="${cap_starts[$i]}"
      line_end="${cap_ends[$(( end_i - 1 ))]}"
      line_text=""
      j=$i
      while [[ $j -lt $end_i ]]; do
        ws="${cap_starts[$j]}"
        # \k durations are consumed back to back from the start of the event,
        # so each word's slot must last until the NEXT word starts. Using only
        # (end - start) would drop the pauses between words and make the
        # highlight run ahead of the voice. The last word of the line keeps
        # its own end time.
        if [[ $(( j + 1 )) -lt $end_i ]]; then
          k_until="${cap_starts[$(( j + 1 ))]}"
        else
          k_until="${cap_ends[$j]}"
        fi
        wtxt="$(ass_escape "${cap_words[$j]}")"
        # karaoke duration in centiseconds, at least 1
        kdur="$(awk -v s="$ws" -v e="$k_until" 'BEGIN{d=(e-s)*100; if(d<1)d=1; printf "%d", d+0.5}')"
        line_text+="{\\k${kdur}}${wtxt} "
        j=$(( j + 1 ))
      done
      printf 'Dialogue: 0,%s,%s,Karaoke,,0,0,0,,%s\n' \
        "$(ass_time "$line_start")" "$(ass_time "$line_end")" "${line_text% }"
      i=$end_i
    done
  fi
} > "$CAP_ASS"

# grep -c already prints "0" when nothing matches (and exits 1); `|| true`
# only neutralizes that exit status. `|| echo 0` would print a second "0".
ok "STEP B — captions.ass generado ($(grep -c '^Dialogue' "$CAP_ASS" || true) lineas)"

# ===========================================================================
# STEP C — Carteles (del beats.json) -> eventos ASS, Playfair champagne, upper third
# ===========================================================================
CARTEL_ASS="$CAP_DIR/carteles.ass"
log "STEP C — generando carteles ASS -> $CARTEL_ASS"

# Champagne in ASS colour order: &H00 + BB GG RR
CHAMP_BGR="&H00${CHAMPAGNE:4:2}${CHAMPAGNE:2:2}${CHAMPAGNE:0:2}"

# Write carteles.ass: a fixed header with the "Cartel" style, followed by one
# Dialogue event for every beat in beats.json that has a cartel_text.
{
  cat <<CARTHDR
[Script Info]
ScriptType: v4.00+
PlayResX: ${W}
PlayResY: ${H}
WrapStyle: 0
ScaledBorderAndShadow: yes
YCbCr Matrix: TV.709

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Cartel, ${CARTEL_FONT}, ${CARTEL_FONTSIZE}, ${CHAMP_BGR}, ${CHAMP_BGR}, &H00050505, &H64000000, -1, 0, 0, 0, 100, 100, 2, 0, 1, 4, 3, 8, 100, 100, ${CARTEL_MARGIN_V_TOP}, 1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
CARTHDR

  cartel_nl=$'\n'
  cartel_cr=$'\r'
  N_BEATS="$(jq '.beats | length' "$BEATS")"
  k=0
  while [[ $k -lt $N_BEATS ]]; do
    # The cartel window falls back to the beat window when not given.
    c_text="$(jq -r --argjson i "$k" '.beats[$i].cartel_text // ""' "$BEATS")"
    c_start="$(jq -r --argjson i "$k" '.beats[$i].cartel_t_start // .beats[$i].t_start' "$BEATS")"
    c_end="$(jq -r --argjson i "$k" '.beats[$i].cartel_t_end // .beats[$i].t_end' "$BEATS")"
    k=$(( k + 1 ))
    if [[ -z "$c_text" || "$c_text" == "null" ]]; then
      continue
    fi
    # Braces would open an ASS override block, so neutralize them.
    c_text="$(ass_escape "$c_text")"
    # jq -r decodes a JSON "\n" escape into a REAL newline, which would split
    # the Dialogue event over two lines of the .ass file. Turn real line
    # breaks (and a two-character literal backslash-n) into the ASS hard line
    # break \N; carriage returns are dropped.
    c_text="${c_text//$cartel_cr/}"
    c_text="${c_text//$cartel_nl/\\N}"
    c_text="${c_text//\\n/\\N}"
    # subtle fade-in / fade-out
    printf 'Dialogue: 0,%s,%s,Cartel,,0,0,0,,{\\fad(220,220)}%s\n' \
      "$(ass_time "$c_start")" "$(ass_time "$c_end")" "$c_text"
  done
} > "$CARTEL_ASS"

# grep -c already prints "0" when nothing matches (and exits 1); `|| true`
# only neutralizes that exit status. `|| echo 0` would print a second "0".
ok "STEP C — carteles.ass generado ($(grep -c '^Dialogue' "$CARTEL_ASS" || true) carteles)"

# ===========================================================================
# STEP D — B-roll overlays (full-screen) solo durante la ventana de cada beat
# ===========================================================================
log "STEP D — preparando overlays de b-roll (solo b-rolls existentes)"

# Build the filtergraph incrementally.
#   INPUTS        ffmpeg input arguments; input 0 is the normalized avatar
#   FILTER_PARTS  filter chains, joined with ';' later in the script
#   cur           label of the most recent video node (starts at [base])
#   in_idx        index the NEXT b-roll input will get
# For every usable b-roll: cover-scale + crop to WxH, trim to the beat length,
# shift its PTS to the beat's t_start, then overlay it on top of `cur`, with
# the overlay enabled only while t is inside [t_start, t_end].
FILTER_PARTS=()
INPUTS=( -i "$NORM" )
in_idx=1            # 0 is the avatar
FILTER_PARTS+=("[0:v]setpts=PTS-STARTPTS,format=yuv420p[base]")
cur="[base]"

# Non-negative decimal number, optionally with an exponent.
beat_num_re='^[0-9]*[.]?[0-9]+([eE][-+]?[0-9]+)?$'

N_BEATS="$(jq '.beats | length' "$BEATS")"
brolls_used=0
b=0
while [[ $b -lt $N_BEATS ]]; do
  bf="$(jq -r --argjson i "$b" '.beats[$i].broll_file // ""' "$BEATS")"
  ts="$(jq -r --argjson i "$b" '.beats[$i].t_start' "$BEATS")"
  te="$(jq -r --argjson i "$b" '.beats[$i].t_end' "$BEATS")"
  bidx="$(jq -r --argjson i "$b" '.beats[$i].idx' "$BEATS")"
  b=$(( b + 1 ))
  if [[ -z "$bf" || "$bf" == "null" ]]; then
    continue
  fi
  bpath="$WORK/$bf"
  if [[ ! -f "$bpath" ]]; then
    warn "STEP D — b-roll ausente (skip): $bf"
    continue
  fi
  # A missing (null), non-numeric or empty/inverted window would be pasted
  # verbatim into trim/setpts/between() and make the final render fail. Skip
  # such a beat with a warning, the same way a missing b-roll file is skipped.
  if ! [[ "$ts" =~ $beat_num_re && "$te" =~ $beat_num_re ]] \
     || ! awk -v a="$ts" -v z="$te" 'BEGIN{exit !(z > a)}'; then
    warn "STEP D — beat $bidx con ventana invalida [$ts..$te] (skip): $bf"
    continue
  fi
  log "STEP D — overlay beat $bidx [$ts..$te] <- $bf"
  INPUTS+=( -i "$bpath" )
  dur="$(awk -v a="$ts" -v z="$te" 'BEGIN{printf "%.3f", z-a}')"
  # b-roll: cover + crop, cut to the beat length, PTS moved to the beat start
  FILTER_PARTS+=("[${in_idx}:v]scale=${W}:${H}:force_original_aspect_ratio=increase,crop=${W}:${H},fps=${FPS},format=yuv420p,trim=duration=${dur},setpts=PTS-STARTPTS+${ts}/TB[bv${in_idx}]")
  next="[vov${in_idx}]"
  FILTER_PARTS+=("${cur}[bv${in_idx}]overlay=0:0:enable='between(t,${ts},${te})':eof_action=pass${next}")
  cur="$next"
  in_idx=$(( in_idx + 1 ))
  brolls_used=$(( brolls_used + 1 ))
done
log "STEP D — b-rolls aplicados: $brolls_used"

# ===========================================================================
# STEP E — Audio: normalizar VO a ~-20dB via volumedetect (NO loudnorm)
# ===========================================================================
log "STEP E — midiendo mean_volume del VO (volumedetect)"
# GAIN_DB is the gain in dB that moves the measured mean_volume to -20 dB.
# It stays at 0 when there is no audio or when the measurement cannot be parsed.
GAIN_DB="0"
if [[ "$HAS_AUDIO" != "audio" ]]; then
  warn "STEP E — sin audio; se generara pista silenciosa"
else
  # volumedetect reports on stderr; keep the first mean_volume figure found.
  MEAN="$(ffmpeg -hide_banner -i "$NORM" -map 0:a -af volumedetect -f null /dev/null 2>&1 \
          | grep 'mean_volume:' | sed -E 's/.*mean_volume:[[:space:]]*(-?[0-9.]+) dB.*/\1/' | head -1 || true)"
  if [[ -z "${MEAN:-}" ]]; then
    warn "STEP E — no se pudo medir mean_volume, gain=0dB"
  else
    GAIN_DB="$(awk -v m="$MEAN" 'BEGIN{ printf "%.2f", (-20.0) - m }')"
    log "STEP E — mean_volume=${MEAN}dB -> aplicar volume=${GAIN_DB}dB (target -20dB)"
  fi
fi

# ===========================================================================
# STEP F — Final render: carteles + captions on top of [cur], normalized audio
# ===========================================================================
log "STEP F — render final -> $OUT"

# ass_path_escape PATH
# Prints PATH prepared for use as an option value of the ass/subtitles filter:
# every backslash is doubled first, then every ':' is prefixed with a
# backslash. No trailing newline is added.
ass_path_escape() {
  local escaped=$1
  escaped=${escaped//\\/\\\\}
  escaped=${escaped//:/\\:}
  printf '%s' "$escaped"
}
# Escaped paths for the two subtitle files and the fonts directory.
FONTS_DIR_ESC="$(ass_path_escape "$HOME/.fonts")"
CARTEL_ASS_ESC="$(ass_path_escape "$CARTEL_ASS")"
CAP_ASS_ESC="$(ass_path_escape "$CAP_ASS")"

# Subtitle chain: carteles are burned in first (upper third), then the
# captions (safe zone). The final video node is labelled [vout].
cartel_stage="${cur}ass=filename='${CARTEL_ASS_ESC}':fontsdir='${FONTS_DIR_ESC}'[withcartel]"
caption_stage="[withcartel]ass=filename='${CAP_ASS_ESC}':fontsdir='${FONTS_DIR_ESC}'[vout]"
SUBS="${cartel_stage};${caption_stage}"
FILTER_PARTS+=("$SUBS")

# Audio arguments: either the avatar's own track with the measured gain, or an
# extra lavfi input that generates silence.
case "$HAS_AUDIO" in
  audio)
    AUDIO_MAP=( -map "0:a" -af "volume=${GAIN_DB}dB" )
    ;;
  *)
    AUDIO_MAP=( -f lavfi -i "anullsrc=channel_layout=stereo:sample_rate=48000" -shortest )
    ;;
esac

# Join all filter chains with ';' into a single filtergraph string.
FILTERGRAPH=""
for graph_part in "${FILTER_PARTS[@]}"; do
  FILTERGRAPH+="${FILTERGRAPH:+;}${graph_part}"
done

# Keep a copy of the filtergraph on disk for debugging.
printf '%s\n' "$FILTERGRAPH" > "$TMP_DIR/filtergraph.txt"
log "STEP F — filtergraph guardado en $TMP_DIR/filtergraph.txt"

# Assemble the render command once. Only the middle section differs:
#   with audio    : filtergraph, then [vout] + the avatar audio (AUDIO_MAP)
#   without audio : AUDIO_MAP adds the silent lavfi input, which becomes input
#                   number $in_idx (avatar + b-rolls come first), then the
#                   filtergraph, then [vout] + that input's audio
render_cmd=( ffmpeg -y -hide_banner -loglevel error -stats "${INPUTS[@]}" )
if [[ "$HAS_AUDIO" == "audio" ]]; then
  render_cmd+=( -filter_complex "$FILTERGRAPH" -map "[vout]" "${AUDIO_MAP[@]}" )
else
  render_cmd+=( "${AUDIO_MAP[@]}" -filter_complex "$FILTERGRAPH" -map "[vout]" -map "${in_idx}:a" )
fi
render_cmd+=(
  -c:v libx264 -preset medium -crf 19 -pix_fmt yuv420p
  -c:a aac -b:a 192k -ar 48000
  -r "$FPS" -movflags +faststart
)
# Limit the output to the avatar duration, but only when that duration is a
# positive number: DUR is "0" when ffprobe failed earlier, and passing a zero
# or empty value to -t would make the render come out empty or fail.
if awk -v d="$DUR" 'BEGIN{exit !(d + 0 > 0)}'; then
  render_cmd+=( -t "$DUR" )
else
  warn "STEP F — duracion del avatar invalida ('$DUR'); se omite el limite -t"
fi
render_cmd+=( "$OUT" )

# Explicit `|| die`: under `set -e` a failing ffmpeg would otherwise end the
# script without any [FAIL] line.
"${render_cmd[@]}" || die "STEP F — ffmpeg fallo durante el render"

[[ -f "$OUT" ]] || die "STEP F — el render no produjo archivo de salida"

# Final verification: report the output resolution and duration
# ('?' when ffprobe cannot read them).
OUT_RES="$(ffprobe -v error -select_streams v:0 -show_entries stream=width,height -of csv=p=0 "$OUT" || echo '?')"
OUT_DUR="$(ffprobe -v error -show_entries format=duration -of default=nw=1:nk=1 "$OUT" || echo '?')"
ok "STEP F — render OK: $OUT (res=$OUT_RES, dur=${OUT_DUR}s)"
log "Listo. Output: $OUT"
