clawdbot/scripts/transcribe

#!/usr/bin/env bash
# Transcribe audio files using Whisper
# Usage: transcribe [options] <audio_file>
#        transcribe recording.mp3
#        transcribe -m medium meeting.wav

# Abort the script as soon as any command exits non-zero.
set -o errexit

# Settings with their defaults; the option parser may override each of them.
AUDIO_FILE=""
OUTPUT_FORMAT="txt"
LANGUAGE=""

# The model comes from WHISPER_MODEL when that is set and non-empty,
# otherwise it falls back to "base".
MODEL="base"
if [[ -n "${WHISPER_MODEL:-}" ]]; then
  MODEL="$WHISPER_MODEL"
fi

# Print the usage text (options, examples, model overview, environment
# variables) to stdout, one entry of the array per output line.
show_help() {
  local -a help_lines=(
    'Usage: transcribe [options] <audio_file>'
    ''
    'Options:'
    '  -m, --model MODEL     Whisper model (tiny, base, small, medium, large)'
    '                        Default: base (fast), use medium/large for accuracy'
    '  -l, --language LANG   Force language (e.g., en, es, fr)'
    '  -f, --format FORMAT   Output format (txt, json, srt, vtt)'
    '  -h, --help            Show this help'
    ''
    'Examples:'
    '  transcribe meeting.mp3               # Quick transcription'
    '  transcribe -m medium interview.wav   # Better accuracy'
    '  transcribe -l en -f srt podcast.mp3  # English subtitles'
    ''
    'Models (speed vs accuracy):'
    '  tiny   - Fastest, lowest accuracy (~1GB VRAM)'
    '  base   - Fast, good accuracy (~1GB VRAM) [default]'
    '  small  - Balanced (~2GB VRAM)'
    '  medium - Better accuracy (~5GB VRAM)'
    '  large  - Best accuracy (~10GB VRAM)'
    ''
    'Environment:'
    '  WHISPER_MODEL    Default model (default: base)'
  )
  printf '%s\n' "${help_lines[@]}"
}

# Parse the command line into the global settings.
#
# Arguments: the script's argument list (pass "$@").
# Globals:   writes MODEL, LANGUAGE, OUTPUT_FORMAT and AUDIO_FILE.
# Exits:     0 after printing help for -h/--help;
#            1 on an unknown option or an option that lacks its value.
# Options and the file may appear in any order; if several non-option words
# are given, the last one becomes AUDIO_FILE.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -m|--model|-l|--language|-f|--format)
        # Without this check a trailing value-taking option makes `shift 2`
        # fail, which under `set -e` ends the script with no message at all.
        if [[ $# -lt 2 ]]; then
          echo "Error: Option $1 requires an argument" >&2
          exit 1
        fi
        case "$1" in
          -m|--model)    MODEL="$2" ;;
          -l|--language) LANGUAGE="$2" ;;
          -f|--format)   OUTPUT_FORMAT="$2" ;;
        esac
        shift 2
        ;;
      -h|--help)
        show_help
        exit 0
        ;;
      -*)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
      *)
        AUDIO_FILE="$1"
        shift
        ;;
    esac
  done
}

parse_args "$@"

# Validate the input before any transcription work starts.

# No file given is a usage error: both the message and the help text go to
# stderr so that stdout carries nothing but whisper's own output.
if [[ -z "$AUDIO_FILE" ]]; then
  echo "Error: No audio file provided" >&2
  show_help >&2
  exit 1
fi

# The path must name an existing regular file (symlinks to files pass -f).
if [[ ! -f "$AUDIO_FILE" ]]; then
  echo "Error: File not found: $AUDIO_FILE" >&2
  exit 1
fi

# Invoke whisper on AUDIO_FILE with the selected settings.
#
# Globals: reads AUDIO_FILE, MODEL, OUTPUT_FORMAT and LANGUAGE.
# Returns: the exit status of the whisper command.
run_whisper() {
  # The command line is an array so every value reaches whisper as exactly
  # one argument. Building a string and eval-ing it would let a file name
  # containing quotes, $(...) or backticks run arbitrary shell code.
  local -a cmd=(
    whisper "$AUDIO_FILE"
    --model "$MODEL"
    --output_format "$OUTPUT_FORMAT"
  )

  # --language is passed only when the user asked for a specific language.
  if [[ -n "$LANGUAGE" ]]; then
    cmd+=(--language "$LANGUAGE")
  fi

  "${cmd[@]}"
}

# Run transcription
echo "Transcribing: $AUDIO_FILE (model: $MODEL)" >&2
run_whisper

# Show output location: the input's file name with its directory and last
# extension removed, plus the chosen format.
# NOTE(review): assumes whisper writes <name>.<format> into the current
# directory -- confirm against the installed whisper version.
BASE="${AUDIO_FILE##*/}"
BASE="${BASE%.*}"
echo "" >&2
echo "Output: ${BASE}.${OUTPUT_FORMAT}" >&2