diff --git a/CHANGELOG.md b/CHANGELOG.md index 59832a7..9d358e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -124,10 +124,17 @@ All notable changes to Flynn are documented in this file. - **Gateway Server** -- `GatewayServerConfig` now accepts `channelRegistry` for channel status reporting; static file server supports `.mjs`, `.png`, `.ico`, `.woff2` - **Entry Points Refactored** -- `src/index.ts` and `src/tui.ts` now delegate to - the CLI module (`src/cli/index.ts`) instead of directly starting the daemon/TUI -- **Daemon Wiring** -- CronScheduler auto-registers in the channel registry when + CLI module (`src/cli/index.ts`) instead of directly starting daemon/TUI +- **Daemon Wiring** -- CronScheduler auto-registers in channel registry when `automation.cron` jobs are configured; channelRegistry passed to GatewayServer +### Fixed + +- **Voice Message Failure Handling** -- Telegram voice/audio messages now send user feedback on + download failures instead of silently dropping them. When audio transcription is not configured for + non-audio-capable models, a graceful error message is sent to the user instead of + an empty message, which would cause an API crash. 
+ ## [0.1.0] - 2026-02-05 ### Added diff --git a/README.md b/README.md index b9e4a5c..f0f9e7a 100644 --- a/README.md +++ b/README.md @@ -170,20 +170,37 @@ Configure a Whisper-compatible endpoint for models that don't support native aud ```yaml audio: - transcription_endpoint: "http://localhost:8080/v1/audio/transcriptions" - transcription_api_key: "${WHISPER_API_KEY}" # Optional Bearer token - transcription_model: "whisper-1" # Model name (default: whisper-1) - transcription_provider: "openai" # Provider format: openai (default) + enabled: true + provider: + type: custom # openai, groq, ollama, llamacpp, custom + endpoint: "http://localhost:18801/v1/audio/transcriptions" + api_key: "${WHISPER_API_KEY}" # Optional Bearer token + model: "whisper-1" # Model name (default: whisper-1) ``` | Field | Required | Description | |-------|----------|-------------| -| `transcription_endpoint` | yes | Whisper-compatible API endpoint | -| `transcription_api_key` | no | Bearer token for authentication | -| `transcription_model` | no | Model name sent in the request (default: `whisper-1`) | -| `transcription_provider` | no | API format: `openai` (default) | +| `enabled` | no | Enable audio transcription (default: `false`) | +| `provider.type` | yes | Provider type: `openai`, `groq`, `ollama`, `llamacpp`, or `custom` | +| `provider.endpoint` | yes | Whisper-compatible API endpoint | +| `provider.api_key` | no | Bearer token for authentication | +| `provider.model` | no | Model name sent in request (default: `whisper-1`) | -Without an `audio` config, voice messages from non-audio-capable models are silently skipped. +Without an `audio` config, voice messages sent to non-audio-capable models result in an error message being shown to the user. 
For local transcription, you can run a whisper.cpp server: + +```bash +# Start whisper.cpp server with OpenAI-compatible endpoint +docker run -d \ + --name whisper-server \ + -p 18801:8080 \ + ghcr.io/ggml-org/whisper.cpp:main \ + --model /app/models/ggml-base.en.bin \ + --host 0.0.0.0 \ + --port 8080 \ + --convert \ + --language en \ + --inference-path /v1/audio/transcriptions +``` ## Telegram Commands diff --git a/config/default.yaml b/config/default.yaml index 56f1849..1462e19 100644 --- a/config/default.yaml +++ b/config/default.yaml @@ -123,9 +123,18 @@ hooks: # Configure a Whisper-compatible endpoint for audio transcription. # Models that support native audio input (Gemini, OpenAI, GitHub) will # receive raw audio directly; others fall back to this endpoint. - +# +# For local transcription with whisper.cpp: +# docker run -d --name whisper-server -p 18801:8080 \ +# ghcr.io/ggml-org/whisper.cpp:main \ +# --model /app/models/ggml-base.en.bin \ +# --host 0.0.0.0 --port 8080 --convert --language en \ +# --inference-path /v1/audio/transcriptions +# # audio: -# transcription_endpoint: "http://localhost:8080/v1/audio/transcriptions" -# transcription_api_key: "${WHISPER_API_KEY}" -# transcription_model: "whisper-1" -# transcription_provider: "openai" +# enabled: true +# provider: +# type: custom # openai, groq, ollama, llamacpp, custom +# endpoint: "http://localhost:18801/v1/audio/transcriptions" +# api_key: "${WHISPER_API_KEY}" # Optional Bearer token +# model: "whisper-1" # Model name (default: whisper-1)