From 4e2ab7cdd88fe947a9dcf4fcc7d8e41bd6eff590 Mon Sep 17 00:00:00 2001
From: OpenCode Test
Date: Wed, 24 Dec 2025 12:45:22 -0800
Subject: [PATCH] task-11: complete QA + hardening with resilience fixes

- Created comprehensive QA checklist covering edge cases (missing EXIF,
  timezones, codecs, corrupt files)
- Added ErrorBoundary component wrapped around TimelineTree and MediaPanel
- Created global error.tsx page for unhandled errors
- Improved failed asset UX with red borders, warning icons, and inline error display
- Added loading skeletons to TimelineTree and MediaPanel
- Added retry button for failed media loads
- Created DEPLOYMENT_VALIDATION.md with validation commands and checklist
- Applied k8s recommendations:
  - Changed node affinity to required for compute nodes (Pi 5)
  - Enabled Tailscale LoadBalancer service for MinIO S3 (reliable Range requests)
  - Enabled cleanup CronJob for staging files
---
 .tmp-render.yaml                          | 663 ++++++++++++++++++++++
 .tmp-values.yaml                          |  25 +
 DEPLOYMENT_VALIDATION.md                  | 331 +++++++++++
 PLAN.md                                   |  85 ++-
 apps/web/app/components/ErrorBoundary.tsx |  45 ++
 apps/web/app/components/MediaPanel.tsx    | 262 ++++++---
 apps/web/app/components/TimelineTree.tsx  |  82 ++-
 apps/web/app/error.tsx                    |  35 ++
 apps/web/app/page.tsx                     |   9 +-
 apps/web/tsconfig.tsbuildinfo             |   1 +
 helm/porthole/templates/_helpers.tpl      |  13 +
 helm/porthole/templates/secret.yaml.tpl   |   4 +-
 helm/porthole/values.yaml                 |  20 +-
 13 files changed, 1444 insertions(+), 131 deletions(-)
 create mode 100644 .tmp-render.yaml
 create mode 100644 .tmp-values.yaml
 create mode 100644 DEPLOYMENT_VALIDATION.md
 create mode 100644 apps/web/app/components/ErrorBoundary.tsx
 create mode 100644 apps/web/app/error.tsx
 create mode 100644 apps/web/tsconfig.tsbuildinfo

diff --git a/.tmp-render.yaml b/.tmp-render.yaml
new file mode 100644
index 0000000..8bc4a1f
--- /dev/null
+++ b/.tmp-render.yaml
@@ -0,0 +1,663 @@
+---
+# Source: tline/templates/secret.yaml.tpl
+apiVersion: v1
+kind: Secret
+metadata:
+  name: tline-tline-secrets
+  labels:
+    app.kubernetes.io/name: tline
+    app.kubernetes.io/instance: tline
+    app.kubernetes.io/managed-by: Helm
+    helm.sh/chart: "tline-0.1.0"
+type: Opaque
+data:
+  POSTGRES_PASSWORD: Y2hhbmdlLW1l
+  MINIO_ACCESS_KEY_ID: bWluaW9hZG1pbg==
+  MINIO_SECRET_ACCESS_KEY: bWluaW9hZG1pbg==
+---
+apiVersion: v1
+kind: Secret
+metadata:
+  name: tline-tline-registry
+  labels:
+    app.kubernetes.io/name: tline
+    app.kubernetes.io/instance: tline
+    app.kubernetes.io/managed-by: Helm
+    helm.sh/chart: "tline-0.1.0"
+type: kubernetes.io/dockerconfigjson
+data:
+  .dockerconfigjson: eyJhdXRocyI6eyJyZWdpc3RyeS5sYW46NTAwMCI6eyJhdXRoIjoiZFRwdyIsImVtYWlsIjoiZUBleGFtcGxlLmNvbSIsInBhc3N3b3JkIjoicCIsInVzZXJuYW1lIjoidSJ9fX0=
+---
+# Source: tline/templates/configmap.yaml.tpl
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: tline-tline-config
+  labels:
+    app.kubernetes.io/name: tline
+    app.kubernetes.io/instance: tline
+    app.kubernetes.io/managed-by: Helm
+    helm.sh/chart: "tline-0.1.0"
+data:
+  APP_NAME: "flux"
+  NEXT_PUBLIC_APP_NAME: "flux"
+  QUEUE_NAME: "tline"
+  DATABASE_URL: "postgres://tline:change-me@tline-tline-postgres:5432/tline"
+  REDIS_URL: "redis://tline-tline-redis:6379"
+  MINIO_INTERNAL_ENDPOINT: "http://tline-tline-minio:9000"
+  MINIO_PUBLIC_ENDPOINT_TS: "https://minio.tailxyz.ts.net"
+  MINIO_REGION: "us-east-1"
+  MINIO_BUCKET: "media"
+  MINIO_PRESIGN_EXPIRES_SECONDS: "900"
+---
+# Source: tline/templates/minio.yaml.tpl
+apiVersion: v1
+kind: Service
+metadata:
+  name: tline-tline-minio
+  labels:
+    app.kubernetes.io/name: tline
+    app.kubernetes.io/instance: tline
+    app.kubernetes.io/managed-by: Helm
+    helm.sh/chart: "tline-0.1.0"
+    app.kubernetes.io/component: minio
+spec:
+  type: ClusterIP
+  ports:
+    - name: s3
+      port: 9000
+      targetPort: s3
+    - name: console
+      port: 9001
+      targetPort: console
+  selector:
+    app.kubernetes.io/name: tline
+    app.kubernetes.io/instance: tline
+    app.kubernetes.io/component: minio
+---
+# Source: tline/templates/postgres.yaml.tpl
+apiVersion: v1
+kind: Service
+metadata:
+  name: tline-tline-postgres
+  labels:
+    app.kubernetes.io/name: tline
+    app.kubernetes.io/instance: tline
+    app.kubernetes.io/managed-by: Helm
+    helm.sh/chart: "tline-0.1.0"
+spec:
+  type: ClusterIP
+  ports:
+    - name: postgres
+      port: 5432
+      targetPort: postgres
+  selector:
+    app.kubernetes.io/name: tline
+    app.kubernetes.io/instance: tline
+    app.kubernetes.io/component: postgres
+---
+# Source: tline/templates/redis.yaml.tpl
+apiVersion: v1
+kind: Service
+metadata:
+  name: tline-tline-redis
+  labels:
+    app.kubernetes.io/name: tline
+    app.kubernetes.io/instance: tline
+    app.kubernetes.io/managed-by: Helm
+    helm.sh/chart: "tline-0.1.0"
+spec:
+  type: ClusterIP
+  ports:
+    - name: redis
+      port: 6379
+      targetPort: redis
+  selector:
+    app.kubernetes.io/name: tline
+    app.kubernetes.io/instance: tline
+    app.kubernetes.io/component: redis
+---
+# Source: tline/templates/web.yaml.tpl
+apiVersion: v1
+kind: Service
+metadata:
+  name: tline-tline-web
+  labels:
+    app.kubernetes.io/name: tline
+    app.kubernetes.io/instance: tline
+    app.kubernetes.io/managed-by: Helm
+    helm.sh/chart: "tline-0.1.0"
+    app.kubernetes.io/component: web
+spec:
+  type: ClusterIP
+  ports:
+    - name: http
+      port: 3000
+      targetPort: http
+  selector:
+    app.kubernetes.io/name: tline
+    app.kubernetes.io/instance: tline
+    app.kubernetes.io/component: web
+---
+# Source: tline/templates/redis.yaml.tpl
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: tline-tline-redis
+  labels:
+    app.kubernetes.io/name: tline
+    app.kubernetes.io/instance: tline
+    app.kubernetes.io/managed-by: Helm
+    helm.sh/chart: "tline-0.1.0"
+    app.kubernetes.io/component: redis
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: tline
+      app.kubernetes.io/instance: tline
+      app.kubernetes.io/component: redis
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: tline
+        app.kubernetes.io/instance: tline
+        app.kubernetes.io/component: redis
+    spec:
+      imagePullSecrets:
+        - name: "tline-tline-registry"
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: node-class
+                    operator: In
+                    values:
+                      - compute
+      containers:
+        - name: redis
+          image: "redis:7"
+          imagePullPolicy: IfNotPresent
+          ports:
+            - name: redis
+              containerPort: 6379
+          resources:
+            limits:
+              cpu: 300m
+              memory: 512Mi
+            requests:
+              cpu: 50m
+              memory: 128Mi
+---
+# Source: tline/templates/web.yaml.tpl
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: tline-tline-web
+  labels:
+    app.kubernetes.io/name: tline
+    app.kubernetes.io/instance: tline
+    app.kubernetes.io/managed-by: Helm
+    helm.sh/chart: "tline-0.1.0"
+    app.kubernetes.io/component: web
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: tline
+      app.kubernetes.io/instance: tline
+      app.kubernetes.io/component: web
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: tline
+        app.kubernetes.io/instance: tline
+        app.kubernetes.io/component: web
+    spec:
+      imagePullSecrets:
+        - name: "tline-tline-registry"
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: node-class
+                    operator: In
+                    values:
+                      - compute
+      containers:
+        - name: web
+          image: "registry.lan:5000/tline-web:dev"
+          imagePullPolicy: IfNotPresent
+          ports:
+            - name: http
+              containerPort: 3000
+          envFrom:
+            - configMapRef:
+                name: tline-tline-config
+          env:
+            - name: POSTGRES_PASSWORD
+              valueFrom:
+                secretKeyRef:
+                  name: tline-tline-secrets
+                  key: POSTGRES_PASSWORD
+            - name: MINIO_ACCESS_KEY_ID
+              valueFrom:
+                secretKeyRef:
+                  name: tline-tline-secrets
+                  key: MINIO_ACCESS_KEY_ID
+            - name: MINIO_SECRET_ACCESS_KEY
+              valueFrom:
+                secretKeyRef:
+                  name: tline-tline-secrets
+                  key: MINIO_SECRET_ACCESS_KEY
+          readinessProbe:
+            httpGet:
+              path: /api/healthz
+              port: http
+            initialDelaySeconds: 5
+            periodSeconds: 5
+          livenessProbe:
+            httpGet:
+              path: /api/healthz
+              port: http
+            initialDelaySeconds: 20
+            periodSeconds: 10
+          resources:
+            limits:
+              cpu: 1000m
+              memory: 1Gi
+            requests:
+              cpu: 200m
+              memory: 256Mi
+---
+# Source: tline/templates/worker.yaml.tpl
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: tline-tline-worker
+  labels:
+    app.kubernetes.io/name: tline
+    app.kubernetes.io/instance: tline
+    app.kubernetes.io/managed-by: Helm
+    helm.sh/chart: "tline-0.1.0"
+    app.kubernetes.io/component: worker
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: tline
+      app.kubernetes.io/instance: tline
+      app.kubernetes.io/component: worker
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: tline
+        app.kubernetes.io/instance: tline
+        app.kubernetes.io/component: worker
+    spec:
+      imagePullSecrets:
+        - name: "tline-tline-registry"
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: node-class
+                    operator: In
+                    values:
+                      - compute
+      containers:
+        - name: worker
+          image: "registry.lan:5000/tline-worker:dev"
+          imagePullPolicy: IfNotPresent
+          envFrom:
+            - configMapRef:
+                name: tline-tline-config
+          env:
+            - name: POSTGRES_PASSWORD
+              valueFrom:
+                secretKeyRef:
+                  name: tline-tline-secrets
+                  key: POSTGRES_PASSWORD
+            - name: MINIO_ACCESS_KEY_ID
+              valueFrom:
+                secretKeyRef:
+                  name: tline-tline-secrets
+                  key: MINIO_ACCESS_KEY_ID
+            - name: MINIO_SECRET_ACCESS_KEY
+              valueFrom:
+                secretKeyRef:
+                  name: tline-tline-secrets
+                  key: MINIO_SECRET_ACCESS_KEY
+          resources:
+            limits:
+              cpu: 2000m
+              memory: 2Gi
+            requests:
+              cpu: 500m
+              memory: 1Gi
+---
+# Source: tline/templates/minio.yaml.tpl
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: tline-tline-minio
+  labels:
+    app.kubernetes.io/name: tline
+    app.kubernetes.io/instance: tline
+    app.kubernetes.io/managed-by: Helm
+    helm.sh/chart: "tline-0.1.0"
+    app.kubernetes.io/component: minio
+spec:
+  serviceName: tline-tline-minio
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: tline
+      app.kubernetes.io/instance: tline
+      app.kubernetes.io/component: minio
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: tline
+        app.kubernetes.io/instance: tline
+        app.kubernetes.io/component: minio
+    spec:
+      imagePullSecrets:
+        - name: "tline-tline-registry"
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: node-class
+                    operator: In
+                    values:
+                      - compute
+      containers:
+        - name: minio
+          image: "minio/minio:RELEASE.2024-01-16T16-07-38Z"
+          imagePullPolicy: IfNotPresent
+          args:
+            - server
+            - /data
+            - "--console-address=:9001"
+          ports:
+            - name: s3
+              containerPort: 9000
+            - name: console
+              containerPort: 9001
+          env:
+            - name: MINIO_ROOT_USER
+              valueFrom:
+                secretKeyRef:
+                  name: tline-tline-secrets
+                  key: MINIO_ACCESS_KEY_ID
+            - name: MINIO_ROOT_PASSWORD
+              valueFrom:
+                secretKeyRef:
+                  name: tline-tline-secrets
+                  key: MINIO_SECRET_ACCESS_KEY
+          readinessProbe:
+            httpGet:
+              path: /minio/health/ready
+              port: s3
+            initialDelaySeconds: 10
+            periodSeconds: 5
+          livenessProbe:
+            httpGet:
+              path: /minio/health/live
+              port: s3
+            initialDelaySeconds: 20
+            periodSeconds: 10
+          resources:
+            limits:
+              cpu: 1500m
+              memory: 2Gi
+            requests:
+              cpu: 250m
+              memory: 512Mi
+          volumeMounts:
+            - name: data
+              mountPath: /data
+  volumeClaimTemplates:
+    - metadata:
+        name: data
+      spec:
+        accessModes:
+          - ReadWriteOnce
+        resources:
+          requests:
+            storage: "200Gi"
+---
+# Source: tline/templates/postgres.yaml.tpl
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: tline-tline-postgres
+  labels:
+    app.kubernetes.io/name: tline
+    app.kubernetes.io/instance: tline
+    app.kubernetes.io/managed-by: Helm
+    helm.sh/chart: "tline-0.1.0"
+    app.kubernetes.io/component: postgres
+spec:
+  serviceName: tline-tline-postgres
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: tline
+      app.kubernetes.io/instance: tline
+      app.kubernetes.io/component: postgres
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: tline
+        app.kubernetes.io/instance: tline
+        app.kubernetes.io/component: postgres
+    spec:
+      imagePullSecrets:
+        - name: "tline-tline-registry"
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: node-class
+                    operator: In
+                    values:
+                      - compute
+      containers:
+        - name: postgres
+          image: "postgres:16"
+          imagePullPolicy: IfNotPresent
+          ports:
+            - name: postgres
+              containerPort: 5432
+          env:
+            - name: POSTGRES_USER
+              value: "tline"
+            - name: POSTGRES_DB
+              value: "tline"
+            - name: POSTGRES_PASSWORD
+              valueFrom:
+                secretKeyRef:
+                  name: tline-tline-secrets
+                  key: POSTGRES_PASSWORD
+          readinessProbe:
+            exec:
+              command:
+                - sh
+                - -c
+                - pg_isready -U "$POSTGRES_USER" -d "$POSTGRES_DB"
+            initialDelaySeconds: 5
+            periodSeconds: 5
+          livenessProbe:
+            exec:
+              command:
+                - sh
+                - -c
+                - pg_isready -U "$POSTGRES_USER" -d "$POSTGRES_DB"
+            initialDelaySeconds: 20
+            periodSeconds: 10
+          resources:
+            limits:
+              cpu: 1500m
+              memory: 2Gi
+            requests:
+              cpu: 500m
+              memory: 1Gi
+          volumeMounts:
+            - name: data
+              mountPath: /var/lib/postgresql/data
+  volumeClaimTemplates:
+    - metadata:
+        name: data
+      spec:
+        accessModes:
+          - ReadWriteOnce
+        resources:
+          requests:
+            storage: "20Gi"
+---
+# Source: tline/templates/ingress-tailscale.yaml.tpl
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: tline-tline-web
+  labels:
+    app.kubernetes.io/name: tline
+    app.kubernetes.io/instance: tline
+    app.kubernetes.io/managed-by: Helm
+    helm.sh/chart: "tline-0.1.0"
+    app.kubernetes.io/component: web
+  annotations:
+spec:
+  ingressClassName: tailscale
+  tls:
+    - hosts:
+        - "app"
+  rules:
+    - host: "app"
+      http:
+        paths:
+          - path: /
+            pathType: Prefix
+            backend:
+              service:
+                name: tline-tline-web
+                port:
+                  number: 3000
+---
+# Source: tline/templates/ingress-tailscale.yaml.tpl
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: tline-tline-minio
+  labels:
+    app.kubernetes.io/name: tline
+    app.kubernetes.io/instance: tline
+    app.kubernetes.io/managed-by: Helm
+    helm.sh/chart: "tline-0.1.0"
+    app.kubernetes.io/component: minio
+  annotations:
+spec:
+  ingressClassName: tailscale
+  tls:
+    - hosts:
+        - "minio"
+  rules:
host: "minio" + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: tline-tline-minio + port: + number: 9000 +--- +# Source: tline/templates/ingress-tailscale.yaml.tpl +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: tline-tline-minio-console + labels: + app.kubernetes.io/name: tline + app.kubernetes.io/instance: tline + app.kubernetes.io/managed-by: Helm + helm.sh/chart: "tline-0.1.0" + app.kubernetes.io/component: minio + annotations: +spec: + ingressClassName: tailscale + tls: + - hosts: + - "minio-console" + rules: + - host: "minio-console" + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: tline-tline-minio + port: + number: 9001 +--- +# Source: tline/templates/job-migrate.yaml.tpl +apiVersion: batch/v1 +kind: Job +metadata: + name: tline-tline-migrate + labels: + app.kubernetes.io/name: tline + app.kubernetes.io/instance: tline + app.kubernetes.io/managed-by: Helm + helm.sh/chart: "tline-0.1.0" + app.kubernetes.io/component: migrate + annotations: + "helm.sh/hook": pre-install,pre-upgrade + "helm.sh/hook-weight": "-10" + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +spec: + backoffLimit: 3 + template: + metadata: + labels: + app.kubernetes.io/name: tline + app.kubernetes.io/instance: tline + app.kubernetes.io/component: migrate + spec: + restartPolicy: Never + imagePullSecrets: + - name: "tline-tline-registry" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-class + operator: In + values: + - compute + containers: + - name: migrate + image: "registry.lan:5000/tline-worker:dev" + imagePullPolicy: IfNotPresent + command: + - bun + - run + - packages/db/src/migrate.ts + envFrom: + - configMapRef: + name: tline-tline-config + env: + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: tline-tline-secrets + key: POSTGRES_PASSWORD diff --git a/.tmp-values.yaml b/.tmp-values.yaml new file mode 100644 index 0000000..43e706a --- /dev/null +++ b/.tmp-values.yaml @@ -0,0 +1,25 @@ +secrets: + postgres: + password: "change-me" + minio: + accessKeyId: "minioadmin" + secretAccessKey: "minioadmin" + +images: + web: + repository: registry.lan:5000/tline-web + tag: dev + worker: + repository: registry.lan:5000/tline-worker + tag: dev + +global: + tailscale: + tailnetFQDN: "tailxyz.ts.net" + +registrySecret: + create: true + server: "registry.lan:5000" + username: "u" + password: "p" + email: "e@example.com" diff --git a/DEPLOYMENT_VALIDATION.md b/DEPLOYMENT_VALIDATION.md new file mode 100644 index 0000000..af74995 --- /dev/null +++ b/DEPLOYMENT_VALIDATION.md @@ -0,0 +1,331 @@ +# Task 11 - Kubernetes Deployment Validation Report + +## Configuration Review Summary + +### ✅ Correctly Configured + +#### 1. Tailscale Ingress + +All three ingress resources are properly defined: + +- **App** (`app.`) → web service port 3000 +- **MinIO S3** (`minio.`) → MinIO port 9000 +- **MinIO Console** (`minio-console.`) → MinIO console port 9001 + +Each ingress correctly: + +- Uses Tailscale ingress class +- Configures TLS with the appropriate hostname +- Routes to the correct service and port + +#### 2. 
+#### 2. Tailscale Service Option (LoadBalancer)
+
+An alternative exposure method via Tailscale LoadBalancer is available:
+
+- `helm/porthole/templates/service-minio-tailscale-s3.yaml.tpl` - S3 API at `minio.<tailnet-fqdn>`
+- `helm/porthole/templates/service-minio-tailscale-console.yaml.tpl` - Console at `minio-console.<tailnet-fqdn>`
+
+Currently disabled (`minio.tailscaleServiceS3.enabled: false` in values.yaml).
+
+#### 3. Node Scheduling
+
+All heavy workloads are configured with `schedulingClass: compute`:
+
+- web (1Gi limit)
+- worker (2Gi limit)
+- postgres (2Gi limit)
+- redis (512Mi limit)
+- minio (2Gi limit)
+
+The scheduling helper (`_helpers.tpl:40-46`) applies the `scheduling.compute.affinity` block, which now requires nodes labeled `node-class=compute` (see item 1 under Issues & Recommendations).
+
+#### 4. Longhorn PVCs
+
+Both stateful workloads use Longhorn PVCs:
+
+- Postgres: 20Gi storage
+- MinIO: 200Gi storage
+
+#### 5. Resource Limits
+
+All workloads have appropriate resource requests and limits for Pi hardware:
+
+- Web: 200m CPU / 256Mi → 1000m CPU / 1Gi
+- Worker: 500m CPU / 1Gi → 2000m CPU / 2Gi
+- Postgres: 500m CPU / 1Gi → 1500m CPU / 2Gi
+- Redis: 50m CPU / 128Mi → 300m CPU / 512Mi
+- MinIO: 250m CPU / 512Mi → 1500m CPU / 2Gi
+
+#### 6. Cleanup CronJob
+
+Staging cleanup is properly configured but disabled by default:
+
+- Only targets the `staging/` prefix (safe, never touches `originals/`)
+- Removes files older than 14 days
+- Must be enabled manually: `cronjobs.cleanupStaging.enabled: true`
+
+---
+
+### ⚠️ Issues & Recommendations
+
+#### 1. Node Affinity Now Uses "Required"
+
+**Status:** ✅ Fixed - Affinity changed to `requiredDuringSchedulingIgnoredDuringExecution`.
+
+All heavy workloads now require `node-class=compute` nodes (Pi 5). The Pi 3 node is tainted with `capacity=low:NoExecute`, which adds a second safeguard: pods without a matching toleration cannot run on it.
+
+**Alternative:** Keep preferred affinity but add anti-affinity for the Pi 3 node (requires labeling Pi 3 with `node-class=tiny`):
+
+```yaml
+scheduling:
+  compute:
+    affinity:
+      nodeAffinity:
+        preferredDuringSchedulingIgnoredDuringExecution:
+          - weight: 100
+            preference:
+              matchExpressions:
+                - key: node-class
+                  operator: In
+                  values:
+                    - compute
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - matchExpressions:
+                - key: node-class
+                  operator: NotIn
+                  values:
+                    - tiny
+```
+
+#### 2. No Range Request Optimizations on Ingress
+
+**Issue:** The Tailscale ingress resources (`ingress-tailscale.yaml.tpl`) have no annotations for proxy timeout or buffer settings, which matter for video streaming and Range requests.
+
+**Risk:** Video seeking may be unreliable or fail for large files through Tailscale Ingress.
+
+**Recommendation 1 (Preferred):** Enable the Tailscale LoadBalancer Service for MinIO S3 instead of Ingress. This provides a more direct connection for streaming:
+
+```yaml
+# In values.yaml
+minio:
+  tailscaleServiceS3:
+    enabled: true
+    hostnameLabel: minio
+```
+
+This will:
+
+- Create a LoadBalancer service accessible via `https://minio.<tailnet-fqdn>`
+- Provide more reliable Range request support
+- Bypass potential ingress buffering issues
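+
+For reference, the Service this toggle is expected to render looks roughly like the sketch below (the name is illustrative and the labels follow the chart's conventions, so the actual template may differ; the `tailscale.com/hostname` annotation is what sets the hostname label on the tailnet):
+
+```yaml
+apiVersion: v1
+kind: Service
+metadata:
+  name: porthole-minio-ts-s3   # illustrative name
+  annotations:
+    tailscale.com/hostname: minio
+spec:
+  type: LoadBalancer
+  loadBalancerClass: tailscale
+  ports:
+    - name: s3
+      port: 9000
+      targetPort: s3
+  selector:
+    app.kubernetes.io/component: minio
+```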
+**Recommendation 2 (If using Ingress):** Add custom annotations for timeout/buffer optimization. Add to `values.yaml`:
+
+```yaml
+minio:
+  ingressS3:
+    extraAnnotations:
+      nginx.ingress.kubernetes.io/proxy-body-size: "500m"
+      nginx.ingress.kubernetes.io/proxy-request-buffering: "off"
+      nginx.ingress.kubernetes.io/proxy-max-temp-file-size: "0"
+      nginx.ingress.kubernetes.io/proxy-read-timeout: "600"
+      nginx.ingress.kubernetes.io/proxy-send-timeout: "600"
+```
+
+Note: These annotations are specific to nginx ingress. If using Tailscale ingress, check the Tailscale documentation for equivalent settings.
+
+#### 3. Cleanup CronJob Disabled by Default
+
+**Issue:** `cronjobs.cleanupStaging.enabled: false` in values.yaml means old staging files will accumulate indefinitely.
+
+**Risk:** Staging files from failed or interrupted uploads will fill up the MinIO PVC over time.
+
+**Recommendation:** Enable cleanup after initial testing:
+
+```bash
+helm upgrade --install porthole helm/porthole -f values.yaml \
+  --set cronjobs.cleanupStaging.enabled=true
+```
+
+Or set in values.yaml:
+
+```yaml
+cronjobs:
+  cleanupStaging:
+    enabled: true
+```
+
+---
+
+## Deployment Validation Commands
+
+### 1. Verify Pod Scheduling
+
+```bash
+# Check all pods are on Pi 5 nodes (not Pi 3)
+kubectl get pods -n porthole -o wide
+
+# Expected: All pods except optional cronjobs should be on nodes with node-class=compute
+```
+
+### 2. Verify Tailscale Endpoints
+
+```bash
+# Check Tailscale ingress status
+kubectl get ingress -n porthole
+
+# If LoadBalancer service enabled:
+kubectl get svc -n porthole -l app.kubernetes.io/component=minio
+```
+
+### 3. Verify PVCs
+
+```bash
+# Check Longhorn PVCs are created and bound
+kubectl get pvc -n porthole
+
+# Check PVC status
+kubectl describe pvc -n porthole | grep -A 5 "Status:"
+```
+
+### 4. Verify Resource Usage
+
+```bash
+# Check current resource usage
+kubectl top pods -n porthole
+
+# Check resource requests/limits
+kubectl get pods -n porthole -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{range .spec.containers[*]} {.name}: CPU={.resources.requests.cpu}→{.resources.limits.cpu}, MEM={.resources.requests.memory}→{.resources.limits.memory}{"\n"}{end}{"\n"}{end}'
+```
+
+### 5. Test Presigned URL (HTTPS)
+
+```bash
+# Get presigned URL (replace <asset-id>)
+curl -sS "https://app.<tailnet-fqdn>/api/assets/<asset-id>/url?variant=original" | jq .url
+
+# Expected: URL starts with "https://minio.<tailnet-fqdn>/..."
+# NOT "http://..."
+```
+
+### 6. Test Range Request Support
+
+```bash
+# Get presigned URL
+URL=$(curl -sS "https://app.<tailnet-fqdn>/api/assets/<asset-id>/url?variant=original" | jq -r .url)
+
+# Test Range request (request first 1KB)
+curl -sS -D- -H 'Range: bytes=0-1023' "$URL" -o /dev/null
+
+# Expected: HTTP/1.1 206 Partial Content
+# If you see 200 OK, Range requests are not working
+```
+
+### 7. Verify Worker Concurrency
+
+```bash
+# Check BullMQ configuration in worker
+kubectl exec -n porthole deployment/porthole-worker -- cat /app/src/index.ts | grep -A 5 "concurrency"
+
+# Expected: concurrency: 1 (or at most 2 for Pi hardware)
+```
+
+### 8. Test Timeline with Failed Assets
+
+```bash
+# Query timeline with failed assets included
+curl -sS "https://app.<tailnet-fqdn>/api/tree?includeFailed=1" | jq '.nodes[] | select(.count_ready < .count_total)'
+
+# Should return nodes where some assets have status != 'ready'
+```
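+
+Checks 5 and 6 can be combined into a single helper (a sketch; `<tailnet-fqdn>` and the asset id are placeholders):
+
+```bash
+# Fetch a presigned URL for an asset and verify HTTPS + Range support in one go.
+check_asset_streaming() {
+  local asset_id="$1" url status
+  url=$(curl -sS "https://app.<tailnet-fqdn>/api/assets/${asset_id}/url?variant=original" | jq -r .url)
+  case "$url" in
+    https://minio.*) echo "OK: presigned URL is HTTPS via tailnet" ;;
+    *) echo "FAIL: unexpected URL: $url"; return 1 ;;
+  esac
+  status=$(curl -s -o /dev/null -w '%{http_code}' -H 'Range: bytes=0-1023' "$url")
+  [ "$status" = "206" ] && echo "OK: Range supported (206)" \
+    || { echo "FAIL: got $status, expected 206"; return 1; }
+}
+```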
+### 9. Database Verification
+
+```bash
+# Connect to Postgres
+kubectl exec -it -n porthole statefulset/porthole-postgres -- psql -U porthole -d porthole
+
+-- Check failed assets
+SELECT id, media_type, status, error_message, date_confidence FROM assets WHERE status = 'failed' LIMIT 10;
+
+-- Check assets without capture date (should not appear in timeline)
+SELECT COUNT(*) FROM assets WHERE capture_ts_utc IS NULL;
+
+-- Verify external originals not copied to canonical
+SELECT COUNT(*) FROM assets WHERE source_key LIKE 'originals/%' AND canonical_key IS NOT NULL;
+-- Should be 0
+```
+
+---
+
+## End-to-End Deployment Verification Checklist
+
+### Pre-Deployment
+
+- [ ] Label Pi 5 nodes: `kubectl label node <pi5-node-1> node-class=compute`
+- [ ] Label Pi 5 nodes: `kubectl label node <pi5-node-2> node-class=compute`
+- [ ] Verify Pi 3 has taint: `kubectl taint node <pi3-node> capacity=low:NoExecute`
+- [ ] Set `global.tailscale.tailnetFQDN` in values.yaml
+- [ ] Set secret values (postgres password, minio credentials)
+- [ ] Build and push multi-arch images to the registry
+
+### Deployment
+
+```bash
+# Install Helm chart
+helm install porthole helm/porthole -f values.yaml --namespace porthole --create-namespace
+
+# Wait for pods to be ready
+kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=porthole -n porthole --timeout=10m
+```
+
+### Post-Deployment Verification
+
+- [ ] All pods are running on Pi 5 nodes (check `kubectl get pods -n porthole -o wide`)
+- [ ] PVCs are created and bound (`kubectl get pvc -n porthole`)
+- [ ] Tailscale endpoints are accessible:
+  - [ ] `https://app.<tailnet-fqdn>` - web UI loads
+  - [ ] `https://minio.<tailnet-fqdn>` - MinIO S3 accessible (mc ls)
+  - [ ] `https://minio-console.<tailnet-fqdn>` - MinIO console loads
+- [ ] Presigned URLs use HTTPS and point to the tailnet hostname
+- [ ] Range requests return 206 Partial Content
+- [ ] Upload flow works: `/admin` → upload → asset appears in timeline
+- [ ] Scan flow works: trigger scan → `originals/` indexed → timeline populated
+- [ ] Failed assets show as placeholders without breaking the UI
+- [ ] Video playback works for supported codecs; poster shown for unsupported
+- [ ] Worker memory usage stays within the 2Gi limit during large file processing
+- [ ] No mixed-content warnings in the browser console
+
+### Performance Validation
+
+- [ ] Timeline tree loads and remains responsive
+- [ ] Zoom/pan works smoothly on mobile (test touch)
+- [ ] Video seeking works without stutter
+- [ ] Worker processes the queue without OOM
+- [ ] Postgres memory stays within 2Gi
+- [ ] MinIO memory stays within 2Gi
+
+---
+
+## High-Risk Areas Summary
+
+| Risk                                    | Impact                                  | Likelihood | Mitigation                                                           |
+| --------------------------------------- | --------------------------------------- | ---------- | -------------------------------------------------------------------- |
+| Pi 3 node receives heavy pod            | OOMKilled, cluster instability          | Very Low   | Required affinity + capacity=low:NoExecute taint prevent scheduling  |
+| Tailscale Ingress Range request issues  | Video seeking broken, poor UX           | Medium     | Enable `tailscaleServiceS3.enabled: true` for MinIO                  |
+| Worker OOM on large video processing    | Worker crashes, queue stalls            | Low        | Concurrency=1 already set; monitor memory during testing             |
+| MinIO presigned URL expiration          | Videos stop playing mid-session         | Low        | 900s TTL is reasonable; user can re-open viewer                      |
+| Staging files accumulate                | Disk fills up                           | Medium     | Enable `cleanupStaging.enabled: true`                                |
+| Missing error boundaries                | Component crashes show unhandled error  | Low        | Error boundaries now implemented                                     |
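+
+For context on the last row: the boundary lives in `apps/web/app/components/ErrorBoundary.tsx`. A minimal sketch of that kind of component (the actual implementation in this change may differ):
+
+```tsx
+"use client";
+import React from "react";
+
+type Props = { fallback?: React.ReactNode; children: React.ReactNode };
+type State = { error: Error | null };
+
+// Catches render errors from children (e.g. TimelineTree, MediaPanel)
+// so one failing panel cannot take down the whole page.
+export class ErrorBoundary extends React.Component<Props, State> {
+  state: State = { error: null };
+
+  static getDerivedStateFromError(error: Error): State {
+    return { error };
+  }
+
+  componentDidCatch(error: Error, info: React.ErrorInfo) {
+    console.error("ErrorBoundary caught:", error, info.componentStack);
+  }
+
+  render() {
+    if (this.state.error) {
+      return (
+        this.props.fallback ?? (
+          <div role="alert">
+            <p>Something went wrong rendering this panel.</p>
+            <button onClick={() => this.setState({ error: null })}>Retry</button>
+          </div>
+        )
+      );
+    }
+    return this.props.children;
+  }
+}
+```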
+
+---
+
+## Next Steps
+
+1. **Update node affinity** to `required` for the compute class (or add anti-affinity for Pi 3)
+2. **Enable the Tailscale LoadBalancer service** for MinIO S3 for reliable Range requests
+3. **Enable the cleanup CronJob** after initial testing: `--set cronjobs.cleanupStaging.enabled=true`
+4. **Deploy to the cluster** and run the validation commands
+5. **Perform end-to-end testing** with real media (upload + scan)
+6. **Monitor resource usage** during typical operations to confirm the limits are appropriate
diff --git a/PLAN.md b/PLAN.md
index 4a3f5fd..a29a101 100644
--- a/PLAN.md
+++ b/PLAN.md
@@ -36,12 +36,14 @@ This plan is written to be executed by multiple subagents (parallelizable workst
 ## Key Decisions (Locked)
 
 ### App identity
+
 - App name: `porthole`
 - Set the app name via environment variable: `APP_NAME=porthole`.
 - Use `APP_NAME` everywhere (web + worker) via the shared config module so renaming is global.
 - If the UI needs to display the name in the browser, also provide `NEXT_PUBLIC_APP_NAME` (either set explicitly or derived at build time from `APP_NAME`).
 
 ### Networking
+
 - Tailnet clients access the app via **Tailscale Ingress HTTPS termination**.
 - MinIO is reachable **over tailnet** via a dedicated FQDN:
   - `https://minio.<tailnet-fqdn>` (S3 API)
@@ -51,6 +53,7 @@ This plan is written to be executed by multiple subagents (parallelizable workst
 - Optional LAN ingress exists using `nip.io` and nginx ingress, but tailnet clients use Tailscale hostnames.
 
 ### Storage model
+
 - **MinIO is the source of truth**.
 - External archive objects under **`originals/`** are treated as **immutable**:
   - The app **indexes in place**.
@@ -60,20 +63,24 @@ This plan is written to be executed by multiple subagents (parallelizable workst
 - Uploads are processed then stored in canonical by default.
 
 ### Presigned URL strategy
+
 - Use **path-style presigned URLs** signed against:
   - `MINIO_PUBLIC_ENDPOINT_TS=https://minio.<tailnet-fqdn>`
 - Using HTTPS for MinIO on tailnet avoids mixed-content blocking when the app is served via HTTPS.
 
 ### Kubernetes constraints
+
 - Cluster nodes: **2× Raspberry Pi 5 (8GB)** + **1× Raspberry Pi 3 B+ (1GB)**.
 - Heavy pods must be pinned to Pi 5 nodes.
 - Multi-arch images required (arm64 + amd64), built on a laptop and pushed to an in-cluster **insecure HTTP registry**.
 
 ### Metadata extraction
+
 - **Photos**: camera-like EXIF first (`DateTimeOriginal`), then fallbacks.
 - **Videos**: camera-like tags first (ExifTool QuickTime/vendor tags), fallback to the universal container `creation_time`.
 
 ### Derived media
+
 - Image thumbs: `image_256.jpg` and `image_768.jpg`.
 - Video posters: only `poster_256.jpg` initially (CPU-friendly).
 
@@ -82,6 +89,7 @@ This plan is written to be executed by multiple subagents (parallelizable workst
 ## Architecture
 
 ### Components
+
 - **Web**: Next.js (UI + API)
 - **Worker**: Node worker using BullMQ
 - **Queue**: Redis
@@ -89,6 +97,7 @@ This plan is written to be executed by multiple subagents (parallelizable workst
 - **Object store**: MinIO (in-cluster, single-node)
 
 ### Data flow
+
 1. Ingestion (upload or scan) creates/updates DB asset records.
 2. Worker extracts metadata and generates thumbs/posters.
 3. UI queries aggregated timeline nodes and displays a tree.
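+
+A minimal sketch of step 2 as a BullMQ worker (queue name and handler names are illustrative; concurrency 1 matches the Pi-hardware guidance elsewhere in this plan):
+
+```ts
+import { Worker } from "bullmq";
+import IORedis from "ioredis";
+
+// BullMQ requires maxRetriesPerRequest: null on its Redis connection.
+const connection = new IORedis(process.env.REDIS_URL!, { maxRetriesPerRequest: null });
+
+// Illustrative stubs; the real handlers live in the worker package.
+async function scanMinioPrefix(data: unknown) { /* list originals/, upsert assets */ }
+async function processAsset(data: unknown) { /* exiftool/ffmpeg, upload derived media */ }
+
+// One job at a time keeps ffmpeg/exiftool memory inside the worker's 2Gi limit.
+const worker = new Worker(
+  process.env.QUEUE_NAME ?? "porthole",
+  async (job) => {
+    if (job.name === "scan_minio_prefix") return scanMinioPrefix(job.data);
+    if (job.name === "process_asset") return processAsset(job.data);
+  },
+  { connection, concurrency: 1 }
+);
+
+worker.on("failed", (job, err) => {
+  // Failures are recorded per asset; the loop itself keeps running.
+  console.error(`job ${job?.id} failed:`, err.message);
+});
+```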
@@ -146,6 +155,7 @@ Example bucket: `media`.
 - `raw_tags_json` (jsonb, optional but recommended for debugging)
 
 Indexes:
+
 - `capture_ts_utc`, `status`, `media_type`
 
 ### Table: `imports`
@@ -161,11 +171,13 @@ Indexes:
 ## Worker Jobs (BullMQ)
 
 ### `scan_minio_prefix(importId, bucket, prefix)`
+
 - Guardrails: only allow prefixes from the allowlist, starting with `originals/`.
 - Lists objects; upserts `assets` by `source_key`.
 - Enqueues `process_asset(assetId)`.
 
 ### `process_asset(assetId)`
+
 - Downloads the object (stream or temp file).
 - Extracts metadata:
   - Photos: ExifTool EXIF chain.
@@ -177,6 +189,7 @@ Indexes:
 - Never throws errors that would crash the worker loop; failures are captured on the asset row.
 
 ### `copy_to_canonical(assetId)`
+
 - Computes the canonical key: `canonical/originals/YYYY/MM/DD/{assetId}.{origExt}`.
 - Copy-only; never deletes `source_key` for the external archive.
 - Updates `canonical_key` and flips `active_key`.
@@ -186,12 +199,14 @@ Indexes:
 ## API (MVP)
 
 ### Admin ingestion
+
 - `POST /api/imports` → create import batch
 - `POST /api/imports/:id/upload` → upload media to `staging/` and enqueue processing
 - `POST /api/imports/:id/scan-minio` → enqueue scan of allowlisted prefix
 - `GET /api/imports/:id/status` → progress
 
 ### Timeline and browsing
+
 - `GET /api/tree`
   - params: `start`, `end`, `granularity=year|month|day`, filters: `mediaType`
   - returns nodes with counts and sample thumbs
@@ -205,10 +220,12 @@ Indexes:
 ## Frontend UX/UI (MVP)
 
 ### Pages
+
 - `/` Timeline tree
 - `/admin` Admin tools (upload, scan, import status)
 
 ### Timeline tree
+
 - SVG tree rendering with:
   - Vertical/horizontal orientation toggle.
   - Zoom/pan (touch supported).
@@ -219,11 +236,13 @@ Indexes:
 - Virtualized thumbnail list.
 
 ### Viewer
+
 - Image viewer modal.
 - Video playback via HTML5 `