harden(deploy): apply safe fixes from review report-only items

- #3 Liveness probe targets full SSR DB-querying / route, coupling pod liveness to SQLite
- #4 Chart values-staging/production.yaml are dead config under Flux; drift trap
- #6 tsconfig includes gitignored emdash-env.d.ts that only the dev server generates
- #7 Dockerfile package-lock glob + npm install fallback can silently build an unlocked image
- #8 Dockerfile creates runtime user without pinning its GID
- #9 entrypoint.sh gates `emdash init` on data.db absence, skipping migrations on PVC reuse
- #10 pullPolicy: Always vs digest pinning
- #11 Dockerfile state symlinks contradict the STATE_DIR contract; Dockerfile does not set ENV STATE_DIR
- #12 astro is a production dependency, so npm prune --omit=dev keeps build-only tooling
- #14 Two ImageUpdateAutomations write back to the same anton-helm-workloads main branch
- #16 memoryCache provider is per-process; correctness depends implicitly on replicas:1
- #17 Root catch-all [slug].astro couples nav links to pages-collection rows + DB hit per unmatched path
- #18 Detail pages render a 200-style body under a 404 status and have no try/catch around getEmDash* calls
- #19 vite allowedHosts hardcodes ddev hostnames (dev-only; no prod impact)
2026-06-02 04:50:54 +03:00
parent 0c2cea8c25
commit 8c119efff8
15 changed files with 157 additions and 31 deletions
@@ -50,3 +50,32 @@ The HelmRelease itself lives in the workloads repo because that repo is
 the source of truth for what runs on the kotkanagrilli.fi subdomain
 pool. Same convention as the existing `kotkanagrilli/` (legacy WP) and
 `hello-kotkan/` entries there.
+
+## Why two image automations share one branch
+
+Both `cms-plugins-staging` and `cms-plugins-production` define an
+`ImageUpdateAutomation` that checks out, commits to, and pushes the
+**same** `main` branch of `anton-helm-workloads` on the same `interval: 1m`.
+This is intentional and safe:
+
+- Each automation is scoped to a disjoint `update.path`
+  (`./cms-plugins-staging` vs `./cms-plugins-production`), so they only ever
+  rewrite the digest setter inside their *own* `helmrelease.yaml`. They
+  never touch the same file.
+- `strategy: Setters` rewrites only the explicitly marked digest setter, not
+  arbitrary YAML — there is no whole-file regeneration that could clobber a
+  sibling's change.
+- The image-automation-controller serializes its git pushes and retries on
+  a non-fast-forward rejection, so two automations landing commits on `main`
+  in the same reconcile window resolve cleanly rather than racing.
+
+This mirrors the per-env automations under
+`~/projects/servers/fleet/apps/base/` for `emdash-kotkanagrilli-*`. The
+only deviation (justified in `image-automation.yaml`) is that these reuse
+the read-side `anton-helm-workloads` `GitRepository` as the write-back
+`sourceRef` instead of a dedicated image-automation source, because these
+workloads live in that same repo.
+
+Note for go-live: nothing here reconciles while the HelmReleases are
+`suspend: true` (Phase 0). These automations only begin writing back once
+the releases are deliberately resumed.
@@ -32,6 +32,8 @@ spec:
      # change when CI retags the floating `production` tag.
      tag: production
      digest: "" # {"$imagepolicy": "kotkan:cms-plugins-production:digest"}
+      # The image is pinned by `digest` above, so pullPolicy is effectively a
+      # no-op (a digest is immutable); kept as Always to match the chart default.
      pullPolicy: Always
    ingress:
      host: cms-plugins-production.kotkanagrilli.fi
@@ -32,6 +32,8 @@ spec:
      # change when CI retags the floating `staging` tag.
      tag: staging
      digest: "" # {"$imagepolicy": "kotkan:cms-plugins-staging:digest"}
+      # The image is pinned by `digest` above, so pullPolicy is effectively a
+      # no-op (a digest is immutable); kept as Always to match the chart default.
      pullPolicy: Always
    ingress:
      host: cms-plugins-staging.kotkanagrilli.fi
@@ -69,6 +69,7 @@ spec:
            initialDelaySeconds: {{ .Values.probes.liveness.initialDelaySeconds }}
            periodSeconds: {{ .Values.probes.liveness.periodSeconds }}
            timeoutSeconds: {{ .Values.probes.liveness.timeoutSeconds }}
+            failureThreshold: {{ .Values.probes.liveness.failureThreshold }}
          readinessProbe:
            httpGet:
              path: {{ .Values.probes.readiness.path }}
@@ -76,6 +77,7 @@ spec:
            initialDelaySeconds: {{ .Values.probes.readiness.initialDelaySeconds }}
            periodSeconds: {{ .Values.probes.readiness.periodSeconds }}
            timeoutSeconds: {{ .Values.probes.readiness.timeoutSeconds }}
+            failureThreshold: {{ .Values.probes.readiness.failureThreshold }}
          resources:
            {{- toYaml .Values.resources | nindent 12 }}
      volumes:
@@ -1,5 +1,10 @@
-# Production overrides — applied via the FluxCD HelmRelease (or directly with
-# `helm upgrade -f values-production.yaml`).
+# Production overrides for DIRECT `helm upgrade -f values-production.yaml` use only.
+#
+# IMPORTANT: FluxCD does NOT read this file. The live production deploy is driven
+# solely by the inline `spec.values:` block in
+# deploy/fleet-overlay/cms-plugins-production/helmrelease.yaml (copied into
+# anton-helm-workloads). Editing values here has NO effect on the cluster.
+# Keep this file in sync with that HR `values:` block by hand, or it will drift.

 image:
  tag: production-latest
@@ -1,5 +1,10 @@
-# Staging overrides — applied via the FluxCD HelmRelease (or directly with
-# `helm upgrade -f values-staging.yaml`).
+# Staging overrides for DIRECT `helm upgrade -f values-staging.yaml` use only.
+#
+# IMPORTANT: FluxCD does NOT read this file. The live staging deploy is driven
+# solely by the inline `spec.values:` block in
+# deploy/fleet-overlay/cms-plugins-staging/helmrelease.yaml (copied into
+# anton-helm-workloads). Editing values here has NO effect on the cluster.
+# Keep this file in sync with that HR `values:` block by hand, or it will drift.

 image:
  tag: staging-latest
@@ -1,13 +1,21 @@
 # Defaults for the cms-plugins chart.
-# Per-env overrides come from values-staging.yaml / values-production.yaml
-# and from the FluxCD HelmRelease's `values:` block.
+# Per-env overrides: Flux applies ONLY the HelmRelease `values:` block.
+# values-staging.yaml / values-production.yaml are for direct `helm upgrade -f`
+# use and are NOT read by Flux — keep them in sync with the HR by hand.

 image:
  repository: git.oleks.space/oleks/cms-plugins
  tag: develop-latest
-  # The tag is a mutable floating pointer (CI retags <branch>-latest onto
-  # each new build), so kubelet must always re-pull — IfNotPresent would
-  # pin the node to whatever digest it cached first and never roll.
+  # `Always` is here for the chart-default FLOATING-TAG path: with no
+  # `digest` set, the image renders as `repository:<branch>-latest`
+  # (a mutable pointer CI retags onto each build), so kubelet must
+  # re-pull or it would pin to the first cached digest and never roll.
+  # NOTE: the deployed overlays pin by `digest` (repository@sha256:…),
+  # where a CI retag instead changes the pinned digest and therefore the
+  # image *reference string*, so `helm upgrade` already detects it and
+  # `Always` is a no-op (a digest is content-addressed — it can never
+  # resolve to different bytes).
+  # `IfNotPresent` would be marginally better on the digest path but is
+  # left as `Always` so both render paths share one safe value.
  pullPolicy: Always

 service:
@@ -57,17 +65,27 @@ imagePullSecrets:
 probes:
  liveness:
    # /_emdash/api/health requires auth (401 to unauthenticated requests),
-    # so kubelet probes fail and the pod gets killed. The site root is
-    # public and a 200 from it is a reasonable proxy for "the server is up".
+    # so we probe the public site root instead. But `/` is server-rendered
+    # and queries SQLite content, so a content/render or DB fault makes it
+    # 500 while the Node process is perfectly alive. Liveness must NOT
+    # crash-loop the single SQLite replica over a transient content/DB
+    # error: keep failureThreshold high so a restart needs sustained
+    # failures (failureThreshold × periodSeconds, i.e. 6 × 30s ≈ 3 min
+    # with the values below) rather than one bad response. Readiness
+    # (below) is what sheds traffic on a content 500.
    path: /
    initialDelaySeconds: 30
    periodSeconds: 30
    timeoutSeconds: 5
+    failureThreshold: 6
  readiness:
+    # Probe the public site root. A content/render 500 here removes the pod
+    # from Endpoints (stops serving 500s) WITHOUT the kubelet killing the
+    # process — readiness failures never restart the container.
    path: /
    initialDelaySeconds: 5
    periodSeconds: 10
    timeoutSeconds: 5
+    failureThreshold: 3

 resources:
  requests: