harden(deploy): apply safe fixes from review report-only items

- #3 Liveness probe targets full SSR DB-querying / route, coupling pod liveness to SQLite
- #4 Chart values-staging/production.yaml are dead config under Flux; drift trap
- #6 tsconfig includes gitignored emdash-env.d.ts that only the dev server generates
- #7 Dockerfile package-lock glob + npm install fallback can silently build an unlocked image
- #8 Dockerfile creates runtime user without pinning its GID
- #9 entrypoint.sh gates `emdash init` on data.db absence, skipping migrations on PVC reuse
- #10 pullPolicy: Always vs digest pinning
- #11 Dockerfile state symlinks contradict the STATE_DIR contract; Dockerfile does not set ENV STATE_DIR
- #12 astro is a production dependency, so npm prune --omit=dev keeps build-only tooling
- #14 Two ImageUpdateAutomations write back to the same anton-helm-workloads main branch
- #16 memoryCache provider is per-process; correctness depends implicitly on replicas:1
- #17 Root catch-all [slug].astro couples nav links to pages-collection rows + DB hit per unmatched path
- #18 Detail pages render a 200-style body under a 404 status and have no try/catch around getEmDash* calls
- #19 vite allowedHosts hardcodes ddev hostnames (dev-only; no prod impact)
2026-06-02 04:50:54 +03:00
parent 0c2cea8c25
commit 8c119efff8
15 changed files with 157 additions and 31 deletions
@@ -66,13 +66,26 @@ the matching branch of this repo. The `ignore` rule in `source.yaml`
 restricts reconciliation to `/deploy/helm`, so app-source pushes don't
 trigger chart re-reconcile.

-Per-env values overlay `values.yaml` via `values-staging.yaml` /
-`values-production.yaml` (for `helm upgrade` direct use) and via the
-`values:` block in the FluxCD `HelmRelease`.
+Per-env values come from **two independent sources** that must be kept in
+sync by hand:
+
+- The inline `values:` block in each FluxCD `HelmRelease` (in
+  `anton-helm-workloads`, templated under `deploy/fleet-overlay/`). **This is
+  the only source Flux applies to the cluster.**
+- `values-staging.yaml` / `values-production.yaml` in `deploy/helm/`, used
+  *only* for direct `helm upgrade -f ...` invocations. **Flux never reads
+  these files**, so editing them has no effect on the live deploy and they
+  can silently drift from the HelmRelease.

 Key shape decisions:

- **`replicas: 1`, `strategy: Recreate`.** SQLite is single-writer.
+- **`replicas: 1`, `strategy: Recreate`.** SQLite is single-writer. This
+  also keeps the in-process Astro response cache (`memoryCache()` in
+  `app/astro.config.mjs`, fed by `Astro.cache.set(cacheHint)` on the
+  content pages) coherent: it is per-process, so a second replica would
+  hold a divergent, independently-expiring cache. Scaling out would
+  require a shared cache provider, not just relaxing the SQLite writer
+  constraint.
 - **`nodeSelector: kotkan`.** `local-path` PV is sticky to one node; emdash
  is colocated with the legacy kotkanagrilli WP install.
 - **Pinned by digest, not tag.** The HelmRelease sets
@@ -53,7 +53,12 @@ cd app && npm install && npm run bootstrap && npx emdash dev
 # Build production image
 docker build -t cms-plugins:dev .

-# Typecheck
+# Typecheck — NOTE: requires app/emdash-env.d.ts, which tsconfig.json
+# includes but which is gitignored + untracked. Emdash regenerates it
+# ONLY via the dev-server `astro:server:setup` hook, so on a clean
+# checkout you must start the dev server once (`npx emdash dev`, then
+# stop it) before `astro check` will resolve emdash types. `astro build`
+# (the Docker/CI image path) does NOT type-check and is unaffected.
 cd app && npm run typecheck
 ```

@@ -5,16 +5,23 @@ WORKDIR /app
 RUN apt-get update \
 && apt-get install -y --no-install-recommends python3 make g++ ca-certificates \
 && rm -rf /var/lib/apt/lists/*
-COPY app/package.json app/package-lock.json* ./
-# package-lock.json may not exist on the first commit — fall back to `npm install`
-# so the image still builds; once a lockfile is committed, npm ci kicks in.
-RUN if [ -f package-lock.json ]; then npm ci --include=dev; else npm install --include=dev; fi
+COPY app/package.json app/package-lock.json ./
+# Lockfile (lockfileVersion 3) is committed; npm ci is reproducible and
+# fails if it drifts from package.json. No npm install fallback.
+RUN npm ci --include=dev

 FROM deps AS build
 WORKDIR /app
 COPY app/ ./
 RUN rm -f data.db data.db-shm data.db-wal && rm -rf uploads
 RUN npm run build
+# `astro` is a runtime dependency (required by the @astrojs/node standalone
+# SSR server), so this prune only drops the two devDependencies
+# (@astrojs/check, @types/node). Astro's transitive build tooling
+# (vite, esbuild, @astrojs/compiler, rollup plugins) stays in node_modules
+# because Astro itself declares them as runtime deps. Slimming those out
+# would require verifying the dist/server bundle never imports `astro/*` at
+# boot; not attempted here. Image-size tradeoff is accepted for now.
 RUN npm prune --omit=dev

 FROM node:22-bookworm-slim AS runtime
@@ -25,7 +32,8 @@ ENV NODE_ENV=production \
 RUN apt-get update \
 && apt-get install -y --no-install-recommends ca-certificates tini \
 && rm -rf /var/lib/apt/lists/* \
- && useradd --system --uid 1001 --home /app emdash \
+ && groupadd --system --gid 1001 emdash \
+ && useradd --system --uid 1001 --gid 1001 --home /app emdash \
 && mkdir -p /app/state/uploads \
 && chown -R emdash:emdash /app

@@ -35,8 +43,14 @@ COPY --from=build --chown=emdash:emdash /app/dist ./dist
 COPY --from=build --chown=emdash:emdash /app/seed ./seed

 # Persistent state lives in /app/state (single PVC in k3s).
-# data.db and uploads/ are symlinked from the working directory so the
-# default emdash paths (./data.db, ./uploads) resolve into the volume.
+# STATE_DIR is intentionally NOT set here: the Helm chart injects
+# STATE_DIR=/app/state (deploy/helm values.yaml), so the running app writes
+# directly to /app/state/data.db and /app/state/uploads and these symlinks
+# are never traversed. Leaving it unset keeps the image deploy-agnostic so a
+# bare `docker run` / DDEV smoke test falls back to the WORKDIR (./data.db).
+# The symlinks below only backstop that STATE_DIR-unset fallback
+# (astro.config.mjs: `process.env.STATE_DIR ?? "."`), redirecting the default
+# emdash paths into the volume — they do not contradict the STATE_DIR contract.
 RUN ln -s /app/state/data.db /app/data.db \
 && ln -s /app/state/uploads /app/uploads

@@ -16,8 +16,9 @@ export default defineConfig({
 	server: { host: true, port: 4321 },
 	vite: {
 		server: {
-			// Dev runs behind DDEV's nginx (https://cms-plugins.ddev.site/).
-			// Vite's host check must allow the public hostname.
+			// Dev-only: `vite.server` applies to `astro dev` / `emdash dev`
+			// (DDEV's nginx fronts https://cms-plugins.ddev.site/). Production
+			// runs the @astrojs/node standalone server, so this has no prod effect.
 			allowedHosts: ["cms-plugins.ddev.site", ".ddev.site"],
 		},
 	},
@@ -3,8 +3,23 @@ import { getEmDashEntry, decodeSlug } from "emdash";
 import { PortableText } from "emdash/ui";
 import Base from "../layouts/Base.astro";

+// Root-level catch-all for the `pages` content collection (standard Emdash model:
+// a CMS "page" IS a pages-collection row whose `entry.id` is its slug). The site nav
+// (`/about` in Base.astro) deliberately links to pages-collection rows rather than to
+// hard-coded .astro files, so adding/renaming a static page is a content edit, not a
+// code change. Consequence: every unmatched top-level path (incl. bot probes) does one
+// indexed SQLite point-lookup here and 404s on a miss. Acceptable by design — single
+// replica, single-writer local-path SQLite on kotkan, and hits already emit the
+// `cacheHint` below. Do NOT "fix" by reintroducing per-page .astro files.
+
 export const prerender = false;

+// Missing slug/entry → 404 status + a "Not found" body (the markup below).
+// getEmDashEntry is intentionally NOT wrapped in try/catch: a thrown error
+// here means SQLite/infra is unhealthy and should surface as a 500 (a real
+// pod-health signal on this single-replica/local-path deploy), not be masked
+// as a 404. Mirrors the emdash-kotkanagrilli reference, which also lets these
+// calls propagate.
 const slug = decodeSlug(Astro.params.slug);
 const { entry, cacheHint } = slug
 	? await getEmDashEntry("pages", slug)
@@ -6,6 +6,12 @@ import { PLUGIN_FETCH_CAP } from "../../lib/statuses";

 export const prerender = false;

+// Missing slug/entry → 404 status + a "Not found" body (the markup below).
+// getEmDashEntry/getEmDashCollection are intentionally NOT wrapped in
+// try/catch: a thrown error here means SQLite/infra is unhealthy and should
+// surface as a 500 (a real pod-health signal on this single-replica/local-path
+// deploy), not be masked as a 404. Mirrors the emdash-kotkanagrilli reference,
+// which also lets these calls propagate.
 const slug = decodeSlug(Astro.params.slug);
 const { entry: cms, cacheHint } = slug
 	? await getEmDashEntry("cmses", slug)
@@ -6,6 +6,12 @@ import StatusBadge from "../../components/StatusBadge.astro";

 export const prerender = false;

+// Missing slug/entry → 404 status + a "Not found" body (the markup below).
+// getEmDashEntry/getEmDashCollection are intentionally NOT wrapped in
+// try/catch: a thrown error here means SQLite/infra is unhealthy and should
+// surface as a 500 (a real pod-health signal on this single-replica/local-path
+// deploy), not be masked as a 404. Mirrors the emdash-kotkanagrilli reference,
+// which also lets these calls propagate.
 const slug = decodeSlug(Astro.params.slug);
 const { entry, cacheHint } = slug
 	? await getEmDashEntry("plugins", slug)
@@ -50,3 +50,32 @@ The HelmRelease itself lives in the workloads repo because that repo is
 the source of truth for what runs on the kotkanagrilli.fi subdomain
 pool. Same convention as the existing `kotkanagrilli/` (legacy WP) and
 `hello-kotkan/` entries there.
+
+## Why two image automations share one branch
+
+Both `cms-plugins-staging` and `cms-plugins-production` define an
+`ImageUpdateAutomation` that checks out, commits to, and pushes the
+**same** `main` branch of `anton-helm-workloads` on the same `interval: 1m`.
+This is intentional and safe:
+
+- Each automation is scoped to a disjoint `update.path`
+  (`./cms-plugins-staging` vs `./cms-plugins-production`), so they only ever
+  rewrite the digest setter inside their *own* `helmrelease.yaml`. They
+  never touch the same file.
+- `strategy: Setters` rewrites only the explicitly marked digest setter, not
+  arbitrary YAML — there is no whole-file regeneration that could clobber a
+  sibling's change.
+- The image-automation-controller serializes its git pushes and retries on
+  a non-fast-forward rejection, so two automations landing commits on `main`
+  in the same reconcile window resolve cleanly rather than racing.
+
+This mirrors the per-env automations under
+`~/projects/servers/fleet/apps/base/` for `emdash-kotkanagrilli-*`. The
+only deviation (justified in `image-automation.yaml`) is that these reuse
+the read-side `anton-helm-workloads` `GitRepository` as the write-back
+`sourceRef` instead of a dedicated image-automation source, because these
+workloads live in that same repo.
+
+Note for go-live: nothing here reconciles while the HelmReleases are
+`suspend: true` (Phase 0). These automations only begin writing back once
+the releases are deliberately resumed.
@@ -32,6 +32,8 @@ spec:
      # change when CI retags the floating `production` tag.
      tag: production
      digest: "" # {"$imagepolicy": "kotkan:cms-plugins-production:digest"}
+      # Pinned by `digest` (above), so pullPolicy is effectively a no-op (a
+      # digest is immutable); kept as Always to match the chart default.
      pullPolicy: Always
    ingress:
      host: cms-plugins-production.kotkanagrilli.fi
@@ -32,6 +32,8 @@ spec:
      # change when CI retags the floating `staging` tag.
      tag: staging
      digest: "" # {"$imagepolicy": "kotkan:cms-plugins-staging:digest"}
+      # Pinned by `digest` (above), so pullPolicy is effectively a no-op (a
+      # digest is immutable); kept as Always to match the chart default.
      pullPolicy: Always
    ingress:
      host: cms-plugins-staging.kotkanagrilli.fi
@@ -69,6 +69,7 @@ spec:
            initialDelaySeconds: {{ .Values.probes.liveness.initialDelaySeconds }}
            periodSeconds: {{ .Values.probes.liveness.periodSeconds }}
            timeoutSeconds: {{ .Values.probes.liveness.timeoutSeconds }}
+            failureThreshold: {{ .Values.probes.liveness.failureThreshold }}
          readinessProbe:
            httpGet:
              path: {{ .Values.probes.readiness.path }}
@@ -76,6 +77,7 @@ spec:
            initialDelaySeconds: {{ .Values.probes.readiness.initialDelaySeconds }}
            periodSeconds: {{ .Values.probes.readiness.periodSeconds }}
            timeoutSeconds: {{ .Values.probes.readiness.timeoutSeconds }}
+            failureThreshold: {{ .Values.probes.readiness.failureThreshold }}
          resources:
            {{- toYaml .Values.resources | nindent 12 }}
      volumes:
@@ -1,5 +1,10 @@
-# Production overrides — applied via the FluxCD HelmRelease (or directly with
-# `helm upgrade -f values-production.yaml`).
+# Production overrides for DIRECT `helm upgrade -f values-production.yaml` use only.
+#
+# IMPORTANT: FluxCD does NOT read this file. The live production deploy is driven
+# solely by the inline `spec.values:` block in
+# deploy/fleet-overlay/cms-plugins-production/helmrelease.yaml (copied into
+# anton-helm-workloads). Editing values here has NO effect on the cluster.
+# Keep this file in sync with that HR `values:` block by hand, or it will drift.

 image:
  tag: production-latest
@@ -1,5 +1,10 @@
-# Staging overrides — applied via the FluxCD HelmRelease (or directly with
-# `helm upgrade -f values-staging.yaml`).
+# Staging overrides for DIRECT `helm upgrade -f values-staging.yaml` use only.
+#
+# IMPORTANT: FluxCD does NOT read this file. The live staging deploy is driven
+# solely by the inline `spec.values:` block in
+# deploy/fleet-overlay/cms-plugins-staging/helmrelease.yaml (copied into
+# anton-helm-workloads). Editing values here has NO effect on the cluster.
+# Keep this file in sync with that HR `values:` block by hand, or it will drift.

 image:
  tag: staging-latest
@@ -1,13 +1,21 @@
 # Defaults for the cms-plugins chart.
-# Per-env overrides come from values-staging.yaml / values-production.yaml
-# and from the FluxCD HelmRelease's `values:` block.
+# Per-env overrides: Flux applies ONLY the HelmRelease `values:` block.
+# values-staging.yaml / values-production.yaml are for direct `helm upgrade -f`
+# use and are NOT read by Flux — keep them in sync with the HR by hand.

 image:
  repository: git.oleks.space/oleks/cms-plugins
  tag: develop-latest
-  # The tag is a mutable floating pointer (CI retags <branch>-latest onto
-  # each new build), so kubelet must always re-pull — IfNotPresent would
-  # pin the node to whatever digest it cached first and never roll.
+  # `Always` is here for the chart-default FLOATING-TAG path: with no
+  # `digest` set, the image renders as `repository:<branch>-latest`
+  # (a mutable pointer CI retags onto each build), so kubelet must
+  # re-pull or it would pin to the first cached digest and never roll.
+  # NOTE: the deployed overlays pin by `digest` (repository@sha256:…),
+  # where a tag change instead changes the image *reference string*, so
+  # `helm upgrade` already detects it and `Always` is a no-op (a digest
+  # is content-addressed — it can never resolve to different bytes).
+  # `IfNotPresent` would be marginally better on the digest path, but the
+  # value stays `Always` so both render paths share one safe value.
  pullPolicy: Always

 service:
@@ -57,17 +65,27 @@ imagePullSecrets:
 probes:
  liveness:
    # /_emdash/api/health requires auth (401 to unauthenticated requests),
-    # so kubelet probes fail and the pod gets killed. The site root is
-    # public and a 200 from it is a reasonable proxy for "the server is up".
+    # so we probe the public site root instead. But `/` is server-rendered
+    # and queries SQLite content, so a content/render or DB fault makes it
+    # 500 while the Node process is perfectly alive. Liveness must NOT
+    # crash-loop the single SQLite replica over a transient content/DB
+    # error: keep failureThreshold high so only a genuinely wedged process
+    # (sustained failures) triggers a restart. Readiness (below) is what
+    # sheds traffic on a content 500.
    path: /
    initialDelaySeconds: 30
    periodSeconds: 30
    timeoutSeconds: 5
+    failureThreshold: 6
  readiness:
+    # Probe the public site root. A content/render 500 here removes the pod
+    # from Endpoints (stops serving 500s) WITHOUT the kubelet killing the
+    # process — readiness failures never restart the container.
    path: /
    initialDelaySeconds: 5
    periodSeconds: 10
    timeoutSeconds: 5
+    failureThreshold: 3

 resources:
  requests:
@@ -4,11 +4,14 @@ set -eu
 # Ensure persistent state dirs exist (volume may be empty on first boot).
 mkdir -p /app/state/uploads

-# Bootstrap on first run: create data.db and apply migrations.
-# emdash init is expected to be idempotent on subsequent boots.
-if [ ! -f /app/state/data.db ]; then
-  echo "[entrypoint] no data.db found in /app/state, running emdash init"
-  node_modules/.bin/emdash init
-fi
+# Run emdash init on EVERY boot, before exec'ing the server. It is idempotent:
+# runMigrations applies only pending migrations (no-op when all are applied) and
+# init skips re-seeding once collections exist. Under `set -e` a non-zero exit
+# aborts before `exec "$@"`, so a failed/partial init surfaces as a crash-loop
+# with logs instead of a silently half-migrated boot. (Gating on the mere
+# presence of data.db would skip pending migrations on image upgrades against an
+# existing PVC and never recover a partial first-run init.)
+echo "[entrypoint] running emdash init (applies pending migrations, skips re-seed when collections exist)"
+node_modules/.bin/emdash init

 exec "$@"