diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index c072700..9dbd15e 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -66,13 +66,26 @@ the matching branch of this repo. The `ignore` rule in `source.yaml` restricts reconciliation to `/deploy/helm`, so app-source pushes don't trigger chart re-reconcile. -Per-env values overlay `values.yaml` via `values-staging.yaml` / -`values-production.yaml` (for `helm upgrade` direct use) and via the -`values:` block in the FluxCD `HelmRelease`. +Per-env values come from **two independent sources** that must be kept in +sync by hand: + +- The inline `values:` block in each FluxCD `HelmRelease` (in + `anton-helm-workloads`, templated under `deploy/fleet-overlay/`). **This is + the only source Flux applies to the cluster.** +- `values-staging.yaml` / `values-production.yaml` in `deploy/helm/`, used + *only* for direct `helm upgrade -f ...` invocations. **Flux never reads + these files**, so editing them has no effect on the live deploy and they + can silently drift from the HelmRelease. Key shape decisions: -- **`replicas: 1`, `strategy: Recreate`.** SQLite is single-writer. +- **`replicas: 1`, `strategy: Recreate`.** SQLite is single-writer. This + also keeps the in-process Astro response cache (`memoryCache()` in + `app/astro.config.mjs`, fed by `Astro.cache.set(cacheHint)` on the + content pages) coherent: it is per-process, so a second replica would + hold a divergent, independently-expiring cache. Scaling out would + require a shared cache provider, not just relaxing the SQLite writer + constraint. - **`nodeSelector: kotkan`.** `local-path` PV is sticky to one node; emdash is colocated with the legacy kotkanagrilli WP install. - **Pinned by digest, not tag.** The HelmRelease sets diff --git a/CLAUDE.md b/CLAUDE.md index 92e556d..f20b6e2 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -53,7 +53,12 @@ cd app && npm install && npm run bootstrap && npx emdash dev # Build production image docker build -t cms-plugins:dev . 
-# Typecheck +# Typecheck — NOTE: requires app/emdash-env.d.ts, which tsconfig.json +# includes but which is gitignored + untracked. Emdash regenerates it +# ONLY via the dev-server `astro:server:setup` hook, so on a clean +# checkout you must start the dev server once (`npx emdash dev`, then +# stop it) before `astro check` will resolve emdash types. `astro build` +# (the Docker/CI image path) does NOT type-check and is unaffected. cd app && npm run typecheck ``` diff --git a/Dockerfile b/Dockerfile index 0998923..98072cb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,16 +5,23 @@ WORKDIR /app RUN apt-get update \ && apt-get install -y --no-install-recommends python3 make g++ ca-certificates \ && rm -rf /var/lib/apt/lists/* -COPY app/package.json app/package-lock.json* ./ -# package-lock.json may not exist on the first commit — fall back to `npm install` -# so the image still builds; once a lockfile is committed, npm ci kicks in. -RUN if [ -f package-lock.json ]; then npm ci --include=dev; else npm install --include=dev; fi +COPY app/package.json app/package-lock.json ./ +# Lockfile (lockfileVersion 3) is committed; npm ci is reproducible and +# fails if it drifts from package.json. No npm install fallback. +RUN npm ci --include=dev FROM deps AS build WORKDIR /app COPY app/ ./ RUN rm -f data.db data.db-shm data.db-wal && rm -rf uploads RUN npm run build +# `astro` is a runtime dependency (required by the @astrojs/node standalone +# SSR server), so this prune only drops the two devDependencies +# (@astrojs/check, @types/node). Astro's transitive build tooling +# (vite, esbuild, @astrojs/compiler, rollup plugins) stays in node_modules +# because Astro itself declares them as runtime deps. Slimming those out +# would require verifying the dist/server bundle never imports `astro/*` at +# boot; not attempted here. Image-size tradeoff is accepted for now. 
RUN npm prune --omit=dev FROM node:22-bookworm-slim AS runtime @@ -25,7 +32,8 @@ ENV NODE_ENV=production \ RUN apt-get update \ && apt-get install -y --no-install-recommends ca-certificates tini \ && rm -rf /var/lib/apt/lists/* \ - && useradd --system --uid 1001 --home /app emdash \ + && groupadd --system --gid 1001 emdash \ + && useradd --system --uid 1001 --gid 1001 --home /app emdash \ && mkdir -p /app/state/uploads \ && chown -R emdash:emdash /app @@ -35,8 +43,14 @@ COPY --from=build --chown=emdash:emdash /app/dist ./dist COPY --from=build --chown=emdash:emdash /app/seed ./seed # Persistent state lives in /app/state (single PVC in k3s). -# data.db and uploads/ are symlinked from the working directory so the -# default emdash paths (./data.db, ./uploads) resolve into the volume. +# STATE_DIR is intentionally NOT set here: the Helm chart injects +# STATE_DIR=/app/state (deploy/helm values.yaml), so the running app writes +# directly to /app/state/data.db and /app/state/uploads and these symlinks +# are never traversed. Leaving it unset keeps the image deploy-agnostic so a +# bare `docker run` / DDEV smoke test falls back to the WORKDIR (./data.db). +# The symlinks below only backstop that STATE_DIR-unset fallback +# (astro.config.mjs: `process.env.STATE_DIR ?? "."`), redirecting the default +# emdash paths into the volume — they do not contradict the STATE_DIR contract. RUN ln -s /app/state/data.db /app/data.db \ && ln -s /app/state/uploads /app/uploads diff --git a/app/astro.config.mjs b/app/astro.config.mjs index f4372e5..abc7338 100644 --- a/app/astro.config.mjs +++ b/app/astro.config.mjs @@ -16,8 +16,9 @@ export default defineConfig({ server: { host: true, port: 4321 }, vite: { server: { - // Dev runs behind DDEV's nginx (https://cms-plugins.ddev.site/). - // Vite's host check must allow the public hostname. + // Dev-only: `vite.server` applies to `astro dev` / `emdash dev` + // (DDEV's nginx fronts https://cms-plugins.ddev.site/). 
Production + // runs the @astrojs/node standalone server, so this has no prod effect. allowedHosts: ["cms-plugins.ddev.site", ".ddev.site"], }, }, diff --git a/app/src/pages/[slug].astro b/app/src/pages/[slug].astro index efeb0b0..55f15e9 100644 --- a/app/src/pages/[slug].astro +++ b/app/src/pages/[slug].astro @@ -3,8 +3,23 @@ import { getEmDashEntry, decodeSlug } from "emdash"; import { PortableText } from "emdash/ui"; import Base from "../layouts/Base.astro"; +// Root-level catch-all for the `pages` content collection (standard Emdash model: +// a CMS "page" IS a pages-collection row whose `entry.id` is its slug). The site nav +// (`/about` in Base.astro) deliberately links to pages-collection rows rather than to +// hard-coded .astro files, so adding/renaming a static page is a content edit, not a +// code change. Consequence: every unmatched top-level path (incl. bot probes) does one +// indexed SQLite point-lookup here and 404s on a miss. Acceptable by design — single +// replica, single-writer local-path SQLite on kotkan, and hits already emit the +// `cacheHint` below. Do NOT "fix" by reintroducing per-page .astro files. + export const prerender = false; +// Missing slug/entry → 404 status + a "Not found" body (the markup below). +// getEmDashEntry is intentionally NOT wrapped in try/catch: a thrown error +// here means SQLite/infra is unhealthy and should surface as a 500 (a real +// pod-health signal on this single-replica/local-path deploy), not be masked +// as a 404. Mirrors the emdash-kotkanagrilli reference, which also lets these +// calls propagate. const slug = decodeSlug(Astro.params.slug); const { entry, cacheHint } = slug ? 
await getEmDashEntry("pages", slug) diff --git a/app/src/pages/cms/[slug].astro b/app/src/pages/cms/[slug].astro index f736eee..cb2120c 100644 --- a/app/src/pages/cms/[slug].astro +++ b/app/src/pages/cms/[slug].astro @@ -6,6 +6,12 @@ import { PLUGIN_FETCH_CAP } from "../../lib/statuses"; export const prerender = false; +// Missing slug/entry → 404 status + a "Not found" body (the markup below). +// getEmDashEntry/getEmDashCollection are intentionally NOT wrapped in +// try/catch: a thrown error here means SQLite/infra is unhealthy and should +// surface as a 500 (a real pod-health signal on this single-replica/local-path +// deploy), not be masked as a 404. Mirrors the emdash-kotkanagrilli reference, +// which also lets these calls propagate. const slug = decodeSlug(Astro.params.slug); const { entry: cms, cacheHint } = slug ? await getEmDashEntry("cmses", slug) diff --git a/app/src/pages/plugins/[slug].astro b/app/src/pages/plugins/[slug].astro index 20d06d7..eac60f1 100644 --- a/app/src/pages/plugins/[slug].astro +++ b/app/src/pages/plugins/[slug].astro @@ -6,6 +6,12 @@ import StatusBadge from "../../components/StatusBadge.astro"; export const prerender = false; +// Missing slug/entry → 404 status + a "Not found" body (the markup below). +// getEmDashEntry/getEmDashCollection are intentionally NOT wrapped in +// try/catch: a thrown error here means SQLite/infra is unhealthy and should +// surface as a 500 (a real pod-health signal on this single-replica/local-path +// deploy), not be masked as a 404. Mirrors the emdash-kotkanagrilli reference, +// which also lets these calls propagate. const slug = decodeSlug(Astro.params.slug); const { entry, cacheHint } = slug ? 
await getEmDashEntry("plugins", slug) diff --git a/deploy/fleet-overlay/README.md b/deploy/fleet-overlay/README.md index 2c15cfd..5f55610 100644 --- a/deploy/fleet-overlay/README.md +++ b/deploy/fleet-overlay/README.md @@ -50,3 +50,32 @@ The HelmRelease itself lives in the workloads repo because that repo is the source of truth for what runs on the kotkanagrilli.fi subdomain pool. Same convention as the existing `kotkanagrilli/` (legacy WP) and `hello-kotkan/` entries there. + +## Why two image automations share one branch + +Both `cms-plugins-staging` and `cms-plugins-production` define an +`ImageUpdateAutomation` that checks out, commits to, and pushes the +**same** `main` branch of `anton-helm-workloads` on the same `interval: 1m`. +This is intentional and safe: + +- Each automation is scoped to a disjoint `update.path` + (`./cms-plugins-staging` vs `./cms-plugins-production`), so they only ever + rewrite the digest setter inside their *own* `helmrelease.yaml`. They + never touch the same file. +- `strategy: Setters` rewrites only the explicitly marked digest setter, not + arbitrary YAML — there is no whole-file regeneration that could clobber a + sibling's change. +- The image-automation-controller serializes its git pushes and retries on + a non-fast-forward rejection, so two automations landing commits on `main` + in the same reconcile window resolve cleanly rather than racing. + +This mirrors the per-env automations under +`~/projects/servers/fleet/apps/base/` for `emdash-kotkanagrilli-*`. The +only deviation (justified in `image-automation.yaml`) is that these reuse +the read-side `anton-helm-workloads` `GitRepository` as the write-back +`sourceRef` instead of a dedicated image-automation source, because these +workloads live in that same repo. + +Note for go-live: nothing here reconciles while the HelmReleases are +`suspend: true` (Phase 0). These automations only begin writing back once +the releases are deliberately resumed. 
diff --git a/deploy/fleet-overlay/cms-plugins-production/helmrelease.yaml b/deploy/fleet-overlay/cms-plugins-production/helmrelease.yaml index 5ad1f34..c006a37 100644 --- a/deploy/fleet-overlay/cms-plugins-production/helmrelease.yaml +++ b/deploy/fleet-overlay/cms-plugins-production/helmrelease.yaml @@ -32,6 +32,8 @@ spec: # change when CI retags the floating `production` tag. tag: production digest: "" # {"$imagepolicy": "kotkan:cms-plugins-production:digest"} + # digest-pinned above, so this is effectively a no-op (a digest is + # immutable); kept as Always to match the chart default. pullPolicy: Always ingress: host: cms-plugins-production.kotkanagrilli.fi diff --git a/deploy/fleet-overlay/cms-plugins-staging/helmrelease.yaml b/deploy/fleet-overlay/cms-plugins-staging/helmrelease.yaml index 530f3a5..8c7c830 100644 --- a/deploy/fleet-overlay/cms-plugins-staging/helmrelease.yaml +++ b/deploy/fleet-overlay/cms-plugins-staging/helmrelease.yaml @@ -32,6 +32,8 @@ spec: # change when CI retags the floating `staging` tag. tag: staging digest: "" # {"$imagepolicy": "kotkan:cms-plugins-staging:digest"} + # digest-pinned above, so this is effectively a no-op (a digest is + # immutable); kept as Always to match the chart default.
pullPolicy: Always ingress: host: cms-plugins-staging.kotkanagrilli.fi diff --git a/deploy/helm/templates/deployment.yaml b/deploy/helm/templates/deployment.yaml index 22b341a..b414e93 100644 --- a/deploy/helm/templates/deployment.yaml +++ b/deploy/helm/templates/deployment.yaml @@ -69,6 +69,7 @@ spec: initialDelaySeconds: {{ .Values.probes.liveness.initialDelaySeconds }} periodSeconds: {{ .Values.probes.liveness.periodSeconds }} timeoutSeconds: {{ .Values.probes.liveness.timeoutSeconds }} + failureThreshold: {{ .Values.probes.liveness.failureThreshold }} readinessProbe: httpGet: path: {{ .Values.probes.readiness.path }} @@ -76,6 +77,7 @@ spec: initialDelaySeconds: {{ .Values.probes.readiness.initialDelaySeconds }} periodSeconds: {{ .Values.probes.readiness.periodSeconds }} timeoutSeconds: {{ .Values.probes.readiness.timeoutSeconds }} + failureThreshold: {{ .Values.probes.readiness.failureThreshold }} resources: {{- toYaml .Values.resources | nindent 12 }} volumes: diff --git a/deploy/helm/values-production.yaml b/deploy/helm/values-production.yaml index 653cb6e..a416618 100644 --- a/deploy/helm/values-production.yaml +++ b/deploy/helm/values-production.yaml @@ -1,5 +1,10 @@ -# Production overrides — applied via the FluxCD HelmRelease (or directly with -# `helm upgrade -f values-production.yaml`). +# Production overrides for DIRECT `helm upgrade -f values-production.yaml` use only. +# +# IMPORTANT: FluxCD does NOT read this file. The live production deploy is driven +# solely by the inline `spec.values:` block in +# deploy/fleet-overlay/cms-plugins-production/helmrelease.yaml (copied into +# anton-helm-workloads). Editing values here has NO effect on the cluster. +# Keep this file in sync with that HR `values:` block by hand, or it will drift. 
image: tag: production-latest diff --git a/deploy/helm/values-staging.yaml b/deploy/helm/values-staging.yaml index fb7ccd2..19de2f0 100644 --- a/deploy/helm/values-staging.yaml +++ b/deploy/helm/values-staging.yaml @@ -1,5 +1,10 @@ -# Staging overrides — applied via the FluxCD HelmRelease (or directly with -# `helm upgrade -f values-staging.yaml`). +# Staging overrides for DIRECT `helm upgrade -f values-staging.yaml` use only. +# +# IMPORTANT: FluxCD does NOT read this file. The live staging deploy is driven +# solely by the inline `spec.values:` block in +# deploy/fleet-overlay/cms-plugins-staging/helmrelease.yaml (copied into +# anton-helm-workloads). Editing values here has NO effect on the cluster. +# Keep this file in sync with that HR `values:` block by hand, or it will drift. image: tag: staging-latest diff --git a/deploy/helm/values.yaml b/deploy/helm/values.yaml index 7fce992..d05e2ae 100644 --- a/deploy/helm/values.yaml +++ b/deploy/helm/values.yaml @@ -1,13 +1,21 @@ # Defaults for the cms-plugins chart. -# Per-env overrides come from values-staging.yaml / values-production.yaml -# and from the FluxCD HelmRelease's `values:` block. +# Per-env overrides: Flux applies ONLY the HelmRelease `values:` block. +# values-staging.yaml / values-production.yaml are for direct `helm upgrade -f` +# use and are NOT read by Flux — keep them in sync with the HR by hand. image: repository: git.oleks.space/oleks/cms-plugins tag: develop-latest - # The tag is a mutable floating pointer (CI retags <branch>-latest onto - # each new build), so kubelet must always re-pull — IfNotPresent would - # pin the node to whatever digest it cached first and never roll. + # `Always` is here for the chart-default FLOATING-TAG path: with no + # `digest` set, the image renders as `repository:<branch>-latest` + # (a mutable pointer CI retags onto each build), so kubelet must + # re-pull or it would pin to the first cached digest and never roll. 
+ # NOTE: the deployed overlays pin by `digest` (repository@sha256:…), + # where a tag change instead changes the image *reference string*, so + # `helm upgrade` already detects it and `Always` is a no-op (a digest + # is content-addressed — it can never resolve to different bytes). + # `IfNotPresent` would be marginally better on the digest path but is + # left as `Always` so both render paths share one safe value. pullPolicy: Always service: @@ -57,17 +65,27 @@ imagePullSecrets: probes: liveness: # /_emdash/api/health requires auth (401 to unauthenticated requests), - # so kubelet probes fail and the pod gets killed. The site root is - # public and a 200 from it is a reasonable proxy for "the server is up". + # so we probe the public site root instead. But `/` is server-rendered + # and queries SQLite content, so a content/render or DB fault makes it + # 500 while the Node process is perfectly alive. Liveness must NOT + # crash-loop the single SQLite replica over a transient content/DB + # error: keep failureThreshold high so only a genuinely wedged process + # (sustained failures) triggers a restart. Readiness (below) is what + # sheds traffic on a content 500. path: / initialDelaySeconds: 30 periodSeconds: 30 timeoutSeconds: 5 + failureThreshold: 6 readiness: + # Probe the public site root. A content/render 500 here removes the pod + # from Endpoints (stops serving 500s) WITHOUT the kubelet killing the + # process — readiness failures never restart the container. path: / initialDelaySeconds: 5 periodSeconds: 10 timeoutSeconds: 5 + failureThreshold: 3 resources: requests: diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index 9b01097..14dacf5 100755 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -4,11 +4,14 @@ set -eu # Ensure persistent state dirs exist (volume may be empty on first boot). mkdir -p /app/state/uploads -# Bootstrap on first run: create data.db and apply migrations. -# emdash init is expected to be idempotent on subsequent boots. 
-if [ ! -f /app/state/data.db ]; then - echo "[entrypoint] no data.db found in /app/state, running emdash init" - node_modules/.bin/emdash init -fi +# Run emdash init on EVERY boot, before exec'ing the server. It is idempotent: +# runMigrations applies only pending migrations (no-op when all are applied) and +# init skips re-seeding once collections exist. Under `set -e` a non-zero exit +# aborts before `exec "$@"`, so a failed/partial init surfaces as a crash-loop +# with logs instead of a silently half-migrated boot. (Gating on the mere +# presence of data.db would skip pending migrations on image upgrades against an +# existing PVC and never recover a partial first-run init.) +echo "[entrypoint] running emdash init (applies pending migrations, skips re-seed when collections exist)" +node_modules/.bin/emdash init exec "$@"