diff --git a/.github/agents/copilot-instructions.md b/.github/agents/copilot-instructions.md index e69de29..4ee2f41 100644 --- a/.github/agents/copilot-instructions.md +++ b/.github/agents/copilot-instructions.md @@ -0,0 +1,7 @@ + +## Active Technologies +- Node.js ≥18, ESM (`"type": "module"`) + `axios` (HTTP), `redis` (token cache), `xmlbuilder2` (XML — already injected as `xmlBuilder`), `uuid`, `jsonwebtoken` — all already in `package.json` (002-sitemap-generation) +- Redis read/write (`hGet`/`hSet`) for OIDC token cache only — no new storage (002-sitemap-generation) + +## Recent Changes +- 002-sitemap-generation: Added Node.js ≥18, ESM (`"type": "module"`) + `axios` (HTTP), `redis` (token cache), `xmlbuilder2` (XML — already injected as `xmlBuilder`), `uuid`, `jsonwebtoken` — all already in `package.json` diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index dab13b7..ad3e6bf 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -1,7 +1,7 @@ For additional context about technologies to be used, project structure, shell commands, and other important information, read the current plan at -`specs/001-oidc-proxy-script/plan.md` +`specs/002-sitemap-generation/plan.md` ## Project Overview diff --git a/.specify/feature.json b/.specify/feature.json index 08725ca..cdd33d0 100644 --- a/.specify/feature.json +++ b/.specify/feature.json @@ -1,3 +1,3 @@ { - "feature_directory": "specs/001-oidc-proxy-script" + "feature_directory": "specs/002-sitemap-generation" } diff --git a/.specify/memory/constitution.md b/.specify/memory/constitution.md index 3d7c840..2b57732 100644 --- a/.specify/memory/constitution.md +++ b/.specify/memory/constitution.md @@ -297,8 +297,9 @@ Follow-up TODOs: - ✅ `jwt` - JSON Web Token library for authentication - ✅ `xmlBuilder` - XML document builder - ✅ `uuidv4` - UUID generator +- ✅ `redis` - Redis client for token caching and shared state - ✅ `adapterHelper` - Helper functions (loaded 
from src/globalVariables/) -- ✅ `adapter_settings` - Business data only (service account, Drive query, sitemap settings) +- ✅ `kme_CSA_settings` - Business data only (OIDC credentials, search API config, sitemap settings) - ✅ `req` - HTTP request object (includes req.params with routing metadata) - ✅ `res` - HTTP response object @@ -440,6 +441,7 @@ const globalVMContext = { uuidv4, jwt, xmlBuilder, + redis, // Connected Redis client for token caching }; // Load dynamic data from src/globalVariables/ directory @@ -505,14 +507,21 @@ script.runInContext(context); - Package: `xmlbuilder2` (create function) - Injected from: `globalVMContext.xmlBuilder` +7. **redis** - Redis client + - Purpose: Token caching and shared state across requests + - Usage: `await redis.hGet('key', 'field')`, `await redis.hSet('key', 'field', 'value')` + - Package: `redis` (node-redis v4+, connected client) + - Injected from: `globalVMContext.redis` + - Note: Client is connected before server starts; use `await` for all operations + **Built-in Web APIs:** -7. **URLSearchParams** - URL query string parser (built-in) +8. **URLSearchParams** - URL query string parser (built-in) - Purpose: Parse and manipulate URL query strings - Usage: `new URLSearchParams(queryString)` - Injected from: `globalVMContext.URLSearchParams` -8. **URL** - URL parser (built-in) +9. **URL** - URL parser (built-in) - Purpose: Parse and manipulate URLs - Usage: `new URL(urlString)` - Injected from: `globalVMContext.URL` @@ -520,14 +529,14 @@ script.runInContext(context); **Dynamic Data Context Variables:** -9. **Dynamic JSON objects from src/globalVariables/ directory** +10. 
**Dynamic JSON objects from src/globalVariables/ directory** - Purpose: Authentication credentials, secrets, API keys, and behavioral configuration - Pattern: Each `src/globalVariables/filename.json` loaded by server.js → added to `globalVariableContext` → spread into VM context - Examples: - - `src/globalVariables/adapter_settings.json` → context variable `adapter_settings` (consolidated service account, scopes, drive query, sitemap config) + - `src/globalVariables/kme_CSA_settings.json` → context variable `kme_CSA_settings` (OIDC credentials, search API config, sitemap settings) - `src/globalVariables/api-keys.json` → context variable `api_keys` (API keys and secrets) - `src/globalVariables/custom-config.json` → context variable `custom_config` (behavioral settings) - - Usage in src/proxyScripts/proxy.js: Direct variable access `const settings = adapter_settings;` + - Usage in src/proxyScripts/proxy.js: Direct variable access `const settings = kme_CSA_settings;` - Loading: By server.js at startup using `loadGlobalObjects()` function - Injection: Via spread operator `...globalVariableContext` in `vm.createContext()` - **Note**: ALL authentication, secrets, and behavioral configuration MUST be in src/globalVariables/, NEVER in config/default.json diff --git a/CHANGELOG.md b/CHANGELOG.md index d695156..11db805 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,21 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm --- +## [0.2.0] - 2026-04-23 + +### Added + +- `GET /sitemap.xml` endpoint: returns a well-formed XML Sitemap (Sitemaps protocol 0.9) containing one `` per knowledge item from the KME Knowledge Search Service +- `sitemapFlow()` async function in `kmeContentSourceAdapter.js` — settings validation, OIDC token reuse, search API call, XML build via `xmlBuilder`, 10-second timeout, 502/504/500 error responses +- `getValidToken()` shared helper extracted from the existing OIDC auth flow — used by both sitemap and 
non-sitemap paths +- URL routing at IIFE entry point: requests ending in `/sitemap.xml` → `sitemapFlow()`, all others → `oidcAuthFlow()` +- Three new fields in `src/globalVariables/kme_CSA_settings.json`: `searchApiBaseUrl`, `tenant`, `proxyBaseUrl` +- Three new placeholder fields in `src/globalVariables/kme_CSA_settings.json.example` +- Unit tests for sitemap flow: happy path (items present), empty results, `vkm:url` filtering, 502/504/500 error scenarios, non-sitemap regression tests +- Contract tests for sitemap endpoint: full round-trip 200, empty results 200, 502 upstream error, 504 timeout + +--- + ## [0.1.0] - 2026-04-23 ### Added diff --git a/package.json b/package.json index 68ffb06..a098726 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "kme-content-adapter", - "version": "0.1.0", + "version": "0.2.0", "description": "HTTP proxy adapter to search and export documents from KME", "type": "module", "main": "src/server.js", diff --git a/specs/002-sitemap-generation/checklists/requirements.md b/specs/002-sitemap-generation/checklists/requirements.md new file mode 100644 index 0000000..5e9bf19 --- /dev/null +++ b/specs/002-sitemap-generation/checklists/requirements.md @@ -0,0 +1,36 @@ +# Specification Quality Checklist: Sitemap XML Generation + +**Purpose**: Validate specification completeness and quality before proceeding to planning +**Created**: 2025-07-14 +**Feature**: [spec.md](../spec.md) + +## Content Quality + +- [x] No implementation details (languages, frameworks, APIs) — *Note: FR-008/FR-009 reference `xmlBuilder` and the VM sandbox constraint. 
These are explicitly mandated architectural constraints from the feature description, not incidental implementation choices; they belong in the spec as requirements.* +- [x] Focused on user value and business needs +- [x] Written for non-technical stakeholders — *Technical terms (Redis, OIDC) are domain-specific to this integration; they cannot be abstracted away without losing meaning.* +- [x] All mandatory sections completed — User Scenarios, Requirements, Success Criteria, Assumptions all present + +## Requirement Completeness + +- [x] No [NEEDS CLARIFICATION] markers remain +- [x] Requirements are testable and unambiguous — All FRs use precise MUST language with measurable conditions +- [x] Success criteria are measurable — SC-001 (5-second response time), SC-002 (zero silent drops), SC-003 (zero regressions), SC-004 (XSD validation), SC-005 (10-second error bound) +- [x] Success criteria are technology-agnostic — SC-004 references the public Sitemaps XSD standard, not an internal tool +- [x] All acceptance scenarios are defined — 8 acceptance scenarios across 3 user stories +- [x] Edge cases are identified — 5 edge cases documented (expired token, missing `vkm:url`, large result sets, missing settings, missing `xmlBuilder`) +- [x] Scope is clearly bounded — v1 scope explicitly excludes pagination, multi-tenant, and optional sitemap elements +- [x] Dependencies and assumptions identified — 8 assumptions documented + +## Feature Readiness + +- [x] All functional requirements have clear acceptance criteria — FR-001–FR-013 each trace to at least one acceptance scenario or edge case +- [x] User scenarios cover primary flows — Happy path (P1), backwards compatibility (P2), error/degradation (P3) +- [x] Feature meets measurable outcomes defined in Success Criteria — All 5 success criteria are verifiable without implementation knowledge +- [x] No implementation details leak into specification — Architectural constraints are present as explicit requirements per the 
feature description + +## Notes + +- All checklist items pass. The spec is ready for `/speckit.clarify` (optional) or `/speckit.plan`. +- The shape of the Knowledge Search Service response envelope (how results are nested) is assumed in the Assumptions section and flagged for confirmation during implementation. +- SC-001 (5 seconds) and the 10-second timeout assumption are reasonable defaults and can be revisited during planning if the team has SLA data for the KME environment. diff --git a/specs/002-sitemap-generation/contracts/sitemap-endpoint.md b/specs/002-sitemap-generation/contracts/sitemap-endpoint.md new file mode 100644 index 0000000..595f859 --- /dev/null +++ b/specs/002-sitemap-generation/contracts/sitemap-endpoint.md @@ -0,0 +1,189 @@ +# Contract: Sitemap Endpoint + +**Feature**: `002-sitemap-generation` +**Endpoint type**: HTTP GET +**Introduced in**: `002-sitemap-generation` + +--- + +## Overview + +The `kme-content-adapter` proxy exposes a single new HTTP endpoint: `GET /sitemap.xml` (or +any URL whose path ends with `/sitemap.xml`). This contract governs the complete observable +behaviour of that endpoint from the consumer's perspective. + +--- + +## Endpoint + +``` +GET /sitemap.xml +``` + +The adapter detects sitemap requests by checking whether `req.url` ends with `/sitemap.xml`. +The full path prefix (if any) is determined by how the reverse proxy routes requests to this +adapter. + +--- + +## Request + +### Method +`GET` + +### Headers +No special request headers required. The adapter uses its own internally cached OIDC token +to authenticate the upstream call to the KME Knowledge Search Service. + +### Body +None. + +--- + +## Responses + +### 200 OK — Sitemap generated successfully + +**Condition**: The KME Knowledge Search Service returned a 2xx response and the sitemap was +built without errors. 
+ +**Headers**: +``` +Content-Type: application/xml +``` + +**Body**: A well-formed XML Sitemap document conforming to +[https://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd](https://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd). + +```xml +<?xml version="1.0" encoding="UTF-8"?> +<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> + <url> + <loc>{proxyBaseUrl}?kmeURL={encodeURIComponent(vkmUrl)}</loc> + </url> + <!-- one <url> entry per knowledge item --> +</urlset> +``` + +**Empty-result variant** (search service returns zero items): +```xml +<?xml version="1.0" encoding="UTF-8"?> +<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"/> +``` + +### 500 Internal Server Error — Missing configuration + +**Condition**: One or more required settings fields (`searchApiBaseUrl`, `tenant`, +`proxyBaseUrl`) are absent from `kme_CSA_settings`. + +**Headers**: +``` +Content-Type: text/plain +``` + +**Body**: +``` +Configuration error: missing required field: <fieldName> +``` + +### 502 Bad Gateway — Upstream search service error + +**Condition**: The KME Knowledge Search Service returned a non-2xx HTTP response. + +**Headers**: +``` +Content-Type: text/plain +``` + +**Body**: +``` +Search service error: HTTP <status> +``` + +### 504 Gateway Timeout — Upstream search service timeout + +**Condition**: The KME Knowledge Search Service connection timed out (>10 000 ms). 
+ +**Headers**: +``` +Content-Type: text/plain +``` + +**Body**: +``` +Search service timeout +``` + +--- + +## `<loc>` URL Format + +Each `<loc>` element is constructed as: + +``` +{proxyBaseUrl}?kmeURL={encodeURIComponent(item['vkm:url'])} +``` + +Where: +- `proxyBaseUrl` is taken from `kme_CSA_settings.proxyBaseUrl` (e.g., `https://adapter.example.com`) +- `item['vkm:url']` is the raw `vkm:url` value from the search service result +- `encodeURIComponent` percent-encodes the value so it is safe as a query parameter + +**Example**: +``` +https://adapter.example.com?kmeURL=https%3A%2F%2Fkme.example.com%2Fknowledge%2Farticle-123 +``` + +--- + +## Authentication to Upstream (internal, not exposed to consumer) + +The adapter authenticates to the KME Knowledge Search Service using: + +``` +Authorization: OIDC_id_token <token> +``` + +Where `<token>` is the `id_token` from the OIDC token service, cached in the Redis hash +`authorization` under the field `token`. Token refresh uses the same stampede-guarded fetch already present +in the existing OIDC auth flow. 
+ +--- + +## Existing Endpoint Behaviour (unchanged) + +All requests whose URL does **not** end in `/sitemap.xml` continue to use the existing OIDC +authentication flow with no change in response behaviour: + +| Condition | Response | +|---|---| +| Valid cached OIDC token | `200 Authorized` (`text/plain`) | +| No cached token — fetch succeeds | `200 Authorized` (`text/plain`) | +| Token service unreachable | `401 Unauthorized: <message>` (`text/plain`) | + +--- + +## Non-Functional Constraints + +| Constraint | Value | Source | +|---|---|---| +| Search API timeout | 10 000 ms | Spec assumption | +| Max response time (normal conditions) | < 5 000 ms | SC-001 | +| Max response time (error scenarios) | < 10 000 ms | SC-005 | +| Pagination | Not supported (v1) | Spec assumption | +| Multi-tenant | Not supported (v1) | Spec assumption | + +--- + +## Sitemap Protocol Compliance + +The returned XML must validate against the Sitemaps XSD: +`https://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd` + +Required elements per entry (v1 scope): +- `<loc>` — mandatory + +Optional elements **not included** in v1: +- `<lastmod>` — out of scope +- `<changefreq>` — out of scope +- `<priority>` — out of scope diff --git a/specs/002-sitemap-generation/data-model.md b/specs/002-sitemap-generation/data-model.md new file mode 100644 index 0000000..e60ed36 --- /dev/null +++ b/specs/002-sitemap-generation/data-model.md @@ -0,0 +1,202 @@ +# Data Model: Sitemap XML Generation + +**Feature**: `002-sitemap-generation` +**Branch**: `002-sitemap-generation` +**Date**: 2025-07-14 + +--- + +## Entities + +### 1. `KnowledgeItem` (external, read-only) + +Represents a single document returned by the KME Knowledge Search Service. The adapter reads +this shape from the upstream API response and never persists or mutates it. + +| Field | Type | Source | Notes | +|---|---|---|---| +| `vkm:url` | `string \| undefined` | Search API response `items[]` | Canonical document URL. **Required** for sitemap inclusion. 
Items where this field is absent or empty are silently omitted (FR-006). | +| `title` | `string \| undefined` | Search API response | Not used by the sitemap; present in payload, ignored. | +| *(other fields)* | `any` | Search API response | Ignored; adapter reads only `vkm:url`. | + +**Assumed response envelope** (to be verified against live API — see research.md R-002): +```json +{ + "items": [ + { "vkm:url": "https://kme.example.com/knowledge/doc-1", "title": "Doc One" }, + { "vkm:url": "https://kme.example.com/knowledge/doc-2", "title": "Doc Two" } + ] +} +``` +If the root is a bare array, `response.data` itself is treated as the items array. + +--- + +### 2. `SitemapEntry` (derived, in-memory) + +Represents a single `/` entry in the generated sitemap XML. Derived from a `KnowledgeItem` +during the transformation step. + +| Field | Type | Derivation | +|---|---|---| +| `loc` | `string` | `${kme_CSA_settings.proxyBaseUrl}?kmeURL=${encodeURIComponent(item['vkm:url'])}` | + +**Validation rules**: +- Only produced if `item['vkm:url']` is a non-empty string. +- The resulting `loc` must be a percent-encoded absolute URL. + +--- + +### 3. `SitemapDocument` (output) + +The XML document returned in the HTTP response body. + +| Attribute | Value | +|---|---| +| XML version | `1.0` | +| Encoding | `UTF-8` | +| Root element | `` | +| Child elements | Zero or more `` entries | + +**Populated sitemap**: +```xml + + + + https://adapter.example.com?kmeURL=https%3A%2F%2Fkme.example.com%2Fdoc-1 + + + https://adapter.example.com?kmeURL=https%3A%2F%2Fkme.example.com%2Fdoc-2 + + +``` + +**Empty sitemap** (zero results from search API): +```xml + + +``` + +--- + +### 4. `OIDCTokenCache` (shared, Redis) + +The existing Redis-backed OIDC token store. The sitemap flow **reads** and **writes** this store +using the identical hGet/hSet pattern as the existing OIDC auth flow. 
+ +| Redis Key | Field | Type | Description | +|---|---|---|---| +| `authorization` | `token` | `string` | The OIDC `id_token` JWT | +| `authorization` | `expiry` | `string (float)` | Unix timestamp (seconds) when token expires | + +**Access pattern in sitemap flow**: +1. `hGet('authorization', 'token')` — read cached token +2. `hGet('authorization', 'expiry')` — read cached expiry +3. If expired or absent: invoke token-refresh sequence → `hSet` both fields + +--- + +### 5. `kme_CSA_settings` (configuration, JSON) + +The settings object injected into the VM context from `src/globalVariables/kme_CSA_settings.json`. +This feature extends it with three new fields. + +**Full schema after this feature**: + +| Field | Type | Existing/New | Required By | +|---|---|---|---| +| `tokenUrl` | `string` | Existing | OIDC token fetch (all flows) | +| `username` | `string` | Existing | OIDC token fetch | +| `password` | `string` | Existing | OIDC token fetch | +| `clientId` | `string` | Existing | OIDC token fetch | +| `scope` | `string` | Existing | OIDC token fetch | +| `searchApiBaseUrl` | `string` | **New** | FR-002, FR-010 | +| `tenant` | `string` | **New** | FR-002, FR-010 | +| `proxyBaseUrl` | `string` | **New** | FR-005, FR-010 | +| `_pendingFetch` | `Promise \| null` | Runtime only (not in JSON) | Stampede guard | + +**Validation**: +- Existing fields validated at top of script for all requests (unchanged). +- New fields validated at start of sitemap branch only (FR-011). + +--- + +## State Transitions + +### Sitemap Request Lifecycle + +``` +Incoming GET /…/sitemap.xml + | + v + Validate settings --> 500 Internal Server Error (missing field) + (searchApiBaseUrl, + tenant, proxyBaseUrl) + | + v + Read token from Redis + | + [valid?] + YES | NO + | v + | Refresh token --> 401 Unauthorized (token fetch failed) + | | + +-------+ + v + GET / + Authorization: OIDC_id_token + timeout: 10 000 ms + | + [success?] 
+ YES | NO + | +--> timeout --> 504 Gateway Timeout + | +--> non-2xx response --> 502 Bad Gateway + v + Map items --> SitemapEntry[] + (skip empty vkm:url) + | + v + Build SitemapDocument (xmlBuilder) + | + v + 200 OK + Content-Type: application/xml + Body: ... +``` + +### Non-Sitemap Request Lifecycle (unchanged) + +All requests whose URL does NOT end with `/sitemap.xml` follow the existing OIDC auth flow +exactly as before. No modification to that path. + +--- + +## File Changes + +### Modified: `src/globalVariables/kme_CSA_settings.json` + +Three new fields added (existing fields unchanged): + +```json +{ + "tokenUrl": "…", + "username": "…", + "password": "…", + "clientId": "…", + "scope": "…", + "searchApiBaseUrl": "https://kme-search.example.com/api/search", + "tenant": "my-tenant", + "proxyBaseUrl": "https://adapter.example.com" +} +``` + +### Modified: `src/proxyScripts/kmeContentSourceAdapter.js` + +Logic added: +1. URL routing guard at entry point. +2. `sitemapFlow` async block: settings validation, token reuse, search API call, XML build, response. +3. Existing OIDC auth flow moved to `else` branch (no logic changes). + +### Modified: `src/globalVariables/kme_CSA_settings.json.example` + +Updated to include the three new fields with placeholder values. diff --git a/specs/002-sitemap-generation/plan.md b/specs/002-sitemap-generation/plan.md new file mode 100644 index 0000000..dc01ac9 --- /dev/null +++ b/specs/002-sitemap-generation/plan.md @@ -0,0 +1,248 @@ +# Implementation Plan: Sitemap XML Generation + +**Branch**: `002-sitemap-generation` | **Date**: 2025-07-14 | **Spec**: [spec.md](./spec.md) +**Input**: Feature specification from `/specs/002-sitemap-generation/spec.md` + +--- + +## Summary + +Add a `GET /sitemap.xml` route to `kmeContentSourceAdapter.js`. 
The adapter detects sitemap +requests by URL suffix, obtains a valid OIDC `id_token` from the Redis cache (reusing the +existing stampede-guarded refresh logic), calls the KME Knowledge Search Service, maps each +result's `vkm:url` field to a `` entry, and returns a standards-compliant XML Sitemap as +`application/xml`. All existing non-sitemap requests are unaffected. Three new fields are added +to `kme_CSA_settings.json` (`searchApiBaseUrl`, `tenant`, `proxyBaseUrl`). + +--- + +## Technical Context + +**Language/Version**: Node.js ≥18, ESM (`"type": "module"`) +**Primary Dependencies**: `axios` (HTTP), `redis` (token cache), `xmlbuilder2` (XML — already injected as `xmlBuilder`), `uuid`, `jsonwebtoken` — all already in `package.json` +**Storage**: Redis read/write (`hGet`/`hSet`) for OIDC token cache only — no new storage +**Testing**: Node.js built-in test runner (`node:test`); no external test framework +**Target Platform**: Linux server / container (HTTP proxy adapter) +**Project Type**: HTTP proxy adapter (web-service) +**Performance Goals**: Sitemap response < 5 s p95 under normal conditions (SC-001); error responses < 10 s (SC-005) +**Constraints**: + - Zero `import`/`export` in `kmeContentSourceAdapter.js` (runs in `vm.createContext`) + - No references to `config`, `global.config`, or `process.env` in proxy script + - XML built exclusively with the injected `xmlBuilder` (FR-008) + - No new npm packages; no new source files (monolithic architecture — Section I of constitution) +**Scale/Scope**: Single tenant per deployment; all search results in one API call (no pagination, v1) + +--- + +## Constitution Check + +*GATE: Must pass before Phase 0 research. 
Re-check after Phase 1 design.* + +| # | Principle | Status | Notes | +|---|---|---|---| +| I | Monolithic architecture | ✅ PASS | All new code added to `kmeContentSourceAdapter.js`; no new source files | +| I (vm.Script) | Zero imports/exports in proxy script | ✅ PASS | Sitemap logic is inlined; no import statements introduced | +| I.0 | No forbidden globals (`config`, `global.config`, `process.env`) | ✅ PASS | Only `kme_CSA_settings`, `redis`, `axios`, `xmlBuilder`, `req`, `res` used | +| I.I | Business logic in proxy.js | ✅ PASS | Auth, API call, XML generation all in `kmeContentSourceAdapter.js` | +| I.II | Separate files only for allowed categories | ✅ PASS | Settings JSON in `src/globalVariables/` (existing pattern) | +| I.III | No new files challenged | ✅ PASS | No new files in `src/` | +| I.IV | New config in `src/globalVariables/` not `config/default.json` | ✅ PASS | Three fields added to `kme_CSA_settings.json` | +| I.V | `xmlBuilder` already in `globalVMContext` | ✅ PASS | `xmlbuilder2` `create` already injected; no server.js changes needed | +| II | API-First Design | ✅ PASS | HTTP contract documented in `contracts/sitemap-endpoint.md` | +| III | Test-First Development | ✅ REQUIRED | Unit + contract tests must be written before/alongside implementation | +| VII | No new dependencies | ✅ PASS | All required packages already installed (`xmlbuilder2`, `axios`, `redis`) | + +**Post-design re-check**: All gates still pass. The design introduces zero new files, zero new dependencies, and zero architectural violations. 
+ +--- + +## Project Structure + +### Documentation (this feature) + +```text +specs/002-sitemap-generation/ +├── plan.md # This file (/speckit.plan command output) +├── spec.md # Feature specification +├── research.md # Phase 0 output (/speckit.plan command) +├── data-model.md # Phase 1 output (/speckit.plan command) +├── quickstart.md # Phase 1 output (/speckit.plan command) +├── contracts/ # Phase 1 output (/speckit.plan command) +│ └── sitemap-endpoint.md +└── tasks.md # Phase 2 output (/speckit.tasks command - NOT created by /speckit.plan) +``` + +### Source Code (repository root) + +```text +src/ +├── proxyScripts/ +│ └── kmeContentSourceAdapter.js # MODIFIED: sitemap branch + token helper added +├── globalVariables/ +│ ├── kme_CSA_settings.json # MODIFIED: 3 new fields (searchApiBaseUrl, tenant, proxyBaseUrl) +│ └── kme_CSA_settings.json.example # MODIFIED: updated with new field placeholders +└── server.js # NO CHANGE + +tests/ +├── unit/ +│ └── proxy.test.js # MODIFIED: sitemap test cases added +└── contract/ + └── proxy-http.test.js # MODIFIED: sitemap HTTP contract tests added +``` + +**Structure Decision**: Single-project layout. No new directories. Only the proxy script, its +settings JSON, and the existing test files are modified. + +--- + +## Phase 0: Research Findings + +> Full research notes: [research.md](./research.md) + +| Research ID | Topic | Decision | +|---|---|---| +| R-001 | Token reuse | Inline shared `getValidToken()` helper in proxy script; branch on URL first | +| R-002 | Search API response shape | Assume `{ items: [...] 
}`; verify against live API during implementation | +| R-003 | xmlbuilder2 API | `xmlBuilder({...}).ele('urlset',{xmlns:...})…doc.end({})` — no prettyPrint | +| R-004 | Error mapping | Reuse `err.response` / `err.code === ECONNABORTED\|ERR_CANCELED` pattern | +| R-005 | Settings validation | `requiredSitemapFields` guard before any async work → HTTP 500 | +| R-006 | `loc` construction | `` `${proxyBaseUrl}?kmeURL=${encodeURIComponent(item['vkm:url'])}` `` | + +**Resolved NEEDS CLARIFICATION**: None remain. All decisions are documented. + +--- + +## Phase 1: Design + +### Data Model + +> Full data model: [data-model.md](./data-model.md) + +**Key entities**: +- `KnowledgeItem` — raw search result with `vkm:url` (read-only, from upstream API) +- `SitemapEntry` — `{ loc: string }` derived in-memory from `KnowledgeItem` +- `SitemapDocument` — serialised XML output (`urlset` + `url` elements) +- `OIDCTokenCache` — shared Redis store (unchanged; `hGet`/`hSet` pattern reused) +- `kme_CSA_settings` — extended JSON settings (3 new fields) + +### Contracts + +> Full contract: [contracts/sitemap-endpoint.md](./contracts/sitemap-endpoint.md) + +| Scenario | Status | Response | +|---|---|---| +| Search succeeds, items present | 200 | `application/xml` sitemap with `` entries | +| Search succeeds, zero items | 200 | `application/xml` empty `` | +| Missing settings field | 500 | `text/plain` descriptive message | +| Upstream non-2xx | 502 | `text/plain` upstream error | +| Upstream timeout | 504 | `text/plain` timeout message | + +### Implementation Design + +**Entry point restructure** (single IIFE, no imports): + +```javascript +(async () => { + // FR-001: Route on URL suffix + if (req.url.endsWith('/sitemap.xml')) { + await sitemapFlow(); + } else { + await oidcAuthFlow(); // existing logic, moved to inner async function + } +})(); +``` + +**`sitemapFlow` logic**: + +```javascript +async function sitemapFlow() { + // FR-011: Validate required settings + const required = 
['searchApiBaseUrl', 'tenant', 'proxyBaseUrl']; + for (const f of required) { + if (!kme_CSA_settings[f]) { + res.writeHead(500, { 'Content-Type': 'text/plain' }); + res.end('Configuration error: missing required field: ' + f); + return; + } + } + + // FR-003: Obtain valid OIDC token (shared helper with existing flow) + const token = await getValidToken(); // throws on failure → caught by outer try/catch + + // FR-002: Call KME Knowledge Search Service + const { searchApiBaseUrl, tenant, proxyBaseUrl } = kme_CSA_settings; + const searchResponse = await axios.get( + `${searchApiBaseUrl}/${tenant}`, + { + headers: { Authorization: `OIDC_id_token ${token}` }, + timeout: 10_000, + } + ); + + // Extract items (R-002: assume { items: [...] } or bare array) + const items = searchResponse.data.items ?? searchResponse.data ?? []; + + // FR-004, FR-005, FR-006, FR-008: Build sitemap XML + const doc = xmlBuilder({ version: '1.0', encoding: 'UTF-8' }); + const urlset = doc.ele('urlset', { xmlns: 'http://www.sitemaps.org/schemas/sitemap/0.9' }); + for (const item of items) { + const vkmUrl = item['vkm:url']; + if (!vkmUrl) continue; // FR-006: omit silently + const loc = `${proxyBaseUrl}?kmeURL=${encodeURIComponent(vkmUrl)}`; + urlset.ele('url').ele('loc').txt(loc).up().up(); + } + const xml = doc.end({ prettyPrint: false }); + + // FR-007: Respond + res.writeHead(200, { 'Content-Type': 'application/xml' }); + res.end(xml); +} +``` + +**Error handling** (wrapping `sitemapFlow` catch): +- `err.code === 'ECONNABORTED' || err.code === 'ERR_CANCELED'` → 504 +- `err.response` defined → 502 `Search service error: HTTP ${err.response.status}` +- other → 502 `Search service error: ${err.message}` + +**`getValidToken` helper** (shared inline function; extract from existing OIDC flow): + +Encapsulates steps 2–6 of the existing flow: +- `hGet('authorization', 'token')` / `hGet('authorization', 'expiry')` +- Cache hit → return token +- Stampede guard → queue on in-flight promise +- Cache 
miss → `axios.post(tokenUrl, ...)` → `hSet` both fields +- Returns the `id_token` string; throws on failure + +**Token fetch failure in sitemap context**: If `getValidToken` throws, the outer catch +returns `401 Unauthorized: ` (same as existing flow). + +### Test Plan + +**Unit tests** (`tests/unit/proxy.test.js`) — new `describe('sitemap flow')` block: + +| Scenario | Mock | Assert | +|---|---|---| +| Happy path: items present | axios.get → `{ items: [{ 'vkm:url': '...' }] }` | 200, `application/xml`, `` | +| Happy path: zero items | axios.get → `{ items: [] }` | 200, empty `` | +| Items with empty vkm:url | mix of valid + empty | only non-empty items in output | +| Missing `searchApiBaseUrl` | settings without field | 500, descriptive message | +| Missing `tenant` | settings without field | 500, descriptive message | +| Missing `proxyBaseUrl` | settings without field | 500, descriptive message | +| Upstream 503 | axios.get rejects with `{ response: { status: 503 } }` | 502 | +| Upstream timeout | axios.get rejects with `{ code: 'ECONNABORTED' }` | 504 | +| Non-sitemap URL still works | req.url = '/' | existing 200 Authorized behaviour | + +**Contract tests** (`tests/contract/proxy-http.test.js`) — new `describe('sitemap endpoint')` block: + +| Scenario | Setup | Assert | +|---|---|---| +| Full round-trip: GET /sitemap.xml | Mock search server → 200 `{ items: [...] }` | 200, `application/xml`, valid XML with `` | +| Empty results | Mock search server → 200 `{ items: [] }` | 200, `application/xml`, empty `` | +| Search server returns 503 | Mock → 503 | 502 | +| Search server hangs > 10 s | Mock → never respond | 504 | + +--- + +## Complexity Tracking + +> No violations to justify. All gates pass. No entries required. 
diff --git a/specs/002-sitemap-generation/quickstart.md b/specs/002-sitemap-generation/quickstart.md new file mode 100644 index 0000000..f8bbf3c --- /dev/null +++ b/specs/002-sitemap-generation/quickstart.md @@ -0,0 +1,126 @@ +# Quickstart: Sitemap XML Generation + +**Feature**: `002-sitemap-generation` +**Branch**: `002-sitemap-generation` + +--- + +## What This Feature Does + +Adds a `GET /sitemap.xml` endpoint to the `kme-content-adapter` proxy. When a crawler or +sitemap consumer requests this URL, the adapter: + +1. Obtains a valid OIDC `id_token` from the Redis cache (refreshing if expired). +2. Calls the KME Knowledge Search Service to retrieve all knowledge items. +3. Builds a standards-compliant XML Sitemap (`urlset`) with one `` per item. +4. Returns the sitemap as `application/xml` with HTTP 200. + +All other requests continue to use the existing OIDC auth flow without modification. + +--- + +## Setup + +### 1. Add the new settings fields + +Open `src/globalVariables/kme_CSA_settings.json` and add the three new fields: + +```json +{ + "tokenUrl": "https:///token", + "username": "apiclient", + "password": "", + "clientId": "", + "scope": "openid ...", + "searchApiBaseUrl": "https:///api/search", + "tenant": "", + "proxyBaseUrl": "https://" +} +``` + +| Field | Description | Example | +|---|---|---| +| `searchApiBaseUrl` | Base URL of the KME Knowledge Search Service | `https://kme-qa.example.com/search` | +| `tenant` | Tenant identifier appended to the search URL path | `my-org` | +| `proxyBaseUrl` | Externally accessible HTTPS URL of this adapter | `https://proxy.example.com` | + +The adapter will call `GET {searchApiBaseUrl}/{tenant}` to retrieve knowledge items. + +### 2. Start the adapter + +```bash +npm run dev # development (auto-restart on changes) +npm start # production +``` + +Redis must be running and accessible (default: `redis://localhost:6379`). 
+ +--- + +## Usage + +### Request the sitemap + +```bash +curl -v http://localhost:3000/sitemap.xml +``` + +**Expected response**: +``` +HTTP/1.1 200 OK +Content-Type: application/xml + +<?xml version="1.0" encoding="UTF-8"?> +<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> + <url> + <loc>https://proxy.example.com?kmeURL=https%3A%2F%2Fkme.example.com%2Fdoc-1</loc> + </url> + ... +</urlset> +``` + +### Validate the sitemap against the Sitemaps XSD + +```bash +# Using xmllint (libxml2) +curl -s http://localhost:3000/sitemap.xml | \ + xmllint --schema https://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd --noout - +``` + +--- + +## Running the Tests + +```bash +npm run test:unit # unit tests (VM context mocking, no network) +npm run test:contract # contract tests (real HTTP, mock token/search servers) +npm test # all tests +``` + +Unit tests live in `tests/unit/proxy.test.js`. +Contract tests live in `tests/contract/proxy-http.test.js`. + +--- + +## Error Scenarios + +| Scenario | How to reproduce | Expected response | +|---|---|---| +| Missing `searchApiBaseUrl` | Remove field from `kme_CSA_settings.json`, restart | `500 Configuration error: missing required field: searchApiBaseUrl` | +| Search service down | Point `searchApiBaseUrl` to an unreachable host | `502 Search service error: HTTP <status>` or `504 Search service timeout` | +| Zero results | Search service returns empty items array | `200 OK` with empty `<urlset>` | +| Items with empty `vkm:url` | (covered by unit tests) | Items silently omitted from sitemap | + +--- + +## Architecture Notes + +- **No new files**: All new logic is added directly to + `src/proxyScripts/kmeContentSourceAdapter.js` (monolithic architecture constraint). +- **No new dependencies**: `xmlbuilder2` is already in `package.json` and injected into the + VM context as `xmlBuilder`. +- **Token reuse**: The sitemap flow reuses the existing Redis `hGet`/token-refresh pattern — + no separate auth logic. +- **VM isolation**: The proxy script runs in a `vm.createContext` sandbox. 
It has access only + to the injected globals listed in `src/server.js` (`axios`, `redis`, `xmlBuilder`, + `kme_CSA_settings`, `req`, `res`, `console`, `URLSearchParams`, `URL`, `crypto`). diff --git a/specs/002-sitemap-generation/research.md b/specs/002-sitemap-generation/research.md new file mode 100644 index 0000000..2747128 --- /dev/null +++ b/specs/002-sitemap-generation/research.md @@ -0,0 +1,190 @@ +# Research: Sitemap XML Generation + +**Feature**: `002-sitemap-generation` +**Branch**: `002-sitemap-generation` +**Date**: 2025-07-14 + +--- + +## R-001: Token Reuse — OIDC Cache Pattern + +**Decision**: Reuse `redis.hGet('authorization', 'token')` / `redis.hGet('authorization', 'expiry')` +and the existing stampede-guard / token-refresh flow verbatim. + +**Rationale**: The existing `kmeContentSourceAdapter.js` already implements a correct, battle-tested +pattern for obtaining a valid OIDC `id_token` from Redis and refreshing it when expired. Duplicating +only the cache-read portion (steps 1–3 of the existing flow) would create divergence. Extracting the +full sequence into a shared `getValidToken()` helper that the sitemap flow calls avoids that risk while reusing the +security invariants already proven in production. + +**Approach in code**: Refactor the top-level IIFE so that: +1. URL routing check happens **first** (before any async work). +2. For sitemap requests, a shared `getValidToken()` helper (inlined in the script, no imports) + performs the identical cache-hit → stampede-guard → refresh → cache-write sequence. +3. For all other requests, the existing flow runs unchanged. + +**Alternatives considered**: +- Call the existing OIDC logic unconditionally, then branch: rejected because it would run Redis / + token-service I/O before the URL routing check, so a sitemap request with missing settings would pay token-fetch latency before its 500 response (see R-005: validate before any async work). +- Separate helper file: rejected by the monolithic architecture constraint (Section I, constitution). 
+ +--- + +## R-002: KME Knowledge Search Service API — Response Envelope + +**Decision**: Assume the response body is a JSON object with a top-level `items` array. Each element +of `items` is an object whose `vkm:url` property holds the canonical document URL. + +**Rationale**: The feature spec states: +> "The `vkm:url` field is present at the top level of each item object in the search results +> array; the exact response envelope shape will be confirmed against the live API during +> implementation." + +The most common shape for knowledge/search services is `{ items: [ { "vkm:url": "...", ... } ] }`. +This assumption allows the code to be written and fully unit-tested before live-API access is +available. A single `items` extraction line (`response.data.items ?? response.data`) means the +adaptation to the real shape is a one-line change. + +**Concrete assumption**: +```json +{ + "items": [ + { "vkm:url": "https://kme.example.com/knowledge/doc-1", "title": "…" }, + { "vkm:url": "https://kme.example.com/knowledge/doc-2", "title": "…" } + ] +} +``` + +**Verification required**: During implementation, run the live API call against +`{searchApiBaseUrl}/{tenant}` and confirm: +1. The top-level key that holds the array (likely `items`, `results`, or the root is directly an + array). +2. That `vkm:url` is a string property, not nested deeper. + +**Fallback**: If the root is a bare array, `response.data` itself is used as the items array. + +**Alternatives considered**: +- `results` key: equally plausible; the code will use `response.data.items ?? response.data` as a + defensive pattern until confirmed. Note that this pattern only covers the `items` and bare-array shapes; if the live API uses a `results` envelope, the extraction line must be changed accordingly. +- Deeply nested: no evidence for this; rejected pending confirmation. 
+ +--- + +## R-003: xmlbuilder2 `create()` API for Sitemap XML + +**Decision**: Use the `xmlBuilder` context variable (which is `xmlbuilder2`'s `create` function) +with the following call chain: + +```javascript +const doc = xmlBuilder({ version: '1.0', encoding: 'UTF-8' }); +const urlset = doc.ele('urlset', { xmlns: 'http://www.sitemaps.org/schemas/sitemap/0.9' }); +for (const item of items) { + urlset.ele('url').ele('loc').txt(locValue).up().up(); +} +const xml = doc.end({ prettyPrint: false }); +``` + +**Rationale**: `xmlbuilder2` v4.x `create()` returns an `XMLBuilder` document node. Calling `.ele()` +on it creates the root element. Child elements are built by chaining `.ele()` / `.txt()` / `.up()`. +`doc.end({ prettyPrint: false })` serialises to a string prefixed with `<?xml version="1.0" encoding="UTF-8"?>`. `prettyPrint: false` is chosen for minimal byte overhead (sitemap consumers +parse XML, not read it). + +**Sitemap namespace**: `http://www.sitemaps.org/schemas/sitemap/0.9` — required by the Sitemaps +protocol and the XSD schema referenced in SC-004. + +**Validation**: The serialised string must begin with `<?xml version="1.0" encoding="UTF-8"?>` followed by the `<urlset>` root. +Unit tests will assert this. + +**Alternatives considered**: +- Manual string concatenation: rejected (error-prone escaping, violates FR-008 which requires + xmlBuilder). +- `xmlbuilder` (v1/v2): not the installed package; rejected. 
+ +--- + +## R-004: Axios Error Differentiation — 502 vs 504 + +**Decision**: Reuse the exact error-detection pattern already present in the script: + +| Condition | Status | Detection | +|---|---|---| +| `err.response` is defined | 502 Bad Gateway | Axios sets `err.response` for non-2xx HTTP responses | +| `err.code === 'ECONNABORTED'` | 504 Gateway Timeout | Axios request `timeout` elapsed | +| `err.code === 'ERR_CANCELED'` | 504 Gateway Timeout | Request aborted or cancelled (e.g. via an `AbortSignal`) | +| Other | 502 Bad Gateway | Treated as upstream failure | + +**Rationale**: The existing script already uses this exact pattern for token-service errors +(`err.response`, `err.code === 'ECONNABORTED' || err.code === 'ERR_CANCELED'`). Reusing it for +search-service errors ensures consistent error classification across all upstream calls. + +**Timeout value**: 10 000 ms, as stated in the spec assumption ("consistent with industry-standard +defaults for proxy-initiated upstream requests"). + +**Alternatives considered**: +- `AbortController` + `fetch`: not available in the VM context (only `axios` is injected). Rejected. +- Different timeout for search vs auth: spec does not require this; YAGNI. + +--- + +## R-005: Settings Validation — New Fields + +**Decision**: At the entry point of the sitemap flow, perform an explicit guard before any async +operation: + +```javascript +const requiredSitemapFields = ['searchApiBaseUrl', 'tenant', 'proxyBaseUrl']; +for (const field of requiredSitemapFields) { + if (!kme_CSA_settings[field]) { + res.writeHead(500, { 'Content-Type': 'text/plain' }); + res.end('Configuration error: missing required field: ' + field); + return; + } +} +``` + +**Rationale**: FR-011 requires HTTP 500 with a descriptive message for missing settings. Checking +before any async work means no I/O is attempted against an unconfigured upstream, and the error +message identifies exactly which field is absent. 
+ +**The three new fields to add to `kme_CSA_settings.json`**: + +| Field | Type | Description | +|---|---|---| +| `searchApiBaseUrl` | string | Base URL of the KME Knowledge Search Service | +| `tenant` | string | Tenant identifier appended to search base URL | +| `proxyBaseUrl` | string | Externally accessible HTTPS URL of this adapter instance | + +--- + +## R-006: `loc` URL Construction and `vkm:url` Encoding + +**Decision**: Construct each `` as: + +```javascript +`${proxyBaseUrl}?kmeURL=${encodeURIComponent(item['vkm:url'])}` +``` + +**Rationale**: FR-005 specifies exactly this pattern. `encodeURIComponent` is a built-in available +inside the VM context without injection (it is a standard JavaScript global). Using it percent-encodes +the `vkm:url` value, producing a safe query-string parameter even if the value contains `://`, `?`, +`#`, or other URL-special characters. + +**Empty/missing guard** (FR-006): +```javascript +const vkmUrl = item['vkm:url']; +if (!vkmUrl) continue; // omit silently +``` + +--- + +## Summary of All Decisions + +| ID | Topic | Decision | +|---|---|---| +| R-001 | Token reuse | Inline shared token-fetch logic; branch on URL first | +| R-002 | Search API response shape | Assume `{ items: [...] 
}`; verify against live API | +| R-003 | xmlbuilder2 API | `xmlBuilder({...}).ele('urlset', {...})…doc.end({})` | +| R-004 | Error mapping | Reuse existing `err.response` / `err.code` pattern | +| R-005 | Settings validation | Explicit `requiredSitemapFields` guard → HTTP 500 | +| R-006 | `loc` construction | `proxyBaseUrl?kmeURL=encodeURIComponent(vkm:url)` | diff --git a/specs/002-sitemap-generation/spec.md b/specs/002-sitemap-generation/spec.md new file mode 100644 index 0000000..13dc736 --- /dev/null +++ b/specs/002-sitemap-generation/spec.md @@ -0,0 +1,108 @@ +# Feature Specification: Sitemap XML Generation + +**Feature Branch**: `002-sitemap-generation` +**Created**: 2025-07-14 +**Status**: Draft + +## User Scenarios & Testing *(mandatory)* + +### User Story 1 — Search Crawler Discovers KME Content (Priority: P1) + +A search engine crawler or sitemap consumer sends a `GET` request to the proxy adapter's sitemap endpoint. The adapter fetches all available knowledge items from the KME Knowledge Search Service and returns a standards-compliant `sitemap.xml` document that the crawler can index. + +**Why this priority**: This is the core deliverable. Without a valid `sitemap.xml` response, no downstream indexing or content discovery is possible. + +**Independent Test**: Can be fully tested by sending `GET /sitemap.xml` to a running adapter instance and verifying the returned XML body and `Content-Type` header, independent of all other routing behaviour. + +**Acceptance Scenarios**: + +1. **Given** the adapter is running and the KME Knowledge Search Service is available, **When** a consumer sends `GET /sitemap.xml`, **Then** the adapter responds with HTTP 200, `Content-Type: application/xml`, and a body that is a well-formed XML sitemap containing one `<url>`/`<loc>` entry per knowledge item returned by the search service. +2. **Given** each search result contains a `vkm:url` field, **When** the sitemap is generated, **Then** every `<loc>` value follows the pattern `<proxyBaseUrl>?kmeURL=<URL-encoded vkm:url>`. +3. 
**Given** the KME search service returns zero results, **When** the sitemap is generated, **Then** the adapter returns a valid, empty `<urlset>` document (no `<url>` elements) with HTTP 200. + +--- + +### User Story 2 — Non-Sitemap Requests Continue to Use Existing Auth Flow (Priority: P2) + +A client sends a request whose URL does *not* end in `/sitemap.xml`. The adapter executes the existing OIDC token-check flow (cache hit/miss, Redis, stampede guard) and responds `200 Authorized` or `401 Unauthorized` exactly as before. + +**Why this priority**: Backwards compatibility with the existing OIDC proxy behaviour must be preserved; a regression here would break all current integrations. + +**Independent Test**: Can be fully tested by sending any non-sitemap request and confirming the existing `200 Authorized` / `401 Unauthorized` response behaviour is unchanged. + +**Acceptance Scenarios**: + +1. **Given** a request URL that does not end in `/sitemap.xml`, **When** a valid cached OIDC token exists, **Then** the adapter responds `200 Authorized` with `Content-Type: text/plain`. +2. **Given** a request URL that does not end in `/sitemap.xml`, **When** no cached token exists, **Then** the adapter fetches a fresh OIDC token, caches it, and responds `200 Authorized`. +3. **Given** a request URL that does not end in `/sitemap.xml`, **When** the token service is unreachable, **Then** the adapter responds `401 Unauthorized` as it does today. + +--- + +### User Story 3 — Sitemap Request Fails Gracefully When Search API Is Unavailable (Priority: P3) + +When the KME Knowledge Search Service is unreachable or returns an error, the adapter returns a meaningful error response rather than hanging or crashing. + +**Why this priority**: Graceful degradation protects the wider proxy from silent failures and aids operator debugging. + +**Independent Test**: Can be fully tested by mocking the search API to return an error and confirming the adapter returns a 5xx response with a descriptive message. 
+ +**Acceptance Scenarios**: + +1. **Given** the Knowledge Search Service returns a non-2xx HTTP status, **When** the sitemap is requested, **Then** the adapter responds with HTTP 502 and a plain-text error message describing the upstream failure. +2. **Given** the Knowledge Search Service connection times out, **When** the sitemap is requested, **Then** the adapter responds with HTTP 504 and a plain-text message indicating a gateway timeout. + +--- + +### Edge Cases + +- What happens when the OIDC token is expired at the moment the sitemap request arrives? The same token-refresh logic used by the existing auth flow must be invoked before calling the search API. +- What happens when a knowledge item has a missing or empty `vkm:url` field? That item must be omitted from the sitemap rather than producing a malformed `<loc>` entry. +- What happens when the search API returns a very large number of results? The sitemap should include all returned results; pagination handling is out of scope for v1 (assumption documented below). +- What happens when `searchApiBaseUrl`, `tenant`, or `proxyBaseUrl` are missing from the settings file? The adapter must respond with a `500` error and a descriptive message. +- What happens when `xmlBuilder` is not available in the VM context? The adapter must respond with a `500` error. + +## Requirements *(mandatory)* + +### Functional Requirements + +- **FR-001**: The adapter MUST detect whether the incoming request URL ends with `/sitemap.xml` and route accordingly — to the sitemap generation flow or the existing OIDC auth flow. +- **FR-002**: When generating a sitemap, the adapter MUST retrieve knowledge items by calling the KME Knowledge Search Service at `<searchApiBaseUrl>/<tenant>` using a `GET` request. +- **FR-003**: Every Knowledge Search Service request MUST include an `Authorization` header with the value `OIDC_id_token <token>`, where `<token>` is the cached OIDC `id_token` obtained from Redis or refreshed using the existing stampede-guarded fetch logic. 
+- **FR-004**: The sitemap response MUST be a valid XML Sitemap conforming to the [Sitemaps protocol](https://www.sitemaps.org/protocol.html), with a `<urlset>` root element and one `<url>`/`<loc>` element per knowledge item. +- **FR-005**: Each `<loc>` value MUST be constructed as `<proxyBaseUrl>?kmeURL=<URL-encoded vkm:url>`, where `proxyBaseUrl` is taken from `kme_CSA_settings.proxyBaseUrl`. +- **FR-006**: Knowledge items with a missing or empty `vkm:url` field MUST be silently omitted from the sitemap. +- **FR-007**: The sitemap response MUST be returned with the HTTP header `Content-Type: application/xml`. +- **FR-008**: The XML MUST be built using the `xmlBuilder` utility already available in the VM context — no additional XML libraries may be imported. +- **FR-009**: The proxy script MUST contain zero `import` or `export` statements and MUST NOT reference `config`, `global.config`, or `process.env`. +- **FR-010**: `kme_CSA_settings.json` MUST be extended with three new fields: `searchApiBaseUrl`, `tenant`, and `proxyBaseUrl`. +- **FR-011**: If any required settings field (`searchApiBaseUrl`, `tenant`, `proxyBaseUrl`) is absent at runtime, the adapter MUST respond with HTTP 500 and a descriptive error message. +- **FR-012**: If the Knowledge Search Service responds with a non-2xx status, the adapter MUST respond with HTTP 502 and a plain-text description of the upstream error. +- **FR-013**: If the Knowledge Search Service connection times out, the adapter MUST respond with HTTP 504. + +### Key Entities + +- **Knowledge Item**: A document stored in KME, identified by a `vkm:url` field in the search result payload. The sitemap `<loc>` is derived from this URL. +- **Sitemap Entry**: A single `<url>`/`<loc>` element in the generated `sitemap.xml`, representing one indexable knowledge document URL accessible through the proxy adapter. +- **OIDC Token**: The cached `id_token` stored in Redis at `authorization.token`, used to authenticate calls to the Knowledge Search Service. 
+- **Settings**: Runtime configuration loaded from `kme_CSA_settings.json` and made available to the VM context as the `kme_CSA_settings` variable. + +## Success Criteria *(mandatory)* + +### Measurable Outcomes + +- **SC-001**: A consumer requesting `/sitemap.xml` receives a well-formed, valid XML Sitemap document in under 5 seconds under normal network conditions. +- **SC-002**: All knowledge items returned by the search service are represented in the sitemap; zero items are silently dropped unless their `vkm:url` is empty or missing. +- **SC-003**: All existing non-sitemap requests continue to receive the same response behaviour (`200 Authorized` / `401 Unauthorized`) with no change in response time or correctness — zero regressions. +- **SC-004**: The returned `sitemap.xml` passes validation against the [Sitemaps XSD schema](https://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd). +- **SC-005**: Error scenarios (upstream timeout, missing settings, unavailable search service) produce an appropriate HTTP error status code and a human-readable message within 10 seconds. + +## Assumptions + +- The KME Knowledge Search Service returns all relevant knowledge items in a single response for v1; pagination of search results is out of scope. +- The `vkm:url` field is present at the top level of each item object in the search results array; the exact response envelope shape will be confirmed against the live API during implementation. +- The `xmlBuilder` injected into the VM context is the `create` function of `xmlbuilder2` (the package already listed in `package.json`), exposing its `ele()` / `txt()` / `up()` / `end()` builder API. +- No additional `<lastmod>`, `<changefreq>`, or `<priority>` elements are required in sitemap entries for v1; only `<loc>` is mandatory. +- The proxy adapter is deployed behind a reverse proxy or load balancer that handles TLS termination; the `proxyBaseUrl` in settings reflects the externally accessible HTTPS URL. 
+- A single tenant is configured per adapter deployment; multi-tenant sitemap generation is out of scope. +- Search result items without a `vkm:url` field are considered malformed and are omitted without raising an error — this matches common defensive data-handling practice. +- The request timeout for the Knowledge Search Service call is 10 seconds, consistent with industry-standard defaults for proxy-initiated upstream requests. diff --git a/specs/002-sitemap-generation/tasks.md b/specs/002-sitemap-generation/tasks.md new file mode 100644 index 0000000..780dc18 --- /dev/null +++ b/specs/002-sitemap-generation/tasks.md @@ -0,0 +1,241 @@ +# Tasks: Sitemap XML Generation + +**Feature**: `002-sitemap-generation` +**Input**: Design documents from `/specs/002-sitemap-generation/` +**Prerequisites**: plan.md ✅ spec.md ✅ research.md ✅ data-model.md ✅ contracts/sitemap-endpoint.md ✅ quickstart.md ✅ + +**Tests**: Included — Constitution Principle III (Test-First Development) is **REQUIRED** for this feature. + +**Organization**: Tasks grouped by user story to enable independent implementation and testing. + +## Format: `[ID] [P?] [Story] Description` + +- **[P]**: Can run in parallel (different files, no dependencies on incomplete tasks) +- **[Story]**: User story this task belongs to (US1, US2, US3) +- Exact file paths in all descriptions + +--- + +## Phase 1: Setup (Configuration) + +**Purpose**: Extend the settings schema with the three new fields required by the sitemap flow. +These are pure JSON edits, independent of all code changes, and can be done in any order. + +- [X] T001 [P] Add `searchApiBaseUrl`, `tenant`, and `proxyBaseUrl` fields to `src/globalVariables/kme_CSA_settings.json` +- [X] T002 [P] Add `searchApiBaseUrl`, `tenant`, and `proxyBaseUrl` placeholder entries to `src/globalVariables/kme_CSA_settings.json.example` + +**Checkpoint**: Both settings files include all three new fields before Phase 2 begins. 
+ +--- + +## Phase 2: Foundational (Blocking Prerequisite) + +**Purpose**: Restructure the single-IIFE proxy script so both the sitemap flow and the existing +OIDC auth flow share a clean entry point. **No user-story work can begin until this is done.** + +- [X] T003 Restructure `src/proxyScripts/kmeContentSourceAdapter.js` IIFE + +**Checkpoint**: `npm run test:unit` passes all **existing** auth-flow tests with zero failures after the restructure. + +--- + +## Phase 3: User Story 1 — Search Crawler Discovers KME Content (Priority: P1) 🎯 MVP + +**Goal**: A consumer calling `GET /sitemap.xml` receives a well-formed XML Sitemap containing +one `/` per knowledge item, built via `xmlBuilder`, with `Content-Type: application/xml`. + +**Independent Test**: `curl http://localhost:3000/sitemap.xml` returns HTTP 200, +`Content-Type: application/xml`, and a body starting with ``. + +### Tests for User Story 1 ⚠️ Write first — confirm tests FAIL before implementing T006–T008 + +- [X] T004 [P] [US1] Add `describe('sitemap flow')` block to `tests/unit/proxy.test.js` with these three test cases (each creates a vm context via the existing `makeContext` helper with `req.url` set to `'/sitemap.xml'`): + - **Happy path — items present**: mock `axios.get` resolving `{ data: { items: [{ 'vkm:url': 'https://kme.example.com/doc-1' }, { 'vkm:url': 'https://kme.example.com/doc-2' }] } }` with settings including `searchApiBaseUrl`, `tenant`, `proxyBaseUrl`; assert `res.statusCode === 200`, `res.headers['Content-Type'] === 'application/xml'`, body contains `https://proxy.example.com?kmeURL=https%3A%2F%2Fkme.example.com%2Fdoc-1` + - **Happy path — zero items**: mock `axios.get` resolving `{ data: { items: [] } }`; assert 200, `application/xml`, body contains `` + - **Items with empty `vkm:url` filtered**: mock items array `[{ 'vkm:url': '' }, { 'vkm:url': 'https://kme.example.com/valid' }]`; assert body contains exactly one `` and it contains `valid` + +- [X] T005 [P] [US1] Add 
`describe('sitemap endpoint')` block to `tests/contract/proxy-http.test.js` with these two contract tests (each starts a real HTTP server that runs the proxy script in a vm context, using `startMockTokenServer` pattern for a mock search server alongside the existing mock token server): + - **Full round-trip GET /sitemap.xml**: mock search server returns `{ items: [{ 'vkm:url': 'https://kme.example.com/doc-1' }] }`; send real `axios.get('http://localhost:/sitemap.xml')`; assert status 200, `content-type` header contains `application/xml`, body is parseable XML containing `` + - **Empty results round-trip**: mock search server returns `{ items: [] }`; assert 200, `application/xml`, body contains `` element + +### Implementation for User Story 1 + +- [X] T006 [US1] Replace the `sitemapFlow()` stub in `src/proxyScripts/kmeContentSourceAdapter.js` with a settings validation guard: declare `const requiredSitemapFields = ['searchApiBaseUrl', 'tenant', 'proxyBaseUrl']`, loop over each field, and if `!kme_CSA_settings[field]` respond `res.writeHead(500, { 'Content-Type': 'text/plain' })` + `res.end('Configuration error: missing required field: ' + field)` + `return` (per FR-011 and R-005); add `const { searchApiBaseUrl, tenant, proxyBaseUrl } = kme_CSA_settings;` after the guard + +- [X] T007 [US1] Add token fetch and search API call to `sitemapFlow()` in `src/proxyScripts/kmeContentSourceAdapter.js`: call `const token = await getValidToken();` (throws on failure, caught by outer try/catch → 401), then call `const searchResponse = await axios.get(\`${searchApiBaseUrl}/${tenant}\`, { headers: { Authorization: \`OIDC_id_token ${token}\` }, timeout: 10_000 })`, then extract `const items = searchResponse.data.items ?? searchResponse.data ?? 
[];` (per R-002) + +- [X] T008 [US1] Add item mapping, XML build, and HTTP response to `sitemapFlow()` in `src/proxyScripts/kmeContentSourceAdapter.js`: iterate `items`, skip entries where `!item['vkm:url']` (FR-006), for each valid item compute `const loc = \`${proxyBaseUrl}?kmeURL=${encodeURIComponent(item['vkm:url'])}\`` (FR-005, R-006); build XML via `const doc = xmlBuilder({ version: '1.0', encoding: 'UTF-8' }); const urlset = doc.ele('urlset', { xmlns: 'http://www.sitemaps.org/schemas/sitemap/0.9' }); urlset.ele('url').ele('loc').txt(loc).up().up();` (FR-008, R-003); serialise with `const xml = doc.end({ prettyPrint: false })`; respond `res.writeHead(200, { 'Content-Type': 'application/xml' }); res.end(xml);` (FR-007) + +**Checkpoint**: `npm run test:unit` and `npm run test:contract` pass all sitemap happy-path tests. +At this point `GET /sitemap.xml` is fully functional; MVP is deliverable. + +--- + +## Phase 4: User Story 2 — Non-Sitemap Requests Preserve Existing Auth Flow (Priority: P2) + +**Goal**: Any request URL that does **not** end in `/sitemap.xml` continues to produce the same +`200 Authorized` / `401 Unauthorized` responses as before the refactoring in Phase 2. + +**Independent Test**: `curl http://localhost:3000/` returns `200 Authorized` when a valid +cached token exists; returns `401 Unauthorized` when the token service is unreachable. 
+ +### Tests for User Story 2 ⚠️ Write first — confirm tests FAIL or are absent before implementing + +- [X] T009 [P] [US2] Add `describe('non-sitemap URL routing')` block to `tests/unit/proxy.test.js` as a regression guard (if not already covered by existing tests): three test cases, each with `req.url = '/'` in the vm context: + - **Cache hit**: pre-populate Redis with a valid token and a future expiry timestamp; mock `axios.post` to fail (should never be called); assert `res.statusCode === 200`, body `=== 'Authorized'`, and `axios.post` was **not** called + - **Cache miss → fresh fetch**: Redis returns `null` for token; mock `axios.post` resolving `{ data: { id_token: 'tok', expires_in: 9999999999 } }`; assert 200 `Authorized` and that Redis `hSet` was called with `'authorization', 'token', 'tok'` + - **Token service down**: Redis returns `null`; mock `axios.post` rejecting with `{ code: 'ECONNABORTED' }`; assert `res.statusCode === 401`, body starts with `'Unauthorized:'` + +- [X] T010 [P] [US2] Add a `describe('non-sitemap endpoint (regression)')` block to `tests/contract/proxy-http.test.js`: one contract test — `GET /` with a real mock token server returning valid OIDC credentials; assert HTTP 200 and body `'Authorized'`; confirms the `oidcAuthFlow()` extraction in Phase 2 did not introduce a regression + +### Implementation for User Story 2 + +> The Phase 2 restructure (`oidcAuthFlow()` extraction) is the sole implementation for this story. +> If `npm run test:unit` passes all T009 cases after Phase 2, no additional code changes are needed. 
+ +- [X] T011 [US2] Review `oidcAuthFlow()` in `src/proxyScripts/kmeContentSourceAdapter.js` against the original script line-by-line: confirm the stampede guard (`_pendingFetch` promise, `resolvePending`/`rejectPending`), `hSet` cache write of both `token` and `expiry`, `console.debug`/`console.info`/`console.error` calls, and all error-path `res.writeHead(401)` / `res.end('Unauthorized: …')` responses are byte-for-byte identical to the pre-refactor behaviour; update any divergence found + +**Checkpoint**: `npm run test:unit` and `npm run test:contract` pass all non-sitemap tests with zero regressions. + +--- + +## Phase 5: User Story 3 — Sitemap Request Fails Gracefully (Priority: P3) + +**Goal**: When the KME Knowledge Search Service is unavailable or returns an error, the adapter +responds with a meaningful 5xx code and a human-readable message within 10 seconds. + +**Independent Test**: Mock the search server to respond 503; adapter returns 502 with body +`Search service error: HTTP 503`. Mock the search server to time out; adapter returns 504. 
+ +### Tests for User Story 3 ⚠️ Write first — confirm tests FAIL before implementing T013 + +- [X] T011 [P] [US3] Add error-scenario test cases to the existing `describe('sitemap flow')` block in `tests/unit/proxy.test.js` (append after T004 cases): + - **Upstream 503**: mock `axios.get` rejecting with `{ response: { status: 503 } }`; assert `res.statusCode === 502`, body contains `'Search service error: HTTP 503'` (FR-012) + - **Timeout ECONNABORTED**: mock `axios.get` rejecting with `{ code: 'ECONNABORTED' }`; assert `res.statusCode === 504`, body contains `'Search service timeout'` (FR-013) + - **Timeout ERR_CANCELED**: mock `axios.get` rejecting with `{ code: 'ERR_CANCELED' }`; assert `res.statusCode === 504`, body contains `'Search service timeout'` + - **Missing `searchApiBaseUrl`**: set `kme_CSA_settings.searchApiBaseUrl = undefined`; assert 500, body `=== 'Configuration error: missing required field: searchApiBaseUrl'` + - **Missing `tenant`**: set `kme_CSA_settings.tenant = undefined`; assert 500, body `=== 'Configuration error: missing required field: tenant'` + - **Missing `proxyBaseUrl`**: set `kme_CSA_settings.proxyBaseUrl = undefined`; assert 500, body `=== 'Configuration error: missing required field: proxyBaseUrl'` + +- [X] T012 [P] [US3] Add error-scenario contract tests to the existing `describe('sitemap endpoint')` block in `tests/contract/proxy-http.test.js`: + - **Search server returns 503**: mock search server responds 503; send real `GET /sitemap.xml`; assert HTTP 502 from adapter + - **Search server hangs >10 s**: mock search server accepts the connection but never responds; send `GET /sitemap.xml` with a 15 s client timeout; assert adapter responds 504 within 12 s (accounts for 10 s upstream timeout + adapter overhead) + +### Implementation for User Story 3 + +- [X] T013 [US3] Wrap the body of `sitemapFlow()` in `src/proxyScripts/kmeContentSourceAdapter.js` in a `try/catch` block (surrounding the search API call and XML generation in 
T007–T008, **after** the settings validation guard which remains outside): in the `catch (err)` handler, check `err.code === 'ECONNABORTED' || err.code === 'ERR_CANCELED'` → `res.writeHead(504, { 'Content-Type': 'text/plain' }); res.end('Search service timeout');`; else if `err.response` → `res.writeHead(502, { 'Content-Type': 'text/plain' }); res.end('Search service error: HTTP ' + err.response.status);`; else → `res.writeHead(502, { 'Content-Type': 'text/plain' }); res.end('Search service error: ' + err.message);` (per R-004 and contracts/sitemap-endpoint.md) + +**Checkpoint**: `npm run test:unit` and `npm run test:contract` pass all error-scenario tests. + +--- + +## Phase 6: Polish & Cross-Cutting Concerns + +**Purpose**: Constitution compliance, API shape verification, and final test suite green. + +- [X] T014 [P] Verify `src/proxyScripts/kmeContentSourceAdapter.js` constitution compliance: run `grep -n 'import\|export\|process\.env\|global\.config\b\|config\.' src/proxyScripts/kmeContentSourceAdapter.js` and confirm zero matches (FR-009, Constitution §I); confirm `xmlBuilder` is the sole XML-building mechanism (FR-008); confirm no new files were created in `src/` + +- [X] T015 [P] Verify live search API response shape against R-002 assumption: using a test token, call `GET ${searchApiBaseUrl}/${tenant}` manually with `curl -H "Authorization: OIDC_id_token " /` and confirm (a) the top-level key holding the items array (`items` vs `results` vs bare array) and (b) that `vkm:url` is a direct string property of each item; update the extraction line `response.data.items ?? 
response.data` in T007 if the actual shape differs + +- [X] T016 Run the full test suite `npm test` and confirm all unit and contract tests pass with zero failures, zero skipped tests, and no uncaught promise rejections + +--- + +## Dependencies + +``` +T001 ──────────────────────────────────────────────────────── (no deps, run any time) +T002 ──────────────────────────────────────────────────────── (no deps, run any time) +T003 ──────────────────────────────────────────────────────── (no deps, but do after T001/T002) +T004 ──────────── depends on T003 (needs restructured script to run in vm context) +T005 ──────────── depends on T003 +T006 ──────────── depends on T003, T004, T005 (test-first: tests written before impl) +T007 ──────────── depends on T006 +T008 ──────────── depends on T007 +T009 ──────────── depends on T003 (regression tests for existing flow; parallel with T004–T008) +T010 ──────────── depends on T003 +T011 [US2] ─────── depends on T003, T009, T010 +T011 [US3] ─────── depends on T003, T007 (error tests need the search call in place) +T012 ──────────── depends on T003, T007 +T013 ──────────── depends on T011[US3], T012 (tests written, confirmed failing) +T014 ──────────── depends on T003–T013 (final compliance check) +T015 ──────────── depends on T007 (search API shape may affect the items extraction line) +T016 ──────────── depends on all implementation tasks +``` + +> **Note on task ID collision**: T011 appears in both Phase 4 (US2 implementation review) and +> Phase 5 (US3 error-scenario unit tests). When tracking execution order, treat the Phase 4 task +> as T011a and the Phase 5 task as T011b. Recommended execution order: T011a before T011b +> (confirm US2 is clean before adding US3 error cases). 
+ +--- + +## Parallel Execution Examples + +### Within Phase 1 (both independent JSON edits): +``` +T001 ──────► done +T002 ──────► done +``` + +### After Phase 2 foundation, US1 tests and US2 tests can be written in parallel: +``` +T003 complete +├── T004 (US1 unit tests) ──────────► +├── T005 (US1 contract tests) ──────► +├── T009 (US2 unit tests) ──────────► all done → T006 → T007 → T008 → T011a +└── T010 (US2 contract tests) ───────► +``` + +### After T007, US3 tests can be written while US1 XML build (T008) proceeds: +``` +T007 complete +├── T008 (US1 XML build + response) ──────► +├── T011b (US3 unit tests) ────────────────► both done → T013 +└── T012 (US3 contract tests) ────────────► +``` + +### Final polish tasks are independent of each other: +``` +T014 (compliance check) ──────► +T015 (live API check) ────────► T016 (npm test) +``` + +--- + +## Implementation Strategy + +### MVP (User Story 1 only — Phases 1–3) + +Completing T001–T008 delivers the entire core value: +- `GET /sitemap.xml` returns a valid XML Sitemap for all KME knowledge items +- Zero breaking changes to existing non-sitemap behaviour (preserved by T003 restructure) +- Settings schema extended with the three new fields + +US2 (backwards compatibility) and US3 (graceful degradation) are additive hardening on top +of the MVP and can be delivered in a follow-up iteration if needed. + +### Incremental delivery order + +1. **Iteration 1** (MVP): T001 → T002 → T003 → T004 + T005 → T006 → T007 → T008 +2. **Iteration 2** (Hardening): T009 + T010 → T011a → T011b + T012 → T013 +3. **Iteration 3** (Polish): T014 + T015 → T016 + +--- + +## Format Validation + +All tasks follow the required checklist format: + +``` +- [ ] [TaskID] [P?] [Story?] 
Description with file path +``` + +| Check | Result | +|---|---| +| All tasks start with `- [ ]` checkbox | ✅ | +| All tasks have an ID in T001–T016 (T011 is used twice — tracked as T011a/T011b per the task ID collision note) | ⚠️ | +| `[P]` only on tasks modifying different files with no unmet dependencies | ✅ | +| `[US1]`/`[US2]`/`[US3]` labels only on user-story phase tasks | ✅ | +| Setup/Foundational/Polish tasks have no story label | ✅ | +| All tasks name at least one explicit file path | ✅ | diff --git a/src/globalVariables/kmeContentSourceAdapterHelpers.js b/src/globalVariables/kmeContentSourceAdapterHelpers.js new file mode 100644 index 0000000..0483f8d --- /dev/null +++ b/src/globalVariables/kmeContentSourceAdapterHelpers.js @@ -0,0 +1,128 @@ +// Helpers for kmeContentSourceAdapter.js +// This file is the literal body of a function — no imports or exports. +// server.js wraps and executes it as: (function() { <file contents> })() +// Context globals available: redis, axios, console, xmlBuilder, URLSearchParams, kme_CSA_settings + +/** + * Returns the first missing required field name, or null if all present. + * @param {object} settings + * @param {string[]} requiredFields + * @returns {string|null} + */ +function validateSettings(settings, requiredFields) { + for (const field of requiredFields) { + if (!settings[field]) return field; + } + return null; +} + +/** + * Extracts vkm:SearchResultItemFragment objects from the two-level hydra:member + * structure returned by the KME Knowledge Search Service: + * data["hydra:member"][n] → SearchResultItem + * data["hydra:member"][n]["hydra:member"] → SearchResultItemFragment[] (has vkm:url) + * @param {object} data – response.data from the search API + * @returns {object[]} + */ +function extractHydraItems(data) { + const topMembers = data['hydra:member'] ?? []; + return topMembers.flatMap(resultItem => resultItem['hydra:member'] ?? []); +} + +/** + * Builds a Sitemaps-protocol 0.9 XML document from the given items. + * Uses xmlBuilder from the enclosing VM context. 
+ * @param {object[]} items – SearchResultItemFragment objects with vkm:url + * @param {string} proxyBaseUrl – base URL for <loc> values + * @returns {string} serialised XML + */ +function buildSitemapXml(items, proxyBaseUrl) { + const doc = xmlBuilder({ version: '1.0', encoding: 'UTF-8' }); + const urlset = doc.ele('urlset', { xmlns: 'http://www.sitemaps.org/schemas/sitemap/0.9' }); + for (const item of items) { + const vkmUrl = item['vkm:url']; + if (!vkmUrl) continue; // silently omit items with empty/missing vkm:url + const loc = `${proxyBaseUrl}?kmeURL=${encodeURIComponent(vkmUrl)}`; + urlset.ele('url').ele('loc').txt(loc).up().up(); + } + return doc.end({ prettyPrint: false }); +} + +/** + * Obtains a valid OIDC id_token using the shared Redis cache and stampede guard. + * Closes over redis, kme_CSA_settings, axios, console, URLSearchParams from VM context. + * Throws on any failure — callers are responsible for error handling. + * @param {string} [reqUrl] – used only for debug logging + * @param {string} [reqMethod] – used only for debug logging + * @returns {Promise<string>} id_token + */ +async function getValidToken(reqUrl, reqMethod) { + const { tokenUrl, username, clientId, scope } = kme_CSA_settings; + + console.debug({ message: 'Checking token cache', url: reqUrl, method: reqMethod }); + const cachedToken = await redis.hGet('authorization', 'token'); + const expiry = parseFloat(await redis.hGet('authorization', 'expiry') ?? 
'0'); + const isValid = cachedToken !== null && Date.now() / 1000 < expiry; + + if (isValid) { + console.debug({ message: 'Token cache hit', expiresIn: Math.round(expiry - Date.now() / 1000) + 's' }); + return cachedToken; + } + + // Stampede guard — if a fetch is already in flight, queue on it + if (kme_CSA_settings._pendingFetch && typeof kme_CSA_settings._pendingFetch.then === 'function') { + console.debug({ message: 'Token fetch in flight, queuing request' }); + await kme_CSA_settings._pendingFetch; + console.debug({ message: 'Queued request unblocked, responding' }); + return await redis.hGet('authorization', 'token'); + } + + console.info({ message: 'Token cache miss, fetching fresh token', tokenUrl }); + const params = new URLSearchParams({ + grant_type: 'password', + username, + password: kme_CSA_settings.password, + client_id: clientId, + scope, + }); + + let resolvePending; + let rejectPending; + kme_CSA_settings._pendingFetch = new Promise((resolve, reject) => { + resolvePending = resolve; + rejectPending = reject; + }); + kme_CSA_settings._pendingFetch.catch(() => {}); + + try { + console.debug({ message: 'Requesting new token', url: tokenUrl, method: 'POST' }); + const response = await axios.post(tokenUrl, params, { + headers: { 'Content-Type': 'application/x-www-form-urlencoded' }, + timeout: 5000, + }); + + const { id_token, expires_in } = response.data; + if (!id_token) throw new Error('id_token missing from response'); + if (!expires_in) throw new Error('expires_in missing from response'); + + await redis.hSet('authorization', 'token', id_token); + await redis.hSet('authorization', 'expiry', String(expires_in)); + console.info({ message: 'Token fetched and cached', expiresAt: new Date(expires_in * 1000).toISOString() }); + + resolvePending(); + return id_token; + } catch (fetchErr) { + console.error({ message: 'Token fetch failed', error: fetchErr.message, code: fetchErr.code }); + rejectPending(fetchErr); + throw fetchErr; + } finally { + 
kme_CSA_settings._pendingFetch = null; + } +} + +return { + validateSettings, + extractHydraItems, + buildSitemapXml, + getValidToken, +}; diff --git a/src/globalVariables/kme_CSA_settings.json.example b/src/globalVariables/kme_CSA_settings.json.example index 973903f..c48daf9 100644 --- a/src/globalVariables/kme_CSA_settings.json.example +++ b/src/globalVariables/kme_CSA_settings.json.example @@ -3,5 +3,8 @@ "username": "service-account@example.com", "password": "changeme", "clientId": "kme-content-adapter", - "scope": "openid tags content_entitlements" + "scope": "openid tags content_entitlements", + "searchApiBaseUrl": "https:///api/search", + "tenant": "", + "proxyBaseUrl": "https://" } diff --git a/src/proxyScripts/kmeContentSourceAdapter.js b/src/proxyScripts/kmeContentSourceAdapter.js index 3971f5e..b6197dd 100644 --- a/src/proxyScripts/kmeContentSourceAdapter.js +++ b/src/proxyScripts/kmeContentSourceAdapter.js @@ -1,89 +1,88 @@ (async () => { - try { - // 1. Validate required kme_CSA_settings fields - const requiredFields = ['tokenUrl', 'username', 'password', 'clientId', 'scope']; - for (const field of requiredFields) { - if (!kme_CSA_settings[field]) { - throw new Error('missing required field: ' + field); - } - } + // --------------------------------------------------------------------------- + // OIDC auth flow — existing non-sitemap behaviour, unchanged + // --------------------------------------------------------------------------- + async function oidcAuthFlow() { + const missingField = kmeContentSourceAdapterHelpers.validateSettings( + kme_CSA_settings, + ['tokenUrl', 'username', 'password', 'clientId', 'scope'], + ); + if (missingField) throw new Error('missing required field: ' + missingField); - const { tokenUrl, username, clientId, scope } = kme_CSA_settings; + await kmeContentSourceAdapterHelpers.getValidToken(req.url, req.method); - // 2. 
Read token cache from Redis - console.debug({ message: 'Checking token cache', url: req.url, method: req.method }); - const token = await redis.hGet('authorization', 'token'); - const expiry = parseFloat(await redis.hGet('authorization', 'expiry') ?? '0'); - const isValid = token !== null && Date.now() / 1000 < expiry; - - // 3. Cache HIT → respond immediately - if (isValid) { - console.debug({ message: 'Token cache hit', expiresIn: Math.round(expiry - Date.now() / 1000) + 's' }); - res.writeHead(200, { 'Content-Type': 'text/plain' }); - res.end('Authorized'); - return; - } - - // 4. Stampede guard — if a fetch is already in flight, queue on it - if (kme_CSA_settings._pendingFetch && typeof kme_CSA_settings._pendingFetch.then === 'function') { - console.debug({ message: 'Token fetch in flight, queuing request' }); - await kme_CSA_settings._pendingFetch; - console.debug({ message: 'Queued request unblocked, responding' }); - res.writeHead(200, { 'Content-Type': 'text/plain' }); - res.end('Authorized'); - return; - } - - // 5. 
Cache MISS → fetch fresh token - console.info({ message: 'Token cache miss, fetching fresh token', tokenUrl }); - const params = new URLSearchParams({ - grant_type: 'password', - username, - password: kme_CSA_settings.password, - client_id: clientId, - scope, - }); - - // Set up stampede guard before fetching - let resolvePending; - let rejectPending; - kme_CSA_settings._pendingFetch = new Promise((resolve, reject) => { - resolvePending = resolve; - rejectPending = reject; - }); - // Prevent an unhandled-rejection when no concurrent request is waiting on this promise - kme_CSA_settings._pendingFetch.catch(() => {}); - - try { - console.debug({ message: 'Requesting new token', url: tokenUrl, method: 'POST' }); - const response = await axios.post(tokenUrl, params, { - headers: { 'Content-Type': 'application/x-www-form-urlencoded' }, - timeout: 5000, - }); - - const { id_token, expires_in } = response.data; - if (!id_token) throw new Error('id_token missing from response'); - if (!expires_in) throw new Error('expires_in missing from response'); - - // 6. Write to Redis cache - await redis.hSet('authorization', 'token', id_token); - await redis.hSet('authorization', 'expiry', String(expires_in)); - console.info({ message: 'Token fetched and cached', expiresAt: new Date(expires_in * 1000).toISOString() }); - - // Resolve the pending fetch promise so waiting requests can proceed - resolvePending(); - } catch (fetchErr) { - console.error({ message: 'Token fetch failed', error: fetchErr.message, code: fetchErr.code }); - rejectPending(fetchErr); - throw fetchErr; - } finally { - kme_CSA_settings._pendingFetch = null; - } - - // 7. 
Respond success res.writeHead(200, { 'Content-Type': 'text/plain' }); res.end('Authorized'); + } + // --------------------------------------------------------------------------- + // Sitemap flow — GET /sitemap.xml + // --------------------------------------------------------------------------- + async function sitemapFlow() { + const missingSitemapField = kmeContentSourceAdapterHelpers.validateSettings( + kme_CSA_settings, + ['searchApiBaseUrl', 'tenant', 'proxyBaseUrl'], + ); + if (missingSitemapField) { + console.error({ message: 'Sitemap config error', missingField: missingSitemapField }); + res.writeHead(500, { 'Content-Type': 'text/plain' }); + res.end('Configuration error: missing required field: ' + missingSitemapField); + return; + } + + const { searchApiBaseUrl, tenant, proxyBaseUrl } = kme_CSA_settings; + + const missingOidcField = kmeContentSourceAdapterHelpers.validateSettings( + kme_CSA_settings, + ['tokenUrl', 'username', 'password', 'clientId', 'scope'], + ); + if (missingOidcField) throw new Error('missing required field: ' + missingOidcField); + + try { + console.debug({ message: 'Sitemap flow: obtaining token', url: req.url }); + const token = await kmeContentSourceAdapterHelpers.getValidToken(req.url, req.method); + + const searchUrl = `${searchApiBaseUrl}/${tenant}/search?query=*&size=100&category=vkm:ArticleCategory`; + console.info({ message: 'Sitemap flow: calling search API', url: searchUrl }); + const searchResponse = await axios.get(searchUrl, { + headers: { Authorization: `OIDC_id_token ${token}`, 'Accept': 'application/ld+json' }, + timeout: 10000, + }); + + const items = kmeContentSourceAdapterHelpers.extractHydraItems(searchResponse.data); + console.debug({ message: 'Sitemap flow: items received', count: items.length }); + + const xml = kmeContentSourceAdapterHelpers.buildSitemapXml(items, proxyBaseUrl); + console.info({ message: 'Sitemap flow: sending response', items: items.length }); + res.writeHead(200, { 'Content-Type': 
'application/xml' }); + res.end(xml); + + } catch (err) { + if (err.code === 'ECONNABORTED' || err.code === 'ERR_CANCELED') { + console.error({ message: 'Sitemap flow: search service timeout', code: err.code }); + res.writeHead(504, { 'Content-Type': 'text/plain' }); + res.end('Search service timeout'); + } else if (err.response) { + console.error({ message: 'Sitemap flow: search service error', status: err.response.status }); + res.writeHead(502, { 'Content-Type': 'text/plain' }); + res.end('Search service error: HTTP ' + err.response.status); + } else { + console.error({ message: 'Sitemap flow: unexpected error', error: err.message }); + res.writeHead(502, { 'Content-Type': 'text/plain' }); + res.end('Search service error: ' + err.message); + } + } + } + + // --------------------------------------------------------------------------- + // Entry point — URL routing + // --------------------------------------------------------------------------- + try { + if (req.url.endsWith('/sitemap.xml')) { + await sitemapFlow(); + } else { + await oidcAuthFlow(); + } } catch (err) { let message; if (err.response) { diff --git a/tests/contract/proxy-http.test.js b/tests/contract/proxy-http.test.js index a8cddbd..d1b9478 100644 --- a/tests/contract/proxy-http.test.js +++ b/tests/contract/proxy-http.test.js @@ -6,6 +6,7 @@ import { readFileSync } from 'node:fs'; import { fileURLToPath } from 'node:url'; import { dirname, join } from 'node:path'; import axios from 'axios'; +import { create as xmlBuilder } from 'xmlbuilder2'; const __filename = fileURLToPath(import.meta.url); const __dirname = dirname(__filename); @@ -14,13 +15,23 @@ const proxyPath = join(__dirname, '../../src/proxyScripts/kmeContentSourceAdapte const proxyCode = readFileSync(proxyPath, 'utf-8'); const proxyScript = new vm.Script(proxyCode, { filename: 'kmeContentSourceAdapter.js' }); +const helpersPath = join(__dirname, '../../src/globalVariables/kmeContentSourceAdapterHelpers.js'); +const helpersCode = 
readFileSync(helpersPath, 'utf-8'); +const helpersWrapped = `(function() {\n${helpersCode}\n})()`; +const helpersScript = new vm.Script(helpersWrapped, { filename: 'kmeContentSourceAdapterHelpers.js' }); + +/** Evaluate the helpers file with the provided deps (mirrors server.js loadGlobalVariables). */ +function makeHelpers(deps) { + return helpersScript.runInContext(vm.createContext(deps)); +} + /** - * Start a minimal HTTP server that handles all POST requests with a fixed JSON body. + * Start a minimal HTTP server that handles all requests with a fixed JSON body. * @param {number} statusCode * @param {object} responseBody * @returns {Promise<{ server: http.Server, url: string, close: () => Promise }>} */ -function startMockTokenServer(statusCode, responseBody) { +function startMockServer(statusCode, responseBody) { return new Promise((resolve, reject) => { const server = http.createServer((req, res) => { res.writeHead(statusCode, { 'Content-Type': 'application/json' }); @@ -36,6 +47,11 @@ function startMockTokenServer(statusCode, responseBody) { }); } +/** + * Start a mock token server (alias for backwards compatibility). + */ +const startMockTokenServer = startMockServer; + /** Build an in-memory Redis fake. 
*/ function makeRedisFake() { const _store = {}; @@ -72,18 +88,18 @@ describe('proxy HTTP contract: 200 OK', () => { try { const res = makeRes(); + const redis = makeRedisFake(); + const kme_CSA_settings = { + tokenUrl: mock.url, + username: 'user', + password: 'pass', + clientId: 'client', + scope: 'openid', + }; + const deps = { URLSearchParams, console, axios, xmlBuilder, redis, kme_CSA_settings }; const ctx = vm.createContext({ - URLSearchParams, - console, - axios, - redis: makeRedisFake(), - kme_CSA_settings: { - tokenUrl: mock.url, - username: 'user', - password: 'pass', - clientId: 'client', - scope: 'openid', - }, + ...deps, + kmeContentSourceAdapterHelpers: makeHelpers(deps), req: { url: '/', method: 'GET', headers: {} }, res, }); @@ -109,18 +125,18 @@ describe('proxy HTTP contract: 401 Unauthorized', () => { try { const res = makeRes(); + const redis = makeRedisFake(); + const kme_CSA_settings = { + tokenUrl: mock.url, + username: 'bad-user', + password: 'bad-pass', + clientId: 'client', + scope: 'openid', + }; + const deps = { URLSearchParams, console, axios, xmlBuilder, redis, kme_CSA_settings }; const ctx = vm.createContext({ - URLSearchParams, - console, - axios, - redis: makeRedisFake(), - kme_CSA_settings: { - tokenUrl: mock.url, - username: 'bad-user', - password: 'bad-pass', - clientId: 'client', - scope: 'openid', - }, + ...deps, + kmeContentSourceAdapterHelpers: makeHelpers(deps), req: { url: '/', method: 'GET', headers: {} }, res, }); @@ -135,3 +151,156 @@ describe('proxy HTTP contract: 401 Unauthorized', () => { } }); }); + +// --------------------------------------------------------------------------- +// Contract: sitemap endpoint (T005, T012) +// --------------------------------------------------------------------------- + +describe('sitemap endpoint', () => { + /** + * Build a VM context wired to a real token server and a real search server. + * The token cache is pre-seeded so no real token exchange is needed. 
+ */ + function makeSitemapCtx({ searchUrl, tokenUrl }) { + const redis = makeRedisFake(); + // Pre-seed a valid token so no token fetch is needed + redis.hSet('authorization', 'token', 'sitemap-contract-token'); + redis.hSet('authorization', 'expiry', '9999999999'); + + const res = makeRes(); + const kme_CSA_settings = { + tokenUrl: tokenUrl ?? 'http://127.0.0.1:1', // not used (cache hit) + username: 'user', + password: 'pass', + clientId: 'client', + scope: 'openid', + searchApiBaseUrl: searchUrl, + tenant: 'test', + proxyBaseUrl: 'https://proxy.example.com', + }; + const deps = { URLSearchParams, console, axios, xmlBuilder, redis, kme_CSA_settings }; + const ctx = vm.createContext({ + ...deps, + kmeContentSourceAdapterHelpers: makeHelpers(deps), + req: { url: '/sitemap.xml', method: 'GET', headers: {} }, + res, + }); + ctx._res = res; + return ctx; + } + + test('full round-trip GET /sitemap.xml → 200 application/xml with loc elements', async () => { + const searchMock = await startMockServer(200, { + 'hydra:member': [ + { 'hydra:member': [{ 'vkm:url': 'https://kme.example.com/doc-1' }] }, + ], + }); + + try { + const ctx = makeSitemapCtx({ searchUrl: searchMock.url }); + await proxyScript.runInContext(ctx); + + assert.strictEqual(ctx._res.statusCode, 200); + assert.ok(ctx._res.headers['Content-Type'].includes('application/xml'), + `Content-Type was: ${ctx._res.headers['Content-Type']}`); + assert.ok(ctx._res.body.startsWith(''), 'body should contain a loc element'); + } finally { + await searchMock.close(); + } + }); + + test('empty results round-trip → 200 application/xml with urlset and no url element', async () => { + const searchMock = await startMockServer(200, { 'hydra:member': [] }); + + try { + const ctx = makeSitemapCtx({ searchUrl: searchMock.url }); + await proxyScript.runInContext(ctx); + + assert.strictEqual(ctx._res.statusCode, 200); + assert.ok(ctx._res.headers['Content-Type'].includes('application/xml'), + `Content-Type was: 
${ctx._res.headers['Content-Type']}`); + assert.ok(ctx._res.body.includes(''), 'body should not contain url elements for empty results'); + } finally { + await searchMock.close(); + } + }); + + test('search server returns 503 → adapter returns 502', async () => { + const searchMock = await startMockServer(503, { error: 'Service Unavailable' }); + + try { + const ctx = makeSitemapCtx({ searchUrl: searchMock.url }); + await proxyScript.runInContext(ctx); + + assert.strictEqual(ctx._res.statusCode, 502, `body was: ${ctx._res.body}`); + } finally { + await searchMock.close(); + } + }); + + test('search server hangs > 10s → adapter returns 504 within 12s', async () => { + // Server that accepts connections but never responds + const server = await new Promise((resolve, reject) => { + const s = http.createServer(() => { /* intentionally hang */ }); + s.listen(0, '127.0.0.1', () => { + const { port } = s.address(); + const close = () => new Promise((res, rej) => s.close(err => err ? rej(err) : res())); + resolve({ server: s, url: `http://127.0.0.1:${port}`, close }); + }); + s.once('error', reject); + }); + + try { + const ctx = makeSitemapCtx({ searchUrl: server.url }); + const start = Date.now(); + await proxyScript.runInContext(ctx); + const elapsed = Date.now() - start; + + assert.strictEqual(ctx._res.statusCode, 504, `body was: ${ctx._res.body}`); + assert.ok(elapsed < 12000, `Should respond within 12s, took ${elapsed}ms`); + } finally { + await server.close(); + } + }); +}); + +// --------------------------------------------------------------------------- +// Non-sitemap endpoint regression (T010) +// --------------------------------------------------------------------------- + +describe('non-sitemap endpoint (regression)', () => { + test('GET / with valid OIDC credentials → 200 Authorized', async () => { + const mock = await startMockTokenServer(200, { + id_token: 'regression-token', + expires_in: 9_999_999_999, + }); + + try { + const res = makeRes(); + const 
redis = makeRedisFake(); + const kme_CSA_settings = { + tokenUrl: mock.url, + username: 'user', + password: 'pass', + clientId: 'client', + scope: 'openid', + }; + const deps = { URLSearchParams, console, axios, xmlBuilder, redis, kme_CSA_settings }; + const ctx = vm.createContext({ + ...deps, + kmeContentSourceAdapterHelpers: makeHelpers(deps), + req: { url: '/', method: 'GET', headers: {} }, + res, + }); + + await proxyScript.runInContext(ctx); + + assert.strictEqual(res.statusCode, 200); + assert.strictEqual(res.body, 'Authorized'); + } finally { + await mock.close(); + } + }); +}); diff --git a/tests/unit/proxy.test.js b/tests/unit/proxy.test.js index f544308..eb278c9 100644 --- a/tests/unit/proxy.test.js +++ b/tests/unit/proxy.test.js @@ -4,6 +4,7 @@ import vm from 'node:vm'; import { readFileSync } from 'node:fs'; import { fileURLToPath } from 'node:url'; import { dirname, join } from 'node:path'; +import { create as xmlBuilder } from 'xmlbuilder2'; const __filename = fileURLToPath(import.meta.url); const __dirname = dirname(__filename); @@ -12,6 +13,19 @@ const proxyPath = join(__dirname, '../../src/proxyScripts/kmeContentSourceAdapte const proxyCode = readFileSync(proxyPath, 'utf-8'); const proxyScript = new vm.Script(proxyCode, { filename: 'kmeContentSourceAdapter.js' }); +const helpersPath = join(__dirname, '../../src/globalVariables/kmeContentSourceAdapterHelpers.js'); +const helpersCode = readFileSync(helpersPath, 'utf-8'); +const helpersWrapped = `(function() {\n${helpersCode}\n})()`; +const helpersScript = new vm.Script(helpersWrapped, { filename: 'kmeContentSourceAdapterHelpers.js' }); + +/** + * Evaluate the helpers file in a context built from the provided deps, returning + * the helpers object. Mirrors how server.js loads globalVariables/ JS files. + */ +function makeHelpers(deps) { + return helpersScript.runInContext(vm.createContext(deps)); +} + /** * Build a minimal VM context satisfying the vm-context contract. 
* @param {import('node:test').TestContext} t @@ -42,7 +56,7 @@ function makeContext(t, overrides = {}) { get headers() { return headers; }, }; - const kme_CSA_settings = { + const defaultSettings = { tokenUrl: 'https://auth.example.com/token', username: 'testuser', password: 'testpass', @@ -50,18 +64,39 @@ function makeContext(t, overrides = {}) { scope: 'openid', }; - const axiosMock = { + const defaultAxiosMock = { post: t.mock.fn(async () => ({ data: { id_token: 'mock-token', expires_in: 9_999_999_999 }, })), + get: t.mock.fn(async () => ({ + data: { 'hydra:member': [] }, + })), }; + // Resolve the final axios and settings — overrides take precedence. + // Helpers must close over the SAME axios/settings that the VM context will use, + // otherwise tests that pass error-throwing axios overrides would get helpers + // that still use the success-returning default. + const resolvedAxios = overrides.axios ?? defaultAxiosMock; + const resolvedSettings = overrides.kme_CSA_settings ?? defaultSettings; + + const kmeContentSourceAdapterHelpers = makeHelpers({ + URLSearchParams, + console, + axios: resolvedAxios, + redis, + kme_CSA_settings: resolvedSettings, + xmlBuilder, + }); + const ctx = vm.createContext({ URLSearchParams, console, - axios: axiosMock, + axios: resolvedAxios, redis, - kme_CSA_settings, + kme_CSA_settings: defaultSettings, + xmlBuilder, + kmeContentSourceAdapterHelpers, req: { url: '/', method: 'GET', headers: {} }, res, ...overrides, @@ -71,7 +106,7 @@ function makeContext(t, overrides = {}) { ctx._redis = redis; ctx._res = res; ctx._store = _store; - ctx._axios = axiosMock; + ctx._axios = resolvedAxios; return ctx; } @@ -157,7 +192,7 @@ describe('US3: authentication failure handling', () => { response: { status: 401 }, }); const ctx = makeContext(t, { - axios: { post: t.mock.fn(async () => { throw axiosError; }) }, + axios: { post: t.mock.fn(async () => { throw axiosError; }), get: t.mock.fn() }, }); await runScript(ctx); @@ -169,7 +204,7 @@ 
describe('US3: authentication failure handling', () => { test('timeout (ECONNABORTED) → 401 Unauthorized: token service timeout', async (t) => { const axiosError = Object.assign(new Error('timeout'), { code: 'ECONNABORTED' }); const ctx = makeContext(t, { - axios: { post: t.mock.fn(async () => { throw axiosError; }) }, + axios: { post: t.mock.fn(async () => { throw axiosError; }), get: t.mock.fn() }, }); await runScript(ctx); @@ -181,7 +216,7 @@ describe('US3: authentication failure handling', () => { test('timeout (ERR_CANCELED) → 401 Unauthorized: token service timeout', async (t) => { const axiosError = Object.assign(new Error('canceled'), { code: 'ERR_CANCELED' }); const ctx = makeContext(t, { - axios: { post: t.mock.fn(async () => { throw axiosError; }) }, + axios: { post: t.mock.fn(async () => { throw axiosError; }), get: t.mock.fn() }, }); await runScript(ctx); @@ -194,6 +229,7 @@ describe('US3: authentication failure handling', () => { const ctx = makeContext(t, { axios: { post: t.mock.fn(async () => ({ data: { expires_in: 9999 } })), + get: t.mock.fn(), }, }); @@ -207,6 +243,7 @@ describe('US3: authentication failure handling', () => { const ctx = makeContext(t, { axios: { post: t.mock.fn(async () => ({ data: { id_token: 'a-token' } })), + get: t.mock.fn(), }, }); @@ -267,7 +304,7 @@ describe('stampede guard', () => { await new Promise(resolve => setTimeout(resolve, 50)); return { data: { id_token: 'stampede-token', expires_in: 9_999_999_999 } }; }); - const sharedAxios = { post: mockAxiosPost }; + const sharedAxios = { post: mockAxiosPost, get: t.mock.fn() }; // Build two contexts sharing kme_CSA_settings, redis, and axios references function makeRes(tctx) { @@ -284,15 +321,23 @@ describe('stampede guard', () => { const res1 = makeRes(t); const res2 = makeRes(t); + // Helpers must share the same redis/kme_CSA_settings/axios so the stampede guard works + const sharedHelpers = makeHelpers({ + URLSearchParams, console, axios: sharedAxios, + redis, 
kme_CSA_settings, xmlBuilder, + }); + const ctx1 = vm.createContext({ URLSearchParams, console, axios: sharedAxios, - redis, kme_CSA_settings, + redis, kme_CSA_settings, xmlBuilder, + kmeContentSourceAdapterHelpers: sharedHelpers, req: { url: '/', method: 'GET', headers: {} }, res: res1, }); const ctx2 = vm.createContext({ URLSearchParams, console, axios: sharedAxios, - redis, kme_CSA_settings, + redis, kme_CSA_settings, xmlBuilder, + kmeContentSourceAdapterHelpers: sharedHelpers, req: { url: '/', method: 'GET', headers: {} }, res: res2, }); @@ -309,3 +354,205 @@ describe('stampede guard', () => { assert.strictEqual(res2.body, 'Authorized'); }); }); + +// --------------------------------------------------------------------------- +// Sitemap flow — US1 (T004) +// --------------------------------------------------------------------------- + +describe('sitemap flow', () => { + function makeSitemapContext(t, axiosGetImpl, settingsOverrides = {}) { + const ctx = makeContext(t, { + req: { url: '/sitemap.xml', method: 'GET', headers: {} }, + }); + // Add sitemap-specific settings + ctx.kme_CSA_settings.searchApiBaseUrl = 'https://search.example.com/api'; + ctx.kme_CSA_settings.tenant = 'test-tenant'; + ctx.kme_CSA_settings.proxyBaseUrl = 'https://proxy.example.com'; + Object.assign(ctx.kme_CSA_settings, settingsOverrides); + + // Pre-seed token cache so getValidToken() returns immediately + ctx._store['authorization:token'] = 'sitemap-token'; + ctx._store['authorization:expiry'] = '9999999999'; + + // Replace axios.get with the provided implementation + ctx._axios.get = t.mock.fn(axiosGetImpl ??
(async () => ({ + data: { 'hydra:member': [] }, + }))); + + return ctx; + } + + test('happy path — items present → 200 with correct XML and loc values', async (t) => { + const ctx = makeSitemapContext(t, async () => ({ + data: { + 'hydra:member': [ + { 'hydra:member': [{ 'vkm:url': 'https://kme.example.com/doc-1' }] }, + { 'hydra:member': [{ 'vkm:url': 'https://kme.example.com/doc-2' }] }, + ], + }, + })); + + await runScript(ctx); + + assert.strictEqual(ctx._res.statusCode, 200); + assert.strictEqual(ctx._res.headers['Content-Type'], 'application/xml'); + // Each expected loc is proxyBaseUrl plus the URL-encoded KME document URL in the kmeURL query parameter + assert.ok(ctx._res.body.includes('https://proxy.example.com?kmeURL=https%3A%2F%2Fkme.example.com%2Fdoc-1'), + 'body should contain encoded loc for doc-1', + ); + assert.ok( + ctx._res.body.includes('https://proxy.example.com?kmeURL=https%3A%2F%2Fkme.example.com%2Fdoc-2'), + 'body should contain encoded loc for doc-2', + ); + }); + + test('happy path — zero items → 200 with empty urlset', async (t) => { + const ctx = makeSitemapContext(t, async () => ({ data: { 'hydra:member': [] } })); + + await runScript(ctx); + + assert.strictEqual(ctx._res.statusCode, 200); + assert.strictEqual(ctx._res.headers['Content-Type'], 'application/xml'); + // The urlset root must still be emitted, just without any url children + assert.ok(ctx._res.body.includes('<urlset'), 'body should contain the urlset root element'); + assert.ok(!ctx._res.body.includes('<url>'), 'body should not contain url elements'); + }); + + test('items with empty vkm:url filtered — only valid items appear', async (t) => { + const ctx = makeSitemapContext(t, async () => ({ + data: { + 'hydra:member': [ + { 'hydra:member': [{ 'vkm:url': '' }] }, + { 'hydra:member': [{ 'vkm:url': 'https://kme.example.com/valid' }] }, + ], + }, + })); + + await runScript(ctx); + + assert.strictEqual(ctx._res.statusCode, 200); + // Count loc elements: the item whose vkm:url is empty must not produce one + const locMatches = ctx._res.body.match(/<loc>/g); + assert.strictEqual(locMatches?.length ??
0, 1, 'exactly one <loc> element expected'); + assert.ok(ctx._res.body.includes('valid'), 'the valid URL should appear in the loc'); + }); + + // US3 error scenarios (T011b) + + test('upstream 503 → 502 with Search service error message', async (t) => { + const searchErr = Object.assign(new Error('Request failed with status code 503'), { + response: { status: 503 }, + }); + const ctx = makeSitemapContext(t, async () => { throw searchErr; }); + + await runScript(ctx); + + assert.strictEqual(ctx._res.statusCode, 502); + assert.ok(ctx._res.body.includes('Search service error: HTTP 503'), `body was: ${ctx._res.body}`); + }); + + test('timeout ECONNABORTED → 504 Search service timeout', async (t) => { + const timeoutErr = Object.assign(new Error('timeout'), { code: 'ECONNABORTED' }); + const ctx = makeSitemapContext(t, async () => { throw timeoutErr; }); + + await runScript(ctx); + + assert.strictEqual(ctx._res.statusCode, 504); + assert.ok(ctx._res.body.includes('Search service timeout'), `body was: ${ctx._res.body}`); + }); + + test('timeout ERR_CANCELED → 504 Search service timeout', async (t) => { + const timeoutErr = Object.assign(new Error('canceled'), { code: 'ERR_CANCELED' }); + const ctx = makeSitemapContext(t, async () => { throw timeoutErr; }); + + await runScript(ctx); + + assert.strictEqual(ctx._res.statusCode, 504); + assert.ok(ctx._res.body.includes('Search service timeout'), `body was: ${ctx._res.body}`); + }); + + test('missing searchApiBaseUrl → 500 Configuration error', async (t) => { + const ctx = makeSitemapContext(t, null, { searchApiBaseUrl: undefined }); + + await runScript(ctx); + + assert.strictEqual(ctx._res.statusCode, 500); + assert.strictEqual(ctx._res.body, 'Configuration error: missing required field: searchApiBaseUrl'); + }); + + test('missing tenant → 500 Configuration error', async (t) => { + const ctx = makeSitemapContext(t, null, { tenant: undefined }); + + await runScript(ctx); + + assert.strictEqual(ctx._res.statusCode, 500); +
assert.strictEqual(ctx._res.body, 'Configuration error: missing required field: tenant'); + }); + + test('missing proxyBaseUrl → 500 Configuration error', async (t) => { + const ctx = makeSitemapContext(t, null, { proxyBaseUrl: undefined }); + + await runScript(ctx); + + assert.strictEqual(ctx._res.statusCode, 500); + assert.strictEqual(ctx._res.body, 'Configuration error: missing required field: proxyBaseUrl'); + }); +}); + +// --------------------------------------------------------------------------- +// Non-sitemap URL routing — regression guard (T009) +// --------------------------------------------------------------------------- + +describe('non-sitemap URL routing', () => { + test('cache hit → no fetch → 200 Authorized', async (t) => { + const ctx = makeContext(t, { + req: { url: '/', method: 'GET', headers: {} }, + axios: { + post: t.mock.fn(async () => { throw new Error('should not be called'); }), + get: t.mock.fn(), + }, + }); + // Pre-seed valid token + ctx._store['authorization:token'] = 'cached-tok'; + ctx._store['authorization:expiry'] = '9999999999'; + + await runScript(ctx); + + assert.strictEqual(ctx._res.statusCode, 200); + assert.strictEqual(ctx._res.body, 'Authorized'); + // axios.post was set to throw, so if it was called the test would fail + }); + + test('cache miss → fresh fetch → 200 Authorized', async (t) => { + const ctx = makeContext(t, { + req: { url: '/', method: 'GET', headers: {} }, + }); + // No pre-seeded token → cache miss + + await runScript(ctx); + + assert.strictEqual(ctx._res.statusCode, 200); + assert.strictEqual(ctx._res.body, 'Authorized'); + // Verify token was written to Redis + const hSetCalls = ctx._redis.hSet.mock.calls; + const tokenCall = hSetCalls.find(c => c.arguments[0] === 'authorization' && c.arguments[1] === 'token'); + assert.ok(tokenCall, 'hSet should be called with token'); + assert.strictEqual(tokenCall.arguments[2], 'mock-token'); + }); + + test('token service down (ECONNABORTED) → 401 Unauthorized',
async (t) => { + const timeoutErr = Object.assign(new Error('timeout'), { code: 'ECONNABORTED' }); + const ctx = makeContext(t, { + req: { url: '/', method: 'GET', headers: {} }, + axios: { + post: t.mock.fn(async () => { throw timeoutErr; }), + get: t.mock.fn(), + }, + }); + + await runScript(ctx); + + assert.strictEqual(ctx._res.statusCode, 401); + assert.ok(ctx._res.body.startsWith('Unauthorized:'), `body was: ${ctx._res.body}`); + }); +});