diff --git a/src/globalVariables/kmeContentSourceAdapterHelpers.js b/src/globalVariables/kmeContentSourceAdapterHelpers.js
index a2102c0..3eca3b1 100644
--- a/src/globalVariables/kmeContentSourceAdapterHelpers.js
+++ b/src/globalVariables/kmeContentSourceAdapterHelpers.js
@@ -21,12 +21,25 @@ function validateSettings(settings, requiredFields) {
* structure returned by the KME Knowledge Search Service:
* data["hydra:member"][n] → SearchResultItem
* data["hydra:member"][n]["hydra:member"] → SearchResultItemFragment[] (has vkm:url)
+ *
+ * For each SearchResultItem, only the fragment with the latest vkm:datePublished
+ * is returned. If no vkm:datePublished is present the fragment is treated as
+ * epoch 0, so dated fragments always win over undated ones.
+ *
* @param {object} data – response.data from the search API
* @returns {object[]}
*/
function extractHydraItems(data) {
  // Ranking key for a fragment, in epoch milliseconds.
  // A missing vkm:datePublished ranks as epoch 0. An unparseable one must rank
  // the same way: Date#getTime() yields NaN for it, and every comparison
  // against NaN is false, so a malformed date on the first fragment would
  // otherwise beat every validly dated fragment that follows.
  const publishedAt = (fragment) => {
    const time = new Date(fragment['vkm:datePublished'] ?? 0).getTime();
    return Number.isNaN(time) ? 0 : time;
  };

  const topMembers = data['hydra:member'] ?? [];
  return topMembers
    .map((resultItem) => {
      const fragments = resultItem['hydra:member'] ?? [];
      // A SearchResultItem without fragments contributes nothing.
      if (fragments.length === 0) return null;
      // Strict ">" keeps the earliest fragment when timestamps tie.
      return fragments.reduce((latest, current) =>
        (publishedAt(current) > publishedAt(latest) ? current : latest));
    })
    .filter(Boolean);
}
/**
diff --git a/src/proxyScripts/kmeContentSourceAdapter.js b/src/proxyScripts/kmeContentSourceAdapter.js
index c150395..616d210 100644
--- a/src/proxyScripts/kmeContentSourceAdapter.js
+++ b/src/proxyScripts/kmeContentSourceAdapter.js
@@ -101,9 +101,17 @@
return;
}
- // Step 9: Return HTML
+ // Step 9: Return HTML wrapped in a full document
+ const title = data['vkm:name'] ?? '';
+ const html = `
+
+
${title}
+
+${articleBody}
+
+`;
res.writeHead(200, { 'Content-Type': 'text/html; charset=utf-8' });
- res.end(articleBody);
+ res.end(html);
}
// ---------------------------------------------------------------------------
@@ -143,19 +151,57 @@
const token = await kmeContentSourceAdapterHelpers.getValidToken(req.url, req.method);
const reqParams = new URL(req.url, 'http://localhost').searchParams;
+ const pageSize = reqParams.get('size') ?? '100';
const searchParams = new URLSearchParams({
query: reqParams.get('query') ?? '*',
- size: reqParams.get('size') ?? '100',
+ size: pageSize,
category: reqParams.get('category') ?? 'vkm:ArticleCategory',
});
const searchUrl = `${searchApiBaseUrl}/${tenant}/search?${searchParams}`;
- console.info({ message: 'Sitemap flow: calling search API', url: searchUrl });
- const searchResponse = await axios.get(searchUrl, {
+ console.info({ message: 'Sitemap flow: calling search API (page 1)', url: searchUrl });
+ const firstResponse = await axios.get(searchUrl, {
headers: { Authorization: `OIDC_id_token ${token}`, 'Accept': 'application/ld+json' },
timeout: 10000,
});
- const items = kmeContentSourceAdapterHelpers.extractHydraItems(searchResponse.data);
+ const firstData = firstResponse.data;
+ let allData = [firstData];
+
+ // Paginate: hydra:last is nested inside hydra:view.
+ // hydra:view is absent when all results fit on one page — no pagination needed.
+ // start= is a 0-based item index; subsequent page start values increment by size.
+ // e.g. 22 results, size=5 → hydra:view.hydra:last start=20, pages at start=5,10,15,20
+ const hydraLast = firstData['hydra:view']?.['hydra:last'];
+ if (hydraLast) {
+ const lastUrl = new URL(hydraLast);
+ const lastStart = parseInt(lastUrl.searchParams.get('start') ?? '0', 10);
+ const size = parseInt(lastUrl.searchParams.get('size') ?? pageSize, 10);
+ if (lastStart > 0 && size > 0) {
+ const pageUrls = [];
+ for (let start = size; start <= lastStart; start += size) {
+ const pageUrl = new URL(searchUrl);
+ pageUrl.searchParams.set('start', String(start));
+ pageUrls.push(pageUrl.toString());
+ }
+ console.info({ message: 'Sitemap flow: fetching additional pages', count: pageUrls.length });
+ const pageResponses = await Promise.all(
+ pageUrls.map(url => axios.get(url, {
+ headers: { Authorization: `OIDC_id_token ${token}`, 'Accept': 'application/ld+json' },
+ timeout: 10000,
+ }))
+ );
+ allData = [firstData, ...pageResponses.map(r => r.data)];
+ }
+ }
+
+ const SITEMAP_MAX_URLS = 50_000;
+ const allItems = allData.flatMap(
+ data => kmeContentSourceAdapterHelpers.extractHydraItems(data)
+ );
+ const items = allItems.length > SITEMAP_MAX_URLS ? allItems.slice(0, SITEMAP_MAX_URLS) : allItems;
+ if (allItems.length > SITEMAP_MAX_URLS) {
+ console.warn({ message: 'Sitemap flow: result set truncated to 50,000 (sitemaps.org limit)', total: allItems.length });
+ }
console.debug({ message: 'Sitemap flow: items received', count: items.length });
const xml = kmeContentSourceAdapterHelpers.buildSitemapXml(items, proxyBaseUrl);
diff --git a/tests/contract/proxy-http.test.js b/tests/contract/proxy-http.test.js
index 588f0d0..9e3c1db 100644
--- a/tests/contract/proxy-http.test.js
+++ b/tests/contract/proxy-http.test.js
@@ -200,7 +200,7 @@ describe('sitemap endpoint', () => {
describe('content fetch: happy path', () => {
test('GET /?kmeURL= → 200 text/html with article body (SC-001 < 11s)', async () => {
// Mock content server returning a valid article JSON-LD response
- const contentMock = await startMockServer(200, { 'vkm:articleBody': 'Contract test article
' });
+ const contentMock = await startMockServer(200, { 'vkm:name': 'Contract Article', 'vkm:articleBody': 'Contract test article
' });
try {
const redis = makeRedisFake();
@@ -233,7 +233,9 @@ describe('content fetch: happy path', () => {
res.headers['Content-Type'].startsWith('text/html'),
`Content-Type was: ${res.headers['Content-Type']}`,
);
- assert.strictEqual(res.body, 'Contract test article
');
+ assert.ok(res.body.includes(''), 'body should contain DOCTYPE');
+ assert.ok(res.body.includes('Contract Article'), 'body should contain title');
+ assert.ok(res.body.includes('Contract test article
'), 'body should contain article content verbatim');
assert.ok(elapsed < 11000, `Round-trip should be under 11 s, took ${elapsed}ms`);
} finally {
await contentMock.close();
diff --git a/tests/unit/proxy.test.js b/tests/unit/proxy.test.js
index 01de64f..1c874cf 100644
--- a/tests/unit/proxy.test.js
+++ b/tests/unit/proxy.test.js
@@ -240,8 +240,8 @@ describe('sitemap flow', () => {
const ctx = makeSitemapContext(t, async () => ({
data: {
'hydra:member': [
- { 'hydra:member': [{ 'vkm:url': 'https://kme.example.com/doc-1' }] },
- { 'hydra:member': [{ 'vkm:url': 'https://kme.example.com/doc-2' }] },
+ { 'hydra:member': [{ 'vkm:url': 'https://kme.example.com/doc-1', 'vkm:datePublished': '2024-01-01T00:00:00Z' }] },
+ { 'hydra:member': [{ 'vkm:url': 'https://kme.example.com/doc-2', 'vkm:datePublished': '2024-06-01T00:00:00Z' }] },
],
},
}));
@@ -291,6 +291,139 @@ describe('sitemap flow', () => {
assert.ok(ctx._res.body.includes('valid'), 'the valid URL should appear in the loc');
});
+ test('multiple fragments per SearchResultItem → only latest vkm:datePublished wins', async (t) => {
+ const ctx = makeSitemapContext(t, async () => ({
+ data: {
+ 'hydra:member': [
+ {
+ 'hydra:member': [
+ { 'vkm:url': 'https://kme.example.com/doc/v1', 'vkm:datePublished': '2023-01-01T00:00:00Z' },
+ { 'vkm:url': 'https://kme.example.com/doc/v3', 'vkm:datePublished': '2024-06-01T00:00:00Z' },
+ { 'vkm:url': 'https://kme.example.com/doc/v2', 'vkm:datePublished': '2023-12-01T00:00:00Z' },
+ ],
+ },
+ ],
+ },
+ }));
+
+ await runScript(ctx);
+
+ assert.strictEqual(ctx._res.statusCode, 200);
+ const locMatches = ctx._res.body.match(//g);
+ assert.strictEqual(locMatches?.length ?? 0, 1, 'exactly one element (latest version only)');
+ assert.ok(ctx._res.body.includes('doc%2Fv3'), 'the latest fragment (v3) should be the loc');
+ assert.ok(!ctx._res.body.includes('doc%2Fv1'), 'older fragment v1 should not appear');
+ assert.ok(!ctx._res.body.includes('doc%2Fv2'), 'older fragment v2 should not appear');
+ });
+
+ // Pagination: hydra:last nested inside hydra:view drives multi-page fetching.
+ // hydra:view is absent when all results fit on one page — no pagination needed.
+ // e.g. 22 results, size=5 → hydra:view['hydra:last'] start=20, fetch start=5,10,15,20
+
+ test('hydra:last (22 results, size=5, start=20) → fetches 4 extra pages, all 5 pages combined', async (t) => {
+ // Simulate the example from the spec: 22 results, page size 5
+ // First call has no start param; subsequent pages: start=5,10,15,20
+ const base = 'https://search.example.com/api/test-tenant/search?query=*&size=5&category=vkm%3AArticleCategory';
+ const pageData = {
+ [`${base}`]: {
+ 'hydra:view': { 'hydra:last': `${base}&start=20` },
+ 'hydra:member': [
+ { 'hydra:member': [{ 'vkm:url': 'https://kme.example.com/doc-p1', 'vkm:datePublished': '2024-01-01T00:00:00Z' }] },
+ ],
+ },
+ [`${base}&start=5`]: {
+ 'hydra:member': [
+ { 'hydra:member': [{ 'vkm:url': 'https://kme.example.com/doc-p2', 'vkm:datePublished': '2024-02-01T00:00:00Z' }] },
+ ],
+ },
+ [`${base}&start=10`]: {
+ 'hydra:member': [
+ { 'hydra:member': [{ 'vkm:url': 'https://kme.example.com/doc-p3', 'vkm:datePublished': '2024-03-01T00:00:00Z' }] },
+ ],
+ },
+ [`${base}&start=15`]: {
+ 'hydra:member': [
+ { 'hydra:member': [{ 'vkm:url': 'https://kme.example.com/doc-p4', 'vkm:datePublished': '2024-04-01T00:00:00Z' }] },
+ ],
+ },
+ [`${base}&start=20`]: {
+ 'hydra:member': [
+ { 'hydra:member': [{ 'vkm:url': 'https://kme.example.com/doc-p5', 'vkm:datePublished': '2024-05-01T00:00:00Z' }] },
+ ],
+ },
+ };
+
+ // Build context with size=5 in the request URL
+ const ctx = makeContext(t, {
+ req: { url: '/sitemap.xml?size=5', method: 'GET', headers: { host: 'proxy.example.com', 'x-forwarded-proto': 'https' } },
+ });
+ ctx.kme_CSA_settings.searchApiBaseUrl = 'https://search.example.com/api';
+ ctx.kme_CSA_settings.tenant = 'test-tenant';
+ ctx._store['authorization:token'] = 'sitemap-token';
+ ctx._store['authorization:expiry'] = '9999999999';
+ ctx._axios.get = t.mock.fn(async (url) => ({ data: pageData[url] ?? { 'hydra:member': [] } }));
+
+ await runScript(ctx);
+
+ assert.strictEqual(ctx._res.statusCode, 200);
+ assert.strictEqual(ctx._axios.get.mock.calls.length, 5, 'should make 5 GET calls (start 0,5,10,15,20)');
+ const locMatches = ctx._res.body.match(//g);
+ assert.strictEqual(locMatches?.length ?? 0, 5, 'all 5 items from all pages should appear');
+ assert.ok(ctx._res.body.includes('doc-p1'));
+ assert.ok(ctx._res.body.includes('doc-p5'));
+ });
+
+ test('hydra:view absent (all results on one page) → no additional pages fetched', async (t) => {
+ const ctx = makeSitemapContext(t, async () => ({
+ data: {
+ // No hydra:view — all 22 results fit in size=50
+ 'hydra:member': [
+ { 'hydra:member': [{ 'vkm:url': 'https://kme.example.com/only-doc', 'vkm:datePublished': '2024-01-01T00:00:00Z' }] },
+ ],
+ },
+ }));
+
+ await runScript(ctx);
+
+ assert.strictEqual(ctx._res.statusCode, 200);
+ assert.strictEqual(ctx._axios.get.mock.calls.length, 1, 'only one GET call when hydra:view absent');
+ const locMatches = ctx._res.body.match(//g);
+ assert.strictEqual(locMatches?.length ?? 0, 1);
+ });
+
+ test('hydra:view present but hydra:last start=0 → no additional pages fetched', async (t) => {
+ const ctx = makeSitemapContext(t, async () => ({
+ data: {
+ 'hydra:view': { 'hydra:last': 'https://search.example.com/api/test-tenant/search?query=*&size=100&category=vkm%3AArticleCategory&start=0' },
+ 'hydra:member': [
+ { 'hydra:member': [{ 'vkm:url': 'https://kme.example.com/only-doc', 'vkm:datePublished': '2024-01-01T00:00:00Z' }] },
+ ],
+ },
+ }));
+
+ await runScript(ctx);
+
+ assert.strictEqual(ctx._res.statusCode, 200);
+ assert.strictEqual(ctx._axios.get.mock.calls.length, 1, 'only one GET call when hydra:last start=0');
+ const locMatches = ctx._res.body.match(//g);
+ assert.strictEqual(locMatches?.length ?? 0, 1);
+ });
+
+ test('more than 50,000 items → sitemap truncated to exactly 50,000 elements', async (t) => {
+ const LIMIT = 50_000;
+ // Build a response with LIMIT + 5 items
+ const members = Array.from({ length: LIMIT + 5 }, (_, i) => ({
+ 'hydra:member': [{ 'vkm:url': `https://kme.example.com/doc-${i}`, 'vkm:datePublished': '2024-01-01T00:00:00Z' }],
+ }));
+ const ctx = makeSitemapContext(t, async () => ({ data: { 'hydra:member': members } }));
+
+ await runScript(ctx);
+
+ assert.strictEqual(ctx._res.statusCode, 200);
+ const locMatches = ctx._res.body.match(//g);
+ assert.strictEqual(locMatches?.length ?? 0, LIMIT, `should be capped at ${LIMIT}`);
+ });
+
// US3 error scenarios (T011b)
test('upstream 503 → 502 with Search service error message', async (t) => {
@@ -394,10 +527,10 @@ describe('extractArticleBody helper', () => {
// ---------------------------------------------------------------------------
describe('US-content-fetch: happy path', () => {
- test('cached token + valid article response → 200 text/html with body', async (t) => {
+ test('cached token + valid article response → 200 text/html with body and title', async (t) => {
const contentAxios = {
post: t.mock.fn(async () => ({ data: { id_token: 'mock-token', expires_in: 9_999_999_999 } })),
- get: t.mock.fn(async () => ({ data: { 'vkm:articleBody': 'Hello
' } })),
+ get: t.mock.fn(async () => ({ data: { 'vkm:name': 'My Article', 'vkm:articleBody': 'Hello
' } })),
};
const ctx = makeContext(t, {
req: { url: '/?kmeURL=https://kme.example.com/content/article/123', method: 'GET', headers: {} },
@@ -410,18 +543,18 @@ describe('US-content-fetch: happy path', () => {
await runScript(ctx);
assert.strictEqual(ctx._res.statusCode, 200);
- assert.ok(
- ctx._res.headers['Content-Type'].startsWith('text/html'),
- `Content-Type was: ${ctx._res.headers['Content-Type']}`,
- );
- assert.strictEqual(ctx._res.body, 'Hello
');
+ assert.ok(ctx._res.headers['Content-Type'].startsWith('text/html'), `Content-Type was: ${ctx._res.headers['Content-Type']}`);
+ assert.ok(ctx._res.body.includes(''), 'body should contain DOCTYPE');
+ assert.ok(ctx._res.body.includes('My Article'), 'body should contain title');
+ assert.ok(ctx._res.body.includes('Hello
'), 'body should contain article content verbatim');
+ assert.ok(!ctx._res.body.includes(''), 'article content should not be double-wrapped in
');
assert.strictEqual(contentAxios.post.mock.calls.length, 0, 'should not re-fetch token on cache hit');
});
test('cache miss (fresh token acquired) → 200 text/html with body', async (t) => {
const contentAxios = {
post: t.mock.fn(async () => ({ data: { id_token: 'fresh-token', expires_in: 9_999_999_999 } })),
- get: t.mock.fn(async () => ({ data: { 'vkm:articleBody': '
Hello
' } })),
+ get: t.mock.fn(async () => ({ data: { 'vkm:name': 'Fresh Article', 'vkm:articleBody': 'Hello
' } })),
};
const ctx = makeContext(t, {
req: { url: '/?kmeURL=https://kme.example.com/content/article/123', method: 'GET', headers: {} },
@@ -432,13 +565,31 @@ describe('US-content-fetch: happy path', () => {
await runScript(ctx);
assert.strictEqual(ctx._res.statusCode, 200);
- assert.ok(
- ctx._res.headers['Content-Type'].startsWith('text/html'),
- `Content-Type was: ${ctx._res.headers['Content-Type']}`,
- );
- assert.strictEqual(ctx._res.body, 'Hello
');
+ assert.ok(ctx._res.headers['Content-Type'].startsWith('text/html'), `Content-Type was: ${ctx._res.headers['Content-Type']}`);
+ assert.ok(ctx._res.body.includes(''), 'body should contain DOCTYPE');
+ assert.ok(ctx._res.body.includes('Fresh Article'), 'body should contain title');
+ assert.ok(ctx._res.body.includes('Hello
'), 'body should contain article content');
assert.strictEqual(contentAxios.post.mock.calls.length, 1, 'should have fetched fresh token');
});
+
+ test('vkm:name absent → title element is empty', async (t) => {
+ const contentAxios = {
+ post: t.mock.fn(),
+ get: t.mock.fn(async () => ({ data: { 'vkm:articleBody': 'No title
' } })),
+ };
+ const ctx = makeContext(t, {
+ req: { url: '/?kmeURL=https://kme.example.com/content/article/123', method: 'GET', headers: {} },
+ axios: contentAxios,
+ });
+ ctx._store['authorization:token'] = 'cached-token';
+ ctx._store['authorization:expiry'] = '9999999999';
+
+ await runScript(ctx);
+
+ assert.strictEqual(ctx._res.statusCode, 200);
+ assert.ok(ctx._res.body.includes(''), 'title should be empty when vkm:name absent');
+ assert.ok(ctx._res.body.includes('No title
'));
+ });
});
// ---------------------------------------------------------------------------