diff --git a/src/globalVariables/kmeContentSourceAdapterHelpers.js b/src/globalVariables/kmeContentSourceAdapterHelpers.js index a2102c0..3eca3b1 100644 --- a/src/globalVariables/kmeContentSourceAdapterHelpers.js +++ b/src/globalVariables/kmeContentSourceAdapterHelpers.js @@ -21,12 +21,25 @@ function validateSettings(settings, requiredFields) { * structure returned by the KME Knowledge Search Service: * data["hydra:member"][n] → SearchResultItem * data["hydra:member"][n]["hydra:member"] → SearchResultItemFragment[] (has vkm:url) + * + * For each SearchResultItem, only the fragment with the latest vkm:datePublished + * is returned. If no vkm:datePublished is present the fragment is treated as + * epoch 0, so dated fragments always win over undated ones. + * * @param {object} data – response.data from the search API * @returns {object[]} */ function extractHydraItems(data) { const topMembers = data['hydra:member'] ?? []; - return topMembers.flatMap(resultItem => resultItem['hydra:member'] ?? []); + return topMembers.map(resultItem => { + const fragments = resultItem['hydra:member'] ?? []; + if (fragments.length === 0) return null; + return fragments.reduce((latest, current) => { + const latestDate = new Date(latest['vkm:datePublished'] ?? 0).getTime(); + const currentDate = new Date(current['vkm:datePublished'] ?? 0).getTime(); + return currentDate > latestDate ? current : latest; + }); + }).filter(Boolean); } /** diff --git a/src/proxyScripts/kmeContentSourceAdapter.js b/src/proxyScripts/kmeContentSourceAdapter.js index c150395..616d210 100644 --- a/src/proxyScripts/kmeContentSourceAdapter.js +++ b/src/proxyScripts/kmeContentSourceAdapter.js @@ -101,9 +101,17 @@ return; } - // Step 9: Return HTML + // Step 9: Return HTML wrapped in a full document + const title = data['vkm:name'] ?? ''; + const html = ` + +${title} + +${articleBody} + +`; res.writeHead(200, { 'Content-Type': 'text/html; charset=utf-8' }); - res.end(articleBody); + res.end(html); } // --------------------------------------------------------------------------- @@ -143,19 +151,57 @@ const token = await kmeContentSourceAdapterHelpers.getValidToken(req.url, req.method); const reqParams = new URL(req.url, 'http://localhost').searchParams; + const pageSize = reqParams.get('size') ?? '100'; const searchParams = new URLSearchParams({ query: reqParams.get('query') ?? '*', - size: reqParams.get('size') ?? '100', + size: pageSize, category: reqParams.get('category') ?? 'vkm:ArticleCategory', }); const searchUrl = `${searchApiBaseUrl}/${tenant}/search?${searchParams}`; - console.info({ message: 'Sitemap flow: calling search API', url: searchUrl }); - const searchResponse = await axios.get(searchUrl, { + console.info({ message: 'Sitemap flow: calling search API (page 1)', url: searchUrl }); + const firstResponse = await axios.get(searchUrl, { headers: { Authorization: `OIDC_id_token ${token}`, 'Accept': 'application/ld+json' }, timeout: 10000, }); - const items = kmeContentSourceAdapterHelpers.extractHydraItems(searchResponse.data); + const firstData = firstResponse.data; + let allData = [firstData]; + + // Paginate: hydra:last is nested inside hydra:view. + // hydra:view is absent when all results fit on one page — no pagination needed. + // start= is a 0-based item index; subsequent page start values increment by size. + // e.g. 22 results, size=5 → hydra:view.hydra:last start=20, pages at start=5,10,15,20 + const hydraLast = firstData['hydra:view']?.['hydra:last']; + if (hydraLast) { + const lastUrl = new URL(hydraLast); + const lastStart = parseInt(lastUrl.searchParams.get('start') ?? '0', 10); + const size = parseInt(lastUrl.searchParams.get('size') ?? pageSize, 10); + if (lastStart > 0 && size > 0) { + const pageUrls = []; + for (let start = size; start <= lastStart; start += size) { + const pageUrl = new URL(searchUrl); + pageUrl.searchParams.set('start', String(start)); + pageUrls.push(pageUrl.toString()); + } + console.info({ message: 'Sitemap flow: fetching additional pages', count: pageUrls.length }); + const pageResponses = await Promise.all( + pageUrls.map(url => axios.get(url, { + headers: { Authorization: `OIDC_id_token ${token}`, 'Accept': 'application/ld+json' }, + timeout: 10000, + })) + ); + allData = [firstData, ...pageResponses.map(r => r.data)]; + } + } + + const SITEMAP_MAX_URLS = 50_000; + const allItems = allData.flatMap( + data => kmeContentSourceAdapterHelpers.extractHydraItems(data) + ); + const items = allItems.length > SITEMAP_MAX_URLS ? allItems.slice(0, SITEMAP_MAX_URLS) : allItems; + if (allItems.length > SITEMAP_MAX_URLS) { + console.warn({ message: 'Sitemap flow: result set truncated to 50,000 (sitemaps.org limit)', total: allItems.length }); + } console.debug({ message: 'Sitemap flow: items received', count: items.length }); const xml = kmeContentSourceAdapterHelpers.buildSitemapXml(items, proxyBaseUrl); diff --git a/tests/contract/proxy-http.test.js b/tests/contract/proxy-http.test.js index 588f0d0..9e3c1db 100644 --- a/tests/contract/proxy-http.test.js +++ b/tests/contract/proxy-http.test.js @@ -200,7 +200,7 @@ describe('sitemap endpoint', () => { describe('content fetch: happy path', () => { test('GET /?kmeURL= → 200 text/html with article body (SC-001 < 11s)', async () => { // Mock content server returning a valid article JSON-LD response - const contentMock = await startMockServer(200, { 'vkm:articleBody': '

Contract test article

' }); + const contentMock = await startMockServer(200, { 'vkm:name': 'Contract Article', 'vkm:articleBody': '

Contract test article

' }); try { const redis = makeRedisFake(); @@ -233,7 +233,9 @@ describe('content fetch: happy path', () => { res.headers['Content-Type'].startsWith('text/html'), `Content-Type was: ${res.headers['Content-Type']}`, ); - assert.strictEqual(res.body, '

Contract test article

'); + assert.ok(res.body.includes(''), 'body should contain DOCTYPE'); + assert.ok(res.body.includes('Contract Article'), 'body should contain title'); + assert.ok(res.body.includes('

Contract test article

'), 'body should contain article content verbatim'); assert.ok(elapsed < 11000, `Round-trip should be under 11 s, took ${elapsed}ms`); } finally { await contentMock.close(); diff --git a/tests/unit/proxy.test.js b/tests/unit/proxy.test.js index 01de64f..1c874cf 100644 --- a/tests/unit/proxy.test.js +++ b/tests/unit/proxy.test.js @@ -240,8 +240,8 @@ describe('sitemap flow', () => { const ctx = makeSitemapContext(t, async () => ({ data: { 'hydra:member': [ - { 'hydra:member': [{ 'vkm:url': 'https://kme.example.com/doc-1' }] }, - { 'hydra:member': [{ 'vkm:url': 'https://kme.example.com/doc-2' }] }, + { 'hydra:member': [{ 'vkm:url': 'https://kme.example.com/doc-1', 'vkm:datePublished': '2024-01-01T00:00:00Z' }] }, + { 'hydra:member': [{ 'vkm:url': 'https://kme.example.com/doc-2', 'vkm:datePublished': '2024-06-01T00:00:00Z' }] }, ], }, })); @@ -291,6 +291,139 @@ describe('sitemap flow', () => { assert.ok(ctx._res.body.includes('valid'), 'the valid URL should appear in the loc'); }); + test('multiple fragments per SearchResultItem → only latest vkm:datePublished wins', async (t) => { + const ctx = makeSitemapContext(t, async () => ({ + data: { + 'hydra:member': [ + { + 'hydra:member': [ + { 'vkm:url': 'https://kme.example.com/doc/v1', 'vkm:datePublished': '2023-01-01T00:00:00Z' }, + { 'vkm:url': 'https://kme.example.com/doc/v3', 'vkm:datePublished': '2024-06-01T00:00:00Z' }, + { 'vkm:url': 'https://kme.example.com/doc/v2', 'vkm:datePublished': '2023-12-01T00:00:00Z' }, + ], + }, + ], + }, + })); + + await runScript(ctx); + + assert.strictEqual(ctx._res.statusCode, 200); + const locMatches = ctx._res.body.match(//g); + assert.strictEqual(locMatches?.length ?? 0, 1, 'exactly one element (latest version only)'); + assert.ok(ctx._res.body.includes('doc%2Fv3'), 'the latest fragment (v3) should be the loc'); + assert.ok(!ctx._res.body.includes('doc%2Fv1'), 'older fragment v1 should not appear'); + assert.ok(!ctx._res.body.includes('doc%2Fv2'), 'older fragment v2 should not appear'); + }); + + // Pagination: hydra:last nested inside hydra:view drives multi-page fetching. + // hydra:view is absent when all results fit on one page — no pagination needed. + // e.g. 22 results, size=5 → hydra:view['hydra:last'] start=20, fetch start=5,10,15,20 + + test('hydra:last (22 results, size=5, start=20) → fetches 4 extra pages, all 5 pages combined', async (t) => { + // Simulate the example from the spec: 22 results, page size 5 + // First call has no start param; subsequent pages: start=5,10,15,20 + const base = 'https://search.example.com/api/test-tenant/search?query=*&size=5&category=vkm%3AArticleCategory'; + const pageData = { + [`${base}`]: { + 'hydra:view': { 'hydra:last': `${base}&start=20` }, + 'hydra:member': [ + { 'hydra:member': [{ 'vkm:url': 'https://kme.example.com/doc-p1', 'vkm:datePublished': '2024-01-01T00:00:00Z' }] }, + ], + }, + [`${base}&start=5`]: { + 'hydra:member': [ + { 'hydra:member': [{ 'vkm:url': 'https://kme.example.com/doc-p2', 'vkm:datePublished': '2024-02-01T00:00:00Z' }] }, + ], + }, + [`${base}&start=10`]: { + 'hydra:member': [ + { 'hydra:member': [{ 'vkm:url': 'https://kme.example.com/doc-p3', 'vkm:datePublished': '2024-03-01T00:00:00Z' }] }, + ], + }, + [`${base}&start=15`]: { + 'hydra:member': [ + { 'hydra:member': [{ 'vkm:url': 'https://kme.example.com/doc-p4', 'vkm:datePublished': '2024-04-01T00:00:00Z' }] }, + ], + }, + [`${base}&start=20`]: { + 'hydra:member': [ + { 'hydra:member': [{ 'vkm:url': 'https://kme.example.com/doc-p5', 'vkm:datePublished': '2024-05-01T00:00:00Z' }] }, + ], + }, + }; + + // Build context with size=5 in the request URL + const ctx = makeContext(t, { + req: { url: '/sitemap.xml?size=5', method: 'GET', headers: { host: 'proxy.example.com', 'x-forwarded-proto': 'https' } }, + }); + ctx.kme_CSA_settings.searchApiBaseUrl = 'https://search.example.com/api'; + ctx.kme_CSA_settings.tenant = 'test-tenant'; + ctx._store['authorization:token'] = 'sitemap-token'; + ctx._store['authorization:expiry'] = '9999999999'; + ctx._axios.get = t.mock.fn(async (url) => ({ data: pageData[url] ?? { 'hydra:member': [] } })); + + await runScript(ctx); + + assert.strictEqual(ctx._res.statusCode, 200); + assert.strictEqual(ctx._axios.get.mock.calls.length, 5, 'should make 5 GET calls (start 0,5,10,15,20)'); + const locMatches = ctx._res.body.match(//g); + assert.strictEqual(locMatches?.length ?? 0, 5, 'all 5 items from all pages should appear'); + assert.ok(ctx._res.body.includes('doc-p1')); + assert.ok(ctx._res.body.includes('doc-p5')); + }); + + test('hydra:view absent (all results on one page) → no additional pages fetched', async (t) => { + const ctx = makeSitemapContext(t, async () => ({ + data: { + // No hydra:view — all 22 results fit in size=50 + 'hydra:member': [ + { 'hydra:member': [{ 'vkm:url': 'https://kme.example.com/only-doc', 'vkm:datePublished': '2024-01-01T00:00:00Z' }] }, + ], + }, + })); + + await runScript(ctx); + + assert.strictEqual(ctx._res.statusCode, 200); + assert.strictEqual(ctx._axios.get.mock.calls.length, 1, 'only one GET call when hydra:view absent'); + const locMatches = ctx._res.body.match(//g); + assert.strictEqual(locMatches?.length ?? 0, 1); + }); + + test('hydra:view present but hydra:last start=0 → no additional pages fetched', async (t) => { + const ctx = makeSitemapContext(t, async () => ({ + data: { + 'hydra:view': { 'hydra:last': 'https://search.example.com/api/test-tenant/search?query=*&size=100&category=vkm%3AArticleCategory&start=0' }, + 'hydra:member': [ + { 'hydra:member': [{ 'vkm:url': 'https://kme.example.com/only-doc', 'vkm:datePublished': '2024-01-01T00:00:00Z' }] }, + ], + }, + })); + + await runScript(ctx); + + assert.strictEqual(ctx._res.statusCode, 200); + assert.strictEqual(ctx._axios.get.mock.calls.length, 1, 'only one GET call when hydra:last start=0'); + const locMatches = ctx._res.body.match(//g); + assert.strictEqual(locMatches?.length ?? 0, 1); + }); + + test('more than 50,000 items → sitemap truncated to exactly 50,000 elements', async (t) => { + const LIMIT = 50_000; + // Build a response with LIMIT + 5 items + const members = Array.from({ length: LIMIT + 5 }, (_, i) => ({ + 'hydra:member': [{ 'vkm:url': `https://kme.example.com/doc-${i}`, 'vkm:datePublished': '2024-01-01T00:00:00Z' }], + })); + const ctx = makeSitemapContext(t, async () => ({ data: { 'hydra:member': members } })); + + await runScript(ctx); + + assert.strictEqual(ctx._res.statusCode, 200); + const locMatches = ctx._res.body.match(//g); + assert.strictEqual(locMatches?.length ?? 0, LIMIT, `should be capped at ${LIMIT}`); + }); + // US3 error scenarios (T011b) test('upstream 503 → 502 with Search service error message', async (t) => { @@ -394,10 +527,10 @@ describe('extractArticleBody helper', () => { // --------------------------------------------------------------------------- describe('US-content-fetch: happy path', () => { - test('cached token + valid article response → 200 text/html with body', async (t) => { + test('cached token + valid article response → 200 text/html with body and title', async (t) => { const contentAxios = { post: t.mock.fn(async () => ({ data: { id_token: 'mock-token', expires_in: 9_999_999_999 } })), - get: t.mock.fn(async () => ({ data: { 'vkm:articleBody': '

Hello

' } })), + get: t.mock.fn(async () => ({ data: { 'vkm:name': 'My Article', 'vkm:articleBody': '

Hello

' } })), }; const ctx = makeContext(t, { req: { url: '/?kmeURL=https://kme.example.com/content/article/123', method: 'GET', headers: {} }, @@ -410,18 +543,18 @@ describe('US-content-fetch: happy path', () => { await runScript(ctx); assert.strictEqual(ctx._res.statusCode, 200); - assert.ok( - ctx._res.headers['Content-Type'].startsWith('text/html'), - `Content-Type was: ${ctx._res.headers['Content-Type']}`, - ); - assert.strictEqual(ctx._res.body, '

Hello

'); + assert.ok(ctx._res.headers['Content-Type'].startsWith('text/html'), `Content-Type was: ${ctx._res.headers['Content-Type']}`); + assert.ok(ctx._res.body.includes(''), 'body should contain DOCTYPE'); + assert.ok(ctx._res.body.includes('My Article'), 'body should contain title'); + assert.ok(ctx._res.body.includes('

Hello

'), 'body should contain article content verbatim'); + assert.ok(!ctx._res.body.includes('

'), 'article content should not be double-wrapped in

'); assert.strictEqual(contentAxios.post.mock.calls.length, 0, 'should not re-fetch token on cache hit'); }); test('cache miss (fresh token acquired) → 200 text/html with body', async (t) => { const contentAxios = { post: t.mock.fn(async () => ({ data: { id_token: 'fresh-token', expires_in: 9_999_999_999 } })), - get: t.mock.fn(async () => ({ data: { 'vkm:articleBody': '

Hello

' } })), + get: t.mock.fn(async () => ({ data: { 'vkm:name': 'Fresh Article', 'vkm:articleBody': '

Hello

' } })), }; const ctx = makeContext(t, { req: { url: '/?kmeURL=https://kme.example.com/content/article/123', method: 'GET', headers: {} }, @@ -432,13 +565,31 @@ describe('US-content-fetch: happy path', () => { await runScript(ctx); assert.strictEqual(ctx._res.statusCode, 200); - assert.ok( - ctx._res.headers['Content-Type'].startsWith('text/html'), - `Content-Type was: ${ctx._res.headers['Content-Type']}`, - ); - assert.strictEqual(ctx._res.body, '

Hello

'); + assert.ok(ctx._res.headers['Content-Type'].startsWith('text/html'), `Content-Type was: ${ctx._res.headers['Content-Type']}`); + assert.ok(ctx._res.body.includes(''), 'body should contain DOCTYPE'); + assert.ok(ctx._res.body.includes('Fresh Article'), 'body should contain title'); + assert.ok(ctx._res.body.includes('

Hello

'), 'body should contain article content'); assert.strictEqual(contentAxios.post.mock.calls.length, 1, 'should have fetched fresh token'); }); + + test('vkm:name absent → title element is empty', async (t) => { + const contentAxios = { + post: t.mock.fn(), + get: t.mock.fn(async () => ({ data: { 'vkm:articleBody': '

No title

' } })), + }; + const ctx = makeContext(t, { + req: { url: '/?kmeURL=https://kme.example.com/content/article/123', method: 'GET', headers: {} }, + axios: contentAxios, + }); + ctx._store['authorization:token'] = 'cached-token'; + ctx._store['authorization:expiry'] = '9999999999'; + + await runScript(ctx); + + assert.strictEqual(ctx._res.statusCode, 200); + assert.ok(ctx._res.body.includes(''), 'title should be empty when vkm:name absent'); + assert.ok(ctx._res.body.includes('

No title

')); + }); }); // ---------------------------------------------------------------------------