feat: sitemap pagination, HTML wrapper, and title from vkm:name
- Paginate sitemap using hydra:view['hydra:last'] (0-based item index model)
- Select latest vkm:datePublished fragment per SearchResultItem
- Cap sitemap at 50,000 URLs per sitemaps.org protocol
- Wrap content fetch response in full HTML document (DOCTYPE, head, body)
- Add <head><title> populated from vkm:name field
- Remove oidcAuthFlow route (404 for unmatched paths)
- All 51 tests passing

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -240,8 +240,8 @@ describe('sitemap flow', () => {
|
||||
const ctx = makeSitemapContext(t, async () => ({
|
||||
data: {
|
||||
'hydra:member': [
|
||||
{ 'hydra:member': [{ 'vkm:url': 'https://kme.example.com/doc-1' }] },
|
||||
{ 'hydra:member': [{ 'vkm:url': 'https://kme.example.com/doc-2' }] },
|
||||
{ 'hydra:member': [{ 'vkm:url': 'https://kme.example.com/doc-1', 'vkm:datePublished': '2024-01-01T00:00:00Z' }] },
|
||||
{ 'hydra:member': [{ 'vkm:url': 'https://kme.example.com/doc-2', 'vkm:datePublished': '2024-06-01T00:00:00Z' }] },
|
||||
],
|
||||
},
|
||||
}));
|
||||
@@ -291,6 +291,139 @@ describe('sitemap flow', () => {
|
||||
assert.ok(ctx._res.body.includes('valid'), 'the valid URL should appear in the loc');
|
||||
});
|
||||
|
||||
test('multiple fragments per SearchResultItem → only latest vkm:datePublished wins', async (t) => {
  // A single SearchResultItem carrying three fragments. The newest fragment (v3)
  // is intentionally placed in the middle so neither "first" nor "last" passes by accident.
  const buildSearchResponse = () => {
    const fragments = [
      { 'vkm:url': 'https://kme.example.com/doc/v1', 'vkm:datePublished': '2023-01-01T00:00:00Z' },
      { 'vkm:url': 'https://kme.example.com/doc/v3', 'vkm:datePublished': '2024-06-01T00:00:00Z' },
      { 'vkm:url': 'https://kme.example.com/doc/v2', 'vkm:datePublished': '2023-12-01T00:00:00Z' },
    ];
    return { data: { 'hydra:member': [{ 'hydra:member': fragments }] } };
  };
  const ctx = makeSitemapContext(t, async () => buildSearchResponse());

  await runScript(ctx);

  assert.strictEqual(ctx._res.statusCode, 200);

  // Exactly one <loc> must be emitted for the item, and it must be the v3 fragment.
  const locCount = (ctx._res.body.match(/<loc>/g) ?? []).length;
  assert.strictEqual(locCount, 1, 'exactly one <loc> element (latest version only)');

  // The assertions look for the percent-encoded form of the fragment path.
  const bodyHas = (needle) => ctx._res.body.includes(needle);
  assert.ok(bodyHas('doc%2Fv3'), 'the latest fragment (v3) should be the loc');
  assert.ok(!bodyHas('doc%2Fv1'), 'older fragment v1 should not appear');
  assert.ok(!bodyHas('doc%2Fv2'), 'older fragment v2 should not appear');
});
|
||||
|
||||
// Pagination: hydra:last nested inside hydra:view drives multi-page fetching.
|
||||
// hydra:view is absent when all results fit on one page — no pagination needed.
|
||||
// e.g. 22 results, size=5 → hydra:view['hydra:last'] start=20, fetch start=5,10,15,20
|
||||
|
||||
test('hydra:last (22 results, size=5, start=20) → fetches 4 extra pages, all 5 pages combined', async (t) => {
  // Scenario: 22 results with page size 5. The first request carries no start
  // parameter; the follow-up pages are requested with start=5,10,15,20.
  const base = 'https://search.example.com/api/test-tenant/search?query=*&size=5&category=vkm%3AArticleCategory';

  // Canned responses keyed by the exact request URL. Each page holds one
  // SearchResultItem with one fragment (doc-p1 … doc-p5).
  const pageData = {};
  [0, 5, 10, 15, 20].forEach((start, index) => {
    const pageNumber = index + 1;
    const requestUrl = start === 0 ? base : `${base}&start=${start}`;
    const page = {
      'hydra:member': [
        {
          'hydra:member': [
            {
              'vkm:url': `https://kme.example.com/doc-p${pageNumber}`,
              'vkm:datePublished': `2024-0${pageNumber}-01T00:00:00Z`,
            },
          ],
        },
      ],
    };
    // Only the first page advertises hydra:view, pointing at the last page (start=20).
    pageData[requestUrl] = start === 0
      ? { 'hydra:view': { 'hydra:last': `${base}&start=20` }, ...page }
      : page;
  });

  // Request the sitemap with size=5 in the query string.
  const request = {
    url: '/sitemap.xml?size=5',
    method: 'GET',
    headers: { host: 'proxy.example.com', 'x-forwarded-proto': 'https' },
  };
  const ctx = makeContext(t, { req: request });
  ctx.kme_CSA_settings.searchApiBaseUrl = 'https://search.example.com/api';
  ctx.kme_CSA_settings.tenant = 'test-tenant';
  ctx._store['authorization:token'] = 'sitemap-token';
  ctx._store['authorization:expiry'] = '9999999999';
  // Any URL without a canned page answers with an empty member list.
  ctx._axios.get = t.mock.fn(async (url) => {
    const data = pageData[url] ?? { 'hydra:member': [] };
    return { data };
  });

  await runScript(ctx);

  assert.strictEqual(ctx._res.statusCode, 200);
  assert.strictEqual(ctx._axios.get.mock.calls.length, 5, 'should make 5 GET calls (start 0,5,10,15,20)');

  const locCount = (ctx._res.body.match(/<loc>/g) ?? []).length;
  assert.strictEqual(locCount, 5, 'all 5 items from all pages should appear');
  assert.ok(ctx._res.body.includes('doc-p1'));
  assert.ok(ctx._res.body.includes('doc-p5'));
});
|
||||
|
||||
test('hydra:view absent (all results on one page) → no additional pages fetched', async (t) => {
  // The response deliberately has no hydra:view key: this models the case where
  // every result fits on the first page, so nothing further should be requested.
  const buildSinglePageResponse = () => {
    const onlyFragment = {
      'vkm:url': 'https://kme.example.com/only-doc',
      'vkm:datePublished': '2024-01-01T00:00:00Z',
    };
    return { data: { 'hydra:member': [{ 'hydra:member': [onlyFragment] }] } };
  };
  const ctx = makeSitemapContext(t, async () => buildSinglePageResponse());

  await runScript(ctx);

  assert.strictEqual(ctx._res.statusCode, 200);
  assert.strictEqual(ctx._axios.get.mock.calls.length, 1, 'only one GET call when hydra:view absent');

  const locCount = (ctx._res.body.match(/<loc>/g) ?? []).length;
  assert.strictEqual(locCount, 1);
});
|
||||
|
||||
test('hydra:view present but hydra:last start=0 → no additional pages fetched', async (t) => {
  // hydra:view is present, but its hydra:last link points at start=0, i.e. the
  // last page is the page already fetched.
  const lastPageUrl = 'https://search.example.com/api/test-tenant/search?query=*&size=100&category=vkm%3AArticleCategory&start=0';
  const buildResponse = () => ({
    data: {
      'hydra:view': { 'hydra:last': lastPageUrl },
      'hydra:member': [
        {
          'hydra:member': [
            { 'vkm:url': 'https://kme.example.com/only-doc', 'vkm:datePublished': '2024-01-01T00:00:00Z' },
          ],
        },
      ],
    },
  });
  const ctx = makeSitemapContext(t, async () => buildResponse());

  await runScript(ctx);

  assert.strictEqual(ctx._res.statusCode, 200);
  assert.strictEqual(ctx._axios.get.mock.calls.length, 1, 'only one GET call when hydra:last start=0');

  const locCount = (ctx._res.body.match(/<loc>/g) ?? []).length;
  assert.strictEqual(locCount, 1);
});
|
||||
|
||||
test('more than 50,000 items → sitemap truncated to exactly 50,000 <loc> elements', async (t) => {
  const LIMIT = 50_000;
  const OVERFLOW = 5;

  // Single-page response holding LIMIT + 5 SearchResultItems, one fragment each.
  const members = [];
  for (let i = 0; i < LIMIT + OVERFLOW; i += 1) {
    members.push({
      'hydra:member': [
        { 'vkm:url': `https://kme.example.com/doc-${i}`, 'vkm:datePublished': '2024-01-01T00:00:00Z' },
      ],
    });
  }
  const ctx = makeSitemapContext(t, async () => {
    const data = { 'hydra:member': members };
    return { data };
  });

  await runScript(ctx);

  assert.strictEqual(ctx._res.statusCode, 200);

  // The five surplus items must be dropped, leaving exactly LIMIT <loc> elements.
  const locCount = (ctx._res.body.match(/<loc>/g) ?? []).length;
  assert.strictEqual(locCount, LIMIT, `should be capped at ${LIMIT}`);
});
|
||||
|
||||
// US3 error scenarios (T011b)
|
||||
|
||||
test('upstream 503 → 502 with Search service error message', async (t) => {
|
||||
@@ -394,10 +527,10 @@ describe('extractArticleBody helper', () => {
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('US-content-fetch: happy path', () => {
|
||||
test('cached token + valid article response → 200 text/html with body', async (t) => {
|
||||
test('cached token + valid article response → 200 text/html with body and title', async (t) => {
|
||||
const contentAxios = {
|
||||
post: t.mock.fn(async () => ({ data: { id_token: 'mock-token', expires_in: 9_999_999_999 } })),
|
||||
get: t.mock.fn(async () => ({ data: { 'vkm:articleBody': '<p>Hello</p>' } })),
|
||||
get: t.mock.fn(async () => ({ data: { 'vkm:name': 'My Article', 'vkm:articleBody': '<p>Hello</p>' } })),
|
||||
};
|
||||
const ctx = makeContext(t, {
|
||||
req: { url: '/?kmeURL=https://kme.example.com/content/article/123', method: 'GET', headers: {} },
|
||||
@@ -410,18 +543,18 @@ describe('US-content-fetch: happy path', () => {
|
||||
await runScript(ctx);
|
||||
|
||||
assert.strictEqual(ctx._res.statusCode, 200);
|
||||
assert.ok(
|
||||
ctx._res.headers['Content-Type'].startsWith('text/html'),
|
||||
`Content-Type was: ${ctx._res.headers['Content-Type']}`,
|
||||
);
|
||||
assert.strictEqual(ctx._res.body, '<p>Hello</p>');
|
||||
assert.ok(ctx._res.headers['Content-Type'].startsWith('text/html'), `Content-Type was: ${ctx._res.headers['Content-Type']}`);
|
||||
assert.ok(ctx._res.body.includes('<!DOCTYPE html>'), 'body should contain DOCTYPE');
|
||||
assert.ok(ctx._res.body.includes('<title>My Article</title>'), 'body should contain title');
|
||||
assert.ok(ctx._res.body.includes('<p>Hello</p>'), 'body should contain article content verbatim');
|
||||
assert.ok(!ctx._res.body.includes('<p><p>'), 'article content should not be double-wrapped in <p>');
|
||||
assert.strictEqual(contentAxios.post.mock.calls.length, 0, 'should not re-fetch token on cache hit');
|
||||
});
|
||||
|
||||
test('cache miss (fresh token acquired) → 200 text/html with body', async (t) => {
|
||||
const contentAxios = {
|
||||
post: t.mock.fn(async () => ({ data: { id_token: 'fresh-token', expires_in: 9_999_999_999 } })),
|
||||
get: t.mock.fn(async () => ({ data: { 'vkm:articleBody': '<p>Hello</p>' } })),
|
||||
get: t.mock.fn(async () => ({ data: { 'vkm:name': 'Fresh Article', 'vkm:articleBody': '<p>Hello</p>' } })),
|
||||
};
|
||||
const ctx = makeContext(t, {
|
||||
req: { url: '/?kmeURL=https://kme.example.com/content/article/123', method: 'GET', headers: {} },
|
||||
@@ -432,13 +565,31 @@ describe('US-content-fetch: happy path', () => {
|
||||
await runScript(ctx);
|
||||
|
||||
assert.strictEqual(ctx._res.statusCode, 200);
|
||||
assert.ok(
|
||||
ctx._res.headers['Content-Type'].startsWith('text/html'),
|
||||
`Content-Type was: ${ctx._res.headers['Content-Type']}`,
|
||||
);
|
||||
assert.strictEqual(ctx._res.body, '<p>Hello</p>');
|
||||
assert.ok(ctx._res.headers['Content-Type'].startsWith('text/html'), `Content-Type was: ${ctx._res.headers['Content-Type']}`);
|
||||
assert.ok(ctx._res.body.includes('<!DOCTYPE html>'), 'body should contain DOCTYPE');
|
||||
assert.ok(ctx._res.body.includes('<title>Fresh Article</title>'), 'body should contain title');
|
||||
assert.ok(ctx._res.body.includes('<p>Hello</p>'), 'body should contain article content');
|
||||
assert.strictEqual(contentAxios.post.mock.calls.length, 1, 'should have fetched fresh token');
|
||||
});
|
||||
|
||||
test('vkm:name absent → title element is empty', async (t) => {
  // The article payload carries a body but no vkm:name field.
  const articleWithoutName = () => ({ data: { 'vkm:articleBody': '<p>No title</p>' } });
  const contentAxios = {
    post: t.mock.fn(),
    get: t.mock.fn(async () => articleWithoutName()),
  };

  const request = {
    url: '/?kmeURL=https://kme.example.com/content/article/123',
    method: 'GET',
    headers: {},
  };
  const ctx = makeContext(t, { req: request, axios: contentAxios });

  // Pre-seed the token entries in the store with a far-future expiry value.
  ctx._store['authorization:token'] = 'cached-token';
  ctx._store['authorization:expiry'] = '9999999999';

  await runScript(ctx);

  assert.strictEqual(ctx._res.statusCode, 200);

  const hasEmptyTitle = ctx._res.body.includes('<title></title>');
  assert.ok(hasEmptyTitle, 'title should be empty when vkm:name absent');

  const hasArticleBody = ctx._res.body.includes('<p>No title</p>');
  assert.ok(hasArticleBody);
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user