Skip to content

Commit 87716fa

Browse files
ikreymer and tw4l authored
fix hashtag in seed URLs for single page / non-spa scopes: (#1013)
- remove hashtag from normalized seed URL, fixing isIncluded() check which already removes it
- consider crawls with all URLs excluded or failed as 'failed', add numExcluded() check to isFailed()
- fixes #1012

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
1 parent bc4e002 commit 87716fa

3 files changed

Lines changed: 29 additions & 3 deletions

File tree

src/util/seeds.ts

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,6 @@ export class ScopedSeed {
6767

6868
this.url = parsedUrl.href;
6969

70-
// Normalize URL with sorted query parameters for consistent matching
71-
this.normUrl = normalizeUrl(parsedUrl.href);
7270
this.include = parseRx(include);
7371
this.exclude = parseRx(exclude);
7472

@@ -95,6 +93,14 @@ export class ScopedSeed {
9593
depth = extraHops;
9694
}
9795

96+
// normalize hash out if not distinguishing between hashes
97+
if (!allowHash) {
98+
parsedUrl.hash = "";
99+
}
100+
101+
// normalize URL with sorted query parameters for consistent matching
102+
this.normUrl = normalizeUrl(parsedUrl.href);
103+
98104
this.sitemap = this.resolveSiteMap(sitemap);
99105
this.allowHash = allowHash;
100106
this.maxExtraHops = extraHops;

src/util/state.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1041,10 +1041,14 @@ return inx;
10411041
(await this.numDone()) === 0 &&
10421042
(await this.queueSize()) === 0 &&
10431043
(await this.numPending()) === 0 &&
1044-
(await this.numFailed()) > 0
1044+
((await this.numExcluded()) > 0 || (await this.numFailed()) > 0)
10451045
);
10461046
}
10471047

1048+
async numExcluded() {
1049+
return await this.redis.scard(this.exKey);
1050+
}
1051+
10481052
async numFound() {
10491053
return await this.redis.numfound(this.skey, this.esKey, this.exKey);
10501054
}

tests/scopes.test.js

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -383,3 +383,19 @@ seeds:
383383
expect(result2).not.toBe(false);
384384
expect(result2.isOOS).toBe(false);
385385
});
386+
387+
test("scopeType page includes single pages with hashtag", async () => {
388+
const seeds = await getSeeds(`
389+
seeds:
390+
- url: https://example.com/#hashtag
391+
392+
scopeType: page
393+
`);
394+
395+
expect(seeds[0].scopeType).toEqual("page");
396+
397+
// Test with self (should match)
398+
const result = seeds[0].isIncluded("https://example.com/#hashtag", 0, 0);
399+
expect(result).not.toBe(false);
400+
expect(result.isOOS).toBe(false);
401+
});

0 commit comments

Comments
 (0)