Skip to content

Commit 113b2de

Browse files
committed
allow #-tag urls to be treated as distinct URLs for scope types other than custom:
- fix --allowHashUrls option being ignored
- if --allowHashUrls (global) or --allowHash (per seed) is set, don't reset it to false when scopeType != 'custom'
- tests: update scope tests to ensure --allowHash and --allowHashUrls now work as expected

fixes #1023
1 parent 7c10fb1 commit 113b2de

3 files changed

Lines changed: 43 additions & 14 deletions

File tree

src/util/argParser.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,7 @@ class ArgParser {
172172
allowHashUrls: {
173173
describe:
174174
"Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content",
175+
type: "boolean",
175176
},
176177

177178
selectLinks: {

src/util/seeds.ts

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import { logger } from "./logger.js";
66
import { type CrawlerArgs } from "./argParser.js";
77
import { normalizeUrl } from "./normalize.js";
88

9-
type ScopeType =
9+
export type ScopeType =
1010
| "prefix"
1111
| "host"
1212
| "domain"
@@ -15,6 +15,18 @@ type ScopeType =
1515
| "any"
1616
| "custom";
1717

18+
export type ScopeSeedInitOpts = {
19+
url: string;
20+
scopeType: ScopeType | undefined;
21+
include: string[];
22+
exclude: string[];
23+
allowHash?: boolean;
24+
depth?: number;
25+
sitemap?: string | boolean | null;
26+
extraHops?: number;
27+
auth?: string | null;
28+
};
29+
1830
export class ScopedSeed {
1931
url: string;
2032
normUrl: string;
@@ -43,17 +55,7 @@ export class ScopedSeed {
4355
sitemap = false,
4456
extraHops = 0,
4557
auth = null,
46-
}: {
47-
url: string;
48-
scopeType: ScopeType | undefined;
49-
include: string[];
50-
exclude: string[];
51-
allowHash?: boolean;
52-
depth?: number;
53-
sitemap?: string | boolean | null;
54-
extraHops?: number;
55-
auth?: string | null;
56-
}) {
58+
}: ScopeSeedInitOpts) {
5759
const parsedUrl = this.parseUrl(url);
5860
if (!parsedUrl) {
5961
throw new Error("Invalid URL");
@@ -84,7 +86,7 @@ export class ScopedSeed {
8486
parsedUrl,
8587
);
8688
this.include = [...includeNew, ...this.include];
87-
allowHash = allowHashNew;
89+
allowHash ||= allowHashNew;
8890
}
8991

9092
// for page scope, the depth is set to extraHops, as no other
@@ -348,13 +350,14 @@ export async function parseSeeds(
348350
}
349351
}
350352

351-
const scopeOpts = {
353+
const scopeOpts: Omit<ScopeSeedInitOpts, "url"> = {
352354
scopeType: params.scopeType as ScopeType | undefined,
353355
sitemap: params.sitemap,
354356
include: params.include,
355357
exclude: params.exclude,
356358
depth: params.depth,
357359
extraHops: params.extraHops,
360+
allowHash: params.allowHashUrls,
358361
};
359362

360363
for (const seed of seeds) {

tests/scopes.test.ts

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ seeds:
3434
expect(seeds[0].scopeType).toEqual("prefix");
3535
expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\//]);
3636
expect(seeds[0].exclude).toEqual([]);
37+
expect(seeds[0].allowHash).toEqual(false);
3738
});
3839

3940
test("default scope + exclude", async () => {
@@ -405,3 +406,27 @@ scopeType: page
405406
expect(result).not.toBe(false);
406407
expect((result as Exclude<typeof result, false>).isOOS).toBe(false);
407408
});
409+
410+
test("allowHashUrls global with scopeType prefix", async () => {
411+
const seeds = await getSeeds(`
412+
allowHashUrls: true
413+
seeds:
414+
- url: https://example.com/
415+
`);
416+
417+
expect(seeds[0].scopeType).toEqual("prefix");
418+
expect(seeds[0].allowHash).toEqual(true);
419+
});
420+
421+
test("allowHash with scopeType prefix", async () => {
422+
const seeds = await getSeeds(`
423+
seeds:
424+
- url: https://example.com/
425+
allowHash: true
426+
427+
scopeType: prefix
428+
`);
429+
430+
expect(seeds[0].scopeType).toEqual("prefix");
431+
expect(seeds[0].allowHash).toEqual(true);
432+
});

0 commit comments

Comments
 (0)