Skip to content

Commit cee501a

Browse files
authored
add reference to external WACZ per revisit record (#1009)
- store in `WARC-Refers-To-Container` with `file://<WACZ filename>` as per discussions in iipc/warc-specifications#111
- wabac.js 2.26.0 will use this header for prioritizing the specified WACZ for looking up the original.
- also clears the per-WACZ dependency key `...:duperef` after the current WACZ is finished, so future WACZ files don't use stale dependencies
- fixes #1008
- version: bump to 1.12.4
1 parent 87716fa commit cee501a

5 files changed

Lines changed: 46 additions & 5 deletions

File tree

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "browsertrix-crawler",
3-
"version": "1.12.3",
3+
"version": "1.12.4",
44
"main": "browsertrix-crawler",
55
"type": "module",
66
"repository": "https://github.com/webrecorder/browsertrix-crawler",

src/util/constants.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,3 +115,5 @@ export type CrawlStatus =
115115
| "interrupted"
116116
| "failed"
117117
| "canceled";
118+
119+
export const WARC_REFERS_TO_CONTAINER = "WARC-Refers-To-Container";

src/util/recorder.ts

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ import { Crawler } from "../crawler.js";
2727
import { getProxyDispatcher } from "./proxy.js";
2828
import { ScopedSeed } from "./seeds.js";
2929
import EventEmitter from "events";
30-
import { DEFAULT_MAX_RETRIES } from "./constants.js";
30+
import { DEFAULT_MAX_RETRIES, WARC_REFERS_TO_CONTAINER } from "./constants.js";
3131
import { Readable } from "stream";
3232

3333
const MAX_BROWSER_DEFAULT_FETCH_SIZE = 5_000_000;
@@ -1762,6 +1762,17 @@ export class Recorder extends EventEmitter {
17621762
const { origUrl, origDate, crawlId, index, size } = res;
17631763
origRecSize = size;
17641764
const date = tsToDate(origDate).toISOString();
1765+
1766+
let externalWACZ = "";
1767+
1768+
// is external crawl
1769+
if (this.crawlState.isExternalCrawl(crawlId)) {
1770+
externalWACZ = await this.crawlState.lookupWACZFilename(
1771+
crawlId,
1772+
Number(index),
1773+
);
1774+
}
1775+
17651776
// always write revisit here
17661777
// duplicate URLs in same crawl filtered out separately
17671778
serializer.externalBuffer?.purge();
@@ -1770,6 +1781,7 @@ export class Recorder extends EventEmitter {
17701781
serializer,
17711782
origUrl,
17721783
date,
1784+
externalWACZ,
17731785
));
17741786
await this.crawlState.addDupeCrawlDependency(crawlId, index);
17751787
} else {
@@ -2200,6 +2212,7 @@ async function createRevisitForResponse(
22002212
serializer: WARCSerializer,
22012213
refersToUrl: string,
22022214
refersToDate: string,
2215+
externalWACZ: string,
22032216
) {
22042217
const payloadDigestForRevisit = responseRecord.warcPayloadDigest || "";
22052218

@@ -2213,6 +2226,10 @@ async function createRevisitForResponse(
22132226
}
22142227
}
22152228

2229+
if (externalWACZ) {
2230+
warcHeaders[WARC_REFERS_TO_CONTAINER] = `file://${externalWACZ}`;
2231+
}
2232+
22162233
const revisitRecord = WARCRecord.create({
22172234
url: responseRecord.warcTargetURI!,
22182235
date: responseRecord.warcDate!,
@@ -2222,6 +2239,7 @@ async function createRevisitForResponse(
22222239
refersToUrl,
22232240
refersToDate,
22242241
});
2242+
22252243
revisitRecord.httpHeaders = responseRecord.httpHeaders;
22262244

22272245
serializer = new WARCSerializer(revisitRecord, {

src/util/state.ts

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,24 @@ export class RedisDedupeIndex {
276276
await pipe.exec();
277277
}
278278

279+
// LOOKUP WACZ FILENAME
280+
281+
/**
 * Look up the WACZ filename recorded for a crawl at list position `index`
 * in the Redis list `c:<crawlId>:wacz`.
 *
 * Each list entry is expected to be a JSON object carrying a `filename`
 * property. Returns "" when the entry is missing, is not valid JSON, or
 * has no string `filename` — callers treat "" as "no external WACZ".
 */
async lookupWACZFilename(crawlId: string, index: number): Promise<string> {
  try {
    const waczdata = await this.dedupeRedis.lindex(
      `c:${crawlId}:wacz`,
      index,
    );
    if (!waczdata) {
      return "";
    }
    // JSON.parse has no guaranteed shape: guard so a missing or non-string
    // `filename` yields "" instead of leaking `undefined` through the
    // declared Promise<string> return type
    const { filename } = JSON.parse(waczdata);
    return typeof filename === "string" ? filename : "";
  } catch (_) {
    // best-effort lookup: malformed JSON / redis errors mean "no filename"
    return "";
  }
}
296+
279297
// COMMIT DEDUPE TO SHARED INDEX
280298

281299
async commitDedupeDone(crawlId?: string, uncommitted_key = DUPE_UNCOMMITTED) {
@@ -313,12 +331,11 @@ export class RedisDedupeIndex {
313331
const numWacz = await this.dedupeRedis.llen(`c:${crawlId}:wacz`);
314332

315333
for (let i = 0; i < numWacz; i++) {
316-
const waczdata = await this.dedupeRedis.lindex(`c:${crawlId}:wacz`, i);
317-
if (!waczdata) {
334+
const filename = await this.lookupWACZFilename(crawlId, i);
335+
if (!filename) {
318336
continue;
319337
}
320338
try {
321-
const { filename } = JSON.parse(waczdata);
322339
await this.dedupeRedis.sadd(this.sourceDone, filename);
323340
} catch (e) {
324341
// ignore
@@ -1126,6 +1143,8 @@ return inx;
11261143
async clearWACZFilename(): Promise<void> {
11271144
await this.redis.hdel(`${this.crawlId}:nextWacz`, this.uid);
11281145
this.waczFilename = null;
1146+
1147+
await this.redis.del(`${this.uid}:duperef`);
11291148
}
11301149

11311150
async setArchiveSize(size: number) {

tests/dedupe-basic.test.js

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,7 @@ test("check revisit records written on duplicate crawl, different collections, w
189189

190190
if (record.warcType === "revisit") {
191191
revisit++;
192+
expect(record.warcHeader("WARC-Refers-To-Container")).toBe("file://dedupe-test-orig.wacz");
192193
}
193194
}
194195

@@ -226,6 +227,7 @@ test("verify new crawl against imported dupe index has same dupes as dedupe agai
226227

227228
if (record.warcType === "revisit") {
228229
revisit++;
230+
expect(record.warcHeader("WARC-Refers-To-Container")).toBe("file://dedupe-test-orig.wacz");
229231
}
230232
}
231233

0 commit comments

Comments (0)