Skip to content

Commit 9d4acf9

Browse files
authored
Refactor: eliminate sanitization pipeline duplication between core and full modules (#7031)
1 parent b819dd1 commit 9d4acf9

File tree

3 files changed

+296
-423
lines changed

3 files changed

+296
-423
lines changed

pkg/workflow/bundler_integration_test.go

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -372,18 +372,35 @@ func TestGetJavaScriptSources(t *testing.T) {
372372
t.Error("sanitize_content.cjs does not contain sanitizeContent function")
373373
}
374374

375-
// Should contain helper functions
376-
helpers := []string{
375+
// Should contain neutralizeMentions helper (still defined locally)
376+
if !strings.Contains(sanitize, "function neutralizeMentions") {
377+
t.Error("sanitize_content.cjs does not contain function neutralizeMentions")
378+
}
379+
380+
// Should import from sanitize_content_core
381+
if !strings.Contains(sanitize, `require("./sanitize_content_core.cjs")`) {
382+
t.Error("sanitize_content.cjs does not import from sanitize_content_core.cjs")
383+
}
384+
385+
// Check that sanitize_content_core.cjs contains the shared helper functions
386+
core, ok := sources["sanitize_content_core.cjs"]
387+
if !ok {
388+
t.Fatal("GetJavaScriptSources does not contain sanitize_content_core.cjs")
389+
}
390+
391+
// Should contain helper functions in core
392+
coreHelpers := []string{
377393
"function sanitizeUrlDomains",
378394
"function sanitizeUrlProtocols",
379-
"function neutralizeMentions",
380395
"function removeXmlComments",
381396
"function neutralizeBotTriggers",
397+
"function buildAllowedDomains",
398+
"function applyTruncation",
382399
}
383400

384-
for _, helper := range helpers {
385-
if !strings.Contains(sanitize, helper) {
386-
t.Errorf("sanitize_content.cjs does not contain %s", helper)
401+
for _, helper := range coreHelpers {
402+
if !strings.Contains(core, helper) {
403+
t.Errorf("sanitize_content_core.cjs does not contain %s", helper)
387404
}
388405
}
389406
}

pkg/workflow/js/sanitize_content.cjs

Lines changed: 25 additions & 196 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,20 @@
55
* For incoming text that doesn't need mention filtering, use sanitize_incoming_text.cjs instead.
66
*/
77

8-
const { sanitizeContentCore, getRedactedDomains, clearRedactedDomains, writeRedactedDomainsLog, extractDomainsFromUrl, addRedactedDomain } = require("./sanitize_content_core.cjs");
8+
const {
9+
sanitizeContentCore,
10+
getRedactedDomains,
11+
clearRedactedDomains,
12+
writeRedactedDomainsLog,
13+
buildAllowedDomains,
14+
sanitizeUrlProtocols,
15+
sanitizeUrlDomains,
16+
neutralizeCommands,
17+
removeXmlComments,
18+
convertXmlTags,
19+
neutralizeBotTriggers,
20+
applyTruncation,
21+
} = require("./sanitize_content_core.cjs");
922

1023
/**
1124
* @typedef {Object} SanitizeOptions
@@ -40,46 +53,25 @@ function sanitizeContent(content, maxLengthOrOptions) {
4053
}
4154

4255
// If allowed aliases are specified, we need custom mention filtering
43-
// We'll do most of the sanitization with the core, then apply selective mention filtering
56+
// We'll apply the same sanitization pipeline but with selective mention filtering
4457

4558
if (!content || typeof content !== "string") {
4659
return "";
4760
}
4861

49-
// Read allowed domains from environment variable
50-
const allowedDomainsEnv = process.env.GH_AW_ALLOWED_DOMAINS;
51-
const defaultAllowedDomains = ["github.com", "github.io", "githubusercontent.com", "githubassets.com", "github.dev", "codespaces.new"];
52-
53-
let allowedDomains = allowedDomainsEnv
54-
? allowedDomainsEnv
55-
.split(",")
56-
.map(d => d.trim())
57-
.filter(d => d)
58-
: defaultAllowedDomains;
59-
60-
// Extract and add GitHub domains from GitHub context URLs
61-
const githubServerUrl = process.env.GITHUB_SERVER_URL;
62-
const githubApiUrl = process.env.GITHUB_API_URL;
63-
64-
if (githubServerUrl) {
65-
const serverDomains = extractDomainsFromUrl(githubServerUrl);
66-
allowedDomains = allowedDomains.concat(serverDomains);
67-
}
68-
69-
if (githubApiUrl) {
70-
const apiDomains = extractDomainsFromUrl(githubApiUrl);
71-
allowedDomains = allowedDomains.concat(apiDomains);
72-
}
73-
74-
// Remove duplicates
75-
allowedDomains = [...new Set(allowedDomains)];
62+
// Build list of allowed domains (shared with core)
63+
const allowedDomains = buildAllowedDomains();
7664

7765
let sanitized = content;
7866

67+
// Remove ANSI escape sequences and control characters early
68+
sanitized = sanitized.replace(/\x1b\[[0-9;]*[mGKH]/g, "");
69+
sanitized = sanitized.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, "");
70+
7971
// Neutralize commands at the start of text
8072
sanitized = neutralizeCommands(sanitized);
8173

82-
// Neutralize @mentions with selective filtering
74+
// Neutralize @mentions with selective filtering (custom logic for allowed aliases)
8375
sanitized = neutralizeMentions(sanitized, allowedAliasesLowercase);
8476

8577
// Remove XML comments
@@ -88,138 +80,18 @@ function sanitizeContent(content, maxLengthOrOptions) {
8880
// Convert XML tags
8981
sanitized = convertXmlTags(sanitized);
9082

91-
// Remove ANSI escape sequences
92-
sanitized = sanitized.replace(/\x1b\[[0-9;]*[mGKH]/g, "");
93-
94-
// Remove control characters
95-
sanitized = sanitized.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, "");
96-
97-
// URI filtering
83+
// URI filtering (shared with core)
9884
sanitized = sanitizeUrlProtocols(sanitized);
9985
sanitized = sanitizeUrlDomains(sanitized, allowedDomains);
10086

101-
// Truncation
102-
const lines = sanitized.split("\n");
103-
const maxLines = 65000;
104-
maxLength = maxLength || 524288;
105-
106-
if (lines.length > maxLines) {
107-
const truncationMsg = "\n[Content truncated due to line count]";
108-
const truncatedLines = lines.slice(0, maxLines).join("\n") + truncationMsg;
109-
110-
if (truncatedLines.length > maxLength) {
111-
sanitized = truncatedLines.substring(0, maxLength - truncationMsg.length) + truncationMsg;
112-
} else {
113-
sanitized = truncatedLines;
114-
}
115-
} else if (sanitized.length > maxLength) {
116-
sanitized = sanitized.substring(0, maxLength) + "\n[Content truncated due to length]";
117-
}
87+
// Apply truncation limits (shared with core)
88+
sanitized = applyTruncation(sanitized, maxLength);
11889

11990
// Neutralize bot triggers
12091
sanitized = neutralizeBotTriggers(sanitized);
12192

12293
return sanitized.trim();
12394

124-
/**
125-
* Sanitize URL domains
126-
* @param {string} s - The string to process
127-
* @param {string[]} allowed - Allowed domains
128-
* @returns {string} Sanitized string
129-
*/
130-
function sanitizeUrlDomains(s, allowed) {
131-
const httpsUrlRegex = /https:\/\/([\w.-]+(?::\d+)?)(\/[^\s]*)?/gi;
132-
133-
const result = s.replace(httpsUrlRegex, (match, hostnameWithPort, pathPart) => {
134-
const hostname = hostnameWithPort.split(":")[0].toLowerCase();
135-
pathPart = pathPart || "";
136-
137-
const isAllowed = allowed.some(allowedDomain => {
138-
const normalizedAllowed = allowedDomain.toLowerCase();
139-
140-
if (hostname === normalizedAllowed) {
141-
return true;
142-
}
143-
144-
if (normalizedAllowed.startsWith("*.")) {
145-
const baseDomain = normalizedAllowed.substring(2);
146-
return hostname.endsWith("." + baseDomain) || hostname === baseDomain;
147-
}
148-
149-
return hostname.endsWith("." + normalizedAllowed);
150-
});
151-
152-
if (isAllowed) {
153-
return match;
154-
} else {
155-
const truncated = hostname.length > 12 ? hostname.substring(0, 12) + "..." : hostname;
156-
if (typeof core !== "undefined" && core.info) {
157-
core.info(`Redacted URL: ${truncated}`);
158-
}
159-
if (typeof core !== "undefined" && core.debug) {
160-
core.debug(`Redacted URL (full): ${match}`);
161-
}
162-
addRedactedDomain(hostname);
163-
return "(redacted)";
164-
}
165-
});
166-
167-
return result;
168-
}
169-
170-
/**
171-
* Sanitize URL protocols
172-
* @param {string} s - The string to process
173-
* @returns {string} Sanitized string
174-
*/
175-
function sanitizeUrlProtocols(s) {
176-
return s.replace(/\b((?:http|ftp|file|ssh|git):\/\/([\w.-]+)(?:[^\s]*)|(?:data|javascript|vbscript|about|mailto|tel):[^\s]+)/gi, (match, _fullMatch, domain) => {
177-
// Extract domain for http/ftp/file/ssh/git protocols
178-
if (domain) {
179-
const domainLower = domain.toLowerCase();
180-
const truncated = domainLower.length > 12 ? domainLower.substring(0, 12) + "..." : domainLower;
181-
if (typeof core !== "undefined" && core.info) {
182-
core.info(`Redacted URL: ${truncated}`);
183-
}
184-
if (typeof core !== "undefined" && core.debug) {
185-
core.debug(`Redacted URL (full): ${match}`);
186-
}
187-
addRedactedDomain(domainLower);
188-
} else {
189-
// For other protocols (data:, javascript:, etc.), track the protocol itself
190-
const protocolMatch = match.match(/^([^:]+):/);
191-
if (protocolMatch) {
192-
const protocol = protocolMatch[1] + ":";
193-
// Truncate the matched URL for logging (keep first 12 chars + "...")
194-
const truncated = match.length > 12 ? match.substring(0, 12) + "..." : match;
195-
if (typeof core !== "undefined" && core.info) {
196-
core.info(`Redacted URL: ${truncated}`);
197-
}
198-
if (typeof core !== "undefined" && core.debug) {
199-
core.debug(`Redacted URL (full): ${match}`);
200-
}
201-
addRedactedDomain(protocol);
202-
}
203-
}
204-
return "(redacted)";
205-
});
206-
}
207-
208-
/**
209-
* Neutralize commands
210-
* @param {string} s - The string to process
211-
* @returns {string} Processed string
212-
*/
213-
function neutralizeCommands(s) {
214-
const commandName = process.env.GH_AW_COMMAND;
215-
if (!commandName) {
216-
return s;
217-
}
218-
219-
const escapedCommand = commandName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
220-
return s.replace(new RegExp(`^(\\s*)/(${escapedCommand})\\b`, "i"), "$1`/$2`");
221-
}
222-
22395
/**
22496
* Neutralize @mentions with selective filtering
22597
* @param {string} s - The string to process
@@ -240,49 +112,6 @@ function sanitizeContent(content, maxLengthOrOptions) {
240112
return `${p1}\`@${p2}\``; // Neutralize the mention
241113
});
242114
}
243-
244-
/**
245-
* Remove XML comments
246-
* @param {string} s - The string to process
247-
* @returns {string} Processed string
248-
*/
249-
function removeXmlComments(s) {
250-
return s.replace(/<!--[\s\S]*?-->/g, "").replace(/<!--[\s\S]*?--!>/g, "");
251-
}
252-
253-
/**
254-
* Convert XML tags
255-
* @param {string} s - The string to process
256-
* @returns {string} Processed string
257-
*/
258-
function convertXmlTags(s) {
259-
const allowedTags = ["b", "blockquote", "br", "code", "details", "em", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "i", "li", "ol", "p", "pre", "strong", "sub", "summary", "sup", "table", "tbody", "td", "th", "thead", "tr", "ul"];
260-
261-
s = s.replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, (match, content) => {
262-
const convertedContent = content.replace(/<(\/?[A-Za-z][A-Za-z0-9]*(?:[^>]*?))>/g, "($1)");
263-
return `(![CDATA[${convertedContent}]])`;
264-
});
265-
266-
return s.replace(/<(\/?[A-Za-z!][^>]*?)>/g, (match, tagContent) => {
267-
const tagNameMatch = tagContent.match(/^\/?\s*([A-Za-z][A-Za-z0-9]*)/);
268-
if (tagNameMatch) {
269-
const tagName = tagNameMatch[1].toLowerCase();
270-
if (allowedTags.includes(tagName)) {
271-
return match;
272-
}
273-
}
274-
return `(${tagContent})`;
275-
});
276-
}
277-
278-
/**
279-
* Neutralize bot triggers
280-
* @param {string} s - The string to process
281-
* @returns {string} Processed string
282-
*/
283-
function neutralizeBotTriggers(s) {
284-
return s.replace(/\b(fixes?|closes?|resolves?|fix|close|resolve)\s+#(\w+)/gi, (match, action, ref) => `\`${action} #${ref}\``);
285-
}
286115
}
287116

288117
module.exports = { sanitizeContent, getRedactedDomains, clearRedactedDomains, writeRedactedDomainsLog };

0 commit comments

Comments
 (0)