Refactor: eliminate sanitization pipeline duplication between core and full modules (#7031)

Copilot · web-flow · commit 9d4acf9b26bb · 2025-12-19T23:24:33.000-08:00
diff --git a/pkg/workflow/bundler_integration_test.go b/pkg/workflow/bundler_integration_test.go
@@ -372,18 +372,35 @@ func TestGetJavaScriptSources(t *testing.T) {
 		t.Error("sanitize_content.cjs does not contain sanitizeContent function")
 	}
 
-	// Should contain helper functions
-	helpers := []string{
+	// Should contain neutralizeMentions helper (still defined locally)
+	if !strings.Contains(sanitize, "function neutralizeMentions") {
+		t.Error("sanitize_content.cjs does not contain function neutralizeMentions")
+	}
+
+	// Should import from sanitize_content_core
+	if !strings.Contains(sanitize, `require("./sanitize_content_core.cjs")`) {
+		t.Error("sanitize_content.cjs does not import from sanitize_content_core.cjs")
+	}
+
+	// Check that sanitize_content_core.cjs contains the shared helper functions
+	core, ok := sources["sanitize_content_core.cjs"]
+	if !ok {
+		t.Fatal("GetJavaScriptSources does not contain sanitize_content_core.cjs")
+	}
+
+	// Should contain helper functions in core
+	coreHelpers := []string{
 		"function sanitizeUrlDomains",
 		"function sanitizeUrlProtocols",
-		"function neutralizeMentions",
 		"function removeXmlComments",
 		"function neutralizeBotTriggers",
+		"function buildAllowedDomains",
+		"function applyTruncation",
 	}
 
-	for _, helper := range helpers {
-		if !strings.Contains(sanitize, helper) {
-			t.Errorf("sanitize_content.cjs does not contain %s", helper)
+	for _, helper := range coreHelpers {
+		if !strings.Contains(core, helper) {
+			t.Errorf("sanitize_content_core.cjs does not contain %s", helper)
 		}
 	}
 }
diff --git a/pkg/workflow/js/sanitize_content.cjs b/pkg/workflow/js/sanitize_content.cjs
@@ -5,7 +5,20 @@
  * For incoming text that doesn't need mention filtering, use sanitize_incoming_text.cjs instead.
  */
 
-const { sanitizeContentCore, getRedactedDomains, clearRedactedDomains, writeRedactedDomainsLog, extractDomainsFromUrl, addRedactedDomain } = require("./sanitize_content_core.cjs");
+const {
+  sanitizeContentCore,
+  getRedactedDomains,
+  clearRedactedDomains,
+  writeRedactedDomainsLog,
+  buildAllowedDomains,
+  sanitizeUrlProtocols,
+  sanitizeUrlDomains,
+  neutralizeCommands,
+  removeXmlComments,
+  convertXmlTags,
+  neutralizeBotTriggers,
+  applyTruncation,
+} = require("./sanitize_content_core.cjs");
 
 /**
  * @typedef {Object} SanitizeOptions
@@ -40,46 +53,25 @@ function sanitizeContent(content, maxLengthOrOptions) {
   }
 
   // If allowed aliases are specified, we need custom mention filtering
-  // We'll do most of the sanitization with the core, then apply selective mention filtering
+  // We'll apply the same sanitization pipeline but with selective mention filtering
 
   if (!content || typeof content !== "string") {
     return "";
   }
 
-  // Read allowed domains from environment variable
-  const allowedDomainsEnv = process.env.GH_AW_ALLOWED_DOMAINS;
-  const defaultAllowedDomains = ["github.com", "github.io", "githubusercontent.com", "githubassets.com", "github.dev", "codespaces.new"];
-
-  let allowedDomains = allowedDomainsEnv
-    ? allowedDomainsEnv
-        .split(",")
-        .map(d => d.trim())
-        .filter(d => d)
-    : defaultAllowedDomains;
-
-  // Extract and add GitHub domains from GitHub context URLs
-  const githubServerUrl = process.env.GITHUB_SERVER_URL;
-  const githubApiUrl = process.env.GITHUB_API_URL;
-
-  if (githubServerUrl) {
-    const serverDomains = extractDomainsFromUrl(githubServerUrl);
-    allowedDomains = allowedDomains.concat(serverDomains);
-  }
-
-  if (githubApiUrl) {
-    const apiDomains = extractDomainsFromUrl(githubApiUrl);
-    allowedDomains = allowedDomains.concat(apiDomains);
-  }
-
-  // Remove duplicates
-  allowedDomains = [...new Set(allowedDomains)];
+  // Build list of allowed domains (shared with core)
+  const allowedDomains = buildAllowedDomains();
 
   let sanitized = content;
 
+  // Remove ANSI escape sequences and control characters early
+  sanitized = sanitized.replace(/\x1b\[[0-9;]*[mGKH]/g, "");
+  sanitized = sanitized.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, "");
+
   // Neutralize commands at the start of text
   sanitized = neutralizeCommands(sanitized);
 
-  // Neutralize @mentions with selective filtering
+  // Neutralize @mentions with selective filtering (custom logic for allowed aliases)
   sanitized = neutralizeMentions(sanitized, allowedAliasesLowercase);
 
   // Remove XML comments
@@ -88,138 +80,18 @@ function sanitizeContent(content, maxLengthOrOptions) {
   // Convert XML tags
   sanitized = convertXmlTags(sanitized);
 
-  // Remove ANSI escape sequences
-  sanitized = sanitized.replace(/\x1b\[[0-9;]*[mGKH]/g, "");
-
-  // Remove control characters
-  sanitized = sanitized.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, "");
-
-  // URI filtering
+  // URI filtering (shared with core)
   sanitized = sanitizeUrlProtocols(sanitized);
   sanitized = sanitizeUrlDomains(sanitized, allowedDomains);
 
-  // Truncation
-  const lines = sanitized.split("\n");
-  const maxLines = 65000;
-  maxLength = maxLength || 524288;
-
-  if (lines.length > maxLines) {
-    const truncationMsg = "\n[Content truncated due to line count]";
-    const truncatedLines = lines.slice(0, maxLines).join("\n") + truncationMsg;
-
-    if (truncatedLines.length > maxLength) {
-      sanitized = truncatedLines.substring(0, maxLength - truncationMsg.length) + truncationMsg;
-    } else {
-      sanitized = truncatedLines;
-    }
-  } else if (sanitized.length > maxLength) {
-    sanitized = sanitized.substring(0, maxLength) + "\n[Content truncated due to length]";
-  }
+  // Apply truncation limits (shared with core)
+  sanitized = applyTruncation(sanitized, maxLength);
 
   // Neutralize bot triggers
   sanitized = neutralizeBotTriggers(sanitized);
 
   return sanitized.trim();
 
-  /**
-   * Sanitize URL domains
-   * @param {string} s - The string to process
-   * @param {string[]} allowed - Allowed domains
-   * @returns {string} Sanitized string
-   */
-  function sanitizeUrlDomains(s, allowed) {
-    const httpsUrlRegex = /https:\/\/([\w.-]+(?::\d+)?)(\/[^\s]*)?/gi;
-
-    const result = s.replace(httpsUrlRegex, (match, hostnameWithPort, pathPart) => {
-      const hostname = hostnameWithPort.split(":")[0].toLowerCase();
-      pathPart = pathPart || "";
-
-      const isAllowed = allowed.some(allowedDomain => {
-        const normalizedAllowed = allowedDomain.toLowerCase();
-
-        if (hostname === normalizedAllowed) {
-          return true;
-        }
-
-        if (normalizedAllowed.startsWith("*.")) {
-          const baseDomain = normalizedAllowed.substring(2);
-          return hostname.endsWith("." + baseDomain) || hostname === baseDomain;
-        }
-
-        return hostname.endsWith("." + normalizedAllowed);
-      });
-
-      if (isAllowed) {
-        return match;
-      } else {
-        const truncated = hostname.length > 12 ? hostname.substring(0, 12) + "..." : hostname;
-        if (typeof core !== "undefined" && core.info) {
-          core.info(`Redacted URL: ${truncated}`);
-        }
-        if (typeof core !== "undefined" && core.debug) {
-          core.debug(`Redacted URL (full): ${match}`);
-        }
-        addRedactedDomain(hostname);
-        return "(redacted)";
-      }
-    });
-
-    return result;
-  }
-
-  /**
-   * Sanitize URL protocols
-   * @param {string} s - The string to process
-   * @returns {string} Sanitized string
-   */
-  function sanitizeUrlProtocols(s) {
-    return s.replace(/\b((?:http|ftp|file|ssh|git):\/\/([\w.-]+)(?:[^\s]*)|(?:data|javascript|vbscript|about|mailto|tel):[^\s]+)/gi, (match, _fullMatch, domain) => {
-      // Extract domain for http/ftp/file/ssh/git protocols
-      if (domain) {
-        const domainLower = domain.toLowerCase();
-        const truncated = domainLower.length > 12 ? domainLower.substring(0, 12) + "..." : domainLower;
-        if (typeof core !== "undefined" && core.info) {
-          core.info(`Redacted URL: ${truncated}`);
-        }
-        if (typeof core !== "undefined" && core.debug) {
-          core.debug(`Redacted URL (full): ${match}`);
-        }
-        addRedactedDomain(domainLower);
-      } else {
-        // For other protocols (data:, javascript:, etc.), track the protocol itself
-        const protocolMatch = match.match(/^([^:]+):/);
-        if (protocolMatch) {
-          const protocol = protocolMatch[1] + ":";
-          // Truncate the matched URL for logging (keep first 12 chars + "...")
-          const truncated = match.length > 12 ? match.substring(0, 12) + "..." : match;
-          if (typeof core !== "undefined" && core.info) {
-            core.info(`Redacted URL: ${truncated}`);
-          }
-          if (typeof core !== "undefined" && core.debug) {
-            core.debug(`Redacted URL (full): ${match}`);
-          }
-          addRedactedDomain(protocol);
-        }
-      }
-      return "(redacted)";
-    });
-  }
-
-  /**
-   * Neutralize commands
-   * @param {string} s - The string to process
-   * @returns {string} Processed string
-   */
-  function neutralizeCommands(s) {
-    const commandName = process.env.GH_AW_COMMAND;
-    if (!commandName) {
-      return s;
-    }
-
-    const escapedCommand = commandName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
-    return s.replace(new RegExp(`^(\\s*)/(${escapedCommand})\\b`, "i"), "$1`/$2`");
-  }
-
   /**
    * Neutralize @mentions with selective filtering
    * @param {string} s - The string to process
@@ -240,49 +112,6 @@ function sanitizeContent(content, maxLengthOrOptions) {
       return `${p1}\`@${p2}\``; // Neutralize the mention
     });
   }
-
-  /**
-   * Remove XML comments
-   * @param {string} s - The string to process
-   * @returns {string} Processed string
-   */
-  function removeXmlComments(s) {
-    return s.replace(/<!--[\s\S]*?-->/g, "").replace(/<!--[\s\S]*?--!>/g, "");
-  }
-
-  /**
-   * Convert XML tags
-   * @param {string} s - The string to process
-   * @returns {string} Processed string
-   */
-  function convertXmlTags(s) {
-    const allowedTags = ["b", "blockquote", "br", "code", "details", "em", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "i", "li", "ol", "p", "pre", "strong", "sub", "summary", "sup", "table", "tbody", "td", "th", "thead", "tr", "ul"];
-
-    s = s.replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, (match, content) => {
-      const convertedContent = content.replace(/<(\/?[A-Za-z][A-Za-z0-9]*(?:[^>]*?))>/g, "($1)");
-      return `(![CDATA[${convertedContent}]])`;
-    });
-
-    return s.replace(/<(\/?[A-Za-z!][^>]*?)>/g, (match, tagContent) => {
-      const tagNameMatch = tagContent.match(/^\/?\s*([A-Za-z][A-Za-z0-9]*)/);
-      if (tagNameMatch) {
-        const tagName = tagNameMatch[1].toLowerCase();
-        if (allowedTags.includes(tagName)) {
-          return match;
-        }
-      }
-      return `(${tagContent})`;
-    });
-  }
-
-  /**
-   * Neutralize bot triggers
-   * @param {string} s - The string to process
-   * @returns {string} Processed string
-   */
-  function neutralizeBotTriggers(s) {
-    return s.replace(/\b(fixes?|closes?|resolves?|fix|close|resolve)\s+#(\w+)/gi, (match, action, ref) => `\`${action} #${ref}\``);
-  }
 }
 
 module.exports = { sanitizeContent, getRedactedDomains, clearRedactedDomains, writeRedactedDomainsLog };
diff --git a/pkg/workflow/js/sanitize_content_core.cjs b/pkg/workflow/js/sanitize_content_core.cjs