Skip to content

Commit 4848c4d

Browse files
committed
v0.4.1: schema validation, URL hardening, secrets patterns, tests
1 parent 29a9d21 commit 4848c4d

11 files changed

Lines changed: 210 additions & 2 deletions

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,3 +77,8 @@ audit.jsonl
7777

7878
# Example artifacts
7979
examples/demo/
80+
81+
82+
STRATEGY.md
83+
CORE_DEV_PLAN.md
84+
DISTRIBUTION.md

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,13 @@
11
# Changelog
22

3+
## [0.4.1] - 2025-09-02
4+
### Added
5+
- Schema validation (optional, via jsonschema)
6+
- URL scanner hardening (flags IP-literal and punycode hosts)
7+
- Secrets patterns extended
8+
- Tests for config schema validation and the extended secrets patterns
9+
10+
311
## [0.4.0] - 2025-08-30
412
### Added
513
- GraphRAG support:

pyproject.toml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "rag-firewall"
7-
version = "0.4.0"
7+
version = "0.4.1"
88
description = "Client-side retrieval firewall for RAG systems — blocks prompt injection and secret leaks, re-ranks stale or untrusted content, and keeps all data inside your environment."
99
readme = { file = "README.md", content-type = "text/markdown" }
1010
requires-python = ">=3.9"
@@ -24,9 +24,13 @@ dev = [
2424
"networkx>=3.2"
2525
]
2626

27+
[tool.setuptools.package-data]
28+
"rag_firewall" = ["schema/*.json"]
29+
2730
[project.urls]
2831
Homepage = "https://github.com/taladari/rag-firewall"
2932
Repository = "https://github.com/taladari/rag-firewall"
3033

3134
[project.scripts]
3235
ragfw = "rag_firewall.cli:main"
36+

rag_firewall/firewall.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,26 @@ def from_yaml(cls, path):
1818
with open(path,"r",encoding="utf-8") as f:
1919
if not yaml: raise RuntimeError("PyYAML is required to load YAML configs.")
2020
cfg=yaml.safe_load(f)
21+
# Optional JSON Schema validation: runs only when the third-party
# ``jsonschema`` package is importable; otherwise the config is used as-is.
try:
    import jsonschema  # type: ignore
except ModuleNotFoundError:
    # jsonschema not installed; skip validation gracefully
    jsonschema = None
if jsonschema is not None:
    import json
    from importlib.resources import files
    try:
        # Read via the Traversable API instead of open() so the bundled
        # schema loads regardless of how the package is installed.
        schema_res = files("rag_firewall").joinpath("schema/firewall.schema.json")
        schema = json.loads(schema_res.read_text(encoding="utf-8"))
    except (OSError, ValueError, ModuleNotFoundError):
        # Bundled schema missing or unreadable: validation is best-effort,
        # so continue without blocking config loading.
        schema = None
    if schema is not None:
        try:
            jsonschema.validate(instance=cfg, schema=schema)
        except jsonschema.ValidationError as ve:
            # Raised outside any broad handler so the error actually
            # reaches the caller instead of being swallowed.
            raise ValueError(f"Invalid firewall config: {ve.message}") from ve
        except jsonschema.SchemaError:
            # The shipped schema itself is malformed; that is not a
            # problem with the user's config, so do not block on it.
            pass
2141
scanners=[]
2242
from .scanners.regex_scanner import RegexInjectionScanner
2343
from .scanners.pii_scanner import PIIScanner

rag_firewall/scanners/secrets_scanner.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,13 @@
77
(r"ghp_[A-Za-z0-9]{36}","github_token"),(r"AIza[0-9A-Za-z\-_]{35}","google_api_key"),
88
(r"xox[abp]-\d{10,}-\d{10,}-[A-Za-z0-9-]{24,}","slack_token"),(r"sk-[A-Za-z0-9]{32,}","generic_sk_token"),
99
(r"(?i)bearer\s+[A-Za-z0-9\-_\.=]{20,}","bearer_token"),
10-
(r"-----BEGIN (?:RSA|OPENSSH|EC) PRIVATE KEY-----","private_key")]
10+
(r"-----BEGIN (?:RSA|OPENSSH|EC) PRIVATE KEY-----","private_key"),
11+
# Additional high-signal patterns
12+
(r"hf_[A-Za-z0-9]{30,}","huggingface_token"),
13+
(r"dapi[a-zA-Z0-9]{24}","databricks_token"),
14+
(r"https://hooks\.slack\.com/services/[A-Za-z0-9/+]{20,}","slack_webhook"),
15+
(r"(?i)azure[_\s-]key[vV]ault|AZURE_[A-Z0-9_]{8,}","azure_secret_suspect"),
16+
(r"secret_[A-Za-z0-9]{32,}","generic_secret_token")]
1117
class SecretsScanner:
1218
def __init__(self, extra_patterns=None):
1319
import regex as re

rag_firewall/scanners/url_scanner.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
import regex as re
55
from urllib.parse import urlparse
6+
import ipaddress
67
URL_RE=re.compile(r"https?://[\w\-\.:%#@/\?=~\+,&]+", re.I)
78
class URLScanner:
89
def __init__(self, allowlist=None, denylist=None):
@@ -13,6 +14,17 @@ def scan(self, text, metadata):
1314
for m in URL_RE.findall(t):
1415
host=(urlparse(m).hostname or "").lower()
1516
sev="low"; reason="url_found"
17+
# Flag IP literals (IPv4/IPv6)
if host:
    try:
        ipaddress.ip_address(host)
    except ValueError:
        # Not an IP literal; fall through to the hostname checks.
        pass
    else:
        out.append({"scanner":"url","match":host,"severity":"high","reason":"ip_literal"})
        # Do not continue; also evaluate allow/deny checks below
# Flag punycode domains. The "xn--" ACE prefix is applied per DNS label,
# so inspect every label rather than only the leftmost one; otherwise a
# host such as "login.xn--pple-43d.com" would go unflagged.
if any(label.startswith("xn--") for label in host.split(".")):
    out.append({"scanner":"url","match":host,"severity":"high","reason":"punycode_host"})
1628
if self.denylist and any(host==d or host.endswith("."+d) for d in self.denylist):
1729
sev="high"; reason="denylist_domain"
1830
elif self.allowlist and not any(host==d or host.endswith("."+d) for d in self.allowlist):

rag_firewall/schema/__init__.py

Whitespace-only changes.
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
{
2+
"$schema": "https://json-schema.org/draft/2020-12/schema",
3+
"title": "RAG Firewall Config v1",
4+
"type": "object",
5+
"properties": {
6+
"scanners": {
7+
"type": "array",
8+
"items": {
9+
"type": "object",
10+
"properties": {
11+
"type": { "type": "string", "enum": ["regex_injection","pii","secrets","encoded","url","conflict"] },
12+
"patterns": { "type": "array", "items": { "type": "string" } },
13+
"extra_patterns": { "type": "array", "items": { "type": "string" } },
14+
"min_len": { "type": "integer", "minimum": 1 },
15+
"ratio_threshold": { "type": "number", "minimum": 0, "maximum": 1 },
16+
"allowlist": { "type": "array", "items": { "type": "string" } },
17+
"denylist": { "type": "array", "items": { "type": "string" } },
18+
"stale_days": { "type": "integer", "minimum": 1 }
19+
},
20+
"required": ["type"],
21+
"additionalProperties": true
22+
}
23+
},
24+
"policies": {
25+
"type": "array",
26+
"items": {
27+
"type": "object",
28+
"properties": {
29+
"name": { "type": "string" },
30+
"action": { "type": "string", "enum": ["allow","deny","rerank"] },
31+
"match": { "type": "object" },
32+
"weight": {
33+
"type": "object",
34+
"properties": {
35+
"recency": { "type": "number" },
36+
"relevance": { "type": "number" },
37+
"provenance": { "type": "number" }
38+
},
39+
"additionalProperties": false
40+
}
41+
},
42+
"required": ["name","action"],
43+
"additionalProperties": true
44+
}
45+
}
46+
},
47+
"required": ["scanners","policies"],
48+
"additionalProperties": false
49+
}

tests/test_config_validation.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import os, json, tempfile
2+
import pytest
3+
4+
from rag_firewall.firewall import Firewall
5+
6+
VALID_CFG = {
7+
"scanners": [
8+
{"type": "regex_injection"},
9+
{"type": "secrets"},
10+
{"type": "url", "allowlist": ["good.example.com"], "denylist": ["evil.example.com"]},
11+
{"type": "conflict", "stale_days": 120}
12+
],
13+
"policies": [
14+
{"name": "allow_default", "action": "allow"}
15+
]
16+
}
17+
18+
INVALID_ACTION_CFG = {
19+
"scanners": [ {"type": "regex_injection"} ],
20+
"policies": [ {"name": "bad", "action": "block"} ] # not in enum
21+
}
22+
23+
INVALID_SCANNER_TYPE_CFG = {
24+
"scanners": [ {"type": "unknown_scanner"} ],
25+
"policies": [ {"name": "allow_default", "action": "allow"} ]
26+
}
27+
28+
def _write_tmp_yaml(obj):
    """Serialize *obj* as YAML into a new temp file and return its path.

    The file is created with ``tempfile.mkstemp`` and is not deleted here.
    """
    import yaml

    handle, tmp_path = tempfile.mkstemp(suffix=".yaml")
    stream = os.fdopen(handle, "w", encoding="utf-8")
    try:
        yaml.safe_dump(obj, stream)
    finally:
        stream.close()
    return tmp_path
34+
35+
@pytest.mark.parametrize("cfg", [VALID_CFG])
def test_valid_config_passes_validation(cfg):
    """A well-formed config loads into a ``Firewall`` without raising.

    Validation is skipped by the loader when jsonschema is not installed,
    so this test is expected to pass in both environments.
    """
    path = _write_tmp_yaml(cfg)
    try:
        # Should not raise; might skip validation if jsonschema not installed
        fw = Firewall.from_yaml(path)
    finally:
        # mkstemp files are never removed automatically; clean up so test
        # runs do not accumulate stray files in the temp directory.
        os.remove(path)
    assert isinstance(fw, Firewall)
41+
42+
@pytest.mark.parametrize("cfg", [INVALID_ACTION_CFG, INVALID_SCANNER_TYPE_CFG])
def test_invalid_config_raises_when_jsonschema_present(cfg):
    """Configs that violate the schema make ``Firewall.from_yaml`` raise ValueError.

    Skipped when jsonschema is unavailable, because validation is optional.
    """
    try:
        import jsonschema  # noqa: F401
    except ImportError:
        # Only a failed import means "not installed"; any other error
        # should surface rather than be turned into a skip.
        pytest.skip("jsonschema not installed; validation is optional")
    path = _write_tmp_yaml(cfg)
    try:
        with pytest.raises(ValueError):
            Firewall.from_yaml(path)
    finally:
        # mkstemp files are never removed automatically; clean up explicitly.
        os.remove(path)

tests/test_secrets_extended.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
from rag_firewall.scanners.secrets_scanner import SecretsScanner
2+
3+
4+
def test_huggingface_and_databricks_and_slack_webhook_detected():
    """The scanner reports the Hugging Face, Databricks and Slack-webhook patterns."""
    sample = "hf_abcdefghijklmnopqrstuvwxyzABCDE dapiABCDEFGHIJKLMNOPQRSTUVWX https://hooks.slack.com/services/T12345/A12345/ABCDEFghijklmnop"
    scanner = SecretsScanner()
    # Collect the "match" field of every finding for membership checks.
    reported = {finding.get("match") for finding in scanner.scan(sample, {})}
    for expected in ("huggingface_token", "databricks_token", "slack_webhook"):
        assert expected in reported
12+
13+
14+
def test_azure_like_and_generic_secret_detected():
    """The scanner reports an Azure-style identifier and a generic ``secret_`` token."""
    scanner = SecretsScanner()
    sample = "AZURE_SECRET_ABCDEFGH secret_abcdefghijklmnopqrstuvwxyz123456"
    # Collect the "match" field of every finding for membership checks.
    reported = {finding.get("match") for finding in scanner.scan(sample, {})}
    # Accept either the exact label or any reported value mentioning "azure".
    azure_seen = "azure_secret_suspect" in reported or any("azure" in label for label in reported)
    assert azure_seen
    assert "generic_secret_token" in reported

0 commit comments

Comments
 (0)