Skip to content

Commit 9ba4f85

Browse files
authored
Handle errors reading file (#152)
1 parent 6ce80b4 commit 9ba4f85

File tree

3 files changed

+50
-23
lines changed

3 files changed

+50
-23
lines changed

src/hooks/presidio/path_filter.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ class PathScanStatus(Enum):
1818
EXCLUDED = 2
1919
PASSED = 3
2020
FAILED = 4
21+
ERRORED = 5
2122

2223

2324
class PathFilter:

src/hooks/presidio/scanner.py

Lines changed: 38 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ def __init__(self, results: List[PathScanResult] = []) -> None:
5151
self.paths_containing_personal_data: List[PathScanResult] = []
5252
self.paths_skipped: List[PathScanResult] = []
5353
self.paths_excluded: List[PathScanResult] = []
54+
self.paths_errored: List[PathScanResult] = []
5455
self.add_path_scan_results(results)
5556

5657
def add_path_scan_results(self, scan_results: List[PathScanResult]):
@@ -70,6 +71,9 @@ def add_path_scan_result(self, scan_result: PathScanResult):
7071
if scan_result.status == PathScanStatus.SKIPPED:
7172
self.paths_skipped.append(scan_result)
7273

74+
if scan_result.status == PathScanStatus.ERRORED:
75+
self.paths_errored.append(scan_result)
76+
7377
def __str__(self) -> str:
7478
with StringIO() as output_buffer:
7579
output_buffer.write("--------PERSONAL DATA SCAN SUMMARY--------")
@@ -94,6 +98,13 @@ def __str__(self) -> str:
9498
paths_without_issues_table.add_row([valid_path.path])
9599
output_buffer.write(str(paths_without_issues_table))
96100

101+
if self.paths_errored:
102+
output_buffer.write("\n\nFILES ERRORED\n")
103+
errored_paths_table = PrettyTable(["Path"])
104+
for errored_path in self.paths_errored:
105+
errored_paths_table.add_row([errored_path.path])
106+
output_buffer.write(str(errored_paths_table))
107+
97108
if self.paths_containing_personal_data:
98109
output_buffer.write("\n\nFILES CONTAINING PERSONAL DATA\n")
99110

@@ -155,29 +166,33 @@ def _scan_content(self, analyzer: AnalyzerEngine, entities: List[str], content:
155166
async def _scan_path(
    self, analyzer: AnalyzerEngine, entities: List[str], file_path: str, exclusions: List[re.Pattern[str]]
) -> PathScanResult:
    """Scan one path for personal data and describe the outcome.

    The path is first run through ``PathFilter._check_is_path_invalid``; when
    that returns a status, the result carries that status and the file is
    never opened. Otherwise the file is opened as UTF-8 text and its content
    is handed to ``_scan_content``: line by line (trailing whitespace
    stripped) when the lower-cased extension is listed in
    ``LINE_BY_LINE_FILE_EXTENSIONS``, or in one piece for every other
    extension.

    Args:
        analyzer: Engine forwarded unchanged to ``_scan_content``.
        entities: Entity names forwarded unchanged to ``_scan_content``.
        file_path: Path of the file to scan.
        exclusions: Compiled patterns forwarded to the path filter.

    Returns:
        A ``PathScanResult`` for ``file_path``. Its status is the one reported
        by the path filter, ``PASSED`` when no detections were collected,
        ``FAILED`` when at least one was, or ``ERRORED`` when any
        ``Exception`` was raised along the way.
    """
    try:
        path_filter = PathFilter()

        rejection_status = await path_filter._check_is_path_invalid(file_path, exclusions)
        if rejection_status is not None:
            return PathScanResult(file_path, rejection_status)

        suffix = Path(file_path).suffix.lower()
        async with await open_file(file_path, "r", encoding="utf-8") as stream:
            detections: List[PersonalDataDetection] = []
            if suffix not in self.LINE_BY_LINE_FILE_EXTENSIONS:
                whole_text = await stream.read()
                logger.debug("Scanning file %s by reading all contents", file_path)
                detections.extend(self._scan_content(analyzer, entities, whole_text))
            else:
                logger.debug("Scanning file %s line by line", file_path)
                async for raw_line in stream:
                    detections.extend(self._scan_content(analyzer, entities, raw_line.rstrip()))

            # An empty detection list means the file is clean.
            outcome = PathScanStatus.FAILED if detections else PathScanStatus.PASSED
            return PathScanResult(file_path, status=outcome, results=detections)
    except Exception:
        # Report the failure as a per-file result instead of letting the
        # exception propagate to the caller.
        logger.exception("The file scanner failed to read file %s", file_path, stack_info=True)
        return PathScanResult(file_path, status=PathScanStatus.ERRORED)
181196

182197
async def scan(
183198
self,

tests/unit/hooks/presidio/test_scanner.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,17 @@ async def test_scan_path_scans_file_contents_for_file_extensions_with_expected_r
6565
mock_scan_content.assert_called_once_with(ANY, ANY, contents)
6666
assert pickle.dumps(result) == pickle.dumps(expected_scan_result)
6767

68+
async def test_scan_path_handles_exception(self):
    """An exception raised by ``_scan_content`` yields an ERRORED result."""
    payload = "Error reading this file"
    async with NamedTemporaryFile(suffix="file1.csv", mode="w+t") as temp_file:
        # Make the content scan blow up for the duration of this test.
        with patch.object(PresidioScanner, "_scan_content", side_effect=Exception()):
            await temp_file.write(payload)
            await temp_file.seek(0)

            scanner = PresidioScanner()
            outcome = await scanner._scan_path(MagicMock(), [], temp_file.name, [])
            assert outcome.status == PathScanStatus.ERRORED
78+
6879
def test_scan_content_returns_detections_list_when_path_has_personal_data(self):
6980
contents = "I have personal data"
7081

0 commit comments

Comments
 (0)