diff --git a/README.md b/README.md index e1b1c9a..ca7ff50 100644 --- a/README.md +++ b/README.md @@ -172,12 +172,13 @@ sharepoint2text --file /path/to/file.docx --json > extraction.json | Option | Description | |---|---| -| `--file FILE` | Required input file | +| `--file FILE`, `-f FILE` | Required input file | | `--output FILE`, `-o FILE` | Write output to file (default: stdout) | -| `--json` | Emit `list[extraction_object]` | -| `--json-unit` | Emit `list[list[unit_object]]` | -| `--include-images` | Include binary image payloads as base64 in JSON output | -| `--version` | Print CLI version | +| `--json`, `-j` | Emit `list[extraction_object]` | +| `--json-unit`, `-u` | Emit `list[unit_object]` | +| `--include-images`, `-i` | Include binary image payloads as base64 in JSON output | +| `--no-attachments`, `-n` | Exclude email attachments from CLI extraction output | +| `--version`, `-v` | Print CLI version | Rules: diff --git a/sharepoint2text/cli.py b/sharepoint2text/cli.py index 11520f8..aa9553a 100644 --- a/sharepoint2text/cli.py +++ b/sharepoint2text/cli.py @@ -4,10 +4,13 @@ import json import sys from pathlib import Path -from typing import Sequence, TextIO +from typing import Iterator, Sequence, TextIO import sharepoint2text -from sharepoint2text.parsing.extractors.data_types import ExtractionInterface +from sharepoint2text.parsing.extractors.data_types import ( + EmailContent, + ExtractionInterface, +) from sharepoint2text.parsing.extractors.serialization import serialize_extraction @@ -18,12 +21,14 @@ def _build_parser() -> argparse.ArgumentParser: description="Extract file content and emit full text to stdout (or JSON with --json/--json-unit).", ) parser.add_argument( + "-v", "--version", action="version", version=f"%(prog)s {sharepoint2text.__version__}", help="Show the version and exit.", ) parser.add_argument( + "-f", "--file", type=Path, required=True, @@ -31,22 +36,32 @@ def _build_parser() -> argparse.ArgumentParser: ) output_group = parser.add_mutually_exclusive_group() output_group.add_argument( + "-j", "--json", action="store_true", help="Emit structured JSON instead of plain full text (omits binary payloads by default).", ) output_group.add_argument( + "-u", "--json-unit", dest="json_unit", action="store_true", help="Emit JSON for extracted text units instead of full extraction objects (omits binary payloads by default).", ) parser.add_argument( + "-i", "--include-images", dest="include_images", action="store_true", help="Extract images from the file and include image data as base64 blobs in JSON output (default: images are ignored for faster processing).", ) + parser.add_argument( + "-n", + "--no-attachments", + dest="no_attachments", + action="store_true", + help="For email files, exclude supported attachments from CLI extraction output.", + ) parser.add_argument( "--output", "-o", @@ -71,19 +86,22 @@ def _serialize_results( def _serialize_unit_results( - results: list[ExtractionInterface], *, include_binary: bool -) -> list[list[dict]]: + results: list[ExtractionInterface], + *, + include_binary: bool, + include_email_attachments: bool = False, +) -> list[dict]: """Serialize per-unit output for ``--json-unit`` mode. - Always returns ``list[list[dict]]`` so each extraction result keeps a stable - unit-list boundary. + Returns a flat ``list[dict]`` with one dictionary per extracted unit. """ return [ - [ - serialize_extraction(unit, include_binary=include_binary) - for unit in result.iterate_units() - ] + serialize_extraction(unit, include_binary=include_binary) for result in results + for extraction in _iter_result_tree( + result, include_email_attachments=include_email_attachments + ) + for unit in extraction.iterate_units() ] @@ -92,6 +110,34 @@ def _serialize_full_text(results: list[ExtractionInterface]) -> str: return "\n\n".join(result.get_full_text().rstrip() for result in results).rstrip() +def _expand_email_results( + results: list[ExtractionInterface], +) -> list[ExtractionInterface]: + """Expand email results with any supported extracted attachments.""" + expanded: list[ExtractionInterface] = [] + for result in results: + expanded.extend(_iter_result_tree(result, include_email_attachments=True)) + return expanded + + +def _iter_result_tree( + result: ExtractionInterface, *, include_email_attachments: bool +) -> Iterator[ExtractionInterface]: + """Yield a root result and optionally nested supported email attachments.""" + yield result + if not include_email_attachments or not isinstance(result, EmailContent): + return + for attachment in result.iterate_supported_attachments(): + yield from _iter_result_tree(attachment, include_email_attachments=True) + + +def _strip_email_attachments(results: list[ExtractionInterface]) -> None: + """Remove parsed attachment metadata/payloads from email results in-place.""" + for result in results: + if isinstance(result, EmailContent): + result.attachments = [] + + def main(argv: Sequence[str] | None = None) -> int: """Run the CLI and return a process-style exit code. @@ -151,18 +197,37 @@ def main(argv: Sequence[str] | None = None) -> int: ) if not results: raise RuntimeError(f"No extraction results for {args.file}") + if args.no_attachments: + _strip_email_attachments(results) if args.json or args.json_unit: include_binary = bool(args.include_images) payload = ( - _serialize_unit_results(results, include_binary=include_binary) + _serialize_unit_results( + results, + include_binary=include_binary, + include_email_attachments=not args.no_attachments, + ) if args.json_unit - else _serialize_results(results, include_binary=include_binary) + else _serialize_results( + ( + _expand_email_results(results) + if not args.no_attachments + else results + ), + include_binary=include_binary, + ) ) - json.dump(payload, output_stream) + json.dump(payload, output_stream, indent=4) output_stream.write("\n") else: - output_stream.write(_serialize_full_text(results)) + output_stream.write( + _serialize_full_text( + _expand_email_results(results) + if not args.no_attachments + else results + ) + ) output_stream.write("\n") finally: if output_file is not None: diff --git a/sharepoint2text/tests/test_cli.py b/sharepoint2text/tests/test_cli.py index 256c86b..3369475 100644 --- a/sharepoint2text/tests/test_cli.py +++ b/sharepoint2text/tests/test_cli.py @@ -5,6 +5,13 @@ from sharepoint2text.cli import _serialize_results, main from sharepoint2text.parsing.extractors.serialization import serialize_extraction +EMAIL_WITH_ATTACHMENT_PATH = Path( + "sharepoint2text/tests/resources/mails/msg_with_attachment.eml" +).resolve() +BASIC_EMAIL_PATH = Path( + "sharepoint2text/tests/resources/mails/basic_email.eml" +).resolve() + def test_cli_outputs_full_text_by_default(capsys) -> None: path = Path("sharepoint2text/tests/resources/plain_text/plain.txt").resolve() @@ -33,6 +40,22 @@ def test_cli_outputs_json_with_flag(capsys) -> None: assert payload == expected +def test_cli_outputs_json_with_short_flag(capsys) -> None: + path = Path("sharepoint2text/tests/resources/plain_text/plain.txt").resolve() + expected = [ + serialize_extraction( + next(sharepoint2text.read_file(path)), include_binary=False + ) + ] + + exit_code = main(["-j", "-f", str(path)]) + captured = capsys.readouterr() + + assert exit_code == 0 + payload = json.loads(captured.out.strip()) + assert payload == expected + + def test_serialize_results_returns_list_for_multiple_results() -> None: path = Path("sharepoint2text/tests/resources/plain_text/plain.txt").resolve() result = next(sharepoint2text.read_file(path)) @@ -47,10 +70,8 @@ def test_cli_outputs_json_unit_with_flag(capsys) -> None: path = Path("sharepoint2text/tests/resources/plain_text/plain.txt").resolve() result = next(sharepoint2text.read_file(path)) expected = [ - [ - serialize_extraction(unit, include_binary=False) - for unit in result.iterate_units() - ] + serialize_extraction(unit, include_binary=False) + for unit in result.iterate_units() ] exit_code = main(["--json-unit", "--file", str(path)]) @@ -61,6 +82,97 @@ def test_cli_outputs_json_unit_with_flag(capsys) -> None: assert payload == expected +def test_cli_outputs_json_unit_with_short_flag(capsys) -> None: + path = Path("sharepoint2text/tests/resources/plain_text/plain.txt").resolve() + result = next(sharepoint2text.read_file(path)) + expected = [ + serialize_extraction(unit, include_binary=False) + for unit in result.iterate_units() + ] + + exit_code = main(["-u", "-f", str(path)]) + captured = capsys.readouterr() + + assert exit_code == 0 + payload = json.loads(captured.out.strip()) + assert payload == expected + + +def test_cli_plain_text_extracts_email_content(capsys) -> None: + expected = next(sharepoint2text.read_file(BASIC_EMAIL_PATH)).get_full_text() + + exit_code = main(["--file", str(BASIC_EMAIL_PATH)]) + captured = capsys.readouterr() + + assert exit_code == 0 + assert captured.out == f"{expected}\n" + + +def test_cli_plain_text_extracts_supported_email_attachments(capsys) -> None: + exit_code = main(["--file", str(EMAIL_WITH_ATTACHMENT_PATH)]) + captured = capsys.readouterr() + + assert exit_code == 0 + assert "This is a test sentence" in captured.out + assert "The slide title" in captured.out + + +def test_cli_json_extracts_supported_email_attachments(capsys) -> None: + exit_code = main(["--json", "--file", str(EMAIL_WITH_ATTACHMENT_PATH)]) + captured = capsys.readouterr() + + assert exit_code == 0 + payload = json.loads(captured.out.strip()) + assert isinstance(payload, list) + assert {item["_type"] for item in payload} == { + "EmailContent", + "PdfContent", + "PptxContent", + } + + +def test_cli_json_no_attachments_excludes_email_attachments(capsys) -> None: + exit_code = main( + ["--json", "--no-attachments", "--file", str(EMAIL_WITH_ATTACHMENT_PATH)] + ) + captured = capsys.readouterr() + + assert exit_code == 0 + payload = json.loads(captured.out.strip()) + assert isinstance(payload, list) + assert {item["_type"] for item in payload} == {"EmailContent"} + assert payload[0]["attachments"] == [] + + +def test_cli_json_no_attachments_excludes_email_attachments_with_short_flag( + capsys, +) -> None: + exit_code = main(["-j", "-n", "-f", str(EMAIL_WITH_ATTACHMENT_PATH)]) + captured = capsys.readouterr() + + assert exit_code == 0 + payload = json.loads(captured.out.strip()) + assert isinstance(payload, list) + assert {item["_type"] for item in payload} == {"EmailContent"} + assert payload[0]["attachments"] == [] + + +def test_cli_json_unit_extracts_supported_email_attachments(capsys) -> None: + exit_code = main(["--json-unit", "--file", str(EMAIL_WITH_ATTACHMENT_PATH)]) + captured = capsys.readouterr() + + assert exit_code == 0 + payload = json.loads(captured.out.strip()) + assert isinstance(payload, list) + + unit_types = { + unit["_type"] for unit in payload if isinstance(unit, dict) and "_type" in unit + } + assert "EmailUnit" in unit_types + assert "PdfUnit" in unit_types + assert "PptxUnit" in unit_types + + def _contains_binary_markers(value: object) -> bool: if isinstance(value, dict): if "_bytes" in value or "_bytesio" in value: @@ -100,12 +212,11 @@ def test_cli_outputs_json_unit_without_images(capsys) -> None: payload = json.loads(captured.out.strip()) assert isinstance(payload, list) assert len(payload) > 0 - assert isinstance(payload[0], list) - assert payload[0][0]["_type"] == "PdfUnit" + assert payload[0]["_type"] == "PdfUnit" assert _contains_binary_markers(payload) is False # Images are not extracted by default - images = payload[0][0]["images"] + images = payload[0]["images"] assert len(images) == 0 @@ -137,11 +248,10 @@ def test_cli_outputs_json_unit_with_binary_payloads_when_requested(capsys) -> No payload = json.loads(captured.out.strip()) assert isinstance(payload, list) assert len(payload) > 0 - assert isinstance(payload[0], list) - assert payload[0][0]["_type"] == "PdfUnit" + assert payload[0]["_type"] == "PdfUnit" assert _contains_binary_markers(payload) is True - images = payload[0][0]["images"] + images = payload[0]["images"] assert len(images) > 0 assert isinstance(images[0]["data"], dict) assert "_bytesio" in images[0]["data"] or "_bytes" in images[0]["data"]