Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -172,12 +172,13 @@ sharepoint2text --file /path/to/file.docx --json > extraction.json

| Option | Description |
|---|---|
| `--file FILE` | Required input file |
| `--file FILE`, `-f FILE` | Required input file |
| `--output FILE`, `-o FILE` | Write output to file (default: stdout) |
| `--json` | Emit `list[extraction_object]` |
| `--json-unit` | Emit `list[list[unit_object]]` |
| `--include-images` | Include binary image payloads as base64 in JSON output |
| `--version` | Print CLI version |
| `--json`, `-j` | Emit `list[extraction_object]` |
| `--json-unit`, `-u` | Emit `list[unit_object]` |
| `--include-images`, `-i` | Include binary image payloads as base64 in JSON output |
| `--no-attachments`, `-n` | Exclude email attachments from CLI extraction output |
| `--version`, `-v` | Print CLI version |

Rules:

Expand Down
93 changes: 79 additions & 14 deletions sharepoint2text/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,13 @@
import json
import sys
from pathlib import Path
from typing import Sequence, TextIO
from typing import Iterator, Sequence, TextIO

import sharepoint2text
from sharepoint2text.parsing.extractors.data_types import ExtractionInterface
from sharepoint2text.parsing.extractors.data_types import (
EmailContent,
ExtractionInterface,
)
from sharepoint2text.parsing.extractors.serialization import serialize_extraction


Expand All @@ -18,35 +21,47 @@ def _build_parser() -> argparse.ArgumentParser:
description="Extract file content and emit full text to stdout (or JSON with --json/--json-unit).",
)
parser.add_argument(
"-v",
"--version",
action="version",
version=f"%(prog)s {sharepoint2text.__version__}",
help="Show the version and exit.",
)
parser.add_argument(
"-f",
"--file",
type=Path,
required=True,
help="Path to the file to extract.",
)
output_group = parser.add_mutually_exclusive_group()
output_group.add_argument(
"-j",
"--json",
action="store_true",
help="Emit structured JSON instead of plain full text (omits binary payloads by default).",
)
output_group.add_argument(
"-u",
"--json-unit",
dest="json_unit",
action="store_true",
help="Emit JSON for extracted text units instead of full extraction objects (omits binary payloads by default).",
)
parser.add_argument(
"-i",
"--include-images",
dest="include_images",
action="store_true",
help="Extract images from the file and include image data as base64 blobs in JSON output (default: images are ignored for faster processing).",
)
parser.add_argument(
"-n",
"--no-attachments",
dest="no_attachments",
action="store_true",
help="For email files, exclude supported attachments from CLI extraction output.",
)
parser.add_argument(
"--output",
"-o",
Expand All @@ -71,19 +86,22 @@ def _serialize_results(


def _serialize_unit_results(
results: list[ExtractionInterface], *, include_binary: bool
) -> list[list[dict]]:
results: list[ExtractionInterface],
*,
include_binary: bool,
include_email_attachments: bool = False,
) -> list[dict]:
"""Serialize per-unit output for ``--json-unit`` mode.

Always returns ``list[list[dict]]`` so each extraction result keeps a stable
unit-list boundary.
Returns a flat ``list[dict]`` with one dictionary per extracted unit.
"""
return [
[
serialize_extraction(unit, include_binary=include_binary)
for unit in result.iterate_units()
]
serialize_extraction(unit, include_binary=include_binary)
for result in results
for extraction in _iter_result_tree(
result, include_email_attachments=include_email_attachments
)
for unit in extraction.iterate_units()
]


Expand All @@ -92,6 +110,34 @@ def _serialize_full_text(results: list[ExtractionInterface]) -> str:
return "\n\n".join(result.get_full_text().rstrip() for result in results).rstrip()


def _expand_email_results(
results: list[ExtractionInterface],
) -> list[ExtractionInterface]:
"""Expand email results with any supported extracted attachments."""
expanded: list[ExtractionInterface] = []
for result in results:
expanded.extend(_iter_result_tree(result, include_email_attachments=True))
return expanded


def _iter_result_tree(
result: ExtractionInterface, *, include_email_attachments: bool
) -> Iterator[ExtractionInterface]:
"""Yield a root result and optionally nested supported email attachments."""
yield result
if not include_email_attachments or not isinstance(result, EmailContent):
return
for attachment in result.iterate_supported_attachments():
yield from _iter_result_tree(attachment, include_email_attachments=True)


def _strip_email_attachments(results: list[ExtractionInterface]) -> None:
"""Remove parsed attachment metadata/payloads from email results in-place."""
for result in results:
if isinstance(result, EmailContent):
result.attachments = []


def main(argv: Sequence[str] | None = None) -> int:
"""Run the CLI and return a process-style exit code.

Expand Down Expand Up @@ -151,18 +197,37 @@ def main(argv: Sequence[str] | None = None) -> int:
)
if not results:
raise RuntimeError(f"No extraction results for {args.file}")
if args.no_attachments:
_strip_email_attachments(results)

if args.json or args.json_unit:
include_binary = bool(args.include_images)
payload = (
_serialize_unit_results(results, include_binary=include_binary)
_serialize_unit_results(
results,
include_binary=include_binary,
include_email_attachments=not args.no_attachments,
)
if args.json_unit
else _serialize_results(results, include_binary=include_binary)
else _serialize_results(
(
_expand_email_results(results)
if not args.no_attachments
else results
),
include_binary=include_binary,
)
)
json.dump(payload, output_stream)
json.dump(payload, output_stream, indent=4)
output_stream.write("\n")
else:
output_stream.write(_serialize_full_text(results))
output_stream.write(
_serialize_full_text(
_expand_email_results(results)
if not args.no_attachments
else results
)
)
output_stream.write("\n")
finally:
if output_file is not None:
Expand Down
130 changes: 120 additions & 10 deletions sharepoint2text/tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@
from sharepoint2text.cli import _serialize_results, main
from sharepoint2text.parsing.extractors.serialization import serialize_extraction

# Email fixtures shared by the attachment-related CLI tests below.
# The paths are relative and get resolved against the current working
# directory at import time (presumably the repository root - confirm against
# how the test suite is invoked).
EMAIL_WITH_ATTACHMENT_PATH = Path(
    "sharepoint2text/tests/resources/mails/msg_with_attachment.eml"
).resolve()
BASIC_EMAIL_PATH = Path(
    "sharepoint2text/tests/resources/mails/basic_email.eml"
).resolve()


def test_cli_outputs_full_text_by_default(capsys) -> None:
path = Path("sharepoint2text/tests/resources/plain_text/plain.txt").resolve()
Expand Down Expand Up @@ -33,6 +40,22 @@ def test_cli_outputs_json_with_flag(capsys) -> None:
assert payload == expected


def test_cli_outputs_json_with_short_flag(capsys) -> None:
    """``-j``/``-f`` emit the serialized extraction list on stdout."""
    path = Path("sharepoint2text/tests/resources/plain_text/plain.txt").resolve()
    expected = [
        serialize_extraction(
            next(sharepoint2text.read_file(path)), include_binary=False
        )
    ]

    exit_code = main(["-j", "-f", str(path)])
    captured = capsys.readouterr()

    assert exit_code == 0
    payload = json.loads(captured.out.strip())
    assert payload == expected


def test_serialize_results_returns_list_for_multiple_results() -> None:
path = Path("sharepoint2text/tests/resources/plain_text/plain.txt").resolve()
result = next(sharepoint2text.read_file(path))
Expand All @@ -47,10 +70,8 @@ def test_cli_outputs_json_unit_with_flag(capsys) -> None:
path = Path("sharepoint2text/tests/resources/plain_text/plain.txt").resolve()
result = next(sharepoint2text.read_file(path))
expected = [
[
serialize_extraction(unit, include_binary=False)
for unit in result.iterate_units()
]
serialize_extraction(unit, include_binary=False)
for unit in result.iterate_units()
]

exit_code = main(["--json-unit", "--file", str(path)])
Expand All @@ -61,6 +82,97 @@ def test_cli_outputs_json_unit_with_flag(capsys) -> None:
assert payload == expected


def test_cli_outputs_json_unit_with_short_flag(capsys) -> None:
    """``-u``/``-f`` emit a flat list with one serialized dict per unit."""
    path = Path("sharepoint2text/tests/resources/plain_text/plain.txt").resolve()
    result = next(sharepoint2text.read_file(path))
    expected = [
        serialize_extraction(unit, include_binary=False)
        for unit in result.iterate_units()
    ]

    exit_code = main(["-u", "-f", str(path)])
    captured = capsys.readouterr()

    assert exit_code == 0
    payload = json.loads(captured.out.strip())
    assert payload == expected


def test_cli_plain_text_extracts_email_content(capsys) -> None:
    """Default (plain text) mode prints the email's full text plus a newline."""
    expected = next(sharepoint2text.read_file(BASIC_EMAIL_PATH)).get_full_text()

    exit_code = main(["--file", str(BASIC_EMAIL_PATH)])
    captured = capsys.readouterr()

    assert exit_code == 0
    assert captured.out == f"{expected}\n"


def test_cli_plain_text_extracts_supported_email_attachments(capsys) -> None:
    """Plain text output of an email also contains text from its attachments."""
    exit_code = main(["--file", str(EMAIL_WITH_ATTACHMENT_PATH)])
    captured = capsys.readouterr()

    assert exit_code == 0
    # Marker strings expected in the fixture's attachments (assumed to be the
    # PDF and PPTX seen in the JSON test for the same fixture - confirm).
    assert "This is a test sentence" in captured.out
    assert "The slide title" in captured.out


def test_cli_json_extracts_supported_email_attachments(capsys) -> None:
    """``--json`` lists the email and its supported attachments as siblings."""
    exit_code = main(["--json", "--file", str(EMAIL_WITH_ATTACHMENT_PATH)])
    captured = capsys.readouterr()

    assert exit_code == 0
    payload = json.loads(captured.out.strip())
    assert isinstance(payload, list)
    assert {item["_type"] for item in payload} == {
        "EmailContent",
        "PdfContent",
        "PptxContent",
    }


def test_cli_json_no_attachments_excludes_email_attachments(capsys) -> None:
    """``--no-attachments`` leaves only the email, with an empty attachment list."""
    exit_code = main(
        ["--json", "--no-attachments", "--file", str(EMAIL_WITH_ATTACHMENT_PATH)]
    )
    captured = capsys.readouterr()

    assert exit_code == 0
    payload = json.loads(captured.out.strip())
    assert isinstance(payload, list)
    assert {item["_type"] for item in payload} == {"EmailContent"}
    assert payload[0]["attachments"] == []


def test_cli_json_no_attachments_excludes_email_attachments_with_short_flag(
    capsys,
) -> None:
    """Short flags ``-j -n -f`` behave like ``--json --no-attachments --file``."""
    exit_code = main(["-j", "-n", "-f", str(EMAIL_WITH_ATTACHMENT_PATH)])
    captured = capsys.readouterr()

    assert exit_code == 0
    payload = json.loads(captured.out.strip())
    assert isinstance(payload, list)
    assert {item["_type"] for item in payload} == {"EmailContent"}
    assert payload[0]["attachments"] == []


def test_cli_json_unit_extracts_supported_email_attachments(capsys) -> None:
    """``--json-unit`` output includes units from the email and its attachments."""
    exit_code = main(["--json-unit", "--file", str(EMAIL_WITH_ATTACHMENT_PATH)])
    captured = capsys.readouterr()

    assert exit_code == 0
    payload = json.loads(captured.out.strip())
    assert isinstance(payload, list)

    # Collect the type tag of every dict entry in the flat unit list.
    unit_types = {
        unit["_type"] for unit in payload if isinstance(unit, dict) and "_type" in unit
    }
    assert "EmailUnit" in unit_types
    assert "PdfUnit" in unit_types
    assert "PptxUnit" in unit_types


def _contains_binary_markers(value: object) -> bool:
if isinstance(value, dict):
if "_bytes" in value or "_bytesio" in value:
Expand Down Expand Up @@ -100,12 +212,11 @@ def test_cli_outputs_json_unit_without_images(capsys) -> None:
payload = json.loads(captured.out.strip())
assert isinstance(payload, list)
assert len(payload) > 0
assert isinstance(payload[0], list)
assert payload[0][0]["_type"] == "PdfUnit"
assert payload[0]["_type"] == "PdfUnit"
assert _contains_binary_markers(payload) is False

# Images are not extracted by default
images = payload[0][0]["images"]
images = payload[0]["images"]
assert len(images) == 0


Expand Down Expand Up @@ -137,11 +248,10 @@ def test_cli_outputs_json_unit_with_binary_payloads_when_requested(capsys) -> No
payload = json.loads(captured.out.strip())
assert isinstance(payload, list)
assert len(payload) > 0
assert isinstance(payload[0], list)
assert payload[0][0]["_type"] == "PdfUnit"
assert payload[0]["_type"] == "PdfUnit"
assert _contains_binary_markers(payload) is True

images = payload[0][0]["images"]
images = payload[0]["images"]
assert len(images) > 0
assert isinstance(images[0]["data"], dict)
assert "_bytesio" in images[0]["data"] or "_bytes" in images[0]["data"]
Expand Down