Skip to content

Commit 3db7b4f

Browse files
badGarnet, lawrence-u10d, claude, ryannikolaidis
authored
Fix: replace nltk with spacy (CVE-2025-14009) (#4255)
Co-authored-by: Lawrence Elitzer <[email protected]>
Co-authored-by: Claude Opus 4.6 <[email protected]>
Co-authored-by: ryannikolaidis <[email protected]>
Co-authored-by: badGarnet <[email protected]>
1 parent a8f14ba commit 3db7b4f

File tree

38 files changed

+619
-337
lines changed

38 files changed

+619
-337
lines changed

.github/actions/base-cache/action.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
21
name: 'Base Cache Build'
32
description: 'Set up uv and install project dependencies'
43
inputs:
@@ -20,4 +19,3 @@ runs:
2019
shell: bash
2120
run: |
2221
uv sync --locked --all-extras --all-groups
23-
make install-nltk-models

.github/workflows/ci.yml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,6 @@ permissions:
1212
id-token: write
1313
contents: read
1414

15-
env:
16-
NLTK_DATA: ${{ github.workspace }}/nltk_data
17-
1815
jobs:
1916
setup:
2017
strategy:
@@ -123,7 +120,6 @@ jobs:
123120
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
124121
run: |
125122
uv sync --locked --group test
126-
make install-nltk-models
127123
make test-no-extras CI=true
128124
129125
test_unit_dependency_extras:
@@ -163,7 +159,6 @@ jobs:
163159
- name: Install extra dependencies
164160
run: |
165161
uv sync --locked ${{ matrix.uv-extras }} --group test
166-
make install-nltk-models
167162
- name: Install system dependencies
168163
run: |
169164
sudo apt-get update

.github/workflows/codeflash.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,6 @@ jobs:
1616
name: Optimize new Python code
1717
if: ${{ github.actor != 'codeflash-ai[bot]' }}
1818
runs-on: ubuntu-latest
19-
env:
20-
NLTK_DATA: ${{ github.workspace }}/nltk_data
2119
steps:
2220
- uses: actions/checkout@v4
2321
with:

.github/workflows/ingest-test-fixtures-update-pr.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ on:
55

66
env:
77
PYTHON_VERSION: "3.12"
8-
NLTK_DATA: ${{ github.workspace }}/nltk_data
98

109
permissions:
1110
id-token: write

.gitignore

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ wheels/
2424
pip-wheel-metadata/
2525
share/python-wheels/
2626
*.egg-info/
27-
nltk_data/
2827
.installed.cfg
2928
*.egg
3029
MANIFEST

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
## 0.21.0
2+
3+
### Fixes
4+
- **Replace NLTK with spaCy to remediate CVE-2025-14009**: NLTK's downloader uses `zipfile.extractall()` without path validation, enabling RCE via malicious packages (CVSS 10.0, no patch available). spaCy models install as pip packages, eliminating the vulnerable downloader entirely.
5+
16
## 0.20.8
27

38
### Fixes

Dockerfile

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -68,14 +68,11 @@ RUN ./initialize-libreoffice.sh && rm initialize-libreoffice.sh
6868
WORKDIR /app
6969

7070
ENV TESSDATA_PREFIX=/usr/local/share/tessdata
71-
ENV NLTK_DATA=/home/notebook-user/nltk_data
7271
ENV UV_COMPILE_BYTECODE=1
7372
ENV UV_PYTHON_DOWNLOADS=never
7473

75-
# Install Python dependencies via uv and download required NLTK packages
74+
# Install Python dependencies via uv (en-core-web-sm is declared in pyproject.toml)
7675
RUN uv sync --locked --all-extras --no-group dev --no-group lint --no-group test --no-group release && \
77-
mkdir -p ${NLTK_DATA} && \
78-
uv run --no-sync $PYTHON -m nltk.downloader -d ${NLTK_DATA} punkt_tab averaged_perceptron_tagger_eng && \
7976
uv run --no-sync $PYTHON -c "from unstructured.partition.model_init import initialize; initialize()" && \
8077
uv run --no-sync $PYTHON -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"
8178

Makefile

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,17 +14,12 @@ help: Makefile
1414
.PHONY: install
1515
install:
1616
@uv sync --locked --all-extras --all-groups
17-
@$(MAKE) install-nltk-models
1817

1918
## lock: update and lock all dependencies
2019
.PHONY: lock
2120
lock:
2221
@uv lock --upgrade
2322

24-
.PHONY: install-nltk-models
25-
install-nltk-models:
26-
uv run --no-sync python -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()"
27-
2823

2924
#################
3025
# Test and Lint #

pyproject.toml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,16 @@ dependencies = [
2828
"html5lib>=1.1, <2.0.0",
2929
"langdetect>=1.0.9, <2.0.0",
3030
"lxml>=5.0.0, <7.0.0",
31-
"nltk>=3.9.2, <4.0.0",
31+
"spacy>=3.7.0, <4.0.0",
32+
"en-core-web-sm>=3.8.0, <4.0.0",
3233
"numba>=0.60.0, <1.0.0",
3334
"numpy>=1.26.0, <3.0.0",
3435
"psutil>=7.2.2, <8.0.0",
3536
"python-iso639>=2026.1.31, <2027.0.0",
3637
"python-magic>=0.4.27, <1.0.0",
3738
"python-oxmsg>=0.0.2, <1.0.0",
3839
"rapidfuzz>=3.14.3, <4.0.0",
40+
"regex>=2024.0.0, <2027.0.0",
3941
"requests>=2.32.5, <3.0.0",
4042
"tqdm>=4.67.3, <5.0.0",
4143
"typing-extensions>=4.15.0, <5.0.0",
@@ -179,6 +181,9 @@ release = [
179181
"twine>=6.0.0, <7.0.0",
180182
]
181183

184+
[tool.uv.sources]
185+
en-core-web-sm = { url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl" }
186+
182187
[tool.uv]
183188
required-environments = [
184189
"sys_platform == 'linux' and platform_machine == 'x86_64'",

test_unstructured/metrics/test_element_type.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@
2727
"sample-presentation.pptx",
2828
{
2929
("Title", 0): 4,
30-
("Title", 1): 1,
31-
("NarrativeText", 0): 3,
30+
("Title", 1): 2,
31+
("NarrativeText", 0): 2,
3232
("PageBreak", None): 3,
3333
("ListItem", 0): 6,
3434
("ListItem", 1): 6,
@@ -68,7 +68,7 @@ def test_get_element_type_frequency(filename: str, frequency: dict[tuple[str, in
6868
("ListItem", 2): 3,
6969
("Table", None): 1,
7070
},
71-
(0.96, 0.96, 0.96),
71+
(0.92, 0.92, 0.92),
7272
),
7373
(
7474
"handbook-1p.docx",

0 commit comments

Comments
 (0)