-
Notifications
You must be signed in to change notification settings - Fork 36
Expand file tree
/
Copy pathpyproject.toml
More file actions
83 lines (77 loc) · 2.11 KB
/
pyproject.toml
File metadata and controls
83 lines (77 loc) · 2.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
[build-system]
# `project.license` is written as a bare SPDX string (PEP 639). Setuptools only
# accepts that form from the 77.x series onward; earlier releases reject it
# during metadata validation, so the lower bound must reflect that.
requires = ["setuptools>=77.0.3", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "glossapi"
version = "0.1.3"
description = "Academic document processing pipeline with Rust-powered markdown cleaning"
authors = [
    { name = "GlossAPI Team", email = "glossapi.team@eellak.gr" },
]
readme = "README.md"
# The hard pin on scikit-learn==1.6.1 below only supports Python 3.9+, so
# advertising 3.8 would let pip start an install that can never resolve.
requires-python = ">=3.9"
dependencies = [
    # Core pipeline deps
    "pandas>=1.3.0",
    "numpy<2", # ORT+RapidOCR best compatibility
    "scikit-learn==1.6.1",
    "joblib>=1.0.0",
    "dask>=2022.1.0",
    "pyarrow>=7.0.0",
    "aiohttp>=3.8.0",
    "aiofiles>=23.0.0",
    "ftfy>=6.0.0",
    "tenacity>=8.0.0",
    "tqdm>=4.67.0",
    "pyyaml>=6.0",
    # Math/JSON enrichment helpers
    "pypdfium2>=4.0.0",
    "zstandard>=0.22.0",
]
# SPDX license expression (PEP 639 string form).
license = "EUPL-1.2"
classifiers = [
    "Programming Language :: Python",
    "Programming Language :: Python :: 3",
    "Programming Language :: Rust",
]
[project.optional-dependencies]
# Extras are opt-in so that a plain install stays import-light.

# Docling + RapidOCR running on the ONNX runtime.
rapidocr = [
    "docling==2.48.0",
    # Depend on the RapidOCR core package rather than rapidocr_onnxruntime:
    # the latter makes pip pull in the CPU-only 'onnxruntime' wheel.
    "rapidocr>=3.3.0",
    "onnxruntime-gpu==1.18.1",
]

# CUDA acceleration for Docling layout (optional).
cuda = [
    "torch==2.5.1",
    "torchvision==0.20.1",
]

# DeepSeek OCR backend (CUDA 12.1 build of vLLM). Torch is intentionally left
# unpinned in this extra: install the CUDA wheel from the PyTorch index instead
# (see docs: installing torch==2.5.1+cu121 via extra index URL).
# NOTE(review): vllm has no upper bound here -- confirm that the vllm release
# pip selects is compatible with the torch 2.5.1 build referenced above.
deepseek = [
    "vllm>=0.11.0",
    "transformers>=4.45,<5",
    "accelerate>=1.2.1,<2",
    "pymupdf==1.24.10",
    "Pillow==10.4.0",
]

# Documentation site tooling.
docs = [
    "mkdocs>=1.5",
    "mkdocs-material>=9.5",
]
[tool.setuptools]
include-package-data = true
# src-layout: the root package namespace maps to the src/ directory.
package-dir = { "" = "src" }

[tool.setuptools.packages.find]
# Discover only the glossapi package and its subpackages under src/.
where = ["src"]
include = ["glossapi", "glossapi.*"]

[tool.setuptools.package-data]
# Non-Python files under the package's models/ directory, matched recursively.
glossapi = ["models/**/*"]
[tool.pytest.ini_options]
# Custom markers, each named after the optional stack the test depends on.
markers = [
    "rapidocr: requires the RapidOCR/Docling execution stack",
    "deepseek: exercises the DeepSeek OCR pipeline",
]