Skip to content

Commit 87bb56a

Browse files
CLN: migrate pandas-specific docstring validation into numpydoc via Sphinx build
Made-with: Cursor
1 parent 1f2248c commit 87bb56a

7 files changed

Lines changed: 294 additions & 975 deletions

File tree

.github/workflows/code-checks.yml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -62,10 +62,6 @@ jobs:
6262
run: ci/code_checks.sh code
6363
if: ${{ steps.build.outcome == 'success' && always() }}
6464

65-
- name: Run docstring validation
66-
run: ci/code_checks.sh docstrings
67-
if: ${{ steps.build.outcome == 'success' && always() }}
68-
6965
- name: Run check of documentation notebooks
7066
run: ci/code_checks.sh notebooks
7167
if: ${{ steps.build.outcome == 'success' && always() }}

ci/code_checks.sh

Lines changed: 7 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,16 @@
33
# Run checks related to code quality.
44
#
55
# This script is intended for both the CI and to check locally that code standards are
6-
# respected. We run doctests here (currently some files only), and we
7-
# validate formatting error in docstrings.
6+
# respected. We run doctests here (currently some files only).
7+
#
8+
# Docstring validation (both standard numpydoc and pandas-specific checks GL04,
9+
# PD01, SA05, EX04) is enforced via the numpydoc extension during the Sphinx build
10+
# (see doc/source/conf.py).
811
#
912
# Usage:
1013
# $ ./ci/code_checks.sh # run all checks
1114
# $ ./ci/code_checks.sh code # checks on imported code
1215
# $ ./ci/code_checks.sh doctests # run doctests
13-
# $ ./ci/code_checks.sh docstrings # validate docstring errors
1416
# $ ./ci/code_checks.sh single-docs # check single-page docs build warning-free
1517
# $ ./ci/code_checks.sh notebooks # check execution of documentation notebooks
1618

@@ -23,10 +25,9 @@ else
2325
CHECK=""
2426
fi
2527

26-
[[ -z "$CHECK" || "$CHECK" == "code" || "$CHECK" == "doctests" || "$CHECK" == "docstrings" || "$CHECK" == "single-docs" || "$CHECK" == "notebooks" ]] || \
27-
{ echo "Unknown command $1. Usage: $0 [code|doctests|docstrings|single-docs|notebooks]"; exit 1; }
28+
[[ -z "$CHECK" || "$CHECK" == "code" || "$CHECK" == "doctests" || "$CHECK" == "single-docs" || "$CHECK" == "notebooks" ]] || \
29+
{ echo "Unknown command $1. Usage: $0 [code|doctests|single-docs|notebooks]"; exit 1; }
2830

29-
BASE_DIR="$(dirname "$0")/.."
3031
RET=0
3132

3233
### CODE ###
@@ -63,17 +64,6 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
6364

6465
fi
6566

66-
### DOCSTRINGS ###
67-
if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
68-
69-
MSG='Validate Docstrings' ; echo "$MSG"
70-
python "$BASE_DIR"/scripts/validate_docstrings.py \
71-
--format=actions
72-
73-
RET=$(($RET + $?)) ; echo $MSG "DONE"
74-
75-
fi
76-
7767
### DOCUMENTATION NOTEBOOKS ###
7868
if [[ -z "$CHECK" || "$CHECK" == "notebooks" ]]; then
7969

doc/source/conf.py

Lines changed: 272 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
# All configuration values have a default; values that are commented out
1111
# serve to show the default.
1212
from datetime import datetime
13+
import doctest as _doctest
1314
import importlib
1415
import inspect
1516
import logging
@@ -20,10 +21,87 @@
2021

2122
import jinja2
2223
from numpydoc.docscrape import NumpyDocString
24+
import numpydoc.validate as _numpydoc_validate
2325
from sphinx.ext.autosummary import _import_by_name
2426

2527
logger = logging.getLogger(__name__)
2628

29+
# ---------------------------------------------------------------------------
30+
# Monkey-patch numpydoc validation to include pandas-specific checks.
31+
#
32+
# numpydoc has no plugin system, so we inject four custom error codes into its
33+
# ERROR_MSGS dict and wrap the ``validate`` function. Because conf.py executes
34+
# before Sphinx loads extensions, the later
35+
# ``from .validate import validate`` inside numpydoc.numpydoc picks up the
36+
# patched version automatically.
37+
# ---------------------------------------------------------------------------
38+
# Class names that trigger GL04 when they appear in a validated docstring.
_PANDAS_PRIVATE_CLASSES = ["NDFrame", "IndexOpsMixin"]

# Register the message templates of the four pandas-only codes alongside
# numpydoc's own entries. ``_pandas_validate`` passes these codes (and the
# placeholder values named in braces) to ``_numpydoc_validate.error``.
_numpydoc_validate.ERROR_MSGS.update(
    GL04=(
        "Private classes ({mentioned_private_classes}) should not be "
        "mentioned in public docstrings"
    ),
    PD01="Use 'array-like' rather than 'array_like' in docstrings.",
    SA05=(
        "{reference_name} in `See Also` section does not need `pandas` "
        "prefix, use {right_reference} instead."
    ),
    EX04=(
        "Do not import {imported_library}, as it is imported "
        "automatically for the examples (numpy as np, pandas as pd)"
    ),
)

# Keep a handle on the unpatched validator so the wrapper can delegate to it.
_original_numpydoc_validate = _numpydoc_validate.validate
53+
54+
55+
def _pandas_validate(obj_name, validator_cls=None, **validator_kwargs):
    """
    Validate a docstring with numpydoc, then add the pandas-only checks.

    Parameters
    ----------
    obj_name
        Forwarded unchanged to the original numpydoc ``validate``.
    validator_cls : optional
        Forwarded unchanged to the original numpydoc ``validate``.
    **validator_kwargs
        Forwarded unchanged to the original numpydoc ``validate``.

    Returns
    -------
    mapping
        The report returned by the original validator. When its
        ``"docstring"`` entry is non-empty, any GL04, PD01, SA05 and EX04
        findings are appended in place to its ``"errors"`` list.
    """
    report = _original_numpydoc_validate(
        obj_name, validator_cls=validator_cls, **validator_kwargs
    )

    text = report.get("docstring", "")
    if text:
        findings = report["errors"]

        # GL04: a private class name occurs anywhere in the docstring text.
        private_hits = [name for name in _PANDAS_PRIVATE_CLASSES if name in text]
        if private_hits:
            findings.append(
                _numpydoc_validate.error(
                    "GL04", mentioned_private_classes=", ".join(private_hits)
                )
            )

        # PD01: the underscore spelling is used instead of "array-like".
        if "array_like" in text:
            findings.append(_numpydoc_validate.error("PD01"))

        # SA05: a "See Also" target carries a redundant "pandas." prefix.
        prefix = "pandas."
        for entry in NumpyDocString(text)["See Also"]:
            # entry[0] holds the (name, role) pairs of one See Also item.
            for target, _role in entry[0]:
                if not target.startswith(prefix):
                    continue
                findings.append(
                    _numpydoc_validate.error(
                        "SA05",
                        reference_name=target,
                        right_reference=target[len(prefix) :],
                    )
                )

        # EX04: the doctest examples import numpy or pandas themselves.
        parsed_examples = _doctest.DocTestParser().get_examples(text)
        example_code = "".join(example.source for example in parsed_examples)
        for library in ("numpy", "pandas"):
            if f"import {library}" in example_code:
                findings.append(
                    _numpydoc_validate.error("EX04", imported_library=library)
                )

    return report


# Install the wrapper in place of numpydoc's validator.
_numpydoc_validate.validate = _pandas_validate
104+
27105
# https://github.com/sphinx-doc/sphinx/pull/2325/files
28106
# Workaround for sphinx-build recursion limit overflow:
29107
# pickle.dump(doctree, f, pickle.HIGHEST_PROTOCOL)
@@ -116,7 +194,10 @@
116194
elif single_doc and rel_fname != pattern:
117195
if "\\" in rel_fname:
118196
rel_fname = rel_fname.replace("\\", "/")
119-
exclude_patterns.append(rel_fname)
197+
# Keep the autosummary stub needed by the single-doc build.
198+
# rel_fname has been normalised to forward slashes just above, so build the
# stub path with "/" too; os.path.join would use "\\" on Windows and the
# comparison below would never match there.
api_stub = f"reference/api/pandas.{pattern}.rst"
199+
if rel_fname != api_stub:
200+
exclude_patterns.append(rel_fname)
120201

121202
with open(os.path.join(source_path, "index.rst.template"), encoding="utf-8") as f:
122203
t = jinja2.Template(f.read())
@@ -134,6 +215,196 @@
134215
numpydoc_show_class_members = False
135216
numpydoc_show_inherited_class_members = False
136217
numpydoc_attributes_as_param_list = False
218+
numpydoc_validation_checks = {"all"}


def _exclude_members(owner, members):
    r"""
    Build the validation-exclusion patterns for members of one object.

    Parameters
    ----------
    owner : str
        Dotted path of the module, class or accessor that owns the members,
        e.g. ``"pandas.ExcelWriter"``.
    members : iterable of str
        Names directly below ``owner`` to exclude from validation.

    Returns
    -------
    set of str
        One regular expression per member, with the dots of the dotted path
        escaped and a trailing ``$`` anchor, e.g.
        ``pandas\.ExcelWriter\.book$``.
    """
    escaped_owner = owner.replace(".", r"\.")
    return {rf"{escaped_owner}\.{member}$" for member in members}


# Offset classes whose ``base`` property and ``rollback``/``rollforward``
# methods are excluded. The patterns are generated as a cross-product so the
# three per-member lists cannot drift apart.
_EXCLUDED_OFFSET_CLASSES = (
    "DateOffset",
    "BusinessDay",
    "BusinessHour",
    "CustomBusinessDay",
    "CustomBusinessHour",
    "MonthEnd",
    "MonthBegin",
    "BusinessMonthEnd",
    "BusinessMonthBegin",
    "CustomBusinessMonthEnd",
    "CustomBusinessMonthBegin",
    "SemiMonthEnd",
    "SemiMonthBegin",
    "Week",
    "WeekOfMonth",
    "LastWeekOfMonth",
    "BQuarterEnd",
    "BQuarterBegin",
    "QuarterEnd",
    "QuarterBegin",
    "BHalfYearEnd",
    "BHalfYearBegin",
    "HalfYearEnd",
    "HalfYearBegin",
    "BYearEnd",
    "BYearBegin",
    "YearEnd",
    "YearBegin",
    "FY5253",
    "FY5253Quarter",
    "Easter",
    "Tick",
    "Day",
    "Hour",
    "Minute",
    "Second",
    "Milli",
    "Micro",
    "Nano",
)
_EXCLUDED_OFFSET_MEMBERS = ("base", "rollback", "rollforward")

numpydoc_validation_exclude = set().union(
    # Jinja2 Styler template attributes (docstrings not owned by pandas)
    _exclude_members(
        "pandas.io.formats.style.Styler",
        (
            "env",
            "template_html",
            "template_html_style",
            "template_html_table",
            "template_latex",
            "template_typst",
            "template_string",
            "loader",
        ),
    ),
    # Error/warning classes with no numpydoc-style docstrings
    _exclude_members(
        "pandas.errors",
        (
            "InvalidComparison",
            "LossySetitemError",
            "NoBufferPresent",
            "IncompatibilityWarning",
            "PyperclipException",
            "PyperclipWindowsException",
        ),
    ),
    # Offset .base properties and rollback / rollforward methods
    *(
        _exclude_members(
            f"pandas.tseries.offsets.{offset}", _EXCLUDED_OFFSET_MEMBERS
        )
        for offset in _EXCLUDED_OFFSET_CLASSES
    ),
    # Offset next_bday (BusinessHour and CustomBusinessHour only)
    *(
        _exclude_members(f"pandas.tseries.offsets.{offset}", ("next_bday",))
        for offset in ("BusinessHour", "CustomBusinessHour")
    ),
    # Easter.method
    _exclude_members("pandas.tseries.offsets.Easter", ("method",)),
    # CustomBusinessMonth helper methods
    *(
        _exclude_members(
            f"pandas.tseries.offsets.{offset}", ("cbday_roll", "month_roll")
        )
        for offset in ("CustomBusinessMonthEnd", "CustomBusinessMonthBegin")
    ),
    # ExtensionDtype base class stubs
    _exclude_members(
        "pandas.api.extensions.ExtensionDtype",
        (
            "construct_array_type",
            "construct_from_string",
            "empty",
            "index_class",
            "is_dtype",
            "kind",
            "na_value",
            "name",
            "names",
            "type",
        ),
    ),
    # Window indexer get_window_bounds
    *(
        _exclude_members(f"pandas.api.indexers.{indexer}", ("get_window_bounds",))
        for indexer in (
            "BaseIndexer",
            "FixedForwardWindowIndexer",
            "VariableOffsetWindowIndexer",
        )
    ),
    # ExcelWriter properties and methods
    _exclude_members(
        "pandas.ExcelWriter",
        (
            "book",
            "check_extension",
            "close",
            "date_format",
            "datetime_format",
            "engine",
            "if_sheet_exists",
            "sheets",
            "supported_extensions",
        ),
    ),
    # ExcelFile
    _exclude_members("pandas.ExcelFile", ("close",)),
    # plot.__call__ (PlotAccessor)
    *(
        _exclude_members(f"pandas.{owner}.plot", ("__call__",))
        for owner in ("DataFrame", "Series")
    ),
    # Index attributes and methods processed by autodoc but not in api.rst
    _exclude_members(
        "pandas.Index",
        (
            "nlevels",
            "diff",
            "groupby",
            "round",
            "sortlevel",
            "to_flat_index",
            "transpose",
        ),
    ),
    # Series attributes processed by autodoc but not in api.rst
    _exclude_members("pandas.Series", ("axes", "transpose")),
)
137408

138409
# matplotlib plot directive
139410
plot_include_source = True

doc/source/development/contributing_codebase.rst

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,12 @@ Thus, good style is a requirement for submitting code to pandas.
2121
There are a couple of tools in pandas to help contributors verify their changes
2222
before contributing to the project
2323

24-
- ``./ci/code_checks.sh``: a script validates the doctests, formatting in docstrings,
25-
and imported modules. It is possible to run the checks independently by using the
26-
parameters ``docstrings``, ``code``, and ``doctests``
27-
(e.g. ``./ci/code_checks.sh doctests``);
24+
- ``./ci/code_checks.sh``: a script that validates the doctests and imported
25+
modules. It is possible to run the checks independently by using the parameters
26+
``code`` and ``doctests`` (e.g. ``./ci/code_checks.sh doctests``). Docstring
27+
validation (standard numpydoc checks and the pandas-specific checks GL04, PD01,
28+
SA05 and EX04) is enforced during the Sphinx build via
29+
``numpydoc_validation_checks`` in ``doc/source/conf.py``;
2830
- ``pre-commit``, which we go into detail on in the next section.
2931

3032
In addition, because a lot of people use our library, it is important that we

0 commit comments

Comments
 (0)