McRogueFace/tools/audit_pymethoddef.py
John McCardle 626d5ae708 Add tools/audit_pymethoddef.py - MCRF_METHOD/MCRF_PROPERTY compliance auditor
Static-analysis tool for the pre-1.0 API freeze: walks src/**/*.cpp with
tree-sitter-cpp, locates every PyMethodDef and PyGetSetDef array initializer,
and classifies each entry's docstring slot as MACRO / RAW_STRING / NULL /
MISSING. The MACRO classification (MCRF_METHOD or MCRF_PROPERTY from
McRFPy_Doc.h) is the project's compliance target; raw string literals and
NULL docs predate the macro system and need migration before the 1.0 freeze.

Features:
- Per-file table (file:line, array, entry, classification) plus summary
  footer with totals, per-kind breakdown, % MACRO compliance, and a
  ranked list of the worst-offender files.
- --strict exits nonzero when any non-MACRO entries are present (CI use).
- --quiet suppresses per-file tables and prints only the summary.
- --paths PATH [PATH ...] limits the scan to specific files / directories.
- Sentinel terminator entries ({NULL}, {0}) are filtered out.
- Concatenated string literals and parenthesized expressions are handled.

Dependencies live in a project-local .venv-audit (added to .gitignore), so
the system Python is not polluted. tools/generate_all_docs.sh now invokes the
auditor at the end (informational, no --strict) so doc regeneration
surfaces compliance drift alongside the generated docs themselves.

Initial baseline on the current tree: 755 entries scanned, 340 MACRO
compliant (45.0%). Top offenders: UIEntity.cpp (51), 3d/PyVoxelGrid.cpp
(36), 3d/Viewport3D.cpp (31), UIGridPyProperties.cpp (30),
3d/Entity3D.cpp (28).

Refs pre-1.0 documentation freeze.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 12:30:25 -04:00

463 lines
16 KiB
Python
Executable file

#!/usr/bin/env python3
"""
audit_pymethoddef.py - Static-analysis tool for McRogueFace Python bindings.
Walks src/**/*.cpp, parses each file with tree-sitter-cpp, and locates every
`PyMethodDef <name>[] = {...}` and `PyGetSetDef <name>[] = {...}` declaration.
For each entry inside those array initializers, classifies the docstring slot:
MACRO - uses MCRF_METHOD(...) or MCRF_PROPERTY(...)
RAW_STRING - inline C string literal (or concatenated string literals)
NULL - explicit NULL literal
MISSING - entry too short to have a doc field (probably malformed)
The `MACRO` classification is the project's compliance target. RAW_STRING and
NULL entries should be migrated to the macro system before the 1.0 API freeze.
Sentinel terminator entries (e.g. `{NULL}`, `{0}`) are skipped.
Usage:
python3 tools/audit_pymethoddef.py [--strict] [--quiet]
[--paths PATH [PATH ...]]
Flags:
--strict Exit nonzero if any non-MACRO entries are found (CI mode).
--quiet Suppress per-file output, print only the summary.
--paths Restrict scan to the given files/directories. Defaults to src/.
"""
from __future__ import annotations
import argparse
import os
import sys
from collections import Counter, defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, List, Optional, Tuple
try:
import tree_sitter_cpp
from tree_sitter import Language, Parser
except ImportError as e:
sys.stderr.write(
"ERROR: tree-sitter / tree-sitter-cpp not installed.\n"
"Activate the audit venv first:\n"
" source .venv-audit/bin/activate\n"
f"(import error: {e})\n"
)
sys.exit(2)
# ---------------------------------------------------------------------------
# Tree-sitter setup
# ---------------------------------------------------------------------------
# C++ grammar and parser, built once at import time; scan_file() parses every
# source file with this shared parser.
_LANG = Language(tree_sitter_cpp.language())
_PARSER = Parser(_LANG)
# Indices of the docstring field in the struct initializer.
# PyMethodDef: {ml_name, ml_meth, ml_flags, ml_doc} -> idx 3
# PyGetSetDef: {name, get, set, doc, closure} -> idx 3
DOC_FIELD_INDEX = 3
# Keys are the struct type names the audit recognises (scan_file() tests
# membership in this mapping); values record the minimum positional field
# count an entry needs for its doc slot to exist.
EXPECTED_MIN_FIELDS = {
    "PyMethodDef": 4,  # need at least 4 to have ml_doc
    "PyGetSetDef": 4,  # need at least 4 to have doc (closure can be omitted)
}
# Punctuation/structural child node types we ignore when walking entry fields.
_PUNCT_TYPES = {"{", "}", ",", "(", ")", "[", "]", ";"}
# Macros that mark a docstring slot as compliant.
_COMPLIANT_MACROS = {"MCRF_METHOD", "MCRF_PROPERTY"}
# ---------------------------------------------------------------------------
# Data records
# ---------------------------------------------------------------------------
@dataclass
class EntryRecord:
    """One audited entry of a PyMethodDef / PyGetSetDef array initializer."""

    file_path: Path  # source file the entry was found in
    line: int  # 1-based line where the entry's initializer starts
    array_kind: str  # "PyMethodDef" or "PyGetSetDef"
    array_name: str  # e.g. "PyAnimation::methods"
    entry_name: str  # ml_name / name string, or "<unknown>"
    classification: str  # MACRO / RAW_STRING / NULL / MISSING
# ---------------------------------------------------------------------------
# Tree-sitter helpers
# ---------------------------------------------------------------------------
def _node_text(src: bytes, node) -> str:
return src[node.start_byte:node.end_byte].decode("utf-8", errors="replace")
def _meaningful_children(node) -> List:
"""Return children of an initializer_list, skipping punctuation tokens."""
return [c for c in node.children if c.type not in _PUNCT_TYPES]
def _classify_doc_field(src: bytes, doc_node) -> str:
"""Map a docstring AST node to a classification string."""
t = doc_node.type
if t == "null":
return "NULL"
if t in ("string_literal", "concatenated_string", "raw_string_literal"):
return "RAW_STRING"
if t == "call_expression":
# The first child of call_expression is the callee identifier.
callee = doc_node.child_by_field_name("function")
if callee is None and doc_node.children:
callee = doc_node.children[0]
if callee is not None:
name = _node_text(src, callee).strip()
if name in _COMPLIANT_MACROS:
return "MACRO"
return "RAW_STRING" # call expression to something non-MACRO
if t == "identifier":
# Bare identifier in the doc slot - could be a #define alias. Treat as
# raw (non-compliant) so the user investigates it.
text = _node_text(src, doc_node).strip()
if text == "NULL":
return "NULL"
return "RAW_STRING"
# Anything else (parenthesized_expression, etc.) - inspect text fallback.
text = _node_text(src, doc_node).strip()
stripped = text.lstrip("(").lstrip()
for macro in _COMPLIANT_MACROS:
if stripped.startswith(macro + "("):
return "MACRO"
if stripped == "NULL":
return "NULL"
return "RAW_STRING"
def _entry_name(src: bytes, entry_node) -> str:
    """Return the name string of an entry, taken from its first field.

    Only the first meaningful field is inspected. If it is a string literal
    (or a concatenation of them) the first `string_content` piece is
    returned; any other kind of first field yields "<unknown>".
    """
    fields = _meaningful_children(entry_node)
    if not fields:
        return "<unknown>"
    head = fields[0]

    def first_content(literal):
        # Text of the first string_content child, or None when there is none.
        return next(
            (
                _node_text(src, part)
                for part in literal.children
                if part.type == "string_content"
            ),
            None,
        )

    if head.type == "string_literal":
        content = first_content(head)
        if content is not None:
            return content
        # No content child (e.g. an empty literal): drop the quotes by hand.
        return _node_text(src, head).strip('"')

    if head.type == "concatenated_string":
        for piece in head.children:
            if piece.type != "string_literal":
                continue
            content = first_content(piece)
            if content is not None:
                return content
        return _node_text(src, head)

    # The name should come first; anything else means we cannot tell.
    return "<unknown>"
def _is_sentinel(src: bytes, entry_node) -> bool:
    """True if the entry looks like a sentinel terminator (e.g. {NULL} / {0}).

    An entry counts as a sentinel when it has no fields at all, or when every
    field is either a null literal or the number literal 0 - which also
    covers spellings such as {NULL, NULL, NULL, NULL}.
    """

    def null_like(field) -> bool:
        if field.type == "null":
            return True
        return (
            field.type == "number_literal"
            and _node_text(src, field).strip() == "0"
        )

    return all(null_like(field) for field in _meaningful_children(entry_node))
def _array_name_from_init_declarator(src: bytes, init_decl) -> Optional[str]:
"""Extract the array name from an init_declarator containing array_declarator."""
arr = None
for c in init_decl.children:
if c.type == "array_declarator":
arr = c
break
if arr is None:
return None
# The first child is the declarator (identifier or qualified_identifier).
for c in arr.children:
if c.type in ("identifier", "qualified_identifier", "field_identifier"):
return _node_text(src, c)
return None
def _outer_initializer_list(init_decl):
"""Get the top-level initializer_list child of an init_declarator, if any."""
for c in init_decl.children:
if c.type == "initializer_list":
return c
return None
# ---------------------------------------------------------------------------
# Per-file scan
# ---------------------------------------------------------------------------
def _walk_declarations(node, out: list) -> None:
"""Collect all `declaration` nodes under `node` (recursive)."""
if node.type == "declaration":
out.append(node)
for c in node.children:
_walk_declarations(c, out)
def scan_file(path: Path) -> List[EntryRecord]:
    """Parse one source file and return a record per audited array entry.

    Every declaration whose type is one of the audited struct names is
    inspected; each nested initializer list inside its array initializer is
    one entry. Sentinel entries are skipped. An unreadable file produces a
    warning on stderr and an empty result.
    """
    try:
        src = path.read_bytes()
    except OSError as e:
        sys.stderr.write(f"WARNING: cannot read {path}: {e}\n")
        return []

    tree = _PARSER.parse(src)
    declarations: list = []
    _walk_declarations(tree.root_node, declarations)

    found: List[EntryRecord] = []
    for decl in declarations:
        # The first type_identifier naming an audited struct decides the kind.
        kind = None
        for child in decl.children:
            if child.type != "type_identifier":
                continue
            candidate = _node_text(src, child).strip()
            if candidate in EXPECTED_MIN_FIELDS:
                kind = candidate
                break
        if kind is None:
            continue

        # A declaration may carry several init_declarators; handle each.
        for declarator in decl.children:
            if declarator.type != "init_declarator":
                continue
            body = _outer_initializer_list(declarator)
            if body is None:
                continue  # forward decl or extern - no initializer
            owner = _array_name_from_init_declarator(src, declarator) or "<anon>"

            # Each direct child initializer_list is an entry.
            for entry in body.children:
                if entry.type != "initializer_list" or _is_sentinel(src, entry):
                    continue
                fields = _meaningful_children(entry)
                if len(fields) > DOC_FIELD_INDEX:
                    verdict = _classify_doc_field(src, fields[DOC_FIELD_INDEX])
                else:
                    # Too few fields for a doc slot to exist at all.
                    verdict = "MISSING"
                found.append(EntryRecord(
                    file_path=path,
                    line=entry.start_point[0] + 1,  # tree-sitter is 0-based
                    array_kind=kind,
                    array_name=owner,
                    entry_name=_entry_name(src, entry),
                    classification=verdict,
                ))
    return found
# ---------------------------------------------------------------------------
# Path resolution
# ---------------------------------------------------------------------------
def _iter_cpp_files(roots: Iterable[Path]) -> Iterable[Path]:
for root in roots:
if root.is_file():
if root.suffix == ".cpp":
yield root
continue
if not root.exists():
sys.stderr.write(f"WARNING: path does not exist: {root}\n")
continue
for dirpath, _dirnames, filenames in os.walk(root):
for fn in filenames:
if fn.endswith(".cpp"):
yield Path(dirpath) / fn
# ---------------------------------------------------------------------------
# Reporting
# ---------------------------------------------------------------------------
def _print_file_table(records: List[EntryRecord], project_root: Path) -> None:
by_file: dict[Path, List[EntryRecord]] = defaultdict(list)
for r in records:
by_file[r.file_path].append(r)
for path in sorted(by_file):
try:
rel = path.relative_to(project_root)
except ValueError:
rel = path
entries = sorted(by_file[path], key=lambda r: r.line)
# Compute column widths for this file.
loc_w = max(len(f"{rel}:{e.line}") for e in entries)
arr_w = max(len(e.array_name) for e in entries)
ent_w = max(len(e.entry_name) for e in entries)
loc_w = max(loc_w, len("file:line"))
arr_w = max(arr_w, len("array"))
ent_w = max(ent_w, len("entry"))
header = (
f"{'file:line':<{loc_w}} "
f"{'array':<{arr_w}} "
f"{'entry':<{ent_w}} "
f"classification"
)
print(header)
print("-" * len(header))
for e in entries:
loc = f"{rel}:{e.line}"
print(
f"{loc:<{loc_w}} "
f"{e.array_name:<{arr_w}} "
f"{e.entry_name:<{ent_w}} "
f"{e.classification}"
)
print()
def _print_summary(records: List[EntryRecord]) -> None:
total = len(records)
counts = Counter(r.classification for r in records)
macro = counts.get("MACRO", 0)
raw = counts.get("RAW_STRING", 0)
null = counts.get("NULL", 0)
missing = counts.get("MISSING", 0)
pct = (macro / total * 100.0) if total else 0.0
print("=" * 60)
print("PyMethodDef / PyGetSetDef Documentation Audit Summary")
print("=" * 60)
print(f"Total entries scanned : {total}")
print(f" MACRO compliant : {macro}")
print(f" RAW_STRING : {raw}")
print(f" NULL : {null}")
print(f" MISSING : {missing}")
print(f"MACRO compliance : {pct:.1f}%")
# Per-kind breakdown.
by_kind = defaultdict(Counter)
for r in records:
by_kind[r.array_kind][r.classification] += 1
if by_kind:
print()
print("Breakdown by kind:")
for kind in sorted(by_kind):
kc = by_kind[kind]
kt = sum(kc.values())
kp = (kc.get("MACRO", 0) / kt * 100.0) if kt else 0.0
print(
f" {kind:<13} total={kt:<4} "
f"MACRO={kc.get('MACRO', 0):<4} "
f"RAW={kc.get('RAW_STRING', 0):<4} "
f"NULL={kc.get('NULL', 0):<4} "
f"MISSING={kc.get('MISSING', 0):<4} "
f"({kp:.1f}% compliant)"
)
# Top offenders.
offenders: dict[Path, int] = defaultdict(int)
for r in records:
if r.classification != "MACRO":
offenders[r.file_path] += 1
if offenders:
print()
print("Top non-compliant files:")
ranked = sorted(offenders.items(), key=lambda kv: kv[1], reverse=True)
for path, count in ranked[:10]:
try:
rel = path.relative_to(Path.cwd())
except ValueError:
rel = path
print(f" {count:>4} {rel}")
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
def _default_roots() -> List[Path]:
cwd = Path.cwd()
src = cwd / "src"
return [src] if src.exists() else [cwd]
def main(argv: Optional[List[str]] = None) -> int:
    """Run the audit from the command line and return the exit status.

    Returns 1 only when --strict is given and at least one scanned entry is
    not classified MACRO; otherwise 0, including when no .cpp files are
    found (a warning is written to stderr in that case).
    """
    cli = argparse.ArgumentParser(
        description=(
            "Audit PyMethodDef / PyGetSetDef entries in McRogueFace C++ "
            "sources for MCRF_METHOD / MCRF_PROPERTY documentation macro use."
        ),
    )
    cli.add_argument(
        "--strict",
        action="store_true",
        help="Exit nonzero if any non-MACRO entries are found (CI mode).",
    )
    cli.add_argument(
        "--quiet",
        action="store_true",
        help="Print only the summary, omit per-file tables.",
    )
    cli.add_argument(
        "--paths",
        nargs="+",
        type=Path,
        default=None,
        help="Files or directories to scan (default: ./src).",
    )
    opts = cli.parse_args(argv)

    roots = opts.paths or _default_roots()
    project_root = Path.cwd()

    # De-duplicate and sort so output order does not depend on walk order.
    files = sorted(set(_iter_cpp_files(roots)))
    if not files:
        sys.stderr.write("WARNING: no .cpp files found.\n")
        return 0

    all_records: List[EntryRecord] = [
        record for source in files for record in scan_file(source)
    ]

    if not opts.quiet:
        if all_records:
            _print_file_table(all_records, project_root)
        else:
            print("(no PyMethodDef / PyGetSetDef arrays found)")
            print()
    _print_summary(all_records)

    has_violation = any(r.classification != "MACRO" for r in all_records)
    return 1 if opts.strict and has_violation else 0
# When run as a script, use main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())