Add tools/audit_pymethoddef.py - MCRF_METHOD/MCRF_PROPERTY compliance auditor

Static-analysis tool for the pre-1.0 API freeze: walks src/**/*.cpp with tree-sitter-cpp, locates every PyMethodDef and PyGetSetDef array initializer, and classifies each entry's docstring slot as MACRO / RAW_STRING / NULL / MISSING. The MACRO classification (MCRF_METHOD or MCRF_PROPERTY from McRFPy_Doc.h) is the project's compliance target; raw string literals and NULL docs predate the macro system and need migration before the 1.0 freeze. Features: - Per-file table (file:line, array, entry, classification) plus summary footer with totals, per-kind breakdown, % MACRO compliance, and a ranked list of the worst-offender files. - --strict exits nonzero when any non-MACRO entries are present (CI use). - --quiet suppresses per-file tables and prints only the summary. - --paths PATH [PATH ...] limits the scan to specific files / directories. - Sentinel terminator entries ({NULL}, {0}) are filtered out. - Concatenated string literals and parenthesized expressions are handled. Dependencies live in a project-local .venv-audit (added to .gitignore) so no system Python pollution. tools/generate_all_docs.sh now invokes the auditor at the end (informational, no --strict) so doc regeneration surfaces compliance drift alongside the generated docs themselves. Initial baseline on the current tree: 755 entries scanned, 340 MACRO compliant (45.0%). Top offenders: UIEntity.cpp (51), 3d/PyVoxelGrid.cpp (36), 3d/Viewport3D.cpp (31), UIGridPyProperties.cpp (30), 3d/Entity3D.cpp (28). Refs pre-1.0 documentation freeze. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 12:30:25 -04:00 · 2026-04-18 12:30:25 -04:00 · 626d5ae708
commit 626d5ae708
parent 3030ac488b
3 changed files with 487 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -33,6 +33,9 @@ __lib_windows/
 build-windows/
 build_windows/
 _oldscripts/
+
+# Audit tooling virtualenv (tools/audit_pymethoddef.py)
+.venv-audit/
 assets/
 cellular_automata_fire/
 deps/
--- a/tools/audit_pymethoddef.py
+++ b/tools/audit_pymethoddef.py
@ -0,0 +1,463 @@
+#!/usr/bin/env python3
+"""
+audit_pymethoddef.py - Static-analysis tool for McRogueFace Python bindings.
+
+Walks src/**/*.cpp, parses each file with tree-sitter-cpp, and locates every
+`PyMethodDef <name>[] = {...}` and `PyGetSetDef <name>[] = {...}` declaration.
+For each entry inside those array initializers, classifies the docstring slot:
+
+    MACRO       - uses MCRF_METHOD(...) or MCRF_PROPERTY(...)
+    RAW_STRING  - inline C string literal (or concatenated string literals)
+    NULL        - explicit NULL literal
+    MISSING     - entry too short to have a doc field (probably malformed)
+
+The `MACRO` classification is the project's compliance target. RAW_STRING and
+NULL entries should be migrated to the macro system before the 1.0 API freeze.
+
+Sentinel terminator entries (e.g. `{NULL}`, `{0}`) are skipped.
+
+Usage:
+    python3 tools/audit_pymethoddef.py [--strict] [--quiet]
+                                       [--paths PATH [PATH ...]]
+
+Flags:
+    --strict   Exit nonzero if any non-MACRO entries are found (CI mode).
+    --quiet    Suppress per-file output, print only the summary.
+    --paths    Restrict scan to the given files/directories. Defaults to ./src,
+               or the current directory when ./src does not exist.
+"""
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+from collections import Counter, defaultdict
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Iterable, List, Optional, Tuple
+
+try:
+    import tree_sitter_cpp
+    from tree_sitter import Language, Parser
+except ImportError as e:
+    sys.stderr.write(
+        "ERROR: tree-sitter / tree-sitter-cpp not installed.\n"
+        "Activate the audit venv first:\n"
+        "    source .venv-audit/bin/activate\n"
+        f"(import error: {e})\n"
+    )
+    sys.exit(2)
+
+
+# ---------------------------------------------------------------------------
+# Tree-sitter setup
+# ---------------------------------------------------------------------------
+
+# C++ grammar and a single parser instance, created once at import time and
+# reused by scan_file() for every file.
+_LANG = Language(tree_sitter_cpp.language())
+_PARSER = Parser(_LANG)
+
+# Indices of the docstring field in the struct initializer.
+# PyMethodDef:  {ml_name, ml_meth, ml_flags, ml_doc}                  -> idx 3
+# PyGetSetDef:  {name, get, set, doc, closure}                        -> idx 3
+DOC_FIELD_INDEX = 3
+# The keys select which declared types are audited. NOTE: scan_file() only
+# tests key membership; the "entry too short" check compares the field count
+# against DOC_FIELD_INDEX, so the numeric values here are informational.
+EXPECTED_MIN_FIELDS = {
+    "PyMethodDef": 4,   # need at least 4 to have ml_doc
+    "PyGetSetDef": 4,   # need at least 4 to have doc (closure can be omitted)
+}
+
+# Punctuation/structural child node types we ignore when walking entry fields.
+_PUNCT_TYPES = {"{", "}", ",", "(", ")", "[", "]", ";"}
+
+# Macros that mark a docstring slot as compliant.
+_COMPLIANT_MACROS = {"MCRF_METHOD", "MCRF_PROPERTY"}
+
+
+# ---------------------------------------------------------------------------
+# Data records
+# ---------------------------------------------------------------------------
+
+@dataclass
+class EntryRecord:
+    """One audited entry of a PyMethodDef / PyGetSetDef array initializer.
+
+    scan_file() creates one record per non-sentinel entry; the reporting
+    functions group and count records by file, array kind and classification.
+    """
+    file_path: Path
+    line: int                # 1-based
+    array_kind: str          # "PyMethodDef" or "PyGetSetDef"
+    array_name: str          # e.g. "PyAnimation::methods"
+    entry_name: str          # ml_name / name string, or "<unknown>"
+    classification: str      # MACRO / RAW_STRING / NULL / MISSING
+
+
+# ---------------------------------------------------------------------------
+# Tree-sitter helpers
+# ---------------------------------------------------------------------------
+
+def _node_text(src: bytes, node) -> str:
+    """Return the source text covered by ``node``.
+
+    The byte range ``[node.start_byte, node.end_byte)`` of ``src`` is decoded
+    as UTF-8; undecodable bytes become U+FFFD instead of raising.
+    """
+    covered = src[node.start_byte:node.end_byte]
+    return covered.decode("utf-8", errors="replace")
+
+
+def _meaningful_children(node) -> List:
+    """Return the field-bearing children of an initializer_list.
+
+    Punctuation tokens are dropped. ``comment`` nodes are dropped as well:
+    tree-sitter attaches comments as ordinary children, so a comment inside
+    an entry (``{"name", fn, FLAGS, /* doc */ "..."}``) would otherwise be
+    counted as a field and shift the docstring slot index.
+    """
+    return [
+        c for c in node.children
+        if c.type not in _PUNCT_TYPES and c.type != "comment"
+    ]
+
+
+def _classify_doc_field(src: bytes, doc_node) -> str:
+    """Map a docstring AST node to a classification string.
+
+    Args:
+        src: Raw bytes of the file the node belongs to.
+        doc_node: The tree-sitter node found in the docstring slot.
+
+    Returns:
+        "MACRO" when the slot is a call to one of ``_COMPLIANT_MACROS``,
+        "NULL" for a null pointer (``NULL``, ``nullptr`` or a literal ``0``,
+        with or without surrounding parentheses), otherwise "RAW_STRING".
+    """
+    t = doc_node.type
+    if t in ("null", "nullptr"):
+        return "NULL"
+    if t == "number_literal" and _node_text(src, doc_node).strip() == "0":
+        # A literal 0 in a pointer slot is a null pointer, same as NULL.
+        return "NULL"
+    if t in ("string_literal", "concatenated_string", "raw_string_literal"):
+        return "RAW_STRING"
+    if t == "parenthesized_expression":
+        # Classify what is inside the parentheses rather than guessing from
+        # the text; only unwrap when exactly one operand is present so odd
+        # or partially parsed input still reaches the text fallback below.
+        inner = [
+            c for c in doc_node.children
+            if c.type not in ("(", ")", "comment")
+        ]
+        if len(inner) == 1:
+            return _classify_doc_field(src, inner[0])
+    if t == "call_expression":
+        # The first child of call_expression is the callee identifier.
+        callee = doc_node.child_by_field_name("function")
+        if callee is None and doc_node.children:
+            callee = doc_node.children[0]
+        if callee is not None:
+            name = _node_text(src, callee).strip()
+            if name in _COMPLIANT_MACROS:
+                return "MACRO"
+        return "RAW_STRING"  # call expression to something non-MACRO
+    if t == "identifier":
+        # Bare identifier in the doc slot - could be a #define alias. Treat as
+        # raw (non-compliant) so the user investigates it.
+        text = _node_text(src, doc_node).strip()
+        if text == "NULL":
+            return "NULL"
+        return "RAW_STRING"
+    # Anything else (error nodes from macro arguments the grammar cannot
+    # parse, casts, etc.) - fall back to inspecting the text.
+    text = _node_text(src, doc_node).strip()
+    stripped = text.lstrip("(").lstrip()
+    for macro in _COMPLIANT_MACROS:
+        if stripped.startswith(macro + "("):
+            return "MACRO"
+    # Strip the closing parentheses too, so "(NULL)" compares equal to NULL.
+    if stripped.rstrip(")").rstrip() == "NULL":
+        return "NULL"
+    return "RAW_STRING"
+
+
+def _entry_name(src: bytes, entry_node) -> str:
+    """Return the name string of an entry, or "<unknown>".
+
+    Only the first field of the entry is examined. For a string literal the
+    text of its ``string_content`` child is returned (or the literal with the
+    quotes stripped when it has none). For a concatenated string, the content
+    of the first piece that has any is returned, else the whole text.
+    """
+    def _content(literal):
+        # First string_content child of a string_literal, or None.
+        return next(
+            (sub for sub in literal.children if sub.type == "string_content"),
+            None,
+        )
+
+    fields = _meaningful_children(entry_node)
+    if not fields:
+        return "<unknown>"
+    first = fields[0]
+
+    if first.type == "string_literal":
+        content = _content(first)
+        if content is not None:
+            return _node_text(src, content)
+        return _node_text(src, first).strip('"')
+
+    if first.type == "concatenated_string":
+        for piece in first.children:
+            if piece.type != "string_literal":
+                continue
+            content = _content(piece)
+            if content is not None:
+                return _node_text(src, content)
+        return _node_text(src, first)
+
+    # The name is expected to come first; anything else is not a name.
+    return "<unknown>"
+
+
+def _is_sentinel(src: bytes, entry_node) -> bool:
+    """Tell whether an entry is a terminator such as {NULL}, {0} or {}.
+
+    An entry is a sentinel when every one of its fields is either a ``null``
+    node or the number literal ``0``; an entry with no fields qualifies too.
+    """
+    def _null_or_zero(field) -> bool:
+        if field.type == "null":
+            return True
+        return (
+            field.type == "number_literal"
+            and _node_text(src, field).strip() == "0"
+        )
+
+    # all() over an empty field list is True, which covers the bare {} case.
+    return all(_null_or_zero(f) for f in _meaningful_children(entry_node))
+
+
+def _array_name_from_init_declarator(src: bytes, init_decl) -> Optional[str]:
+    """Return the declared array name of an init_declarator, or None.
+
+    The name is the text of the first identifier-like child of the first
+    ``array_declarator`` child; None is returned when either is absent.
+    """
+    array_decl = next(
+        (c for c in init_decl.children if c.type == "array_declarator"), None
+    )
+    if array_decl is None:
+        return None
+    name_types = ("identifier", "qualified_identifier", "field_identifier")
+    name_node = next(
+        (c for c in array_decl.children if c.type in name_types), None
+    )
+    return None if name_node is None else _node_text(src, name_node)
+
+
+def _outer_initializer_list(init_decl):
+    """Return the first direct ``initializer_list`` child, or None if absent."""
+    return next(
+        (c for c in init_decl.children if c.type == "initializer_list"), None
+    )
+
+
+# ---------------------------------------------------------------------------
+# Per-file scan
+# ---------------------------------------------------------------------------
+
+def _walk_declarations(node, out: list) -> None:
+    """Append every ``declaration`` node at or below ``node`` to ``out``.
+
+    Nodes are appended in pre-order (a parent before its children, siblings
+    left to right). The walk uses an explicit stack rather than recursion so
+    deeply nested syntax trees cannot exhaust Python's recursion limit.
+    """
+    pending = [node]
+    while pending:
+        current = pending.pop()
+        if current.type == "declaration":
+            out.append(current)
+        # Push children reversed so the leftmost child is visited first.
+        pending.extend(reversed(current.children))
+
+
+def scan_file(path: Path) -> List[EntryRecord]:
+    """Parse one C++ file and return a record per non-sentinel array entry.
+
+    A file that cannot be read produces a warning on stderr and an empty
+    list. Only declarations whose first ``type_identifier`` child names a
+    key of ``EXPECTED_MIN_FIELDS`` are inspected. Entries with too few
+    fields to reach the docstring slot are recorded as "MISSING".
+    """
+    try:
+        src = path.read_bytes()
+    except OSError as e:
+        sys.stderr.write(f"WARNING: cannot read {path}: {e}\n")
+        return []
+
+    tree = _PARSER.parse(src)
+    declarations: list = []
+    _walk_declarations(tree.root_node, declarations)
+
+    found: List[EntryRecord] = []
+    for decl in declarations:
+        # Only the first type_identifier decides whether the declaration
+        # is one of the audited struct types.
+        first_type = next(
+            (c for c in decl.children if c.type == "type_identifier"), None
+        )
+        if first_type is None:
+            continue
+        kind = _node_text(src, first_type).strip()
+        if kind not in EXPECTED_MIN_FIELDS:
+            continue
+
+        for declarator in decl.children:
+            if declarator.type != "init_declarator":
+                continue
+            table = _outer_initializer_list(declarator)
+            if table is None:
+                continue  # declared without an initializer
+            array_name = (
+                _array_name_from_init_declarator(src, declarator) or "<anon>"
+            )
+
+            # Every nested initializer_list directly under the table is one
+            # entry; sentinel terminators are not reported.
+            for entry in table.children:
+                if entry.type != "initializer_list":
+                    continue
+                if _is_sentinel(src, entry):
+                    continue
+
+                fields = _meaningful_children(entry)
+                if len(fields) > DOC_FIELD_INDEX:
+                    verdict = _classify_doc_field(src, fields[DOC_FIELD_INDEX])
+                else:
+                    verdict = "MISSING"
+
+                found.append(EntryRecord(
+                    file_path=path,
+                    line=entry.start_point[0] + 1,  # tree-sitter is 0-based
+                    array_kind=kind,
+                    array_name=array_name,
+                    entry_name=_entry_name(src, entry),
+                    classification=verdict,
+                ))
+    return found
+
+
+# ---------------------------------------------------------------------------
+# Path resolution
+# ---------------------------------------------------------------------------
+
+def _iter_cpp_files(roots: Iterable[Path]) -> Iterable[Path]:
+    """Yield every ``.cpp`` file named by, or found beneath, ``roots``.
+
+    A root that is a file is yielded only when its suffix is ``.cpp``. A
+    root that does not exist is reported on stderr and skipped. Any other
+    root is walked recursively with ``os.walk``.
+    """
+    for root in roots:
+        if root.is_file():
+            if root.suffix == ".cpp":
+                yield root
+        elif not root.exists():
+            sys.stderr.write(f"WARNING: path does not exist: {root}\n")
+        else:
+            for dirpath, _subdirs, filenames in os.walk(root):
+                base = Path(dirpath)
+                yield from (
+                    base / fn for fn in filenames if fn.endswith(".cpp")
+                )
+
+
+# ---------------------------------------------------------------------------
+# Reporting
+# ---------------------------------------------------------------------------
+
+def _print_file_table(records: List[EntryRecord], project_root: Path) -> None:
+    """Print one aligned table per file, files in sorted path order.
+
+    Each table has a header, a dashed rule of the header's length, one row
+    per record ordered by line number, and a trailing blank line. Paths are
+    shown relative to ``project_root`` when possible, unchanged otherwise.
+    """
+    grouped: dict[Path, List[EntryRecord]] = defaultdict(list)
+    for rec in records:
+        grouped[rec.file_path].append(rec)
+
+    for file_path in sorted(grouped):
+        try:
+            shown = file_path.relative_to(project_root)
+        except ValueError:
+            shown = file_path
+
+        rows = [
+            (f"{shown}:{rec.line}", rec.array_name, rec.entry_name,
+             rec.classification)
+            for rec in sorted(grouped[file_path], key=lambda rec: rec.line)
+        ]
+
+        # Column widths are per file and never narrower than the labels.
+        loc_w = max(len("file:line"), *(len(row[0]) for row in rows))
+        arr_w = max(len("array"), *(len(row[1]) for row in rows))
+        ent_w = max(len("entry"), *(len(row[2]) for row in rows))
+
+        def _fmt(loc, arr, ent, cls):
+            return f"{loc:<{loc_w}}  {arr:<{arr_w}}  {ent:<{ent_w}}  {cls}"
+
+        header = _fmt("file:line", "array", "entry", "classification")
+        print(header)
+        print("-" * len(header))
+        for row in rows:
+            print(_fmt(*row))
+        print()
+
+
+def _print_summary(records: List[EntryRecord]) -> None:
+    """Print the audit footer: totals, per-kind breakdown, worst files.
+
+    The totals block is always printed. The per-kind breakdown appears only
+    when there are records, and the offender list (at most ten files, most
+    non-MACRO entries first) only when at least one entry is non-compliant.
+    Offender paths are shown relative to the current working directory when
+    possible.
+    """
+    total = len(records)
+    tally = Counter(r.classification for r in records)
+    macro = tally.get("MACRO", 0)
+    pct = (macro / total * 100.0) if total else 0.0
+
+    rule = "=" * 60
+    print(rule)
+    print("PyMethodDef / PyGetSetDef Documentation Audit Summary")
+    print(rule)
+    print(f"Total entries scanned : {total}")
+    print(f"  MACRO compliant     : {macro}")
+    print(f"  RAW_STRING          : {tally.get('RAW_STRING', 0)}")
+    print(f"  NULL                : {tally.get('NULL', 0)}")
+    print(f"  MISSING             : {tally.get('MISSING', 0)}")
+    print(f"MACRO compliance      : {pct:.1f}%")
+
+    # Per-kind breakdown.
+    per_kind = defaultdict(Counter)
+    for r in records:
+        per_kind[r.array_kind][r.classification] += 1
+    if per_kind:
+        print()
+        print("Breakdown by kind:")
+        for kind in sorted(per_kind):
+            kind_tally = per_kind[kind]
+            kind_total = sum(kind_tally.values())
+            kind_macro = kind_tally.get("MACRO", 0)
+            kind_pct = (kind_macro / kind_total * 100.0) if kind_total else 0.0
+            print(
+                f"  {kind:<13} total={kind_total:<4} "
+                f"MACRO={kind_macro:<4} "
+                f"RAW={kind_tally.get('RAW_STRING', 0):<4} "
+                f"NULL={kind_tally.get('NULL', 0):<4} "
+                f"MISSING={kind_tally.get('MISSING', 0):<4} "
+                f"({kind_pct:.1f}% compliant)"
+            )
+
+    # Top offenders: files ranked by their number of non-MACRO entries.
+    offenders: dict[Path, int] = defaultdict(int)
+    for r in records:
+        if r.classification != "MACRO":
+            offenders[r.file_path] += 1
+    if not offenders:
+        return
+
+    print()
+    print("Top non-compliant files:")
+    # The sort is stable, so files with equal counts keep first-seen order.
+    ranked = sorted(offenders.items(), key=lambda kv: kv[1], reverse=True)
+    for path, count in ranked[:10]:
+        try:
+            shown = path.relative_to(Path.cwd())
+        except ValueError:
+            shown = path
+        print(f"  {count:>4}  {shown}")
+
+
+# ---------------------------------------------------------------------------
+# Entry point
+# ---------------------------------------------------------------------------
+
+def _default_roots() -> List[Path]:
+    """Return ``[<cwd>/src]`` when that path exists, otherwise ``[<cwd>]``."""
+    here = Path.cwd()
+    candidate = here / "src"
+    if candidate.exists():
+        return [candidate]
+    return [here]
+
+
+def main(argv: Optional[List[str]] = None) -> int:
+    """Run the audit from the command line and return the exit status.
+
+    Args:
+        argv: Argument list to parse; None lets argparse read ``sys.argv``.
+
+    Returns:
+        1 when ``--strict`` is set and at least one entry is not MACRO,
+        otherwise 0. Finding no ``.cpp`` files also returns 0, after a
+        warning on stderr.
+    """
+    cli = argparse.ArgumentParser(
+        description=(
+            "Audit PyMethodDef / PyGetSetDef entries in McRogueFace C++ "
+            "sources for MCRF_METHOD / MCRF_PROPERTY documentation macro use."
+        ),
+    )
+    cli.add_argument(
+        "--strict",
+        action="store_true",
+        help="Exit nonzero if any non-MACRO entries are found (CI mode).",
+    )
+    cli.add_argument(
+        "--quiet",
+        action="store_true",
+        help="Print only the summary, omit per-file tables.",
+    )
+    cli.add_argument(
+        "--paths",
+        nargs="+",
+        type=Path,
+        default=None,
+        help="Files or directories to scan (default: ./src).",
+    )
+    opts = cli.parse_args(argv)
+
+    roots = opts.paths or _default_roots()
+    project_root = Path.cwd()
+
+    # Deduplicate and sort so the report order is deterministic.
+    files = sorted(set(_iter_cpp_files(roots)))
+    if not files:
+        sys.stderr.write("WARNING: no .cpp files found.\n")
+        return 0
+
+    all_records: List[EntryRecord] = [
+        record for cpp_file in files for record in scan_file(cpp_file)
+    ]
+
+    if not opts.quiet:
+        if all_records:
+            _print_file_table(all_records, project_root)
+        else:
+            print("(no PyMethodDef / PyGetSetDef arrays found)")
+            print()
+
+    _print_summary(all_records)
+
+    has_violation = any(r.classification != "MACRO" for r in all_records)
+    return 1 if (opts.strict and has_violation) else 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/tools/generate_all_docs.sh
+++ b/tools/generate_all_docs.sh
@ -26,3 +26,24 @@ echo "  HTML:     docs/api_reference_dynamic.html"
 echo "  Markdown: docs/API_REFERENCE_DYNAMIC.md"
 echo "  Man page: docs/mcrfpy.3"
 echo "  Stubs:    stubs/mcrfpy.pyi"
+
+# ---------------------------------------------------------------------------
+# Static-analysis audit: report MCRF_METHOD / MCRF_PROPERTY macro compliance
+# across every PyMethodDef / PyGetSetDef array in src/. This is informational
+# only (no --strict) so it cannot break the doc build, but the summary makes
+# pre-1.0 documentation drift visible alongside doc generation.
+#
+# Requires the .venv-audit virtual environment with tree-sitter +
+# tree-sitter-cpp installed. If the venv is absent the audit is skipped and a
+# hint on how to create it is printed, so contributors without the venv
+# aren't blocked. Nothing is printed if the audit script itself is missing.
+# ---------------------------------------------------------------------------
+if [ -x "./.venv-audit/bin/python3" ] && [ -f "./tools/audit_pymethoddef.py" ]; then
+    echo ""
+    echo "=== PyMethodDef / PyGetSetDef Macro Compliance Audit ==="
+    # "|| true" keeps a failing audit from failing the doc build.
+    ./.venv-audit/bin/python3 ./tools/audit_pymethoddef.py --quiet || true
+elif [ -f "./tools/audit_pymethoddef.py" ]; then
+    echo ""
+    echo "(skipping audit_pymethoddef.py: .venv-audit not found - run"
+    echo " 'python3 -m venv .venv-audit && .venv-audit/bin/pip install"
+    echo " tree-sitter tree-sitter-cpp' to enable)"
+fi