fix: escape HTML in descriptions before link transformation

Fixes an HTML injection vulnerability in generate_dynamic_docs.py: description text was inserted into the HTML output without being HTML-escaped, so special characters such as <, >, and & could be interpreted as markup.

Changes:
- Modified transform_doc_links() to escape all non-link text when format='html' or format='web'
- Link text and hrefs are also properly escaped
- Non-HTML formats (markdown, python) remain unchanged
- Added proper handling for descriptions with mixed plain text and links

The fix splits each docstring into link and non-link segments, escapes the non-link segments, and escapes the link text and href inside each link match.

Tested with a comprehensive test suite covering:
- Basic HTML special characters
- Special chars with links
- Special chars in link text
- Multiple links with special chars

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-30 11:48:09 -04:00 · 2025-10-30 11:48:09 -04:00 · 7f253da581
commit 7f253da581
parent fac6a9a457
2 changed files with 75 additions and 25 deletions
--- a/tools/generate_dynamic_docs.py
+++ b/tools/generate_dynamic_docs.py
@ -17,6 +17,8 @@ def transform_doc_links(docstring, format='html', base_url=''):

    Detects pattern: "See also: TEXT (docs/path.md)"
    Transforms to appropriate format for output type.
+
+    For HTML/web formats, properly escapes content before inserting HTML tags.
    """
    if not docstring:
        return docstring
@ -27,14 +29,17 @@ def transform_doc_links(docstring, format='html', base_url=''):
        text, ref = match.group(1).strip(), match.group(2).strip()

        if format == 'html':
-            # Convert docs/foo.md → foo.html
-            href = ref.replace('docs/', '').replace('.md', '.html')
-            return f'<p class="see-also">See also: <a href="{href}">{text}</a></p>'
+            # Convert docs/foo.md → foo.html and escape for safe HTML
+            href = html.escape(ref.replace('docs/', '').replace('.md', '.html'), quote=True)
+            text_escaped = html.escape(text)
+            return f'<p class="see-also">See also: <a href="{href}">{text_escaped}</a></p>'

        elif format == 'web':
-            # Link to hosted docs
+            # Link to hosted docs and escape for safe HTML
            web_path = ref.replace('docs/', '').replace('.md', '')
-            return f'<p class="see-also">See also: <a href="{base_url}/{web_path}">{text}</a></p>'
+            href = html.escape(f"{base_url}/{web_path}", quote=True)
+            text_escaped = html.escape(text)
+            return f'<p class="see-also">See also: <a href="{href}">{text_escaped}</a></p>'

        elif format == 'markdown':
            # Markdown link
@ -44,7 +49,29 @@ def transform_doc_links(docstring, format='html', base_url=''):
            # Keep as plain text for Python docstrings
            return match.group(0)

-    return re.sub(link_pattern, replace_link, docstring)
+    # For HTML formats, escape only the text around link matches; replace_link escapes the link text and href itself
+    if format in ('html', 'web'):
+        # Split by the link pattern, escape non-link parts, then reassemble
+        parts = []
+        last_end = 0
+
+        for match in re.finditer(link_pattern, docstring):
+            # Escape the text before this match
+            if match.start() > last_end:
+                parts.append(html.escape(docstring[last_end:match.start()]))
+
+            # Process the link (replace_link handles escaping internally)
+            parts.append(replace_link(match))
+            last_end = match.end()
+
+        # Escape any remaining text after the last match
+        if last_end < len(docstring):
+            parts.append(html.escape(docstring[last_end:]))
+
+        return ''.join(parts)
+    else:
+        # For non-HTML formats, just do simple replacement
+        return re.sub(link_pattern, replace_link, docstring)

 # Must be run with McRogueFace as interpreter
 try:
@ -339,8 +366,9 @@ def generate_html_docs():
        <div class="method-section">
            <h3><code class="function-signature">{func_name}{parsed['signature'] if parsed['signature'] else '(...)'}</code></h3>
 """
-        description = transform_doc_links(parsed['description'], format='html')
-        html_content += f"            <p>{description}</p>\n"
+        if parsed['description']:
+            description = transform_doc_links(parsed['description'], format='html')
+            html_content += f"            <p>{description}</p>\n"
        
        if parsed['args']:
            html_content += "            <h4>Arguments:</h4>\n            <ul>\n"