diff --git a/codegen/core/src/main/java/software/amazon/smithy/python/codegen/writer/MarkdownConverter.java b/codegen/core/src/main/java/software/amazon/smithy/python/codegen/writer/MarkdownConverter.java index 30e2e003b..bdab88b4f 100644 --- a/codegen/core/src/main/java/software/amazon/smithy/python/codegen/writer/MarkdownConverter.java +++ b/codegen/core/src/main/java/software/amazon/smithy/python/codegen/writer/MarkdownConverter.java @@ -168,9 +168,9 @@ private static String postProcessPandocOutput(String output) { // Remove empty lines at the start and end output = output.trim(); - // Remove unnecessary backslash escapes that pandoc adds for markdown - // These characters don't need escaping in Python docstrings - output = output.replaceAll("\\\\([\\[\\]'{}()<>`@_*|!~$#^])", "$1"); + // Fix up the backslash escapes pandoc adds for Markdown so the result is + // a valid Python string literal (see normalizeBackslashEscapes). + output = normalizeBackslashEscapes(output); // Replace and tags with admonitions for mkdocstrings output = replaceAdmonitionTags(output, "note", "Note"); @@ -180,6 +180,45 @@ private static String postProcessPandocOutput(String output) { return output.replace("$", "$$"); } + // A lone backslash before one of these is dropped: pandoc adds it for Markdown + // but it needs no escaping in a Python docstring. + private static final String MARKDOWN_ONLY_CHARS = "[](){}<>`@_*|!~$#^'."; + // A lone backslash before one of these is kept: together they form a valid + // Python escape. (' is intentionally left out; it lives in MARKDOWN_ONLY_CHARS.) + private static final String VALID_ESCAPE_CHARS = "\\\"abfnrtv01234567xNuU\r\n"; + private static final Pattern BACKSLASH_RUN = Pattern.compile("(\\\\+)([^\\\\]|$)", Pattern.DOTALL); + + /** + * Fixes pandoc's backslash escaping so the docstring is a valid Python literal. + * + *

Backslashes must come in pairs, but pandoc can leave an odd-length run + * (it escapes literal backslashes and Markdown characters separately, and + * adjacent escapes pile up). For each run we keep the pairs; a leftover + * backslash is then kept if it forms a valid escape, dropped if it is only + * there for Markdown, or doubled otherwise. + */ + private static String normalizeBackslashEscapes(String output) { + Matcher m = BACKSLASH_RUN.matcher(output); + StringBuilder sb = new StringBuilder(); + while (m.find()) { + int runLength = m.group(1).length(); + String next = m.group(2); + int backslashes = (runLength / 2) * 2; + if (runLength % 2 != 0) { + char c = next.isEmpty() ? '\0' : next.charAt(0); + if (!next.isEmpty() && VALID_ESCAPE_CHARS.indexOf(c) >= 0) { + backslashes += 1; + } else if (next.isEmpty() || MARKDOWN_ONLY_CHARS.indexOf(c) < 0) { + backslashes += 2; + } + // else: Markdown-only char, drop the spurious backslash + } + m.appendReplacement(sb, Matcher.quoteReplacement("\\".repeat(backslashes) + next)); + } + m.appendTail(sb); + return sb.toString(); + } + /** * Replaces admonition tags (e.g. note, important) with Google-style format. * diff --git a/codegen/core/src/test/java/software/amazon/smithy/python/codegen/writer/MarkdownConverterTest.java b/codegen/core/src/test/java/software/amazon/smithy/python/codegen/writer/MarkdownConverterTest.java index 028ec1aaf..d425f727a 100644 --- a/codegen/core/src/test/java/software/amazon/smithy/python/codegen/writer/MarkdownConverterTest.java +++ b/codegen/core/src/test/java/software/amazon/smithy/python/codegen/writer/MarkdownConverterTest.java @@ -128,6 +128,54 @@ public void testConvertRemovesUnnecessaryBackslashEscapes() { assertEquals("Text with [brackets] and {braces} and (parens)", result.trim()); } + @Test + public void testConvertPreservesLiteralDoubleBackslash() { + // A literal "\\" is a valid Python escape (e.g. a password charset like + // "[\\]") and must be preserved. + String html = "@[\\\\]^"; + String result = MarkdownConverter.convert(html, createMockContext(true)).trim(); + assertEquals("`@[\\\\]^`", result); + } + + @Test + public void testConvertEscapesLoneBackslash() { + // A lone backslash that is not part of a recognized Python escape (here a + // backslash followed by a space) must be doubled so the docstring is a + // valid Python string. + String html = "[ \\ ]"; + String result = MarkdownConverter.convert(html, createMockContext(true)).trim(); + assertEquals("`[ \\\\ ]`", result); + } + + @Test + public void testConvertPreservesValidPythonEscapes() { + // A backslash that already forms a valid Python escape (e.g. \") must be + // left untouched rather than doubled. + String html = "!\\\"#"; + String result = MarkdownConverter.convert(html, createMockContext(true)).trim(); + assertEquals("`!\\\"#`", result); + } + + @Test + public void testConvertHandlesBackslashBeforeMarkdownChar() { + // A backslash followed by a Markdown-significant character (here "*") makes + // pandoc emit an odd-length backslash run. The spurious escape must be + // dropped so the result is valid Python. + String html = "arn:aws:iam::\\*:user/\\*"; + String result = MarkdownConverter.convert(html, createMockContext(true)).trim(); + assertEquals("`arn:aws:iam::*:user/*`", result); + } + + @Test + public void testConvertHandlesBackslashStarNextToQuote() { + // The API Gateway model documents the comment marker "\*/". Next to a quote, + // pandoc emits an odd-length backslash run; the result must stay a valid + // Python literal. + String html = "

Do not include \"\\*/\" characters

"; + String result = MarkdownConverter.convert(html, createMockContext(true)).trim(); + assertEquals("Do not include \\\"\\\\*/\\\" characters", result); + } + @Test public void testConvertMixedElements() { String html = "

Title

Paragraph

";