diff --git a/Doc/library/pyexpat.rst b/Doc/library/pyexpat.rst
index 2e6938b5cf6860..c88411ce0b7b91 100644
--- a/Doc/library/pyexpat.rst
+++ b/Doc/library/pyexpat.rst
@@ -63,12 +63,26 @@ The :mod:`!xml.parsers.expat` module contains two functions:
.. function:: ParserCreate(encoding=None, namespace_separator=None)
- Creates and returns a new :class:`xmlparser` object. *encoding*, if specified,
- must be a string naming the encoding used by the XML data. Expat doesn't
- support as many encodings as Python does, and its repertoire of encodings can't
- be extended; it supports UTF-8, UTF-16, ISO-8859-1 (Latin1), and ASCII. If
- *encoding* [1]_ is given it will override the implicit or explicit encoding of the
- document.
+ Creates and returns a new :class:`xmlparser` object.
+ *encoding* [1]_, if specified, must be a string naming the encoding
+ used by the XML data.
+ If it is given it will override the implicit or explicit encoding
+ of the document.
+
+ .. impl-detail::
+
+ Expat natively understands and processes UTF-8, UTF-16, UTF-16BE,
+ UTF-16LE, ISO-8859-1, and US-ASCII.
+ For other encodings (including aliases like Latin1 and ASCII) it
+ falls back to Python.
+ It supports most 8-bit encodings and many multi-byte encodings
+ like Shift_JIS, although only BMP characters (``U+0000-U+FFFF``)
+ are supported with non-native encodings (this restriction is also
+ applied to aliases like UTF8).
+ These restrictions only apply if *encoding* is not given.
+
+ .. versionchanged:: next
+ Added support for multi-byte encodings.
.. _xmlparser-non-root:
@@ -113,7 +127,6 @@ The :mod:`!xml.parsers.expat` module contains two functions:
XML document. Call ``ParserCreate`` for each document to provide unique
parser instances.
-
.. seealso::
 `The Expat XML Parser <http://www.libexpat.org>`_
@@ -1083,9 +1096,11 @@ The ``errors`` module has the following attributes:
.. rubric:: Footnotes
-.. [1] The encoding string included in XML output should conform to the
- appropriate standards. For example, "UTF-8" is valid, but "UTF8" is
- not. See https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EncodingDecl
+.. [1] The encoding string included in XML output should conform to
+ the appropriate standards. For example, "UTF-8" is valid, but
+ "UTF8" is not valid in an XML document's declaration, even though
+ Python accepts it as an encoding name.
+ See https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EncodingDecl
and https://www.iana.org/assignments/character-sets/character-sets.xhtml.
diff --git a/Doc/whatsnew/3.16.rst b/Doc/whatsnew/3.16.rst
index 8c06628a082e76..0fa6038ca79f0d 100644
--- a/Doc/whatsnew/3.16.rst
+++ b/Doc/whatsnew/3.16.rst
@@ -86,10 +86,20 @@ New modules
Improved modules
================
-module_name
------------
+xml
+---
-* TODO
+* Add support for multiple multi-byte encodings in the :mod:`XML parser
+ <xml.parsers.expat>`: "cp932", "cp949", "cp950", "Big5", "EUC-JP",
+ "GB2312", "GBK", "johab", and "Shift_JIS".
+ Add partial support (only BMP characters) for multi-byte encodings
+ "Big5-HKSCS", "EUC_JIS-2004", "EUC_JISX0213", "Shift_JIS-2004",
+ "Shift_JISX0213", "utf-8-sig" and non-standard aliases like "UTF8"
+ (without hyphen).
+ The parser now raises :exc:`ValueError` for known unsupported
+ multi-byte encodings such as "ISO-2022-JP" or "raw-unicode-escape"
+ instead of failing later, when encountering non-ASCII data.
+ (Contributed by Serhiy Storchaka in :gh:`62259`.)
.. Add improved modules above alphabetically, not here at the end.
diff --git a/Include/internal/pycore_codecs.h b/Include/internal/pycore_codecs.h
index 52dca1362592d6..bfa10eadf73573 100644
--- a/Include/internal/pycore_codecs.h
+++ b/Include/internal/pycore_codecs.h
@@ -45,7 +45,7 @@ extern int _PyCodec_UnregisterError(const char *name);
in Python 3.5+?
*/
-extern PyObject* _PyCodec_LookupTextEncoding(
+PyAPI_FUNC(PyObject*) _PyCodec_LookupTextEncoding(
const char *encoding,
const char *alternate_command);
diff --git a/Lib/codecs.py b/Lib/codecs.py
index e4a8010aba90a5..af6ab031157e79 100644
--- a/Lib/codecs.py
+++ b/Lib/codecs.py
@@ -93,7 +93,7 @@ class CodecInfo(tuple):
def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
incrementalencoder=None, incrementaldecoder=None, name=None,
- *, _is_text_encoding=None):
+ *, _is_text_encoding=None, _expat_decoding_table=None):
self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
self.name = name
self.encode = encode
@@ -104,6 +104,8 @@ def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
self.streamreader = streamreader
if _is_text_encoding is not None:
self._is_text_encoding = _is_text_encoding
+ if _expat_decoding_table is not None:
+ self._expat_decoding_table = _expat_decoding_table
return self
def __repr__(self):
diff --git a/Lib/encodings/big5.py b/Lib/encodings/big5.py
index 7adeb0e1605274..4f749507d78b8b 100644
--- a/Lib/encodings/big5.py
+++ b/Lib/encodings/big5.py
@@ -36,4 +36,13 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=(*range(128),
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -1, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1),
)
diff --git a/Lib/encodings/big5hkscs.py b/Lib/encodings/big5hkscs.py
index 350df37baaedaf..a88caa5e1404c0 100644
--- a/Lib/encodings/big5hkscs.py
+++ b/Lib/encodings/big5hkscs.py
@@ -36,4 +36,13 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=(*range(128),
+ -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1),
)
diff --git a/Lib/encodings/cp932.py b/Lib/encodings/cp932.py
index e01f59b7190576..86e6ffe3b16c4f 100644
--- a/Lib/encodings/cp932.py
+++ b/Lib/encodings/cp932.py
@@ -36,4 +36,18 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=(*range(128),
+ 0x80, -2, -2, -2, -2, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ 0xf8f0, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65, 0xff66, 0xff67,
+ 0xff68, 0xff69, 0xff6a, 0xff6b, 0xff6c, 0xff6d, 0xff6e, 0xff6f,
+ 0xff70, 0xff71, 0xff72, 0xff73, 0xff74, 0xff75, 0xff76, 0xff77,
+ 0xff78, 0xff79, 0xff7a, 0xff7b, 0xff7c, 0xff7d, 0xff7e, 0xff7f,
+ 0xff80, 0xff81, 0xff82, 0xff83, 0xff84, 0xff85, 0xff86, 0xff87,
+ 0xff88, 0xff89, 0xff8a, 0xff8b, 0xff8c, 0xff8d, 0xff8e, 0xff8f,
+ 0xff90, 0xff91, 0xff92, 0xff93, 0xff94, 0xff95, 0xff96, 0xff97,
+ 0xff98, 0xff99, 0xff9a, 0xff9b, 0xff9c, 0xff9d, 0xff9e, 0xff9f,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -2, -2, -1,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -1, -1, -1, 0xf8f1, 0xf8f2, 0xf8f3),
)
diff --git a/Lib/encodings/cp949.py b/Lib/encodings/cp949.py
index 627c87125e2aff..7283dba8dbb8d3 100644
--- a/Lib/encodings/cp949.py
+++ b/Lib/encodings/cp949.py
@@ -36,4 +36,13 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=(*range(128),
+ -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1),
)
diff --git a/Lib/encodings/cp950.py b/Lib/encodings/cp950.py
index 39eec5ed0ddef9..d530f914880a32 100644
--- a/Lib/encodings/cp950.py
+++ b/Lib/encodings/cp950.py
@@ -36,4 +36,13 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=(*range(128),
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -1, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1),
)
diff --git a/Lib/encodings/euc_jis_2004.py b/Lib/encodings/euc_jis_2004.py
index 72b87aea68862f..557f926b8cdb97 100644
--- a/Lib/encodings/euc_jis_2004.py
+++ b/Lib/encodings/euc_jis_2004.py
@@ -36,4 +36,13 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=(*range(128),
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -3,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1),
)
diff --git a/Lib/encodings/euc_jisx0213.py b/Lib/encodings/euc_jisx0213.py
index cc47d04112a187..bace554431ba3a 100644
--- a/Lib/encodings/euc_jisx0213.py
+++ b/Lib/encodings/euc_jisx0213.py
@@ -36,4 +36,13 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=(*range(128),
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -3,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1),
)
diff --git a/Lib/encodings/euc_jp.py b/Lib/encodings/euc_jp.py
index 7bcbe4147f2ad4..b8df1bc0e2d5fb 100644
--- a/Lib/encodings/euc_jp.py
+++ b/Lib/encodings/euc_jp.py
@@ -36,4 +36,13 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=(*range(128),
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -3,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1),
)
diff --git a/Lib/encodings/euc_kr.py b/Lib/encodings/euc_kr.py
index c1fb1260e879f0..ee54e17180b5e1 100644
--- a/Lib/encodings/euc_kr.py
+++ b/Lib/encodings/euc_kr.py
@@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=False,
)
diff --git a/Lib/encodings/gb18030.py b/Lib/encodings/gb18030.py
index 34fb6c366a7614..c2269a7a98105c 100644
--- a/Lib/encodings/gb18030.py
+++ b/Lib/encodings/gb18030.py
@@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=False,
)
diff --git a/Lib/encodings/gb2312.py b/Lib/encodings/gb2312.py
index 3c3b837d618ecd..0a9313b05bd75f 100644
--- a/Lib/encodings/gb2312.py
+++ b/Lib/encodings/gb2312.py
@@ -36,4 +36,13 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=(*range(128),
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1),
)
diff --git a/Lib/encodings/gbk.py b/Lib/encodings/gbk.py
index 1b45db89859cdf..45e38bba391533 100644
--- a/Lib/encodings/gbk.py
+++ b/Lib/encodings/gbk.py
@@ -36,4 +36,13 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=(*range(128),
+ -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1),
)
diff --git a/Lib/encodings/hz.py b/Lib/encodings/hz.py
index 383442a3c9ac9a..f17f32e0f6f64c 100644
--- a/Lib/encodings/hz.py
+++ b/Lib/encodings/hz.py
@@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=False,
)
diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py
index d31ee07ab45b76..c896ffdeadfef7 100644
--- a/Lib/encodings/idna.py
+++ b/Lib/encodings/idna.py
@@ -385,4 +385,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter,
streamreader=StreamReader,
+ _expat_decoding_table=False,
)
diff --git a/Lib/encodings/iso2022_jp.py b/Lib/encodings/iso2022_jp.py
index ab0406069356e4..40892f5c151d4d 100644
--- a/Lib/encodings/iso2022_jp.py
+++ b/Lib/encodings/iso2022_jp.py
@@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=False,
)
diff --git a/Lib/encodings/iso2022_jp_1.py b/Lib/encodings/iso2022_jp_1.py
index 997044dc378749..98210d617879e2 100644
--- a/Lib/encodings/iso2022_jp_1.py
+++ b/Lib/encodings/iso2022_jp_1.py
@@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=False,
)
diff --git a/Lib/encodings/iso2022_jp_2.py b/Lib/encodings/iso2022_jp_2.py
index 9106bf762512fd..047cd7c9677c54 100644
--- a/Lib/encodings/iso2022_jp_2.py
+++ b/Lib/encodings/iso2022_jp_2.py
@@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=False,
)
diff --git a/Lib/encodings/iso2022_jp_2004.py b/Lib/encodings/iso2022_jp_2004.py
index 40198bf098570b..9b29edacce3fea 100644
--- a/Lib/encodings/iso2022_jp_2004.py
+++ b/Lib/encodings/iso2022_jp_2004.py
@@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=False,
)
diff --git a/Lib/encodings/iso2022_jp_3.py b/Lib/encodings/iso2022_jp_3.py
index 346e08beccbbaf..a39de6301ccdec 100644
--- a/Lib/encodings/iso2022_jp_3.py
+++ b/Lib/encodings/iso2022_jp_3.py
@@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=False,
)
diff --git a/Lib/encodings/iso2022_jp_ext.py b/Lib/encodings/iso2022_jp_ext.py
index 752bab9813a094..b7470ec9893655 100644
--- a/Lib/encodings/iso2022_jp_ext.py
+++ b/Lib/encodings/iso2022_jp_ext.py
@@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=False,
)
diff --git a/Lib/encodings/iso2022_kr.py b/Lib/encodings/iso2022_kr.py
index bf7018763eae38..48dff8dc68e85c 100644
--- a/Lib/encodings/iso2022_kr.py
+++ b/Lib/encodings/iso2022_kr.py
@@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=False,
)
diff --git a/Lib/encodings/johab.py b/Lib/encodings/johab.py
index 512aeeb732b522..99c9cf6335aaf0 100644
--- a/Lib/encodings/johab.py
+++ b/Lib/encodings/johab.py
@@ -36,4 +36,13 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=(*range(128),
+ -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -1,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1),
)
diff --git a/Lib/encodings/punycode.py b/Lib/encodings/punycode.py
index 268fccbd53974e..279245f435e179 100644
--- a/Lib/encodings/punycode.py
+++ b/Lib/encodings/punycode.py
@@ -250,4 +250,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter,
streamreader=StreamReader,
+ _expat_decoding_table=False,
)
diff --git a/Lib/encodings/raw_unicode_escape.py b/Lib/encodings/raw_unicode_escape.py
index 46c8e070dd192e..911f59ccf9a1dd 100644
--- a/Lib/encodings/raw_unicode_escape.py
+++ b/Lib/encodings/raw_unicode_escape.py
@@ -43,4 +43,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter,
streamreader=StreamReader,
+ _expat_decoding_table=False,
)
diff --git a/Lib/encodings/shift_jis.py b/Lib/encodings/shift_jis.py
index 83381172764dea..4b33b6fd93cbbe 100644
--- a/Lib/encodings/shift_jis.py
+++ b/Lib/encodings/shift_jis.py
@@ -36,4 +36,17 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=(*range(128),
+ -1, -2, -2, -2, -2, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -1, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65, 0xff66, 0xff67,
+ 0xff68, 0xff69, 0xff6a, 0xff6b, 0xff6c, 0xff6d, 0xff6e, 0xff6f,
+ 0xff70, 0xff71, 0xff72, 0xff73, 0xff74, 0xff75, 0xff76, 0xff77,
+ 0xff78, 0xff79, 0xff7a, 0xff7b, 0xff7c, 0xff7d, 0xff7e, 0xff7f,
+ 0xff80, 0xff81, 0xff82, 0xff83, 0xff84, 0xff85, 0xff86, 0xff87,
+ 0xff88, 0xff89, 0xff8a, 0xff8b, 0xff8c, 0xff8d, 0xff8e, 0xff8f,
+ 0xff90, 0xff91, 0xff92, 0xff93, 0xff94, 0xff95, 0xff96, 0xff97,
+ 0xff98, 0xff99, 0xff9a, 0xff9b, 0xff9c, 0xff9d, 0xff9e, 0xff9f,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1),
)
diff --git a/Lib/encodings/shift_jis_2004.py b/Lib/encodings/shift_jis_2004.py
index 161b1e86f9918a..195519eddf50f3 100644
--- a/Lib/encodings/shift_jis_2004.py
+++ b/Lib/encodings/shift_jis_2004.py
@@ -36,4 +36,18 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=(
+ *range(0x5c), 0xa5, *range(0x5d, 0x7e), 0x203e, 0x7f,
+ -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -1, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65, 0xff66, 0xff67,
+ 0xff68, 0xff69, 0xff6a, 0xff6b, 0xff6c, 0xff6d, 0xff6e, 0xff6f,
+ 0xff70, 0xff71, 0xff72, 0xff73, 0xff74, 0xff75, 0xff76, 0xff77,
+ 0xff78, 0xff79, 0xff7a, 0xff7b, 0xff7c, 0xff7d, 0xff7e, 0xff7f,
+ 0xff80, 0xff81, 0xff82, 0xff83, 0xff84, 0xff85, 0xff86, 0xff87,
+ 0xff88, 0xff89, 0xff8a, 0xff8b, 0xff8c, 0xff8d, 0xff8e, 0xff8f,
+ 0xff90, 0xff91, 0xff92, 0xff93, 0xff94, 0xff95, 0xff96, 0xff97,
+ 0xff98, 0xff99, 0xff9a, 0xff9b, 0xff9c, 0xff9d, 0xff9e, 0xff9f,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1),
)
diff --git a/Lib/encodings/shift_jisx0213.py b/Lib/encodings/shift_jisx0213.py
index cb653f53055e67..b533eed6c18cbf 100644
--- a/Lib/encodings/shift_jisx0213.py
+++ b/Lib/encodings/shift_jisx0213.py
@@ -36,4 +36,18 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=(
+ *range(0x5c), 0xa5, *range(0x5d, 0x7e), 0x203e, 0x7f,
+ -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -1, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65, 0xff66, 0xff67,
+ 0xff68, 0xff69, 0xff6a, 0xff6b, 0xff6c, 0xff6d, 0xff6e, 0xff6f,
+ 0xff70, 0xff71, 0xff72, 0xff73, 0xff74, 0xff75, 0xff76, 0xff77,
+ 0xff78, 0xff79, 0xff7a, 0xff7b, 0xff7c, 0xff7d, 0xff7e, 0xff7f,
+ 0xff80, 0xff81, 0xff82, 0xff83, 0xff84, 0xff85, 0xff86, 0xff87,
+ 0xff88, 0xff89, 0xff8a, 0xff8b, 0xff8c, 0xff8d, 0xff8e, 0xff8f,
+ 0xff90, 0xff91, 0xff92, 0xff93, 0xff94, 0xff95, 0xff96, 0xff97,
+ 0xff98, 0xff99, 0xff9a, 0xff9b, 0xff9c, 0xff9d, 0xff9e, 0xff9f,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1),
)
diff --git a/Lib/encodings/unicode_escape.py b/Lib/encodings/unicode_escape.py
index 9b1ce99b339ae0..52e4dc256ce7ff 100644
--- a/Lib/encodings/unicode_escape.py
+++ b/Lib/encodings/unicode_escape.py
@@ -43,4 +43,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter,
streamreader=StreamReader,
+ _expat_decoding_table=False,
)
diff --git a/Lib/encodings/utf_16.py b/Lib/encodings/utf_16.py
index d3b9980026666f..01853d46c89bf0 100644
--- a/Lib/encodings/utf_16.py
+++ b/Lib/encodings/utf_16.py
@@ -152,4 +152,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=False,
)
diff --git a/Lib/encodings/utf_16_be.py b/Lib/encodings/utf_16_be.py
index 86b458eb9bcd96..c4f8753e999b90 100644
--- a/Lib/encodings/utf_16_be.py
+++ b/Lib/encodings/utf_16_be.py
@@ -39,4 +39,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=False,
)
diff --git a/Lib/encodings/utf_16_le.py b/Lib/encodings/utf_16_le.py
index ec454142eedf25..aa68f019f9ea2c 100644
--- a/Lib/encodings/utf_16_le.py
+++ b/Lib/encodings/utf_16_le.py
@@ -39,4 +39,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=False,
)
diff --git a/Lib/encodings/utf_32.py b/Lib/encodings/utf_32.py
index 1924bedbb74c68..446503ccb32ee0 100644
--- a/Lib/encodings/utf_32.py
+++ b/Lib/encodings/utf_32.py
@@ -147,4 +147,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=False,
)
diff --git a/Lib/encodings/utf_32_be.py b/Lib/encodings/utf_32_be.py
index fe272b5fafec69..c430c7ee0ac897 100644
--- a/Lib/encodings/utf_32_be.py
+++ b/Lib/encodings/utf_32_be.py
@@ -34,4 +34,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=False,
)
diff --git a/Lib/encodings/utf_32_le.py b/Lib/encodings/utf_32_le.py
index 9e48210928ee65..7fb33289054770 100644
--- a/Lib/encodings/utf_32_le.py
+++ b/Lib/encodings/utf_32_le.py
@@ -34,4 +34,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=False,
)
diff --git a/Lib/encodings/utf_7.py b/Lib/encodings/utf_7.py
index 8e0567f2087d65..9f70aaff4f7a7f 100644
--- a/Lib/encodings/utf_7.py
+++ b/Lib/encodings/utf_7.py
@@ -35,4 +35,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=False,
)
diff --git a/Lib/encodings/utf_8.py b/Lib/encodings/utf_8.py
index 1bf6336571547b..854cb88375c37f 100644
--- a/Lib/encodings/utf_8.py
+++ b/Lib/encodings/utf_8.py
@@ -39,4 +39,13 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _expat_decoding_table=(*range(128),
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
+ -4, -4, -4, -4, -4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1),
)
diff --git a/Lib/encodings/utf_8_sig.py b/Lib/encodings/utf_8_sig.py
index 1bb479203f365d..cc895cddcc561b 100644
--- a/Lib/encodings/utf_8_sig.py
+++ b/Lib/encodings/utf_8_sig.py
@@ -127,4 +127,14 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ # The same as for UTF-8.
+ _expat_decoding_table=(*range(128),
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+ -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
+ -4, -4, -4, -4, -4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1),
)
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 79c8a7ef886482..8fdd08df9e4f46 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -1892,9 +1892,11 @@ def test_copy(self):
self.assertIsNot(dup, orig)
self.assertEqual(dup, orig)
self.assertTrue(orig._is_text_encoding)
+ self.assertIsInstance(orig._expat_decoding_table, tuple)
self.assertEqual(dup.encode, orig.encode)
self.assertEqual(dup.name, orig.name)
self.assertEqual(dup.incrementalencoder, orig.incrementalencoder)
+ self.assertIs(dup._expat_decoding_table, orig._expat_decoding_table)
# Test a CodecInfo with _is_text_encoding equal to false.
orig = codecs.lookup("base64")
@@ -1902,9 +1904,11 @@ def test_copy(self):
self.assertIsNot(dup, orig)
self.assertEqual(dup, orig)
self.assertFalse(orig._is_text_encoding)
+ self.assertNotHasAttr(orig, '_expat_decoding_table')
self.assertEqual(dup.encode, orig.encode)
self.assertEqual(dup.name, orig.name)
self.assertEqual(dup.incrementalencoder, orig.incrementalencoder)
+ self.assertNotHasAttr(dup, '_expat_decoding_table')
def test_deepcopy(self):
orig = codecs.lookup('utf-8')
@@ -1912,9 +1916,11 @@ def test_deepcopy(self):
self.assertIsNot(dup, orig)
self.assertEqual(dup, orig)
self.assertTrue(orig._is_text_encoding)
+ self.assertIsInstance(orig._expat_decoding_table, tuple)
self.assertEqual(dup.encode, orig.encode)
self.assertEqual(dup.name, orig.name)
self.assertEqual(dup.incrementalencoder, orig.incrementalencoder)
+ self.assertIs(dup._expat_decoding_table, orig._expat_decoding_table)
# Test a CodecInfo with _is_text_encoding equal to false.
orig = codecs.lookup("base64")
@@ -1922,9 +1928,11 @@ def test_deepcopy(self):
self.assertIsNot(dup, orig)
self.assertEqual(dup, orig)
self.assertFalse(orig._is_text_encoding)
+ self.assertNotHasAttr(orig, '_expat_decoding_table')
self.assertEqual(dup.encode, orig.encode)
self.assertEqual(dup.name, orig.name)
self.assertEqual(dup.incrementalencoder, orig.incrementalencoder)
+ self.assertNotHasAttr(dup, '_expat_decoding_table')
def test_pickle(self):
codec_info = codecs.lookup('utf-8')
@@ -1940,6 +1948,8 @@ def test_pickle(self):
unpickled_codec_info.incrementalencoder
)
self.assertTrue(unpickled_codec_info._is_text_encoding)
+ self.assertEqual(unpickled_codec_info._expat_decoding_table,
+ codec_info._expat_decoding_table)
# Test a CodecInfo with _is_text_encoding equal to false.
codec_info = codecs.lookup('base64')
@@ -1955,6 +1965,7 @@ def test_pickle(self):
unpickled_codec_info.incrementalencoder
)
self.assertFalse(unpickled_codec_info._is_text_encoding)
+ self.assertNotHasAttr(unpickled_codec_info, '_expat_decoding_table')
class StreamReaderTest(unittest.TestCase):
diff --git a/Lib/test/test_pyexpat.py b/Lib/test/test_pyexpat.py
index 10dca684accee3..3c33c32a6a77f7 100644
--- a/Lib/test/test_pyexpat.py
+++ b/Lib/test/test_pyexpat.py
@@ -276,7 +276,9 @@ def test_parse_again(self):
expat.errors.XML_ERROR_FINISHED)
@support.subTests('encoding', [
- 'utf-8', 'utf-16', 'utf-16be', 'utf-16le',
+ # built-in Expat encodings
+ 'iso-8859-1', 'utf-8', 'utf-16', 'utf-16be', 'utf-16le',
+ # 8-bit Python encodings
'iso8859-1', 'iso8859-2', 'iso8859-3', 'iso8859-4', 'iso8859-5',
'iso8859-6', 'iso8859-7', 'iso8859-8', 'iso8859-9', 'iso8859-10',
'iso8859-13', 'iso8859-14', 'iso8859-15', 'iso8859-16',
@@ -288,6 +290,12 @@ def test_parse_again(self):
'mac-cyrillic', 'mac-greek', 'mac-iceland', 'mac-latin2',
'mac-roman', 'mac-turkish',
'koi8-r', 'koi8-t', 'koi8-u', 'kz1048', 'ptcp154',
+ # multi-byte Python encodings
+ "cp932", "cp949", "cp950",
+ "Big5","EUC-JP", "GB2312", "GBK", "johab", "Shift_JIS",
+ 'UTF8', 'utf-8-sig',
+ "Big5-HKSCS", "EUC_JIS-2004", "EUC_JISX0213",
+ "Shift_JIS-2004", "Shift_JISX0213",
])
def test_supported_encodings(self, encoding):
out = self.Outputter()
@@ -305,7 +313,7 @@ def test_supported_encodings(self, encoding):
])
@support.subTests('encoding', [
- 'UTF-8', 'utf-8', 'utf-16', 'utf-16le', 'utf-16be',
+ 'UTF-8', 'utf-8', 'utf8', 'utf-16', 'utf-16le', 'utf-16be',
'koi8-u', 'cp1125', 'cp1251', 'iso8859-5', 'mac-cyrillic',
])
def test_supported_encodings2(self, encoding):
@@ -324,15 +332,46 @@ def test_supported_encodings2(self, encoding):
"End element: 'корінь'",
])
+ @support.subTests('encoding', [
+ 'utf-8', 'utf-16', 'utf-16be', 'utf-16le',
+ ])
+ def test_supported_non_bmp(self, encoding):
+ out = self.Outputter()
+ parser = expat.ParserCreate()
+ self._hookup_callbacks(parser, out)
+ c = '\U00020e6d\U00028e36'  # two code points above U+FFFF (outside the BMP)
+ data = (f'<?xml version="1.0" encoding="{encoding}"?>\n'
+ f'<root>{c}</root>').encode(encoding)
+ parser.Parse(data, True)
+ self.assertEqual(out.out, [
+ ('XML declaration', ('1.0', encoding, -1)),
+ "Start element: 'root' {}",
+ f'Character data: {c!r}',
+ "End element: 'root'",
+ ])
+
+ @support.subTests('encoding', [
+ 'UTF8', 'utf-8-sig',
+ "Big5-HKSCS", "EUC_JIS-2004", "EUC_JISX0213",
+ "Shift_JIS-2004", "Shift_JISX0213",
+ ])
+ def test_unsupported_non_bmp(self, encoding):
+ parser = expat.ParserCreate()
+ c = '\U00020e6d\U00028e36'  # two code points above U+FFFF (outside the BMP)
+ data = (f'<?xml version="1.0" encoding="{encoding}"?>\n'
+ f'<root>{c}</root>').encode(encoding)
+ with self.assertRaises(expat.ExpatError):
+ parser.Parse(data, True)
+
@support.subTests('encoding', [
'UTF-7',
- "Big5-HKSCS", "Big5",
- "cp932", "cp949", "cp950",
- "EUC_JIS-2004", "EUC_JISX0213", "EUC-JP", "EUC-KR",
- "GB18030", "GB2312", "GBK",
+ "unicode-escape", "raw-unicode-escape",
+ "EUC-KR",
+ "GB18030",
+ "HZ-GB-2312",
+ "ISO-2022-JP", "ISO-2022-JP-1", "ISO-2022-JP-2004",
+ "ISO-2022-JP-2", "ISO-2022-JP-3", "ISO-2022-JP-EXT",
"ISO-2022-KR",
- "johab",
- "Shift_JIS", "Shift_JIS-2004", "Shift_JISX0213",
])
def test_unsupported_encodings(self, encoding):
parser = expat.ParserCreate()
diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py
index 3a41ea97a2e0a2..8fef5bf663a7c4 100644
--- a/Lib/test/test_xml_etree.py
+++ b/Lib/test/test_xml_etree.py
@@ -1008,6 +1008,8 @@ def check(encoding, body=''):
check("iso-8859-15", '\u20ac')
check("cp437", '\u221a')
check("mac-roman", '\u02da')
+ check('shift-jis-2004', '\u203e\u3406\uff66')
+ check('euc-jis-2004', '\u3406\uff66')
def xml(encoding, body=''):
return "%s" % (encoding, body)
@@ -1026,6 +1028,12 @@ def bxml(encoding, body=''):
'mac-cyrillic', 'mac-greek', 'mac-iceland', 'mac-latin2',
'mac-roman', 'mac-turkish',
'koi8-r', 'koi8-t', 'koi8-u', 'kz1048', 'ptcp154',
+ 'big5', 'big5hkscs',
+ 'cp932', 'cp949', 'cp950',
+ 'euc-jp', 'euc-jis-2004', 'euc-jisx0213',
+ 'gb2312', 'gbk', 'johab',
+ 'shift-jis', 'shift-jis-2004', 'shift-jisx0213',
+ 'utf-8-sig', 'utf8',
]
for encoding in supported_encodings:
with self.subTest(encoding=encoding):
@@ -1035,12 +1043,10 @@ def bxml(encoding, body=''):
('%d;' % ord(c)).encode())
unsupported_ascii_compatible_encodings = [
- 'big5', 'big5hkscs',
- 'cp932', 'cp949', 'cp950',
- 'euc-jp', 'euc-jis-2004', 'euc-jisx0213', 'euc-kr',
- 'gb2312', 'gbk', 'gb18030',
- 'iso2022-kr', 'johab',
- 'shift-jis', 'shift-jis-2004', 'shift-jisx0213',
+ 'euc-kr', 'gb18030',
+ 'iso2022-jp', 'iso2022-jp-1', 'iso2022-jp-2', 'iso2022-jp-2004',
+ 'iso2022-jp-3', 'iso2022-jp-ext',
+ 'iso2022-kr', 'hz',
'utf-7',
]
for encoding in unsupported_ascii_compatible_encodings:
diff --git a/Misc/NEWS.d/next/Library/2026-05-14-17-01-19.gh-issue-62259.ytlFD5.rst b/Misc/NEWS.d/next/Library/2026-05-14-17-01-19.gh-issue-62259.ytlFD5.rst
new file mode 100644
index 00000000000000..d0af77366378b8
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2026-05-14-17-01-19.gh-issue-62259.ytlFD5.rst
@@ -0,0 +1,9 @@
+Add support for multiple multi-byte encodings in the :mod:`XML parser
+<xml.parsers.expat>`: "cp932", "cp949", "cp950", "Big5", "EUC-JP", "GB2312",
+"GBK", "johab", and "Shift_JIS". Add partial support (only BMP characters)
+for multi-byte encodings "Big5-HKSCS", "EUC_JIS-2004", "EUC_JISX0213",
+"Shift_JIS-2004", "Shift_JISX0213", "utf-8-sig" and non-standard aliases
+like "UTF8" (without hyphen). The parser now raises :exc:`ValueError` for
+known unsupported multi-byte encodings such as "ISO-2022-JP" or
+"raw-unicode-escape" instead of failing later, when encountering non-ASCII
+data.
diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c
index 64314e5dff93a1..c0548a5d1bd724 100644
--- a/Modules/pyexpat.c
+++ b/Modules/pyexpat.c
@@ -4,6 +4,7 @@
#include "Python.h"
#include "pycore_ceval.h" // _Py_EnterRecursiveCall()
+#include "pycore_codecs.h" // _PyCodec_LookupTextEncoding()
#include "pycore_import.h" // _PyImport_SetModule()
#include "pycore_pyhash.h" // _Py_HashSecret
#include "pycore_traceback.h" // _PyTraceback_Add()
@@ -1433,6 +1434,57 @@ static struct PyMethodDef xmlparse_methods[] = {
Make it as simple as possible.
*/
+typedef struct {
+ int map[256];
+ char name[0];
+} pyexpat_encoding_info;
+
+static pyexpat_encoding_info *
+pyexpat_encoding_create(const char *name, PyObject *mapping)
+{
+ if (!PyTuple_Check(mapping) || PyTuple_GET_SIZE(mapping) != 256) {
+ PyErr_SetString(PyExc_ValueError,
+ "_expat_decoding_table must be a 256-tuple of integers");
+ return NULL;
+ }
+ pyexpat_encoding_info *info = (pyexpat_encoding_info *)PyMem_Malloc(
+ sizeof(pyexpat_encoding_info) + strlen(name) + 1);
+ if (info == NULL) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+ for (int i = 0; i < 256; i++) {
+ int j = PyLong_AsInt(PyTuple_GET_ITEM(mapping, i));
+ if (j == -1 && PyErr_Occurred()) {
+ PyMem_Free(info);
+ return NULL;
+ }
+ info->map[i] = j;
+ }
+ strcpy(info->name, name);
+ return info;
+}
+
+static int
+pyexpat_encoding_convert(void *data, const char *s)
+{
+ pyexpat_encoding_info *info = (pyexpat_encoding_info *)data;
+ int i = (unsigned char)s[0];
+ assert(info->map[i] < -1);
+ PyObject *u = PyUnicode_Decode(s, -info->map[i], info->name, NULL);
+ if (u == NULL) {
+ return -1;
+ }
+ if (PyUnicode_GET_LENGTH(u) != 1) {
+ Py_DECREF(u);
+ return -1;
+ }
+ Py_UCS4 ch = PyUnicode_ReadChar(u, 0);
+ Py_DECREF(u);
+ return (int)ch;
+}
+
+
static const unsigned char template_buffer[256] =
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
@@ -1465,6 +1517,43 @@ PyUnknownEncodingHandler(void *encodingHandlerData,
if (PyErr_Occurred())
return XML_STATUS_ERROR;
+ PyObject *codec = _PyCodec_LookupTextEncoding(name, NULL);
+ if (codec == NULL) {
+ return XML_STATUS_ERROR;
+ }
+ if (!PyTuple_CheckExact(codec)) {
+ PyObject *attr;
+ if (PyObject_GetOptionalAttrString(codec, "_expat_decoding_table", &attr) < 0) {
+ Py_DECREF(codec);
+ return XML_STATUS_ERROR;
+ }
+ if (attr != NULL) {
+ if (attr == Py_False) {
+ Py_DECREF(attr);
+ Py_DECREF(codec);
+ PyErr_Format(PyExc_ValueError,
+ "encoding '%s' is not supported",
+ name);
+ return XML_STATUS_ERROR;
+ }
+ pyexpat_encoding_info *data = pyexpat_encoding_create(name, attr);
+ Py_DECREF(attr);
+ if (data == NULL) {
+ Py_DECREF(codec);
+ return XML_STATUS_ERROR;
+ }
+ for (i = 0; i < 256; i++) {
+ info->map[i] = data->map[i];
+ }
+ info->data = data;
+ info->convert = pyexpat_encoding_convert;
+ info->release = PyMem_Free;
+ Py_DECREF(codec);
+ return XML_STATUS_OK;
+ }
+ }
+ Py_DECREF(codec);
+
u = PyUnicode_Decode((const char*) template_buffer, 256, name, "replace");
if (u == NULL) {
Py_XDECREF(u);
@@ -1473,8 +1562,9 @@ PyUnknownEncodingHandler(void *encodingHandlerData,
if (PyUnicode_GET_LENGTH(u) != 256) {
Py_DECREF(u);
- PyErr_SetString(PyExc_ValueError,
- "multi-byte encodings are not supported");
+ PyErr_Format(PyExc_ValueError,
+ "multi-byte encoding '%s' is not supported",
+ name);
return XML_STATUS_ERROR;
}
diff --git a/Python/codecs.c b/Python/codecs.c
index 0bde56c0ac662e..a522e6b88068b3 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -10,6 +10,7 @@ Copyright (c) Corporation for National Research Initiatives.
#include "Python.h"
#include "pycore_call.h" // _PyObject_CallNoArgs()
+#include "pycore_codecs.h" // export _PyCodec_LookupTextEncoding()
#include "pycore_interp.h" // PyInterpreterState.codec_search_path
#include "pycore_pyerrors.h" // _PyErr_FormatNote()
#include "pycore_pystate.h" // _PyInterpreterState_GET()