Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions Lib/test/test_unicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -616,6 +616,34 @@ def test_issue10254(self):
b = 'C\u0338' * 20 + '\xC7'
self.assertEqual(self.db.normalize('NFC', a), b)

def test_long_combining_mark_run(self):
# gh-149079: avoid quadratic canonical ordering.
payload = "a" + ("\u0300\u0327" * 32)
nfd = "a" + ("\u0327" * 32) + ("\u0300" * 32)
nfc = "\u00e0" + ("\u0327" * 32) + ("\u0300" * 31)

self.assertEqual(self.db.normalize("NFD", payload), nfd)
self.assertEqual(self.db.normalize("NFKD", payload), nfd)
self.assertEqual(self.db.normalize("NFC", payload), nfc)
self.assertEqual(self.db.normalize("NFKC", payload), nfc)

def test_combining_mark_run_fast_paths(self):
# gh-149079: cover short runs and already-sorted long runs.
short_payload = "a" + ("\u0300\u0327" * 9) + "\u0300"
short_nfd = "a" + ("\u0327" * 9) + ("\u0300" * 10)
short_nfc = "\u00e0" + ("\u0327" * 9) + ("\u0300" * 9)
long_sorted = "a" + ("\u0327" * 30) + ("\u0300" * 30)
long_sorted_nfc = "\u00e0" + ("\u0327" * 30) + ("\u0300" * 29)

self.assertEqual(self.db.normalize("NFD", short_payload), short_nfd)
self.assertEqual(self.db.normalize("NFKD", short_payload), short_nfd)
self.assertEqual(self.db.normalize("NFC", short_payload), short_nfc)
self.assertEqual(self.db.normalize("NFKC", short_payload), short_nfc)
self.assertEqual(self.db.normalize("NFD", long_sorted), long_sorted)
self.assertEqual(self.db.normalize("NFKD", long_sorted), long_sorted)
self.assertEqual(self.db.normalize("NFC", long_sorted), long_sorted_nfc)
self.assertEqual(self.db.normalize("NFKC", long_sorted), long_sorted_nfc)

def test_issue29456(self):
# Fix #29456
u1176_str_a = '\u1100\u1176\u11a8'
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Fix a potential denial of service in :func:`unicodedata.normalize`. The
canonical ordering step of Unicode normalization used a quadratic-time insertion
sort for reordering combining characters, which could be exploited with
crafted input containing many combining characters in non-canonical order.
Long runs of combining characters are now reordered with a linear-time
counting sort instead.
143 changes: 117 additions & 26 deletions Modules/unicodedata.c
Original file line number Diff line number Diff line change
Expand Up @@ -552,19 +552,80 @@ get_decomp_record(PyObject *self, Py_UCS4 code,
(*index)++;
}

/* Run-length cutoff used by nfd_nfkd() when a combining-character run is
   out of canonical order: runs shorter than this are sorted with
   canonical_ordering_sort_insertion(), runs of this length or longer with
   canonical_ordering_sort_counting().
   Small combining runs are usually cheaper with insertion sort. */
#define CANONICAL_ORDERING_COUNTING_SORT_THRESHOLD 20

/* Sort the code points at indices [start, end) of a string buffer in place
   by canonical combining class, using insertion sort.

   kind/data describe the buffer as accepted by PyUnicode_READ() and
   PyUnicode_WRITE().  Code points with equal combining class keep their
   relative order, because an element is only moved past predecessors
   whose class is strictly greater. */
static void
canonical_ordering_sort_insertion(int kind, void *data,
                                  Py_ssize_t start, Py_ssize_t end)
{
    Py_ssize_t pos;

    for (pos = start + 1; pos < end; pos++) {
        const Py_UCS4 moving = PyUnicode_READ(kind, data, pos);
        const unsigned char moving_ccc = _getrecord_ex(moving)->combining;
        Py_ssize_t slot;

        /* Slide larger predecessors one place to the right until the
           insertion point for `moving` is found. */
        for (slot = pos; slot > start; slot--) {
            const Py_UCS4 left = PyUnicode_READ(kind, data, slot - 1);
            if (_getrecord_ex(left)->combining <= moving_ccc) {
                break;
            }
            PyUnicode_WRITE(kind, data, slot, left);
        }

        /* Nothing was shifted when slot == pos, so skip the write. */
        if (slot < pos) {
            PyUnicode_WRITE(kind, data, slot, moving);
        }
    }
}

/* Sort the code points at indices [start, end) of a string buffer in place
   by canonical combining class, using a counting sort.

   kind/data describe the buffer as accepted by PyUnicode_READ() and
   PyUnicode_WRITE().  sortbuf is caller-provided scratch space and must
   have room for at least (end - start) code points.  Code points with
   equal combining class keep their relative order, since the input is
   scanned front to back and each bucket is filled in increasing order. */
static void
canonical_ordering_sort_counting(int kind, void *data,
                                 Py_ssize_t start, Py_ssize_t end,
                                 Py_UCS4 *sortbuf)
{
    /* One bucket per possible combining class value (unsigned char). */
    Py_ssize_t next_slot[256] = {0};
    Py_ssize_t offset = 0;
    Py_ssize_t pos;
    size_t ccc;

    /* Pass 1: histogram of combining classes in the run. */
    for (pos = start; pos < end; pos++) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        unsigned char ch_ccc = _getrecord_ex(ch)->combining;
        next_slot[ch_ccc]++;
    }

    /* Turn the histogram into an exclusive prefix sum, so that each
       entry holds the first output index of its bucket. */
    for (ccc = 0; ccc < Py_ARRAY_LENGTH(next_slot); ccc++) {
        Py_ssize_t bucket_size = next_slot[ccc];
        next_slot[ccc] = offset;
        offset += bucket_size;
    }

    /* Pass 2: scatter each code point to the next free slot of its
       bucket in the scratch buffer. */
    for (pos = start; pos < end; pos++) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        unsigned char ch_ccc = _getrecord_ex(ch)->combining;
        sortbuf[next_slot[ch_ccc]] = ch;
        next_slot[ch_ccc]++;
    }

    /* Copy the sorted run back over the original positions. */
    for (pos = start; pos < end; pos++) {
        PyUnicode_WRITE(kind, data, pos, sortbuf[pos - start]);
    }
}

static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
PyObject *result;
Py_UCS4 *output;
Py_ssize_t i, o, osize;
int kind;
const void *data;
int input_kind, result_kind;
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not reuse the same variable?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIRC, I asked to have two different variables for readability purposes. We could reuse it but when reading the code, it was cleaner when I saw the separation. But it can be reverted if you insist.

const void *input_data;
void *result_data;
/* Longest decomposition in Unicode 3.2: U+FDFA */
Py_UCS4 stack[20];
Py_ssize_t space, isize;
int index, prefix, count, stackptr;
unsigned char prev, cur;
Py_UCS4 *sortbuf = NULL;
Py_ssize_t sortbuflen = 0;

stackptr = 0;
isize = PyUnicode_GET_LENGTH(input);
Expand All @@ -584,11 +645,11 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
return NULL;
}
i = o = 0;
kind = PyUnicode_KIND(input);
data = PyUnicode_DATA(input);
input_kind = PyUnicode_KIND(input);
input_data = PyUnicode_DATA(input);

while (i < isize) {
stack[stackptr++] = PyUnicode_READ(kind, data, i++);
stack[stackptr++] = PyUnicode_READ(input_kind, input_data, i++);
while(stackptr) {
Py_UCS4 code = stack[--stackptr];
/* Hangul Decomposition adds three characters in
Expand Down Expand Up @@ -656,34 +717,64 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
if (!result)
return NULL;

kind = PyUnicode_KIND(result);
data = PyUnicode_DATA(result);
result_kind = PyUnicode_KIND(result);
result_data = PyUnicode_DATA(result);

/* Sort canonically. */
/* Sort each consecutive combining-character run canonically. */
i = 0;
prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
if (prev == 0 || cur == 0 || prev <= cur) {
prev = cur;
while (i < o) {
Py_ssize_t run_length, run_start;
int needs_sort = 0;

Py_UCS4 ch = PyUnicode_READ(result_kind, result_data, i);
prev = _getrecord_ex(ch)->combining;
if (prev == 0) {
i++;
continue;
}
/* Non-canonical order. Need to switch *i with previous. */
o = i - 1;
while (1) {
Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
PyUnicode_WRITE(kind, data, o+1,
PyUnicode_READ(kind, data, o));
PyUnicode_WRITE(kind, data, o, tmp);
o--;
if (o < 0)
break;
prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
if (prev == 0 || prev <= cur)

run_start = i++;
while (i < o) {
Py_UCS4 ch = PyUnicode_READ(result_kind, result_data, i);
cur = _getrecord_ex(ch)->combining;
if (cur == 0) {
break;
}
if (prev > cur) {
needs_sort = 1;
}
prev = cur;
i++;
}
if (!needs_sort) {
continue;
}

run_length = i - run_start;
if (run_length < CANONICAL_ORDERING_COUNTING_SORT_THRESHOLD) {
canonical_ordering_sort_insertion(result_kind, result_data,
run_start, i);
continue;
}
prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;

if (run_length > sortbuflen) {
Py_UCS4 *new_sortbuf = PyMem_Resize(sortbuf,
Py_UCS4,
run_length);
if (new_sortbuf == NULL) {
PyErr_NoMemory();
PyMem_Free(sortbuf);
Py_DECREF(result);
return NULL;
}
sortbuf = new_sortbuf;
sortbuflen = run_length;
}

canonical_ordering_sort_counting(result_kind, result_data,
run_start, i, sortbuf);
}
PyMem_Free(sortbuf);
return result;
}

Expand Down
Loading