302 lines
9.6 KiB
Diff
302 lines
9.6 KiB
Diff
|
From fa9b6845ed583f5486372c6ffbc59e02a140d303 Mon Sep 17 00:00:00 2001
|
||
|
From: =?UTF-8?q?Lubo=C5=A1=20Lu=C5=88=C3=A1k?= <l.lunak@centrum.cz>
|
||
|
Date: Thu, 29 Apr 2021 19:12:20 +0200
|
||
|
Subject: [PATCH] allow utf-8 in xml names (#137)
|
||
|
|
||
|
https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NameStartChar
|
||
|
has a list of all allowed characters.
|
||
|
---
|
||
|
include/orcus/sax_parser_base.hpp | 3 +
|
||
|
src/orcus_test_xml.cpp | 1 +
|
||
|
src/parser/sax_parser_base.cpp | 201 ++++++++++++++++++++++++++++--
|
||
|
test/xml/non-ascii/check.txt | 4 +
|
||
|
test/xml/non-ascii/input.xml | 4 +
|
||
|
5 files changed, 201 insertions(+), 12 deletions(-)
|
||
|
create mode 100644 test/xml/non-ascii/check.txt
|
||
|
create mode 100644 test/xml/non-ascii/input.xml
|
||
|
|
||
|
diff --git a/include/orcus/sax_parser_base.hpp b/include/orcus/sax_parser_base.hpp
|
||
|
index 9939e133..8394c07b 100644
|
||
|
--- a/include/orcus/sax_parser_base.hpp
|
||
|
+++ b/include/orcus/sax_parser_base.hpp
|
||
|
@@ -218,6 +218,9 @@ protected:
|
||
|
void element_name(parser_element& elem, std::ptrdiff_t begin_pos);
|
||
|
void attribute_name(pstring& attr_ns, pstring& attr_name);
|
||
|
void characters_with_encoded_char(cell_buffer& buf);
|
||
|
+
|
||
|
+ int is_name_char();
|
||
|
+ int is_name_start_char();
|
||
|
};
|
||
|
|
||
|
}}
|
||
|
diff --git a/src/orcus_test_xml.cpp b/src/orcus_test_xml.cpp
|
||
|
index 8a864d68..35f3dea7 100644
|
||
|
--- a/src/orcus_test_xml.cpp
|
||
|
+++ b/src/orcus_test_xml.cpp
|
||
|
@@ -77,6 +77,7 @@ const char* sax_parser_test_dirs[] = {
|
||
|
SRCDIR"/test/xml/no-decl-1/",
|
||
|
SRCDIR"/test/xml/underscore-identifier/",
|
||
|
SRCDIR"/test/xml/self-closing-root/",
|
||
|
+ SRCDIR"/test/xml/non-ascii/",
|
||
|
};
|
||
|
|
||
|
const char* sax_parser_parse_only_test_dirs[] = {
|
||
|
diff --git a/src/parser/sax_parser_base.cpp b/src/parser/sax_parser_base.cpp
|
||
|
index 97aa34ec..db51ff94 100644
|
||
|
--- a/src/parser/sax_parser_base.cpp
|
||
|
+++ b/src/parser/sax_parser_base.cpp
|
||
|
@@ -328,20 +328,182 @@ bool parser_base::value(pstring& str, bool decode)
|
||
|
return transient_stream();
|
||
|
}
|
||
|
|
||
|
+// https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NameStartChar
|
||
|
+// Returns the length in bytes of the character at mp_char if it is an allowed name character, otherwise 0.
|
||
|
+template< bool only_start_name >
|
||
|
+static
|
||
|
+int is_name_char_helper(const char* mp_char, const char* mp_end)
|
||
|
+{
|
||
|
+ const unsigned char first = mp_char[0];
|
||
|
+ // Note that ':' technically is an allowed name character, but it is handled separately
|
||
|
+ // e.g. in element_name(), so here pretend it isn't.
|
||
|
+ if (/*first == ':' ||*/ first == '_' || (first >= 'A' && first <= 'Z') || (first >= 'a' && first <= 'z'))
|
||
|
+ return 1;
|
||
|
+ if (!only_start_name && (first == '-' || first == '.' || (first >= '0' && first <= '9')))
|
||
|
+ return 1;
|
||
|
+
|
||
|
+ if (first < 0x7f) // other ascii characters are not allowed
|
||
|
+ return 0;
|
||
|
+ if (mp_end < mp_char + 1)
|
||
|
+ return 0;
|
||
|
+ const unsigned char second = mp_char[1];
|
||
|
+
|
||
|
+ // 0xb7 = 0xc2 0xb7 utf-8
|
||
|
+ if (!only_start_name && first == 0xc2 && second == 0xb7)
|
||
|
+ return 2;
|
||
|
+
|
||
|
+ // [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF]
|
||
|
+ // 0xc0 = 0xc3 0x80 utf-8
|
||
|
+ if (first < 0xc3)
|
||
|
+ return 0;
|
||
|
+ // 0xd7 = 0xc3 0x97 utf-8, 0xf7 = 0xc3 0xb7 utf-8
|
||
|
+ if (first == 0xc3)
|
||
|
+ return second >= 0x80 && second <= 0xff && second != 0x97 && second != 0xb7 ? 2 : 0;
|
||
|
+ // 0x2ff = 0xcb 0xbf utf-8, 0x300 = 0xcc 0x80 utf-8
|
||
|
+ if (first >= 0xc4 && first <= 0xcb)
|
||
|
+ return 2;
|
||
|
+
|
||
|
+ // [#x0300-#x036F]
|
||
|
+ // 0x0300 = 0xcc 0x80 utf-8, 0x36f = 0xcd 0xaf utf-8
|
||
|
+ if (!only_start_name && first == 0xcc)
|
||
|
+ return 2;
|
||
|
+ if (!only_start_name && first == 0xcd && second <= 0xaf)
|
||
|
+ return 2;
|
||
|
+
|
||
|
+ // [#x370-#x37D] | [#x37F-#x1FFF]
|
||
|
+ // 0x370 = 0xcd 0xb0 utf-8, 0x37e = 0xcd 0xbe utf-8
|
||
|
+ if (first < 0xcd)
|
||
|
+ return 0;
|
||
|
+ if (first == 0xcd)
|
||
|
+ return second >= 0xb0 && second != 0xbe ? 2 : 0;
|
||
|
+ // 0x07ff = 0xdf 0xbf utf-8 (the last 2-byte utf-8)
|
||
|
+ if (first <= 0xdf)
|
||
|
+ return 2;
|
||
|
+
|
||
|
+ if (first < 0xe0)
|
||
|
+ return 0;
|
||
|
+ if (mp_end < mp_char + 2)
|
||
|
+ return 0;
|
||
|
+ const unsigned char third = mp_char[2];
|
||
|
+
|
||
|
+ // 0x0800 = 0xe0 0xa0 0x80 utf-8, 0x1fff = 0xe1 0xbf 0xbf utf-8, 0x2000 = 0xe2 0x80 0x80 utf-8
|
||
|
+ if (first == 0xe0 || first == 0xe1)
|
||
|
+ return 3;
|
||
|
+
|
||
|
+ // [#x200C-#x200D]
|
||
|
+ // 0x200c = 0xe2 0x80 0x8c utf-8, 0x200d = 0xe2 0x80 0x8d utf-8
|
||
|
+ if (first < 0xe2)
|
||
|
+ return 0;
|
||
|
+ if (first == 0xe2 && second == 0x80 && (third == 0x8c || third == 0x8d))
|
||
|
+ return 3;
|
||
|
+
|
||
|
+ // [#x203F-#x2040]
|
||
|
+ // 0x203f = 0xe2 0x80 0xbf utf-8, 0x2040 = 0xe2 0x81 0x80 utf-8
|
||
|
+ if (!only_start_name && first == 0xe2 && second == 0x80 && third == 0xbf)
|
||
|
+ return 3;
|
||
|
+ if (!only_start_name && first == 0xe2 && second == 0x81 && third == 0x80)
|
||
|
+ return 3;
|
||
|
+
|
||
|
+ // [#x2070-#x218F]
|
||
|
+ // 0x2070 = 0xe2 0x81 0xb0 utf-8, 0x218f = 0xe2 0x86 0x8f utf-8
|
||
|
+ if (first == 0xe2)
|
||
|
+ {
|
||
|
+ if (second < 0x81)
|
||
|
+ return 0;
|
||
|
+ if (second >= 0x81 && second < 0x86)
|
||
|
+ return 3;
|
||
|
+ if (second == 0x86 && third <= 0x8f)
|
||
|
+ return 3;
|
||
|
+ }
|
||
|
+
|
||
|
+ // [#x2C00-#x2FEF]
|
||
|
+ // 0x2c00 = 0xe2 0xb0 0x80 utf-8, 0x2fef = 0xe2 0xbf 0xaf utf-8
|
||
|
+ if (first == 0xe2)
|
||
|
+ {
|
||
|
+ if (second < 0xb0)
|
||
|
+ return 0;
|
||
|
+ if (second < 0xbf)
|
||
|
+ return 3;
|
||
|
+ if (second == 0xbf && third <= 0xaf)
|
||
|
+ return 3;
|
||
|
+ }
|
||
|
+
|
||
|
+ // [#x3001-#xD7FF]
|
||
|
+ // 0x3001 = 0xe3 0x80 0x81 utf-8, 0xd7ff = 0xed 0x9f 0xbf utf-8, 0xd800 = 0xed 0xa0 0x80 utf-8
|
||
|
+ if (first < 0xe3)
|
||
|
+ return 0;
|
||
|
+ if (first < 0xed)
|
||
|
+ return 3;
|
||
|
+ if (first == 0xed && second <= 0x9f)
|
||
|
+ return 3;
|
||
|
+
|
||
|
+ // [#xF900-#xFDCF]
|
||
|
+ // 0xf900 = 0xef 0xa4 0x80 utf-8, 0xfdcf = 0xef 0xb7 0x8f utf-8
|
||
|
+ if (first == 0xef)
|
||
|
+ {
|
||
|
+ if (second < 0xa4)
|
||
|
+ return 0;
|
||
|
+ if (second < 0xb7)
|
||
|
+ return 3;
|
||
|
+ if (second == 0xb7 && third <= 0x8f)
|
||
|
+ return 3;
|
||
|
+ }
|
||
|
+
|
||
|
+ // [#xFDF0-#xFFFD]
|
||
|
+ // 0xfdf0 = 0xef 0xb7 0xb0 utf-8, 0xfffd = 0xef 0xbf 0xbd utf-8
|
||
|
+ if (first == 0xef)
|
||
|
+ {
|
||
|
+ assert(second >= 0xb7);
|
||
|
+ if (second == 0xb7 && third < 0xb0)
|
||
|
+ return 0;
|
||
|
+ if (second < 0xbe)
|
||
|
+ return 3;
|
||
|
+ if (second == 0xbf && third <= 0xbd)
|
||
|
+ return 3;
|
||
|
+ }
|
||
|
+
|
||
|
+ if (first < 0xf0)
|
||
|
+ return 0;
|
||
|
+ if (mp_end < mp_char + 3)
|
||
|
+ return 0;
|
||
|
+ // const unsigned char fourth = mp_char[3];
|
||
|
+
|
||
|
+ // [#x10000-#xEFFFF]
|
||
|
+ // 0x10000 = 0xf0 0x90 0x80 0x80 utf-8, 0xeffff = 0xf3 0xaf 0xbf 0xbf utf-8,
|
||
|
+ // 0xf0000 = 0xf3 0xb0 0x80 0x80 utf-8
|
||
|
+ if (first >= 0xf0 && first < 0xf2)
|
||
|
+ return 4;
|
||
|
+ if (first == 0xf3 && second < 0xb0)
|
||
|
+ return 4;
|
||
|
+
|
||
|
+ return 0;
|
||
|
+}
|
||
|
+
|
||
|
+int parser_base::is_name_char()
|
||
|
+{
|
||
|
+ return is_name_char_helper<false>(mp_char, mp_end);
|
||
|
+}
|
||
|
+
|
||
|
+int parser_base::is_name_start_char()
|
||
|
+{
|
||
|
+ return is_name_char_helper<true>(mp_char, mp_end);
|
||
|
+}
|
||
|
+
|
||
|
void parser_base::name(pstring& str)
|
||
|
{
|
||
|
const char* p0 = mp_char;
|
||
|
- char c = cur_char();
|
||
|
- if (!is_alpha(c) && c != '_')
|
||
|
+ int skip = is_name_start_char();
|
||
|
+ if (skip == 0)
|
||
|
{
|
||
|
::std::ostringstream os;
|
||
|
- os << "name must begin with an alphabet, but got this instead '" << c << "'";
|
||
|
+ os << "name must begin with an alphabet, but got this instead '" << cur_char() << "'";
|
||
|
throw malformed_xml_error(os.str(), offset());
|
||
|
}
|
||
|
+ next(skip);
|
||
|
|
||
|
#if defined(__ORCUS_CPU_FEATURES) && defined(__SSE4_2__)
|
||
|
|
||
|
- const __m128i match = _mm_loadu_si128((const __m128i*)"azAZ09--__");
|
||
|
+ const __m128i match = _mm_loadu_si128((const __m128i*)"azAZ09--__..");
|
||
|
const int mode = _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES | _SIDD_UBYTE_OPS | _SIDD_NEGATIVE_POLARITY;
|
||
|
|
||
|
size_t n_total = available_size();
|
||
|
@@ -351,20 +513,35 @@ void parser_base::name(pstring& str)
|
||
|
__m128i char_block = _mm_loadu_si128((const __m128i*)mp_char);
|
||
|
|
||
|
int n = std::min<size_t>(16u, n_total);
|
||
|
- int r = _mm_cmpestri(match, 10, char_block, n, mode);
|
||
|
+ int r = _mm_cmpestri(match, 12, char_block, n, mode);
|
||
|
mp_char += r; // Move the current char position.
|
||
|
+ n_total -= r;
|
||
|
|
||
|
- if (r < 16)
|
||
|
- // No need to move to the next segment. Stop here.
|
||
|
- break;
|
||
|
+ if (r < 16 && n_total)
|
||
|
+ {
|
||
|
+ // There is a character that does not match the SSE-based ASCII-only check.
|
||
|
+ // It may either be an ASCII character that is not allowed, in which case stop,
|
||
|
+ // or it may possibly be an allowed utf-8 character, in which case move over it
|
||
|
+ // using the slow function.
|
||
|
+ skip = is_name_char();
|
||
|
+ if(skip == 0)
|
||
|
+ break;
|
||
|
+ next(skip);
|
||
|
+ n_total -= skip;
|
||
|
+ }
|
||
|
|
||
|
- // Skip 16 chars to the next segment.
|
||
|
- n_total -= 16;
|
||
|
}
|
||
|
+ cur_char_checked(); // check end of xml stream
|
||
|
|
||
|
#else
|
||
|
- while (is_alpha(c) || is_numeric(c) || is_name_char(c))
|
||
|
- c = next_char_checked();
|
||
|
+ for(;;)
|
||
|
+ {
|
||
|
+ cur_char_checked(); // check end of xml stream
|
||
|
+ skip = is_name_char();
|
||
|
+ if(skip == 0)
|
||
|
+ break;
|
||
|
+ next(skip);
|
||
|
+ }
|
||
|
#endif
|
||
|
|
||
|
str = pstring(p0, mp_char-p0);
|
||
|
diff --git a/test/xml/non-ascii/check.txt b/test/xml/non-ascii/check.txt
|
||
|
new file mode 100644
|
||
|
index 00000000..77b7c003
|
||
|
--- /dev/null
|
||
|
+++ b/test/xml/non-ascii/check.txt
|
||
|
@@ -0,0 +1,4 @@
|
||
|
+/Myšička
|
||
|
+/Myšička@jméno="Žužla"
|
||
|
+/Myšička/Nožičky
|
||
|
+/Myšička/Nožičky"4"
|
||
|
diff --git a/test/xml/non-ascii/input.xml b/test/xml/non-ascii/input.xml
|
||
|
new file mode 100644
|
||
|
index 00000000..c516744b
|
||
|
--- /dev/null
|
||
|
+++ b/test/xml/non-ascii/input.xml
|
||
|
@@ -0,0 +1,4 @@
|
||
|
+<?xml version="1.0" encoding="UTF-8"?>
|
||
|
+<Myšička jméno="Žužla">
|
||
|
+ <Nožičky>4</Nožičky>
|
||
|
+</Myšička>
|
||
|
--
|
||
|
2.26.2
|
||
|
|