af65e892dc
Change-Id: I448d3119129a864e79dbef336e46545f9aca8b25 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/137584 Tested-by: Jenkins Reviewed-by: Noel Grandin <noel.grandin@collabora.co.uk>
942 lines
29 KiB
C++
942 lines
29 KiB
C++
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
|
|
/*
|
|
* This file is part of the LibreOffice project.
|
|
*
|
|
* This Source Code Form is subject to the terms of the Mozilla Public
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
*
|
|
* This file incorporates work covered by the following license notice:
|
|
*
|
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
* contributor license agreements. See the NOTICE file distributed
|
|
* with this work for additional information regarding copyright
|
|
* ownership. The ASF licenses this file to you under the Apache
|
|
* License, Version 2.0 (the "License"); you may not use this file
|
|
* except in compliance with the License. You may obtain a copy of
|
|
* the License at http://www.apache.org/licenses/LICENSE-2.0 .
|
|
*/
|
|
|
|
#include <sal/config.h>
|
|
|
|
#include <cassert>
|
|
#include <climits>
|
|
|
|
#include <com/sun/star/container/NoSuchElementException.hpp>
|
|
#include <com/sun/star/uno/RuntimeException.hpp>
|
|
#include <osl/file.h>
|
|
#include <rtl/character.hxx>
|
|
#include <rtl/string.h>
|
|
#include <rtl/ustring.hxx>
|
|
#include <sal/log.hxx>
|
|
#include <sal/types.h>
|
|
#include <utility>
|
|
#include <xmlreader/pad.hxx>
|
|
#include <xmlreader/span.hxx>
|
|
#include <xmlreader/xmlreader.hxx>
|
|
|
|
namespace xmlreader {
|
|
|
|
namespace {

// Tells whether c is one of the four whitespace characters recognized by this
// reader: TAB (0x09), LF (0x0A), CR (0x0D), or SPACE.
bool isSpace(char c) {
    return c == '\x09' || c == '\x0A' || c == '\x0D' || c == ' ';
}

}
|
|
|
|
// Opens the file denoted by fileUrl for reading, maps it into memory, and
// puts the reader into its initial state (positioned at the start of the
// mapping, in Content state, with only the predefined "xml" namespace known).
//
// Throws css::container::NoSuchElementException if the file does not exist,
// and css::uno::RuntimeException if it cannot be opened, sized, or mapped.
XmlReader::XmlReader(OUString fileUrl)
    : fileUrl_(std::move(fileUrl))
    , fileHandle_(nullptr)
{
    oslFileError const openError = osl_openFile(
        fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read);
    if (openError == osl_File_E_NOENT) {
        throw css::container::NoSuchElementException( fileUrl_ );
    }
    if (openError != osl_File_E_None) {
        throw css::uno::RuntimeException(
            "cannot open " + fileUrl_ + ": " + OUString::number(openError));
    }

    // Determine the size first; only attempt the mapping if that succeeded.
    oslFileError mapError = osl_getFileSize(fileHandle_, &fileSize_);
    if (mapError == osl_File_E_None) {
        mapError = osl_mapFile(
            fileHandle_, &fileAddress_, fileSize_, 0,
            osl_File_MapFlag_WillNeed);
    }
    if (mapError != osl_File_E_None) {
        // The destructor will not run for a half-constructed object, so the
        // handle has to be closed here before reporting the failure.
        oslFileError const closeError = osl_closeFile(fileHandle_);
        if (closeError != osl_File_E_None) {
            SAL_WARN(
                "xmlreader",
                "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +closeError);
        }
        throw css::uno::RuntimeException(
            "cannot mmap " + fileUrl_ + " (" + OUString::number(mapError) + ")" );
    }

    // The "xml" prefix is always bound, without any explicit declaration.
    namespaceIris_.emplace_back("http://www.w3.org/XML/1998/namespace");
    namespaces_.emplace_back(Span("xml"), NAMESPACE_XML);

    pos_ = static_cast< char * >(fileAddress_);
    end_ = pos_ + fileSize_;
    state_ = State::Content;
    firstAttribute_ = true;
}
|
|
|
|
// Unmaps and closes the underlying file, if there is one.  Failures are only
// logged, never propagated.
XmlReader::~XmlReader() {
    if (fileHandle_) {
        oslFileError const unmapError
            = osl_unmapMappedFile(fileHandle_, fileAddress_, fileSize_);
        if (unmapError != osl_File_E_None) {
            SAL_WARN(
                "xmlreader",
                "osl_unmapMappedFile of \"" << fileUrl_ << "\" failed with " << +unmapError);
        }
        oslFileError const closeError = osl_closeFile(fileHandle_);
        if (closeError != osl_File_E_None) {
            SAL_WARN(
                "xmlreader",
                "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +closeError);
        }
    }
}
|
|
|
|
// Appends iri to the list of known namespace IRIs and returns the ID assigned
// to it (its index in that list).
int XmlReader::registerNamespaceIri(Span const & iri) {
    int const newId = toNamespaceId(namespaceIris_.size());
    namespaceIris_.push_back(iri);
    // Old user layer .xcu files used the xsi namespace prefix without
    // declaring a corresponding namespace binding, see issue 77174; reading
    // those files during migration would fail without this hack, which
    // pre-binds the "xsi" prefix and can be removed once migration is no
    // longer relevant (see configmgr::Components::parseModificationLayer):
    if (iri == "http://www.w3.org/2001/XMLSchema-instance") {
        namespaces_.emplace_back(Span("xsi"), newId);
    }
    return newId;
}
|
|
|
|
// Advances to the next item and dispatches on the current reader state.
//
// In Content state, reportText selects how text is treated: Text::NONE skips
// it, Text::Raw reports it unnormalized, and anything else reports it
// normalized.  data and nsId are passed on to the handler that needs them.
XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId)
{
    if (state_ == State::Content) {
        if (reportText == Text::NONE) {
            return handleSkippedText(data, nsId);
        }
        if (reportText == Text::Raw) {
            return handleRawText(data);
        }
        // Text::Normalized
        return handleNormalizedText(data);
    }
    if (state_ == State::StartTag) {
        return handleStartTag(nsId, data);
    }
    if (state_ == State::EndTag) {
        return handleEndTag();
    }
    if (state_ == State::EmptyElementTag) {
        // An empty-element tag was already reported as Begin; now report the
        // matching End.
        handleElementEnd();
        return Result::End;
    }
    // State::Done
    return Result::Done;
}
|
|
|
|
bool XmlReader::nextAttribute(int * nsId, Span * localName) {
|
|
assert(nsId != nullptr && localName != nullptr);
|
|
if (firstAttribute_) {
|
|
currentAttribute_ = attributes_.begin();
|
|
firstAttribute_ = false;
|
|
} else {
|
|
++currentAttribute_;
|
|
}
|
|
if (currentAttribute_ == attributes_.end()) {
|
|
return false;
|
|
}
|
|
if (currentAttribute_->nameColon == nullptr) {
|
|
*nsId = NAMESPACE_NONE;
|
|
*localName = Span(
|
|
currentAttribute_->nameBegin,
|
|
currentAttribute_->nameEnd - currentAttribute_->nameBegin);
|
|
} else {
|
|
*nsId = getNamespaceId(
|
|
Span(
|
|
currentAttribute_->nameBegin,
|
|
currentAttribute_->nameColon - currentAttribute_->nameBegin));
|
|
*localName = Span(
|
|
currentAttribute_->nameColon + 1,
|
|
currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1));
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Returns the value of the attribute currently selected by nextAttribute,
// processed by handleAttributeValue with the given fullyNormalize flag.
Span XmlReader::getAttributeValue(bool fullyNormalize) {
    auto const & attribute = *currentAttribute_;
    return handleAttributeValue(
        attribute.valueBegin, attribute.valueEnd, fullyNormalize);
}
|
|
|
|
// Looks up the namespace ID bound to prefix.  The bindings are searched from
// the most recently added one backwards, so the latest binding of a prefix
// wins.  Returns NAMESPACE_UNKNOWN if the prefix is not bound.
int XmlReader::getNamespaceId(Span const & prefix) const {
    for (auto it = namespaces_.crbegin(); it != namespaces_.crend(); ++it) {
        if (prefix == it->prefix) {
            return it->nsId;
        }
    }
    return NAMESPACE_UNKNOWN;
}
|
|
|
|
|
|
// Appends text to pad_, turning every CR that is not followed by LF into LF
// and dropping every CR that is followed by LF (so CR LF becomes a single LF).
void XmlReader::normalizeLineEnds(Span const & text) {
    char const * cursor = text.begin;
    sal_Int32 remaining = text.length;
    sal_Int32 cr = rtl_str_indexOfChar_WithLength(cursor, remaining, '\x0D');
    while (cr >= 0) {
        // Copy everything before the CR, then step over the CR itself.
        pad_.add(cursor, cr);
        cursor += cr + 1;
        remaining -= cr + 1;
        bool const lfFollows = remaining != 0 && *cursor == '\x0A';
        if (!lfFollows) {
            pad_.add("\x0A");
        }
        cr = rtl_str_indexOfChar_WithLength(cursor, remaining, '\x0D');
    }
    pad_.add(cursor, remaining);
}
|
|
|
|
void XmlReader::skipSpace() {
|
|
while (isSpace(peek())) {
|
|
++pos_;
|
|
}
|
|
}
|
|
|
|
bool XmlReader::skipComment() {
|
|
if (rtl_str_shortenedCompare_WithLength(
|
|
pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"),
|
|
RTL_CONSTASCII_LENGTH("--")) !=
|
|
0)
|
|
{
|
|
return false;
|
|
}
|
|
pos_ += RTL_CONSTASCII_LENGTH("--");
|
|
sal_Int32 i = rtl_str_indexOfStr_WithLength(
|
|
pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"));
|
|
if (i < 0) {
|
|
throw css::uno::RuntimeException(
|
|
"premature end (within comment) of " + fileUrl_ );
|
|
}
|
|
pos_ += i + RTL_CONSTASCII_LENGTH("--");
|
|
if (read() != '>') {
|
|
throw css::uno::RuntimeException(
|
|
"illegal \"--\" within comment in " + fileUrl_ );
|
|
}
|
|
return true;
|
|
}
|
|
|
|
void XmlReader::skipProcessingInstruction() {
|
|
sal_Int32 i = rtl_str_indexOfStr_WithLength(
|
|
pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>"));
|
|
if (i < 0) {
|
|
throw css::uno::RuntimeException(
|
|
"bad '<?' in " + fileUrl_ );
|
|
}
|
|
pos_ += i + RTL_CONSTASCII_LENGTH("?>");
|
|
}
|
|
|
|
void XmlReader::skipDocumentTypeDeclaration() {
|
|
// Neither is it checked that the doctypedecl is at the correct position in
|
|
// the document, nor that it is well-formed:
|
|
for (;;) {
|
|
char c = read();
|
|
switch (c) {
|
|
case '\0': // i.e., EOF
|
|
throw css::uno::RuntimeException(
|
|
"premature end (within DTD) of " + fileUrl_ );
|
|
case '"':
|
|
case '\'':
|
|
{
|
|
sal_Int32 i = rtl_str_indexOfChar_WithLength(
|
|
pos_, end_ - pos_, c);
|
|
if (i < 0) {
|
|
throw css::uno::RuntimeException(
|
|
"premature end (within DTD) of " + fileUrl_ );
|
|
}
|
|
pos_ += i + 1;
|
|
}
|
|
break;
|
|
case '>':
|
|
return;
|
|
case '[':
|
|
for (;;) {
|
|
c = read();
|
|
switch (c) {
|
|
case '\0': // i.e., EOF
|
|
throw css::uno::RuntimeException(
|
|
"premature end (within DTD) of " + fileUrl_ );
|
|
case '"':
|
|
case '\'':
|
|
{
|
|
sal_Int32 i = rtl_str_indexOfChar_WithLength(
|
|
pos_, end_ - pos_, c);
|
|
if (i < 0) {
|
|
throw css::uno::RuntimeException(
|
|
"premature end (within DTD) of " + fileUrl_ );
|
|
}
|
|
pos_ += i + 1;
|
|
}
|
|
break;
|
|
case '<':
|
|
switch (read()) {
|
|
case '\0': // i.e., EOF
|
|
throw css::uno::RuntimeException(
|
|
"premature end (within DTD) of " + fileUrl_ );
|
|
case '!':
|
|
skipComment();
|
|
break;
|
|
case '?':
|
|
skipProcessingInstruction();
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
break;
|
|
case ']':
|
|
skipSpace();
|
|
if (read() != '>') {
|
|
throw css::uno::RuntimeException(
|
|
"missing \">\" of DTD in " + fileUrl_ );
|
|
}
|
|
return;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Scans a CDATA section whose leading "<!" has already been consumed.
//
// Returns a default-constructed Span, without consuming anything, if the
// input at pos_ does not start with "[CDATA[".  Otherwise consumes through
// the closing "]]>" and returns the section's content.  Throws
// css::uno::RuntimeException if the closing "]]>" is missing.
Span XmlReader::scanCdataSection() {
    sal_Int32 const openerLength = RTL_CONSTASCII_LENGTH("[CDATA[");
    bool const opensCdata
        = rtl_str_shortenedCompare_WithLength(
              pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["),
              openerLength)
          == 0;
    if (!opensCdata) {
        return Span();
    }
    pos_ += openerLength;
    char const * const content = pos_;
    sal_Int32 const contentLength = rtl_str_indexOfStr_WithLength(
        pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>"));
    if (contentLength < 0) {
        throw css::uno::RuntimeException(
            "premature end (within CDATA section) of " + fileUrl_ );
    }
    pos_ += contentLength + RTL_CONSTASCII_LENGTH("]]>");
    return Span(content, contentLength);
}
|
|
|
|
bool XmlReader::scanName(char const ** nameColon) {
|
|
assert(nameColon != nullptr && *nameColon == nullptr);
|
|
for (char const * begin = pos_;; ++pos_) {
|
|
switch (peek()) {
|
|
case '\0': // i.e., EOF
|
|
case '\x09':
|
|
case '\x0A':
|
|
case '\x0D':
|
|
case ' ':
|
|
case '/':
|
|
case '=':
|
|
case '>':
|
|
return pos_ != begin;
|
|
case ':':
|
|
*nameColon = pos_;
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
int XmlReader::scanNamespaceIri(char const * begin, char const * end) {
|
|
assert(begin != nullptr && begin <= end);
|
|
Span iri(handleAttributeValue(begin, end, false));
|
|
for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) {
|
|
if (namespaceIris_[i] == iri) {
|
|
return toNamespaceId(i);
|
|
}
|
|
}
|
|
return XmlReader::NAMESPACE_UNKNOWN;
|
|
}
|
|
|
|
char const * XmlReader::handleReference(char const * position, char const * end)
|
|
{
|
|
assert(position != nullptr && *position == '&' && position < end);
|
|
++position;
|
|
if (*position == '#') {
|
|
++position;
|
|
sal_uInt32 val = 0;
|
|
char const * p;
|
|
if (*position == 'x') {
|
|
++position;
|
|
p = position;
|
|
for (;; ++position) {
|
|
char c = *position;
|
|
if (c >= '0' && c <= '9') {
|
|
val = 16 * val + (c - '0');
|
|
} else if (c >= 'A' && c <= 'F') {
|
|
val = 16 * val + (c - 'A') + 10;
|
|
} else if (c >= 'a' && c <= 'f') {
|
|
val = 16 * val + (c - 'a') + 10;
|
|
} else {
|
|
break;
|
|
}
|
|
if (!rtl::isUnicodeCodePoint(val)) { // avoid overflow
|
|
throw css::uno::RuntimeException(
|
|
"'&#x...' too large in " + fileUrl_ );
|
|
}
|
|
}
|
|
} else {
|
|
p = position;
|
|
for (;; ++position) {
|
|
char c = *position;
|
|
if (c >= '0' && c <= '9') {
|
|
val = 10 * val + (c - '0');
|
|
} else {
|
|
break;
|
|
}
|
|
if (!rtl::isUnicodeCodePoint(val)) { // avoid overflow
|
|
throw css::uno::RuntimeException(
|
|
"'&#...' too large in " + fileUrl_ );
|
|
}
|
|
}
|
|
}
|
|
if (position == p || *position++ != ';') {
|
|
throw css::uno::RuntimeException(
|
|
"'&#...' missing ';' in " + fileUrl_ );
|
|
}
|
|
assert(rtl::isUnicodeCodePoint(val));
|
|
if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) ||
|
|
(val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF)
|
|
{
|
|
throw css::uno::RuntimeException(
|
|
"character reference denoting invalid character in " + fileUrl_ );
|
|
}
|
|
char buf[4];
|
|
sal_Int32 len;
|
|
if (val < 0x80) {
|
|
buf[0] = static_cast< char >(val);
|
|
len = 1;
|
|
} else if (val < 0x800) {
|
|
buf[0] = static_cast< char >((val >> 6) | 0xC0);
|
|
buf[1] = static_cast< char >((val & 0x3F) | 0x80);
|
|
len = 2;
|
|
} else if (val < 0x10000) {
|
|
buf[0] = static_cast< char >((val >> 12) | 0xE0);
|
|
buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
|
|
buf[2] = static_cast< char >((val & 0x3F) | 0x80);
|
|
len = 3;
|
|
} else {
|
|
buf[0] = static_cast< char >((val >> 18) | 0xF0);
|
|
buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80);
|
|
buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
|
|
buf[3] = static_cast< char >((val & 0x3F) | 0x80);
|
|
len = 4;
|
|
}
|
|
pad_.addEphemeral(buf, len);
|
|
return position;
|
|
} else {
|
|
struct EntityRef {
|
|
char const * inBegin;
|
|
sal_Int32 const inLength;
|
|
char const * outBegin;
|
|
sal_Int32 const outLength;
|
|
};
|
|
static EntityRef const refs[] = {
|
|
{ RTL_CONSTASCII_STRINGPARAM("amp;"),
|
|
RTL_CONSTASCII_STRINGPARAM("&") },
|
|
{ RTL_CONSTASCII_STRINGPARAM("lt;"),
|
|
RTL_CONSTASCII_STRINGPARAM("<") },
|
|
{ RTL_CONSTASCII_STRINGPARAM("gt;"),
|
|
RTL_CONSTASCII_STRINGPARAM(">") },
|
|
{ RTL_CONSTASCII_STRINGPARAM("apos;"),
|
|
RTL_CONSTASCII_STRINGPARAM("'") },
|
|
{ RTL_CONSTASCII_STRINGPARAM("quot;"),
|
|
RTL_CONSTASCII_STRINGPARAM("\"") } };
|
|
for (const auto & ref : refs) {
|
|
if (rtl_str_shortenedCompare_WithLength(
|
|
position, end - position, ref.inBegin, ref.inLength,
|
|
ref.inLength) ==
|
|
0)
|
|
{
|
|
position += ref.inLength;
|
|
pad_.add(ref.outBegin, ref.outLength);
|
|
return position;
|
|
}
|
|
}
|
|
throw css::uno::RuntimeException(
|
|
"unknown entity reference in " + fileUrl_ );
|
|
}
|
|
}
|
|
|
|
// Builds the normalized form of the attribute value [begin, end) in pad_ and
// returns it.
//
// In both modes, references are expanded via handleReference.  Without
// fullyNormalize, each TAB, LF, lone CR, and CR LF pair is replaced by a
// single space.  With fullyNormalize, leading and trailing whitespace is
// additionally stripped and every run of whitespace is collapsed into a
// single space.
Span XmlReader::handleAttributeValue(
    char const * begin, char const * end, bool fullyNormalize)
{
    pad_.clear();
    if (fullyNormalize) {
        // Strip leading and trailing whitespace up front:
        while (begin != end && isSpace(*begin)) {
            ++begin;
        }
        while (end != begin && isSpace(end[-1])) {
            --end;
        }
        char const * p = begin;
        enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
            // a single true space character can go into the current span,
            // everything else breaks the span
        Space space = SPACE_NONE;
        while (p != end) {
            switch (*p) {
            case '\x09':
            case '\x0A':
            case '\x0D':
                switch (space) {
                case SPACE_NONE:
                    // First whitespace after text: flush the text and emit
                    // the one space that stands for the whole run.
                    pad_.add(begin, p - begin);
                    pad_.add(" ");
                    space = SPACE_BREAK;
                    break;
                case SPACE_SPAN:
                    // The span already ends in a true space; flush it as is.
                    pad_.add(begin, p - begin);
                    space = SPACE_BREAK;
                    break;
                case SPACE_BREAK:
                    break;
                }
                begin = ++p;
                break;
            case ' ':
                switch (space) {
                case SPACE_NONE:
                    ++p;
                    space = SPACE_SPAN;
                    break;
                case SPACE_SPAN:
                    // Second consecutive space: flush the span including the
                    // first space, and drop this one.
                    pad_.add(begin, p - begin);
                    begin = ++p;
                    space = SPACE_BREAK;
                    break;
                case SPACE_BREAK:
                    begin = ++p;
                    break;
                }
                break;
            case '&':
                pad_.add(begin, p - begin);
                p = handleReference(p, end);
                begin = p;
                space = SPACE_NONE;
                break;
            default:
                ++p;
                space = SPACE_NONE;
                break;
            }
        }
        pad_.add(begin, p - begin);
    } else {
        char const * p = begin;
        while (p != end) {
            switch (*p) {
            case '\x09':
            case '\x0A':
                pad_.add(begin, p - begin);
                begin = ++p;
                pad_.add(" ");
                break;
            case '\x0D':
                pad_.add(begin, p - begin);
                ++p;
                // Treat CR LF as one line end.  The look-ahead must inspect
                // the value being scanned (p), not the reader's current input
                // position, which is unrelated to this value:
                if (p != end && *p == '\x0A') {
                    ++p;
                }
                begin = p;
                pad_.add(" ");
                break;
            case '&':
                pad_.add(begin, p - begin);
                p = handleReference(p, end);
                begin = p;
                break;
            default:
                ++p;
                break;
            }
        }
        pad_.add(begin, p - begin);
    }
    return pad_.get();
}
|
|
|
|
// Parses a start tag (or empty-element tag) whose leading '<' has already
// been consumed.
//
// Namespace declarations ("xmlns" and "xmlns:prefix" attributes) are applied
// to the reader's namespace bindings; all other attributes are recorded in
// attributes_ for later retrieval.  The element is pushed onto elements_, its
// namespace ID and local name are stored in *nsId and *localName, and
// Result::Begin is returned.  Throws css::uno::RuntimeException on malformed
// input.
XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) {
    assert(nsId != nullptr && localName);
    char const * const tagBegin = pos_;
    char const * tagColon = nullptr;
    if (!scanName(&tagColon)) {
        throw css::uno::RuntimeException(
            "bad tag name in " + fileUrl_ );
    }
    char const * const tagEnd = pos_;
    // Remember how many bindings existed before this element, so that the
    // ones it adds can be dropped again when the element ends:
    NamespaceList::size_type const inherited = namespaces_.size();
    bool defaultNsDeclared = false;
    int defaultNs = NAMESPACE_NONE;
    attributes_.clear();
    for (;;) {
        char const * const beforeSpace = pos_;
        skipSpace();
        char const next = peek();
        if (next == '/' || next == '>') {
            break;
        }
        if (pos_ == beforeSpace) {
            throw css::uno::RuntimeException(
                "missing whitespace before attribute in " + fileUrl_ );
        }
        char const * const attrBegin = pos_;
        char const * attrColon = nullptr;
        if (!scanName(&attrColon)) {
            throw css::uno::RuntimeException(
                "bad attribute name in " + fileUrl_ );
        }
        char const * const attrEnd = pos_;
        skipSpace();
        if (read() != '=') {
            throw css::uno::RuntimeException(
                "missing '=' in " + fileUrl_ );
        }
        skipSpace();
        char const quote = read();
        if (quote != '\'' && quote != '"') {
            throw css::uno::RuntimeException(
                "bad attribute value in " + fileUrl_ );
        }
        char const * const valBegin = pos_;
        sal_Int32 const valLength
            = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, quote);
        if (valLength < 0) {
            throw css::uno::RuntimeException(
                "unterminated attribute value in " + fileUrl_ );
        }
        char const * const valEnd = valBegin + valLength;
        pos_ += valLength + 1; // step over the value and its closing quote
        if (attrColon == nullptr) {
            if (Span(attrBegin, attrEnd - attrBegin) == "xmlns") {
                // xmlns="...": default namespace declaration
                defaultNsDeclared = true;
                defaultNs = scanNamespaceIri(valBegin, valEnd);
                continue;
            }
        } else if (Span(attrBegin, attrColon - attrBegin) == "xmlns") {
            // xmlns:prefix="...": prefix binding
            int const boundNs = scanNamespaceIri(valBegin, valEnd);
            namespaces_.emplace_back(
                Span(attrColon + 1, attrEnd - (attrColon + 1)), boundNs);
            continue;
        }
        // Ordinary attribute
        attributes_.emplace_back(
            attrBegin, attrEnd, attrColon, valBegin, valEnd);
    }
    // Without a declaration of its own, the element inherits the default
    // namespace of its parent (if any):
    if (!defaultNsDeclared && !elements_.empty()) {
        defaultNs = elements_.top().defaultNamespaceId;
    }
    firstAttribute_ = true;
    if (peek() != '/') {
        state_ = State::Content;
    } else {
        state_ = State::EmptyElementTag;
        ++pos_;
    }
    if (peek() != '>') {
        throw css::uno::RuntimeException(
            "missing '>' in " + fileUrl_ );
    }
    ++pos_;
    elements_.push(
        ElementData(
            Span(tagBegin, tagEnd - tagBegin), inherited, defaultNs));
    if (tagColon != nullptr) {
        *nsId = getNamespaceId(Span(tagBegin, tagColon - tagBegin));
        *localName = Span(tagColon + 1, tagEnd - (tagColon + 1));
    } else {
        *nsId = defaultNs;
        *localName = Span(tagBegin, tagEnd - tagBegin);
    }
    return Result::Begin;
}
|
|
|
|
// Parses an end tag whose leading "</" has already been consumed, checks that
// its name matches the innermost open element, closes that element, and
// returns Result::End.  Throws css::uno::RuntimeException if no element is
// open, the names differ, or the closing '>' is missing.
XmlReader::Result XmlReader::handleEndTag() {
    if (elements_.empty()) {
        throw css::uno::RuntimeException(
            "spurious end tag in " + fileUrl_ );
    }
    char const * const start = pos_;
    char const * colon = nullptr;
    bool const matchesOpenElement
        = scanName(&colon)
          && elements_.top().name.equals(start, pos_ - start);
    if (!matchesOpenElement) {
        throw css::uno::RuntimeException(
            "tag mismatch in " + fileUrl_ );
    }
    handleElementEnd();
    skipSpace();
    if (peek() != '>') {
        throw css::uno::RuntimeException(
            "missing '>' in " + fileUrl_ );
    }
    ++pos_;
    return Result::End;
}
|
|
|
|
// Closes the innermost open element: drops the namespace bindings it added,
// pops it, and switches to Done if it was the outermost element, or back to
// Content otherwise.
void XmlReader::handleElementEnd() {
    assert(!elements_.empty());
    namespaces_.resize(elements_.top().inheritedNamespaces);
    elements_.pop();
    if (elements_.empty()) {
        state_ = State::Done;
    } else {
        state_ = State::Content;
    }
}
|
|
|
|
// Skips over text, comments, CDATA sections, document type declarations and
// processing instructions until the next start or end tag, and handles that
// tag.  Throws css::uno::RuntimeException if the input ends first.
XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) {
    for (;;) {
        auto const lt = static_cast<const char*>(std::memchr(pos_, '<', end_ - pos_));
        if (lt == nullptr) {
            throw css::uno::RuntimeException(
                "premature end of " + fileUrl_ );
        }
        pos_ = lt + 1;
        char const kind = peek();
        if (kind == '/') {
            ++pos_;
            return handleEndTag();
        }
        if (kind == '!') {
            ++pos_;
            // "<!" introduces a comment, a CDATA section, or else (by
            // elimination) a document type declaration:
            if (!skipComment() && !scanCdataSection().is()) {
                skipDocumentTypeDeclaration();
            }
            continue;
        }
        if (kind == '?') {
            ++pos_;
            skipProcessingInstruction();
            continue;
        }
        return handleStartTag(nsId, data);
    }
}
|
|
|
|
// Collects the text up to the next start or end tag, without whitespace
// normalization: only line ends are normalized (CR and CR LF become LF) and
// references are expanded.  Comments, processing instructions and document
// type declarations are dropped; CDATA content is included.
//
// Stores the text in *text, sets the state to StartTag or EndTag according to
// the tag that follows, and returns Result::Text.  Throws
// css::uno::RuntimeException if the input ends first.
XmlReader::Result XmlReader::handleRawText(Span * text) {
    pad_.clear();
    // chunk marks the start of the text not yet copied into pad_.
    char const * chunk = pos_;
    for (;;) {
        char const c = peek();
        if (c == '\0') { // i.e., EOF
            throw css::uno::RuntimeException(
                "premature end of " + fileUrl_ );
        }
        if (c == '\x0D') {
            pad_.add(chunk, pos_ - chunk);
            ++pos_;
            // A CR followed by LF is simply dropped (the LF is copied as part
            // of the next chunk); a lone CR is replaced by LF.
            if (peek() != '\x0A') {
                pad_.add("\x0A");
            }
            chunk = pos_;
            continue;
        }
        if (c == '&') {
            pad_.add(chunk, pos_ - chunk);
            pos_ = handleReference(pos_, end_);
            chunk = pos_;
            continue;
        }
        if (c != '<') {
            ++pos_;
            continue;
        }
        // Markup: flush the pending text, then look at what follows the '<'.
        pad_.add(chunk, pos_ - chunk);
        ++pos_;
        char const kind = peek();
        if (kind == '!') {
            ++pos_;
            if (!skipComment()) {
                Span cdata(scanCdataSection());
                if (cdata.is()) {
                    normalizeLineEnds(cdata);
                } else {
                    skipDocumentTypeDeclaration();
                }
            }
            chunk = pos_;
        } else if (kind == '/') {
            *text = pad_.get();
            ++pos_;
            state_ = State::EndTag;
            return Result::Text;
        } else if (kind == '?') {
            ++pos_;
            skipProcessingInstruction();
            chunk = pos_;
        } else {
            *text = pad_.get();
            state_ = State::StartTag;
            return Result::Text;
        }
    }
}
|
|
|
|
// Collects the text up to the next start or end tag in whitespace-normalized
// form: leading and trailing whitespace is dropped and each inner run of
// whitespace becomes a single space.  References are expanded and CDATA
// content is included (with only its line ends normalized); comments and
// processing instructions act as whitespace breaks.
//
// Stores the text in *text, sets the state to StartTag or EndTag according to
// the tag that follows, and returns Result::Text.  Throws
// css::uno::RuntimeException if the input ends first.
XmlReader::Result XmlReader::handleNormalizedText(Span * text) {
    pad_.clear();
    // [flowBegin, flowEnd) is the pending run of input that can be copied to
    // pad_ verbatim; flowEnd is just past its last non-whitespace character.
    char const * flowBegin = pos_;
    char const * flowEnd = pos_;
    enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
        // a single true space character can go into the current flow,
        // everything else breaks the flow
    Space space = SPACE_START;
    for (;;) {
        switch (peek()) {
        case '\0': // i.e., EOF
            throw css::uno::RuntimeException(
                "premature end of " + fileUrl_ );
        case '\x09':
        case '\x0A':
        case '\x0D':
            switch (space) {
            case SPACE_START:
            case SPACE_BREAK:
                break;
            case SPACE_NONE:
            case SPACE_SPAN:
                space = SPACE_BREAK;
                break;
            }
            ++pos_;
            break;
        case ' ':
            switch (space) {
            case SPACE_START:
            case SPACE_BREAK:
                break;
            case SPACE_NONE:
                space = SPACE_SPAN;
                break;
            case SPACE_SPAN:
                space = SPACE_BREAK;
                break;
            }
            ++pos_;
            break;
        case '&':
            // Flush the pending flow (plus the separating space, if any)
            // before the replacement text of the reference is appended:
            switch (space) {
            case SPACE_START:
                break;
            case SPACE_NONE:
            case SPACE_SPAN:
                pad_.add(flowBegin, pos_ - flowBegin);
                break;
            case SPACE_BREAK:
                pad_.add(flowBegin, flowEnd - flowBegin);
                pad_.add(" ");
                break;
            }
            pos_ = handleReference(pos_, end_);
            flowBegin = pos_;
            flowEnd = pos_;
            space = SPACE_NONE;
            break;
        case '<':
            {
                // Remember where the markup starts, so that pending text can
                // be flushed up to (but excluding) it even after pos_ has
                // moved past the markup:
                char const * const markupBegin = pos_;
                ++pos_;
                switch (peek()) {
                case '!':
                    ++pos_;
                    if (skipComment()) {
                        space = SPACE_BREAK;
                    } else {
                        Span cdata(scanCdataSection());
                        if (cdata.is()) {
                            // CDATA is not normalized (similar to character
                            // references; it keeps the code simple), but it
                            // might arguably be better to normalize it:
                            switch (space) {
                            case SPACE_START:
                                break;
                            case SPACE_NONE:
                            case SPACE_SPAN:
                                // pos_ is already past the "]]>", so flush
                                // only up to the '<' that opened the section:
                                pad_.add(flowBegin, markupBegin - flowBegin);
                                break;
                            case SPACE_BREAK:
                                pad_.add(flowBegin, flowEnd - flowBegin);
                                pad_.add(" ");
                                break;
                            }
                            normalizeLineEnds(cdata);
                            flowBegin = pos_;
                            flowEnd = pos_;
                            space = SPACE_NONE;
                        } else {
                            skipDocumentTypeDeclaration();
                        }
                    }
                    break;
                case '/':
                    ++pos_;
                    pad_.add(flowBegin, flowEnd - flowBegin);
                    *text = pad_.get();
                    state_ = State::EndTag;
                    return Result::Text;
                case '?':
                    ++pos_;
                    skipProcessingInstruction();
                    space = SPACE_BREAK;
                    break;
                default:
                    pad_.add(flowBegin, flowEnd - flowBegin);
                    *text = pad_.get();
                    state_ = State::StartTag;
                    return Result::Text;
                }
            }
            break;
        default:
            switch (space) {
            case SPACE_START:
                flowBegin = pos_;
                break;
            case SPACE_NONE:
            case SPACE_SPAN:
                break;
            case SPACE_BREAK:
                // The flow was interrupted: flush it, emit the single
                // separating space, and start a new flow at this character.
                pad_.add(flowBegin, flowEnd - flowBegin);
                pad_.add(" ");
                flowBegin = pos_;
                break;
            }
            flowEnd = ++pos_;
            space = SPACE_NONE;
            break;
        }
    }
}
|
|
|
|
// Converts an index into the list of namespace IRIs to a namespace ID.  The
// index is asserted to fit into an int.
int XmlReader::toNamespaceId(NamespaceIris::size_type pos) {
    assert(pos <= INT_MAX);
    int const id = static_cast< int >(pos);
    return id;
}
|
|
|
|
}
|
|
|
|
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|