b854de954f
The intent of this header has changed over time. It is now systematically included with ustring.hxx, and the operator overloads it provides fit nicely there. Just to be safe, since that include was added to the API during the 3.5 timeframe and is therefore already in 'production', the header remains and simply attempts to include ustring.hxx, but a warning is issued indicating that this header should not be used anymore. In a couple of major releases we will then remove it completely. All internal users of that header are converted. Change-Id: I8934c55f089e29d78c0f5649b7c87b2ecf024bad Reviewed-on: https://gerrit.libreoffice.org/634 Tested-by: Norbert Thiebaud <nthiebaud@gmail.com> Reviewed-by: Norbert Thiebaud <nthiebaud@gmail.com>
1058 lines
35 KiB
C++
1058 lines
35 KiB
C++
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
|
|
/*
|
|
* This file is part of the LibreOffice project.
|
|
*
|
|
* This Source Code Form is subject to the terms of the Mozilla Public
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
*
|
|
* This file incorporates work covered by the following license notice:
|
|
*
|
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
* contributor license agreements. See the NOTICE file distributed
|
|
* with this work for additional information regarding copyright
|
|
* ownership. The ASF licenses this file to you under the Apache
|
|
* License, Version 2.0 (the "License"); you may not use this file
|
|
* except in compliance with the License. You may obtain a copy of
|
|
* the License at http://www.apache.org/licenses/LICENSE-2.0 .
|
|
*/
|
|
|
|
#include "sal/config.h"
|
|
|
|
#include <cassert>
|
|
#include <climits>
|
|
#include <cstddef>
|
|
|
|
#include "com/sun/star/container/NoSuchElementException.hpp"
|
|
#include "com/sun/star/uno/Reference.hxx"
|
|
#include "com/sun/star/uno/RuntimeException.hpp"
|
|
#include "com/sun/star/uno/XInterface.hpp"
|
|
#include "osl/file.h"
|
|
#include "rtl/string.h"
|
|
#include "rtl/ustring.h"
|
|
#include "rtl/ustring.hxx"
|
|
#include "sal/log.hxx"
|
|
#include "sal/types.h"
|
|
#include "xmlreader/pad.hxx"
|
|
#include "xmlreader/span.hxx"
|
|
#include "xmlreader/xmlreader.hxx"
|
|
|
|
namespace xmlreader {
|
|
|
|
namespace {
|
|
|
|
namespace css = com::sun::star;
|
|
|
|
// Returns true iff c is one of the four characters treated as white space
// here: TAB (0x09), LF (0x0A), CR (0x0D), or SPACE.
bool isSpace(char c) {
    return c == '\x09' || c == '\x0A' || c == '\x0D' || c == ' ';
}
|
|
|
|
}
|
|
|
|
// Opens the file denoted by fileUrl for reading, maps it into memory, and
// puts the reader into its initial state (positioned at the start of the
// mapped data, in content state, with the predefined "xml" prefix bound to
// NAMESPACE_XML).
//
// Throws css::container::NoSuchElementException if opening fails with
// osl_File_E_NOENT, and css::uno::RuntimeException if opening fails for any
// other reason or if the file size cannot be obtained or the file cannot be
// mapped (in which case the file handle is closed again before throwing).
XmlReader::XmlReader(rtl::OUString const & fileUrl)
    SAL_THROW((
        css::container::NoSuchElementException, css::uno::RuntimeException)):
    fileUrl_(fileUrl)
{
    oslFileError const openError = osl_openFile(
        fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read);
    if (openError == osl_File_E_NOENT) {
        throw css::container::NoSuchElementException(
            fileUrl_, css::uno::Reference< css::uno::XInterface >());
    }
    if (openError != osl_File_E_None) {
        throw css::uno::RuntimeException(
            (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot open ")) +
             fileUrl_ + rtl::OUString(RTL_CONSTASCII_USTRINGPARAM(": ")) +
             rtl::OUString::valueOf(static_cast< sal_Int32 >(openError))),
            css::uno::Reference< css::uno::XInterface >());
    }

    // Determine the size first; mapping is only attempted when that worked:
    oslFileError mapError = osl_getFileSize(fileHandle_, &fileSize_);
    if (mapError == osl_File_E_None) {
        mapError = osl_mapFile(
            fileHandle_, &fileAddress_, fileSize_, 0,
            osl_File_MapFlag_WillNeed);
    }
    if (mapError != osl_File_E_None) {
        // Do not leave the handle open; a failure to close is only logged so
        // that the original error is the one that gets reported:
        oslFileError const closeError = osl_closeFile(fileHandle_);
        if (closeError != osl_File_E_None) {
            SAL_WARN(
                "xmlreader",
                "osl_closeFile of \"" << fileUrl_ << "\" failed with "
                    << +closeError);
        }
        throw css::uno::RuntimeException(
            ("cannot mmap " + fileUrl_ + " (" +
             rtl::OUString::valueOf(static_cast< sal_Int32 >(mapError)) + ")"),
            css::uno::Reference< css::uno::XInterface >());
    }

    // The XML namespace and its "xml" prefix are always available:
    namespaceIris_.push_back(
        Span(
            RTL_CONSTASCII_STRINGPARAM(
                "http://www.w3.org/XML/1998/namespace")));
    namespaces_.push_back(
        NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xml")), NAMESPACE_XML));

    pos_ = static_cast< char * >(fileAddress_);
    end_ = pos_ + fileSize_;
    state_ = STATE_CONTENT;
    firstAttribute_ = true;
}
|
|
|
|
// Unmaps the file and closes its handle.  Failures of either step are only
// logged as warnings; the close is attempted regardless of whether the unmap
// succeeded.
XmlReader::~XmlReader() {
    oslFileError const unmapError = osl_unmapMappedFile(
        fileHandle_, fileAddress_, fileSize_);
    if (unmapError != osl_File_E_None) {
        SAL_WARN(
            "xmlreader",
            "osl_unmapMappedFile of \"" << fileUrl_ << "\" failed with "
                << +unmapError);
    }
    oslFileError const closeError = osl_closeFile(fileHandle_);
    if (closeError != osl_File_E_None) {
        SAL_WARN(
            "xmlreader",
            "osl_closeFile of \"" << fileUrl_ << "\" failed with "
                << +closeError);
    }
}
|
|
|
|
// Appends iri to the list of known namespace IRIs and returns the namespace
// ID assigned to it (derived from its position in that list).
//
// As a special case, registering the XML Schema instance namespace also binds
// the prefix "xsi" to the new ID.
int XmlReader::registerNamespaceIri(Span const & iri) {
    int const newId = toNamespaceId(namespaceIris_.size());
    namespaceIris_.push_back(iri);
    Span const xsiIri(
        RTL_CONSTASCII_STRINGPARAM(
            "http://www.w3.org/2001/XMLSchema-instance"));
    if (!iri.equals(xsiIri)) {
        return newId;
    }
    // Old user layer .xcu files used the xsi namespace prefix without
    // declaring a corresponding namespace binding, see issue 77174; reading
    // those files during migration would fail without this hack that can be
    // removed once migration is no longer relevant (see
    // configmgr::Components::parseModificationLayer):
    namespaces_.push_back(
        NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xsi")), newId));
    return newId;
}
|
|
|
|
// Advances the reader to the next item and reports what was found.
//
// What happens depends on the current state: in content state, reportText
// selects whether text is skipped, returned raw, or returned normalized; in
// the tag states the pending start tag, end tag, or end of an empty-element
// tag is processed.  Once the reader is done, RESULT_DONE is returned.
XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId)
{
    if (state_ == STATE_CONTENT) {
        if (reportText == TEXT_NONE) {
            return handleSkippedText(data, nsId);
        }
        if (reportText == TEXT_RAW) {
            return handleRawText(data);
        }
        if (reportText == TEXT_NORMALIZED) {
            return handleNormalizedText(data);
        }
        // Any other value of reportText ends up in the start tag handling:
        return handleStartTag(nsId, data);
    }
    if (state_ == STATE_START_TAG) {
        return handleStartTag(nsId, data);
    }
    if (state_ == STATE_END_TAG) {
        return handleEndTag();
    }
    if (state_ == STATE_EMPTY_ELEMENT_TAG) {
        handleElementEnd();
        return RESULT_END;
    }
    // STATE_DONE
    return RESULT_DONE;
}
|
|
|
|
bool XmlReader::nextAttribute(int * nsId, Span * localName) {
|
|
assert(nsId != 0 && localName != 0);
|
|
if (firstAttribute_) {
|
|
currentAttribute_ = attributes_.begin();
|
|
firstAttribute_ = false;
|
|
} else {
|
|
++currentAttribute_;
|
|
}
|
|
if (currentAttribute_ == attributes_.end()) {
|
|
return false;
|
|
}
|
|
if (currentAttribute_->nameColon == 0) {
|
|
*nsId = NAMESPACE_NONE;
|
|
*localName = Span(
|
|
currentAttribute_->nameBegin,
|
|
currentAttribute_->nameEnd - currentAttribute_->nameBegin);
|
|
} else {
|
|
*nsId = getNamespaceId(
|
|
Span(
|
|
currentAttribute_->nameBegin,
|
|
currentAttribute_->nameColon - currentAttribute_->nameBegin));
|
|
*localName = Span(
|
|
currentAttribute_->nameColon + 1,
|
|
currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1));
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Returns the processed value of the attribute that nextAttribute last moved
// to; fullyNormalize is passed on to handleAttributeValue.
Span XmlReader::getAttributeValue(bool fullyNormalize) {
    char const * const first = currentAttribute_->valueBegin;
    char const * const last = currentAttribute_->valueEnd;
    return handleAttributeValue(first, last, fullyNormalize);
}
|
|
|
|
// Looks up the namespace ID bound to prefix.  Bindings are searched from the
// most recently added one backwards, so later bindings take precedence.
// Returns NAMESPACE_UNKNOWN if the prefix is not bound.
int XmlReader::getNamespaceId(Span const & prefix) const {
    NamespaceList::const_reverse_iterator it(namespaces_.rbegin());
    NamespaceList::const_reverse_iterator const stop(namespaces_.rend());
    while (it != stop) {
        if (prefix.equals(it->prefix)) {
            return it->nsId;
        }
        ++it;
    }
    return NAMESPACE_UNKNOWN;
}
|
|
|
|
// Returns the URL this reader was constructed with.
rtl::OUString XmlReader::getUrl() const {
    return fileUrl_;
}
|
|
|
|
// Appends text to the pad with line ends normalized: every CR that is not
// directly followed by LF is replaced with LF, and the CR of a CR LF pair is
// dropped.
void XmlReader::normalizeLineEnds(Span const & text) {
    char const * cursor = text.begin;
    sal_Int32 remaining = text.length;
    sal_Int32 cr = rtl_str_indexOfChar_WithLength(cursor, remaining, '\x0D');
    while (cr >= 0) {
        // copy what precedes the CR, then step over the CR itself
        pad_.add(cursor, cr);
        cursor += cr + 1;
        remaining -= cr + 1;
        bool const lfFollows = remaining != 0 && *cursor == '\x0A';
        if (!lfFollows) {
            pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A"));
        }
        cr = rtl_str_indexOfChar_WithLength(cursor, remaining, '\x0D');
    }
    pad_.add(cursor, remaining);
}
|
|
|
|
void XmlReader::skipSpace() {
|
|
while (isSpace(peek())) {
|
|
++pos_;
|
|
}
|
|
}
|
|
|
|
bool XmlReader::skipComment() {
|
|
if (rtl_str_shortenedCompare_WithLength(
|
|
pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"),
|
|
RTL_CONSTASCII_LENGTH("--")) !=
|
|
0)
|
|
{
|
|
return false;
|
|
}
|
|
pos_ += RTL_CONSTASCII_LENGTH("--");
|
|
sal_Int32 i = rtl_str_indexOfStr_WithLength(
|
|
pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"));
|
|
if (i < 0) {
|
|
throw css::uno::RuntimeException(
|
|
(rtl::OUString(
|
|
RTL_CONSTASCII_USTRINGPARAM(
|
|
"premature end (within comment) of ")) +
|
|
fileUrl_),
|
|
css::uno::Reference< css::uno::XInterface >());
|
|
}
|
|
pos_ += i + RTL_CONSTASCII_LENGTH("--");
|
|
if (read() != '>') {
|
|
throw css::uno::RuntimeException(
|
|
(rtl::OUString(
|
|
RTL_CONSTASCII_USTRINGPARAM(
|
|
"illegal \"--\" within comment in ")) +
|
|
fileUrl_),
|
|
css::uno::Reference< css::uno::XInterface >());
|
|
}
|
|
return true;
|
|
}
|
|
|
|
void XmlReader::skipProcessingInstruction() {
|
|
sal_Int32 i = rtl_str_indexOfStr_WithLength(
|
|
pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>"));
|
|
if (i < 0) {
|
|
throw css::uno::RuntimeException(
|
|
(rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad '<?' in ")) +
|
|
fileUrl_),
|
|
css::uno::Reference< css::uno::XInterface >());
|
|
}
|
|
pos_ += i + RTL_CONSTASCII_LENGTH("?>");
|
|
}
|
|
|
|
void XmlReader::skipDocumentTypeDeclaration() {
|
|
// Neither is it checked that the doctypedecl is at the correct position in
|
|
// the document, nor that it is well-formed:
|
|
for (;;) {
|
|
char c = read();
|
|
switch (c) {
|
|
case '\0': // i.e., EOF
|
|
throw css::uno::RuntimeException(
|
|
(rtl::OUString(
|
|
RTL_CONSTASCII_USTRINGPARAM(
|
|
"premature end (within DTD) of ")) +
|
|
fileUrl_),
|
|
css::uno::Reference< css::uno::XInterface >());
|
|
case '"':
|
|
case '\'':
|
|
{
|
|
sal_Int32 i = rtl_str_indexOfChar_WithLength(
|
|
pos_, end_ - pos_, c);
|
|
if (i < 0) {
|
|
throw css::uno::RuntimeException(
|
|
(rtl::OUString(
|
|
RTL_CONSTASCII_USTRINGPARAM(
|
|
"premature end (within DTD) of ")) +
|
|
fileUrl_),
|
|
css::uno::Reference< css::uno::XInterface >());
|
|
}
|
|
pos_ += i + 1;
|
|
}
|
|
break;
|
|
case '>':
|
|
return;
|
|
case '[':
|
|
for (;;) {
|
|
c = read();
|
|
switch (c) {
|
|
case '\0': // i.e., EOF
|
|
throw css::uno::RuntimeException(
|
|
(rtl::OUString(
|
|
RTL_CONSTASCII_USTRINGPARAM(
|
|
"premature end (within DTD) of ")) +
|
|
fileUrl_),
|
|
css::uno::Reference< css::uno::XInterface >());
|
|
case '"':
|
|
case '\'':
|
|
{
|
|
sal_Int32 i = rtl_str_indexOfChar_WithLength(
|
|
pos_, end_ - pos_, c);
|
|
if (i < 0) {
|
|
throw css::uno::RuntimeException(
|
|
(rtl::OUString(
|
|
RTL_CONSTASCII_USTRINGPARAM(
|
|
"premature end (within DTD) of ")) +
|
|
fileUrl_),
|
|
css::uno::Reference< css::uno::XInterface >());
|
|
}
|
|
pos_ += i + 1;
|
|
}
|
|
break;
|
|
case '<':
|
|
switch (read()) {
|
|
case '\0': // i.e., EOF
|
|
throw css::uno::RuntimeException(
|
|
(rtl::OUString(
|
|
RTL_CONSTASCII_USTRINGPARAM(
|
|
"premature end (within DTD) of ")) +
|
|
fileUrl_),
|
|
css::uno::Reference< css::uno::XInterface >());
|
|
case '!':
|
|
skipComment();
|
|
break;
|
|
case '?':
|
|
skipProcessingInstruction();
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
break;
|
|
case ']':
|
|
skipSpace();
|
|
if (read() != '>') {
|
|
throw css::uno::RuntimeException(
|
|
(rtl::OUString(
|
|
RTL_CONSTASCII_USTRINGPARAM(
|
|
"missing \">\" of DTD in ")) +
|
|
fileUrl_),
|
|
css::uno::Reference< css::uno::XInterface >());
|
|
}
|
|
return;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Tries to scan a CDATA section whose leading "<!" has already been consumed.
//
// Returns a default-constructed Span, without moving the read position, if
// the input does not continue with "[CDATA[".  Otherwise returns the span of
// the section's content and leaves the read position after the closing "]]>".
// Throws css::uno::RuntimeException if the section is not terminated.
Span XmlReader::scanCdataSection() {
    sal_Int32 const openLength = RTL_CONSTASCII_LENGTH("[CDATA[");
    if (rtl_str_shortenedCompare_WithLength(
            pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["),
            openLength) !=
        0)
    {
        return Span();
    }
    pos_ += openLength;
    char const * const content = pos_;
    sal_Int32 const contentLength = rtl_str_indexOfStr_WithLength(
        pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>"));
    if (contentLength < 0) {
        throw css::uno::RuntimeException(
            (rtl::OUString(
                RTL_CONSTASCII_USTRINGPARAM(
                    "premature end (within CDATA section) of ")) +
             fileUrl_),
            css::uno::Reference< css::uno::XInterface >());
    }
    pos_ += contentLength + RTL_CONSTASCII_LENGTH("]]>");
    return Span(content, contentLength);
}
|
|
|
|
bool XmlReader::scanName(char const ** nameColon) {
|
|
assert(nameColon != 0 && *nameColon == 0);
|
|
for (char const * begin = pos_;; ++pos_) {
|
|
switch (peek()) {
|
|
case '\0': // i.e., EOF
|
|
case '\x09':
|
|
case '\x0A':
|
|
case '\x0D':
|
|
case ' ':
|
|
case '/':
|
|
case '=':
|
|
case '>':
|
|
return pos_ != begin;
|
|
case ':':
|
|
*nameColon = pos_;
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
int XmlReader::scanNamespaceIri(char const * begin, char const * end) {
|
|
assert(begin != 0 && begin <= end);
|
|
Span iri(handleAttributeValue(begin, end, false));
|
|
for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) {
|
|
if (namespaceIris_[i].equals(iri)) {
|
|
return toNamespaceId(i);
|
|
}
|
|
}
|
|
return XmlReader::NAMESPACE_UNKNOWN;
|
|
}
|
|
|
|
char const * XmlReader::handleReference(char const * position, char const * end)
|
|
{
|
|
assert(position != 0 && *position == '&' && position < end);
|
|
++position;
|
|
if (*position == '#') {
|
|
++position;
|
|
sal_Int32 val = 0;
|
|
char const * p;
|
|
if (*position == 'x') {
|
|
++position;
|
|
p = position;
|
|
for (;; ++position) {
|
|
char c = *position;
|
|
if (c >= '0' && c <= '9') {
|
|
val = 16 * val + (c - '0');
|
|
} else if (c >= 'A' && c <= 'F') {
|
|
val = 16 * val + (c - 'A') + 10;
|
|
} else if (c >= 'a' && c <= 'f') {
|
|
val = 16 * val + (c - 'a') + 10;
|
|
} else {
|
|
break;
|
|
}
|
|
if (val > 0x10FFFF) { // avoid overflow
|
|
throw css::uno::RuntimeException(
|
|
(rtl::OUString(
|
|
RTL_CONSTASCII_USTRINGPARAM(
|
|
"'&#x...' too large in ")) +
|
|
fileUrl_),
|
|
css::uno::Reference< css::uno::XInterface >());
|
|
}
|
|
}
|
|
} else {
|
|
p = position;
|
|
for (;; ++position) {
|
|
char c = *position;
|
|
if (c >= '0' && c <= '9') {
|
|
val = 10 * val + (c - '0');
|
|
} else {
|
|
break;
|
|
}
|
|
if (val > 0x10FFFF) { // avoid overflow
|
|
throw css::uno::RuntimeException(
|
|
(rtl::OUString(
|
|
RTL_CONSTASCII_USTRINGPARAM(
|
|
"'&#...' too large in ")) +
|
|
fileUrl_),
|
|
css::uno::Reference< css::uno::XInterface >());
|
|
}
|
|
}
|
|
}
|
|
if (position == p || *position++ != ';') {
|
|
throw css::uno::RuntimeException(
|
|
(rtl::OUString(
|
|
RTL_CONSTASCII_USTRINGPARAM("'&#...' missing ';' in ")) +
|
|
fileUrl_),
|
|
css::uno::Reference< css::uno::XInterface >());
|
|
}
|
|
assert(val >= 0 && val <= 0x10FFFF);
|
|
if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) ||
|
|
(val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF)
|
|
{
|
|
throw css::uno::RuntimeException(
|
|
(rtl::OUString(
|
|
RTL_CONSTASCII_USTRINGPARAM(
|
|
"character reference denoting invalid character in ")) +
|
|
fileUrl_),
|
|
css::uno::Reference< css::uno::XInterface >());
|
|
}
|
|
char buf[4];
|
|
sal_Int32 len;
|
|
if (val < 0x80) {
|
|
buf[0] = static_cast< char >(val);
|
|
len = 1;
|
|
} else if (val < 0x800) {
|
|
buf[0] = static_cast< char >((val >> 6) | 0xC0);
|
|
buf[1] = static_cast< char >((val & 0x3F) | 0x80);
|
|
len = 2;
|
|
} else if (val < 0x10000) {
|
|
buf[0] = static_cast< char >((val >> 12) | 0xE0);
|
|
buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
|
|
buf[2] = static_cast< char >((val & 0x3F) | 0x80);
|
|
len = 3;
|
|
} else {
|
|
buf[0] = static_cast< char >((val >> 18) | 0xF0);
|
|
buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80);
|
|
buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
|
|
buf[3] = static_cast< char >((val & 0x3F) | 0x80);
|
|
len = 4;
|
|
}
|
|
pad_.addEphemeral(buf, len);
|
|
return position;
|
|
} else {
|
|
struct EntityRef {
|
|
char const * inBegin;
|
|
sal_Int32 inLength;
|
|
char const * outBegin;
|
|
sal_Int32 outLength;
|
|
};
|
|
static EntityRef const refs[] = {
|
|
{ RTL_CONSTASCII_STRINGPARAM("amp;"),
|
|
RTL_CONSTASCII_STRINGPARAM("&") },
|
|
{ RTL_CONSTASCII_STRINGPARAM("lt;"),
|
|
RTL_CONSTASCII_STRINGPARAM("<") },
|
|
{ RTL_CONSTASCII_STRINGPARAM("gt;"),
|
|
RTL_CONSTASCII_STRINGPARAM(">") },
|
|
{ RTL_CONSTASCII_STRINGPARAM("apos;"),
|
|
RTL_CONSTASCII_STRINGPARAM("'") },
|
|
{ RTL_CONSTASCII_STRINGPARAM("quot;"),
|
|
RTL_CONSTASCII_STRINGPARAM("\"") } };
|
|
for (std::size_t i = 0; i < sizeof refs / sizeof refs[0]; ++i) {
|
|
if (rtl_str_shortenedCompare_WithLength(
|
|
position, end - position, refs[i].inBegin, refs[i].inLength,
|
|
refs[i].inLength) ==
|
|
0)
|
|
{
|
|
position += refs[i].inLength;
|
|
pad_.add(refs[i].outBegin, refs[i].outLength);
|
|
return position;
|
|
}
|
|
}
|
|
throw css::uno::RuntimeException(
|
|
(rtl::OUString(
|
|
RTL_CONSTASCII_USTRINGPARAM("unknown entity reference in ")) +
|
|
fileUrl_),
|
|
css::uno::Reference< css::uno::XInterface >());
|
|
}
|
|
}
|
|
|
|
// Processes the raw attribute value in [begin, end) into the pad and returns
// the result.
//
// In both modes, references are resolved via handleReference.  If
// fullyNormalize is true, leading and trailing white space is stripped and
// every inner run of white space is collapsed to a single space character.
// Otherwise, each TAB and LF is replaced with a space, and each CR (together
// with a directly following LF, if any) is replaced with a single space.
Span XmlReader::handleAttributeValue(
    char const * begin, char const * end, bool fullyNormalize)
{
    pad_.clear();
    if (fullyNormalize) {
        while (begin != end && isSpace(*begin)) {
            ++begin;
        }
        while (end != begin && isSpace(end[-1])) {
            --end;
        }
        char const * p = begin;
        enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
            // a single true space character can go into the current span,
            // everything else breaks the span
        Space space = SPACE_NONE;
        while (p != end) {
            switch (*p) {
            case '\x09':
            case '\x0A':
            case '\x0D':
                switch (space) {
                case SPACE_NONE:
                    pad_.add(begin, p - begin);
                    pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
                    space = SPACE_BREAK;
                    break;
                case SPACE_SPAN:
                    // the pending span already ends in a space character
                    pad_.add(begin, p - begin);
                    space = SPACE_BREAK;
                    break;
                case SPACE_BREAK:
                    break;
                }
                begin = ++p;
                break;
            case ' ':
                switch (space) {
                case SPACE_NONE:
                    ++p;
                    space = SPACE_SPAN;
                    break;
                case SPACE_SPAN:
                    pad_.add(begin, p - begin);
                    begin = ++p;
                    space = SPACE_BREAK;
                    break;
                case SPACE_BREAK:
                    begin = ++p;
                    break;
                }
                break;
            case '&':
                pad_.add(begin, p - begin);
                p = handleReference(p, end);
                begin = p;
                space = SPACE_NONE;
                break;
            default:
                ++p;
                space = SPACE_NONE;
                break;
            }
        }
        pad_.add(begin, p - begin);
    } else {
        char const * p = begin;
        while (p != end) {
            switch (*p) {
            case '\x09':
            case '\x0A':
                pad_.add(begin, p - begin);
                begin = ++p;
                pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
                break;
            case '\x0D':
                pad_.add(begin, p - begin);
                ++p;
                // A CR LF pair counts as one line end.  The character to look
                // at is the one following the CR within the value (at p), not
                // the one at the reader's current position, and it must not
                // be read when the CR is the last character of the value:
                if (p != end && *p == '\x0A') {
                    ++p;
                }
                begin = p;
                pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
                break;
            case '&':
                pad_.add(begin, p - begin);
                p = handleReference(p, end);
                begin = p;
                break;
            default:
                ++p;
                break;
            }
        }
        pad_.add(begin, p - begin);
    }
    return pad_.get();
}
|
|
|
|
// Parses a start tag (or empty-element tag) whose leading '<' has already
// been consumed.
//
// Collects the tag's attributes, records namespace declarations ("xmlns" and
// "xmlns:prefix" attributes are not reported as attributes), pushes a new
// element onto the element stack, and sets the follow-up state.  The
// element's namespace ID and local name are stored in *nsId and *localName,
// which must both be non-null.  Returns RESULT_BEGIN.  Throws
// css::uno::RuntimeException on malformed input.
XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) {
    assert(nsId != 0 && localName != 0);
    char const * const tagBegin = pos_;
    char const * tagColon = 0;
    if (!scanName(&tagColon)) {
        throw css::uno::RuntimeException(
            (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad tag name in ")) +
             fileUrl_),
            css::uno::Reference< css::uno::XInterface >());
    }
    char const * const tagEnd = pos_;
    // Bindings added by this tag are dropped again when the element ends, so
    // remember how many there were before:
    NamespaceList::size_type const inherited = namespaces_.size();
    bool ownDefaultNs = false;
    int defaultNs = NAMESPACE_NONE;
    attributes_.clear();
    for (;;) {
        char const * const beforeSpace = pos_;
        skipSpace();
        char const next = peek();
        if (next == '/' || next == '>') {
            break;
        }
        if (pos_ == beforeSpace) {
            throw css::uno::RuntimeException(
                (rtl::OUString(
                    RTL_CONSTASCII_USTRINGPARAM(
                        "missing whitespace before attribute in ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        }

        // attribute name:
        char const * const attrBegin = pos_;
        char const * attrColon = 0;
        if (!scanName(&attrColon)) {
            throw css::uno::RuntimeException(
                (rtl::OUString(
                    RTL_CONSTASCII_USTRINGPARAM("bad attribute name in ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        }
        char const * const attrEnd = pos_;

        // '=' with optional surrounding white space:
        skipSpace();
        if (read() != '=') {
            throw css::uno::RuntimeException(
                (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '=' in ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        }
        skipSpace();

        // quoted attribute value:
        char const quote = read();
        if (quote != '\'' && quote != '"') {
            throw css::uno::RuntimeException(
                (rtl::OUString(
                    RTL_CONSTASCII_USTRINGPARAM("bad attribute value in ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        }
        char const * const valueBegin = pos_;
        sal_Int32 const valueLength = rtl_str_indexOfChar_WithLength(
            pos_, end_ - pos_, quote);
        if (valueLength < 0) {
            throw css::uno::RuntimeException(
                (rtl::OUString(
                    RTL_CONSTASCII_USTRINGPARAM(
                        "unterminated attribute value in ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        }
        char const * const valueEnd = pos_ + valueLength;
        pos_ += valueLength + 1;

        // The part of the name that decides whether this is a namespace
        // declaration is the prefix if there is one, else the whole name:
        char const * const headEnd = attrColon == 0 ? attrEnd : attrColon;
        bool const isXmlns = Span(attrBegin, headEnd - attrBegin).equals(
            RTL_CONSTASCII_STRINGPARAM("xmlns"));
        if (!isXmlns) {
            attributes_.push_back(
                AttributeData(
                    attrBegin, attrEnd, attrColon, valueBegin, valueEnd));
        } else if (attrColon == 0) {
            ownDefaultNs = true;
            defaultNs = scanNamespaceIri(valueBegin, valueEnd);
        } else {
            namespaces_.push_back(
                NamespaceData(
                    Span(attrColon + 1, attrEnd - (attrColon + 1)),
                    scanNamespaceIri(valueBegin, valueEnd)));
        }
    }
    // Without a default namespace declaration of its own, the element
    // inherits the one of its parent:
    if (!ownDefaultNs && !elements_.empty()) {
        defaultNs = elements_.top().defaultNamespaceId;
    }
    firstAttribute_ = true;
    if (peek() != '/') {
        state_ = STATE_CONTENT;
    } else {
        state_ = STATE_EMPTY_ELEMENT_TAG;
        ++pos_;
    }
    if (peek() != '>') {
        throw css::uno::RuntimeException(
            (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) +
             fileUrl_),
            css::uno::Reference< css::uno::XInterface >());
    }
    ++pos_;
    Span const fullName(tagBegin, tagEnd - tagBegin);
    elements_.push(ElementData(fullName, inherited, defaultNs));
    if (tagColon != 0) {
        *nsId = getNamespaceId(Span(tagBegin, tagColon - tagBegin));
        *localName = Span(tagColon + 1, tagEnd - (tagColon + 1));
    } else {
        *nsId = defaultNs;
        *localName = fullName;
    }
    return RESULT_BEGIN;
}
|
|
|
|
// Parses an end tag whose leading "</" has already been consumed, checks that
// its name matches the innermost open element, and closes that element.
// Returns RESULT_END.  Throws css::uno::RuntimeException if no element is
// open, the names do not match, or the closing '>' is missing.
XmlReader::Result XmlReader::handleEndTag() {
    if (elements_.empty()) {
        throw css::uno::RuntimeException(
            (rtl::OUString(
                RTL_CONSTASCII_USTRINGPARAM("spurious end tag in ")) +
             fileUrl_),
            css::uno::Reference< css::uno::XInterface >());
    }
    char const * const start = pos_;
    char const * colon = 0; // required by scanName, not used otherwise
    bool const matches =
        scanName(&colon) && elements_.top().name.equals(start, pos_ - start);
    if (!matches) {
        throw css::uno::RuntimeException(
            (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("tag mismatch in ")) +
             fileUrl_),
            css::uno::Reference< css::uno::XInterface >());
    }
    handleElementEnd();
    skipSpace();
    if (peek() != '>') {
        throw css::uno::RuntimeException(
            (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) +
             fileUrl_),
            css::uno::Reference< css::uno::XInterface >());
    }
    ++pos_;
    return RESULT_END;
}
|
|
|
|
void XmlReader::handleElementEnd() {
|
|
assert(!elements_.empty());
|
|
namespaces_.resize(elements_.top().inheritedNamespaces);
|
|
elements_.pop();
|
|
state_ = elements_.empty() ? STATE_DONE : STATE_CONTENT;
|
|
}
|
|
|
|
// Skips content up to the next start or end tag without reporting any text.
//
// Comments, CDATA sections, document type declarations, and processing
// instructions encountered on the way are skipped as well.  Returns the
// result of handling the tag that was found; for a start tag, its local name
// and namespace ID are stored in *data and *nsId.  Throws
// css::uno::RuntimeException if the input ends before a tag is found.
XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) {
    for (;;) {
        sal_Int32 const lt = rtl_str_indexOfChar_WithLength(
            pos_, end_ - pos_, '<');
        if (lt < 0) {
            throw css::uno::RuntimeException(
                (rtl::OUString(
                    RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        }
        pos_ += lt + 1;
        char const next = peek();
        if (next == '/') {
            ++pos_;
            return handleEndTag();
        }
        if (next == '!') {
            ++pos_;
            // try the possible "<!" constructs in turn:
            if (skipComment()) {
                continue;
            }
            if (scanCdataSection().is()) {
                continue;
            }
            skipDocumentTypeDeclaration();
        } else if (next == '?') {
            ++pos_;
            skipProcessingInstruction();
        } else {
            return handleStartTag(nsId, data);
        }
    }
}
|
|
|
|
// Collects the text up to the next start or end tag and stores it in *text.
//
// Line ends are normalized to LF, references are resolved, the content of
// CDATA sections is included, and comments, document type declarations, and
// processing instructions are left out; white space is otherwise kept as is.
// Sets the state according to the kind of tag that ended the text and returns
// RESULT_TEXT.  Throws css::uno::RuntimeException if the input ends before a
// tag is found.
XmlReader::Result XmlReader::handleRawText(Span * text) {
    pad_.clear();
    char const * chunk = pos_; // start of the text not yet copied to the pad
    for (;;) {
        char const c = peek();
        if (c == '\0') { // i.e., EOF
            throw css::uno::RuntimeException(
                (rtl::OUString(
                    RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        }
        if (c == '\x0D') {
            pad_.add(chunk, pos_ - chunk);
            ++pos_;
            // a lone CR becomes LF; for CR LF, the LF is copied with the next
            // chunk
            if (peek() != '\x0A') {
                pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A"));
            }
            chunk = pos_;
        } else if (c == '&') {
            pad_.add(chunk, pos_ - chunk);
            pos_ = handleReference(pos_, end_);
            chunk = pos_;
        } else if (c == '<') {
            pad_.add(chunk, pos_ - chunk);
            ++pos_;
            switch (peek()) {
            case '!':
                ++pos_;
                if (!skipComment()) {
                    Span const cdata(scanCdataSection());
                    if (cdata.is()) {
                        normalizeLineEnds(cdata);
                    } else {
                        skipDocumentTypeDeclaration();
                    }
                }
                chunk = pos_;
                break;
            case '/':
                *text = pad_.get();
                ++pos_;
                state_ = STATE_END_TAG;
                return RESULT_TEXT;
            case '?':
                ++pos_;
                skipProcessingInstruction();
                chunk = pos_;
                break;
            default:
                *text = pad_.get();
                state_ = STATE_START_TAG;
                return RESULT_TEXT;
            }
        } else {
            ++pos_;
        }
    }
}
|
|
|
|
// Collects the text up to the next start or end tag in white-space-normalized
// form and stores it in *text.
//
// Leading and trailing white space is dropped and inner runs of white space
// are collapsed to a single space character.  Comments and processing
// instructions act as breaks in the text.  The replacement text of references
// and the content of CDATA sections are included without being normalized
// themselves (apart from line ends in CDATA sections).  Sets the state
// according to the kind of tag that ended the text and returns RESULT_TEXT.
// Throws css::uno::RuntimeException if the input ends before a tag is found.
//
// The text is assembled from "flows": [flowBegin, flowEnd) is the current run
// of source characters that can be copied to the pad verbatim, and space
// tracks what has been seen since flowEnd.
XmlReader::Result XmlReader::handleNormalizedText(Span * text) {
    pad_.clear();
    char const * flowBegin = pos_;
    char const * flowEnd = pos_;
    enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
        // a single true space character can go into the current flow,
        // everything else breaks the flow
    Space space = SPACE_START;
    for (;;) {
        switch (peek()) {
        case '\0': // i.e., EOF
            throw css::uno::RuntimeException(
                (rtl::OUString(
                    RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        case '\x09':
        case '\x0A':
        case '\x0D':
            switch (space) {
            case SPACE_START:
            case SPACE_BREAK:
                break;
            case SPACE_NONE:
            case SPACE_SPAN:
                space = SPACE_BREAK;
                break;
            }
            ++pos_;
            break;
        case ' ':
            switch (space) {
            case SPACE_START:
            case SPACE_BREAK:
                break;
            case SPACE_NONE:
                space = SPACE_SPAN;
                break;
            case SPACE_SPAN:
                space = SPACE_BREAK;
                break;
            }
            ++pos_;
            break;
        case '&':
            switch (space) {
            case SPACE_START:
                break;
            case SPACE_NONE:
            case SPACE_SPAN:
                pad_.add(flowBegin, pos_ - flowBegin);
                break;
            case SPACE_BREAK:
                pad_.add(flowBegin, flowEnd - flowBegin);
                pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
                break;
            }
            pos_ = handleReference(pos_, end_);
            flowBegin = pos_;
            flowEnd = pos_;
            space = SPACE_NONE;
            break;
        case '<':
            {
                // Remember where the markup starts; a flow pending in front
                // of a CDATA section must end here, not after the section:
                char const * const markupBegin = pos_;
                ++pos_;
                switch (peek()) {
                case '!':
                    ++pos_;
                    if (skipComment()) {
                        space = SPACE_BREAK;
                    } else {
                        Span cdata(scanCdataSection());
                        if (cdata.is()) {
                            // CDATA is not normalized (similar to character
                            // references; it keeps the code simple), but it
                            // might arguably be better to normalize it:
                            switch (space) {
                            case SPACE_START:
                                break;
                            case SPACE_NONE:
                            case SPACE_SPAN:
                                // pos_ is already past the CDATA section at
                                // this point, so the flow has to be cut at
                                // the '<' to keep the section's markup out of
                                // the text:
                                pad_.add(flowBegin, markupBegin - flowBegin);
                                break;
                            case SPACE_BREAK:
                                pad_.add(flowBegin, flowEnd - flowBegin);
                                pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
                                break;
                            }
                            normalizeLineEnds(cdata);
                            flowBegin = pos_;
                            flowEnd = pos_;
                            space = SPACE_NONE;
                        } else {
                            skipDocumentTypeDeclaration();
                        }
                    }
                    break;
                case '/':
                    ++pos_;
                    pad_.add(flowBegin, flowEnd - flowBegin);
                    *text = pad_.get();
                    state_ = STATE_END_TAG;
                    return RESULT_TEXT;
                case '?':
                    ++pos_;
                    skipProcessingInstruction();
                    space = SPACE_BREAK;
                    break;
                default:
                    pad_.add(flowBegin, flowEnd - flowBegin);
                    *text = pad_.get();
                    state_ = STATE_START_TAG;
                    return RESULT_TEXT;
                }
            }
            break;
        default:
            switch (space) {
            case SPACE_START:
                flowBegin = pos_;
                break;
            case SPACE_NONE:
            case SPACE_SPAN:
                break;
            case SPACE_BREAK:
                pad_.add(flowBegin, flowEnd - flowBegin);
                pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
                flowBegin = pos_;
                break;
            }
            flowEnd = ++pos_;
            space = SPACE_NONE;
            break;
        }
    }
}
|
|
|
|
// Converts a position within the list of namespace IRIs into a namespace ID.
// The position must not exceed INT_MAX.
int XmlReader::toNamespaceId(NamespaceIris::size_type pos) {
    assert(pos <= INT_MAX);
    int const id = static_cast< int >(pos);
    return id;
}
|
|
|
|
}
|
|
|
|
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|