office-gobmx/xmlreader/source/xmlreader.cxx

1056 lines
35 KiB
C++

/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*************************************************************************
*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* Copyright 2000, 2010 Oracle and/or its affiliates.
*
* OpenOffice.org - a multi-platform office productivity suite
*
* This file is part of OpenOffice.org.
*
* OpenOffice.org is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License version 3
* only, as published by the Free Software Foundation.
*
* OpenOffice.org is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License version 3 for more details
* (a copy is included in the LICENSE file that accompanied this code).
*
* You should have received a copy of the GNU Lesser General Public License
* version 3 along with OpenOffice.org. If not, see
* <http://www.openoffice.org/license.html>
* for a copy of the LGPLv3 License.
*
************************************************************************/
#include "sal/config.h"
#include <climits>
#include <cstddef>
#include "com/sun/star/container/NoSuchElementException.hpp"
#include "com/sun/star/uno/Reference.hxx"
#include "com/sun/star/uno/RuntimeException.hpp"
#include "com/sun/star/uno/XInterface.hpp"
#include "osl/diagnose.h"
#include "osl/file.h"
#include "rtl/string.h"
#include "rtl/ustring.h"
#include "rtl/ustring.hxx"
#include "sal/types.h"
#include "xmlreader/pad.hxx"
#include "xmlreader/span.hxx"
#include "xmlreader/xmlreader.hxx"
namespace xmlreader {
namespace {
namespace css = com::sun::star;
bool isSpace(char c) {
switch (c) {
case '\x09':
case '\x0A':
case '\x0D':
case ' ':
return true;
default:
return false;
}
}
}
XmlReader::XmlReader(rtl::OUString const & fileUrl)
SAL_THROW((
css::container::NoSuchElementException, css::uno::RuntimeException)):
fileUrl_(fileUrl)
{
switch (osl_openFile(fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read))
{
case osl_File_E_None:
break;
case osl_File_E_NOENT:
throw css::container::NoSuchElementException(
fileUrl_, css::uno::Reference< css::uno::XInterface >());
default:
throw css::uno::RuntimeException(
(rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot open ")) +
fileUrl_),
css::uno::Reference< css::uno::XInterface >());
}
oslFileError e = osl_getFileSize(fileHandle_, &fileSize_);
if (e == osl_File_E_None) {
e = osl_mapFile(
fileHandle_, &fileAddress_, fileSize_, 0,
osl_File_MapFlag_WillNeed);
}
if (e != osl_File_E_None) {
e = osl_closeFile(fileHandle_);
if (e != osl_File_E_None) {
OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e));
}
throw css::uno::RuntimeException(
(rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot mmap ")) +
fileUrl_),
css::uno::Reference< css::uno::XInterface >());
}
namespaceIris_.push_back(
Span(
RTL_CONSTASCII_STRINGPARAM(
"http://www.w3.org/XML/1998/namespace")));
namespaces_.push_back(
NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xml")), NAMESPACE_XML));
pos_ = static_cast< char * >(fileAddress_);
end_ = pos_ + fileSize_;
state_ = STATE_CONTENT;
}
XmlReader::~XmlReader() {
oslFileError e = osl_unmapFile(fileAddress_, fileSize_);
if (e != osl_File_E_None) {
OSL_TRACE("osl_unmapFile failed with %ld", static_cast< long >(e));
}
e = osl_closeFile(fileHandle_);
if (e != osl_File_E_None) {
OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e));
}
}
int XmlReader::registerNamespaceIri(Span const & iri) {
int id = toNamespaceId(namespaceIris_.size());
namespaceIris_.push_back(iri);
if (iri.equals(
Span(
RTL_CONSTASCII_STRINGPARAM(
"http://www.w3.org/2001/XMLSchema-instance"))))
{
// Old user layer .xcu files used the xsi namespace prefix without
// declaring a corresponding namespace binding, see issue 77174; reading
// those files during migration would fail without this hack that can be
// removed once migration is no longer relevant (see
// configmgr::Components::parseModificationLayer):
namespaces_.push_back(
NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xsi")), id));
}
return id;
}
XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId)
{
switch (state_) {
case STATE_CONTENT:
switch (reportText) {
case TEXT_NONE:
return handleSkippedText(data, nsId);
case TEXT_RAW:
return handleRawText(data);
case TEXT_NORMALIZED:
return handleNormalizedText(data);
}
case STATE_START_TAG:
return handleStartTag(nsId, data);
case STATE_END_TAG:
return handleEndTag();
case STATE_EMPTY_ELEMENT_TAG:
handleElementEnd();
return RESULT_END;
default: // STATE_DONE
return RESULT_DONE;
}
}
bool XmlReader::nextAttribute(int * nsId, Span * localName) {
OSL_ASSERT(nsId != 0 && localName != 0);
if (firstAttribute_) {
currentAttribute_ = attributes_.begin();
firstAttribute_ = false;
} else {
++currentAttribute_;
}
if (currentAttribute_ == attributes_.end()) {
return false;
}
if (currentAttribute_->nameColon == 0) {
*nsId = NAMESPACE_NONE;
*localName = Span(
currentAttribute_->nameBegin,
currentAttribute_->nameEnd - currentAttribute_->nameBegin);
} else {
*nsId = getNamespaceId(
Span(
currentAttribute_->nameBegin,
currentAttribute_->nameColon - currentAttribute_->nameBegin));
*localName = Span(
currentAttribute_->nameColon + 1,
currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1));
}
return true;
}
Span XmlReader::getAttributeValue(bool fullyNormalize) {
return handleAttributeValue(
currentAttribute_->valueBegin, currentAttribute_->valueEnd,
fullyNormalize);
}
int XmlReader::getNamespaceId(Span const & prefix) const {
for (NamespaceList::const_reverse_iterator i(namespaces_.rbegin());
i != namespaces_.rend(); ++i)
{
if (prefix.equals(i->prefix)) {
return i->nsId;
}
}
return NAMESPACE_UNKNOWN;
}
rtl::OUString XmlReader::getUrl() const {
return fileUrl_;
}
void XmlReader::normalizeLineEnds(Span const & text) {
char const * p = text.begin;
sal_Int32 n = text.length;
for (;;) {
sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D');
if (i < 0) {
break;
}
pad_.add(p, i);
p += i + 1;
n -= i + 1;
if (n == 0 || *p != '\x0A') {
pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A"));
}
}
pad_.add(p, n);
}
void XmlReader::skipSpace() {
while (isSpace(peek())) {
++pos_;
}
}
bool XmlReader::skipComment() {
if (rtl_str_shortenedCompare_WithLength(
pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"),
RTL_CONSTASCII_LENGTH("--")) !=
0)
{
return false;
}
pos_ += RTL_CONSTASCII_LENGTH("--");
sal_Int32 i = rtl_str_indexOfStr_WithLength(
pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"));
if (i < 0) {
throw css::uno::RuntimeException(
(rtl::OUString(
RTL_CONSTASCII_USTRINGPARAM(
"premature end (within comment) of ")) +
fileUrl_),
css::uno::Reference< css::uno::XInterface >());
}
pos_ += i + RTL_CONSTASCII_LENGTH("--");
if (read() != '>') {
throw css::uno::RuntimeException(
(rtl::OUString(
RTL_CONSTASCII_USTRINGPARAM(
"illegal \"--\" within comment in ")) +
fileUrl_),
css::uno::Reference< css::uno::XInterface >());
}
return true;
}
void XmlReader::skipProcessingInstruction() {
sal_Int32 i = rtl_str_indexOfStr_WithLength(
pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>"));
if (i < 0) {
throw css::uno::RuntimeException(
(rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad '<?' in ")) +
fileUrl_),
css::uno::Reference< css::uno::XInterface >());
}
pos_ += i + RTL_CONSTASCII_LENGTH("?>");
}
void XmlReader::skipDocumentTypeDeclaration() {
// Neither is it checked that the doctypedecl is at the correct position in
// the document, nor that it is well-formed:
for (;;) {
char c = read();
switch (c) {
case '\0': // i.e., EOF
throw css::uno::RuntimeException(
(rtl::OUString(
RTL_CONSTASCII_USTRINGPARAM(
"premature end (within DTD) of ")) +
fileUrl_),
css::uno::Reference< css::uno::XInterface >());
case '"':
case '\'':
{
sal_Int32 i = rtl_str_indexOfChar_WithLength(
pos_, end_ - pos_, c);
if (i < 0) {
throw css::uno::RuntimeException(
(rtl::OUString(
RTL_CONSTASCII_USTRINGPARAM(
"premature end (within DTD) of ")) +
fileUrl_),
css::uno::Reference< css::uno::XInterface >());
}
pos_ += i + 1;
}
break;
case '>':
return;
case '[':
for (;;) {
c = read();
switch (c) {
case '\0': // i.e., EOF
throw css::uno::RuntimeException(
(rtl::OUString(
RTL_CONSTASCII_USTRINGPARAM(
"premature end (within DTD) of ")) +
fileUrl_),
css::uno::Reference< css::uno::XInterface >());
case '"':
case '\'':
{
sal_Int32 i = rtl_str_indexOfChar_WithLength(
pos_, end_ - pos_, c);
if (i < 0) {
throw css::uno::RuntimeException(
(rtl::OUString(
RTL_CONSTASCII_USTRINGPARAM(
"premature end (within DTD) of ")) +
fileUrl_),
css::uno::Reference< css::uno::XInterface >());
}
pos_ += i + 1;
}
break;
case '<':
switch (read()) {
case '\0': // i.e., EOF
throw css::uno::RuntimeException(
(rtl::OUString(
RTL_CONSTASCII_USTRINGPARAM(
"premature end (within DTD) of ")) +
fileUrl_),
css::uno::Reference< css::uno::XInterface >());
case '!':
skipComment();
break;
case '?':
skipProcessingInstruction();
break;
default:
break;
}
break;
case ']':
skipSpace();
if (read() != '>') {
throw css::uno::RuntimeException(
(rtl::OUString(
RTL_CONSTASCII_USTRINGPARAM(
"missing \">\" of DTD in ")) +
fileUrl_),
css::uno::Reference< css::uno::XInterface >());
}
return;
default:
break;
}
}
default:
break;
}
}
}
Span XmlReader::scanCdataSection() {
if (rtl_str_shortenedCompare_WithLength(
pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["),
RTL_CONSTASCII_LENGTH("[CDATA[")) !=
0)
{
return Span();
}
pos_ += RTL_CONSTASCII_LENGTH("[CDATA[");
char const * begin = pos_;
sal_Int32 i = rtl_str_indexOfStr_WithLength(
pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>"));
if (i < 0) {
throw css::uno::RuntimeException(
(rtl::OUString(
RTL_CONSTASCII_USTRINGPARAM(
"premature end (within CDATA section) of ")) +
fileUrl_),
css::uno::Reference< css::uno::XInterface >());
}
pos_ += i + RTL_CONSTASCII_LENGTH("]]>");
return Span(begin, i);
}
bool XmlReader::scanName(char const ** nameColon) {
OSL_ASSERT(nameColon != 0 && *nameColon == 0);
for (char const * begin = pos_;; ++pos_) {
switch (peek()) {
case '\0': // i.e., EOF
case '\x09':
case '\x0A':
case '\x0D':
case ' ':
case '/':
case '=':
case '>':
return pos_ != begin;
case ':':
*nameColon = pos_;
break;
default:
break;
}
}
}
int XmlReader::scanNamespaceIri(char const * begin, char const * end) {
OSL_ASSERT(begin != 0 && begin <= end);
Span iri(handleAttributeValue(begin, end, false));
for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) {
if (namespaceIris_[i].equals(iri)) {
return toNamespaceId(i);
}
}
return XmlReader::NAMESPACE_UNKNOWN;
}
char const * XmlReader::handleReference(char const * position, char const * end)
{
OSL_ASSERT(position != 0 && *position == '&' && position < end);
++position;
if (*position == '#') {
++position;
sal_Int32 val = 0;
char const * p;
if (*position == 'x') {
++position;
p = position;
for (;; ++position) {
char c = *position;
if (c >= '0' && c <= '9') {
val = 16 * val + (c - '0');
} else if (c >= 'A' && c <= 'F') {
val = 16 * val + (c - 'A') + 10;
} else if (c >= 'a' && c <= 'f') {
val = 16 * val + (c - 'a') + 10;
} else {
break;
}
if (val > 0x10FFFF) { // avoid overflow
throw css::uno::RuntimeException(
(rtl::OUString(
RTL_CONSTASCII_USTRINGPARAM(
"'&#x...' too large in ")) +
fileUrl_),
css::uno::Reference< css::uno::XInterface >());
}
}
} else {
p = position;
for (;; ++position) {
char c = *position;
if (c >= '0' && c <= '9') {
val = 10 * val + (c - '0');
} else {
break;
}
if (val > 0x10FFFF) { // avoid overflow
throw css::uno::RuntimeException(
(rtl::OUString(
RTL_CONSTASCII_USTRINGPARAM(
"'&#...' too large in ")) +
fileUrl_),
css::uno::Reference< css::uno::XInterface >());
}
}
}
if (position == p || *position++ != ';') {
throw css::uno::RuntimeException(
(rtl::OUString(
RTL_CONSTASCII_USTRINGPARAM("'&#...' missing ';' in ")) +
fileUrl_),
css::uno::Reference< css::uno::XInterface >());
}
OSL_ASSERT(val >= 0 && val <= 0x10FFFF);
if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) ||
(val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF)
{
throw css::uno::RuntimeException(
(rtl::OUString(
RTL_CONSTASCII_USTRINGPARAM(
"character reference denoting invalid character in ")) +
fileUrl_),
css::uno::Reference< css::uno::XInterface >());
}
char buf[4];
sal_Int32 len;
if (val < 0x80) {
buf[0] = static_cast< char >(val);
len = 1;
} else if (val < 0x800) {
buf[0] = static_cast< char >((val >> 6) | 0xC0);
buf[1] = static_cast< char >((val & 0x3F) | 0x80);
len = 2;
} else if (val < 0x10000) {
buf[0] = static_cast< char >((val >> 12) | 0xE0);
buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
buf[2] = static_cast< char >((val & 0x3F) | 0x80);
len = 3;
} else {
buf[0] = static_cast< char >((val >> 18) | 0xF0);
buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80);
buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
buf[3] = static_cast< char >((val & 0x3F) | 0x80);
len = 4;
}
pad_.addEphemeral(buf, len);
return position;
} else {
struct EntityRef {
char const * inBegin;
sal_Int32 inLength;
char const * outBegin;
sal_Int32 outLength;
};
static EntityRef const refs[] = {
{ RTL_CONSTASCII_STRINGPARAM("amp;"),
RTL_CONSTASCII_STRINGPARAM("&") },
{ RTL_CONSTASCII_STRINGPARAM("lt;"),
RTL_CONSTASCII_STRINGPARAM("<") },
{ RTL_CONSTASCII_STRINGPARAM("gt;"),
RTL_CONSTASCII_STRINGPARAM(">") },
{ RTL_CONSTASCII_STRINGPARAM("apos;"),
RTL_CONSTASCII_STRINGPARAM("'") },
{ RTL_CONSTASCII_STRINGPARAM("quot;"),
RTL_CONSTASCII_STRINGPARAM("\"") } };
for (std::size_t i = 0; i < sizeof refs / sizeof refs[0]; ++i) {
if (rtl_str_shortenedCompare_WithLength(
position, end - position, refs[i].inBegin, refs[i].inLength,
refs[i].inLength) ==
0)
{
position += refs[i].inLength;
pad_.add(refs[i].outBegin, refs[i].outLength);
return position;
}
}
throw css::uno::RuntimeException(
(rtl::OUString(
RTL_CONSTASCII_USTRINGPARAM("unknown entity reference in ")) +
fileUrl_),
css::uno::Reference< css::uno::XInterface >());
}
}
Span XmlReader::handleAttributeValue(
char const * begin, char const * end, bool fullyNormalize)
{
pad_.clear();
if (fullyNormalize) {
while (begin != end && isSpace(*begin)) {
++begin;
}
while (end != begin && isSpace(end[-1])) {
--end;
}
char const * p = begin;
enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
// a single true space character can go into the current span,
// everything else breaks the span
Space space = SPACE_NONE;
while (p != end) {
switch (*p) {
case '\x09':
case '\x0A':
case '\x0D':
switch (space) {
case SPACE_NONE:
pad_.add(begin, p - begin);
pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
space = SPACE_BREAK;
break;
case SPACE_SPAN:
pad_.add(begin, p - begin);
space = SPACE_BREAK;
break;
case SPACE_BREAK:
break;
}
begin = ++p;
break;
case ' ':
switch (space) {
case SPACE_NONE:
++p;
space = SPACE_SPAN;
break;
case SPACE_SPAN:
pad_.add(begin, p - begin);
begin = ++p;
space = SPACE_BREAK;
break;
case SPACE_BREAK:
begin = ++p;
break;
}
break;
case '&':
pad_.add(begin, p - begin);
p = handleReference(p, end);
begin = p;
space = SPACE_NONE;
break;
default:
++p;
space = SPACE_NONE;
break;
}
}
pad_.add(begin, p - begin);
} else {
char const * p = begin;
while (p != end) {
switch (*p) {
case '\x09':
case '\x0A':
pad_.add(begin, p - begin);
begin = ++p;
pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
break;
case '\x0D':
pad_.add(begin, p - begin);
++p;
if (peek() == '\x0A') {
++p;
}
begin = p;
pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
break;
case '&':
pad_.add(begin, p - begin);
p = handleReference(p, end);
begin = p;
break;
default:
++p;
break;
}
}
pad_.add(begin, p - begin);
}
return pad_.get();
}
XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) {
OSL_ASSERT(nsId != 0 && localName);
char const * nameBegin = pos_;
char const * nameColon = 0;
if (!scanName(&nameColon)) {
throw css::uno::RuntimeException(
(rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad tag name in ")) +
fileUrl_),
css::uno::Reference< css::uno::XInterface >());
}
char const * nameEnd = pos_;
NamespaceList::size_type inheritedNamespaces = namespaces_.size();
bool hasDefaultNs = false;
int defaultNsId = NAMESPACE_NONE;
attributes_.clear();
for (;;) {
char const * p = pos_;
skipSpace();
if (peek() == '/' || peek() == '>') {
break;
}
if (pos_ == p) {
throw css::uno::RuntimeException(
(rtl::OUString(
RTL_CONSTASCII_USTRINGPARAM(
"missing whitespace before attribute in ")) +
fileUrl_),
css::uno::Reference< css::uno::XInterface >());
}
char const * attrNameBegin = pos_;
char const * attrNameColon = 0;
if (!scanName(&attrNameColon)) {
throw css::uno::RuntimeException(
(rtl::OUString(
RTL_CONSTASCII_USTRINGPARAM("bad attribute name in ")) +
fileUrl_),
css::uno::Reference< css::uno::XInterface >());
}
char const * attrNameEnd = pos_;
skipSpace();
if (read() != '=') {
throw css::uno::RuntimeException(
(rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '=' in ")) +
fileUrl_),
css::uno::Reference< css::uno::XInterface >());
}
skipSpace();
char del = read();
if (del != '\'' && del != '"') {
throw css::uno::RuntimeException(
(rtl::OUString(
RTL_CONSTASCII_USTRINGPARAM("bad attribute value in ")) +
fileUrl_),
css::uno::Reference< css::uno::XInterface >());
}
char const * valueBegin = pos_;
sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del);
if (i < 0) {
throw css::uno::RuntimeException(
(rtl::OUString(
RTL_CONSTASCII_USTRINGPARAM(
"unterminated attribute value in ")) +
fileUrl_),
css::uno::Reference< css::uno::XInterface >());
}
char const * valueEnd = pos_ + i;
pos_ += i + 1;
if (attrNameColon == 0 &&
Span(attrNameBegin, attrNameEnd - attrNameBegin).equals(
RTL_CONSTASCII_STRINGPARAM("xmlns")))
{
hasDefaultNs = true;
defaultNsId = scanNamespaceIri(valueBegin, valueEnd);
} else if (attrNameColon != 0 &&
Span(attrNameBegin, attrNameColon - attrNameBegin).equals(
RTL_CONSTASCII_STRINGPARAM("xmlns")))
{
namespaces_.push_back(
NamespaceData(
Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)),
scanNamespaceIri(valueBegin, valueEnd)));
} else {
attributes_.push_back(
AttributeData(
attrNameBegin, attrNameEnd, attrNameColon, valueBegin,
valueEnd));
}
}
if (!hasDefaultNs && !elements_.empty()) {
defaultNsId = elements_.top().defaultNamespaceId;
}
firstAttribute_ = true;
if (peek() == '/') {
state_ = STATE_EMPTY_ELEMENT_TAG;
++pos_;
} else {
state_ = STATE_CONTENT;
}
if (peek() != '>') {
throw css::uno::RuntimeException(
(rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) +
fileUrl_),
css::uno::Reference< css::uno::XInterface >());
}
++pos_;
elements_.push(
ElementData(
Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces,
defaultNsId));
if (nameColon == 0) {
*nsId = defaultNsId;
*localName = Span(nameBegin, nameEnd - nameBegin);
} else {
*nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin));
*localName = Span(nameColon + 1, nameEnd - (nameColon + 1));
}
return RESULT_BEGIN;
}
XmlReader::Result XmlReader::handleEndTag() {
if (elements_.empty()) {
throw css::uno::RuntimeException(
(rtl::OUString(
RTL_CONSTASCII_USTRINGPARAM("spurious end tag in ")) +
fileUrl_),
css::uno::Reference< css::uno::XInterface >());
}
char const * nameBegin = pos_;
char const * nameColon = 0;
if (!scanName(&nameColon) ||
!elements_.top().name.equals(nameBegin, pos_ - nameBegin))
{
throw css::uno::RuntimeException(
(rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("tag mismatch in ")) +
fileUrl_),
css::uno::Reference< css::uno::XInterface >());
}
handleElementEnd();
skipSpace();
if (peek() != '>') {
throw css::uno::RuntimeException(
(rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) +
fileUrl_),
css::uno::Reference< css::uno::XInterface >());
}
++pos_;
return RESULT_END;
}
void XmlReader::handleElementEnd() {
OSL_ASSERT(!elements_.empty());
namespaces_.resize(elements_.top().inheritedNamespaces);
elements_.pop();
state_ = elements_.empty() ? STATE_DONE : STATE_CONTENT;
}
XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) {
for (;;) {
sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, '<');
if (i < 0) {
throw css::uno::RuntimeException(
(rtl::OUString(
RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
fileUrl_),
css::uno::Reference< css::uno::XInterface >());
}
pos_ += i + 1;
switch (peek()) {
case '!':
++pos_;
if (!skipComment() && !scanCdataSection().is()) {
skipDocumentTypeDeclaration();
}
break;
case '/':
++pos_;
return handleEndTag();
case '?':
++pos_;
skipProcessingInstruction();
break;
default:
return handleStartTag(nsId, data);
}
}
}
XmlReader::Result XmlReader::handleRawText(Span * text) {
pad_.clear();
for (char const * begin = pos_;;) {
switch (peek()) {
case '\0': // i.e., EOF
throw css::uno::RuntimeException(
(rtl::OUString(
RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
fileUrl_),
css::uno::Reference< css::uno::XInterface >());
case '\x0D':
pad_.add(begin, pos_ - begin);
++pos_;
if (peek() != '\x0A') {
pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A"));
}
begin = pos_;
break;
case '&':
pad_.add(begin, pos_ - begin);
pos_ = handleReference(pos_, end_);
begin = pos_;
break;
case '<':
pad_.add(begin, pos_ - begin);
++pos_;
switch (peek()) {
case '!':
++pos_;
if (!skipComment()) {
Span cdata(scanCdataSection());
if (cdata.is()) {
normalizeLineEnds(cdata);
} else {
skipDocumentTypeDeclaration();
}
}
begin = pos_;
break;
case '/':
*text = pad_.get();
++pos_;
state_ = STATE_END_TAG;
return RESULT_TEXT;
case '?':
++pos_;
skipProcessingInstruction();
begin = pos_;
break;
default:
*text = pad_.get();
state_ = STATE_START_TAG;
return RESULT_TEXT;
}
break;
default:
++pos_;
break;
}
}
}
XmlReader::Result XmlReader::handleNormalizedText(Span * text) {
pad_.clear();
char const * flowBegin = pos_;
char const * flowEnd = pos_;
enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
// a single true space character can go into the current flow,
// everything else breaks the flow
Space space = SPACE_START;
for (;;) {
switch (peek()) {
case '\0': // i.e., EOF
throw css::uno::RuntimeException(
(rtl::OUString(
RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
fileUrl_),
css::uno::Reference< css::uno::XInterface >());
case '\x09':
case '\x0A':
case '\x0D':
switch (space) {
case SPACE_START:
case SPACE_BREAK:
break;
case SPACE_NONE:
case SPACE_SPAN:
space = SPACE_BREAK;
break;
}
++pos_;
break;
case ' ':
switch (space) {
case SPACE_START:
case SPACE_BREAK:
break;
case SPACE_NONE:
space = SPACE_SPAN;
break;
case SPACE_SPAN:
space = SPACE_BREAK;
break;
}
++pos_;
break;
case '&':
switch (space) {
case SPACE_START:
break;
case SPACE_NONE:
case SPACE_SPAN:
pad_.add(flowBegin, pos_ - flowBegin);
break;
case SPACE_BREAK:
pad_.add(flowBegin, flowEnd - flowBegin);
pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
break;
}
pos_ = handleReference(pos_, end_);
flowBegin = pos_;
flowEnd = pos_;
space = SPACE_NONE;
break;
case '<':
++pos_;
switch (peek()) {
case '!':
++pos_;
if (skipComment()) {
space = SPACE_BREAK;
} else {
Span cdata(scanCdataSection());
if (cdata.is()) {
// CDATA is not normalized (similar to character
// references; it keeps the code simple), but it might
// arguably be better to normalize it:
switch (space) {
case SPACE_START:
break;
case SPACE_NONE:
case SPACE_SPAN:
pad_.add(flowBegin, pos_ - flowBegin);
break;
case SPACE_BREAK:
pad_.add(flowBegin, flowEnd - flowBegin);
pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
break;
}
normalizeLineEnds(cdata);
flowBegin = pos_;
flowEnd = pos_;
space = SPACE_NONE;
} else {
skipDocumentTypeDeclaration();
}
}
break;
case '/':
++pos_;
pad_.add(flowBegin, flowEnd - flowBegin);
*text = pad_.get();
state_ = STATE_END_TAG;
return RESULT_TEXT;
case '?':
++pos_;
skipProcessingInstruction();
space = SPACE_BREAK;
break;
default:
pad_.add(flowBegin, flowEnd - flowBegin);
*text = pad_.get();
state_ = STATE_START_TAG;
return RESULT_TEXT;
}
break;
default:
switch (space) {
case SPACE_START:
flowBegin = pos_;
break;
case SPACE_NONE:
case SPACE_SPAN:
break;
case SPACE_BREAK:
pad_.add(flowBegin, flowEnd - flowBegin);
pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
flowBegin = pos_;
break;
}
flowEnd = ++pos_;
space = SPACE_NONE;
break;
}
}
}
int XmlReader::toNamespaceId(NamespaceIris::size_type pos) {
OSL_ASSERT(pos <= INT_MAX);
return static_cast< int >(pos);
}
}
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */