Add rng schema for model.xml
Change-Id: I1b75c5c42a131c7994868ea3261120c6a5b7650e
This commit is contained in:
parent
a7d2149709
commit
59a68fe4ad
2 changed files with 473 additions and 42 deletions
473
writerfilter/documentation/ooxml/model.rng
Normal file
473
writerfilter/documentation/ooxml/model.rng
Normal file
|
@ -0,0 +1,473 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!--
|
||||
* This file is part of the LibreOffice project.
|
||||
*
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
*
|
||||
-->
|
||||
<!--
|
||||
This file is both a relax-ng schema for writerfilter/source/ooxml/model.xml and
|
||||
documentation for that file. The schema has two parts:
|
||||
|
||||
- first part: a subset of the relax-ng grammar to define *what* we expect as
|
||||
the input in a DOCX file
|
||||
- second part: additional annotation on top of that to define *how* to handle
|
||||
that expected input
|
||||
-->
|
||||
<grammar xmlns="http://relaxng.org/ns/structure/1.0">
|
||||
<!--
|
||||
First part: a subset of the relax-ng XML markup.
|
||||
|
||||
The order of elements in this part follows a bottom-up approach.
|
||||
-->
|
||||
|
||||
<!-- Basic building blocks: element, attribute and their contents. -->
|
||||
|
||||
<!--
|
||||
Describes an XML element.
|
||||
|
||||
Example:
|
||||
|
||||
<element name="charset">
|
||||
<ref name="CT_Charset"/>
|
||||
</element>
|
||||
-->
|
||||
<define name="element-element">
  <element name="element" ns="http://relaxng.org/ns/structure/1.0">
    <!-- The name is optional; when present, any text is accepted as its value. -->
    <optional>
      <attribute name="name">
        <text/>
      </attribute>
    </optional>
    <!-- At least one child is required; the listed children may be mixed freely. -->
    <oneOrMore>
      <choice>
        <ref name="text-element"/>
        <ref name="ref-element"/>
        <ref name="data-element"/>
        <ref name="attribute-element"/>
      </choice>
    </oneOrMore>
  </element>
</define>
|
||||
|
||||
<!--
|
||||
Describes an attribute.
|
||||
|
||||
Example:
|
||||
|
||||
<attribute name="name">
|
||||
<text/>
|
||||
</attribute>
|
||||
-->
|
||||
<define name="attribute-element">
  <element name="attribute" ns="http://relaxng.org/ns/structure/1.0">
    <!-- The name is optional; when present, any text is accepted as its value. -->
    <optional>
      <attribute name="name">
        <text/>
      </attribute>
    </optional>
    <!-- Unlike <element>, an <attribute> may have no children at all. -->
    <zeroOrMore>
      <choice>
        <ref name="text-element"/>
        <ref name="ref-element"/>
        <ref name="data-element"/>
      </choice>
    </zeroOrMore>
  </element>
</define>
|
||||
|
||||
<!--
|
||||
Describes the type of the data contained in an attribute. Possible values:
|
||||
boolean, integer or string. See also <text>.
|
||||
-->
|
||||
<define name="data-element">
  <element name="data" ns="http://relaxng.org/ns/structure/1.0">
    <!-- The type attribute is mandatory; this schema does not restrict its value. -->
    <attribute name="type">
      <text/>
    </attribute>
  </element>
</define>
|
||||
|
||||
<!--
|
||||
Describes that the data used inside the parent (element or attribute) is a
|
||||
string. It is just a short-hand for <data type="string"/>.
|
||||
-->
|
||||
<define name="text-element">
  <element ns="http://relaxng.org/ns/structure/1.0" name="text">
    <!-- The element must be empty: it takes neither attributes nor children. -->
    <empty/>
  </element>
</define>
|
||||
|
||||
<!--
|
||||
Describes an enumeration element: a possible value for an attribute.
|
||||
-->
|
||||
<define name="value-element">
  <element ns="http://relaxng.org/ns/structure/1.0" name="value">
    <!-- The enumeration literal itself: arbitrary text, no attributes. -->
    <text/>
  </element>
</define>
|
||||
|
||||
<!--
|
||||
This element is ignored during parsing, it just helps readability.
|
||||
|
||||
Example:
|
||||
|
||||
<choice>
|
||||
<value>true</value>
|
||||
<value>false</value>
|
||||
</choice>
|
||||
-->
|
||||
<define name="choice-element">
  <element name="choice" ns="http://relaxng.org/ns/structure/1.0">
    <!-- No attributes; at least one alternative has to be listed. -->
    <oneOrMore>
      <choice>
        <ref name="value-element"/>
        <ref name="text-element"/>
        <ref name="ref-element"/>
        <ref name="element-element"/>
        <ref name="data-element"/>
      </choice>
    </oneOrMore>
  </element>
</define>
|
||||
|
||||
<!-- Grouping elements: define and grammar. -->
|
||||
|
||||
<!--
|
||||
A define is a named definition of its contents, so that multiple <ref> elements
|
||||
can refer to it, to avoid copy&paste. OOXML named (complex and simple) types
|
||||
are described using defines.
|
||||
-->
|
||||
<define name="define-element">
  <element name="define" ns="http://relaxng.org/ns/structure/1.0">
    <!-- The name is mandatory: it is what <ref> elements point at. -->
    <attribute name="name">
      <text/>
    </attribute>
    <!--
      NOTE(review): the <empty/> alternative below is the RELAX NG "empty"
      pattern, so a define without any child is accepted as well. It does not
      match a literal <empty/> child element in model.xml; confirm that this
      is the intended meaning.
    -->
    <oneOrMore>
      <choice>
        <ref name="ref-element"/>
        <ref name="data-element"/>
        <ref name="element-element"/>
        <ref name="attribute-element"/>
        <ref name="choice-element"/>
        <empty/>
      </choice>
    </oneOrMore>
  </element>
</define>
|
||||
|
||||
<!--
|
||||
A reference to a define.
|
||||
-->
|
||||
<define name="ref-element">
  <element name="ref" ns="http://relaxng.org/ns/structure/1.0">
    <!-- Mandatory: the name of the referenced define. -->
    <attribute name="name">
      <text/>
    </attribute>
  </element>
</define>
|
||||
|
||||
<!--
|
||||
A grammar is a set of defines, one grammar is equivalent to one .xsd file
|
||||
from the OOXML spec.
|
||||
-->
|
||||
<define name="grammar-element">
  <element name="grammar" ns="http://relaxng.org/ns/structure/1.0">
    <!-- Only ns is mandatory; the other two attributes may be left out. -->
    <attribute name="ns">
      <text/>
    </attribute>
    <optional>
      <attribute name="attributeFormDefault">
        <text/>
      </attribute>
    </optional>
    <optional>
      <attribute name="datatypeLibrary">
        <text/>
      </attribute>
    </optional>
    <!-- Child order is fixed: all includes come first, then at least one define. -->
    <zeroOrMore>
      <ref name="include-element"/>
    </zeroOrMore>
    <oneOrMore>
      <ref name="define-element"/>
    </oneOrMore>
  </element>
</define>
|
||||
|
||||
<!--
|
||||
Controls the resolution of <ref> elements. The order is:
|
||||
|
||||
- the current grammar
|
||||
- included grammars, if there are any
|
||||
- the first define in the whole model
|
||||
-->
|
||||
<define name="include-element">
  <element name="include" ns="http://relaxng.org/ns/structure/1.0">
    <!-- Mandatory; any text is accepted as the href value. -->
    <attribute name="href">
      <text/>
    </attribute>
  </element>
</define>
|
||||
|
||||
<!--
|
||||
Second part: custom markup, building on top of the first one.
|
||||
|
||||
The order of elements in this part follows a top-down approach.
|
||||
|
||||
The output of the code generated from these elements is a token stream. There
|
||||
are two types of tokens: SPRM tokens and attribute ones. SPRM refers to
|
||||
Single PRoperty Modifier, in this context it means a token that contains other
|
||||
tokens. It's used to represent an XML element. That means that SPRM tokens
|
||||
can contain other SPRM tokens, and also attribute tokens, while attribute
|
||||
tokens only contain simple types (boolean, integer, string).
|
||||
|
||||
More terminology: the types in the OOXML schema have two typical prefixes:
|
||||
|
||||
- CT_something: complex type, used to describe an XML element
|
||||
- ST_something: simple type, used to describe the contents of an attribute
|
||||
|
||||
For tokens the following abbreviations are used:
|
||||
|
||||
- NS_something: namespace
|
||||
- LN_something: local name
|
||||
-->
|
||||
|
||||
<!--
|
||||
The model element is the toplevel container for the XML element /
|
||||
attribute mapping definition. It contains namespace aliases, direct token
|
||||
definitions and mapping definitions for each namespace.
|
||||
-->
|
||||
<define name="model-element">
  <!-- ns="" spells out that the custom markup lives in no namespace. -->
  <element name="model" ns="">
    <!--
      Three sections, each with at least one entry, in this fixed order:
      namespace aliases, explicit tokens, then the namespaces themselves.
    -->
    <oneOrMore>
      <ref name="namespace-alias-element"/>
    </oneOrMore>
    <oneOrMore>
      <ref name="token-element"/>
    </oneOrMore>
    <oneOrMore>
      <ref name="namespace-element"/>
    </oneOrMore>
  </element>
</define>
|
||||
|
||||
<!--
|
||||
A namespace-alias element defines an alias for a URI. Multiple URIs
|
||||
can have the same alias, that's how both strict and transitional OOXML is
|
||||
supported by the same tokenizer.
|
||||
-->
|
||||
<define name="namespace-alias-element">
  <element name="namespace-alias" ns="">
    <!-- Namespace URI, e.g. http://schemas.openxmlformats.org/wordprocessingml/2006/main -->
    <attribute name="name">
      <text/>
    </attribute>
    <!-- Alias used for that URI, e.g. w14 -->
    <attribute name="alias">
      <text/>
    </attribute>
  </element>
</define>
|
||||
|
||||
<!--
|
||||
A token element can explicitly define a token. This allows generating
|
||||
such a token in the tokenizers and handling it in the domain mapper. Ideally
|
||||
tokens are *not* defined this way, they are mapped to an XML element or
|
||||
attribute from the OOXML specification.
|
||||
-->
|
||||
<define name="token-element">
  <element name="token" ns="">
    <!--
      Must have the form ooxml:something. On the C++ side this becomes the
      constant NS_ooxml::LN_something ("OOXML namespace, something local
      name").
    -->
    <attribute name="tokenid">
      <text/>
    </attribute>
  </element>
</define>
|
||||
|
||||
<!--
|
||||
A namespace element is a container for a subset of the relax-ng grammar
|
||||
of a part of the OOXML specification. It also contains the resource
|
||||
definitions, which specify how XML elements and attributes are mapped to
|
||||
tokens.
|
||||
-->
|
||||
<define name="namespace-element">
  <!-- ns="" spells out that the custom markup lives in no namespace. -->
  <element name="namespace" ns="">
    <!-- Only name is mandatory; file and url may be left out. -->
    <attribute name="name">
      <text/>
    </attribute>
    <optional>
      <attribute name="url">
        <text/>
      </attribute>
    </optional>
    <optional>
      <attribute name="file">
        <text/>
      </attribute>
    </optional>
    <!--
      Child order is fixed: any number of start elements, exactly one
      grammar, then any number of resource elements.
    -->
    <zeroOrMore>
      <ref name="start-element"/>
    </zeroOrMore>
    <ref name="grammar-element"/>
    <zeroOrMore>
      <ref name="resource-element"/>
    </zeroOrMore>
  </element>
</define>
|
||||
|
||||
<!--
|
||||
A start element is similar to the relax-ng start element, but this one has a
|
||||
name attribute to refer to a define, while the relax-ng one has a ref child
|
||||
element to do the same.
|
||||
-->
|
||||
<define name="start-element">
  <element name="start" ns="">
    <!-- Mandatory: names the define this start element refers to. -->
    <attribute name="name">
      <text/>
    </attribute>
  </element>
</define>
|
||||
|
||||
<!--
|
||||
A resource element always matches (by its name attribute) a define from the
|
||||
grammar of the namespace. It describes how that (simple or complex) type is
|
||||
parsed during import.
|
||||
|
||||
Example:
|
||||
|
||||
<resource name="CT_Font" resource="Properties">
|
||||
...
|
||||
</resource>
|
||||
|
||||
or
|
||||
|
||||
<resource name="CT_OMathPara" resource="Stream"/>
|
||||
-->
|
||||
<define name="resource-element">
  <element name="resource" ns="">
    <!-- Expected to equal the name attribute of a define element. -->
    <attribute name="name">
      <text/>
    </attribute>
    <!--
      Selects the handler: the resource element is processed by the
      OOXMLFastContextHandler<resource> class.

      The two most important resources:

      - Properties: maps elements/attributes to SPRM/attribute tokens.
      - Stream: use this when the element itself needs no special handling
        but its subelements are interesting. Without an explicit resource
        element a null context is created, and the element together with all
        of its subelements is ignored.
    -->
    <attribute name="resource">
      <text/>
    </attribute>
    <optional>
      <attribute name="tokenid">
        <text/>
      </attribute>
    </optional>
    <!-- Any number of mapping children, freely mixed. -->
    <zeroOrMore>
      <choice>
        <ref name="resource-action-element"/>
        <ref name="resource-attribute-element"/>
        <ref name="resource-element-element"/>
        <ref name="resource-value-element"/>
      </choice>
    </zeroOrMore>
  </element>
</define>
|
||||
|
||||
<!--
|
||||
The <element> child of a <resource> defines what element name will be handled
|
||||
via what token.
|
||||
|
||||
Example:
|
||||
|
||||
<element name="charset" tokenid="ooxml:CT_Font_charset"/>
|
||||
|
||||
Means the <charset> element will be handled in the sprm() function of the handler
|
||||
class as a NS_ooxml::LN_CT_Font_charset case. (sprm() is a logging wrapper
|
||||
around lcl_sprm(), which is the real implementation.)
|
||||
-->
|
||||
<define name="resource-element-element">
  <element name="element" ns="">
    <!-- Both attributes are mandatory: the element name and the token it maps to. -->
    <attribute name="tokenid">
      <text/>
    </attribute>
    <attribute name="name">
      <text/>
    </attribute>
  </element>
</define>
|
||||
|
||||
<!--
|
||||
The <attribute> child of a <resource> defines what attribute name will be
|
||||
handled via what token.
|
||||
|
||||
Example:
|
||||
|
||||
<attribute name="name" tokenid="ooxml:CT_Font_name"/>
|
||||
|
||||
Means the "name" attribute will be handled in the attribute() (real
|
||||
implementation in lcl_attribute()) function of the handler class as a
|
||||
NS_ooxml::LN_CT_Font_name case.
|
||||
-->
|
||||
<define name="resource-attribute-element">
  <element name="attribute" ns="">
    <!-- Only name is mandatory; tokenid and action may each be left out. -->
    <attribute name="name">
      <text/>
    </attribute>
    <optional>
      <attribute name="action">
        <text/>
      </attribute>
    </optional>
    <optional>
      <attribute name="tokenid">
        <text/>
      </attribute>
    </optional>
  </element>
</define>
|
||||
|
||||
<!--
|
||||
A <value> inside a <resource> defines how to map the string data of a value
|
||||
to a token. The tokenid attribute defines the token name, the text of the
|
||||
element defines the string. This is useful in case the value of an attribute
|
||||
is a choice from a predefined list.
|
||||
-->
|
||||
<define name="resource-value-element">
  <element name="value" ns="">
    <!-- Mandatory: the token that the string below is mapped to. -->
    <attribute name="tokenid">
      <text/>
    </attribute>
    <!-- The element text is the string being mapped. -->
    <text/>
  </element>
</define>
|
||||
|
||||
<!--
|
||||
An <action> inside a <resource> can perform additional actions in the
|
||||
following situations:
|
||||
|
||||
- start of the element
|
||||
- end of the element
|
||||
- character data of the element
|
||||
|
||||
Example:
|
||||
|
||||
<resource name="CT_TxbxContent" resource="Stream">
|
||||
<action name="start" action="startTxbxContent"/>
|
||||
<action name="end" action="endTxbxContent"/>
|
||||
</resource>
|
||||
|
||||
That means that when:
|
||||
|
||||
- <txbxContent> starts, OOXMLFastContextHandler::startTxbxContent() will be called
|
||||
- <txbxContent> ends, OOXMLFastContextHandler::endTxbxContent() will be called
|
||||
-->
|
||||
<define name="resource-action-element">
  <element name="action" ns="">
    <!-- name and action are mandatory; tokenid and sendtokenid are optional. -->
    <attribute name="action">
      <text/>
    </attribute>
    <attribute name="name">
      <text/>
    </attribute>
    <optional>
      <attribute name="sendtokenid">
        <text/>
      </attribute>
    </optional>
    <optional>
      <attribute name="tokenid">
        <text/>
      </attribute>
    </optional>
    <!-- At most one <cond> child may be given. -->
    <optional>
      <ref name="resource-action-cond-element"/>
    </optional>
  </element>
</define>
|
||||
|
||||
<!--
|
||||
Some actions take parameters, which can be defined by the <cond> element.
|
||||
|
||||
Example:
|
||||
|
||||
<resource name="CT_FldChar" resource="Stream">
|
||||
<action name="start" action="fieldstart">
|
||||
<cond tokenid="ooxml:CT_FldChar_fldCharType" value="ooxml:Value_ST_FldCharType_begin"/>
|
||||
</action>
|
||||
</resource>
|
||||
|
||||
That means:
|
||||
|
||||
- if the <fldChar> starts with an fldCharType attribute being "begin"
|
||||
- then perform the "fieldstart" action.
|
||||
-->
|
||||
<define name="resource-action-cond-element">
  <element name="cond" ns="">
    <!-- Both attributes are mandatory; the element has no children. -->
    <attribute name="value">
      <text/>
    </attribute>
    <attribute name="tokenid">
      <text/>
    </attribute>
  </element>
</define>
|
||||
|
||||
<!-- The entry point of the schema. -->
|
||||
<start>
  <!-- A valid document has a single <model> element as its root. -->
  <ref name="model-element"/>
</start>
|
||||
</grammar>
|
||||
<!-- vim: ft=xml shiftwidth=2 softtabstop=2 expandtab:
|
||||
-->
|
|
@ -1,42 +0,0 @@
|
|||
These are various notes about ooxml/model.xml and related stuff. They have been
|
||||
mostly found out by trial and error, because existing documentation is poor
|
||||
or nonexistent, so I don't actually understand writerfilter that much (and
|
||||
think nothing nice about it) and don't think it (both writerfilter and my
|
||||
understanding/liking of it) could be noticeably improved. In an ideal world
|
||||
it should be nuked from orbit and started again from scratch with a saner design.
|
||||
|
||||
-
|
||||
CT_xxx (Complex Type) - it seems to be used for XML elements
|
||||
ST_xxx (Simple Type) - it seems to be used for XML attributes
|
||||
|
||||
- SPRM (the Sprm structure specified a modification to a property of a
|
||||
character, paragraph, table, or section in the binary .doc format) - in
|
||||
the context of OOXML it seems to pretty much mean "XML element"
|
||||
|
||||
-
|
||||
|
||||
Format of the <resource> tag (shortened CT_Font example):
|
||||
|
||||
<resource name="CT_Font" resource="Properties" tag="font">
|
||||
<element name="charset" tokenid="ooxml:CT_Font_charset"/>
|
||||
<attribute name="name" tokenid="ooxml:CT_Font_name"/>
|
||||
</resource>
|
||||
|
||||
CT_Font is the type that is defined how it will be handled.
|
||||
resource="XXX" means it will be handled by OOXMLFastContextHandlerXXX class
|
||||
no idea what tag="font" means or if it matters
|
||||
<element> defines the <w:charset> subelement will be handled in sprm() function
|
||||
as NS_ooxml::LN_CT_Font_charset case
|
||||
<attribute> defines the <w:name> attribute of the element will be handled
|
||||
in attribute() function as NS_ooxml::LN_CT_Font_name case
|
||||
in both cases sprm()/attribute() may mean actually any of the various strange
|
||||
naming ideas like lcl_sprm()
|
||||
|
||||
-
|
||||
If an element (and its subelements) are not processed but the element itself
|
||||
does not require any special handling, make sure something like the below is present.
|
||||
Otherwise null context will be created and the element and all its subelements
|
||||
will be ignored.
|
||||
|
||||
<resource name="CT_OMathPara" resource="Stream" tag="math"/>
|
||||
|
Loading…
Reference in a new issue