From e55afcbec416654f75936d0d9f1c61f26a38b6be Mon Sep 17 00:00:00 2001 From: mguessan Date: Thu, 12 Jan 2012 21:51:01 +0000 Subject: [PATCH] Add StreamScanner.java from Woodstox 4.1.2 git-svn-id: http://svn.code.sf.net/p/davmail/code/trunk@1876 3d1905a2-6b24-0410-a738-b14d5a86fcbd --- src/java/com/ctc/wstx/sr/StreamScanner.java | 2444 +++++++++++++++++++ 1 file changed, 2444 insertions(+) create mode 100644 src/java/com/ctc/wstx/sr/StreamScanner.java diff --git a/src/java/com/ctc/wstx/sr/StreamScanner.java b/src/java/com/ctc/wstx/sr/StreamScanner.java new file mode 100644 index 00000000..a1e476b9 --- /dev/null +++ b/src/java/com/ctc/wstx/sr/StreamScanner.java @@ -0,0 +1,2444 @@ +/* Woodstox XML processor + * + * Copyright (c) 2004- Tatu Saloranta, tatu.saloranta@iki.fi + * + * Licensed under the License specified in file LICENSE, included with + * the source code. + * You may not use this file except in compliance with the License. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.ctc.wstx.sr; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.net.URL; +import java.text.MessageFormat; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +import javax.xml.stream.Location; +import javax.xml.stream.XMLInputFactory; +import javax.xml.stream.XMLReporter; +import javax.xml.stream.XMLResolver; +import javax.xml.stream.XMLStreamException; + +import org.codehaus.stax2.XMLReporter2; +import org.codehaus.stax2.XMLStreamLocation2; +import org.codehaus.stax2.validation.XMLValidationProblem; + +import com.ctc.wstx.api.ReaderConfig; +import com.ctc.wstx.cfg.ErrorConsts; +import com.ctc.wstx.cfg.InputConfigFlags; +import com.ctc.wstx.cfg.ParsingErrorMsgs; +import com.ctc.wstx.cfg.XmlConsts; +import com.ctc.wstx.dtd.MinimalDTDReader; +import com.ctc.wstx.ent.EntityDecl; +import com.ctc.wstx.ent.IntEntity; +import com.ctc.wstx.exc.*; +import com.ctc.wstx.io.DefaultInputResolver; +import com.ctc.wstx.io.WstxInputData; +import com.ctc.wstx.io.WstxInputLocation; +import com.ctc.wstx.io.WstxInputSource; +import com.ctc.wstx.util.ExceptionUtil; +import com.ctc.wstx.util.SymbolTable; +import com.ctc.wstx.util.TextBuffer; + +/** + * Abstract base class that defines some basic functionality that all + * Woodstox reader classes (main XML reader, DTD reader) extend from. + */ + +public abstract class StreamScanner + extends WstxInputData + implements InputProblemReporter, + InputConfigFlags, ParsingErrorMsgs +{ + + // // // Some well-known chars: + + /** + * Last (highest) char code of the three, LF, CR and NULL + */ + public final static char CHAR_CR_LF_OR_NULL = (char) 13; + + public final static int INT_CR_LF_OR_NULL = 13; + + /** + * Character that allows quick check of whether a char can potentially + * be some kind of markup, WRT input stream processing; + * has to contain linefeeds, &, < and > (">" only matters when + * quoting text, as part of "]]>") + */ + protected final static char CHAR_FIRST_PURE_TEXT = (char) ('>' + 1); + + + /** + * First character in Unicode (ie one with lowest id) that is legal + * as part of a local name (all valid name chars minus ':'). Used + * for doing quick check for local name end; usually name ends in + * a whitespace or equals sign. + */ + protected final static char CHAR_LOWEST_LEGAL_LOCALNAME_CHAR = '-'; + + /* + /////////////////////////////////////////////////////////// + // Character validity constants, structs + /////////////////////////////////////////////////////////// + */ + + /** + * We will only use validity array for first 256 characters, mostly + * because after those characters it's easier to do fairly simple + * block checks. + */ + private final static int VALID_CHAR_COUNT = 0x100; + + private final static byte NAME_CHAR_INVALID_B = (byte) 0; + private final static byte NAME_CHAR_ALL_VALID_B = (byte) 1; + private final static byte NAME_CHAR_VALID_NONFIRST_B = (byte) -1; + + private final static byte[] sCharValidity = new byte[VALID_CHAR_COUNT]; + + static { + /* First, since all valid-as-first chars are also valid-as-other chars, + * we'll initialize common chars: + */ + sCharValidity['_'] = NAME_CHAR_ALL_VALID_B; + for (int i = 0, last = ('z' - 'a'); i <= last; ++i) { + sCharValidity['A' + i] = NAME_CHAR_ALL_VALID_B; + sCharValidity['a' + i] = NAME_CHAR_ALL_VALID_B; + } + for (int i = 0xC0; i < 0xF6; ++i) { // not all are fully valid, but + sCharValidity[i] = NAME_CHAR_ALL_VALID_B; + } + // ... now we can 'revert' ones not fully valid: + sCharValidity[0xD7] = NAME_CHAR_INVALID_B; + sCharValidity[0xF7] = NAME_CHAR_INVALID_B; + + /* And then we can proceed with ones only valid-as-other. + */ + sCharValidity['-'] = NAME_CHAR_VALID_NONFIRST_B; + sCharValidity['.'] = NAME_CHAR_VALID_NONFIRST_B; + sCharValidity[0xB7] = NAME_CHAR_VALID_NONFIRST_B; + for (int i = '0'; i <= '9'; ++i) { + sCharValidity[i] = NAME_CHAR_VALID_NONFIRST_B; + } + } + + /** + * Public identifiers only use 7-bit ascii range. + */ + private final static int VALID_PUBID_CHAR_COUNT = 0x80; + private final static byte[] sPubidValidity = new byte[VALID_PUBID_CHAR_COUNT]; +// private final static byte PUBID_CHAR_INVALID_B = (byte) 0; + private final static byte PUBID_CHAR_VALID_B = (byte) 1; + static { + for (int i = 0, last = ('z' - 'a'); i <= last; ++i) { + sPubidValidity['A' + i] = PUBID_CHAR_VALID_B; + sPubidValidity['a' + i] = PUBID_CHAR_VALID_B; + } + for (int i = '0'; i <= '9'; ++i) { + sPubidValidity[i] = PUBID_CHAR_VALID_B; + } + + // 3 main white space types are valid + sPubidValidity[0x0A] = PUBID_CHAR_VALID_B; + sPubidValidity[0x0D] = PUBID_CHAR_VALID_B; + sPubidValidity[0x20] = PUBID_CHAR_VALID_B; + + // And many of punctuation/separator ascii chars too: + sPubidValidity['-'] = PUBID_CHAR_VALID_B; + sPubidValidity['\''] = PUBID_CHAR_VALID_B; + sPubidValidity['('] = PUBID_CHAR_VALID_B; + sPubidValidity[')'] = PUBID_CHAR_VALID_B; + sPubidValidity['+'] = PUBID_CHAR_VALID_B; + sPubidValidity[','] = PUBID_CHAR_VALID_B; + sPubidValidity['.'] = PUBID_CHAR_VALID_B; + sPubidValidity['/'] = PUBID_CHAR_VALID_B; + sPubidValidity[':'] = PUBID_CHAR_VALID_B; + sPubidValidity['='] = PUBID_CHAR_VALID_B; + sPubidValidity['?'] = PUBID_CHAR_VALID_B; + sPubidValidity[';'] = PUBID_CHAR_VALID_B; + sPubidValidity['!'] = PUBID_CHAR_VALID_B; + sPubidValidity['*'] = PUBID_CHAR_VALID_B; + sPubidValidity['#'] = PUBID_CHAR_VALID_B; + sPubidValidity['@'] = PUBID_CHAR_VALID_B; + sPubidValidity['$'] = PUBID_CHAR_VALID_B; + sPubidValidity['_'] = PUBID_CHAR_VALID_B; + sPubidValidity['%'] = PUBID_CHAR_VALID_B; + } + + /* + /////////////////////////////////////////////////////////// + // Basic configuration + /////////////////////////////////////////////////////////// + */ + + /** + * Copy of the configuration object passed by the factory. + * Contains immutable settings for this reader (or in case + * of DTD parsers, reader that uses it) + */ + protected final ReaderConfig mConfig; + + // // // Various extracted settings: + + /** + * If true, Reader is namespace aware, and should do basic checks + * (usually enforcing limitations on having colons in names) + */ + protected final boolean mCfgNsEnabled; + + // Extracted standard on/off settings: + + /** + * note: left non-final on purpose: sub-class may need to modify + * the default value after construction. + */ + protected boolean mCfgReplaceEntities; + + /* + /////////////////////////////////////////////////////////// + // Symbol handling, if applicable + /////////////////////////////////////////////////////////// + */ + + final SymbolTable mSymbols; + + /** + * Local full name for the event, if it has one (note: element events + * do NOT use this variable; those names are stored in element stack): + * target for processing instructions. + *

+ * Currently used for proc. instr. target, and entity name (at least + * when current entity reference is null). + *

+ * Note: this variable is generally not cleared, since it comes from + * a symbol table, ie. this won't be the only reference. + */ + protected String mCurrName; + + /* + /////////////////////////////////////////////////////////// + // Input handling + /////////////////////////////////////////////////////////// + */ + + /** + * Currently active input source; contains link to parent (nesting) input + * sources, if any. + */ + protected WstxInputSource mInput; + + /** + * Top-most input source this reader can use; due to input source + * chaining, this is not necessarily the root of all input; for example, + * external DTD subset reader's root input still has original document + * input as its parent. + */ + protected final WstxInputSource mRootInput; + + /** + * Custom resolver used to handle external entities that are to be expanded + * by this reader (external param/general entity expander) + */ + XMLResolver mEntityResolver = null; + + /** + * This is the current depth of the input stack (same as what input + * element stack would return as its depth). + * It is used to enforce input scope constraints for nesting of + * elements (for xml reader) and dtd declaration (for dtd reader) + * with regards to input block (entity expansion) boundaries. + *

+ * Basically this value is compared to {@link #mInputTopDepth}, which + * indicates what was the depth at the point where the currently active + * input scope/block was started. + */ + protected int mCurrDepth = 0; + + protected int mInputTopDepth = 0; + + /** + * Flag that indicates whether linefeeds in the input data are to + * be normalized or not. + * Xml specs mandate that the line feeds are only normalized + * when they are from the external entities (main doc, external + * general/parsed entities), so normalization has to be + * suppressed when expanding internal general/parsed entities. + */ + protected boolean mNormalizeLFs; + + /* + /////////////////////////////////////////////////////////// + // Buffer(s) for local name(s) and text content + /////////////////////////////////////////////////////////// + */ + + /** + * Temporary buffer used if local name can not be just directly + * constructed from input buffer (name is on a boundary or such). + */ + protected char[] mNameBuffer = null; + + /* + /////////////////////////////////////////////////////////// + // Information about starting location of event + // Reader is pointing to; updated on-demand + /////////////////////////////////////////////////////////// + */ + + // // // Location info at point when current token was started + + /** + * Total number of characters read before start of current token. + * For big (gigabyte-sized) sizes are possible, needs to be long, + * unlike pointers and sizes related to in-memory buffers. + */ + protected long mTokenInputTotal = 0; + + /** + * Input row on which current token starts, 1-based + */ + protected int mTokenInputRow = 1; + + /** + * Column on input row that current token starts; 0-based (although + * in the end it'll be converted to 1-based) + */ + protected int mTokenInputCol = 0; + + /* + /////////////////////////////////////////////////////////// + // XML document information (from doc decl if one + // was found) common to all entities (main xml + // document, external DTD subset) + /////////////////////////////////////////////////////////// + */ + + /** + * Input stream encoding, if known (passed in, or determined by + * auto-detection); null if not. + */ + String mDocInputEncoding = null; + + /** + * Character encoding from xml declaration, if any; null if no + * declaration, or it didn't specify encoding. + */ + String mDocXmlEncoding = null; + + /** + * XML version as declared by the document; one of constants + * from {@link XmlConsts} (like {@link XmlConsts#XML_V_10}). + */ + protected int mDocXmlVersion = XmlConsts.XML_V_UNKNOWN; + + /** + * Cache of internal character entities; + */ + protected Map mCachedEntities; + + /** + * Flag for whether or not character references should be treated as entities + */ + protected boolean mCfgTreatCharRefsAsEntities; + + /** + * Entity reference stream currently points to. + */ + protected EntityDecl mCurrEntity; + + /* + /////////////////////////////////////////////////////////// + // Life-cycle + /////////////////////////////////////////////////////////// + */ + + /** + * Constructor used when creating a complete new (main-level) reader that + * does not share its input buffers or state with another reader. + */ + protected StreamScanner(WstxInputSource input, ReaderConfig cfg, + XMLResolver res) + { + super(); + mInput = input; + // 17-Jun-2004, TSa: Need to know root-level input source + mRootInput = input; + + mConfig = cfg; + mSymbols = cfg.getSymbols(); + int cf = cfg.getConfigFlags(); + mCfgNsEnabled = (cf & CFG_NAMESPACE_AWARE) != 0; + mCfgReplaceEntities = (cf & CFG_REPLACE_ENTITY_REFS) != 0; + + mNormalizeLFs = mConfig.willNormalizeLFs(); + mInputBuffer = null; + mInputPtr = mInputEnd = 0; + mEntityResolver = res; + + mCfgTreatCharRefsAsEntities = mConfig.willTreatCharRefsAsEnts(); + mCachedEntities = mCfgTreatCharRefsAsEntities ? new HashMap() : Collections.EMPTY_MAP; + } + + /* + /////////////////////////////////////////////////////////// + // Package API + /////////////////////////////////////////////////////////// + */ + + /** + * Method that returns location of the last character returned by this + * reader; that is, location "one less" than the currently pointed to + * location. + */ + protected WstxInputLocation getLastCharLocation() + { + return mInput.getLocation(mCurrInputProcessed + mInputPtr - 1, + mCurrInputRow, + mInputPtr - mCurrInputRowStart); + } + + protected URL getSource() { + return mInput.getSource(); + } + + protected String getSystemId() { + return mInput.getSystemId(); + } + + /* + /////////////////////////////////////////////////////////// + // Partial LocationInfo implementation (not implemented + // by this base class, but is by some sub-classes) + /////////////////////////////////////////////////////////// + */ + + /** + * Returns location of last properly parsed token; as per StAX specs, + * apparently needs to be the end of current event, which is the same + * as the start of the following event (or EOF if that's next). + */ + public abstract Location getLocation(); + + public XMLStreamLocation2 getStartLocation() + { + // note: +1 is used as columns are 1-based... + return mInput.getLocation(mTokenInputTotal, mTokenInputRow, + mTokenInputCol + 1); + } + + public XMLStreamLocation2 getCurrentLocation() + { + return mInput.getLocation(mCurrInputProcessed + mInputPtr, + mCurrInputRow, + mInputPtr - mCurrInputRowStart + 1); + } + + /* + /////////////////////////////////////////////////////////// + // InputProblemReporter implementation + /////////////////////////////////////////////////////////// + */ + + public WstxException throwWfcException(String msg, boolean deferErrors) + throws WstxException + { + WstxException ex = constructWfcException(msg); + if (!deferErrors) { + throw ex; + } + return ex; + } + + public void throwParseError(String msg) throws XMLStreamException + { + throwParseError(msg, null, null); + } + + /** + * Throws generic parse error with specified message and current parsing + * location. + *

+ * Note: public access only because core code in other packages needs + * to access it. + */ + public void throwParseError(String format, Object arg, Object arg2) + throws XMLStreamException + { + String msg = (arg != null || arg2 != null) ? + MessageFormat.format(format, new Object[] { arg, arg2 }) : format; + throw constructWfcException(msg); + } + + public void reportProblem(String probType, String format, Object arg, Object arg2) + throws XMLStreamException + { + XMLReporter rep = mConfig.getXMLReporter(); + if (rep != null) { + _reportProblem(rep, probType, + MessageFormat.format(format, new Object[] { arg, arg2 }), null); + } + } + + public void reportProblem(Location loc, String probType, + String format, Object arg, Object arg2) + throws XMLStreamException + { + XMLReporter rep = mConfig.getXMLReporter(); + if (rep != null) { + String msg = (arg != null || arg2 != null) ? + MessageFormat.format(format, new Object[] { arg, arg2 }) : format; + _reportProblem(rep, probType, msg, loc); + } + } + + protected void _reportProblem(XMLReporter rep, String probType, String msg, Location loc) + throws XMLStreamException + { + if (loc == null) { + loc = getLastCharLocation(); + } + _reportProblem(rep, new XMLValidationProblem(loc, msg, XMLValidationProblem.SEVERITY_ERROR, probType)); + } + + protected void _reportProblem(XMLReporter rep, XMLValidationProblem prob) + throws XMLStreamException + { + if (rep != null) { + Location loc = prob.getLocation(); + if (loc == null) { + loc = getLastCharLocation(); + prob.setLocation(loc); + } + // Backwards-compatibility fix: add non-null type, if missing: + if (prob.getType() == null) { + prob.setType(ErrorConsts.WT_VALIDATION); + } + // [WSTX-154]: was catching and dropping thrown exception: shouldn't. + // [WTSX-157]: need to support XMLReporter2 + if (rep instanceof XMLReporter2) { + ((XMLReporter2) rep).report(prob); + } else { + rep.report(prob.getMessage(), prob.getType(), prob, loc); + } + } + } + + /** + *

+ * Note: this is the base implementation used for implementing + * ValidationContext + */ + public void reportValidationProblem(XMLValidationProblem prob) + throws XMLStreamException + { + // !!! TBI: Fail-fast vs. deferred modes? + /* For now let's implement basic functionality: warnings get + * reported via XMLReporter, errors and fatal errors result in + * immediate exceptions. + */ + /* 27-May-2008, TSa: [WSTX-153] Above is incorrect: as per Stax + * javadocs for XMLReporter, both warnings and non-fatal errors + * (which includes all validation errors) should be reported via + * XMLReporter interface, and only fatals should cause an + * immediate stream exception (by-passing reporter) + */ + if (prob.getSeverity() > XMLValidationProblem.SEVERITY_ERROR) { + throw WstxValidationException.create(prob); + } + XMLReporter rep = mConfig.getXMLReporter(); + if (rep != null) { + _reportProblem(rep, prob); + } else { + /* If no reporter, regular non-fatal errors are to be reported + * as exceptions as well, for backwards compatibility + */ + if (prob.getSeverity() >= XMLValidationProblem.SEVERITY_ERROR) { + throw WstxValidationException.create(prob); + } + } + } + + public void reportValidationProblem(String msg, int severity) + throws XMLStreamException + { + reportValidationProblem(new XMLValidationProblem(getLastCharLocation(), + msg, severity)); + } + + public void reportValidationProblem(String msg) + throws XMLStreamException + { + reportValidationProblem(new XMLValidationProblem(getLastCharLocation(), + msg, + XMLValidationProblem.SEVERITY_ERROR)); + } + + public void reportValidationProblem(Location loc, String msg) + throws XMLStreamException + { + reportValidationProblem(new XMLValidationProblem(loc, msg)); + } + + public void reportValidationProblem(String format, Object arg, Object arg2) + throws XMLStreamException + { + reportValidationProblem(MessageFormat.format(format, new Object[] { arg, arg2 })); + } + + /* + /////////////////////////////////////////////////////////// + // Other error reporting methods + /////////////////////////////////////////////////////////// + */ + + protected WstxException constructWfcException(String msg) + { + return new WstxParsingException(msg, getLastCharLocation()); + } + + /** + * Construct and return a {@link XMLStreamException} to throw + * as a result of a failed Typed Access operation (but one not + * caused by a Well-Formedness Constraint or Validation Constraint + * problem) + */ + /* + protected WstxException _constructTypeException(String msg) + { + // Hmmh. Should there be a distinct sub-type? + return new WstxParsingException(msg, getLastCharLocation()); + } + */ + + protected WstxException constructFromIOE(IOException ioe) + { + return new WstxIOException(ioe); + } + + protected WstxException constructNullCharException() + { + return new WstxUnexpectedCharException("Illegal character (NULL, unicode 0) encountered: not valid in any content", + getLastCharLocation(), CHAR_NULL); + } + + protected void throwUnexpectedChar(int i, String msg) + throws WstxException + { + char c = (char) i; + String excMsg = "Unexpected character "+getCharDesc(c)+msg; + throw new WstxUnexpectedCharException(excMsg, getLastCharLocation(), c); + } + + protected void throwNullChar() + throws WstxException + { + throw constructNullCharException(); + } + + protected void throwInvalidSpace(int i) + throws WstxException + { + throwInvalidSpace(i, false); + } + + protected WstxException throwInvalidSpace(int i, boolean deferErrors) + throws WstxException + { + char c = (char) i; + WstxException ex; + if (c == CHAR_NULL) { + ex = constructNullCharException(); + } else { + String msg = "Illegal character ("+getCharDesc(c)+")"; + if (mXml11) { + msg += " [note: in XML 1.1, it could be included via entity expansion]"; + } + ex = new WstxUnexpectedCharException(msg, getLastCharLocation(), c); + } + if (!deferErrors) { + throw ex; + } + return ex; + } + + protected void throwUnexpectedEOF(String msg) + throws WstxException + { + throw new WstxEOFException("Unexpected EOF" + +(msg == null ? "" : msg), + getLastCharLocation()); + } + + /** + * Similar to {@link #throwUnexpectedEOF}, but only indicates ending + * of an input block. Used when reading a token that can not span + * input block boundaries (ie. can not continue past end of an + * entity expansion). + */ + protected void throwUnexpectedEOB(String msg) + throws WstxException + { + throw new WstxEOFException("Unexpected end of input block" + +(msg == null ? "" : msg), + getLastCharLocation()); + } + + protected void throwFromIOE(IOException ioe) + throws WstxException + { + throw new WstxIOException(ioe); + } + + protected void throwFromStrE(XMLStreamException strex) + throws WstxException + { + if (strex instanceof WstxException) { + throw (WstxException) strex; + } + WstxException newEx = new WstxException(strex); + ExceptionUtil.setInitCause(newEx, strex); + throw newEx; + } + + /** + * Method called to report an error, when caller's signature only + * allows runtime exceptions to be thrown. + */ + protected void throwLazyError(Exception e) + { + if (e instanceof XMLStreamException) { + WstxLazyException.throwLazily((XMLStreamException) e); + } + ExceptionUtil.throwRuntimeException(e); + } + + protected String tokenTypeDesc(int type) + { + return ErrorConsts.tokenTypeDesc(type); + } + + /* + /////////////////////////////////////////////////////////// + // Input buffer handling + /////////////////////////////////////////////////////////// + */ + + /** + * Returns current input source this source uses. + *

+ * Note: public only because some implementations are on different + * package. + */ + public final WstxInputSource getCurrentInput() { + return mInput; + } + + protected final int inputInBuffer() { + return mInputEnd - mInputPtr; + } + + protected final int getNext() + throws XMLStreamException + { + if (mInputPtr >= mInputEnd) { + if (!loadMore()) { + return -1; + } + } + return (int) mInputBuffer[mInputPtr++]; + } + + /** + * Similar to {@link #getNext}, but does not advance pointer + * in input buffer. + *

+ * Note: this method only peeks within current input source; + * it does not close it and check nested input source (if any). + * This is necessary when checking keywords, since they can never + * cross input block boundary. + */ + protected final int peekNext() + throws XMLStreamException + { + if (mInputPtr >= mInputEnd) { + if (!loadMoreFromCurrent()) { + return -1; + } + } + return (int) mInputBuffer[mInputPtr]; + } + + protected final char getNextChar(String errorMsg) + throws XMLStreamException + { + if (mInputPtr >= mInputEnd) { + loadMore(errorMsg); + } + return mInputBuffer[mInputPtr++]; + } + + /** + * Similar to {@link #getNextChar}, but will not read more characters + * from parent input source(s) if the current input source doesn't + * have more content. This is often needed to prevent "runaway" content, + * such as comments that start in an entity but do not have matching + * close marker inside entity; XML specification specifically states + * such markup is not legal. + */ + protected final char getNextCharFromCurrent(String errorMsg) + throws XMLStreamException + { + if (mInputPtr >= mInputEnd) { + loadMoreFromCurrent(errorMsg); + } + return mInputBuffer[mInputPtr++]; + } + + /** + * Method that will skip through zero or more white space characters, + * and return either the character following white space, or -1 to + * indicate EOF (end of the outermost input source)/ + */ + protected final int getNextAfterWS() + throws XMLStreamException + { + if (mInputPtr >= mInputEnd) { + if (!loadMore()) { + return -1; + } + } + char c = mInputBuffer[mInputPtr++]; + while (c <= CHAR_SPACE) { + // Linefeed? + if (c == '\n' || c == '\r') { + skipCRLF(c); + } else if (c != CHAR_SPACE && c != '\t') { + throwInvalidSpace(c); + } + // Still a white space? + if (mInputPtr >= mInputEnd) { + if (!loadMore()) { + return -1; + } + } + c = mInputBuffer[mInputPtr++]; + } + return (int) c; + } + + protected final char getNextCharAfterWS(String errorMsg) + throws XMLStreamException + { + if (mInputPtr >= mInputEnd) { + loadMore(errorMsg); + } + + char c = mInputBuffer[mInputPtr++]; + while (c <= CHAR_SPACE) { + // Linefeed? + if (c == '\n' || c == '\r') { + skipCRLF(c); + } else if (c != CHAR_SPACE && c != '\t') { + throwInvalidSpace(c); + } + + // Still a white space? + if (mInputPtr >= mInputEnd) { + loadMore(errorMsg); + } + c = mInputBuffer[mInputPtr++]; + } + return c; + } + + protected final char getNextInCurrAfterWS(String errorMsg) + throws XMLStreamException + { + return getNextInCurrAfterWS(errorMsg, getNextCharFromCurrent(errorMsg)); + } + + protected final char getNextInCurrAfterWS(String errorMsg, char c) + throws XMLStreamException + { + while (c <= CHAR_SPACE) { + // Linefeed? + if (c == '\n' || c == '\r') { + skipCRLF(c); + } else if (c != CHAR_SPACE && c != '\t') { + throwInvalidSpace(c); + } + + // Still a white space? + if (mInputPtr >= mInputEnd) { + loadMoreFromCurrent(errorMsg); + } + c = mInputBuffer[mInputPtr++]; + } + return c; + } + + /** + * Method called when a CR has been spotted in input; checks if next + * char is LF, and if so, skips it. Note that next character has to + * come from the current input source, to qualify; it can never come + * from another (nested) input source. + * + * @return True, if passed in char is '\r' and next one is '\n'. + */ + protected final boolean skipCRLF(char c) + throws XMLStreamException + { + boolean result; + + if (c == '\r' && peekNext() == '\n') { + ++mInputPtr; + result = true; + } else { + result = false; + } + ++mCurrInputRow; + mCurrInputRowStart = mInputPtr; + return result; + } + + protected final void markLF() { + ++mCurrInputRow; + mCurrInputRowStart = mInputPtr; + } + + protected final void markLF(int inputPtr) { + ++mCurrInputRow; + mCurrInputRowStart = inputPtr; + } + + /** + * Method to push back last character read; can only be called once, + * that is, no more than one char can be guaranteed to be succesfully + * returned. + */ + protected final void pushback() { --mInputPtr; } + + /* + /////////////////////////////////////////////////////////// + // Sub-class overridable input handling methods + /////////////////////////////////////////////////////////// + */ + + /** + * Method called when an entity has been expanded (new input source + * has been created). Needs to initialize location information and change + * active input source. + * + * @param entityId Name of the entity being expanded + */ + protected void initInputSource(WstxInputSource newInput, boolean isExt, + String entityId) + throws XMLStreamException + { + mInput = newInput; + // Let's make sure new input will be read next time input is needed: + mInputPtr = 0; + mInputEnd = 0; + /* Plus, reset the input location so that'll be accurate for + * error reporting etc. + */ + mInputTopDepth = mCurrDepth; + mInput.initInputLocation(this, mCurrDepth); + + /* 21-Feb-2006, TSa: Linefeeds are NOT normalized when expanding + * internal entities (XML, 2.11) + */ + if (isExt) { + mNormalizeLFs = true; + } else { + mNormalizeLFs = false; + } + } + + /** + * Method that will try to read one or more characters from currently + * open input sources; closing input sources if necessary. + * + * @return true if reading succeeded (or may succeed), false if + * we reached EOF. + */ + protected boolean loadMore() + throws XMLStreamException + { + WstxInputSource input = mInput; + do { + /* Need to make sure offsets are properly updated for error + * reporting purposes, and do this now while previous amounts + * are still known. + */ + mCurrInputProcessed += mInputEnd; + mCurrInputRowStart -= mInputEnd; + int count; + try { + count = input.readInto(this); + if (count > 0) { + return true; + } + input.close(); + } catch (IOException ioe) { + throw constructFromIOE(ioe); + } + if (input == mRootInput) { + /* Note: no need to check entity/input nesting in this + * particular case, since it will be handled by higher level + * parsing code (results in an unexpected EOF) + */ + return false; + } + WstxInputSource parent = input.getParent(); + if (parent == null) { // sanity check! + throwNullParent(input); + } + /* 13-Feb-2006, TSa: Ok, do we violate a proper nesting constraints + * with this input block closure? + */ + if (mCurrDepth != input.getScopeId()) { + handleIncompleteEntityProblem(input); + } + + mInput = input = parent; + input.restoreContext(this); + mInputTopDepth = input.getScopeId(); + /* 21-Feb-2006, TSa: Since linefeed normalization needs to be + * suppressed for internal entity expansion, we may need to + * change the state... + */ + if (!mNormalizeLFs) { + mNormalizeLFs = !input.fromInternalEntity(); + } + // Maybe there are leftovers from that input in buffer now? + } while (mInputPtr >= mInputEnd); + + return true; + } + + protected final boolean loadMore(String errorMsg) + throws XMLStreamException + { + if (!loadMore()) { + throwUnexpectedEOF(errorMsg); + } + return true; + } + + protected boolean loadMoreFromCurrent() + throws XMLStreamException + { + // Need to update offsets properly + mCurrInputProcessed += mInputEnd; + mCurrInputRowStart -= mInputEnd; + try { + int count = mInput.readInto(this); + return (count > 0); + } catch (IOException ie) { + throw constructFromIOE(ie); + } + } + + protected final boolean loadMoreFromCurrent(String errorMsg) + throws XMLStreamException + { + if (!loadMoreFromCurrent()) { + throwUnexpectedEOB(errorMsg); + } + return true; + } + + /** + * Method called to make sure current main-level input buffer has at + * least specified number of characters available consequtively, + * without having to call {@link #loadMore}. It can only be called + * when input comes from main-level buffer; further, call can shift + * content in input buffer, so caller has to flush any data still + * pending. In short, caller has to know exactly what it's doing. :-) + *

+ * Note: method does not check for any other input sources than the + * current one -- if current source can not fulfill the request, a + * failure is indicated. + * + * @return true if there's now enough data; false if not (EOF) + */ + protected boolean ensureInput(int minAmount) + throws XMLStreamException + { + int currAmount = mInputEnd - mInputPtr; + if (currAmount >= minAmount) { + return true; + } + try { + return mInput.readMore(this, minAmount); + } catch (IOException ie) { + throw constructFromIOE(ie); + } + } + + protected void closeAllInput(boolean force) + throws XMLStreamException + { + WstxInputSource input = mInput; + while (true) { + try { + if (force) { + input.closeCompletely(); + } else { + input.close(); + } + } catch (IOException ie) { + throw constructFromIOE(ie); + } + if (input == mRootInput) { + break; + } + WstxInputSource parent = input.getParent(); + if (parent == null) { // sanity check! + throwNullParent(input); + } + mInput = input = parent; + } + } + + protected void throwNullParent(WstxInputSource curr) + { + throw new IllegalStateException(ErrorConsts.ERR_INTERNAL); + //throw new IllegalStateException("Internal error: null parent for input source '"+curr+"'; should never occur (should have stopped at root input '"+mRootInput+"')."); + } + + /* + /////////////////////////////////////////////////////////// + // Entity resolution + /////////////////////////////////////////////////////////// + */ + + /** + * Method that tries to resolve a character entity, or (if caller so + * specifies), a pre-defined internal entity (lt, gt, amp, apos, quot). + * It will succeed iff: + *

    + *
  1. Entity in question is a simple character entity (either one of + * 5 pre-defined ones, or using decimal/hex notation), AND + *
  2. + *
  3. Entity fits completely inside current input buffer. + *
  4. + *
+ * If so, character value of entity is returned. Character 0 is returned + * otherwise; if so, caller needs to do full resolution. + *

+ * Note: On entry we are guaranteed there are at least 3 more characters + * in this buffer; otherwise we shouldn't be called. + * + * @param checkStd If true, will check pre-defined internal entities + * (gt, lt, amp, apos, quot); if false, will only check actual + * character entities. + * + * @return (Valid) character value, if entity is a character reference, + * and could be resolved from current input buffer (does not span + * buffer boundary); null char (code 0) if not (either non-char + * entity, or spans input buffer boundary). + */ + protected int resolveSimpleEntity(boolean checkStd) + throws XMLStreamException + { + char[] buf = mInputBuffer; + int ptr = mInputPtr; + char c = buf[ptr++]; + + // Numeric reference? + if (c == '#') { + c = buf[ptr++]; + int value = 0; + int inputLen = mInputEnd; + if (c == 'x') { // hex + while (ptr < inputLen) { + c = buf[ptr++]; + if (c == ';') { + break; + } + value = value << 4; + if (c <= '9' && c >= '0') { + value += (c - '0'); + } else if (c >= 'a' && c <= 'f') { + value += (10 + (c - 'a')); + } else if (c >= 'A' && c <= 'F') { + value += (10 + (c - 'A')); + } else { + mInputPtr = ptr; // so error points to correct char + throwUnexpectedChar(c, "; expected a hex digit (0-9a-fA-F)."); + } + /* Need to check for overflow; easiest to do right as + * it happens... + */ + if (value > MAX_UNICODE_CHAR) { + reportUnicodeOverflow(); + } + } + } else { // numeric (decimal) + while (c != ';') { + if (c <= '9' && c >= '0') { + value = (value * 10) + (c - '0'); + // Overflow? + if (value > MAX_UNICODE_CHAR) { + reportUnicodeOverflow(); + } + } else { + mInputPtr = ptr; // so error points to correct char + throwUnexpectedChar(c, "; expected a decimal number."); + } + if (ptr >= inputLen) { + break; + } + c = buf[ptr++]; + } + } + /* We get here either if we got it all, OR if we ran out of + * input in current buffer. + */ + if (c == ';') { // got the full thing + mInputPtr = ptr; + validateChar(value); + return value; + } + + /* If we ran out of input, need to just fall back, gets + * resolved via 'full' resolution mechanism. + */ + } else if (checkStd) { + /* Caller may not want to resolve these quite yet... + * (when it wants separate events for non-char entities) + */ + if (c == 'a') { // amp or apos? + c = buf[ptr++]; + + if (c == 'm') { // amp? + if (buf[ptr++] == 'p') { + if (ptr < mInputEnd && buf[ptr++] == ';') { + mInputPtr = ptr; + return '&'; + } + } + } else if (c == 'p') { // apos? + if (buf[ptr++] == 'o') { + int len = mInputEnd; + if (ptr < len && buf[ptr++] == 's') { + if (ptr < len && buf[ptr++] == ';') { + mInputPtr = ptr; + return '\''; + } + } + } + } + } else if (c == 'g') { // gt? + if (buf[ptr++] == 't' && buf[ptr++] == ';') { + mInputPtr = ptr; + return '>'; + } + } else if (c == 'l') { // lt? + if (buf[ptr++] == 't' && buf[ptr++] == ';') { + mInputPtr = ptr; + return '<'; + } + } else if (c == 'q') { // quot? + if (buf[ptr++] == 'u' && buf[ptr++] == 'o') { + int len = mInputEnd; + if (ptr < len && buf[ptr++] == 't') { + if (ptr < len && buf[ptr++] == ';') { + mInputPtr = ptr; + return '"'; + } + } + } + } + } + return 0; + } + + /** + * Method called to resolve character entities, and only character + * entities (except that pre-defined char entities -- amp, apos, lt, + * gt, quote -- MAY be "char entities" in this sense, depending on + * arguments). + * Otherwise it is to return the null char; if so, + * the input pointer will point to the same point as when method + * entered (char after ampersand), plus the ampersand itself is + * guaranteed to be in the input buffer (so caller can just push it + * back if necessary). + *

+ * Most often this method is called when reader is not to expand + * non-char entities automatically, but to return them as separate + * events. + *

+ * Main complication here is that we need to do 5-char lookahead. This + * is problematic if chars are on input buffer boundary. This is ok + * for the root level input buffer, but not for some nested buffers. + * However, according to XML specs, such split entities are actually + * illegal... so we can throw an exception in those cases. + * + * @param checkStd If true, will check pre-defined internal entities + * (gt, lt, amp, apos, quot) as character entities; if false, will only + * check actual 'real' character entities. + * + * @return (Valid) character value, if entity is a character reference, + * and could be resolved from current input buffer (does not span + * buffer boundary); null char (code 0) if not (either non-char + * entity, or spans input buffer boundary). + */ + protected int resolveCharOnlyEntity(boolean checkStd) + throws XMLStreamException + { + //int avail = inputInBuffer(); + int avail = mInputEnd - mInputPtr; + if (avail < 6) { + // split entity, or buffer boundary + /* Don't want to lose leading '&' (in case we can not expand + * the entity), so let's push it back first + */ + --mInputPtr; + /* Shortest valid reference would be 3 chars ('&a;'); which + * would only be legal from an expanded entity... + */ + if (!ensureInput(6)) { + avail = inputInBuffer(); + if (avail < 3) { + throwUnexpectedEOF(SUFFIX_IN_ENTITY_REF); + } + } else { + avail = 6; + } + // ... and now we can move pointer back as well: + ++mInputPtr; + } + + /* Ok, now we have one more character to check, and that's enough + * to determine type decisively. + */ + char c = mInputBuffer[mInputPtr]; + + // A char reference? + if (c == '#') { // yup + ++mInputPtr; + return resolveCharEnt(null); + } + + // nope... except may be a pre-def? + if (checkStd) { + if (c == 'a') { + char d = mInputBuffer[mInputPtr+1]; + if (d == 'm') { + if (avail >= 4 + && mInputBuffer[mInputPtr+2] == 'p' + && mInputBuffer[mInputPtr+3] == ';') { + mInputPtr += 4; + return '&'; + } + } else if (d == 'p') { + if (avail >= 5 + && mInputBuffer[mInputPtr+2] == 'o' + && mInputBuffer[mInputPtr+3] == 's' + && mInputBuffer[mInputPtr+4] == ';') { + mInputPtr += 5; + return '\''; + } + } + } else if (c == 'l') { + if (avail >= 3 + && mInputBuffer[mInputPtr+1] == 't' + && mInputBuffer[mInputPtr+2] == ';') { + mInputPtr += 3; + return '<'; + } + } else if (c == 'g') { + if (avail >= 3 + && mInputBuffer[mInputPtr+1] == 't' + && mInputBuffer[mInputPtr+2] == ';') { + mInputPtr += 3; + return '>'; + } + } else if (c == 'q') { + if (avail >= 5 + && mInputBuffer[mInputPtr+1] == 'u' + && mInputBuffer[mInputPtr+2] == 'o' + && mInputBuffer[mInputPtr+3] == 't' + && mInputBuffer[mInputPtr+4] == ';') { + mInputPtr += 5; + return '"'; + } + } + } + return 0; + } + + /** + * Reverse of {@link #resolveCharOnlyEntity}; will only resolve entity + * if it is NOT a character entity (or pre-defined 'generic' entity; + * amp, apos, lt, gt or quot). Only used in cases where entities + * are to be separately returned unexpanded (in non-entity-replacing + * mode); which means it's never called from dtd handler. + */ + protected EntityDecl resolveNonCharEntity() + throws XMLStreamException + { + //int avail = inputInBuffer(); + int avail = mInputEnd - mInputPtr; + if (avail < 6) { + // split entity, or buffer boundary + /* Don't want to lose leading '&' (in case we can not expand + * the entity), so let's push it back first + */ + --mInputPtr; + + /* Shortest valid reference would be 3 chars ('&a;'); which + * would only be legal from an expanded entity... + */ + if (!ensureInput(6)) { + avail = inputInBuffer(); + if (avail < 3) { + throwUnexpectedEOF(SUFFIX_IN_ENTITY_REF); + } + } else { + avail = 6; + } + // ... and now we can move pointer back as well: + ++mInputPtr; + } + + // We don't care about char entities: + char c = mInputBuffer[mInputPtr]; + if (c == '#') { + return null; + } + + /* 19-Aug-2004, TSa: Need special handling for pre-defined + * entities; they are not counted as 'real' general parsed + * entities, but more as character entities... + */ + + // have chars at least up to mInputPtr+4 by now + if (c == 'a') { + char d = mInputBuffer[mInputPtr+1]; + if (d == 'm') { + if (avail >= 4 + && mInputBuffer[mInputPtr+2] == 'p' + && mInputBuffer[mInputPtr+3] == ';') { + // If not automatically expanding: + //return sEntityAmp; + // mInputPtr += 4; + return null; + } + } else if (d == 'p') { + if (avail >= 5 + && mInputBuffer[mInputPtr+2] == 'o' + && mInputBuffer[mInputPtr+3] == 's' + && mInputBuffer[mInputPtr+4] == ';') { + return null; + } + } + } else if (c == 'l') { + if (avail >= 3 + && mInputBuffer[mInputPtr+1] == 't' + && mInputBuffer[mInputPtr+2] == ';') { + return null; + } + } else if (c == 'g') { + if (avail >= 3 + && mInputBuffer[mInputPtr+1] == 't' + && mInputBuffer[mInputPtr+2] == ';') { + return null; + } + } else if (c == 'q') { + if (avail >= 5 + && mInputBuffer[mInputPtr+1] == 'u' + && mInputBuffer[mInputPtr+2] == 'o' + && mInputBuffer[mInputPtr+3] == 't' + && mInputBuffer[mInputPtr+4] == ';') { + return null; + } + } + + // Otherwise, let's just parse in generic way: + ++mInputPtr; // since we already read the first letter + String id = parseEntityName(c); + mCurrName = id; + + return findEntity(id, null); + } + + /** + * Method that does full resolution of an entity reference, be it + * character entity, internal entity or external entity, including + * updating of input buffers, and depending on whether result is + * a character entity (or one of 5 pre-defined entities), returns + * char in question, or null character (code 0) to indicate it had + * to change input source. + * + * @param allowExt If true, is allowed to expand external entities + * (expanding text); if false, is not (expanding attribute value). + * + * @return Either single-character replacement (which is NOT to be + * reparsed), or null char (0) to indicate expansion is done via + * input source. + */ + protected int fullyResolveEntity(boolean allowExt) + throws XMLStreamException + { + char c = getNextCharFromCurrent(SUFFIX_IN_ENTITY_REF); + // Do we have a (numeric) character entity reference? + if (c == '#') { // numeric + final StringBuffer originalSurface = new StringBuffer("#"); + int ch = resolveCharEnt(originalSurface); + if (mCfgTreatCharRefsAsEntities) { + final char[] originalChars = new char[originalSurface.length()]; + originalSurface.getChars(0, originalSurface.length(), originalChars, 0); + mCurrEntity = getIntEntity(ch, originalChars); + return 0; + } + return ch; + } + + String id = parseEntityName(c); + + // Perhaps we have a pre-defined char reference? + c = id.charAt(0); + /* + * 16-May-2004, TSa: Should custom entities (or ones defined in int/ext subset) override + * pre-defined settings for these? + */ + char d = CHAR_NULL; + if (c == 'a') { // amp or apos? + if (id.equals("amp")) { + d = '&'; + } else if (id.equals("apos")) { + d = '\''; + } + } else if (c == 'g') { // gt? + if (id.length() == 2 && id.charAt(1) == 't') { + d = '>'; + } + } else if (c == 'l') { // lt? + if (id.length() == 2 && id.charAt(1) == 't') { + d = '<'; + } + } else if (c == 'q') { // quot? + if (id.equals("quot")) { + d = '"'; + } + } + + if (d != CHAR_NULL) { + if (mCfgTreatCharRefsAsEntities) { + final char[] originalChars = new char[id.length()]; + id.getChars(0, id.length(), originalChars, 0); + mCurrEntity = getIntEntity(d, originalChars); + return 0; + } + return d; + } + + final EntityDecl e = expandEntity(id, allowExt, null); + if (mCfgTreatCharRefsAsEntities) { + mCurrEntity = e; + } + return 0; + } + + /** + * Returns an entity (possibly from cache) for the argument character using the encoded + * representation in mInputBuffer[entityStartPos ... mInputPtr-1]. + */ + protected EntityDecl getIntEntity(int ch, final char[] originalChars) + { + String cacheKey = new String(originalChars); + + IntEntity entity = (IntEntity) mCachedEntities.get(cacheKey); + if (entity == null) { + String repl; + if (ch <= 0xFFFF) { + repl = Character.toString((char) ch); + } else { + StringBuffer sb = new StringBuffer(2); + ch -= 0x10000; + sb.append((char) ((ch >> 10) + 0xD800)); + sb.append((char) ((ch & 0x3FF) + 0xDC00)); + repl = sb.toString(); + } + entity = IntEntity.create(new String(originalChars), repl); + mCachedEntities.put(cacheKey, entity); + } + return entity; + } + + + /** + * Helper method that will try to expand a parsed entity (parameter or + * generic entity). + *

+ * note: called by sub-classes (dtd parser), needs to be protected. + * + * @param id Name of the entity being expanded + * @param allowExt Whether external entities can be expanded or not; if + * not, and the entity to expand would be external one, an exception + * will be thrown + */ + protected EntityDecl expandEntity(String id, boolean allowExt, + Object extraArg) + throws XMLStreamException + { + mCurrName = id; + + EntityDecl ed = findEntity(id, extraArg); + + if (ed == null) { + /* 30-Sep-2005, TSa: As per [WSTX-5], let's only throw exception + * if we have to resolve it (otherwise it's just best-effort, + * and null is ok) + */ + /* 02-Oct-2005, TSa: Plus, [WSTX-4] adds "undeclared entity + * resolver" + */ + if (mCfgReplaceEntities) { + mCurrEntity = expandUnresolvedEntity(id); + } + return null; + } + + if (!mCfgTreatCharRefsAsEntities || this instanceof MinimalDTDReader) { + expandEntity(ed, allowExt); + } + + return ed; + } + + /** + * + *

+ * note: defined as private for documentation, ie. it's just called + * from within this class (not sub-classes), from one specific method + * (see above) + * + * @param ed Entity to be expanded + * @param allowExt Whether external entities are allowed or not. + */ + private void expandEntity(EntityDecl ed, boolean allowExt) + throws XMLStreamException + { + String id = ed.getName(); + + /* Very first thing; we can immediately check if expanding + * this entity would result in infinite recursion: + */ + if (mInput.isOrIsExpandedFrom(id)) { + throwRecursionError(id); + } + + /* Should not refer unparsed entities from attribute values + * or text content (except via notation mechanism, but that's + * not parsed here) + */ + if (!ed.isParsed()) { + throwParseError("Illegal reference to unparsed external entity \"{0}\"", id, null); + } + + // 28-Jun-2004, TSa: Do we support external entity expansion? + boolean isExt = ed.isExternal(); + if (isExt) { + if (!allowExt) { // never ok in attribute value... + throwParseError("Encountered a reference to external parsed entity \"{0}\" when expanding attribute value: not legal as per XML 1.0/1.1 #3.1", id, null); + } + if (!mConfig.willSupportExternalEntities()) { + throwParseError("Encountered a reference to external entity \"{0}\", but stream reader has feature \"{1}\" disabled", + id, XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES); + } + } + + // First, let's give current context chance to save its stuff + WstxInputSource oldInput = mInput; + oldInput.saveContext(this); + WstxInputSource newInput = null; + try { + newInput = ed.expand(oldInput, mEntityResolver, mConfig, mDocXmlVersion); + } catch (FileNotFoundException fex) { + /* Let's catch and rethrow this just so we get more meaningful + * description (with input source position etc) + */ + throwParseError("(was {0}) {1}", fex.getClass().getName(), fex.getMessage()); + } catch (IOException ioe) { + throw constructFromIOE(ioe); + } + /* And then we'll need to make sure new input comes from the new + * input source + */ + initInputSource(newInput, isExt, id); + } + + /** + *

+ * note: only called from the local expandEntity() method + */ + private EntityDecl expandUnresolvedEntity(String id) + throws XMLStreamException + { + XMLResolver resolver = mConfig.getUndeclaredEntityResolver(); + if (resolver != null) { + /* Ok, we can check for recursion here; but let's only do that + * if there is any chance that it might get resolved by + * the special resolver (it must have been resolved this way + * earlier, too...) + */ + if (mInput.isOrIsExpandedFrom(id)) { + throwRecursionError(id); + } + + WstxInputSource oldInput = mInput; + oldInput.saveContext(this); + // null, null -> no public or system ids + int xmlVersion = mDocXmlVersion; + // 05-Feb-2006, TSa: If xmlVersion not explicitly known, defaults to 1.0 + if (xmlVersion == XmlConsts.XML_V_UNKNOWN) { + xmlVersion = XmlConsts.XML_V_10; + } + WstxInputSource newInput; + try { + newInput = DefaultInputResolver.resolveEntityUsing + (oldInput, id, null, null, resolver, mConfig, xmlVersion); + if (mCfgTreatCharRefsAsEntities) { + return new IntEntity(WstxInputLocation.getEmptyLocation(), newInput.getEntityId(), + newInput.getSource(), new char[]{}, WstxInputLocation.getEmptyLocation()); + } + } catch (IOException ioe) { + throw constructFromIOE(ioe); + } + if (newInput != null) { + // true -> is external + initInputSource(newInput, true, id); + return null; + } + } + handleUndeclaredEntity(id); + return null; + } + + /* + /////////////////////////////////////////////////////////// + // Abstract methods for sub-classes to implement + /////////////////////////////////////////////////////////// + */ + + /** + * Abstract method for sub-classes to implement, for finding + * a declared general or parsed entity. + * + * @param id Identifier of the entity to find + * @param arg Optional argument passed from caller; needed by DTD + * reader. + */ + protected abstract EntityDecl findEntity(String id, Object arg) + throws XMLStreamException; + + /** + * This method gets called if a declaration for an entity was not + * found in entity expanding mode (enabled by default for xml reader, + * always enabled for dtd reader). + */ + protected abstract void handleUndeclaredEntity(String id) + throws XMLStreamException; + + protected abstract void handleIncompleteEntityProblem(WstxInputSource closing) + throws XMLStreamException; + + /* + /////////////////////////////////////////////////////////// + // Basic tokenization + /////////////////////////////////////////////////////////// + */ + + /** + * Method that will parse name token (roughly equivalent to XML specs; + * although bit lenier for more efficient handling); either uri prefix, + * or local name. + *

+ * Much of complexity in this method has to do with the intention to + * try to avoid any character copies. In this optimal case algorithm + * would be fairly simple. However, this only works if all data is + * already in input buffer... if not, copy has to be made halfway + * through parsing, and that complicates things. + *

+ * One thing to note is that String returned has been canonicalized + * and (if necessary) added to symbol table. It can thus be compared + * against other such (usually id) Strings, with simple equality operator. + * + * @param c First character of the name; not yet checked for validity + * + * @return Canonicalized name String (which may have length 0, if + * EOF or non-name-start char encountered) + */ + protected String parseLocalName(char c) + throws XMLStreamException + { + /* Has to start with letter, or '_' (etc); we won't allow ':' as that + * is taken as namespace separator; no use trying to optimize + * heavily as it's 98% likely it is a valid char... + */ + if (!isNameStartChar(c)) { + if (c == ':') { + throwUnexpectedChar(c, " (missing namespace prefix?)"); + } + throwUnexpectedChar(c, " (expected a name start character)"); + } + + int ptr = mInputPtr; + int hash = (int) c; + final int inputLen = mInputEnd; + int startPtr = ptr-1; // already read previous char + final char[] inputBuf = mInputBuffer; + + /* After which there may be zero or more name chars + * we have to consider + */ + while (true) { + if (ptr >= inputLen) { + /* Ok, identifier may continue past buffer end, need + * to continue with part 2 (separate method, as this is + * not as common as having it all in buffer) + */ + mInputPtr = ptr; + return parseLocalName2(startPtr, hash); + } + // Ok, we have the char... is it a name char? + c = inputBuf[ptr]; + if (c < CHAR_LOWEST_LEGAL_LOCALNAME_CHAR) { + break; + } + if (!isNameChar(c)) { + break; + } + hash = (hash * 31) + (int) c; + ++ptr; + } + mInputPtr = ptr; + return mSymbols.findSymbol(mInputBuffer, startPtr, ptr - startPtr, hash); + } + + /** + * Second part of name token parsing; called when name can continue + * past input buffer end (so only part was read before calling this + * method to read the rest). + *

+ * Note that this isn't heavily optimized, on assumption it's not + * called very often. + */ + protected String parseLocalName2(int start, int hash) + throws XMLStreamException + { + int ptr = mInputEnd - start; + // Let's assume fairly short names + char[] outBuf = getNameBuffer(ptr+8); + + if (ptr > 0) { + System.arraycopy(mInputBuffer, start, outBuf, 0, ptr); + } + + int outLen = outBuf.length; + while (true) { + // note: names can not cross input block (entity) boundaries... + if (mInputPtr >= mInputEnd) { + if (!loadMoreFromCurrent()) { + break; + } + } + char c = mInputBuffer[mInputPtr]; + if (c < CHAR_LOWEST_LEGAL_LOCALNAME_CHAR) { + break; + } + if (!isNameChar(c)) { + break; + } + ++mInputPtr; + if (ptr >= outLen) { + mNameBuffer = outBuf = expandBy50Pct(outBuf); + outLen = outBuf.length; + } + outBuf[ptr++] = c; + hash = (hash * 31) + (int) c; + } + // Still need to canonicalize the name: + return mSymbols.findSymbol(outBuf, 0, ptr, hash); + } + + /** + * Method that will parse 'full' name token; what full means depends on + * whether reader is namespace aware or not. If it is, full name means + * local name with no namespace prefix (PI target, entity/notation name); + * if not, name can contain arbitrary number of colons. Note that + * element and attribute names are NOT parsed here, so actual namespace + * prefix separation can be handled properly there. + *

+ * Similar to {@link #parseLocalName}, much of complexity stems from + * trying to avoid copying name characters from input buffer. + *

+ * Note that returned String will be canonicalized, similar to + * {@link #parseLocalName}, but without separating prefix/local name. + * + * @return Canonicalized name String (which may have length 0, if + * EOF or non-name-start char encountered) + */ + protected String parseFullName() + throws XMLStreamException + { + if (mInputPtr >= mInputEnd) { + loadMoreFromCurrent(); + } + return parseFullName(mInputBuffer[mInputPtr++]); + } + + protected String parseFullName(char c) + throws XMLStreamException + { + // First char has special handling: + if (!isNameStartChar(c)) { + if (c == ':') { // no name.... generally an error: + if (mCfgNsEnabled) { + throwNsColonException(parseFNameForError()); + } + // Ok, that's fine actually + } else { + if (c <= CHAR_SPACE) { + throwUnexpectedChar(c, " (missing name?)"); + } + throwUnexpectedChar(c, " (expected a name start character)"); + } + } + + int ptr = mInputPtr; + int hash = (int) c; + int inputLen = mInputEnd; + int startPtr = ptr-1; // to account for the first char + + /* After which there may be zero or more name chars + * we have to consider + */ + while (true) { + if (ptr >= inputLen) { + /* Ok, identifier may continue past buffer end, need + * to continue with part 2 (separate method, as this is + * not as common as having it all in buffer) + */ + mInputPtr = ptr; + return parseFullName2(startPtr, hash); + } + c = mInputBuffer[ptr]; + if (c == ':') { // colon only allowed in non-NS mode + if (mCfgNsEnabled) { + mInputPtr = ptr; + throwNsColonException(new String(mInputBuffer, startPtr, ptr - startPtr) + parseFNameForError()); + } + } else { + if (c < CHAR_LOWEST_LEGAL_LOCALNAME_CHAR) { + break; + } + if (!isNameChar(c)) { + break; + } + } + hash = (hash * 31) + (int) c; + ++ptr; + } + mInputPtr = ptr; + return mSymbols.findSymbol(mInputBuffer, startPtr, ptr - startPtr, hash); + } + + protected String parseFullName2(int start, int hash) + throws XMLStreamException + { + int ptr = mInputEnd - start; + // Let's assume fairly short names + char[] outBuf = getNameBuffer(ptr+8); + + if (ptr > 0) { + System.arraycopy(mInputBuffer, start, outBuf, 0, ptr); + } + + int outLen = outBuf.length; + while (true) { + /* 06-Sep-2004, TSa: Name tokens are not allowed to continue + * past entity expansion ranges... that is, all characters + * have to come from the same input source. Thus, let's only + * load things from same input level + */ + if (mInputPtr >= mInputEnd) { + if (!loadMoreFromCurrent()) { + break; + } + } + char c = mInputBuffer[mInputPtr]; + if (c == ':') { // colon only allowed in non-NS mode + if (mCfgNsEnabled) { + throwNsColonException(new String(outBuf, 0, ptr) + c + parseFNameForError()); + } + } else if (c < CHAR_LOWEST_LEGAL_LOCALNAME_CHAR) { + break; + } else if (!isNameChar(c)) { + break; + } + ++mInputPtr; + + if (ptr >= outLen) { + mNameBuffer = outBuf = expandBy50Pct(outBuf); + outLen = outBuf.length; + } + outBuf[ptr++] = c; + hash = (hash * 31) + (int) c; + } + + // Still need to canonicalize the name: + return mSymbols.findSymbol(outBuf, 0, ptr, hash); + } + + /** + * Method called to read in full name, including unlimited number of + * namespace separators (':'), for the purpose of displaying name in + * an error message. Won't do any further validations, and parsing + * is not optimized: main need is just to get more meaningful error + * messages. + */ + protected String parseFNameForError() + throws XMLStreamException + { + StringBuffer sb = new StringBuffer(100); + while (true) { + char c; + + if (mInputPtr < mInputEnd) { + c = mInputBuffer[mInputPtr++]; + } else { // can't error here, so let's accept EOF for now: + int i = getNext(); + if (i < 0) { + break; + } + c = (char) i; + } + if (c != ':' && !isNameChar(c)) { + --mInputPtr; + break; + } + sb.append(c); + } + return sb.toString(); + } + + protected final String parseEntityName(char c) + throws XMLStreamException + { + String id = parseFullName(c); + // Needs to be followed by a semi-colon, too.. from same input source: + if (mInputPtr >= mInputEnd) { + if (!loadMoreFromCurrent()) { + throwParseError("Missing semicolon after reference for entity \"{0}\"", id, null); + } + } + c = mInputBuffer[mInputPtr++]; + if (c != ';') { + throwUnexpectedChar(c, "; expected a semi-colon after the reference for entity '"+id+"'"); + } + return id; + } + + /** + * Note: does not check for number of colons, amongst other things. + * Main idea is to skip through what superficially seems like a valid + * id, nothing more. This is only done when really skipping through + * something we do not care about at all: not even whether names/ids + * would be valid (for example, when ignoring internal DTD subset). + * + * @return Length of skipped name. + */ + protected int skipFullName(char c) + throws XMLStreamException + { + if (!isNameStartChar(c)) { + --mInputPtr; + return 0; + } + + /* After which there may be zero or more name chars + * we have to consider + */ + int count = 1; + while (true) { + c = (mInputPtr < mInputEnd) ? + mInputBuffer[mInputPtr++] : getNextChar(SUFFIX_EOF_EXP_NAME); + if (c != ':' && !isNameChar(c)) { + break; + } + ++count; + } + return count; + } + + /** + * Simple parsing method that parses system ids, which are generally + * used in entities (from DOCTYPE declaration to internal/external + * subsets). + *

+ * NOTE: returned String is not canonicalized, on assumption that + * external ids may be longish, and are not shared all that often, as + * they are generally just used for resolving paths, if anything. + *
+ * Also note that this method is not heavily optimized, as it's not + * likely to be a bottleneck for parsing. + */ + protected final String parseSystemId(char quoteChar, boolean convertLFs, + String errorMsg) + throws XMLStreamException + { + char[] buf = getNameBuffer(-1); + int ptr = 0; + + while (true) { + char c = (mInputPtr < mInputEnd) ? + mInputBuffer[mInputPtr++] : getNextChar(errorMsg); + if (c == quoteChar) { + break; + } + /* ??? 14-Jun-2004, TSa: Should we normalize linefeeds or not? + * It seems like we should, for all input... so that's the way it + * works. + */ + if (c == '\n') { + markLF(); + } else if (c == '\r') { + if (peekNext() == '\n') { + ++mInputPtr; + if (!convertLFs) { + /* The only tricky thing; need to preserve 2-char LF; need to + * output one char from here, then can fall back to default: + */ + if (ptr >= buf.length) { + buf = expandBy50Pct(buf); + } + buf[ptr++] = '\r'; + } + c = '\n'; + } else if (convertLFs) { + c = '\n'; + } + } + + // Other than that, let's just append it: + if (ptr >= buf.length) { + buf = expandBy50Pct(buf); + } + buf[ptr++] = c; + } + + return (ptr == 0) ? "" : new String(buf, 0, ptr); + } + + /** + * Simple parsing method that parses system ids, which are generally + * used in entities (from DOCTYPE declaration to internal/external + * subsets). + *

+ * As per xml specs, the contents are actually normalized. + *

+ * NOTE: returned String is not canonicalized, on assumption that + * external ids may be longish, and are not shared all that often, as + * they are generally just used for resolving paths, if anything. + *
+ * Also note that this method is not heavily optimized, as it's not + * likely to be a bottleneck for parsing. + */ + protected final String parsePublicId(char quoteChar, String errorMsg) + throws XMLStreamException + { + char[] buf = getNameBuffer(-1); + int ptr = 0; + boolean spaceToAdd = false; + + while (true) { + char c = (mInputPtr < mInputEnd) ? + mInputBuffer[mInputPtr++] : getNextChar(errorMsg); + if (c == quoteChar) { + break; + } + if (c == '\n') { + markLF(); + spaceToAdd = true; + continue; + } else if (c == '\r') { + if (peekNext() == '\n') { + ++mInputPtr; + } + spaceToAdd = true; + continue; + } else if (c == CHAR_SPACE) { + spaceToAdd = true; + continue; + } else { + // Verify it's a legal pubid char (see XML spec, #13, from 2.3) + if ((c >= VALID_PUBID_CHAR_COUNT) + || sPubidValidity[c] != PUBID_CHAR_VALID_B) { + throwUnexpectedChar(c, " in public identifier"); + } + } + + // Other than that, let's just append it: + if (ptr >= buf.length) { + buf = expandBy50Pct(buf); + } + /* Space-normalization means scrapping leading and trailing + * white space, and coalescing remaining ws into single spaces. + */ + if (spaceToAdd) { // pending white space to add? + if (c == CHAR_SPACE) { // still a space; let's skip + continue; + } + /* ok: if we have non-space, we'll either forget about + * space(s) (if nothing has been output, ie. leading space), + * or output a single space (in-between non-white space) + */ + spaceToAdd = false; + if (ptr > 0) { + buf[ptr++] = CHAR_SPACE; + if (ptr >= buf.length) { + buf = expandBy50Pct(buf); + } + } + } + buf[ptr++] = c; + } + + return (ptr == 0) ? "" : new String(buf, 0, ptr); + } + + protected final void parseUntil(TextBuffer tb, char endChar, boolean convertLFs, + String errorMsg) + throws XMLStreamException + { + // Let's first ensure we have some data in there... + if (mInputPtr >= mInputEnd) { + loadMore(errorMsg); + } + while (true) { + // Let's loop consequtive 'easy' spans: + char[] inputBuf = mInputBuffer; + int inputLen = mInputEnd; + int ptr = mInputPtr; + int startPtr = ptr; + while (ptr < inputLen) { + char c = inputBuf[ptr++]; + if (c == endChar) { + int thisLen = ptr - startPtr - 1; + if (thisLen > 0) { + tb.append(inputBuf, startPtr, thisLen); + } + mInputPtr = ptr; + return; + } + if (c == '\n') { + mInputPtr = ptr; // markLF() requires this + markLF(); + } else if (c == '\r') { + if (!convertLFs && ptr < inputLen) { + if (inputBuf[ptr] == '\n') { + ++ptr; + } + mInputPtr = ptr; + markLF(); + } else { + int thisLen = ptr - startPtr - 1; + if (thisLen > 0) { + tb.append(inputBuf, startPtr, thisLen); + } + mInputPtr = ptr; + c = getNextChar(errorMsg); + if (c != '\n') { + --mInputPtr; // pusback + tb.append(convertLFs ? '\n' : '\r'); + } else { + if (convertLFs) { + tb.append('\n'); + } else { + tb.append('\r'); + tb.append('\n'); + } + } + startPtr = ptr = mInputPtr; + markLF(); + } + } + } + int thisLen = ptr - startPtr; + if (thisLen > 0) { + tb.append(inputBuf, startPtr, thisLen); + } + loadMore(errorMsg); + startPtr = ptr = mInputPtr; + inputBuf = mInputBuffer; + inputLen = mInputEnd; + } + } + + /* + /////////////////////////////////////////////////////////// + // Internal methods + /////////////////////////////////////////////////////////// + */ + + private int resolveCharEnt(StringBuffer originalCharacters) + throws XMLStreamException + { + int value = 0; + char c = getNextChar(SUFFIX_IN_ENTITY_REF); + + if (originalCharacters != null) { + originalCharacters.append(c); + } + + if (c == 'x') { // hex + while (true) { + c = (mInputPtr < mInputEnd) ? mInputBuffer[mInputPtr++] + : getNextCharFromCurrent(SUFFIX_IN_ENTITY_REF); + if (c == ';') { + break; + } + + if (originalCharacters != null) { + originalCharacters.append(c); + } + value = value << 4; + if (c <= '9' && c >= '0') { + value += (c - '0'); + } else if (c >= 'a' && c <= 'f') { + value += 10 + (c - 'a'); + } else if (c >= 'A' && c <= 'F') { + value += 10 + (c - 'A'); + } else { + throwUnexpectedChar(c, "; expected a hex digit (0-9a-fA-F)."); + } + // Overflow? + if (value > MAX_UNICODE_CHAR) { + reportUnicodeOverflow(); + } + } + } else { // numeric (decimal) + while (c != ';') { + if (c <= '9' && c >= '0') { + value = (value * 10) + (c - '0'); + // Overflow? + if (value > MAX_UNICODE_CHAR) { + reportUnicodeOverflow(); + } + } else { + throwUnexpectedChar(c, "; expected a decimal number."); + } + c = (mInputPtr < mInputEnd) ? mInputBuffer[mInputPtr++] + : getNextCharFromCurrent(SUFFIX_IN_ENTITY_REF); + + if (originalCharacters != null && c != ';') { + originalCharacters.append(c); + } + } + } + validateChar(value); + return value; + } + + /** + * Method that will verify that expanded Unicode codepoint is a valid + * XML content character. + */ + private final void validateChar(int value) + throws XMLStreamException + { + /* 24-Jan-2006, TSa: Ok, "high" Unicode chars are problematic, + * need to be reported by a surrogate pair.. + */ + if (value >= 0xD800) { + if (value < 0xE000) { // no surrogates via entity expansion + reportIllegalChar(value); + } + if (value > 0xFFFF) { + // Within valid range at all? + if (value > MAX_UNICODE_CHAR) { + reportUnicodeOverflow(); + } + } else if (value >= 0xFFFE) { // 0xFFFE and 0xFFFF are illegal too + reportIllegalChar(value); + } + // Ok, fine as is + } else if (value < 32) { + if (value == 0) { + throwParseError("Invalid character reference: null character not allowed in XML content."); + } + // XML 1.1 allows most other chars; 1.0 does not: + if (!mXml11 && + (value != 0x9 && value != 0xA && value != 0xD)) { + reportIllegalChar(value); + } + } + } + + protected final char[] getNameBuffer(int minSize) + { + char[] buf = mNameBuffer; + + if (buf == null) { + mNameBuffer = buf = new char[(minSize > 48) ? (minSize+16) : 64]; + } else if (minSize >= buf.length) { // let's allow one char extra... + int len = buf.length; + len += (len >> 1); // grow by 50% + mNameBuffer = buf = new char[(minSize >= len) ? (minSize+16) : len]; + } + return buf; + } + + protected final char[] expandBy50Pct(char[] buf) + { + int len = buf.length; + char[] newBuf = new char[len + (len >> 1)]; + System.arraycopy(buf, 0, newBuf, 0, len); + return newBuf; + } + + /** + * Method called to throw an exception indicating that a name that + * should not be namespace-qualified (PI target, entity/notation name) + * is one, and reader is namespace aware. + */ + private void throwNsColonException(String name) + throws XMLStreamException + { + throwParseError("Illegal name \"{0}\" (PI target, entity/notation name): can not contain a colon (XML Namespaces 1.0#6)", name, null); + } + + private void throwRecursionError(String entityName) + throws XMLStreamException + { + throwParseError("Illegal entity expansion: entity \"{0}\" expands itself recursively.", entityName, null); + } + + private void reportUnicodeOverflow() + throws XMLStreamException + { + throwParseError("Illegal character entity: value higher than max allowed (0x{0})", Integer.toHexString(MAX_UNICODE_CHAR), null); + } + + private void reportIllegalChar(int value) + throws XMLStreamException + { + throwParseError("Illegal character entity: expansion character (code 0x{0}", Integer.toHexString(value), null); + } +}