;(function (sax) { // wrapper for non-node envs sax.parser = function (strict, opt) { return new SAXParser(strict, opt) } sax.SAXParser = SAXParser sax.SAXStream = SAXStream sax.createStream = createStream // When we pass the MAX_BUFFER_LENGTH position, start checking for buffer overruns. // When we check, schedule the next check for MAX_BUFFER_LENGTH - (max(buffer lengths)), // since that's the earliest that a buffer overrun could occur. This way, checks are // as rare as required, but as often as necessary to ensure never crossing this bound. // Furthermore, buffers are only tested at most once per write(), so passing a very // large string into write() might have undesirable effects, but this is manageable by // the caller, so it is assumed to be safe. Thus, a call to write() may, in the extreme // edge case, result in creating at most one complete copy of the string passed in. // Set to Infinity to have unlimited buffers. sax.MAX_BUFFER_LENGTH = 64 * 1024 var buffers = [ 'comment', 'sgmlDecl', 'textNode', 'tagName', 'doctype', 'procInstName', 'procInstBody', 'entity', 'attribName', 'attribValue', 'cdata', 'script' ] sax.EVENTS = [ 'text', 'processinginstruction', 'sgmldeclaration', 'doctype', 'comment', 'opentagstart', 'attribute', 'opentag', 'closetag', 'opencdata', 'cdata', 'closecdata', 'error', 'end', 'ready', 'script', 'opennamespace', 'closenamespace' ] function SAXParser (strict, opt) { if (!(this instanceof SAXParser)) { return new SAXParser(strict, opt) } var parser = this clearBuffers(parser) parser.q = parser.c = '' parser.bufferCheckPosition = sax.MAX_BUFFER_LENGTH parser.opt = opt || {} parser.opt.lowercase = parser.opt.lowercase || parser.opt.lowercasetags parser.looseCase = parser.opt.lowercase ? 'toLowerCase' : 'toUpperCase' parser.tags = [] parser.closed = parser.closedRoot = parser.sawRoot = false parser.tag = parser.error = null parser.strict = !!strict parser.noscript = !!(strict || parser.opt.noscript) parser.state = S.BEGIN parser.strictEntities = parser.opt.strictEntities parser.ENTITIES = parser.strictEntities ? Object.create(sax.XML_ENTITIES) : Object.create(sax.ENTITIES) parser.attribList = [] // namespaces form a prototype chain. // it always points at the current tag, // which protos to its parent tag. if (parser.opt.xmlns) { parser.ns = Object.create(rootNS) } // mostly just for error reporting parser.trackPosition = parser.opt.position !== false if (parser.trackPosition) { parser.position = parser.line = parser.column = 0 } emit(parser, 'onready') } if (!Object.create) { Object.create = function (o) { function F () {} F.prototype = o var newf = new F() return newf } } if (!Object.keys) { Object.keys = function (o) { var a = [] for (var i in o) if (o.hasOwnProperty(i)) a.push(i) return a } } function checkBufferLength (parser) { var maxAllowed = Math.max(sax.MAX_BUFFER_LENGTH, 10) var maxActual = 0 for (var i = 0, l = buffers.length; i < l; i++) { var len = parser[buffers[i]].length if (len > maxAllowed) { // Text/cdata nodes can get big, and since they're buffered, // we can get here under normal conditions. // Avoid issues by emitting the text node now, // so at least it won't get any bigger. switch (buffers[i]) { case 'textNode': closeText(parser) break case 'cdata': emitNode(parser, 'oncdata', parser.cdata) parser.cdata = '' break case 'script': emitNode(parser, 'onscript', parser.script) parser.script = '' break default: error(parser, 'Max buffer length exceeded: ' + buffers[i]) } } maxActual = Math.max(maxActual, len) } // schedule the next check for the earliest possible buffer overrun. var m = sax.MAX_BUFFER_LENGTH - maxActual parser.bufferCheckPosition = m + parser.position } function clearBuffers (parser) { for (var i = 0, l = buffers.length; i < l; i++) { parser[buffers[i]] = '' } } function flushBuffers (parser) { closeText(parser) if (parser.cdata !== '') { emitNode(parser, 'oncdata', parser.cdata) parser.cdata = '' } if (parser.script !== '') { emitNode(parser, 'onscript', parser.script) parser.script = '' } } SAXParser.prototype = { end: function () { end(this) }, write: write, resume: function () { this.error = null; return this }, close: function () { return this.write(null) }, flush: function () { flushBuffers(this) } } var Stream try { Stream = require('stream').Stream } catch (ex) { Stream = function () {} } var streamWraps = sax.EVENTS.filter(function (ev) { return ev !== 'error' && ev !== 'end' }) function createStream (strict, opt) { return new SAXStream(strict, opt) } function SAXStream (strict, opt) { if (!(this instanceof SAXStream)) { return new SAXStream(strict, opt) } Stream.apply(this) this._parser = new SAXParser(strict, opt) this.writable = true this.readable = true var me = this this._parser.onend = function () { me.emit('end') } this._parser.onerror = function (er) { me.emit('error', er) // if didn't throw, then means error was handled. // go ahead and clear error, so we can write again. me._parser.error = null } this._decoder = null streamWraps.forEach(function (ev) { Object.defineProperty(me, 'on' + ev, { get: function () { return me._parser['on' + ev] }, set: function (h) { if (!h) { me.removeAllListeners(ev) me._parser['on' + ev] = h return h } me.on(ev, h) }, enumerable: true, configurable: false }) }) } SAXStream.prototype = Object.create(Stream.prototype, { constructor: { value: SAXStream } }) SAXStream.prototype.write = function (data) { if (typeof Buffer === 'function' && typeof Buffer.isBuffer === 'function' && Buffer.isBuffer(data)) { if (!this._decoder) { var SD = require('string_decoder').StringDecoder this._decoder = new SD('utf8') } data = this._decoder.write(data) } this._parser.write(data.toString()) this.emit('data', data) return true } SAXStream.prototype.end = function (chunk) { if (chunk && chunk.length) { this.write(chunk) } this._parser.end() return true } SAXStream.prototype.on = function (ev, handler) { var me = this if (!me._parser['on' + ev] && streamWraps.indexOf(ev) !== -1) { me._parser['on' + ev] = function () { var args = arguments.length === 1 ? [arguments[0]] : Array.apply(null, arguments) args.splice(0, 0, ev) me.emit.apply(me, args) } } return Stream.prototype.on.call(me, ev, handler) } // this really needs to be replaced with character classes. // XML allows all manner of ridiculous numbers and digits. var CDATA = '[CDATA[' var DOCTYPE = 'DOCTYPE' var XML_NAMESPACE = 'http://www.w3.org/XML/1998/namespace' var XMLNS_NAMESPACE = 'http://www.w3.org/2000/xmlns/' var rootNS = { xml: XML_NAMESPACE, xmlns: XMLNS_NAMESPACE } // http://www.w3.org/TR/REC-xml/#NT-NameStartChar // This implementation works on strings, a single character at a time // as such, it cannot ever support astral-plane characters (10000-EFFFF) // without a significant breaking change to either this parser, or the // JavaScript language. Implementation of an emoji-capable xml parser // is left as an exercise for the reader. var nameStart = /[:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]/ var nameBody = /[:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\u00B7\u0300-\u036F\u203F-\u2040.\d-]/ var entityStart = /[#:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]/ var entityBody = /[#:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\u00B7\u0300-\u036F\u203F-\u2040.\d-]/ function isWhitespace (c) { return c === ' ' || c === '\n' || c === '\r' || c === '\t' } function isQuote (c) { return c === '"' || c === '\'' } function isAttribEnd (c) { return c === '>' || isWhitespace(c) } function isMatch (regex, c) { return regex.test(c) } function notMatch (regex, c) { return !isMatch(regex, c) } var S = 0 sax.STATE = { BEGIN: S++, // leading byte order mark or whitespace BEGIN_WHITESPACE: S++, // leading whitespace TEXT: S++, // general stuff TEXT_ENTITY: S++, // & and such. OPEN_WAKA: S++, // < SGML_DECL: S++, // <!BLARG SGML_DECL_QUOTED: S++, // <!BLARG foo "bar DOCTYPE: S++, // <!DOCTYPE DOCTYPE_QUOTED: S++, // <!DOCTYPE "//blah DOCTYPE_DTD: S++, // <!DOCTYPE "//blah" [ ... DOCTYPE_DTD_QUOTED: S++, // <!DOCTYPE "//blah" [ "foo COMMENT_STARTING: S++, // <!- COMMENT: S++, // <!-- COMMENT_ENDING: S++, // <!-- blah - COMMENT_ENDED: S++, // <!-- blah -- CDATA: S++, // <![CDATA[ something CDATA_ENDING: S++, // ] CDATA_ENDING_2: S++, // ]] PROC_INST: S++, // <?hi PROC_INST_BODY: S++, // <?hi there PROC_INST_ENDING: S++, // <?hi "there" ? OPEN_TAG: S++, // <strong OPEN_TAG_SLASH: S++, // <strong / ATTRIB: S++, // <a ATTRIB_NAME: S++, // <a foo ATTRIB_NAME_SAW_WHITE: S++, // <a foo _ ATTRIB_VALUE: S++, // <a foo= ATTRIB_VALUE_QUOTED: S++, // <a foo="bar ATTRIB_VALUE_CLOSED: S++, // <a foo="bar" ATTRIB_VALUE_UNQUOTED: S++, // <a foo=bar ATTRIB_VALUE_ENTITY_Q: S++, // <foo bar=""" ATTRIB_VALUE_ENTITY_U: S++, // <foo bar=" CLOSE_TAG: S++, // </a CLOSE_TAG_SAW_WHITE: S++, // </a > SCRIPT: S++, // <script> ... SCRIPT_ENDING: S++ // <script> ... < } sax.XML_ENTITIES = { 'amp': '&', 'gt': '>', 'lt': '<', 'quot': '"', 'apos': "'" } sax.ENTITIES = { 'amp': '&', 'gt': '>', 'lt': '<', 'quot': '"', 'apos': "'", 'AElig': 198, 'Aacute': 193, 'Acirc': 194, 'Agrave': 192, 'Aring': 197, 'Atilde': 195, 'Auml': 196, 'Ccedil': 199, 'ETH': 208, 'Eacute': 201, 'Ecirc': 202, 'Egrave': 200, 'Euml': 203, 'Iacute': 205, 'Icirc': 206, 'Igrave': 204, 'Iuml': 207, 'Ntilde': 209, 'Oacute': 211, 'Ocirc': 212, 'Ograve': 210, 'Oslash': 216, 'Otilde': 213, 'Ouml': 214, 'THORN': 222, 'Uacute': 218, 'Ucirc': 219, 'Ugrave': 217, 'Uuml': 220, 'Yacute': 221, 'aacute': 225, 'acirc': 226, 'aelig': 230, 'agrave': 224, 'aring': 229, 'atilde': 227, 'auml': 228, 'ccedil': 231, 'eacute': 233, 'ecirc': 234, 'egrave': 232, 'eth': 240, 'euml': 235, 'iacute': 237, 'icirc': 238, 'igrave': 236, 'iuml': 239, 'ntilde': 241, 'oacute': 243, 'ocirc': 244, 'ograve': 242, 'oslash': 248, 'otilde': 245, 'ouml': 246, 'szlig': 223, 'thorn': 254, 'uacute': 250, 'ucirc': 251, 'ugrave': 249, 'uuml': 252, 'yacute': 253, 'yuml': 255, 'copy': 169, 'reg': 174, 'nbsp': 160, 'iexcl': 161, 'cent': 162, 'pound': 163, 'curren': 164, 'yen': 165, 'brvbar': 166, 'sect': 167, 'uml': 168, 'ordf': 170, 'laquo': 171, 'not': 172, 'shy': 173, 'macr': 175, 'deg': 176, 'plusmn': 177, 'sup1': 185, 'sup2': 178, 'sup3': 179, 'acute': 180, 'micro': 181, 'para': 182, 'middot': 183, 'cedil': 184, 'ordm': 186, 'raquo': 187, 'frac14': 188, 'frac12': 189, 'frac34': 190, 'iquest': 191, 'times': 215, 'divide': 247, 'OElig': 338, 'oelig': 339, 'Scaron': 352, 'scaron': 353, 'Yuml': 376, 'fnof': 402, 'circ': 710, 'tilde': 732, 'Alpha': 913, 'Beta': 914, 'Gamma': 915, 'Delta': 916, 'Epsilon': 917, 'Zeta': 918, 'Eta': 919, 'Theta': 920, 'Iota': 921, 'Kappa': 922, 'Lambda': 923, 'Mu': 924, 'Nu': 925, 'Xi': 926, 'Omicron': 927, 'Pi': 928, 'Rho': 929, 'Sigma': 931, 'Tau': 932, 'Upsilon': 933, 'Phi': 934, 'Chi': 935, 'Psi': 936, 'Omega': 937, 'alpha': 945, 'beta': 946, 'gamma': 947, 'delta': 948, 'epsilon': 949, 'zeta': 950, 'eta': 951, 'theta': 952, 'iota': 953, 'kappa': 954, 'lambda': 955, 'mu': 956, 'nu': 957, 'xi': 958, 'omicron': 959, 'pi': 960, 'rho': 961, 'sigmaf': 962, 'sigma': 963, 'tau': 964, 'upsilon': 965, 'phi': 966, 'chi': 967, 'psi': 968, 'omega': 969, 'thetasym': 977, 'upsih': 978, 'piv': 982, 'ensp': 8194, 'emsp': 8195, 'thinsp': 8201, 'zwnj': 8204, 'zwj': 8205, 'lrm': 8206, 'rlm': 8207, 'ndash': 8211, 'mdash': 8212, 'lsquo': 8216, 'rsquo': 8217, 'sbquo': 8218, 'ldquo': 8220, 'rdquo': 8221, 'bdquo': 8222, 'dagger': 8224, 'Dagger': 8225, 'bull': 8226, 'hellip': 8230, 'permil': 8240, 'prime': 8242, 'Prime': 8243, 'lsaquo': 8249, 'rsaquo': 8250, 'oline': 8254, 'frasl': 8260, 'euro': 8364, 'image': 8465, 'weierp': 8472, 'real': 8476, 'trade': 8482, 'alefsym': 8501, 'larr': 8592, 'uarr': 8593, 'rarr': 8594, 'darr': 8595, 'harr': 8596, 'crarr': 8629, 'lArr': 8656, 'uArr': 8657, 'rArr': 8658, 'dArr': 8659, 'hArr': 8660, 'forall': 8704, 'part': 8706, 'exist': 8707, 'empty': 8709, 'nabla': 8711, 'isin': 8712, 'notin': 8713, 'ni': 8715, 'prod': 8719, 'sum': 8721, 'minus': 8722, 'lowast': 8727, 'radic': 8730, 'prop': 8733, 'infin': 8734, 'ang': 8736, 'and': 8743, 'or': 8744, 'cap': 8745, 'cup': 8746, 'int': 8747, 'there4': 8756, 'sim': 8764, 'cong': 8773, 'asymp': 8776, 'ne': 8800, 'equiv': 8801, 'le': 8804, 'ge': 8805, 'sub': 8834, 'sup': 8835, 'nsub': 8836, 'sube': 8838, 'supe': 8839, 'oplus': 8853, 'otimes': 8855, 'perp': 8869, 'sdot': 8901, 'lceil': 8968, 'rceil': 8969, 'lfloor': 8970, 'rfloor': 8971, 'lang': 9001, 'rang': 9002, 'loz': 9674, 'spades': 9824, 'clubs': 9827, 'hearts': 9829, 'diams': 9830 } Object.keys(sax.ENTITIES).forEach(function (key) { var e = sax.ENTITIES[key] var s = typeof e === 'number' ? String.fromCharCode(e) : e sax.ENTITIES[key] = s }) for (var s in sax.STATE) { sax.STATE[sax.STATE[s]] = s } // shorthand S = sax.STATE function emit (parser, event, data) { parser[event] && parser[event](data) } function emitNode (parser, nodeType, data) { if (parser.textNode) closeText(parser) emit(parser, nodeType, data) } function closeText (parser) { parser.textNode = textopts(parser.opt, parser.textNode) if (parser.textNode) emit(parser, 'ontext', parser.textNode) parser.textNode = '' } function textopts (opt, text) { if (opt.trim) text = text.trim() if (opt.normalize) text = text.replace(/\s+/g, ' ') return text } function error (parser, er) { closeText(parser) if (parser.trackPosition) { er += '\nLine: ' + parser.line + '\nColumn: ' + parser.column + '\nChar: ' + parser.c } er = new Error(er) parser.error = er emit(parser, 'onerror', er) return parser } function end (parser) { if (parser.sawRoot && !parser.closedRoot) strictFail(parser, 'Unclosed root tag') if ((parser.state !== S.BEGIN) && (parser.state !== S.BEGIN_WHITESPACE) && (parser.state !== S.TEXT)) { error(parser, 'Unexpected end') } closeText(parser) parser.c = '' parser.closed = true emit(parser, 'onend') SAXParser.call(parser, parser.strict, parser.opt) return parser } function strictFail (parser, message) { if (typeof parser !== 'object' || !(parser instanceof SAXParser)) { throw new Error('bad call to strictFail') } if (parser.strict) { error(parser, message) } } function newTag (parser) { if (!parser.strict) parser.tagName = parser.tagName[parser.looseCase]() var parent = parser.tags[parser.tags.length - 1] || parser var tag = parser.tag = { name: parser.tagName, attributes: {} } // will be overridden if tag contails an xmlns="foo" or xmlns:foo="bar" if (parser.opt.xmlns) { tag.ns = parent.ns } parser.attribList.length = 0 emitNode(parser, 'onopentagstart', tag) } function qname (name, attribute) { var i = name.indexOf(':') var qualName = i < 0 ? [ '', name ] : name.split(':') var prefix = qualName[0] var local = qualName[1] // <x "xmlns"="http://foo"> if (attribute && name === 'xmlns') { prefix = 'xmlns' local = '' } return { prefix: prefix, local: local } } function attrib (parser) { if (!parser.strict) { parser.attribName = parser.attribName[parser.looseCase]() } if (parser.attribList.indexOf(parser.attribName) !== -1 || parser.tag.attributes.hasOwnProperty(parser.attribName)) { parser.attribName = parser.attribValue = '' return } if (parser.opt.xmlns) { var qn = qname(parser.attribName, true) var prefix = qn.prefix var local = qn.local if (prefix === 'xmlns') { // namespace binding attribute. push the binding into scope if (local === 'xml' && parser.attribValue !== XML_NAMESPACE) { strictFail(parser, 'xml: prefix must be bound to ' + XML_NAMESPACE + '\n' + 'Actual: ' + parser.attribValue) } else if (local === 'xmlns' && parser.attribValue !== XMLNS_NAMESPACE) { strictFail(parser, 'xmlns: prefix must be bound to ' + XMLNS_NAMESPACE + '\n' + 'Actual: ' + parser.attribValue) } else { var tag = parser.tag var parent = parser.tags[parser.tags.length - 1] || parser if (tag.ns === parent.ns) { tag.ns = Object.create(parent.ns) } tag.ns[local] = parser.attribValue } } // defer onattribute events until all attributes have been seen // so any new bindings can take effect. preserve attribute order // so deferred events can be emitted in document order parser.attribList.push([parser.attribName, parser.attribValue]) } else { // in non-xmlns mode, we can emit the event right away parser.tag.attributes[parser.attribName] = parser.attribValue emitNode(parser, 'onattribute', { name: parser.attribName, value: parser.attribValue }) } parser.attribName = parser.attribValue = '' } function openTag (parser, selfClosing) { if (parser.opt.xmlns) { // emit namespace binding events var tag = parser.tag // add namespace info to tag var qn = qname(parser.tagName) tag.prefix = qn.prefix tag.local = qn.local tag.uri = tag.ns[qn.prefix] || '' if (tag.prefix && !tag.uri) { strictFail(parser, 'Unbound namespace prefix: ' + JSON.stringify(parser.tagName)) tag.uri = qn.prefix } var parent = parser.tags[parser.tags.length - 1] || parser if (tag.ns && parent.ns !== tag.ns) { Object.keys(tag.ns).forEach(function (p) { emitNode(parser, 'onopennamespace', { prefix: p, uri: tag.ns[p] }) }) } // handle deferred onattribute events // Note: do not apply default ns to attributes: // http://www.w3.org/TR/REC-xml-names/#defaulting for (var i = 0, l = parser.attribList.length; i < l; i++) { var nv = parser.attribList[i] var name = nv[0] var value = nv[1] var qualName = qname(name, true) var prefix = qualName.prefix var local = qualName.local var uri = prefix === '' ? '' : (tag.ns[prefix] || '') var a = { name: name, value: value, prefix: prefix, local: local, uri: uri } // if there's any attributes with an undefined namespace, // then fail on them now. if (prefix && prefix !== 'xmlns' && !uri) { strictFail(parser, 'Unbound namespace prefix: ' + JSON.stringify(prefix)) a.uri = prefix } parser.tag.attributes[name] = a emitNode(parser, 'onattribute', a) } parser.attribList.length = 0 } parser.tag.isSelfClosing = !!selfClosing // process the tag parser.sawRoot = true parser.tags.push(parser.tag) emitNode(parser, 'onopentag', parser.tag) if (!selfClosing) { // special case for <script> in non-strict mode. if (!parser.noscript && parser.tagName.toLowerCase() === 'script') { parser.state = S.SCRIPT } else { parser.state = S.TEXT } parser.tag = null parser.tagName = '' } parser.attribName = parser.attribValue = '' parser.attribList.length = 0 } function closeTag (parser) { if (!parser.tagName) { strictFail(parser, 'Weird empty close tag.') parser.textNode += '</>' parser.state = S.TEXT return } if (parser.script) { if (parser.tagName !== 'script') { parser.script += '</' + parser.tagName + '>' parser.tagName = '' parser.state = S.SCRIPT return } emitNode(parser, 'onscript', parser.script) parser.script = '' } // first make sure that the closing tag actually exists. // <a><b></c></b></a> will close everything, otherwise. var t = parser.tags.length var tagName = parser.tagName if (!parser.strict) { tagName = tagName[parser.looseCase]() } var closeTo = tagName while (t--) { var close = parser.tags[t] if (close.name !== closeTo) { // fail the first time in strict mode strictFail(parser, 'Unexpected close tag') } else { break } } // didn't find it. we already failed for strict, so just abort. if (t < 0) { strictFail(parser, 'Unmatched closing tag: ' + parser.tagName) parser.textNode += '</' + parser.tagName + '>' parser.state = S.TEXT return } parser.tagName = tagName var s = parser.tags.length while (s-- > t) { var tag = parser.tag = parser.tags.pop() parser.tagName = parser.tag.name emitNode(parser, 'onclosetag', parser.tagName) var x = {} for (var i in tag.ns) { x[i] = tag.ns[i] } var parent = parser.tags[parser.tags.length - 1] || parser if (parser.opt.xmlns && tag.ns !== parent.ns) { // remove namespace bindings introduced by tag Object.keys(tag.ns).forEach(function (p) { var n = tag.ns[p] emitNode(parser, 'onclosenamespace', { prefix: p, uri: n }) }) } } if (t === 0) parser.closedRoot = true parser.tagName = parser.attribValue = parser.attribName = '' parser.attribList.length = 0 parser.state = S.TEXT } function parseEntity (parser) { var entity = parser.entity var entityLC = entity.toLowerCase() var num var numStr = '' if (parser.ENTITIES[entity]) { return parser.ENTITIES[entity] } if (parser.ENTITIES[entityLC]) { return parser.ENTITIES[entityLC] } entity = entityLC if (entity.charAt(0) === '#') { if (entity.charAt(1) === 'x') { entity = entity.slice(2) num = parseInt(entity, 16) numStr = num.toString(16) } else { entity = entity.slice(1) num = parseInt(entity, 10) numStr = num.toString(10) } } entity = entity.replace(/^0+/, '') if (isNaN(num) || numStr.toLowerCase() !== entity) { strictFail(parser, 'Invalid character entity') return '&' + parser.entity + ';' } return String.fromCodePoint(num) } function beginWhiteSpace (parser, c) { if (c === '<') { parser.state = S.OPEN_WAKA parser.startTagPosition = parser.position } else if (!isWhitespace(c)) { // have to process this as a text node. // weird, but happens. strictFail(parser, 'Non-whitespace before first tag.') parser.textNode = c parser.state = S.TEXT } } function charAt (chunk, i) { var result = '' if (i < chunk.length) { result = chunk.charAt(i) } return result } function write (chunk) { var parser = this if (this.error) { throw this.error } if (parser.closed) { return error(parser, 'Cannot write after close. Assign an onready handler.') } if (chunk === null) { return end(parser) } if (typeof chunk === 'object') { chunk = chunk.toString() } var i = 0 var c = '' while (true) { c = charAt(chunk, i++) parser.c = c if (!c) { break } if (parser.trackPosition) { parser.position++ if (c === '\n') { parser.line++ parser.column = 0 } else { parser.column++ } } switch (parser.state) { case S.BEGIN: parser.state = S.BEGIN_WHITESPACE if (c === '\uFEFF') { continue } beginWhiteSpace(parser, c) continue case S.BEGIN_WHITESPACE: beginWhiteSpace(parser, c) continue case S.TEXT: if (parser.sawRoot && !parser.closedRoot) { var starti = i - 1 while (c && c !== '<' && c !== '&') { c = charAt(chunk, i++) if (c && parser.trackPosition) { parser.position++ if (c === '\n') { parser.line++ parser.column = 0 } else { parser.column++ } } } parser.textNode += chunk.substring(starti, i - 1) } if (c === '<' && !(parser.sawRoot && parser.closedRoot && !parser.strict)) { parser.state = S.OPEN_WAKA parser.startTagPosition = parser.position } else { if (!isWhitespace(c) && (!parser.sawRoot || parser.closedRoot)) { strictFail(parser, 'Text data outside of root node.') } if (c === '&') { parser.state = S.TEXT_ENTITY } else { parser.textNode += c } } continue case S.SCRIPT: // only non-strict if (c === '<') { parser.state = S.SCRIPT_ENDING } else { parser.script += c } continue case S.SCRIPT_ENDING: if (c === '/') { parser.state = S.CLOSE_TAG } else { parser.script += '<' + c parser.state = S.SCRIPT } continue case S.OPEN_WAKA: // either a /, ?, !, or text is coming next. if (c === '!') { parser.state = S.SGML_DECL parser.sgmlDecl = '' } else if (isWhitespace(c)) { // wait for it... } else if (isMatch(nameStart, c)) { parser.state = S.OPEN_TAG parser.tagName = c } else if (c === '/') { parser.state = S.CLOSE_TAG parser.tagName = '' } else if (c === '?') { parser.state = S.PROC_INST parser.procInstName = parser.procInstBody = '' } else { strictFail(parser, 'Unencoded <') // if there was some whitespace, then add that in. if (parser.startTagPosition + 1 < parser.position) { var pad = parser.position - parser.startTagPosition c = new Array(pad).join(' ') + c } parser.textNode += '<' + c parser.state = S.TEXT } continue case S.SGML_DECL: if ((parser.sgmlDecl + c).toUpperCase() === CDATA) { emitNode(parser, 'onopencdata') parser.state = S.CDATA parser.sgmlDecl = '' parser.cdata = '' } else if (parser.sgmlDecl + c === '--') { parser.state = S.COMMENT parser.comment = '' parser.sgmlDecl = '' } else if ((parser.sgmlDecl + c).toUpperCase() === DOCTYPE) { parser.state = S.DOCTYPE if (parser.doctype || parser.sawRoot) { strictFail(parser, 'Inappropriately located doctype declaration') } parser.doctype = '' parser.sgmlDecl = '' } else if (c === '>') { emitNode(parser, 'onsgmldeclaration', parser.sgmlDecl) parser.sgmlDecl = '' parser.state = S.TEXT } else if (isQuote(c)) { parser.state = S.SGML_DECL_QUOTED parser.sgmlDecl += c } else { parser.sgmlDecl += c } continue case S.SGML_DECL_QUOTED: if (c === parser.q) { parser.state = S.SGML_DECL parser.q = '' } parser.sgmlDecl += c continue case S.DOCTYPE: if (c === '>') { parser.state = S.TEXT emitNode(parser, 'ondoctype', parser.doctype) parser.doctype = true // just remember that we saw it. } else { parser.doctype += c if (c === '[') { parser.state = S.DOCTYPE_DTD } else if (isQuote(c)) { parser.state = S.DOCTYPE_QUOTED parser.q = c } } continue case S.DOCTYPE_QUOTED: parser.doctype += c if (c === parser.q) { parser.q = '' parser.state = S.DOCTYPE } continue case S.DOCTYPE_DTD: parser.doctype += c if (c === ']') { parser.state = S.DOCTYPE } else if (isQuote(c)) { parser.state = S.DOCTYPE_DTD_QUOTED parser.q = c } continue case S.DOCTYPE_DTD_QUOTED: parser.doctype += c if (c === parser.q) { parser.state = S.DOCTYPE_DTD parser.q = '' } continue case S.COMMENT: if (c === '-') { parser.state = S.COMMENT_ENDING } else { parser.comment += c } continue case S.COMMENT_ENDING: if (c === '-') { parser.state = S.COMMENT_ENDED parser.comment = textopts(parser.opt, parser.comment) if (parser.comment) { emitNode(parser, 'oncomment', parser.comment) } parser.comment = '' } else { parser.comment += '-' + c parser.state = S.COMMENT } continue case S.COMMENT_ENDED: if (c !== '>') { strictFail(parser, 'Malformed comment') // allow <!-- blah -- bloo --> in non-strict mode, // which is a comment of " blah -- bloo " parser.comment += '--' + c parser.state = S.COMMENT } else { parser.state = S.TEXT } continue case S.CDATA: if (c === ']') { parser.state = S.CDATA_ENDING } else { parser.cdata += c } continue case S.CDATA_ENDING: if (c === ']') { parser.state = S.CDATA_ENDING_2 } else { parser.cdata += ']' + c parser.state = S.CDATA } continue case S.CDATA_ENDING_2: if (c === '>') { if (parser.cdata) { emitNode(parser, 'oncdata', parser.cdata) } emitNode(parser, 'onclosecdata') parser.cdata = '' parser.state = S.TEXT } else if (c === ']') { parser.cdata += ']' } else { parser.cdata += ']]' + c parser.state = S.CDATA } continue case S.PROC_INST: if (c === '?') { parser.state = S.PROC_INST_ENDING } else if (isWhitespace(c)) { parser.state = S.PROC_INST_BODY } else { parser.procInstName += c } continue case S.PROC_INST_BODY: if (!parser.procInstBody && isWhitespace(c)) { continue } else if (c === '?') { parser.state = S.PROC_INST_ENDING } else { parser.procInstBody += c } continue case S.PROC_INST_ENDING: if (c === '>') { emitNode(parser, 'onprocessinginstruction', { name: parser.procInstName, body: parser.procInstBody }) parser.procInstName = parser.procInstBody = '' parser.state = S.TEXT } else { parser.procInstBody += '?' + c parser.state = S.PROC_INST_BODY } continue case S.OPEN_TAG: if (isMatch(nameBody, c)) { parser.tagName += c } else { newTag(parser) if (c === '>') { openTag(parser) } else if (c === '/') { parser.state = S.OPEN_TAG_SLASH } else { if (!isWhitespace(c)) { strictFail(parser, 'Invalid character in tag name') } parser.state = S.ATTRIB } } continue case S.OPEN_TAG_SLASH: if (c === '>') { openTag(parser, true) closeTag(parser) } else { strictFail(parser, 'Forward-slash in opening tag not followed by >') parser.state = S.ATTRIB } continue case S.ATTRIB: // haven't read the attribute name yet. if (isWhitespace(c)) { continue } else if (c === '>') { openTag(parser) } else if (c === '/') { parser.state = S.OPEN_TAG_SLASH } else if (isMatch(nameStart, c)) { parser.attribName = c parser.attribValue = '' parser.state = S.ATTRIB_NAME } else { strictFail(parser, 'Invalid attribute name') } continue case S.ATTRIB_NAME: if (c === '=') { parser.state = S.ATTRIB_VALUE } else if (c === '>') { strictFail(parser, 'Attribute without value') parser.attribValue = parser.attribName attrib(parser) openTag(parser) } else if (isWhitespace(c)) { parser.state = S.ATTRIB_NAME_SAW_WHITE } else if (isMatch(nameBody, c)) { parser.attribName += c } else { strictFail(parser, 'Invalid attribute name') } continue case S.ATTRIB_NAME_SAW_WHITE: if (c === '=') { parser.state = S.ATTRIB_VALUE } else if (isWhitespace(c)) { continue } else { strictFail(parser, 'Attribute without value') parser.tag.attributes[parser.attribName] = '' parser.attribValue = '' emitNode(parser, 'onattribute', { name: parser.attribName, value: '' }) parser.attribName = '' if (c === '>') { openTag(parser) } else if (isMatch(nameStart, c)) { parser.attribName = c parser.state = S.ATTRIB_NAME } else { strictFail(parser, 'Invalid attribute name') parser.state = S.ATTRIB } } continue case S.ATTRIB_VALUE: if (isWhitespace(c)) { continue } else if (isQuote(c)) { parser.q = c parser.state = S.ATTRIB_VALUE_QUOTED } else { strictFail(parser, 'Unquoted attribute value') parser.state = S.ATTRIB_VALUE_UNQUOTED parser.attribValue = c } continue case S.ATTRIB_VALUE_QUOTED: if (c !== parser.q) { if (c === '&') { parser.state = S.ATTRIB_VALUE_ENTITY_Q } else { parser.attribValue += c } continue } attrib(parser) parser.q = '' parser.state = S.ATTRIB_VALUE_CLOSED continue case S.ATTRIB_VALUE_CLOSED: if (isWhitespace(c)) { parser.state = S.ATTRIB } else if (c === '>') { openTag(parser) } else if (c === '/') { parser.state = S.OPEN_TAG_SLASH } else if (isMatch(nameStart, c)) { strictFail(parser, 'No whitespace between attributes') parser.attribName = c parser.attribValue = '' parser.state = S.ATTRIB_NAME } else { strictFail(parser, 'Invalid attribute name') } continue case S.ATTRIB_VALUE_UNQUOTED: if (!isAttribEnd(c)) { if (c === '&') { parser.state = S.ATTRIB_VALUE_ENTITY_U } else { parser.attribValue += c } continue } attrib(parser) if (c === '>') { openTag(parser) } else { parser.state = S.ATTRIB } continue case S.CLOSE_TAG: if (!parser.tagName) { if (isWhitespace(c)) { continue } else if (notMatch(nameStart, c)) { if (parser.script) { parser.script += '</' + c parser.state = S.SCRIPT } else { strictFail(parser, 'Invalid tagname in closing tag.') } } else { parser.tagName = c } } else if (c === '>') { closeTag(parser) } else if (isMatch(nameBody, c)) { parser.tagName += c } else if (parser.script) { parser.script += '</' + parser.tagName parser.tagName = '' parser.state = S.SCRIPT } else { if (!isWhitespace(c)) { strictFail(parser, 'Invalid tagname in closing tag') } parser.state = S.CLOSE_TAG_SAW_WHITE } continue case S.CLOSE_TAG_SAW_WHITE: if (isWhitespace(c)) { continue } if (c === '>') { closeTag(parser) } else { strictFail(parser, 'Invalid characters in closing tag') } continue case S.TEXT_ENTITY: case S.ATTRIB_VALUE_ENTITY_Q: case S.ATTRIB_VALUE_ENTITY_U: var returnState var buffer switch (parser.state) { case S.TEXT_ENTITY: returnState = S.TEXT buffer = 'textNode' break case S.ATTRIB_VALUE_ENTITY_Q: returnState = S.ATTRIB_VALUE_QUOTED buffer = 'attribValue' break case S.ATTRIB_VALUE_ENTITY_U: returnState = S.ATTRIB_VALUE_UNQUOTED buffer = 'attribValue' break } if (c === ';') { parser[buffer] += parseEntity(parser) parser.entity = '' parser.state = returnState } else if (isMatch(parser.entity.length ? entityBody : entityStart, c)) { parser.entity += c } else { strictFail(parser, 'Invalid character in entity name') parser[buffer] += '&' + parser.entity + c parser.entity = '' parser.state = returnState } continue default: throw new Error(parser, 'Unknown state: ' + parser.state) } } // while if (parser.position >= parser.bufferCheckPosition) { checkBufferLength(parser) } return parser } /*! http://mths.be/fromcodepoint v0.1.0 by @mathias */ /* istanbul ignore next */ if (!String.fromCodePoint) { (function () { var stringFromCharCode = String.fromCharCode var floor = Math.floor var fromCodePoint = function () { var MAX_SIZE = 0x4000 var codeUnits = [] var highSurrogate var lowSurrogate var index = -1 var length = arguments.length if (!length) { return '' } var result = '' while (++index < length) { var codePoint = Number(arguments[index]) if ( !isFinite(codePoint) || // `NaN`, `+Infinity`, or `-Infinity` codePoint < 0 || // not a valid Unicode code point codePoint > 0x10FFFF || // not a valid Unicode code point floor(codePoint) !== codePoint // not an integer ) { throw RangeError('Invalid code point: ' + codePoint) } if (codePoint <= 0xFFFF) { // BMP code point codeUnits.push(codePoint) } else { // Astral code point; split in surrogate halves // http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae codePoint -= 0x10000 highSurrogate = (codePoint >> 10) + 0xD800 lowSurrogate = (codePoint % 0x400) + 0xDC00 codeUnits.push(highSurrogate, lowSurrogate) } if (index + 1 === length || codeUnits.length > MAX_SIZE) { result += stringFromCharCode.apply(null, codeUnits) codeUnits.length = 0 } } return result } /* istanbul ignore next */ if (Object.defineProperty) { Object.defineProperty(String, 'fromCodePoint', { value: fromCodePoint, configurable: true, writable: true }) } else { String.fromCodePoint = fromCodePoint } }()) } })(typeof exports === 'undefined' ? this.sax = {} : exports)