/**
 * Not type-checking this file because it's mostly vendor code.
 */

/*!
 * HTML Parser By John Resig (ejohn.org)
 * Modified by Juriy "kangax" Zaytsev
 * Original code by Erik Arvidsson, Mozilla Public License
 * http://erik.eae.net/simplehtmlparser/simplehtmlparser.js
 */

import { makeMap, no } from 'shared/util'
import { isNonPhrasingTag, canBeLeftOpenTag } from 'web/util/index'

// Regular Expressions for parsing tags and attributes.
// NOTE: patterns assembled with `new RegExp(string)` need doubled backslashes,
// because the string literal consumes one level of escaping before the regex
// engine ever sees the pattern.
const singleAttrIdentifier = /([^\s"'<>/=]+)/
const singleAttrAssign = /(?:=)/
const singleAttrValues = [
  // attr value double quotes
  /"([^"]*)"+/.source,
  // attr value, single quotes
  /'([^']*)'+/.source,
  // attr value, no quotes
  /([^\s"'=<>`]+)/.source
]
// Capture groups: 1 = name, 2 = "=", 3 = double-quoted value,
// 4 = single-quoted value, 5 = unquoted value.
const attribute = new RegExp(
  '^\\s*' + singleAttrIdentifier.source +
  '(?:\\s*(' + singleAttrAssign.source + ')' +
  '\\s*(?:' + singleAttrValues.join('|') + '))?'
)

// could use https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-QName
// but for Vue templates we can enforce a simple charset
const ncname = '[a-zA-Z_][\\w\\-\\.]*'
const qnameCapture = '((?:' + ncname + '\\:)?' + ncname + ')'
const startTagOpen = new RegExp('^<' + qnameCapture)
const startTagClose = /^\s*(\/?)>/
const endTag = new RegExp('^<\\/' + qnameCapture + '[^>]*>')
const doctype = /^<!DOCTYPE [^>]+>/i
const comment = /^<!--/
const conditionalComment = /^<!\[/

// Feature-detect engines that report an unmatched optional capture group as
// '' instead of undefined (see the Firefox bug referenced in handleStartTag).
let IS_REGEX_CAPTURING_BROKEN = false
'x'.replace(/x(.)?/g, function (m, g) {
  IS_REGEX_CAPTURING_BROKEN = g === ''
})

// Special Elements (can contain anything)
const isScriptOrStyle = makeMap('script,style', true)
const hasLang = attr => attr.name === 'lang' && attr.value !== 'html'

/**
 * Decide whether the content of `tag` must be consumed as raw text.
 *
 * @param {string} tag - name of the innermost open tag
 * @param {boolean} isSFC - value of `options.sfc` passed to parseHTML
 * @param {Array<{tag: string, attrs: Array}>} stack - currently open elements
 * @returns {boolean} true when the tag's content is treated as raw text
 */
const isSpecialTag = (tag, isSFC, stack) => {
  if (isScriptOrStyle(tag)) {
    return true
  }
  if (isSFC && stack.length === 1) {
    // A top-level <template> without a non-html `lang` attribute (i.e. no
    // pre-processor) is parsed normally; every other top-level block is raw.
    const isPlainTemplate = tag === 'template' && !stack[0].attrs.some(hasLang)
    return !isPlainTemplate
  }
  return false
}

// Cache of "everything up to the closing tag" regexes, keyed by lowercase tag.
const reCache = {}

// Entities decoded inside attribute values. Decoding is done in ONE pass so
// that text produced by one replacement is never decoded again
// (e.g. "&amp;quot;" must become "&quot;", not '"').
const decodingMap = {
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&amp;': '&',
  '&#10;': '\n'
}
const encodedAttr = /&(?:lt|gt|quot|amp);/g
const encodedAttrWithNewLines = /&(?:lt|gt|quot|amp|#10);/g

/**
 * Decode the HTML entities `&lt;`, `&gt;`, `&quot;` and `&amp;` in an
 * attribute value, and additionally `&#10;` (newline) when requested.
 *
 * @param {string} value - raw attribute value as found in the markup
 * @param {boolean} [shouldDecodeNewlines] - also decode `&#10;` to "\n"
 * @returns {string} the decoded value
 */
function decodeAttr (value, shouldDecodeNewlines) {
  const re = shouldDecodeNewlines ? encodedAttrWithNewLines : encodedAttr
  return value.replace(re, match => decodingMap[match])
}

/**
 * Tokenize an HTML string and report what is found through callbacks.
 *
 * Options read by this function:
 *  - `expectHTML`: when truthy, implicitly close <p> before a non-phrasing
 *    tag and close a repeated "can be left open" tag
 *  - `isUnaryTag(tagName)`: predicate for self-closing tags (defaults to `no`)
 *  - `sfc`: forwarded to isSpecialTag to decide raw-text handling
 *  - `shouldDecodeNewlines`: forwarded to decodeAttr
 *  - `start(tagName, attrs, unary, start, end)`: called for each start tag;
 *    `attrs` is an array of `{ name, value }`
 *  - `end(tagName, start, end)`: called for each closed element
 *  - `chars(text)`: called for each run of text
 *
 * @param {string} html - markup to parse
 * @param {Object} options - configuration flags and callbacks (see above)
 */
export function parseHTML (html, options) {
  const stack = []
  const expectHTML = options.expectHTML
  const isUnaryTag = options.isUnaryTag || no
  let index = 0
  let last, lastTag
  while (html) {
    last = html
    // Make sure we're not in a script or style element
    if (!lastTag || !isSpecialTag(lastTag, options.sfc, stack)) {
      let textEnd = html.indexOf('<')
      if (textEnd === 0) {
        // Comment:
        if (comment.test(html)) {
          const commentEnd = html.indexOf('-->')

          if (commentEnd >= 0) {
            advance(commentEnd + 3)
            continue
          }
        }

        // http://en.wikipedia.org/wiki/Conditional_comment#Downlevel-revealed_conditional_comment
        if (conditionalComment.test(html)) {
          const conditionalEnd = html.indexOf(']>')

          if (conditionalEnd >= 0) {
            advance(conditionalEnd + 2)
            continue
          }
        }

        // Doctype:
        const doctypeMatch = html.match(doctype)
        if (doctypeMatch) {
          advance(doctypeMatch[0].length)
          continue
        }

        // End tag:
        const endTagMatch = html.match(endTag)
        if (endTagMatch) {
          const curIndex = index
          advance(endTagMatch[0].length)
          parseEndTag(endTagMatch[0], endTagMatch[1], curIndex, index)
          continue
        }

        // Start tag:
        const startTagMatch = parseStartTag()
        if (startTagMatch) {
          handleStartTag(startTagMatch)
          continue
        }
      }

      let text, rest, next
      if (textEnd > 0) {
        rest = html.slice(textEnd)
        while (
          !endTag.test(rest) &&
          !startTagOpen.test(rest) &&
          !comment.test(rest) &&
          !conditionalComment.test(rest)
        ) {
          // < in plain text, be forgiving and treat it as text
          next = rest.indexOf('<', 1)
          if (next < 0) break
          textEnd += next
          rest = html.slice(textEnd)
        }
        text = html.substring(0, textEnd)
        advance(textEnd)
      }

      if (textEnd < 0) {
        // No "<" left at all: the remainder is plain text.
        text = html
        html = ''
      }

      if (options.chars && text) {
        options.chars(text)
      }
    } else {
      // Raw-text element: swallow everything up to its closing tag.
      const stackedTag = lastTag.toLowerCase()
      const reStackedTag = reCache[stackedTag] || (reCache[stackedTag] = new RegExp('([\\s\\S]*?)(</' + stackedTag + '[^>]*>)', 'i'))
      let endTagLength = 0
      const rest = html.replace(reStackedTag, function (all, text, endTagText) {
        endTagLength = endTagText.length
        if (stackedTag !== 'script' && stackedTag !== 'style' && stackedTag !== 'noscript') {
          // Unwrap comments and CDATA sections, keeping their content.
          text = text
            .replace(/<!--([\s\S]*?)-->/g, '$1')
            .replace(/<!\[CDATA\[([\s\S]*?)]]>/g, '$1')
        }
        if (options.chars) {
          options.chars(text)
        }
        return ''
      })
      index += html.length - rest.length
      html = rest
      parseEndTag('</' + stackedTag + '>', stackedTag, index - endTagLength, index)
    }

    if (html === last) {
      // Nothing was consumed in this iteration: hand the remainder over as
      // text (if anyone is listening) and stop. The break must not depend on
      // the callback being present, otherwise the loop would never end.
      if (options.chars) {
        options.chars(html)
      }
      break
    }
  }

  // Clean up any remaining tags
  parseEndTag()

  /** Consume `n` characters from the front of `html`, keeping `index` in sync. */
  function advance (n) {
    index += n
    html = html.substring(n)
  }

  /**
   * Try to read a complete start tag at the current position.
   * Returns `{ tagName, attrs, start, unarySlash, end }` (attrs are the raw
   * `attribute` regex matches) or undefined when the tag never closes.
   */
  function parseStartTag () {
    const start = html.match(startTagOpen)
    if (start) {
      const match = {
        tagName: start[1],
        attrs: [],
        start: index
      }
      advance(start[0].length)
      let end, attr
      while (!(end = html.match(startTagClose)) && (attr = html.match(attribute))) {
        advance(attr[0].length)
        match.attrs.push(attr)
      }
      if (end) {
        match.unarySlash = end[1]
        advance(end[0].length)
        match.end = index
        return match
      }
    }
  }

  /**
   * Normalize a parsed start tag, push it on the stack unless it is unary,
   * and notify `options.start`.
   */
  function handleStartTag (match) {
    const tagName = match.tagName
    const unarySlash = match.unarySlash

    if (expectHTML) {
      if (lastTag === 'p' && isNonPhrasingTag(tagName)) {
        parseEndTag('', lastTag)
      }
      if (canBeLeftOpenTag(tagName) && lastTag === tagName) {
        parseEndTag('', tagName)
      }
    }

    const unary = isUnaryTag(tagName) || (tagName === 'html' && lastTag === 'head') || !!unarySlash

    const attrs = match.attrs.map(args => {
      // hackish work around FF bug https://bugzilla.mozilla.org/show_bug.cgi?id=369778
      if (IS_REGEX_CAPTURING_BROKEN && args[0].indexOf('""') === -1) {
        if (args[3] === '') { delete args[3] }
        if (args[4] === '') { delete args[4] }
        if (args[5] === '') { delete args[5] }
      }
      // Exactly one of the three value groups can have matched.
      const value = args[3] || args[4] || args[5] || ''
      return {
        name: args[1],
        value: decodeAttr(
          value,
          options.shouldDecodeNewlines
        )
      }
    })

    if (!unary) {
      stack.push({ tag: tagName, attrs: attrs })
      lastTag = tagName
    }

    if (options.start) {
      options.start(tagName, attrs, unary, match.start, match.end)
    }
  }

  /**
   * Close the nearest open element named `tagName` together with everything
   * opened after it. Called without arguments it closes every open element.
   * A stray `</br>` is reported as a unary start tag and a stray `</p>` as an
   * empty element (start followed by end).
   */
  function parseEndTag (tag, tagName, start, end) {
    let pos
    if (start == null) start = index
    if (end == null) end = index

    // Find the closest opened tag of the same type
    if (tagName) {
      const needle = tagName.toLowerCase()
      for (pos = stack.length - 1; pos >= 0; pos--) {
        if (stack[pos].tag.toLowerCase() === needle) {
          break
        }
      }
    } else {
      // If no tag name is provided, clean shop
      pos = 0
    }

    if (pos >= 0) {
      // Close all the open elements, up the stack
      for (let i = stack.length - 1; i >= pos; i--) {
        if (options.end) {
          options.end(stack[i].tag, start, end)
        }
      }

      // Remove the open elements from the stack
      stack.length = pos
      lastTag = pos && stack[pos - 1].tag
    } else if (tagName.toLowerCase() === 'br') {
      if (options.start) {
        options.start(tagName, [], true, start, end)
      }
    } else if (tagName.toLowerCase() === 'p') {
      if (options.start) {
        options.start(tagName, [], false, start, end)
      }
      if (options.end) {
        options.end(tagName, start, end)
      }
    }
  }
}