Skip to content

Commit 035a843

Browse files
committed
simplify parsing logic
1 parent 6966273 commit 035a843

File tree

2 files changed

+74
-61
lines changed

2 files changed

+74
-61
lines changed

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@beforesemicolon/html-parser",
3-
"version": "0.9.0",
3+
"version": "0.10.0",
44
"description": "HTML parser for any Javascript runtime environment",
55
"type": "module",
66
"types": "./dist/types/index.d.ts",

src/parse.ts

Lines changed: 73 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,16 @@ import {
55
NodeLike,
66
DocumentFragmentLike,
77
} from './Doc.ts'
8-
import { selfClosingTags } from './self-closing-tags.ts'
98

109
export type NodeHandlerCallback = (node: ElementLike | NodeLike) => void
1110

11+
// Fully anchored, case-insensitive match on a tag name. `parse` treats a
// matching tag as self-closing: the element is created and appended, but it is
// never pushed onto the open-element stack, so no closing tag is expected.
const SELF_CLOSING_TAGS =
12+
/^(AREA|META|BASE|BR|COL|EMBED|HR|IMG|INPUT|LINK|PARAM|SOURCE|TRACK|WBR|COMMAND|KEYGEN|MENUITEM|DOCTYPE|!DOCTYPE)$/i
1213
// Pre-compiled regexes for better performance
// Both patterns carry the `g` flag, so `exec` is stateful through `lastIndex`;
// `parse` resets `HTML_PATTERN.lastIndex` to 0 before it starts scanning.
1314
// Matches either an HTML comment or a tag. Capture groups:
//   1: comment body (text between `<!--` and `-->`)
//   2: `/` (closing tag) or `!` (e.g. a doctype-style declaration)
//   3: tag name (a letter followed by letters, digits or `-`)
//   4: raw attribute text (everything up to the optional trailing slash)
//   5: trailing `/` of a self-closed tag, or the empty string
const HTML_PATTERN =
1415
/<!--([^]*?(?=-->))-->|<(\/|!)?([a-z][a-z0-9-]*)\s*([^>]*?)(\/?)>/gi
1516
// Matches one attribute at a time. Capture groups:
//   1: attribute name (a letter followed by word characters, `-`, `.` or `:`)
//   2: double-quoted value, 3: single-quoted value, 4: unquoted value
// The whole `= value` part is optional, so bare boolean attributes match too.
const ATTR_PATTERN =
1617
/([a-z][\w-.:]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/gi
17-
const SVG_TEST = /svg/i
18-
const MATH_TEST = /math/i
19-
const HTML_TEST = /html/i
20-
const SCRIPT_TEST = /^SCRIPT$/i
2118

2219
// URI based on https://developer.mozilla.org/en-US/docs/Web/API/Document/createElementNS
2320
const NSURI: Record<string, string> = {
@@ -26,8 +23,35 @@ const NSURI: Record<string, string> = {
2623
MATH: 'http://www.w3.org/1998/Math/MathML',
2724
}
2825

29-
// Cache self-closing tags regex
30-
let selfClosingTagsRegex: RegExp
26+
// Cache for tag regexes and namespaces
27+
const tagRegexCache = new Map<string, RegExp>()
28+
const namespaceCache = new Map<string, string>()
29+
30+
const getTagRegex = (tagName: string): RegExp => {
31+
let regex = tagRegexCache.get(tagName)
32+
if (!regex) {
33+
regex = new RegExp(tagName, 'i')
34+
tagRegexCache.set(tagName, regex)
35+
}
36+
return regex
37+
}
38+
39+
/**
 * Resolves the namespace URI for an element about to be created.
 *
 * `svg`, `html` and any name starting with `math` select their own namespace;
 * every other tag inherits `parentNS`, falling back to the HTML namespace
 * when no parent namespace is given.
 *
 * Only the parent-independent part of the answer is cached. Caching the final
 * result per tag name would pin a tag to whichever parent it was first seen
 * under (e.g. `<a>` inside `<svg>` would make every later HTML `<a>` an SVG
 * element).
 *
 * @param tagName - Tag name as written in the markup (any letter case).
 * @param parentNS - Namespace URI of the element the new node is appended to.
 * @returns The namespace URI to pass to `createElementNS`.
 */
const getNamespace = (tagName: string, parentNS?: string): string => {
    // '' is stored for tags that have no namespace of their own.
    let ownNS = namespaceCache.get(tagName)
    if (ownNS === undefined) {
        const lower = tagName.toLowerCase()
        ownNS =
            lower === 'svg'
                ? NSURI.SVG
                : lower.startsWith('math')
                ? NSURI.MATH
                : lower === 'html'
                ? NSURI.HTML
                : ''
        namespaceCache.set(tagName, ownNS)
    }
    // Inheritance depends on the current parent, so it is resolved per call.
    return ownNS || (parentNS ?? NSURI.HTML)
}
3155

3256
const setAttributes = (node: Element | ElementLike, attributes: string) => {
3357
const trimmed = attributes?.trim()
@@ -38,11 +62,7 @@ const setAttributes = (node: Element | ElementLike, attributes: string) => {
3862

3963
while ((match = ATTR_PATTERN.exec(trimmed))) {
4064
const name = match[1]
41-
const value =
42-
match[2] ??
43-
match[3] ??
44-
match[4] ??
45-
(match[0].includes('=') ? '' : '')
65+
const value = match[2] ?? match[3] ?? match[4] ?? ''
4666
node.setAttribute(name, value)
4767
}
4868
}
@@ -55,6 +75,18 @@ export const parse = <D extends Partial<DocumentLike | Document>>(
5575
markup: string,
5676
handler: D | NodeHandlerCallback = Doc as D
5777
): ParseReturn<D> => {
78+
// Fast path for simple text-only content
79+
if (!markup.includes('<')) {
80+
const doc = (
81+
!handler || typeof handler === 'function' ? Doc : handler
82+
) as DocumentLike
83+
const fragment = doc.createDocumentFragment()
84+
const textNode = doc.createTextNode(markup)
85+
fragment.appendChild(textNode)
86+
if (typeof handler === 'function') handler(textNode)
87+
return fragment as ParseReturn<D>
88+
}
89+
5890
HTML_PATTERN.lastIndex = 0
5991
let match: RegExpExecArray | null = null
6092
const doc = (
@@ -63,17 +95,14 @@ export const parse = <D extends Partial<DocumentLike | Document>>(
6395
const cb = (typeof handler === 'function'
6496
? handler
6597
: null) as unknown as NodeHandlerCallback
66-
const stack: Array<ElementLike | DocumentFragmentLike> = [
67-
doc.createDocumentFragment(),
68-
]
98+
99+
// Pre-allocate stack with reasonable size
100+
const stack: Array<ElementLike | DocumentFragmentLike> = new Array(32)
101+
stack[0] = doc.createDocumentFragment()
102+
let stackIndex = 0
69103
let lastIndex = 0
70104
const markupLength = markup.length
71105

72-
// Cache self-closing tags regex
73-
if (!selfClosingTagsRegex) {
74-
selfClosingTagsRegex = selfClosingTags()
75-
}
76-
77106
while ((match = HTML_PATTERN.exec(markup)) !== null) {
78107
const [
79108
,
@@ -89,11 +118,10 @@ export const parse = <D extends Partial<DocumentLike | Document>>(
89118
continue
90119
}
91120

92-
const stackTop = stack.length - 1
93-
const stackLastItem = stack[stackTop]
121+
const stackLastItem = stack[stackIndex]
94122

95-
// pre lingering text
96-
if (match.index >= lastIndex + 1) {
123+
// Pre-lingering text
124+
if (match.index > lastIndex) {
97125
const text = markup.slice(lastIndex, match.index)
98126
const node = doc.createTextNode(text)
99127
stackLastItem?.appendChild(node)
@@ -112,46 +140,34 @@ export const parse = <D extends Partial<DocumentLike | Document>>(
112140
if (tagName) {
113141
if (bangOrClosingSlash) {
114142
const stackTagName = stackLastItem?.tagName
115-
if (
116-
stackTagName &&
117-
new RegExp(tagName, 'i').test(stackTagName)
118-
) {
119-
stack.pop()
143+
if (stackTagName && getTagRegex(tagName).test(stackTagName)) {
144+
stackIndex--
120145
}
121146
continue
122147
}
123148

124-
const ns = SVG_TEST.test(tagName)
125-
? NSURI.SVG
126-
: MATH_TEST.test(tagName)
127-
? NSURI.MATH
128-
: HTML_TEST.test(tagName)
129-
? NSURI.HTML
130-
: (stackLastItem as ElementLike)?.namespaceURI ?? NSURI.HTML
149+
const ns = getNamespace(
150+
tagName,
151+
(stackLastItem as ElementLike)?.namespaceURI
152+
)
153+
const isSelfClosing =
154+
SELF_CLOSING_TAGS.test(tagName.toLowerCase()) ||
155+
selfClosingSlash === '/'
131156

132-
const selfClosingTag =
133-
selfClosingTagsRegex.test(tagName) || selfClosingSlash === '/'
157+
const node = doc.createElementNS(ns, tagName)
158+
setAttributes(node, attributes)
159+
stackLastItem?.appendChild(node)
134160

135-
if (selfClosingTag) {
136-
const node = doc.createElementNS(ns, tagName)
137-
setAttributes(node, attributes)
138-
stackLastItem?.appendChild(node)
161+
if (isSelfClosing) {
139162
cb?.(node)
140163
continue
141164
}
142165

143-
const node = doc.createElementNS(ns, tagName)
144-
setAttributes(node, attributes)
145-
stackLastItem?.appendChild(node)
146-
147-
// scripts in particular can have html strings that do not need to be rendered.
148-
// The overall markup therefore we need a special lookup to find the closing tag
149-
// without considering these possible HTML tag matches to be part of the final DOM
150-
if (SCRIPT_TEST.test(tagName)) {
151-
// try to find the closing tag
166+
// Handle script tags specially
167+
if (tagName.toUpperCase() === 'SCRIPT') {
152168
const possibleSimilarOnesNested: string[] = []
153169
const exactTagPattern = new RegExp(
154-
`<(\\/)?(${tagName})\\s*([^>]*?)>`,
170+
`<(\\/)?(${tagName})\\s*([^>]*)>`,
155171
'ig'
156172
)
157173
const markupAhead = markup.slice(lastIndex)
@@ -160,10 +176,9 @@ export const parse = <D extends Partial<DocumentLike | Document>>(
160176
while (
161177
(tagMatch = exactTagPattern.exec(markupAhead)) !== null
162178
) {
163-
const [, closingSlash, name, , selfClosingSlash] = tagMatch
179+
const [, closingSlash, name] = tagMatch
164180

165-
// check if the tag name is matched
166-
if (new RegExp(tagName, 'i').test(name)) {
181+
if (getTagRegex(tagName).test(name)) {
167182
if (closingSlash) {
168183
if (!possibleSimilarOnesNested.length) {
169184
const textNode = doc.createTextNode(
@@ -172,20 +187,18 @@ export const parse = <D extends Partial<DocumentLike | Document>>(
172187
node.appendChild(textNode)
173188
lastIndex =
174189
lastIndex + exactTagPattern.lastIndex
175-
HTML_PATTERN.lastIndex = lastIndex // move the pattern needle to start matching later in the string
190+
HTML_PATTERN.lastIndex = lastIndex
176191
break
177192
} else {
178193
possibleSimilarOnesNested.pop()
179194
}
180-
} else if (!selfClosingSlash) {
181-
// could be that there is a script HTML string inside
182-
// we need to track those, so we don't mix them with the possible script closing tag
195+
} else {
183196
possibleSimilarOnesNested.push(name)
184197
}
185198
}
186199
}
187200
} else {
188-
stack.push(node)
201+
stack[++stackIndex] = node
189202
}
190203

191204
cb?.(node)

0 commit comments

Comments
 (0)