Skip to content

Commit 70733c0

Browse files
authored
Merge pull request #1550 from fsprojects/perf/html-parser-stringbuilder-optimization
Daily Perf Improver: Optimize HTML parser CharList with StringBuilder
2 parents c1760f3 + 2aa63c2 commit 70733c0

File tree

3 files changed

+63
-16
lines changed

3 files changed

+63
-16
lines changed

src/FSharp.Data.Html.Core/HtmlParser.fs

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -77,16 +77,15 @@ module internal HtmlParser =
7777
String(buffer)
7878

7979
type CharList =
80-
{ mutable Contents: char list }
80+
{ mutable Contents: StringBuilder }
8181

82-
static member Empty = { Contents = [] }
82+
static member Empty = { Contents = StringBuilder() }
8383

84-
override x.ToString() =
85-
String(x.Contents |> List.rev |> List.toArray)
84+
override x.ToString() = x.Contents.ToString()
8685

87-
member x.Cons(c) = x.Contents <- c :: x.Contents
86+
member x.Cons(c: char) = x.Contents.Append(c) |> ignore
8887
member x.Length = x.Contents.Length
89-
member x.Clear() = x.Contents <- []
88+
member x.Clear() = x.Contents.Clear() |> ignore
9089

9190
type InsertionMode =
9291
| DefaultMode
@@ -116,8 +115,8 @@ module internal HtmlParser =
116115

117116
static member Create(reader: TextReader) =
118117
{ Attributes = []
119-
CurrentTag = CharList.Empty
120-
Content = CharList.Empty
118+
CurrentTag = { Contents = StringBuilder() }
119+
Content = { Contents = StringBuilder() }
121120
HasFormattedParent = false
122121
InsertionMode = DefaultMode
123122
Tokens = []
@@ -133,7 +132,7 @@ module internal HtmlParser =
133132
member x.ContentLength = x.Content.Length
134133

135134
member x.NewAttribute() =
136-
x.Attributes <- (CharList.Empty, CharList.Empty) :: x.Attributes
135+
x.Attributes <- ({ Contents = StringBuilder() }, { Contents = StringBuilder() }) :: x.Attributes
137136

138137
member x.ConsAttrName() =
139138
match x.Attributes with
@@ -170,7 +169,7 @@ module internal HtmlParser =
170169
member x.EmitSelfClosingTag() =
171170
let name = x.CurrentTag.ToString().Trim()
172171
let result = Tag(true, name, x.GetAttributes())
173-
x.CurrentTag <- CharList.Empty
172+
x.CurrentTag <- { Contents = StringBuilder() }
174173
x.InsertionMode <- DefaultMode
175174
x.Attributes <- []
176175
x.Tokens <- result :: x.Tokens
@@ -212,7 +211,7 @@ module internal HtmlParser =
212211
else
213212
DefaultMode
214213

215-
x.CurrentTag <- CharList.Empty
214+
x.CurrentTag <- { Contents = StringBuilder() }
216215
x.Attributes <- []
217216
x.Tokens <- result :: x.Tokens
218217

@@ -223,7 +222,7 @@ module internal HtmlParser =
223222
for c in content.ToCharArray() do
224223
x.ConsAttrValue c
225224

226-
x.Content <- CharList.Empty
225+
x.Content <- { Contents = StringBuilder() }
227226
x.InsertionMode <- DefaultMode
228227

229228
member x.Emit() : unit =
@@ -247,17 +246,17 @@ module internal HtmlParser =
247246
| DocTypeMode -> DocType content
248247
| CDATAMode -> CData(content.Replace("<![CDATA[", "").Replace("]]>", ""))
249248

250-
x.Content <- CharList.Empty
249+
x.Content <- { Contents = StringBuilder() }
251250
x.InsertionMode <- DefaultMode
252251

253252
match result with
254253
| Text t when String.IsNullOrEmpty(t) -> ()
255254
| _ -> x.Tokens <- result :: x.Tokens
256255

257256
member x.Cons() = x.Content.Cons(x.Reader.ReadChar())
258-
member x.Cons(char) = x.Content.Cons(char)
259-
member x.Cons(char) = Array.iter (x.Content.Cons) char
260-
member x.Cons(char: string) = x.Cons(char.ToCharArray())
257+
member x.Cons(char: char) = x.Content.Cons(char)
258+
member x.Cons(chars: char array) = Array.iter (x.Content.Cons) chars
259+
member x.Cons(chars: string) = x.Cons(chars.ToCharArray())
261260

262261
member x.ConsTag() =
263262
match x.Reader.ReadChar() with

tests/FSharp.Data.Benchmarks/FSharp.Data.Benchmarks.fsproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
1515
</EmbeddedResource>
1616
<Compile Include="JsonBenchmarks.fs" />
17+
<Compile Include="HtmlBenchmarks.fs" />
1718
<Compile Include="Program.fs" />
1819
</ItemGroup>
1920
<ItemGroup>

tests/FSharp.Data.Benchmarks/HtmlBenchmarks.fs

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
namespace FSharp.Data.Benchmarks
2+
3+
open System
4+
open System.IO
5+
open BenchmarkDotNet.Attributes
6+
open FSharp.Data
7+
8+
[<MemoryDiagnoser>]
9+
[<SimpleJob>]
10+
type HtmlBenchmarks() =
11+
12+
let mutable simpleHtmlText = ""
13+
let mutable zooplaHtmlText = ""
14+
let mutable usPresidentsHtmlText = ""
15+
let mutable doctorWhoHtmlText = ""
16+
let mutable wimbledonHtmlText = ""
17+
18+
[<GlobalSetup>]
19+
member this.Setup() =
20+
let dataPath = Path.Combine(__SOURCE_DIRECTORY__, "../FSharp.Data.Tests/Data")
21+
22+
// Load various HTML files of different sizes and complexity
23+
simpleHtmlText <- File.ReadAllText(Path.Combine(dataPath, "SimpleHtmlTablesWithTr.html"))
24+
zooplaHtmlText <- File.ReadAllText(Path.Combine(dataPath, "zoopla.html")) // ~773KB
25+
usPresidentsHtmlText <- File.ReadAllText(Path.Combine(dataPath, "us_presidents_wikipedia.html")) // ~698KB
26+
doctorWhoHtmlText <- File.ReadAllText(Path.Combine(dataPath, "doctor_who2.html")) // ~518KB
27+
wimbledonHtmlText <- File.ReadAllText(Path.Combine(dataPath, "wimbledon_wikipedia.html")) // ~411KB
28+
29+
[<Benchmark>]
30+
member this.ParseSimpleHtml() =
31+
HtmlDocument.Parse(simpleHtmlText)
32+
33+
[<Benchmark>]
34+
member this.ParseZooplaHtml() =
35+
HtmlDocument.Parse(zooplaHtmlText)
36+
37+
[<Benchmark>]
38+
member this.ParseUsPresidentsHtml() =
39+
HtmlDocument.Parse(usPresidentsHtmlText)
40+
41+
[<Benchmark>]
42+
member this.ParseDoctorWhoHtml() =
43+
HtmlDocument.Parse(doctorWhoHtmlText)
44+
45+
[<Benchmark>]
46+
member this.ParseWimbledonHtml() =
47+
HtmlDocument.Parse(wimbledonHtmlText)

0 commit comments

Comments
 (0)