1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
(* This file is part of Markup.ml, released under the MIT license. See
LICENSE.md for details, or visit https://github.com/aantron/markup.ml. *)
(* Aliases for reducing the number of deprecation warnings. *)
module String =
struct
include String
let lowercase = lowercase [@ocaml.warning "-3"]
end
module Char =
struct
include Char
let lowercase = lowercase [@ocaml.warning "-3"]
end
type 'a cont = 'a -> unit
type 'a cps = exn cont -> 'a cont -> unit
type location = int * int
let compare_locations (line, column) (line', column') =
match line - line' with
| 0 -> column - column'
| order -> order
type name = string * string
let xml_ns = "http://www.w3.org/XML/1998/namespace"
let xmlns_ns = "http://www.w3.org/2000/xmlns/"
let xlink_ns = "http://www.w3.org/1999/xlink"
let html_ns = "http://www.w3.org/1999/xhtml"
let svg_ns = "http://www.w3.org/2000/svg"
let mathml_ns = "http://www.w3.org/1998/Math/MathML"
module Token_tag =
struct
type t =
{name : string;
attributes : (string * string) list;
self_closing : bool}
end
type xml_declaration =
{version : string;
encoding : string option;
standalone : bool option}
type doctype =
{doctype_name : string option;
public_identifier : string option;
system_identifier : string option;
raw_text : string option;
force_quirks : bool}
type signal =
[ `Start_element of name * (name * string) list
| `End_element
| `Text of string list
| `Xml of xml_declaration
| `Doctype of doctype
| `PI of string * string
| `Comment of string ]
type general_token =
[ `Xml of xml_declaration
| `Doctype of doctype
| `Start of Token_tag.t
| `End of Token_tag.t
| `Chars of string list
| `Char of int
| `PI of string * string
| `Comment of string
| `EOF ]
let u_rep = Uchar.to_int Uutf.u_rep
let add_utf_8 buffer c =
Uutf.Buffer.add_utf_8 buffer (Uchar.unsafe_of_int c)
let format_char = Printf.sprintf "U+%04X"
(* Type constraints are necessary to avoid polymorphic comparison, which would
greatly reduce performance: https://github.com/aantron/markup.ml/pull/15. *)
let is_in_range (lower : int) (upper : int) c = c >= lower && c <= upper
(* HTML 8.2.2.5. *)
let is_control_character = function
| 0x000B -> true
| c when is_in_range 0x0001 0x0008 c -> true
| c when is_in_range 0x000E 0x001F c -> true
| c when is_in_range 0x007F 0x009F c -> true
| _ -> false
(* HTML 8.2.2.5. *)
let is_non_character = function
| c when is_in_range 0xFDD0 0xFDEF c -> true
| c when (c land 0xFFFF = 0xFFFF) || (c land 0xFFFF = 0xFFFE) -> true
| _ -> false
let is_digit = is_in_range 0x0030 0x0039
let is_hex_digit = function
| c when is_digit c -> true
| c when is_in_range 0x0041 0x0046 c -> true
| c when is_in_range 0x0061 0x0066 c -> true
| _ -> false
let is_scalar = function
| c when (c >= 0x10FFFF) || ((c >= 0xD800) && (c <= 0xDFFF)) -> false
| _ -> true
let is_uppercase = is_in_range 0x0041 0x005A
let is_lowercase = is_in_range 0x0061 0x007A
let is_alphabetic = function
| c when is_uppercase c -> true
| c when is_lowercase c -> true
| _ -> false
let is_alphanumeric = function
| c when is_alphabetic c -> true
| c when is_digit c -> true
| _ -> false
let is_whitespace c = c = 0x0020 || c = 0x000A || c = 0x0009 || c = 0x000D
let is_whitespace_only s =
try
s |> String.iter (fun c ->
if is_whitespace (int_of_char c) then ()
else raise Exit);
true
with Exit -> false
let to_lowercase = function
| c when is_uppercase c -> c + 0x20
| c -> c
let is_printable = is_in_range 0x0020 0x007E
let char c =
if is_printable c then begin
let buffer = Buffer.create 4 in
add_utf_8 buffer c;
Buffer.contents buffer
end
else
format_char c
let is_valid_html_char c = not (is_control_character c || is_non_character c)
let is_valid_xml_char c =
is_in_range 0x0020 0xD7FF c
|| c = 0x0009
|| c = 0x000A
|| c = 0x000D
|| is_in_range 0xE000 0xFFFD c
|| is_in_range 0x10000 0x10FFFF c
let signal_to_string = function
| `Comment s ->
Printf.sprintf "<!--%s-->" s
| `Doctype d ->
let text =
match d.doctype_name with
| None ->
begin match d.raw_text with
| None -> ""
| Some s -> " " ^ s
end
| Some name ->
match d.public_identifier, d.system_identifier with
| None, None -> " " ^ name
| Some p, None -> Printf.sprintf " %s PUBLIC \"%s\"" name p
| None, Some s -> Printf.sprintf " %s SYSTEM \"%s\"" name s
| Some p, Some s -> Printf.sprintf " %s PUBLIC \"%s\" \"%s\"" name p s
in
Printf.sprintf "<!DOCTYPE%s>" text
| `Start_element (name, attributes) ->
let name_to_string = function
| "", local_name -> local_name
| ns, local_name -> ns ^ ":" ^ local_name
in
let attributes =
attributes
|> List.map (fun (name, value) ->
Printf.sprintf " %s=\"%s\"" (name_to_string name) value)
|> String.concat ""
in
Printf.sprintf "<%s%s>" (name_to_string name) attributes
| `End_element ->
"</...>"
| `Text ss ->
String.concat "" ss
| `Xml x ->
let s = Printf.sprintf "<?xml version=\"%s\">" x.version in
let s =
match x.encoding with
| None -> s
| Some encoding -> Printf.sprintf "%s encoding=\"%s\"" s encoding
in
let s =
match x.standalone with
| None -> s
| Some standalone ->
Printf.sprintf
"%s standalone=\"%s\"" s (if standalone then "yes" else "no")
in
s ^ "?>"
| `PI (target, s) ->
Printf.sprintf "<?%s %s?>" target s
let token_to_string = function
| `Xml x ->
signal_to_string (`Xml x)
| `Doctype d ->
signal_to_string (`Doctype d)
| `Start t ->
let name = "", t.Token_tag.name in
let attributes =
t.Token_tag.attributes |> List.map (fun (n, v) -> ("", n), v) in
let s = signal_to_string (`Start_element (name, attributes)) in
if not t.Token_tag.self_closing then s
else (String.sub s 0 (String.length s - 1)) ^ "/>"
| `End t ->
Printf.sprintf "</%s>" t.Token_tag.name
| `Chars ss ->
String.concat "" ss
| `Char i ->
char i
| `PI v ->
signal_to_string (`PI v)
| `Comment s ->
signal_to_string (`Comment s)
| `EOF ->
"EOF"
let whitespace_chars = " \t\n\r"
let whitespace_prefix_length s =
let rec loop index =
if index = String.length s then index
else
if String.contains whitespace_chars s.[index] then loop (index + 1)
else index
in
loop 0
let whitespace_suffix_length s =
let rec loop rindex =
if rindex = String.length s then rindex
else
if String.contains whitespace_chars s.[String.length s - rindex - 1] then
loop (rindex + 1)
else rindex
in
loop 0
let trim_string_left s =
let prefix_length = whitespace_prefix_length s in
String.sub s prefix_length (String.length s - prefix_length)
let trim_string_right s =
let suffix_length = whitespace_suffix_length s in
String.sub s 0 (String.length s - suffix_length)
(* String.trim not available for OCaml < 4.00. *)
let trim_string s = s |> trim_string_left |> trim_string_right
(* Specialization of List.mem at string list, to avoid polymorphic
comparison. *)
let list_mem_string (s : string) l = List.exists (fun s' -> s' = s) l