1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
(* This file is part of Markup.ml, released under the MIT license. See
   LICENSE.md for details, or visit https://github.com/aantron/markup.ml. *)

(* Aliases for reducing the number of deprecation warnings. *)
module String =
struct
  include String
  let lowercase = lowercase [@ocaml.warning "-3"]
end

module Char =
struct
  include Char
  let lowercase = lowercase [@ocaml.warning "-3"]
end

type 'a cont = 'a -> unit
type 'a cps = exn cont -> 'a cont -> unit

type location = int * int

let compare_locations (line, column) (line', column') =
  match line - line' with
  | 0 -> column - column'
  | order -> order

type name = string * string

let xml_ns = "http://www.w3.org/XML/1998/namespace"
let xmlns_ns = "http://www.w3.org/2000/xmlns/"
let xlink_ns = "http://www.w3.org/1999/xlink"
let html_ns = "http://www.w3.org/1999/xhtml"
let svg_ns = "http://www.w3.org/2000/svg"
let mathml_ns = "http://www.w3.org/1998/Math/MathML"

module Token_tag =
struct
  type t =
    {name         : string;
     attributes   : (string * string) list;
     self_closing : bool}
end

type xml_declaration =
  {version    : string;
   encoding   : string option;
   standalone : bool option}

type doctype =
  {doctype_name      : string option;
   public_identifier : string option;
   system_identifier : string option;
   raw_text          : string option;
   force_quirks      : bool}

type signal =
  [ `Start_element of name * (name * string) list
  | `End_element
  | `Text of string list
  | `Xml of xml_declaration
  | `Doctype of doctype
  | `PI of string * string
  | `Comment of string ]

type general_token =
  [ `Xml of xml_declaration
  | `Doctype of doctype
  | `Start of Token_tag.t
  | `End of Token_tag.t
  | `Chars of string list
  | `Char of int
  | `PI of string * string
  | `Comment of string
  | `EOF ]

let u_rep = Uchar.to_int Uutf.u_rep

let add_utf_8 buffer c =
  Uutf.Buffer.add_utf_8 buffer (Uchar.unsafe_of_int c)

let format_char = Printf.sprintf "U+%04X"

(* Type constraints are necessary to avoid polymorphic comparison, which would
   greatly reduce performance: https://github.com/aantron/markup.ml/pull/15. *)
let is_in_range (lower : int) (upper : int) c = c >= lower && c <= upper

(* HTML 8.2.2.5. *)
let is_control_character = function
  | 0x000B -> true
  | c when is_in_range 0x0001 0x0008 c -> true
  | c when is_in_range 0x000E 0x001F c -> true
  | c when is_in_range 0x007F 0x009F c -> true
  | _ -> false

(* HTML 8.2.2.5. *)
let is_non_character = function
  | c when is_in_range 0xFDD0 0xFDEF c -> true
  | c when (c land 0xFFFF = 0xFFFF) || (c land 0xFFFF = 0xFFFE) -> true
  | _ -> false

let is_digit = is_in_range 0x0030 0x0039

let is_hex_digit = function
  | c when is_digit c -> true
  | c when is_in_range 0x0041 0x0046 c -> true
  | c when is_in_range 0x0061 0x0066 c -> true
  | _ -> false

let is_scalar = function
  | c when (c >= 0x10FFFF) || ((c >= 0xD800) && (c <= 0xDFFF)) -> false
  | _ -> true

let is_uppercase = is_in_range 0x0041 0x005A

let is_lowercase = is_in_range 0x0061 0x007A

let is_alphabetic = function
  | c when is_uppercase c -> true
  | c when is_lowercase c -> true
  | _ -> false

let is_alphanumeric = function
  | c when is_alphabetic c -> true
  | c when is_digit c -> true
  | _ -> false

let is_whitespace c = c = 0x0020 || c = 0x000A || c = 0x0009 || c = 0x000D

let is_whitespace_only s =
  try
    s |> String.iter (fun c ->
      if is_whitespace (int_of_char c) then ()
      else raise Exit);
    true

  with Exit -> false

let to_lowercase = function
  | c when is_uppercase c -> c + 0x20
  | c -> c

let is_printable = is_in_range 0x0020 0x007E

let char c =
  if is_printable c then begin
    let buffer = Buffer.create 4 in
    add_utf_8 buffer c;
    Buffer.contents buffer
  end
  else
    format_char c

let is_valid_html_char c = not (is_control_character c || is_non_character c)

let is_valid_xml_char c =
  is_in_range 0x0020 0xD7FF c
  || c = 0x0009
  || c = 0x000A
  || c = 0x000D
  || is_in_range 0xE000 0xFFFD c
  || is_in_range 0x10000 0x10FFFF c

let signal_to_string = function
  | `Comment s ->
    Printf.sprintf "<!--%s-->" s

  | `Doctype d ->
    let text =
      match d.doctype_name with
      | None ->
        begin match d.raw_text with
        | None -> ""
        | Some s -> " " ^ s
        end
      | Some name ->
        match d.public_identifier, d.system_identifier with
        | None, None -> " " ^ name
        | Some p, None -> Printf.sprintf " %s PUBLIC \"%s\"" name p
        | None, Some s -> Printf.sprintf " %s SYSTEM \"%s\"" name s
        | Some p, Some s -> Printf.sprintf " %s PUBLIC \"%s\" \"%s\"" name p s
    in
    Printf.sprintf "<!DOCTYPE%s>" text

  | `Start_element (name, attributes) ->
    let name_to_string = function
      | "", local_name -> local_name
      | ns, local_name -> ns ^ ":" ^ local_name
    in
    let attributes =
      attributes
      |> List.map (fun (name, value) ->
        Printf.sprintf " %s=\"%s\"" (name_to_string name) value)
      |> String.concat ""
    in
    Printf.sprintf "<%s%s>" (name_to_string name) attributes

  | `End_element ->
    "</...>"

  | `Text ss ->
    String.concat "" ss

  | `Xml x ->
    let s = Printf.sprintf "<?xml version=\"%s\">" x.version in
    let s =
      match x.encoding with
      | None -> s
      | Some encoding -> Printf.sprintf "%s encoding=\"%s\"" s encoding
    in
    let s =
      match x.standalone with
      | None -> s
      | Some standalone ->
        Printf.sprintf
          "%s standalone=\"%s\"" s (if standalone then "yes" else "no")
    in
    s ^ "?>"

  | `PI (target, s) ->
    Printf.sprintf "<?%s %s?>" target s

let token_to_string = function
  | `Xml x ->
    signal_to_string (`Xml x)

  | `Doctype d ->
    signal_to_string (`Doctype d)

  | `Start t ->
    let name = "", t.Token_tag.name in
    let attributes =
      t.Token_tag.attributes |> List.map (fun (n, v) -> ("", n), v) in
    let s = signal_to_string (`Start_element (name, attributes)) in
    if not t.Token_tag.self_closing then s
    else (String.sub s 0 (String.length s - 1)) ^ "/>"

  | `End t ->
    Printf.sprintf "</%s>" t.Token_tag.name

  | `Chars ss ->
    String.concat "" ss

  | `Char i ->
    char i

  | `PI v ->
    signal_to_string (`PI v)

  | `Comment s ->
    signal_to_string (`Comment s)

  | `EOF ->
    "EOF"

let whitespace_chars = " \t\n\r"

let whitespace_prefix_length s =
  let rec loop index =
    if index = String.length s then index
    else
      if String.contains whitespace_chars s.[index] then loop (index + 1)
      else index
  in
  loop 0

let whitespace_suffix_length s =
  let rec loop rindex =
    if rindex = String.length s then rindex
    else
      if String.contains whitespace_chars s.[String.length s - rindex - 1] then
        loop (rindex + 1)
      else rindex
  in
  loop 0

let trim_string_left s =
  let prefix_length = whitespace_prefix_length s in
  String.sub s prefix_length (String.length s - prefix_length)

let trim_string_right s =
  let suffix_length = whitespace_suffix_length s in
  String.sub s 0 (String.length s - suffix_length)

(* String.trim not available for OCaml < 4.00. *)
let trim_string s = s |> trim_string_left |> trim_string_right

(* Specialization of List.mem at string list, to avoid polymorphic
   comparison. *)
let list_mem_string (s : string) l = List.exists (fun s' -> s' = s) l