You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
45 lines
1.9 KiB
45 lines
1.9 KiB
|
|
;;;; Copied from https://github.com/anarchodin/trivial-utf-16 |
|
(in-package :whatscl/utf16-hacks) |
|
|
|
(deftype high-surrogate () |
|
"A Unicode High Surrogate." |
|
'(integer #xD800 #xDBFF)) |
|
|
|
(deftype low-surrogate () |
|
"A Unicode Low Surrogate." |
|
'(integer #xDC00 #xDFFF)) |
|
|
|
(deftype unicode-point () |
|
"A Unicode code point." |
|
'(integer 0 #x10FFFF)) |
|
|
|
(deftype unicode-string () |
|
"A vector of Unicode code points." |
|
'(vector unicode-point)) |
|
|
|
(defun surrogates-to-codepoint (high-surrogate low-surrogate) |
|
"Translate a pair of surrogate codepoints to a non-BMP codepoint. Returns the codepoint as an integer." |
|
(check-type high-surrogate high-surrogate "a Unicode high-surrogate") |
|
(check-type low-surrogate low-surrogate "a Unicode low-surrogate") |
|
(let ((low-bits (ldb (byte 10 0) low-surrogate)) |
|
(high-bits (ldb (byte 10 0) high-surrogate))) |
|
(+ #x10000 (dpb high-bits (byte 10 10) low-bits)))) |
|
|
|
(defun decode-utf-16 (utf-16-string) |
|
"Turn a vector of UTF-16 code units into a vector of Unicode code points. Passes unpaired surrogate codepoints straight through." |
|
(let ((result '())) |
|
(dotimes (i (length utf-16-string)) |
|
(let ((codepoint (elt utf-16-string i))) |
|
;; The first branch converts high-surrogate followed by low-surrogate. |
|
;; The second branch ignores low-surrogate preceded by high-surrogate. |
|
;; FIXME: I'm sure this can be done better. |
|
(cond ((and (typep codepoint 'high-surrogate) |
|
(< i (1- (length utf-16-string))) |
|
(typep (elt utf-16-string (1+ i)) 'low-surrogate)) |
|
(push (surrogates-to-codepoint codepoint (elt utf-16-string (1+ i))) result)) |
|
((and (typep codepoint 'low-surrogate) |
|
(> i 0) |
|
(typep (elt utf-16-string (1- i)) 'high-surrogate))) |
|
(t (push codepoint result))))) |
|
(coerce (nreverse result) 'unicode-string)))
|
|
|