You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

45 lines
1.9 KiB

;;;; Copied from https://github.com/anarchodin/trivial-utf-16
(in-package :whatscl/utf16-hacks)
(deftype high-surrogate ()
"A Unicode High Surrogate."
'(integer #xD800 #xDBFF))
(deftype low-surrogate ()
"A Unicode Low Surrogate."
'(integer #xDC00 #xDFFF))
(deftype unicode-point ()
"A Unicode code point."
'(integer 0 #x10FFFF))
(deftype unicode-string ()
"A vector of Unicode code points."
'(vector unicode-point))
(defun surrogates-to-codepoint (high-surrogate low-surrogate)
"Translate a pair of surrogate codepoints to a non-BMP codepoint. Returns the codepoint as an integer."
(check-type high-surrogate high-surrogate "a Unicode high-surrogate")
(check-type low-surrogate low-surrogate "a Unicode low-surrogate")
(let ((low-bits (ldb (byte 10 0) low-surrogate))
(high-bits (ldb (byte 10 0) high-surrogate)))
(+ #x10000 (dpb high-bits (byte 10 10) low-bits))))
(defun decode-utf-16 (utf-16-string)
"Turn a vector of UTF-16 code units into a vector of Unicode code points. Passes unpaired surrogate codepoints straight through."
(let ((result '()))
(dotimes (i (length utf-16-string))
(let ((codepoint (elt utf-16-string i)))
;; The first branch converts high-surrogate followed by low-surrogate.
;; The second branch ignores low-surrogate preceded by high-surrogate.
;; FIXME: I'm sure this can be done better.
(cond ((and (typep codepoint 'high-surrogate)
(< i (1- (length utf-16-string)))
(typep (elt utf-16-string (1+ i)) 'low-surrogate))
(push (surrogates-to-codepoint codepoint (elt utf-16-string (1+ i))) result))
((and (typep codepoint 'low-surrogate)
(> i 0)
(typep (elt utf-16-string (1- i)) 'high-surrogate)))
(t (push codepoint result)))))
(coerce (nreverse result) 'unicode-string)))