Switch to comparse for UTF8 validation.
This commit is contained in:
parent
baf570ab65
commit
d194289740
|
@ -0,0 +1,76 @@
|
||||||
|
(import chicken scheme)
|
||||||
|
(use srfi-4 srfi-13 srfi-14 comparse)
|
||||||
|
|
||||||
|
|
||||||
|
(define (ucs-range->char-set/inclusive lower upper)
|
||||||
|
(ucs-range->char-set lower (add1 upper)))
|
||||||
|
|
||||||
|
(define utf8-tail
|
||||||
|
(in (ucs-range->char-set/inclusive #x80 #xBF)))
|
||||||
|
|
||||||
|
(define utf8-1
|
||||||
|
(in (ucs-range->char-set/inclusive #x00 #x7F)))
|
||||||
|
|
||||||
|
(define utf8-2
|
||||||
|
(sequence
|
||||||
|
(in (ucs-range->char-set/inclusive #xC2 #xDF))
|
||||||
|
utf8-tail))
|
||||||
|
|
||||||
|
(define utf8-3
|
||||||
|
(any-of
|
||||||
|
(sequence
|
||||||
|
(is #\xE0)
|
||||||
|
(in (ucs-range->char-set/inclusive #xA0 #xBF))
|
||||||
|
utf8-tail)
|
||||||
|
(sequence
|
||||||
|
(in (ucs-range->char-set/inclusive #xE1 #xEC))
|
||||||
|
(repeated utf8-tail 2))
|
||||||
|
(sequence
|
||||||
|
(is #\xED)
|
||||||
|
(in (ucs-range->char-set/inclusive #x80 #x9F))
|
||||||
|
utf8-tail)
|
||||||
|
(sequence
|
||||||
|
(in (ucs-range->char-set/inclusive #xEE #xEF))
|
||||||
|
(repeated utf8-tail 2))))
|
||||||
|
|
||||||
|
(define utf8-4
|
||||||
|
(any-of
|
||||||
|
(sequence
|
||||||
|
(is #\xF0)
|
||||||
|
(in (ucs-range->char-set/inclusive #x90 #xBF))
|
||||||
|
(repeated utf8-tail 2))
|
||||||
|
(sequence
|
||||||
|
(in (ucs-range->char-set/inclusive #xF1 #xF3))
|
||||||
|
(repeated utf8-tail 3))
|
||||||
|
(sequence
|
||||||
|
(is #\xF4)
|
||||||
|
(in (ucs-range->char-set/inclusive #x80 #x8F))
|
||||||
|
(repeated utf8-tail 2))))
|
||||||
|
|
||||||
|
(define utf8-char
|
||||||
|
(any-of
|
||||||
|
utf8-1
|
||||||
|
utf8-2
|
||||||
|
utf8-3
|
||||||
|
utf8-4))
|
||||||
|
|
||||||
|
(define utf8-string
|
||||||
|
(followed-by (zero-or-more utf8-char) end-of-input))
|
||||||
|
|
||||||
|
;; (parse utf8-string (->parser-input "Hello-µ@ßöäüàá-UTF-8!!"))
|
||||||
|
;; (parse utf8-char (->parser-input #\a))
|
||||||
|
|
||||||
|
;; (define (valid-utf8? s)
|
||||||
|
;; (let ((len (string-length s)))
|
||||||
|
;; (let loop ((i 0))
|
||||||
|
;; (if (= i len)
|
||||||
|
;; #t
|
||||||
|
;; (let ((r (parse utf8-char (->parser-input (->string (string-ref s i))))))
|
||||||
|
;; (if r
|
||||||
|
;; (loop (+ i (length r)))
|
||||||
|
;; (string-ref s i)))))))
|
||||||
|
;; (valid-utf8? "Hello-µ@ßöäüàá-UTF-8!!")
|
||||||
|
;; (valid-utf8? "Hello")
|
||||||
|
;; (parse utf8-char (->parser-input (->string #\H)))
|
||||||
|
|
||||||
|
;; #\xC0
|
|
@ -25,9 +25,7 @@
|
||||||
|
|
||||||
(import chicken scheme data-structures extras ports posix foreign)
|
(import chicken scheme data-structures extras ports posix foreign)
|
||||||
(use srfi-1 srfi-4 spiffy intarweb uri-common base64 simple-sha1 srfi-18
|
(use srfi-1 srfi-4 spiffy intarweb uri-common base64 simple-sha1 srfi-18
|
||||||
srfi-13 mailbox)
|
srfi-13 mailbox srfi-14 comparse)
|
||||||
|
|
||||||
(foreign-declare "#include \"utf8validator.c\"")
|
|
||||||
|
|
||||||
(define-inline (neq? obj1 obj2) (not (eq? obj1 obj2)))
|
(define-inline (neq? obj1 obj2) (not (eq? obj1 obj2)))
|
||||||
|
|
||||||
|
@ -186,7 +184,7 @@
|
||||||
(dynamic-wind
|
(dynamic-wind
|
||||||
(lambda () (mutex-lock! (websocket-send-mutex ws)))
|
(lambda () (mutex-lock! (websocket-send-mutex ws)))
|
||||||
(lambda () (send-frame ws optype data #t))
|
(lambda () (send-frame ws optype data #t))
|
||||||
(lambda () (mutex-unlock! (websocket-send-mutex ws))))
|
(lambda () (mutex-unlock! (websocket-send-mutex ws)))))
|
||||||
|
|
||||||
(define (websocket-unmask-frame-payload payload len frame-masking-key)
|
(define (websocket-unmask-frame-payload payload len frame-masking-key)
|
||||||
(define tmaskkey (make-u8vector 4 #f #t #t))
|
(define tmaskkey (make-u8vector 4 #f #t #t))
|
||||||
|
@ -339,53 +337,9 @@
|
||||||
(make-property-condition 'unhandled-optype
|
(make-property-condition 'unhandled-optype
|
||||||
'optype frame-optype)))))))))))
|
'optype frame-optype)))))))))))
|
||||||
|
|
||||||
(define (valid-utf8-2? s)
|
(include "utf8-grammar.scm")
|
||||||
(define-external str c-string s)
|
|
||||||
(define-external len int (string-length s))
|
|
||||||
(zero?
|
|
||||||
((foreign-lambda* int ()
|
|
||||||
"
|
|
||||||
static const uint8_t utf8d[] = {
|
|
||||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
|
|
||||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
|
|
||||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
|
|
||||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
|
|
||||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
|
|
||||||
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
|
|
||||||
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
|
|
||||||
0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
|
|
||||||
0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
|
|
||||||
0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
|
|
||||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
|
|
||||||
1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
|
|
||||||
1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
|
|
||||||
1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
|
|
||||||
};
|
|
||||||
|
|
||||||
uint32_t si;
|
|
||||||
uint32_t *state;
|
|
||||||
si = 0;
|
|
||||||
state = &si;
|
|
||||||
uint32_t type;
|
|
||||||
|
|
||||||
for (int i = 0; i < len; i++) {
|
|
||||||
// type = utf8d[(uint8_t)str[i]];
|
|
||||||
type = utf8d[*((uint8_t*)str)];
|
|
||||||
*state = utf8d[256 + (*state) * 16 + type];
|
|
||||||
|
|
||||||
if (*state != 0) // reject
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
C_return(*state);
|
|
||||||
"
|
|
||||||
))
|
|
||||||
))
|
|
||||||
|
|
||||||
(define (valid-utf8? s)
|
(define (valid-utf8? s)
|
||||||
(let ((len (string-length s)))
|
(parse utf8-string (->parser-input s) memoize: #t))
|
||||||
((foreign-lambda int "utf8_valid" scheme-pointer int)
|
|
||||||
s len)))
|
|
||||||
|
|
||||||
(define (close-code->integer s)
|
(define (close-code->integer s)
|
||||||
(if (string-null? s)
|
(if (string-null? s)
|
||||||
|
|
Loading…
Reference in New Issue