Switch to comparse for UTF8 validation.

This commit is contained in:
Thomas Hintz 2014-10-17 06:52:56 -07:00
parent baf570ab65
commit d194289740
2 changed files with 80 additions and 50 deletions

76
utf8-grammar.scm Normal file
View File

@ -0,0 +1,76 @@
(import chicken scheme)
(use srfi-4 srfi-13 srfi-14 comparse)
(define (ucs-range->char-set/inclusive lower upper)
(ucs-range->char-set lower (add1 upper)))
(define utf8-tail
(in (ucs-range->char-set/inclusive #x80 #xBF)))
(define utf8-1
(in (ucs-range->char-set/inclusive #x00 #x7F)))
(define utf8-2
(sequence
(in (ucs-range->char-set/inclusive #xC2 #xDF))
utf8-tail))
(define utf8-3
(any-of
(sequence
(is #\xE0)
(in (ucs-range->char-set/inclusive #xA0 #xBF))
utf8-tail)
(sequence
(in (ucs-range->char-set/inclusive #xE1 #xEC))
(repeated utf8-tail 2))
(sequence
(is #\xED)
(in (ucs-range->char-set/inclusive #x80 #x9F))
utf8-tail)
(sequence
(in (ucs-range->char-set/inclusive #xEE #xEF))
(repeated utf8-tail 2))))
(define utf8-4
(any-of
(sequence
(is #\xF0)
(in (ucs-range->char-set/inclusive #x90 #xBF))
(repeated utf8-tail 2))
(sequence
(in (ucs-range->char-set/inclusive #xF1 #xF3))
(repeated utf8-tail 3))
(sequence
(is #\xF4)
(in (ucs-range->char-set/inclusive #x80 #x8F))
(repeated utf8-tail 2))))
(define utf8-char
(any-of
utf8-1
utf8-2
utf8-3
utf8-4))
(define utf8-string
(followed-by (zero-or-more utf8-char) end-of-input))
;; (parse utf8-string (->parser-input "Hello-µ@ßöäüàá-UTF-8!!"))
;; (parse utf8-char (->parser-input #\a))
;; (define (valid-utf8? s)
;; (let ((len (string-length s)))
;; (let loop ((i 0))
;; (if (= i len)
;; #t
;; (let ((r (parse utf8-char (->parser-input (->string (string-ref s i))))))
;; (if r
;; (loop (+ i (length r)))
;; (string-ref s i)))))))
;; (valid-utf8? "Hello-µ@ßöäüàá-UTF-8!!")
;; (valid-utf8? "Hello")
;; (parse utf8-char (->parser-input (->string #\H)))
;; #\xC0

View File

@ -25,9 +25,7 @@
(import chicken scheme data-structures extras ports posix foreign)
(use srfi-1 srfi-4 spiffy intarweb uri-common base64 simple-sha1 srfi-18
srfi-13 mailbox)
(foreign-declare "#include \"utf8validator.c\"")
srfi-13 mailbox srfi-14 comparse)
(define-inline (neq? obj1 obj2) (not (eq? obj1 obj2)))
@ -186,7 +184,7 @@
(dynamic-wind
(lambda () (mutex-lock! (websocket-send-mutex ws)))
(lambda () (send-frame ws optype data #t))
(lambda () (mutex-unlock! (websocket-send-mutex ws))))
(lambda () (mutex-unlock! (websocket-send-mutex ws)))))
(define (websocket-unmask-frame-payload payload len frame-masking-key)
(define tmaskkey (make-u8vector 4 #f #t #t))
@ -339,53 +337,9 @@
(make-property-condition 'unhandled-optype
'optype frame-optype)))))))))))
(define (valid-utf8-2? s)
(define-external str c-string s)
(define-external len int (string-length s))
(zero?
((foreign-lambda* int ()
"
static const uint8_t utf8d[] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};
uint32_t si;
uint32_t *state;
si = 0;
state = &si;
uint32_t type;
for (int i = 0; i < len; i++) {
// type = utf8d[(uint8_t)str[i]];
type = utf8d[*((uint8_t*)str)];
*state = utf8d[256 + (*state) * 16 + type];
if (*state != 0) // reject
break;
}
C_return(*state);
"
))
))
(include "utf8-grammar.scm")
(define (valid-utf8? s)
(let ((len (string-length s)))
((foreign-lambda int "utf8_valid" scheme-pointer int)
s len)))
(parse utf8-string (->parser-input s) memoize: #t))
(define (close-code->integer s)
(if (string-null? s)