2 * Copyright 2007-2009, Lloyd Hilaiel.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in
13 * the documentation and/or other materials provided with the
16 * 3. Neither the name of Lloyd Hilaiel nor the names of its
17 * contributors may be used to endorse or promote products derived
18 * from this software without specific prior written permission.
20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
22 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
24 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
25 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
28 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
29 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
41 #ifdef YAJL_LEXER_DEBUG
/* Debug helper (it follows the YAJL_LEXER_DEBUG guard): maps a yajl_tok
 * value to a short printable name for trace output.
 * Note that the left and right brace tokens both yield "brace" and the
 * left and right bracket tokens both yield "bracket", so the returned
 * name does not distinguish opening from closing punctuation. */
43 tokToStr(yajl_tok tok
)
46 case yajl_tok_bool
: return "bool";
47 case yajl_tok_colon
: return "colon";
48 case yajl_tok_comma
: return "comma";
49 case yajl_tok_eof
: return "eof";
50 case yajl_tok_error
: return "error";
51 case yajl_tok_left_brace
: return "brace";
52 case yajl_tok_left_bracket
: return "bracket";
53 case yajl_tok_null
: return "null";
54 case yajl_tok_integer
: return "integer";
55 case yajl_tok_double
: return "double";
56 case yajl_tok_right_brace
: return "brace";
57 case yajl_tok_right_bracket
: return "bracket";
58 case yajl_tok_string
: return "string";
59 case yajl_tok_string_with_escapes
: return "string_with_escapes";
65 /* Impact of the stream parsing feature on the lexer:
67 * YAJL supports stream parsing. That is, the ability to parse the first
68 * bits of a chunk of JSON before the last bits are available (still on
69 * the network or disk). This makes the lexer more complex. The
70 * responsibility of the lexer is to handle transparently the case where
71 * a chunk boundary falls in the middle of a token. This is
72 * accomplished via a buffer and a character reading abstraction.
74 * Overview of implementation
76 * When we lex to end of input string before end of token is hit, we
77 * copy all of the input text composing the token into our lexBuf.
79 * Every time we read a character, we do so through the readChar function.
80 * readChar's responsibility is to handle pulling all chars from the buffer
81 * before pulling chars from input text
85 /* the overall line and char offset into the data */
92 /* an input buffer to handle the case where a token is spread over
96 /* in the case where we have data in the lexBuf, bufOff holds
97 * the current offset into the lexBuf. */
100 /* are we using the lex buf? */
101 unsigned int bufInUse
;
103 /* shall we allow comments? */
104 unsigned int allowComments
;
106 /* shall we validate utf8 inside strings? */
107 unsigned int validateUTF8
;
109 yajl_alloc_funcs
* alloc
;
/* readChar(lxr, txt, off): character-reading abstraction. When the lex
 * buffer is in use, is non-empty, and bufOff has not yet reached its
 * length, the macro yields the byte at bufOff inside the buffer and
 * post-increments bufOff. The alternative branch of the conditional is
 * not visible in this excerpt - presumably it reads from txt at *off;
 * TODO confirm.
 * NOTE(review): the condition uses `lxr->bufOff` without parenthesizing
 * the macro argument, unlike every other use of `lxr` here.
 *
 * unreadChar(lxr, off): steps back one character - decrements *off when
 * it is greater than zero, otherwise decrements the lexer's bufOff. */
112 #define readChar(lxr, txt, off) \
113 (((lxr)->bufInUse && yajl_buf_len((lxr)->buf) && lxr->bufOff < yajl_buf_len((lxr)->buf)) ? \
114 (*((const unsigned char *) yajl_buf_data((lxr)->buf) + ((lxr)->bufOff)++)) : \
117 #define unreadChar(lxr, off) ((*(off) > 0) ? (*(off))-- : ((lxr)->bufOff--))
/* Allocate a lexer using the supplied allocation functions.
 * The structure is obtained with YA_MALLOC, zero-filled, given a fresh
 * buffer from yajl_buf_alloc, and the allowComments / validateUTF8 flags
 * are recorded on it.
 * NOTE(review): the pointer returned by YA_MALLOC is passed straight to
 * memset with no NULL check - confirm the allocator cannot fail, or add
 * a guard. */
120 yajl_lex_alloc(yajl_alloc_funcs
* alloc
,
121 unsigned int allowComments
, unsigned int validateUTF8
)
123 yajl_lexer lxr
= (yajl_lexer
) YA_MALLOC(alloc
, sizeof(struct yajl_lexer_t
));
124 memset((void *) lxr
, 0, sizeof(struct yajl_lexer_t
));
125 lxr
->buf
= yajl_buf_alloc(alloc
);
126 lxr
->allowComments
= allowComments
;
127 lxr
->validateUTF8
= validateUTF8
;
/* Release a lexer: its buffer is freed with yajl_buf_free first, then the
 * lexer structure itself is released through YA_FREE using the allocator
 * stored in lxr->alloc.
 * NOTE(review): lxr is dereferenced unconditionally, so callers
 * presumably must not pass NULL - confirm. */
133 yajl_lex_free(yajl_lexer lxr
)
135 yajl_buf_free(lxr
->buf
);
136 YA_FREE(lxr
->alloc
, lxr
);
140 /* a lookup table which lets us quickly determine three things:
141 * VEC - valid escaped control char
142 * IJC - invalid json char
143 * VHC - valid hex char
144 * note. the solidus '/' may be escaped or not.
150 static const char charLookupTable
[256] =
152 /*00*/ IJC
, IJC
, IJC
, IJC
, IJC
, IJC
, IJC
, IJC
,
153 /*08*/ IJC
, IJC
, IJC
, IJC
, IJC
, IJC
, IJC
, IJC
,
154 /*10*/ IJC
, IJC
, IJC
, IJC
, IJC
, IJC
, IJC
, IJC
,
155 /*18*/ IJC
, IJC
, IJC
, IJC
, IJC
, IJC
, IJC
, IJC
,
157 /*20*/ 0 , 0 , VEC
|IJC
, 0 , 0 , 0 , 0 , 0 ,
158 /*28*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , VEC
,
159 /*30*/ VHC
, VHC
, VHC
, VHC
, VHC
, VHC
, VHC
, VHC
,
160 /*38*/ VHC
, VHC
, 0 , 0 , 0 , 0 , 0 , 0 ,
162 /*40*/ 0 , VHC
, VHC
, VHC
, VHC
, VHC
, VHC
, 0 ,
163 /*48*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
164 /*50*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
165 /*58*/ 0 , 0 , 0 , 0 , VEC
|IJC
, 0 , 0 , 0 ,
167 /*60*/ 0 , VHC
, VEC
|VHC
, VHC
, VHC
, VHC
, VEC
|VHC
, 0 ,
168 /*68*/ 0 , 0 , 0 , 0 , 0 , 0 , VEC
, 0 ,
169 /*70*/ 0 , 0 , VEC
, 0 , VEC
, 0 , 0 , 0 ,
170 /*78*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
172 /* include these so we don't have to always check the range of the char */
173 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
174 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
175 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
176 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
178 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
179 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
180 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
181 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
183 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
184 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
185 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
186 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
188 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
189 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
190 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
191 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0
194 /** process a variable length utf8 encoded codepoint.
197 * yajl_tok_string - if valid utf8 char was parsed and offset was
199 * yajl_tok_eof - if end of input was hit before validation could
201 * yajl_tok_error - if invalid utf8 was encountered
203 * NOTE: on error the offset will point to the first char of the
205 #define UTF8_CHECK_EOF if (*offset >= jsonTextLen) { return yajl_tok_eof; }
/* Validate one UTF-8 encoded character whose first byte is curChar.
 * The lead byte selects the expected sequence length:
 *   curChar <= 0x7f          single byte
 *   (curChar >> 5) == 0x6    one continuation byte follows
 *   (curChar >> 4) == 0x0e   two continuation bytes follow
 *   (curChar >> 3) == 0x1e   three continuation bytes follow
 * Each continuation byte is fetched with readChar and must have its top
 * two bits equal to binary 10 ((byte >> 6) == 0x2).
 * Returns yajl_tok_string for a well-formed sequence and yajl_tok_error
 * when any of these bit-pattern checks fails.
 * NOTE(review): only bit patterns are tested by the checks visible here;
 * overlong encodings and surrogate code points are not rejected by
 * them - confirm whether that is intended. */
208 yajl_lex_utf8_char(yajl_lexer lexer
, const unsigned char * jsonText
,
209 unsigned int jsonTextLen
, unsigned int * offset
,
210 unsigned char curChar
)
212 if (curChar
<= 0x7f) {
214 return yajl_tok_string
;
215 } else if ((curChar
>> 5) == 0x6) {
218 curChar
= readChar(lexer
, jsonText
, offset
);
219 if ((curChar
>> 6) == 0x2) return yajl_tok_string
;
220 } else if ((curChar
>> 4) == 0x0e) {
223 curChar
= readChar(lexer
, jsonText
, offset
);
224 if ((curChar
>> 6) == 0x2) {
226 curChar
= readChar(lexer
, jsonText
, offset
);
227 if ((curChar
>> 6) == 0x2) return yajl_tok_string
;
229 } else if ((curChar
>> 3) == 0x1e) {
232 curChar
= readChar(lexer
, jsonText
, offset
);
233 if ((curChar
>> 6) == 0x2) {
235 curChar
= readChar(lexer
, jsonText
, offset
);
236 if ((curChar
>> 6) == 0x2) {
238 curChar
= readChar(lexer
, jsonText
, offset
);
239 if ((curChar
>> 6) == 0x2) return yajl_tok_string
;
244 return yajl_tok_error
;
247 /* lex a string. input is the lexer, pointer to beginning of
248 * json text, and start of string (offset).
249 * a token is returned which has the following meanings:
250 * yajl_tok_string: lex of string was successful. offset points to
252 * yajl_tok_eof: end of text was encountered before we could complete
254 * yajl_tok_error: embedded in the string were unallowable chars. offset
255 * points to the offending char
257 #define STR_CHECK_EOF \
258 if (*offset >= jsonTextLen) { \
259 tok = yajl_tok_eof; \
260 goto finish_string_lex; \
/* Lex the body of a string token, pulling characters with readChar.
 *  - a double quote ends the string and sets tok to yajl_tok_string
 *  - a backslash must be followed either by 'u' and characters flagged
 *    VHC (hex) in charLookupTable, or by a character flagged VEC
 *  - any character flagged IJC in charLookupTable is rejected
 *  - when lexer->validateUTF8 is set, remaining characters are checked
 *    by yajl_lex_utf8_char
 * For the table-driven failures the offending character is pushed back
 * with unreadChar before lexer->error is set; every failure jumps to
 * finish_string_lex with tok left at its initial yajl_tok_error.
 * A successfully lexed string that contained escapes is reported as
 * yajl_tok_string_with_escapes instead of yajl_tok_string. */
264 yajl_lex_string(yajl_lexer lexer
, const unsigned char * jsonText
,
265 unsigned int jsonTextLen
, unsigned int * offset
)
267 yajl_tok tok
= yajl_tok_error
;
271 unsigned char curChar
;
275 curChar
= readChar(lexer
, jsonText
, offset
);
277 /* quote terminates */
278 if (curChar
== '"') {
279 tok
= yajl_tok_string
;
282 /* backslash escapes a set of control chars, */
283 else if (curChar
== '\\') {
287 /* special case \u */
288 curChar
= readChar(lexer
, jsonText
, offset
);
289 if (curChar
== 'u') {
294 curChar
= readChar(lexer
, jsonText
, offset
);
295 if (!(charLookupTable
[curChar
] & VHC
)) {
296 /* back up to offending char */
297 unreadChar(lexer
, offset
);
298 lexer
->error
= yajl_lex_string_invalid_hex_char
;
299 goto finish_string_lex
;
302 } else if (!(charLookupTable
[curChar
] & VEC
)) {
303 /* back up to offending char */
304 unreadChar(lexer
, offset
);
305 lexer
->error
= yajl_lex_string_invalid_escaped_char
;
306 goto finish_string_lex
;
309 /* when not validating UTF8 it's a simple table lookup to determine
310 * if the present character is invalid */
311 else if(charLookupTable
[curChar
] & IJC
) {
312 /* back up to offending char */
313 unreadChar(lexer
, offset
);
314 lexer
->error
= yajl_lex_string_invalid_json_char
;
315 goto finish_string_lex
;
317 /* when in validate UTF8 mode we need to do some extra work */
318 else if (lexer
->validateUTF8
) {
319 yajl_tok t
= yajl_lex_utf8_char(lexer
, jsonText
, jsonTextLen
,
322 if (t
== yajl_tok_eof
) {
324 goto finish_string_lex
;
325 } else if (t
== yajl_tok_error
) {
326 lexer
->error
= yajl_lex_string_invalid_utf8
;
327 goto finish_string_lex
;
330 /* accept it, and move on */
333 /* tell our buddy, the parser, whether he needs to process this string
335 if (hasEscapes
&& tok
== yajl_tok_string
) {
336 tok
= yajl_tok_string_with_escapes
;
342 #define RETURN_IF_EOF if (*offset >= jsonTextLen) return yajl_tok_eof;
/* Lex a JSON number, reading characters with readChar.
 * tok starts as yajl_tok_integer and is switched to yajl_tok_double once
 * a fraction or an exponent part has been consumed.
 * When a required digit is missing after the minus sign, the decimal
 * point, or the exponent, the offending character is pushed back with
 * unreadChar, lexer->error is set to the matching
 * yajl_lex_missing_integer_after_* code, and yajl_tok_error is returned.
 * Because the end of a number is only known after reading one character
 * past it, that extra character is pushed back with unreadChar at the
 * end of a successful lex. */
345 yajl_lex_number(yajl_lexer lexer
, const unsigned char * jsonText
,
346 unsigned int jsonTextLen
, unsigned int * offset
)
348 /** XXX: numbers are the only entities in json that we must lex
349 * _beyond_ in order to know that they are complete. There
350 * is an ambiguous case for integers at EOF. */
354 yajl_tok tok
= yajl_tok_integer
;
357 c
= readChar(lexer
, jsonText
, offset
);
359 /* optional leading minus */
362 c
= readChar(lexer
, jsonText
, offset
);
365 /* a single zero, or a series of integers */
368 c
= readChar(lexer
, jsonText
, offset
);
369 } else if (c
>= '1' && c
<= '9') {
372 c
= readChar(lexer
, jsonText
, offset
);
373 } while (c
>= '0' && c
<= '9');
375 unreadChar(lexer
, offset
);
376 lexer
->error
= yajl_lex_missing_integer_after_minus
;
377 return yajl_tok_error
;
380 /* optional fraction (indicates this is floating point) */
385 c
= readChar(lexer
, jsonText
, offset
);
387 while (c
>= '0' && c
<= '9') {
390 c
= readChar(lexer
, jsonText
, offset
);
394 unreadChar(lexer
, offset
);
395 lexer
->error
= yajl_lex_missing_integer_after_decimal
;
396 return yajl_tok_error
;
398 tok
= yajl_tok_double
;
401 /* optional exponent (indicates this is floating point) */
402 if (c
== 'e' || c
== 'E') {
404 c
= readChar(lexer
, jsonText
, offset
);
407 if (c
== '+' || c
== '-') {
409 c
= readChar(lexer
, jsonText
, offset
);
412 if (c
>= '0' && c
<= '9') {
415 c
= readChar(lexer
, jsonText
, offset
);
416 } while (c
>= '0' && c
<= '9');
418 unreadChar(lexer
, offset
);
419 lexer
->error
= yajl_lex_missing_integer_after_exponent
;
420 return yajl_tok_error
;
422 tok
= yajl_tok_double
;
425 /* we always go "one too far" */
426 unreadChar(lexer
, offset
);
/* Lex the remainder of a comment; tok starts as yajl_tok_comment.
 * The first character read decides the form: per the existing comments a
 * slash starts a line comment that is discarded up to end of line, and a
 * '*' starts a block comment that is discarded up to its end.
 * Any other character sets lexer->error to yajl_lex_invalid_char and the
 * result becomes yajl_tok_error. */
432 yajl_lex_comment(yajl_lexer lexer
, const unsigned char * jsonText
,
433 unsigned int jsonTextLen
, unsigned int * offset
)
437 yajl_tok tok
= yajl_tok_comment
;
440 c
= readChar(lexer
, jsonText
, offset
);
442 /* either slash or star expected */
444 /* now we throw away until end of line */
447 c
= readChar(lexer
, jsonText
, offset
);
449 } else if (c
== '*') {
450 /* now we throw away until end of comment */
453 c
= readChar(lexer
, jsonText
, offset
);
456 c
= readChar(lexer
, jsonText
, offset
);
460 unreadChar(lexer
, offset
);
465 lexer
->error
= yajl_lex_invalid_char
;
466 tok
= yajl_tok_error
;
/* Main lexer entry point: lex the next token of jsonText starting at
 * *offset and report the token's text through outBuf / outLen.
 *
 * tok starts as yajl_tok_error and startOffset remembers where this token
 * began in the current chunk. The first character read selects the path:
 *  - structural characters map directly to the bracket, brace, comma and
 *    colon tokens
 *  - the literals are matched against their remaining expected characters
 *    ("rue", "alse", "ull"); a mismatch pushes the character back and
 *    sets lexer->error to yajl_lex_invalid_string
 *  - strings are delegated to yajl_lex_string
 *  - numbers are delegated to yajl_lex_number after the first character
 *    has been pushed back with unreadChar
 *  - a comment opener is rejected with yajl_lex_unallowed_comment when
 *    lexer->allowComments is off, otherwise handed to yajl_lex_comment;
 *    after a successful comment the buffer is cleared, tok is reset to
 *    yajl_tok_error and startOffset is moved to the current offset
 *  - anything else sets lexer->error to yajl_lex_invalid_char
 *
 * Output handling: when the token hit EOF, or the lex buffer is already
 * in use, the text consumed from this chunk is appended to lexer->buf and,
 * if the token is now complete (tok != yajl_tok_eof), outBuf / outLen
 * describe the buffer contents. Otherwise, for any non-error token,
 * outBuf / outLen point directly into jsonText at startOffset.
 * String tokens get special handling so the surrounding quotes are
 * skipped (asserting *outLen >= 2). */
473 yajl_lex_lex(yajl_lexer lexer
, const unsigned char * jsonText
,
474 unsigned int jsonTextLen
, unsigned int * offset
,
475 const unsigned char ** outBuf
, unsigned int * outLen
)
477 yajl_tok tok
= yajl_tok_error
;
479 unsigned int startOffset
= *offset
;
485 assert(*offset
<= jsonTextLen
);
487 if (*offset
>= jsonTextLen
) {
492 c
= readChar(lexer
, jsonText
, offset
);
496 tok
= yajl_tok_left_bracket
;
499 tok
= yajl_tok_right_bracket
;
502 tok
= yajl_tok_left_brace
;
505 tok
= yajl_tok_right_brace
;
508 tok
= yajl_tok_comma
;
511 tok
= yajl_tok_colon
;
513 case '\t': case '\n': case '\v': case '\f': case '\r': case ' ':
517 const char * want
= "rue";
519 if (*offset
>= jsonTextLen
) {
523 c
= readChar(lexer
, jsonText
, offset
);
525 unreadChar(lexer
, offset
);
526 lexer
->error
= yajl_lex_invalid_string
;
527 tok
= yajl_tok_error
;
535 const char * want
= "alse";
537 if (*offset
>= jsonTextLen
) {
541 c
= readChar(lexer
, jsonText
, offset
);
543 unreadChar(lexer
, offset
);
544 lexer
->error
= yajl_lex_invalid_string
;
545 tok
= yajl_tok_error
;
553 const char * want
= "ull";
555 if (*offset
>= jsonTextLen
) {
559 c
= readChar(lexer
, jsonText
, offset
);
561 unreadChar(lexer
, offset
);
562 lexer
->error
= yajl_lex_invalid_string
;
563 tok
= yajl_tok_error
;
571 tok
= yajl_lex_string(lexer
, (const unsigned char *) jsonText
,
572 jsonTextLen
, offset
);
576 case '0': case '1': case '2': case '3': case '4':
577 case '5': case '6': case '7': case '8': case '9': {
578 /* integer parsing wants to start from the beginning */
579 unreadChar(lexer
, offset
);
580 tok
= yajl_lex_number(lexer
, (const unsigned char *) jsonText
,
581 jsonTextLen
, offset
);
585 /* hey, look, a probable comment! If comments are disabled
587 if (!lexer
->allowComments
) {
588 unreadChar(lexer
, offset
);
589 lexer
->error
= yajl_lex_unallowed_comment
;
590 tok
= yajl_tok_error
;
593 /* if comments are enabled, then we should try to lex
594 * the thing. possible outcomes are
595 * - successful lex (tok_comment, which means continue),
596 * - malformed comment opening (slash not followed by
597 * '*' or '/') (tok_error)
598 * - eof hit. (tok_eof) */
599 tok
= yajl_lex_comment(lexer
, (const unsigned char *) jsonText
,
600 jsonTextLen
, offset
);
601 if (tok
== yajl_tok_comment
) {
602 /* "error" is silly, but that's the initial
603 * state of tok. guilty until proven innocent. */
604 tok
= yajl_tok_error
;
605 yajl_buf_clear(lexer
->buf
);
607 startOffset
= *offset
;
610 /* hit error or eof, bail */
613 lexer
->error
= yajl_lex_invalid_char
;
614 tok
= yajl_tok_error
;
621 /* need to append to buffer if the buffer is in use or
622 * if it's an EOF token */
623 if (tok
== yajl_tok_eof
|| lexer
->bufInUse
) {
624 if (!lexer
->bufInUse
) yajl_buf_clear(lexer
->buf
);
626 yajl_buf_append(lexer
->buf
, jsonText
+ startOffset
, *offset
- startOffset
);
629 if (tok
!= yajl_tok_eof
) {
630 *outBuf
= yajl_buf_data(lexer
->buf
);
631 *outLen
= yajl_buf_len(lexer
->buf
);
634 } else if (tok
!= yajl_tok_error
) {
635 *outBuf
= jsonText
+ startOffset
;
636 *outLen
= *offset
- startOffset
;
639 /* special case for strings. skip the quotes. */
640 if (tok
== yajl_tok_string
|| tok
== yajl_tok_string_with_escapes
)
642 assert(*outLen
>= 2);
648 #ifdef YAJL_LEXER_DEBUG
649 if (tok
== yajl_tok_error
) {
650 printf("lexical error: %s\n",
651 yajl_lex_error_to_string(yajl_lex_get_error(lexer
)));
652 } else if (tok
== yajl_tok_eof
) {
655 printf("lexed %s: '", tokToStr(tok
));
656 fwrite(*outBuf
, 1, *outLen
, stdout
);
/* Translate a yajl_lex_error code into a static, human-readable English
 * message. A code that matches none of the cases yields
 * "unknown error code". */
665 yajl_lex_error_to_string(yajl_lex_error error
)
669 return "ok, no error";
670 case yajl_lex_string_invalid_utf8
:
671 return "invalid bytes in UTF8 string.";
672 case yajl_lex_string_invalid_escaped_char
:
673 return "inside a string, '\\' occurs before a character "
675 case yajl_lex_string_invalid_json_char
:
676 return "invalid character inside string.";
677 case yajl_lex_string_invalid_hex_char
:
678 return "invalid (non-hex) character occurs after '\\u' inside "
680 case yajl_lex_invalid_char
:
681 return "invalid char in json text.";
682 case yajl_lex_invalid_string
:
683 return "invalid string in json text.";
684 case yajl_lex_missing_integer_after_exponent
:
685 return "malformed number, a digit is required after the exponent.";
686 case yajl_lex_missing_integer_after_decimal
:
687 return "malformed number, a digit is required after the "
689 case yajl_lex_missing_integer_after_minus
:
690 return "malformed number, a digit is required after the "
692 case yajl_lex_unallowed_comment
:
693 return "probable comment found in input text, comments are "
696 return "unknown error code";
700 /** allows access to more specific information about the lexical
701 * error when yajl_lex_lex returns yajl_tok_error. */
703 yajl_lex_get_error(yajl_lexer lexer
)
/* a NULL lexer carries no error state, so the sentinel value -1 (cast to
 * yajl_lex_error) is returned instead of dereferencing it */
705 if (lexer
== NULL
) return (yajl_lex_error
) -1;
/* Accessor: returns the lexer's lineOff field. The lexer is dereferenced
 * without a NULL check. */
709 unsigned int yajl_lex_current_line(yajl_lexer lexer
)
711 return lexer
->lineOff
;
/* Accessor: returns the lexer's charOff field. The lexer is dereferenced
 * without a NULL check. */
714 unsigned int yajl_lex_current_char(yajl_lexer lexer
)
716 return lexer
->charOff
;
/* Look at the next token without consuming it.
 * The buffer length, bufOff and bufInUse are saved, yajl_lex_lex is run
 * against the function's own by-value copy of offset (so the caller's
 * offset is untouched), and afterwards bufOff and bufInUse are restored
 * and the buffer is truncated back to its saved length, undoing anything
 * the lex appended to it. */
719 yajl_tok
yajl_lex_peek(yajl_lexer lexer
, const unsigned char * jsonText
,
720 unsigned int jsonTextLen
, unsigned int offset
)
722 const unsigned char * outBuf
;
724 unsigned int bufLen
= yajl_buf_len(lexer
->buf
);
725 unsigned int bufOff
= lexer
->bufOff
;
726 unsigned int bufInUse
= lexer
->bufInUse
;
729 tok
= yajl_lex_lex(lexer
, jsonText
, jsonTextLen
, &offset
,
732 lexer
->bufOff
= bufOff
;
733 lexer
->bufInUse
= bufInUse
;
734 yajl_buf_truncate(lexer
->buf
, bufLen
);