Dogcows Code - chaz/yoink/blob - src/yajl_lex.c

   1 /*
   2  * Copyright 2007-2009, Lloyd Hilaiel.
   3  *
   4  * Redistribution and use in source and binary forms, with or without
   5  * modification, are permitted provided that the following conditions are
   6  * met:
   7  *
   8  *  1. Redistributions of source code must retain the above copyright
   9  *     notice, this list of conditions and the following disclaimer.
  10  *
  11  *  2. Redistributions in binary form must reproduce the above copyright
  12  *     notice, this list of conditions and the following disclaimer in
  13  *     the documentation and/or other materials provided with the
  14  *     distribution.
  15  *
  16  *  3. Neither the name of Lloyd Hilaiel nor the names of its
  17  *     contributors may be used to endorse or promote products derived
  18  *     from this software without specific prior written permission.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  21  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  22  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  23  * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
  24  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  25  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  26  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  28  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  29  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  30  * POSSIBILITY OF SUCH DAMAGE.
  31  */
  32
  33 #include "yajl_lex.h"
  34 #include "yajl_buf.h"
  35
  36 #include <stdlib.h>
  37 #include <stdio.h>
  38 #include <assert.h>
  39 #include <string.h>
  40
  41 #ifdef YAJL_LEXER_DEBUG
  42 static const char *
  43 tokToStr(yajl_tok tok)
  44 {
  45     switch (tok) {
  46         case yajl_tok_bool: return "bool";
  47         case yajl_tok_colon: return "colon";
  48         case yajl_tok_comma: return "comma";
  49         case yajl_tok_eof: return "eof";
  50         case yajl_tok_error: return "error";
  51         case yajl_tok_left_brace: return "brace";
  52         case yajl_tok_left_bracket: return "bracket";
  53         case yajl_tok_null: return "null";
  54         case yajl_tok_integer: return "integer";
  55         case yajl_tok_double: return "double";
  56         case yajl_tok_right_brace: return "brace";
  57         case yajl_tok_right_bracket: return "bracket";
  58         case yajl_tok_string: return "string";
  59         case yajl_tok_string_with_escapes: return "string_with_escapes";
  60     }
  61     return "unknown";
  62 }
  63 #endif
  64
  65 /* Impact of the stream parsing feature on the lexer:
  66  *
  67  * YAJL supports stream parsing.  That is, the ability to parse the first
  68  * bits of a chunk of JSON before the last bits are available (still on
  69  * the network or disk).  This makes the lexer more complex.  The
  70  * responsibility of the lexer is to handle transparently the case where
  71  * a chunk boundary falls in the middle of a token.  This is
  72  * accomplished via a buffer and a character reading abstraction.
  73  *
  74  * Overview of implementation
  75  *
  76  * When we lex to end of input string before end of token is hit, we
  77  * copy all of the input text composing the token into our lexBuf.
  78  *
  79  * Every time we read a character, we do so through the readChar macro.
  80  * readChar's responsibility is to handle pulling all chars from the buffer
  81  * before pulling chars from input text.
  82  */
  83
  84 struct yajl_lexer_t {
  85     /* the overal line and char offset into the data */
  86     unsigned int lineOff;
  87     unsigned int charOff;
  88
  89     /* error */
  90     yajl_lex_error error;
  91
  92     /* a input buffer to handle the case where a token is spread over
  93      * multiple chunks */
  94     yajl_buf buf;
  95
  96     /* in the case where we have data in the lexBuf, bufOff holds
  97      * the current offset into the lexBuf. */
  98     unsigned int bufOff;
  99
 100     /* are we using the lex buf? */
 101     unsigned int bufInUse;
 102
 103     /* shall we allow comments? */
 104     unsigned int allowComments;
 105
 106     /* shall we validate utf8 inside strings? */
 107     unsigned int validateUTF8;
 108
 109     yajl_alloc_funcs * alloc;
 110 };
 111
 112 #define readChar(lxr, txt, off)                      \
 113     (((lxr)->bufInUse && yajl_buf_len((lxr)->buf) && lxr->bufOff < yajl_buf_len((lxr)->buf)) ? \
 114      (*((const unsigned char *) yajl_buf_data((lxr)->buf) + ((lxr)->bufOff)++)) : \
 115      ((txt)[(*(off))++]))
 116
 117 #define unreadChar(lxr, off) ((*(off) > 0) ? (*(off))-- : ((lxr)->bufOff--))
 118
 119 yajl_lexer
 120 yajl_lex_alloc(yajl_alloc_funcs * alloc,
 121                unsigned int allowComments, unsigned int validateUTF8)
 122 {
 123     yajl_lexer lxr = (yajl_lexer) YA_MALLOC(alloc, sizeof(struct yajl_lexer_t));
 124     memset((void *) lxr, 0, sizeof(struct yajl_lexer_t));
 125     lxr->buf = yajl_buf_alloc(alloc);
 126     lxr->allowComments = allowComments;
 127     lxr->validateUTF8 = validateUTF8;
 128     lxr->alloc = alloc;
 129     return lxr;
 130 }
 131
 132 void
 133 yajl_lex_free(yajl_lexer lxr)
 134 {
 135     yajl_buf_free(lxr->buf);
 136     YA_FREE(lxr->alloc, lxr);
 137     return;
 138 }
 139
 140 /* a lookup table which lets us quickly determine three things:
 141  * VEC - valid escaped conrol char
 142  * IJC - invalid json char
 143  * VHC - valid hex char
 144  * note.  the solidus '/' may be escaped or not.
 145  * note.  the
 146  */
 147 #define VEC 1
 148 #define IJC 2
 149 #define VHC 4
 150 static const char charLookupTable[256] =
 151 {
 152 /*00*/ IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    ,
 153 /*08*/ IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    ,
 154 /*10*/ IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    ,
 155 /*18*/ IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    ,
 156
 157 /*20*/ 0      , 0      , VEC|IJC, 0      , 0      , 0      , 0      , 0      ,
 158 /*28*/ 0      , 0      , 0      , 0      , 0      , 0      , 0      , VEC    ,
 159 /*30*/ VHC    , VHC    , VHC    , VHC    , VHC    , VHC    , VHC    , VHC    ,
 160 /*38*/ VHC    , VHC    , 0      , 0      , 0      , 0      , 0      , 0      ,
 161
 162 /*40*/ 0      , VHC    , VHC    , VHC    , VHC    , VHC    , VHC    , 0      ,
 163 /*48*/ 0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,
 164 /*50*/ 0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,
 165 /*58*/ 0      , 0      , 0      , 0      , VEC|IJC, 0      , 0      , 0      ,
 166
 167 /*60*/ 0      , VHC    , VEC|VHC, VHC    , VHC    , VHC    , VEC|VHC, 0      ,
 168 /*68*/ 0      , 0      , 0      , 0      , 0      , 0      , VEC    , 0      ,
 169 /*70*/ 0      , 0      , VEC    , 0      , VEC    , 0      , 0      , 0      ,
 170 /*78*/ 0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,
 171
 172 /* include these so we don't have to always check the range of the char */
 173        0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,
 174        0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,
 175        0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,
 176        0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,
 177
 178        0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,
 179        0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,
 180        0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,
 181        0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,
 182
 183        0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,
 184        0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,
 185        0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,
 186        0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,
 187
 188        0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,
 189        0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,
 190        0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,
 191        0      , 0      , 0      , 0      , 0      , 0      , 0      , 0
 192 };
 193
 194 /** process a variable length utf8 encoded codepoint.
 195  *
 196  *  returns:
 197  *    yajl_tok_string - if valid utf8 char was parsed and offset was
 198  *                      advanced
 199  *    yajl_tok_eof - if end of input was hit before validation could
 200  *                   complete
 201  *    yajl_tok_error - if invalid utf8 was encountered
 202  *
 203  *  NOTE: on error the offset will point to the first char of the
 204  *  invalid utf8 */
 205 #define UTF8_CHECK_EOF if (*offset >= jsonTextLen) { return yajl_tok_eof; }
 206
 207 static yajl_tok
 208 yajl_lex_utf8_char(yajl_lexer lexer, const unsigned char * jsonText,
 209                    unsigned int jsonTextLen, unsigned int * offset,
 210                    unsigned char curChar)
 211 {
 212     if (curChar <= 0x7f) {
 213         /* single byte */
 214         return yajl_tok_string;
 215     } else if ((curChar >> 5) == 0x6) {
 216         /* two byte */
 217         UTF8_CHECK_EOF;
 218         curChar = readChar(lexer, jsonText, offset);
 219         if ((curChar >> 6) == 0x2) return yajl_tok_string;
 220     } else if ((curChar >> 4) == 0x0e) {
 221         /* three byte */
 222         UTF8_CHECK_EOF;
 223         curChar = readChar(lexer, jsonText, offset);
 224         if ((curChar >> 6) == 0x2) {
 225             UTF8_CHECK_EOF;
 226             curChar = readChar(lexer, jsonText, offset);
 227             if ((curChar >> 6) == 0x2) return yajl_tok_string;
 228         }
 229     } else if ((curChar >> 3) == 0x1e) {
 230         /* four byte */
 231         UTF8_CHECK_EOF;
 232         curChar = readChar(lexer, jsonText, offset);
 233         if ((curChar >> 6) == 0x2) {
 234             UTF8_CHECK_EOF;
 235             curChar = readChar(lexer, jsonText, offset);
 236             if ((curChar >> 6) == 0x2) {
 237                 UTF8_CHECK_EOF;
 238                 curChar = readChar(lexer, jsonText, offset);
 239                 if ((curChar >> 6) == 0x2) return yajl_tok_string;
 240             }
 241         }
 242     }
 243
 244     return yajl_tok_error;
 245 }
 246
 247 /* lex a string.  input is the lexer, pointer to beginning of
 248  * json text, and start of string (offset).
 249  * a token is returned which has the following meanings:
 250  * yajl_tok_string: lex of string was successful.  offset points to
 251  *                  terminating '"'.
 252  * yajl_tok_eof: end of text was encountered before we could complete
 253  *               the lex.
 254  * yajl_tok_error: embedded in the string were unallowable chars.  offset
 255  *               points to the offending char
 256  */
 257 #define STR_CHECK_EOF \
 258 if (*offset >= jsonTextLen) { \
 259    tok = yajl_tok_eof; \
 260    goto finish_string_lex; \
 261 }
 262
 263 static yajl_tok
 264 yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
 265                 unsigned int jsonTextLen, unsigned int * offset)
 266 {
 267     yajl_tok tok = yajl_tok_error;
 268     int hasEscapes = 0;
 269
 270     for (;;) {
 271                 unsigned char curChar;
 272
 273                 STR_CHECK_EOF;
 274
 275         curChar = readChar(lexer, jsonText, offset);
 276
 277         /* quote terminates */
 278         if (curChar == '"') {
 279             tok = yajl_tok_string;
 280             break;
 281         }
 282         /* backslash escapes a set of control chars, */
 283         else if (curChar == '\\') {
 284             hasEscapes = 1;
 285             STR_CHECK_EOF;
 286
 287             /* special case \u */
 288             curChar = readChar(lexer, jsonText, offset);
 289             if (curChar == 'u') {
 290                 unsigned int i = 0;
 291
 292                 for (i=0;i<4;i++) {
 293                     STR_CHECK_EOF;
 294                     curChar = readChar(lexer, jsonText, offset);
 295                     if (!(charLookupTable[curChar] & VHC)) {
 296                         /* back up to offending char */
 297                         unreadChar(lexer, offset);
 298                         lexer->error = yajl_lex_string_invalid_hex_char;
 299                         goto finish_string_lex;
 300                     }
 301                 }
 302             } else if (!(charLookupTable[curChar] & VEC)) {
 303                 /* back up to offending char */
 304                 unreadChar(lexer, offset);
 305                 lexer->error = yajl_lex_string_invalid_escaped_char;
 306                 goto finish_string_lex;
 307             }
 308         }
 309         /* when not validating UTF8 it's a simple table lookup to determine
 310          * if the present character is invalid */
 311         else if(charLookupTable[curChar] & IJC) {
 312             /* back up to offending char */
 313             unreadChar(lexer, offset);
 314             lexer->error = yajl_lex_string_invalid_json_char;
 315             goto finish_string_lex;
 316         }
 317         /* when in validate UTF8 mode we need to do some extra work */
 318         else if (lexer->validateUTF8) {
 319             yajl_tok t = yajl_lex_utf8_char(lexer, jsonText, jsonTextLen,
 320                                             offset, curChar);
 321
 322             if (t == yajl_tok_eof) {
 323                 tok = yajl_tok_eof;
 324                 goto finish_string_lex;
 325             } else if (t == yajl_tok_error) {
 326                 lexer->error = yajl_lex_string_invalid_utf8;
 327                 goto finish_string_lex;
 328             }
 329         }
 330         /* accept it, and move on */
 331     }
 332   finish_string_lex:
 333     /* tell our buddy, the parser, wether he needs to process this string
 334      * again */
 335     if (hasEscapes && tok == yajl_tok_string) {
 336         tok = yajl_tok_string_with_escapes;
 337     }
 338
 339     return tok;
 340 }
 341
 342 #define RETURN_IF_EOF if (*offset >= jsonTextLen) return yajl_tok_eof;
 343
 344 static yajl_tok
 345 yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText,
 346                 unsigned int jsonTextLen, unsigned int * offset)
 347 {
 348     /** XXX: numbers are the only entities in json that we must lex
 349      *       _beyond_ in order to know that they are complete.  There
 350      *       is an ambiguous case for integers at EOF. */
 351
 352     unsigned char c;
 353
 354     yajl_tok tok = yajl_tok_integer;
 355
 356     RETURN_IF_EOF;
 357     c = readChar(lexer, jsonText, offset);
 358
 359     /* optional leading minus */
 360     if (c == '-') {
 361         RETURN_IF_EOF;
 362         c = readChar(lexer, jsonText, offset);
 363     }
 364
 365     /* a single zero, or a series of integers */
 366     if (c == '0') {
 367         RETURN_IF_EOF;
 368         c = readChar(lexer, jsonText, offset);
 369     } else if (c >= '1' && c <= '9') {
 370         do {
 371             RETURN_IF_EOF;
 372             c = readChar(lexer, jsonText, offset);
 373         } while (c >= '0' && c <= '9');
 374     } else {
 375         unreadChar(lexer, offset);
 376         lexer->error = yajl_lex_missing_integer_after_minus;
 377         return yajl_tok_error;
 378     }
 379
 380     /* optional fraction (indicates this is floating point) */
 381     if (c == '.') {
 382         int numRd = 0;
 383
 384         RETURN_IF_EOF;
 385         c = readChar(lexer, jsonText, offset);
 386
 387         while (c >= '0' && c <= '9') {
 388             numRd++;
 389             RETURN_IF_EOF;
 390             c = readChar(lexer, jsonText, offset);
 391         }
 392
 393         if (!numRd) {
 394             unreadChar(lexer, offset);
 395             lexer->error = yajl_lex_missing_integer_after_decimal;
 396             return yajl_tok_error;
 397         }
 398         tok = yajl_tok_double;
 399     }
 400
 401     /* optional exponent (indicates this is floating point) */
 402     if (c == 'e' || c == 'E') {
 403         RETURN_IF_EOF;
 404         c = readChar(lexer, jsonText, offset);
 405
 406         /* optional sign */
 407         if (c == '+' || c == '-') {
 408             RETURN_IF_EOF;
 409             c = readChar(lexer, jsonText, offset);
 410         }
 411
 412         if (c >= '0' && c <= '9') {
 413             do {
 414                 RETURN_IF_EOF;
 415                 c = readChar(lexer, jsonText, offset);
 416             } while (c >= '0' && c <= '9');
 417         } else {
 418             unreadChar(lexer, offset);
 419             lexer->error = yajl_lex_missing_integer_after_exponent;
 420             return yajl_tok_error;
 421         }
 422         tok = yajl_tok_double;
 423     }
 424
 425     /* we always go "one too far" */
 426     unreadChar(lexer, offset);
 427
 428     return tok;
 429 }
 430
 431 static yajl_tok
 432 yajl_lex_comment(yajl_lexer lexer, const unsigned char * jsonText,
 433                  unsigned int jsonTextLen, unsigned int * offset)
 434 {
 435     unsigned char c;
 436
 437     yajl_tok tok = yajl_tok_comment;
 438
 439     RETURN_IF_EOF;
 440     c = readChar(lexer, jsonText, offset);
 441
 442     /* either slash or star expected */
 443     if (c == '/') {
 444         /* now we throw away until end of line */
 445         do {
 446             RETURN_IF_EOF;
 447             c = readChar(lexer, jsonText, offset);
 448         } while (c != '\n');
 449     } else if (c == '*') {
 450         /* now we throw away until end of comment */
 451         for (;;) {
 452             RETURN_IF_EOF;
 453             c = readChar(lexer, jsonText, offset);
 454             if (c == '*') {
 455                 RETURN_IF_EOF;
 456                 c = readChar(lexer, jsonText, offset);
 457                 if (c == '/') {
 458                     break;
 459                 } else {
 460                     unreadChar(lexer, offset);
 461                 }
 462             }
 463         }
 464     } else {
 465         lexer->error = yajl_lex_invalid_char;
 466         tok = yajl_tok_error;
 467     }
 468
 469     return tok;
 470 }
 471
 472 yajl_tok
 473 yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText,
 474              unsigned int jsonTextLen, unsigned int * offset,
 475              const unsigned char ** outBuf, unsigned int * outLen)
 476 {
 477     yajl_tok tok = yajl_tok_error;
 478     unsigned char c;
 479     unsigned int startOffset = *offset;
 480
 481     *outBuf = NULL;
 482     *outLen = 0;
 483
 484     for (;;) {
 485         assert(*offset <= jsonTextLen);
 486
 487         if (*offset >= jsonTextLen) {
 488             tok = yajl_tok_eof;
 489             goto lexed;
 490         }
 491
 492         c = readChar(lexer, jsonText, offset);
 493
 494         switch (c) {
 495             case '{':
 496                 tok = yajl_tok_left_bracket;
 497                 goto lexed;
 498             case '}':
 499                 tok = yajl_tok_right_bracket;
 500                 goto lexed;
 501             case '[':
 502                 tok = yajl_tok_left_brace;
 503                 goto lexed;
 504             case ']':
 505                 tok = yajl_tok_right_brace;
 506                 goto lexed;
 507             case ',':
 508                 tok = yajl_tok_comma;
 509                 goto lexed;
 510             case ':':
 511                 tok = yajl_tok_colon;
 512                 goto lexed;
 513             case '\t': case '\n': case '\v': case '\f': case '\r': case ' ':
 514                 startOffset++;
 515                 break;
 516             case 't': {
 517                 const char * want = "rue";
 518                 do {
 519                     if (*offset >= jsonTextLen) {
 520                         tok = yajl_tok_eof;
 521                         goto lexed;
 522                     }
 523                     c = readChar(lexer, jsonText, offset);
 524                     if (c != *want) {
 525                         unreadChar(lexer, offset);
 526                         lexer->error = yajl_lex_invalid_string;
 527                         tok = yajl_tok_error;
 528                         goto lexed;
 529                     }
 530                 } while (*(++want));
 531                 tok = yajl_tok_bool;
 532                 goto lexed;
 533             }
 534             case 'f': {
 535                 const char * want = "alse";
 536                 do {
 537                     if (*offset >= jsonTextLen) {
 538                         tok = yajl_tok_eof;
 539                         goto lexed;
 540                     }
 541                     c = readChar(lexer, jsonText, offset);
 542                     if (c != *want) {
 543                         unreadChar(lexer, offset);
 544                         lexer->error = yajl_lex_invalid_string;
 545                         tok = yajl_tok_error;
 546                         goto lexed;
 547                     }
 548                 } while (*(++want));
 549                 tok = yajl_tok_bool;
 550                 goto lexed;
 551             }
 552             case 'n': {
 553                 const char * want = "ull";
 554                 do {
 555                     if (*offset >= jsonTextLen) {
 556                         tok = yajl_tok_eof;
 557                         goto lexed;
 558                     }
 559                     c = readChar(lexer, jsonText, offset);
 560                     if (c != *want) {
 561                         unreadChar(lexer, offset);
 562                         lexer->error = yajl_lex_invalid_string;
 563                         tok = yajl_tok_error;
 564                         goto lexed;
 565                     }
 566                 } while (*(++want));
 567                 tok = yajl_tok_null;
 568                 goto lexed;
 569             }
 570             case '"': {
 571                 tok = yajl_lex_string(lexer, (const unsigned char *) jsonText,
 572                                       jsonTextLen, offset);
 573                 goto lexed;
 574             }
 575             case '-':
 576             case '0': case '1': case '2': case '3': case '4':
 577             case '5': case '6': case '7': case '8': case '9': {
 578                 /* integer parsing wants to start from the beginning */
 579                 unreadChar(lexer, offset);
 580                 tok = yajl_lex_number(lexer, (const unsigned char *) jsonText,
 581                                       jsonTextLen, offset);
 582                 goto lexed;
 583             }
 584             case '/':
 585                 /* hey, look, a probable comment!  If comments are disabled
 586                  * it's an error. */
 587                 if (!lexer->allowComments) {
 588                     unreadChar(lexer, offset);
 589                     lexer->error = yajl_lex_unallowed_comment;
 590                     tok = yajl_tok_error;
 591                     goto lexed;
 592                 }
 593                 /* if comments are enabled, then we should try to lex
 594                  * the thing.  possible outcomes are
 595                  * - successful lex (tok_comment, which means continue),
 596                  * - malformed comment opening (slash not followed by
 597                  *   '*' or '/') (tok_error)
 598                  * - eof hit. (tok_eof) */
 599                 tok = yajl_lex_comment(lexer, (const unsigned char *) jsonText,
 600                                        jsonTextLen, offset);
 601                 if (tok == yajl_tok_comment) {
 602                     /* "error" is silly, but that's the initial
 603                      * state of tok.  guilty until proven innocent. */
 604                     tok = yajl_tok_error;
 605                     yajl_buf_clear(lexer->buf);
 606                     lexer->bufInUse = 0;
 607                     startOffset = *offset;
 608                     break;
 609                 }
 610                 /* hit error or eof, bail */
 611                 goto lexed;
 612             default:
 613                 lexer->error = yajl_lex_invalid_char;
 614                 tok = yajl_tok_error;
 615                 goto lexed;
 616         }
 617     }
 618
 619
 620   lexed:
 621     /* need to append to buffer if the buffer is in use or
 622      * if it's an EOF token */
 623     if (tok == yajl_tok_eof || lexer->bufInUse) {
 624         if (!lexer->bufInUse) yajl_buf_clear(lexer->buf);
 625         lexer->bufInUse = 1;
 626         yajl_buf_append(lexer->buf, jsonText + startOffset, *offset - startOffset);
 627         lexer->bufOff = 0;
 628
 629         if (tok != yajl_tok_eof) {
 630             *outBuf = yajl_buf_data(lexer->buf);
 631             *outLen = yajl_buf_len(lexer->buf);
 632             lexer->bufInUse = 0;
 633         }
 634     } else if (tok != yajl_tok_error) {
 635         *outBuf = jsonText + startOffset;
 636         *outLen = *offset - startOffset;
 637     }
 638
 639     /* special case for strings. skip the quotes. */
 640     if (tok == yajl_tok_string || tok == yajl_tok_string_with_escapes)
 641     {
 642         assert(*outLen >= 2);
 643         (*outBuf)++;
 644         *outLen -= 2;
 645     }
 646
 647
 648 #ifdef YAJL_LEXER_DEBUG
 649     if (tok == yajl_tok_error) {
 650         printf("lexical error: %s\n",
 651                yajl_lex_error_to_string(yajl_lex_get_error(lexer)));
 652     } else if (tok == yajl_tok_eof) {
 653         printf("EOF hit\n");
 654     } else {
 655         printf("lexed %s: '", tokToStr(tok));
 656         fwrite(*outBuf, 1, *outLen, stdout);
 657         printf("'\n");
 658     }
 659 #endif
 660
 661     return tok;
 662 }
 663
 664 const char *
 665 yajl_lex_error_to_string(yajl_lex_error error)
 666 {
 667     switch (error) {
 668         case yajl_lex_e_ok:
 669             return "ok, no error";
 670         case yajl_lex_string_invalid_utf8:
 671             return "invalid bytes in UTF8 string.";
 672         case yajl_lex_string_invalid_escaped_char:
 673             return "inside a string, '\\' occurs before a character "
 674                    "which it may not.";
 675         case yajl_lex_string_invalid_json_char:
 676             return "invalid character inside string.";
 677         case yajl_lex_string_invalid_hex_char:
 678             return "invalid (non-hex) character occurs after '\\u' inside "
 679                    "string.";
 680         case yajl_lex_invalid_char:
 681             return "invalid char in json text.";
 682         case yajl_lex_invalid_string:
 683             return "invalid string in json text.";
 684         case yajl_lex_missing_integer_after_exponent:
 685             return "malformed number, a digit is required after the exponent.";
 686         case yajl_lex_missing_integer_after_decimal:
 687             return "malformed number, a digit is required after the "
 688                    "decimal point.";
 689         case yajl_lex_missing_integer_after_minus:
 690             return "malformed number, a digit is required after the "
 691                    "minus sign.";
 692         case yajl_lex_unallowed_comment:
 693             return "probable comment found in input text, comments are "
 694                    "not enabled.";
 695     }
 696     return "unknown error code";
 697 }
 698
 699
 700 /** allows access to more specific information about the lexical
 701  *  error when yajl_lex_lex returns yajl_tok_error. */
 702 yajl_lex_error
 703 yajl_lex_get_error(yajl_lexer lexer)
 704 {
 705     if (lexer == NULL) return (yajl_lex_error) -1;
 706     return lexer->error;
 707 }
 708
 709 unsigned int yajl_lex_current_line(yajl_lexer lexer)
 710 {
 711     return lexer->lineOff;
 712 }
 713
 714 unsigned int yajl_lex_current_char(yajl_lexer lexer)
 715 {
 716     return lexer->charOff;
 717 }
 718
 719 yajl_tok yajl_lex_peek(yajl_lexer lexer, const unsigned char * jsonText,
 720                        unsigned int jsonTextLen, unsigned int offset)
 721 {
 722     const unsigned char * outBuf;
 723     unsigned int outLen;
 724     unsigned int bufLen = yajl_buf_len(lexer->buf);
 725     unsigned int bufOff = lexer->bufOff;
 726     unsigned int bufInUse = lexer->bufInUse;
 727     yajl_tok tok;
 728
 729     tok = yajl_lex_lex(lexer, jsonText, jsonTextLen, &offset,
 730                        &outBuf, &outLen);
 731
 732     lexer->bufOff = bufOff;
 733     lexer->bufInUse = bufInUse;
 734     yajl_buf_truncate(lexer->buf, bufLen);
 735
 736     return tok;
 737 }