git.gag.com Git - fw/sdcc/blob - support/cpp2/libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 2, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; if not, write to the Free Software
  20 Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
  21
#include "config.h"
#include "system.h"
#include "cpplib.h"
#include "internal.h"
#include <assert.h>
#include <stdlib.h>
  27
  28 enum spell_type
  29 {
  30   SPELL_OPERATOR = 0,
  31   SPELL_IDENT,
  32   SPELL_LITERAL,
  33   SPELL_NONE
  34 };
  35
/* One row of the token_spellings table.  */
struct token_spelling
{
  /* Spelling category of the token type.  */
  enum spell_type category;
  /* For OP() rows the operator's spelling; for TK() rows the
     stringified enumerator name.  */
  const unsigned char *name;
};
  41
  42 static const unsigned char *const digraph_spellings[] =
  43 { U"%:", U"%:%:", U"<:", U":>", U"<%", U"%>" };
  44
/* Build the token_spellings table by expanding TTYPE_TABLE with OP
   and TK temporarily defined.  OP(e, s) produces a SPELL_OPERATOR row
   spelled S; TK(e, s) produces a row of category SPELL_<s> whose name
   is the stringified enumerator E.  Both macros are undefined again
   once the table has been built.  */
#define OP(e, s) { SPELL_OPERATOR, U s  },
#define TK(e, s) { SPELL_ ## s,    U #e },
static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
#undef OP
#undef TK
  50
/* Look up the spelling category, respectively the spelling/name, that
   token_spellings records for TOKEN's type.  TOKEN is a pointer.  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  53
/* Forward declarations of helpers that are private to this file.  */
static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
static int skip_line_comment (cpp_reader *);
static void skip_whitespace (cpp_reader *, cppchar_t);
static void lex_string (cpp_reader *, cpp_token *, const uchar *);
static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
static void create_literal (cpp_reader *, cpp_token *, const uchar *,
                            unsigned int, enum cpp_ttype);
static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
static int name_p (cpp_reader *, const cpp_string *);
static tokenrun *next_tokenrun (tokenrun *);

static _cpp_buff *new_buff (size_t);
  66
  67
  68 /* Utility routine:
  69
  70    Compares, the token TOKEN to the NUL-terminated string STRING.
  71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  72 int
  73 cpp_ideq (const cpp_token *token, const char *string)
  74 {
  75   if (token->type != CPP_NAME)
  76     return 0;
  77
  78   return !ustrcmp (NODE_NAME (token->val.node), (const uchar *) string);
  79 }
  80
  81 /* Record a note TYPE at byte POS into the current cleaned logical
  82    line.  */
  83 static void
  84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  85 {
  86   if (buffer->notes_used == buffer->notes_cap)
  87     {
  88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  90                                   buffer->notes_cap);
  91     }
  92
  93   buffer->notes[buffer->notes_used].pos = pos;
  94   buffer->notes[buffer->notes_used].type = type;
  95   buffer->notes_used++;
  96 }
  97
  98 /* Returns with a logical line that contains no escaped newlines or
  99    trigraphs.  This is a time-critical inner loop.  */
 100 void
 101 _cpp_clean_line (cpp_reader *pfile)
 102 {
 103   cpp_buffer *buffer;
 104   const uchar *s;
 105   uchar c, *d, *p;
 106
 107   buffer = pfile->buffer;
 108   buffer->cur_note = buffer->notes_used = 0;
 109   buffer->cur = buffer->line_base = buffer->next_line;
 110   buffer->need_line = false;
 111   s = buffer->next_line - 1;
 112
 113   if (!buffer->from_stage3)
 114     {
 115       /* Short circuit for the common case of an un-escaped line with
 116          no trigraphs.  The primary win here is by not writing any
 117          data back to memory until we have to.  */
 118       for (;;)
 119         {
 120           c = *++s;
 121           if (c == '\n' || c == '\r')
 122             {
 123               d = (uchar *) s;
 124
 125               if (s == buffer->rlimit)
 126                 goto done;
 127
 128               /* DOS line ending? */
 129               if (c == '\r' && s[1] == '\n')
 130                 s++;
 131
 132               if (s == buffer->rlimit)
 133                 goto done;
 134
 135               /* check for escaped newline */
 136               p = d;
 137               while (p != buffer->next_line && is_nvspace (p[-1]))
 138                 p--;
 139               if (p == buffer->next_line || p[-1] != '\\')
 140                 goto done;
 141
 142               /* Have an escaped newline; process it and proceed to
 143                  the slow path.  */
 144               add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
 145               d = p - 2;
 146               buffer->next_line = p - 1;
 147               break;
 148             }
 149           if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
 150             {
 151               /* Have a trigraph.  We may or may not have to convert
 152                  it.  Add a line note regardless, for -Wtrigraphs.  */
 153               add_line_note (buffer, s, s[2]);
 154               if (CPP_OPTION (pfile, trigraphs))
 155                 {
 156                   /* We do, and that means we have to switch to the
 157                      slow path.  */
 158                   d = (uchar *) s;
 159                   *d = _cpp_trigraph_map[s[2]];
 160                   s += 2;
 161                   break;
 162                 }
 163             }
 164         }
 165
 166
 167       for (;;)
 168         {
 169           c = *++s;
 170           *++d = c;
 171
 172           if (c == '\n' || c == '\r')
 173             {
 174                   /* Handle DOS line endings.  */
 175               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
 176                 s++;
 177               if (s == buffer->rlimit)
 178                 break;
 179
 180               /* Escaped?  */
 181               p = d;
 182               while (p != buffer->next_line && is_nvspace (p[-1]))
 183                 p--;
 184               if (p == buffer->next_line || p[-1] != '\\')
 185                 break;
 186
 187               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
 188               d = p - 2;
 189               buffer->next_line = p - 1;
 190             }
 191           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
 192             {
 193               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
 194               add_line_note (buffer, d, s[2]);
 195               if (CPP_OPTION (pfile, trigraphs))
 196                 {
 197                   *d = _cpp_trigraph_map[s[2]];
 198                   s += 2;
 199                 }
 200             }
 201         }
 202     }
 203   else
 204     {
 205       do
 206         s++;
 207       while (*s != '\n' && *s != '\r');
 208       d = (uchar *) s;
 209
 210       /* Handle DOS line endings.  */
 211       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
 212         s++;
 213     }
 214
 215  done:
 216   *d = '\n';
 217   /* A sentinel note that should never be processed.  */
 218   add_line_note (buffer, d + 1, '\n');
 219   buffer->next_line = s + 1;
 220 }
 221
 222 /* Return true if the trigraph indicated by NOTE should be warned
 223    about in a comment.  */
 224 static bool
 225 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 226 {
 227   const uchar *p;
 228
 229   /* Within comments we don't warn about trigraphs, unless the
 230      trigraph forms an escaped newline, as that may change
 231      behavior.  */
 232   if (note->type != '/')
 233     return false;
 234
 235   /* If -trigraphs, then this was an escaped newline iff the next note
 236      is coincident.  */
 237   if (CPP_OPTION (pfile, trigraphs))
 238     return note[1].pos == note->pos;
 239
 240   /* Otherwise, see if this forms an escaped newline.  */
 241   p = note->pos + 3;
 242   while (is_nvspace (*p))
 243     p++;
 244
 245   /* There might have been escaped newlines between the trigraph and the
 246      newline we found.  Hence the position test.  */
 247   return (*p == '\n' && p < note[1].pos);
 248 }
 249
 250 /* Process the notes created by add_line_note as far as the current
 251    location.  */
 252 void
 253 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 254 {
 255   cpp_buffer *buffer = pfile->buffer;
 256
 257   for (;;)
 258     {
 259       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 260       unsigned int col;
 261
 262       if (note->pos > buffer->cur)
 263         break;
 264
 265       buffer->cur_note++;
 266       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 267
 268       if (note->type == '\\' || note->type == ' ')
 269         {
 270           if (note->type == ' ' && !in_comment)
 271             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 272                                  "backslash and newline separated by space");
 273
 274           if (buffer->next_line > buffer->rlimit)
 275             {
 276               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
 277                                    "backslash-newline at end of file");
 278               /* Prevent "no newline at end of file" warning.  */
 279               buffer->next_line = buffer->rlimit;
 280             }
 281
 282           buffer->line_base = note->pos;
 283           CPP_INCREMENT_LINE (pfile, 0);
 284         }
 285       else if (_cpp_trigraph_map[note->type])
 286         {
 287           if (CPP_OPTION (pfile, warn_trigraphs)
 288               && (!in_comment || warn_in_comment (pfile, note)))
 289             {
 290               if (CPP_OPTION (pfile, trigraphs))
 291                 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 292                                      "trigraph ??%c converted to %c",
 293                                      note->type,
 294                                      (int) _cpp_trigraph_map[note->type]);
 295               else
 296                 {
 297                   cpp_error_with_line
 298                     (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 299                      "trigraph ??%c ignored, use -trigraphs to enable",
 300                      note->type);
 301                 }
 302             }
 303         }
 304       else
 305         abort ();
 306     }
 307 }
 308
 309 /* SDCC _asm specific */
 310 /* Skip an _asm ... _endasm block.  We find the end of the comment by
 311    seeing _endasm.  Returns non-zero if _asm terminated by EOF, zero
 312    otherwise.  */
 313 static int
 314 skip_asm_block (cpp_reader *pfile)
 315 {
 316 #define _ENDASM_STR "endasm"
 317 #define _ENDASM_LEN ((sizeof _ENDASM_STR) - 1)
 318
 319   cpp_buffer *buffer = pfile->buffer;
 320   cppchar_t c = EOF;
 321   int prev_space = 0;
 322   int ret = 1;
 323
 324   while (buffer->cur != buffer->rlimit)
 325     {
 326       prev_space = is_space(c);
 327       c = *buffer->cur++;
 328
 329       if (prev_space && c == '_')
 330         {
 331           if (buffer->cur + _ENDASM_LEN <= buffer->rlimit &&
 332             strncmp((char *)buffer->cur, _ENDASM_STR, _ENDASM_LEN) == 0)
 333             {
 334               buffer->cur += _ENDASM_LEN;
 335               ret = 0;
 336               break;
 337             }
 338         }
 339       else if (c == '\n')
 340         {
 341           unsigned int cols;
 342           --buffer->cur;
 343           _cpp_process_line_notes (pfile, true);
 344           if (buffer->next_line >= buffer->rlimit)
 345             return true;
 346           _cpp_clean_line (pfile);
 347
 348           cols = buffer->next_line - buffer->line_base;
 349           CPP_INCREMENT_LINE (pfile, cols);
 350         }
 351     }
 352
 353   _cpp_process_line_notes (pfile, true);
 354   return ret;
 355 }
 356
 357 /* Skip a C-style block comment.  We find the end of the comment by
 358    seeing if an asterisk is before every '/' we encounter.  Returns
 359    nonzero if comment terminated by EOF, zero otherwise.
 360
 361    Buffer->cur points to the initial asterisk of the comment.  */
 362 bool
 363 _cpp_skip_block_comment (cpp_reader *pfile)
 364 {
 365   cpp_buffer *buffer = pfile->buffer;
 366   const uchar *cur = buffer->cur;
 367   uchar c;
 368
 369   cur++;
 370   if (*cur == '/')
 371     cur++;
 372
 373   for (;;)
 374     {
 375       /* People like decorating comments with '*', so check for '/'
 376          instead for efficiency.  */
 377       c = *cur++;
 378
 379       if (c == '/')
 380         {
 381           if (cur[-2] == '*')
 382             break;
 383
 384           /* Warn about potential nested comments, but not if the '/'
 385              comes immediately before the true comment delimiter.
 386              Don't bother to get it right across escaped newlines.  */
 387           if (CPP_OPTION (pfile, warn_comments)
 388               && cur[0] == '*' && cur[1] != '/')
 389             {
 390               buffer->cur = cur;
 391               cpp_error_with_line (pfile, CPP_DL_WARNING,
 392                                    pfile->line_table->highest_line, CPP_BUF_COL (buffer),
 393                                    "\"/*\" within comment");
 394             }
 395         }
 396       else if (c == '\n')
 397         {
 398           unsigned int cols;
 399           buffer->cur = cur - 1;
 400           _cpp_process_line_notes (pfile, true);
 401           if (buffer->next_line >= buffer->rlimit)
 402             return true;
 403           _cpp_clean_line (pfile);
 404
 405           cols = buffer->next_line - buffer->line_base;
 406           CPP_INCREMENT_LINE (pfile, cols);
 407
 408           cur = buffer->cur;
 409         }
 410     }
 411
 412   buffer->cur = cur;
 413   _cpp_process_line_notes (pfile, true);
 414   return false;
 415 }
 416
 417 /* Skip a C++ line comment, leaving buffer->cur pointing to the
 418    terminating newline.  Handles escaped newlines.  Returns nonzero
 419    if a multiline comment.  */
 420 static int
 421 skip_line_comment (cpp_reader *pfile)
 422 {
 423   cpp_buffer *buffer = pfile->buffer;
 424   unsigned int orig_line = pfile->line_table->highest_line;
 425
 426   while (*buffer->cur != '\n')
 427     buffer->cur++;
 428
 429   _cpp_process_line_notes (pfile, true);
 430   return orig_line != pfile->line_table->highest_line;
 431 }
 432
 433 /* Skips whitespace, saving the next non-whitespace character.  */
 434 static void
 435 skip_whitespace (cpp_reader *pfile, cppchar_t c)
 436 {
 437   cpp_buffer *buffer = pfile->buffer;
 438   bool saw_NUL = false;
 439
 440   do
 441     {
 442       /* Horizontal space always OK.  */
 443       if (c == ' ' || c == '\t')
 444         ;
 445       /* Just \f \v or \0 left.  */
 446       else if (c == '\0')
 447         saw_NUL = true;
 448       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
 449         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
 450                              CPP_BUF_COL (buffer),
 451                              "%s in preprocessing directive",
 452                              c == '\f' ? "form feed" : "vertical tab");
 453
 454       c = *buffer->cur++;
 455     }
 456   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
 457   while (is_nvspace (c));
 458
 459   if (saw_NUL)
 460     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
 461
 462   buffer->cur--;
 463 }
 464
 465 /* See if the characters of a number token are valid in a name (no
 466    '.', '+' or '-').  */
 467 static int
 468 name_p (cpp_reader *pfile, const cpp_string *string)
 469 {
 470   unsigned int i;
 471
 472   for (i = 0; i < string->len; i++)
 473     if (!is_idchar (string->text[i]))
 474       return 0;
 475
 476   return 1;
 477 }
 478
 479 /* After parsing an identifier or other sequence, produce a warning about
 480    sequences not in NFC/NFKC.  */
 481 static void
 482 warn_about_normalization (cpp_reader *pfile,
 483                           const cpp_token *token,
 484                           const struct normalize_state *s)
 485 {
 486   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
 487       && !pfile->state.skipping)
 488     {
 489       /* Make sure that the token is printed using UCNs, even
 490          if we'd otherwise happily print UTF-8.  */
 491       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
 492       size_t sz;
 493
 494       sz = cpp_spell_token (pfile, token, buf, false) - buf;
 495       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
 496         cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
 497                              "`%.*s' is not in NFKC", (int) sz, buf);
 498       else
 499         cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
 500                              "`%.*s' is not in NFC", (int) sz, buf);
 501     }
 502 }
 503
 504 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
 505    an identifier.  FIRST is TRUE if this starts an identifier.  */
 506 static bool
 507 forms_identifier_p (cpp_reader *pfile, int first,
 508                     struct normalize_state *state)
 509 {
 510   cpp_buffer *buffer = pfile->buffer;
 511
 512   if (*buffer->cur == '$')
 513     {
 514       if (!CPP_OPTION (pfile, dollars_in_ident))
 515         return false;
 516
 517       buffer->cur++;
 518       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
 519         {
 520           CPP_OPTION (pfile, warn_dollars) = 0;
 521           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
 522         }
 523
 524       return true;
 525     }
 526
 527   /* Is this a syntactically valid UCN?  */
 528   if (CPP_OPTION (pfile, extended_identifiers)
 529       && *buffer->cur == '\\'
 530       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
 531     {
 532       buffer->cur += 2;
 533       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
 534                           state))
 535         return true;
 536       buffer->cur -= 2;
 537     }
 538
 539   return false;
 540 }
 541
 542 /* Lex an identifier starting at BUFFER->CUR - 1.  */
 543 static cpp_hashnode *
 544 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
 545                 struct normalize_state *nst)
 546 {
 547   cpp_hashnode *result;
 548   const uchar *cur;
 549   unsigned int len;
 550   unsigned int hash = HT_HASHSTEP (0, *base);
 551
 552   cur = pfile->buffer->cur;
 553   if (! starts_ucn)
 554     while (ISIDNUM (*cur))
 555       {
 556         hash = HT_HASHSTEP (hash, *cur);
 557         cur++;
 558       }
 559   pfile->buffer->cur = cur;
 560   if (starts_ucn || forms_identifier_p (pfile, false, nst))
 561     {
 562       /* Slower version for identifiers containing UCNs (or $).  */
 563       do {
 564         while (ISIDNUM (*pfile->buffer->cur))
 565           {
 566             pfile->buffer->cur++;
 567             NORMALIZE_STATE_UPDATE_IDNUM (nst);
 568           }
 569       } while (forms_identifier_p (pfile, false, nst));
 570       result = _cpp_interpret_identifier (pfile, base,
 571                                           pfile->buffer->cur - base);
 572     }
 573   else
 574     {
 575       len = cur - base;
 576       hash = HT_HASHFINISH (hash, len);
 577
 578       result = (cpp_hashnode *)
 579         ht_lookup_with_hash (pfile->hash_table, base, len, hash, HT_ALLOC);
 580     }
 581
 582   /* Rarely, identifiers require diagnostics when lexed.  */
 583   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
 584                         && !pfile->state.skipping, 0))
 585     {
 586       /* It is allowed to poison the same identifier twice.  */
 587       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
 588         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
 589                    NODE_NAME (result));
 590
 591       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
 592          replacement list of a variadic macro.  */
 593       if (result == pfile->spec_nodes.n__VA_ARGS__
 594           && !pfile->state.va_args_ok)
 595         cpp_error (pfile, CPP_DL_PEDWARN,
 596                    "__VA_ARGS__ can only appear in the expansion"
 597                    " of a C99 variadic macro");
 598     }
 599
 600   return result;
 601 }
 602
 603 /* SDCC specific */
 604 /* Pedantic parse a number, beginning with character C, skipping embedded
 605    backslash-newlines.  LEADING_PERIOD is nonzero if there was a "."
 606    before C.  Place the result in NUMBER.  */
 607 static void
 608 pedantic_lex_number (cpp_reader *pfile, cpp_string *number)
 609 {
 610 #define get_effective_char(pfile) (*pfile->buffer->cur++)
 611 #define BACKUP() (--pfile->buffer->cur)
 612
 613   enum num_type_e { NT_DEC, NT_HEX } num_type = NT_DEC;
 614   enum num_part_e { NP_WHOLE, NP_FRACT, NP_EXP, NP_INT_SUFFIX, NP_FLOAT_SUFFIX } num_part = NP_WHOLE;
 615
 616   uchar c = *(pfile->buffer->cur - 1);
 617   struct obstack *stack = &pfile->hash_table->stack;
 618   int len = 0;
 619   int has_whole = 0;
 620   int has_fract = 0;
 621
 622   if ('.' == c)
 623     {
 624       num_part = NP_FRACT;
 625       ++len;
 626       obstack_1grow (stack, '.');
 627       c = get_effective_char(pfile);
 628     }
 629   else
 630     {
 631       if ('0' == c)
 632         {
 633           has_whole = 1;
 634           ++len;
 635           obstack_1grow (stack, c);
 636           c = get_effective_char(pfile);
 637
 638           switch (c)
 639             {
 640             case 'X':
 641             case 'x':
 642               num_type = NT_HEX;
 643               ++len;
 644               obstack_1grow (stack, c);
 645               c = get_effective_char(pfile);
 646               break;
 647
 648             case '.':
 649               num_part = NP_FRACT;
 650               ++len;
 651               obstack_1grow (stack, c);
 652               c = get_effective_char(pfile);
 653               break;
 654             }
 655         }
 656     }
 657
 658   for (; ; )
 659     {
 660       switch (num_part)
 661         {
 662         case NP_WHOLE:
 663           if (NT_DEC == num_type)
 664             {
 665               while (ISDIGIT (c))
 666                 {
 667                   has_whole = 1;
 668                   ++len;
 669                   obstack_1grow (stack, c);
 670                   c = get_effective_char(pfile);
 671                 }
 672
 673               if ('.' == c)
 674                 {
 675                   num_part = NP_FRACT;
 676                   ++len;
 677                   obstack_1grow (stack, c);
 678                   c = get_effective_char(pfile);
 679                   continue;
 680                 }
 681               else if ('E' == c || 'e' == c)
 682                 {
 683                   if (has_whole || has_fract)
 684                   {
 685                     num_part = NP_EXP;
 686                     ++len;
 687                     obstack_1grow (stack, c);
 688                     c = get_effective_char(pfile);
 689                     continue;
 690                   }
 691                   else
 692                     break;
 693                 }
 694             }
 695           else
 696             {
 697               while (ISXDIGIT (c))
 698                 {
 699                   has_whole = 1;
 700                   ++len;
 701                   obstack_1grow (stack, c);
 702                   c = get_effective_char(pfile);
 703                 }
 704
 705               if ('.' == c)
 706                 {
 707                   num_part = NP_FRACT;
 708                   ++len;
 709                   obstack_1grow (stack, c);
 710                   c = get_effective_char(pfile);
 711                   continue;
 712                 }
 713               else if ('P' == c || 'p' == c)
 714                 {
 715                   if (has_whole || has_fract)
 716                     {
 717                       num_part = NP_EXP;
 718                       ++len;
 719                       obstack_1grow (stack, c);
 720                       c = get_effective_char(pfile);
 721                       continue;
 722                     }
 723                   else
 724                     break;
 725                 }
 726             }
 727           num_part = NP_INT_SUFFIX;
 728           continue;
 729
 730         case NP_FRACT:
 731           if (NT_DEC == num_type)
 732             {
 733               while (ISDIGIT (c))
 734                 {
 735                   has_fract = 1;
 736                   ++len;
 737                   obstack_1grow (stack, c);
 738                   c = get_effective_char(pfile);
 739                 }
 740
 741               if ('E' == c || 'e' == c)
 742                 {
 743                   if (has_whole || has_fract)
 744                     {
 745                       num_part = NP_EXP;
 746                       ++len;
 747                       obstack_1grow (stack, c);
 748                       c = get_effective_char(pfile);
 749                       continue;
 750                     }
 751                 }
 752             }
 753           else
 754             {
 755               while (ISXDIGIT (c))
 756                 {
 757                   has_fract = 1;
 758                   ++len;
 759                   obstack_1grow (stack, c);
 760                   c = get_effective_char(pfile);
 761                 }
 762
 763               if ('P' == c || 'p' == c)
 764                 {
 765                   if (has_whole || has_fract)
 766                     {
 767                       num_part = NP_EXP;
 768                       ++len;
 769                       obstack_1grow (stack, c);
 770                       c = get_effective_char(pfile);
 771                       continue;
 772                     }
 773                 }
 774             }
 775           num_part = NP_FLOAT_SUFFIX;
 776           continue;
 777
 778         case NP_EXP:
 779           if ('+' == c || '-' == c)
 780             {
 781               ++len;
 782               obstack_1grow (stack, c);
 783               c = get_effective_char(pfile);
 784             }
 785
 786           while (ISDIGIT (c))
 787             {
 788               ++len;
 789               obstack_1grow (stack, c);
 790               c = get_effective_char(pfile);
 791             }
 792
 793           num_part = NP_FLOAT_SUFFIX;
 794           continue;
 795
 796         case NP_INT_SUFFIX:
 797            if ('L' == c || 'l' == c)
 798             {
 799               uchar prevc = c;
 800
 801               ++len;
 802               obstack_1grow (stack, c);
 803               c = get_effective_char(pfile);
 804
 805               if (c == prevc)
 806                 {
 807                   ++len;
 808                   obstack_1grow (stack, c);
 809                   c = get_effective_char(pfile);
 810                 }
 811             }
 812           else if ('U' == c || 'u' == c)
 813             {
 814               ++len;
 815               obstack_1grow (stack, c);
 816               c = get_effective_char(pfile);
 817             }
 818           break;
 819
 820         case NP_FLOAT_SUFFIX:
 821            if ('F' == c || 'f' == c)
 822             {
 823               ++len;
 824               obstack_1grow (stack, c);
 825               c = get_effective_char(pfile);
 826             }
 827           else if ('L' == c || 'l' == c)
 828             {
 829               ++len;
 830               obstack_1grow (stack, c);
 831               c = get_effective_char(pfile);
 832             }
 833           break;
 834         }
 835       break;
 836     }
 837
 838   /* Step back over the unwanted char.  */
 839   BACKUP ();
 840
 841   number->text = obstack_finish (stack);
 842   number->len = len;
 843 }
 844
 845 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
 846 static void
 847 lex_number (cpp_reader *pfile, cpp_string *number,
 848             struct normalize_state *nst)
 849 {
 850   const uchar *cur;
 851   const uchar *base;
 852   uchar *dest;
 853
 854   base = pfile->buffer->cur - 1;
 855   do
 856     {
 857       cur = pfile->buffer->cur;
 858
 859       /* N.B. ISIDNUM does not include $.  */
 860       while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
 861         {
 862           cur++;
 863           NORMALIZE_STATE_UPDATE_IDNUM (nst);
 864         }
 865
 866       pfile->buffer->cur = cur;
 867     }
 868   while (forms_identifier_p (pfile, false, nst));
 869
 870   number->len = cur - base;
 871   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
 872   memcpy (dest, base, number->len);
 873   dest[number->len] = '\0';
 874   number->text = dest;
 875 }
 876
 877 /* Create a token of type TYPE with a literal spelling.  */
 878 static void
 879 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
 880                 unsigned int len, enum cpp_ttype type)
 881 {
 882   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
 883
 884   memcpy (dest, base, len);
 885   dest[len] = '\0';
 886   token->type = type;
 887   token->val.str.len = len;
 888   token->val.str.text = dest;
 889 }
 890
 891 /* Lexes a string, character constant, or angle-bracketed header file
 892    name.  The stored string contains the spelling, including opening
 893    quote and leading any leading 'L'.  It returns the type of the
 894    literal, or CPP_OTHER if it was not properly terminated.
 895
 896    The spelling is NUL-terminated, but it is not guaranteed that this
 897    is the first NUL since embedded NULs are preserved.  */
 898 static void
 899 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
 900 {
 901   bool saw_NUL = false;
 902   const uchar *cur;
 903   cppchar_t terminator;
 904   enum cpp_ttype type;
 905
 906   cur = base;
 907   terminator = *cur++;
 908   if (terminator == 'L')
 909     terminator = *cur++;
 910   if (terminator == '\"')
 911     type = *base == 'L' ? CPP_WSTRING: CPP_STRING;
 912   else if (terminator == '\'')
 913     type = *base == 'L' ? CPP_WCHAR: CPP_CHAR;
 914   else
 915     terminator = '>', type = CPP_HEADER_NAME;
 916
 917   for (;;)
 918     {
 919       cppchar_t c = *cur++;
 920
 921       /* In #include-style directives, terminators are not escapable.  */
 922       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
 923         cur++;
 924       else if (c == terminator)
 925         break;
 926       else if (c == '\n')
 927         {
 928           cur--;
 929           type = CPP_OTHER;
 930           break;
 931         }
 932       else if (c == '\0')
 933         saw_NUL = true;
 934     }
 935
 936   if (saw_NUL && !pfile->state.skipping)
 937     cpp_error (pfile, CPP_DL_WARNING,
 938                "null character(s) preserved in literal");
 939
 940   pfile->buffer->cur = cur;
 941   create_literal (pfile, token, base, cur - base, type);
 942 }
 943
 944 /* Fixed _WIN32 problem with CR-CR-LF sequences when outputting
 945    comment blocks (when executed with -C option) and
 946    _asm (SDCPP specific) blocks */
 947
 948 /* Count and copy characters from src to dest, excluding CRs:
 949    CRs are automatically generated, because the output is
 950    opened in TEXT mode. If dest == NULL, only count chars */
 951 static unsigned int
 952 copy_text_chars (unsigned char *dest, const unsigned char *src, unsigned int len)
 953 {
 954   unsigned int n = 0;
 955   const unsigned char *p;
 956
 957   for (p = src; p != src + len; ++p)
 958     {
 959       assert(*p != '\0');
 960
 961       if (*p != '\r')
 962         {
 963           if (dest != NULL)
 964             *dest++ = *p;
 965           ++n;
 966         }
 967     }
 968
 969     return n;
 970 }
 971
 972 /* SDCC _asm specific */
 973 /* The stored comment includes the comment start and any terminator.  */
 974 static void
 975 save_asm (cpp_reader *pfile, cpp_token *token, const unsigned char *from)
 976 {
 977 #define _ASM_STR  "_asm"
 978 #define _ASM_LEN  ((sizeof _ASM_STR) - 1)
 979
 980   unsigned char *buffer;
 981   unsigned int text_len, len;
 982
 983   len = pfile->buffer->cur - from;
 984   /* + _ASM_LEN for the initial '_asm'.  */
 985   text_len = copy_text_chars (NULL, from, len) + _ASM_LEN;
 986   buffer = _cpp_unaligned_alloc (pfile, text_len);
 987
 988
 989   token->type = CPP_ASM;
 990   token->val.str.len = text_len;
 991   token->val.str.text = buffer;
 992
 993   memcpy (buffer, _ASM_STR, _ASM_LEN);
 994   copy_text_chars (buffer + _ASM_LEN, from, len);
 995 }
 996
 997 /* The stored comment includes the comment start and any terminator.  */
 998 static void
 999 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1000               cppchar_t type)
1001 {
1002   unsigned char *buffer;
1003   unsigned int len, clen;
1004
1005   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
1006
1007   /* C++ comments probably (not definitely) have moved past a new
1008      line, which we don't want to save in the comment.  */
1009   if (is_vspace (pfile->buffer->cur[-1]))
1010     len--;
1011
1012   /* If we are currently in a directive, then we need to store all
1013      C++ comments as C comments internally, and so we need to
1014      allocate a little extra space in that case.
1015
1016      Note that the only time we encounter a directive here is
1017      when we are saving comments in a "#define".  */
1018   clen = (pfile->state.in_directive && type == '/') ? len + 2 : len;
1019
1020   buffer = _cpp_unaligned_alloc (pfile, clen);
1021
1022   token->type = CPP_COMMENT;
1023   token->val.str.len = clen;
1024   token->val.str.text = buffer;
1025
1026   buffer[0] = '/';
1027   copy_text_chars (buffer + 1, from, len);
1028
1029   /* Finish conversion to a C comment, if necessary.  */
1030   if (pfile->state.in_directive && type == '/')
1031     {
1032       buffer[1] = '*';
1033       buffer[clen - 2] = '*';
1034       buffer[clen - 1] = '/';
1035     }
1036 }
1037
1038 /* Allocate COUNT tokens for RUN.  */
1039 void
1040 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1041 {
1042   run->base = XNEWVEC (cpp_token, count);
1043   run->limit = run->base + count;
1044   run->next = NULL;
1045 }
1046
1047 /* Returns the next tokenrun, or creates one if there is none.  */
1048 static tokenrun *
1049 next_tokenrun (tokenrun *run)
1050 {
1051   if (run->next == NULL)
1052     {
1053       run->next = XNEW (tokenrun);
1054       run->next->prev = run;
1055       _cpp_init_tokenrun (run->next, 250);
1056     }
1057
1058   return run->next;
1059 }
1060
1061 /* Allocate a single token that is invalidated at the same time as the
1062    rest of the tokens on the line.  Has its line and col set to the
1063    same as the last lexed token, so that diagnostics appear in the
1064    right place.  */
1065 cpp_token *
1066 _cpp_temp_token (cpp_reader *pfile)
1067 {
1068   cpp_token *old, *result;
1069
1070   old = pfile->cur_token - 1;
1071   if (pfile->cur_token == pfile->cur_run->limit)
1072     {
1073       pfile->cur_run = next_tokenrun (pfile->cur_run);
1074       pfile->cur_token = pfile->cur_run->base;
1075     }
1076
1077   result = pfile->cur_token++;
1078   result->src_loc = old->src_loc;
1079   return result;
1080 }
1081
1082 /* Lex a token into RESULT (external interface).  Takes care of issues
1083    like directive handling, token lookahead, multiple include
1084    optimization and skipping.  */
1085 const cpp_token *
1086 _cpp_lex_token (cpp_reader *pfile)
1087 {
1088   cpp_token *result;
1089
1090   for (;;)
1091     {
1092       if (pfile->cur_token == pfile->cur_run->limit)
1093         {
1094           pfile->cur_run = next_tokenrun (pfile->cur_run);
1095           pfile->cur_token = pfile->cur_run->base;
1096         }
1097
1098       if (pfile->lookaheads)
1099         {
1100           pfile->lookaheads--;
1101           result = pfile->cur_token++;
1102         }
1103       else
1104         result = _cpp_lex_direct (pfile);
1105
1106       if (result->flags & BOL)
1107         {
1108           /* Is this a directive.  If _cpp_handle_directive returns
1109              false, it is an assembler #.  */
1110           if (result->type == CPP_HASH
1111               /* 6.10.3 p 11: Directives in a list of macro arguments
1112                  gives undefined behavior.  This implementation
1113                  handles the directive as normal.  */
1114               && pfile->state.parsing_args != 1
1115               && _cpp_handle_directive (pfile, result->flags & PREV_WHITE))
1116             {
1117               if (pfile->directive_result.type == CPP_PADDING)
1118                 continue;
1119               else
1120                 {
1121                   result = &pfile->directive_result;
1122                   break;
1123                 }
1124             }
1125
1126           if (pfile->cb.line_change && !pfile->state.skipping)
1127             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
1128         }
1129
1130       /* We don't skip tokens in directives.  */
1131       if (pfile->state.in_directive)
1132         break;
1133
1134       /* Outside a directive, invalidate controlling macros.  At file
1135          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
1136          get here and MI optimization works.  */
1137       pfile->mi_valid = false;
1138
1139       if (!pfile->state.skipping || result->type == CPP_EOF)
1140         break;
1141     }
1142
1143   return result;
1144 }
1145
1146 /* Returns true if a fresh line has been loaded.  */
1147 bool
1148 _cpp_get_fresh_line (cpp_reader *pfile)
1149 {
1150   int return_at_eof;
1151
1152   /* We can't get a new line until we leave the current directive.  */
1153   if (pfile->state.in_directive)
1154     return false;
1155
1156   for (;;)
1157     {
1158       cpp_buffer *buffer = pfile->buffer;
1159
1160       if (!buffer->need_line)
1161         return true;
1162
1163       if (buffer->next_line < buffer->rlimit)
1164         {
1165           _cpp_clean_line (pfile);
1166           return true;
1167         }
1168
1169       /* First, get out of parsing arguments state.  */
1170       if (pfile->state.parsing_args)
1171         return false;
1172
1173       /* End of buffer.  Non-empty files should end in a newline.  */
1174       if (buffer->buf != buffer->rlimit
1175           && buffer->next_line > buffer->rlimit
1176           && !buffer->from_stage3)
1177         {
1178           /* Only warn once.  */
1179           buffer->next_line = buffer->rlimit;
1180           cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1181                                CPP_BUF_COLUMN (buffer, buffer->cur),
1182                                "no newline at end of file");
1183         }
1184
1185       return_at_eof = buffer->return_at_eof;
1186       _cpp_pop_buffer (pfile);
1187       if (pfile->buffer == NULL || return_at_eof)
1188         return false;
1189     }
1190 }
1191
/* Helper for lexing two-character operators.  If the next unread
   character of `buffer' is CHAR, consume it and set result->type to
   THEN_TYPE; otherwise set result->type to ELSE_TYPE.  Relies on
   variables named `buffer' and `result' being in scope.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == (CHAR))                       \
        {                                               \
          buffer->cur++;                                \
          result->type = (THEN_TYPE);                   \
        }                                               \
      else                                              \
        result->type = (ELSE_TYPE);                     \
    }                                                   \
  while (0)
1200
/* Lex a token into pfile->cur_token, which is also incremented, to
   get diagnostics pointing to the correct location.

   Does not handle issues such as token lookahead, multiple-include
   optimization, directives, skipping etc.  This function is only
   suitable for use by _cpp_lex_token, and in special cases like
   lex_expansion_token which doesn't care for any of these issues.

   When meeting a newline, returns CPP_EOF if parsing a directive,
   otherwise returns to the start of the token buffer if permissible.
   Returns the location of the lexed token.

   Control flow uses three labels: `fresh_line' (a new line is
   needed), `update_tokens_line' (refresh the token's source location
   after a skipped comment) and `skipped_white' (continue after
   horizontal whitespace).  */
cpp_token *
_cpp_lex_direct (cpp_reader *pfile)
{
  cppchar_t c;
  cpp_buffer *buffer;
  const unsigned char *comment_start;
  cpp_token *result = pfile->cur_token++;

 fresh_line:
  result->flags = 0;
  buffer = pfile->buffer;
  if (buffer->need_line)
    {
      if (!_cpp_get_fresh_line (pfile))
        {
          /* No further line is available: report end of input.  */
          result->type = CPP_EOF;
          if (!pfile->state.in_directive)
            {
              /* Tell the compiler the line number of the EOF token.  */
              result->src_loc = pfile->line_table->highest_line;
              result->flags = BOL;
            }
          return result;
        }
      /* Unless tokens have to be kept, go back to the first slot of
         the base token run for the new line.  */
      if (!pfile->keep_tokens)
        {
          pfile->cur_run = &pfile->base_run;
          result = pfile->base_run.base;
          pfile->cur_token = result + 1;
        }
      result->flags = BOL;
      if (pfile->state.parsing_args == 2)
        result->flags |= PREV_WHITE;
    }
  /* Reload: _cpp_get_fresh_line may have popped buffers.  */
  buffer = pfile->buffer;
 update_tokens_line:
  result->src_loc = pfile->line_table->highest_line;

 skipped_white:
  /* Process any line notes recorded at or before the current
     position, then refresh the location since that may have changed
     the line.  */
  if (buffer->cur >= buffer->notes[buffer->cur_note].pos
      && !pfile->overlaid_buffer)
    {
      _cpp_process_line_notes (pfile, false);
      result->src_loc = pfile->line_table->highest_line;
    }
  c = *buffer->cur++;

  LINEMAP_POSITION_FOR_COLUMN (result->src_loc, pfile->line_table,
                               CPP_BUF_COLUMN (buffer, buffer->cur));

  switch (c)
    {
    case ' ': case '\t': case '\f': case '\v': case '\0':
      result->flags |= PREV_WHITE;
      skip_whitespace (pfile, c);
      goto skipped_white;

    case '\n':
      /* End of the current line: ask for a new one and start over.  */
      if (buffer->cur < buffer->rlimit)
        CPP_INCREMENT_LINE (pfile, 0);
      buffer->need_line = true;
      goto fresh_line;

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->type = CPP_NUMBER;
        if (CPP_OPTION(pfile, pedantic_parse_number))
          pedantic_lex_number (pfile, &result->val.str);
        else
          lex_number (pfile, &result->val.str, &nst);
        warn_about_normalization (pfile, result, &nst);
        break;
      }

    case 'L':
      /* 'L' may introduce wide characters or strings.  */
      if (*buffer->cur == '\'' || *buffer->cur == '"')
        {
          lex_string (pfile, result, buffer->cur - 1);
          break;
        }
      /* Fall through.  */

    case '_':
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    case 's': case 't': case 'u': case 'v': case 'w': case 'x':
    case 'y': case 'z':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    case 'G': case 'H': case 'I': case 'J': case 'K':
    case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
    case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
    case 'Y': case 'Z':
      result->type = CPP_NAME;
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->val.node = lex_identifier (pfile, buffer->cur - 1, false,
                                           &nst);
        warn_about_normalization (pfile, result, &nst);
      }

      /* SDCC _asm specific */
      /* handle _asm ... _endasm ;  */
      if (CPP_OPTION(pfile, preproc_asm) == 0 && result->val.node == pfile->spec_nodes.n__asm)
        {
          comment_start = buffer->cur;
          result->type = CPP_ASM;
          skip_asm_block (pfile);
          /* Save the _asm block as a token in its own right.  */
          save_asm (pfile, result, comment_start);
        }
      /* Convert named operators to their proper types.  */
      else if (result->val.node->flags & NODE_OPERATOR)
        {
          result->flags |= NAMED_OP;
          result->type = (enum cpp_ttype) result->val.node->directive_index;
        }
      break;

    case '\'':
    case '"':
      lex_string (pfile, result, buffer->cur - 1);
      break;

    case '/':
      /* A potential block or line comment.  */
      comment_start = buffer->cur;
      c = *buffer->cur;

      if (c == '*')
        {
          if (_cpp_skip_block_comment (pfile))
            cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
        }
      else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
                            || cpp_in_system_header (pfile)))
        {
          /* Warn about comments only if pedantically GNUC89, and not
             in system headers.  */
          if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
              && ! buffer->warned_cplusplus_comments)
            {
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "C++ style comments are not allowed in ISO C90");
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "(this will be reported only once per input file)");
              buffer->warned_cplusplus_comments = 1;
            }

          if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
            cpp_error (pfile, CPP_DL_WARNING, "multi-line comment");
        }
      else if (c == '=')
        {
          buffer->cur++;
          result->type = CPP_DIV_EQ;
          break;
        }
      else
        {
          result->type = CPP_DIV;
          break;
        }

      /* Only the two comment branches reach this point.  A comment
         that is not saved counts as whitespace before the next
         token.  */
      if (!pfile->state.save_comments)
        {
          result->flags |= PREV_WHITE;
          goto update_tokens_line;
        }

      /* Save the comment as a token in its own right.  */
      save_comment (pfile, result, comment_start, c);
      break;

    case '<':
      /* When angled headers are expected, '<' opens a header name.  */
      if (pfile->state.angled_headers)
        {
          lex_string (pfile, result, buffer->cur - 1);
          break;
        }

      result->type = CPP_LESS;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_LESS_EQ;
      else if (*buffer->cur == '<')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
        }
      else if (*buffer->cur == '?' && CPP_OPTION (pfile, cplusplus))
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_MIN_EQ, CPP_MIN);
        }
      else if (CPP_OPTION (pfile, digraphs))
        {
          /* Digraphs "<:" and "<%".  */
          if (*buffer->cur == ':')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_SQUARE;
            }
          else if (*buffer->cur == '%')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_BRACE;
            }
        }
      break;

    case '>':
      result->type = CPP_GREATER;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_GREATER_EQ;
      else if (*buffer->cur == '>')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
        }
      else if (*buffer->cur == '?' && CPP_OPTION (pfile, cplusplus))
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_MAX_EQ, CPP_MAX);
        }
      break;

    case '%':
      result->type = CPP_MOD;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MOD_EQ;
      else if (CPP_OPTION (pfile, digraphs))
        {
          /* Digraphs "%:", "%:%:" and "%>".  */
          if (*buffer->cur == ':')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_HASH;
              if (*buffer->cur == '%' && buffer->cur[1] == ':')
                buffer->cur += 2, result->type = CPP_PASTE;
            }
          else if (*buffer->cur == '>')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_CLOSE_BRACE;
            }
        }
      break;

    case '.':
      result->type = CPP_DOT;
      /* A dot followed by a digit starts a number.  */
      if (ISDIGIT (*buffer->cur))
        {
          struct normalize_state nst = INITIAL_NORMALIZE_STATE;
          result->type = CPP_NUMBER;
          if (CPP_OPTION(pfile, pedantic_parse_number))
            pedantic_lex_number (pfile, &result->val.str);
          else
            lex_number (pfile, &result->val.str, &nst);
          warn_about_normalization (pfile, result, &nst);
        }
      else if (*buffer->cur == '.' && buffer->cur[1] == '.')
        buffer->cur += 2, result->type = CPP_ELLIPSIS;
      else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_DOT_STAR;
      break;

    case '+':
      result->type = CPP_PLUS;
      if (*buffer->cur == '+')
        buffer->cur++, result->type = CPP_PLUS_PLUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_PLUS_EQ;
      break;

    case '-':
      result->type = CPP_MINUS;
      if (*buffer->cur == '>')
        {
          buffer->cur++;
          result->type = CPP_DEREF;
          if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
            buffer->cur++, result->type = CPP_DEREF_STAR;
        }
      else if (*buffer->cur == '-')
        buffer->cur++, result->type = CPP_MINUS_MINUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MINUS_EQ;
      break;

    case '&':
      result->type = CPP_AND;
      if (*buffer->cur == '&')
        buffer->cur++, result->type = CPP_AND_AND;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_AND_EQ;
      break;

    case '|':
      result->type = CPP_OR;
      if (*buffer->cur == '|')
        buffer->cur++, result->type = CPP_OR_OR;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_OR_EQ;
      break;

    case ':':
      result->type = CPP_COLON;
      if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_SCOPE;
      else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
        {
          buffer->cur++;
          result->flags |= DIGRAPH;
          result->type = CPP_CLOSE_SQUARE;
        }
      break;

    case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
    case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
    case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
    case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
    case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); break;

    case '?': result->type = CPP_QUERY; break;
    case '~': result->type = CPP_COMPL; break;
    case ',': result->type = CPP_COMMA; break;
    case '(': result->type = CPP_OPEN_PAREN; break;
    case ')': result->type = CPP_CLOSE_PAREN; break;
    case '[': result->type = CPP_OPEN_SQUARE; break;
    case ']': result->type = CPP_CLOSE_SQUARE; break;
    case '{': result->type = CPP_OPEN_BRACE; break;
    case '}': result->type = CPP_CLOSE_BRACE; break;
    case ';': result->type = CPP_SEMICOLON; break;

      /* @ is a punctuator in Objective-C.  */
    case '@': result->type = CPP_ATSIGN; break;

    case '$':
    case '\\':
      {
        /* Back up so the character can be examined as a possible
           start of an identifier.  */
        const uchar *base = --buffer->cur;
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;

        if (forms_identifier_p (pfile, true, &nst))
          {
            result->type = CPP_NAME;
            result->val.node = lex_identifier (pfile, base, true, &nst);
            warn_about_normalization (pfile, result, &nst);
            break;
          }
        /* Not an identifier: consume the character again.  */
        buffer->cur++;
      }
      /* Fall through and emit the character as CPP_OTHER.  */

    default:
      create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
      break;
    }

  return result;
}
1577
1578 /* An upper bound on the number of bytes needed to spell TOKEN.
1579    Does not include preceding whitespace.  */
1580 unsigned int
1581 cpp_token_len (const cpp_token *token)
1582 {
1583   unsigned int len;
1584
1585   switch (TOKEN_SPELL (token))
1586     {
1587     default:            len = 4;                                break;
1588     case SPELL_LITERAL: len = token->val.str.len;               break;
1589     case SPELL_IDENT:   len = NODE_LEN (token->val.node) * 10;  break;
1590     }
1591
1592   return len;
1593 }
1594
/* Decode the UTF-8 sequence that starts at NAME and write it to
   BUFFER as a \U escape with eight lowercase hex digits (always
   exactly 10 bytes, not NUL-terminated).  Returns the number of bytes
   of NAME that made up the sequence.  Aborts if a continuation byte
   is ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned long code_point;
  unsigned int lead;
  int seq_len = 0;
  int k;
  int shift;

  /* The number of leading 1 bits in the first byte gives the length
     of the sequence.  */
  lead = *name;
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* Payload bits of the first byte, then six bits from each of the
     following bytes.  */
  code_point = *name & (0x7F >> seq_len);
  for (k = 1; k < seq_len; k++)
    {
      const unsigned char cont = name[k];

      code_point = (code_point << 6) | (cont & 0x3F);

      /* Ill-formed UTF-8: continuation bytes must look like 10xxxxxx.  */
      if ((cont & ~0x3F) != 0x80)
        abort ();
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  buffer += 2;
  for (shift = 28; shift >= 0; shift -= 4)
    *buffer++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
1628
1629
1630 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
1631    already contain the enough space to hold the token's spelling.
1632    Returns a pointer to the character after the last character written.
1633    FORSTRING is true if this is to be the spelling after translation
1634    phase 1 (this is different for UCNs).
1635    FIXME: Would be nice if we didn't need the PFILE argument.  */
1636 unsigned char *
1637 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
1638                  unsigned char *buffer, bool forstring)
1639 {
1640   switch (TOKEN_SPELL (token))
1641     {
1642     case SPELL_OPERATOR:
1643       {
1644         const unsigned char *spelling;
1645         unsigned char c;
1646
1647         if (token->flags & DIGRAPH)
1648           spelling
1649             = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1650         else if (token->flags & NAMED_OP)
1651           goto spell_ident;
1652         else
1653           spelling = TOKEN_NAME (token);
1654
1655         while ((c = *spelling++) != '\0')
1656           *buffer++ = c;
1657       }
1658       break;
1659
1660     spell_ident:
1661     case SPELL_IDENT:
1662       if (forstring)
1663         {
1664           memcpy (buffer, NODE_NAME (token->val.node),
1665                   NODE_LEN (token->val.node));
1666           buffer += NODE_LEN (token->val.node);
1667         }
1668       else
1669         {
1670           size_t i;
1671           const unsigned char * name = NODE_NAME (token->val.node);
1672
1673           for (i = 0; i < NODE_LEN (token->val.node); i++)
1674             if (name[i] & ~0x7F)
1675               {
1676                 i += utf8_to_ucn (buffer, name + i) - 1;
1677                 buffer += 10;
1678               }
1679             else
1680               *buffer++ = NODE_NAME (token->val.node)[i];
1681         }
1682       break;
1683
1684     case SPELL_LITERAL:
1685       memcpy (buffer, token->val.str.text, token->val.str.len);
1686       buffer += token->val.str.len;
1687       break;
1688
1689     case SPELL_NONE:
1690       cpp_error (pfile, CPP_DL_ICE,
1691                  "unspellable token %s", TOKEN_NAME (token));
1692       break;
1693     }
1694
1695   return buffer;
1696 }
1697
1698 /* Returns TOKEN spelt as a null-terminated string.  The string is
1699    freed when the reader is destroyed.  Useful for diagnostics.  */
1700 unsigned char *
1701 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
1702 {
1703   unsigned int len = cpp_token_len (token) + 1;
1704   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1705
1706   end = cpp_spell_token (pfile, token, start, false);
1707   end[0] = '\0';
1708
1709   return start;
1710 }
1711
1712 /* Used by C front ends, which really should move to using
1713    cpp_token_as_text.  */
1714 const char *
1715 cpp_type2name (enum cpp_ttype type)
1716 {
1717   return (const char *) token_spellings[type].name;
1718 }
1719
1720 /* Writes the spelling of token to FP, without any preceding space.
1721    Separated from cpp_spell_token for efficiency - to avoid stdio
1722    double-buffering.  */
1723 void
1724 cpp_output_token (const cpp_token *token, FILE *fp)
1725 {
1726   switch (TOKEN_SPELL (token))
1727     {
1728     case SPELL_OPERATOR:
1729       {
1730         const unsigned char *spelling;
1731         int c;
1732
1733         if (token->flags & DIGRAPH)
1734           spelling
1735             = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1736         else if (token->flags & NAMED_OP)
1737           goto spell_ident;
1738         else
1739           spelling = TOKEN_NAME (token);
1740
1741         c = *spelling;
1742         do
1743           putc (c, fp);
1744         while ((c = *++spelling) != '\0');
1745       }
1746       break;
1747
1748     spell_ident:
1749     case SPELL_IDENT:
1750       {
1751         size_t i;
1752         const unsigned char * name = NODE_NAME (token->val.node);
1753
1754         for (i = 0; i < NODE_LEN (token->val.node); i++)
1755           if (name[i] & ~0x7F)
1756             {
1757               unsigned char buffer[10];
1758               i += utf8_to_ucn (buffer, name + i) - 1;
1759               fwrite (buffer, 1, 10, fp);
1760             }
1761           else
1762             fputc (NODE_NAME (token->val.node)[i], fp);
1763       }
1764       break;
1765
1766     case SPELL_LITERAL:
1767       fwrite (token->val.str.text, 1, token->val.str.len, fp);
1768       break;
1769
1770     case SPELL_NONE:
1771       /* An error, most probably.  */
1772       break;
1773     }
1774 }
1775
1776 /* Compare two tokens.  */
1777 int
1778 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
1779 {
1780   if (a->type == b->type && a->flags == b->flags)
1781     switch (TOKEN_SPELL (a))
1782       {
1783       default:                  /* Keep compiler happy.  */
1784       case SPELL_OPERATOR:
1785         return 1;
1786       case SPELL_NONE:
1787         return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1788       case SPELL_IDENT:
1789         return a->val.node == b->val.node;
1790       case SPELL_LITERAL:
1791         return (a->val.str.len == b->val.str.len
1792                 && !memcmp (a->val.str.text, b->val.str.text,
1793                             a->val.str.len));
1794       }
1795
1796   return 0;
1797 }
1798
1799 /* Returns nonzero if a space should be inserted to avoid an
1800    accidental token paste for output.  For simplicity, it is
1801    conservative, and occasionally advises a space where one is not
1802    needed, e.g. "." and ".2".  */
1803 int
1804 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
1805                  const cpp_token *token2)
1806 {
1807   enum cpp_ttype a = token1->type, b = token2->type;
1808   cppchar_t c;
1809
1810   if (token1->flags & NAMED_OP)
1811     a = CPP_NAME;
1812   if (token2->flags & NAMED_OP)
1813     b = CPP_NAME;
1814
1815   c = EOF;
1816   if (token2->flags & DIGRAPH)
1817     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1818   else if (token_spellings[b].category == SPELL_OPERATOR)
1819     c = token_spellings[b].name[0];
1820
1821   /* Quickly get everything that can paste with an '='.  */
1822   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1823     return 1;
1824
1825   switch (a)
1826     {
1827     case CPP_GREATER:   return c == '>' || c == '?';
1828     case CPP_LESS:      return c == '<' || c == '?' || c == '%' || c == ':';
1829     case CPP_PLUS:      return c == '+';
1830     case CPP_MINUS:     return c == '-' || c == '>';
1831     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
1832     case CPP_MOD:       return c == ':' || c == '>';
1833     case CPP_AND:       return c == '&';
1834     case CPP_OR:        return c == '|';
1835     case CPP_COLON:     return c == ':' || c == '>';
1836     case CPP_DEREF:     return c == '*';
1837     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
1838     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
1839     case CPP_NAME:      return ((b == CPP_NUMBER
1840                                  && name_p (pfile, &token2->val.str))
1841                                 || b == CPP_NAME
1842                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
1843     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
1844                                 || c == '.' || c == '+' || c == '-');
1845                                       /* UCNs */
1846     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
1847                                  && b == CPP_NAME)
1848                                 || (CPP_OPTION (pfile, objc)
1849                                     && token1->val.str.text[0] == '@'
1850                                     && (b == CPP_NAME || b == CPP_STRING)));
1851     default:            break;
1852     }
1853
1854   return 0;
1855 }
1856
1857 /* Output all the remaining tokens on the current line, and a newline
1858    character, to FP.  Leading whitespace is removed.  If there are
1859    macros, special token padding is not performed.  */
1860 void
1861 cpp_output_line (cpp_reader *pfile, FILE *fp)
1862 {
1863   const cpp_token *token;
1864
1865   token = cpp_get_token (pfile);
1866   while (token->type != CPP_EOF)
1867     {
1868       cpp_output_token (token, fp);
1869       token = cpp_get_token (pfile);
1870       if (token->flags & PREV_WHITE)
1871         putc (' ', fp);
1872     }
1873
1874   putc ('\n', fp);
1875 }
1876
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest payload new_buff will allocate.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   BUFF is extended: MIN_EXTRA plus twice the room left in BUFF.
   Every macro argument is parenthesized so that an expression
   argument (e.g. a conditional or bitwise-or) expands correctly.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
1891
1892 /* Create a new allocation buffer.  Place the control block at the end
1893    of the buffer, so that buffer overflows will cause immediate chaos.  */
1894 static _cpp_buff *
1895 new_buff (size_t len)
1896 {
1897   _cpp_buff *result;
1898   unsigned char *base;
1899
1900   if (len < MIN_BUFF_SIZE)
1901     len = MIN_BUFF_SIZE;
1902   len = CPP_ALIGN (len);
1903
1904   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
1905   result = (_cpp_buff *) (base + len);
1906   result->base = base;
1907   result->cur = base;
1908   result->limit = base + len;
1909   result->next = NULL;
1910   return result;
1911 }
1912
1913 /* Place a chain of unwanted allocation buffers on the free list.  */
1914 void
1915 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
1916 {
1917   _cpp_buff *end = buff;
1918
1919   while (end->next)
1920     end = end->next;
1921   end->next = pfile->free_buffs;
1922   pfile->free_buffs = buff;
1923 }
1924
1925 /* Return a free buffer of size at least MIN_SIZE.  */
1926 _cpp_buff *
1927 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
1928 {
1929   _cpp_buff *result, **p;
1930
1931   for (p = &pfile->free_buffs;; p = &(*p)->next)
1932     {
1933       size_t size;
1934
1935       if (*p == NULL)
1936         return new_buff (min_size);
1937       result = *p;
1938       size = result->limit - result->base;
1939       /* Return a buffer that's big enough, but don't waste one that's
1940          way too big.  */
1941       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
1942         break;
1943     }
1944
1945   *p = result->next;
1946   result->next = NULL;
1947   result->cur = result->base;
1948   return result;
1949 }
1950
1951 /* Creates a new buffer with enough space to hold the uncommitted
1952    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
1953    the excess bytes to the new buffer.  Chains the new buffer after
1954    BUFF, and returns the new buffer.  */
1955 _cpp_buff *
1956 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
1957 {
1958   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
1959   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
1960
1961   buff->next = new_buff;
1962   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
1963   return new_buff;
1964 }
1965
1966 /* Creates a new buffer with enough space to hold the uncommitted
1967    remaining bytes of the buffer pointed to by BUFF, and at least
1968    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
1969    Chains the new buffer before the buffer pointed to by BUFF, and
1970    updates the pointer to point to the new buffer.  */
1971 void
1972 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
1973 {
1974   _cpp_buff *new_buff, *old_buff = *pbuff;
1975   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
1976
1977   new_buff = _cpp_get_buff (pfile, size);
1978   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
1979   new_buff->next = old_buff;
1980   *pbuff = new_buff;
1981 }
1982
1983 /* Free a chain of buffers starting at BUFF.  */
1984 void
1985 _cpp_free_buff (_cpp_buff *buff)
1986 {
1987   _cpp_buff *next;
1988
1989   for (; buff; buff = next)
1990     {
1991       next = buff->next;
1992       free (buff->base);
1993     }
1994 }
1995
1996 /* Allocate permanent, unaligned storage of length LEN.  */
1997 unsigned char *
1998 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
1999 {
2000   _cpp_buff *buff = pfile->u_buff;
2001   unsigned char *result = buff->cur;
2002
2003   if (len > (size_t) (buff->limit - result))
2004     {
2005       buff = _cpp_get_buff (pfile, len);
2006       buff->next = pfile->u_buff;
2007       pfile->u_buff = buff;
2008       result = buff->cur;
2009     }
2010
2011   buff->cur = result + len;
2012   return result;
2013 }
2014
2015 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2016    That buffer is used for growing allocations when saving macro
2017    replacement lists in a #define, and when parsing an answer to an
2018    assertion in #assert, #unassert or #if (and therefore possibly
2019    whilst expanding macros).  It therefore must not be used by any
2020    code that they might call: specifically the lexer and the guts of
2021    the macro expander.
2022
2023    All existing other uses clearly fit this restriction: storing
2024    registered pragmas during initialization.  */
2025 unsigned char *
2026 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
2027 {
2028   _cpp_buff *buff = pfile->a_buff;
2029   unsigned char *result = buff->cur;
2030
2031   if (len > (size_t) (buff->limit - result))
2032     {
2033       buff = _cpp_get_buff (pfile, len);
2034       buff->next = pfile->a_buff;
2035       pfile->a_buff = buff;
2036       result = buff->cur;
2037     }
2038
2039   buff->cur = result + len;
2040   return result;
2041 }
2042
2043 /* Say which field of TOK is in use.  */
2044
2045 enum cpp_token_fld_kind
2046 cpp_token_val_index (cpp_token *tok)
2047 {
2048   switch (TOKEN_SPELL (tok))
2049     {
2050     case SPELL_IDENT:
2051       return CPP_TOKEN_FLD_NODE;
2052     case SPELL_LITERAL:
2053       return CPP_TOKEN_FLD_STR;
2054     case SPELL_NONE:
2055       if (tok->type == CPP_MACRO_ARG)
2056         return CPP_TOKEN_FLD_ARG_NO;
2057       else if (tok->type == CPP_PADDING)
2058         return CPP_TOKEN_FLD_SOURCE;
2059       else if (tok->type == CPP_PRAGMA)
2060         return CPP_TOKEN_FLD_STR;
2061       /* else fall through */
2062     default:
2063       return CPP_TOKEN_FLD_NONE;
2064     }
2065 }