git.gag.com Git - fw/sdcc/blob - support/cpp2/libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 2, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; if not, write to the Free Software
  20 Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26 #include <assert.h>
  27
  28 enum spell_type  /* Spelling category stored for each token type in token_spellings.  */
  29 {
  30   SPELL_OPERATOR = 0,  /* Used by OP() table entries, which carry a fixed spelling string.  */
  31   SPELL_IDENT,  /* The remaining categories are selected by name via TK() entries (SPELL_ ## s).  */
  32   SPELL_LITERAL,
  33   SPELL_NONE
  34 };
  35
  36 struct token_spelling  /* One row of the token_spellings table.  */
  37 {
  38   enum spell_type category;  /* Read through TOKEN_SPELL.  */
  39   const unsigned char *name;  /* Read through TOKEN_NAME: the OP() string, or the stringified enumerator for TK().  */
  40 };
  41
  42 static const unsigned char *const digraph_spellings[] =  /* Digraph spellings; how the table is indexed is not visible in this part of the file.  */
  43 { U"%:", U"%:%:", U"<:", U":>", U"<%", U"%>" };
  44
     /* TTYPE_TABLE is expanded with OP/TK defined so that each token type
        contributes one { category, name } initializer: OP entries use the
        given spelling string S, TK entries use the category named by S and
        the stringified enumerator E.  Both helper macros are undefined
        again right after the table is built.  */
  45 #define OP(e, s) { SPELL_OPERATOR, U s  },
  46 #define TK(e, s) { SPELL_ ## s,    U #e },
  47 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  48 #undef OP
  49 #undef TK
  50
     /* Accessors indexing token_spellings by the token's type field.  */
  51 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  52 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  53
     /* Forward declarations of static helpers used in this file.  */
  54 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  55 static int skip_line_comment (cpp_reader *);
  56 static void skip_whitespace (cpp_reader *, cppchar_t);
  57 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  58 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  60                             unsigned int, enum cpp_ttype);
  61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  62 static int name_p (cpp_reader *, const cpp_string *);
  63 static tokenrun *next_tokenrun (tokenrun *);
  64
  65 static _cpp_buff *new_buff (size_t);
  66
  67
  68 /* Utility routine:
  69
  70    Compares, the token TOKEN to the NUL-terminated string STRING.
  71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  72 int
  73 cpp_ideq (const cpp_token *token, const char *string)
  74 {
  75   if (token->type != CPP_NAME)
  76     return 0;
  77
  78   return !ustrcmp (NODE_NAME (token->val.node), (const uchar *) string);
  79 }
  80
  81 /* Record a note TYPE at byte POS into the current cleaned logical
  82    line.  */
  83 static void
  84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  85 {
  86   if (buffer->notes_used == buffer->notes_cap)
  87     {
  88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  90                                   buffer->notes_cap);
  91     }
  92
  93   buffer->notes[buffer->notes_used].pos = pos;
  94   buffer->notes[buffer->notes_used].type = type;
  95   buffer->notes_used++;
  96 }
  97
  98 /* Returns with a logical line that contains no escaped newlines or
  99    trigraphs.  This is a time-critical inner loop.

        The line starting at buffer->next_line is cleaned in place.  On
        entry the note array is reset and buffer->cur and
        buffer->line_base are set to buffer->next_line.  Unless the
        buffer is marked from_stage3, a read-only loop scans to the first
        line end and leaves early only for an escaped newline or a
        trigraph that has to be converted; from that point a second loop
        copies characters down over the removed bytes.  Every escaped
        newline and every trigraph (converted or not) is recorded with
        add_line_note.  On exit the cleaned line ends in '\n', a sentinel
        note of type '\n' is the last note, and buffer->next_line points
        just past the physical line end.  */
 100 void
 101 _cpp_clean_line (cpp_reader *pfile)
 102 {
 103   cpp_buffer *buffer;
 104   const uchar *s;
 105   uchar c, *d, *p;
 106
 107   buffer = pfile->buffer;
 108   buffer->cur_note = buffer->notes_used = 0;
 109   buffer->cur = buffer->line_base = buffer->next_line;
 110   buffer->need_line = false;
 111   s = buffer->next_line - 1;  /* One before the line: both loops advance with *++s.  */
 112
 113   if (!buffer->from_stage3)
 114     {
 115       /* Short circuit for the common case of an un-escaped line with
 116          no trigraphs.  The primary win here is by not writing any
 117          data back to memory until we have to.  */
 118       for (;;)
 119         {
 120           c = *++s;
 121           if (c == '\n' || c == '\r')
 122             {
 123               d = (uchar *) s;
 124
 125               if (s == buffer->rlimit)
 126                 goto done;
 127
 128               /* DOS line ending? */
 129               if (c == '\r' && s[1] == '\n')
 130                 s++;
 131
 132               if (s == buffer->rlimit)
 133                 goto done;
 134
 135               /* check for escaped newline */
 136               p = d;
 137               while (p != buffer->next_line && is_nvspace (p[-1]))  /* Step back over blanks before the line end.  */
 138                 p--;
 139               if (p == buffer->next_line || p[-1] != '\\')
 140                 goto done;
 141
 142               /* Have an escaped newline; process it and proceed to
 143                  the slow path.  */
 144               add_line_note (buffer, p - 1, p != d ? ' ' : '\\');  /* Type ' ' when blanks separate backslash and newline.  */
 145               d = p - 2;  /* The slow loop stores with *++d, so the next character overwrites the backslash.  */
 146               buffer->next_line = p - 1;  /* Later backward scans for a backslash stop here.  */
 147               break;
 148             }
 149           if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
 150             {
 151               /* Have a trigraph.  We may or may not have to convert
 152                  it.  Add a line note regardless, for -Wtrigraphs.  */
 153               add_line_note (buffer, s, s[2]);
 154               if (CPP_OPTION (pfile, trigraphs))
 155                 {
 156                   /* We do, and that means we have to switch to the
 157                      slow path.  */
 158                   d = (uchar *) s;
 159                   *d = _cpp_trigraph_map[s[2]];  /* Replace the first '?' with the mapped character.  */
 160                   s += 2;  /* Skip the remaining two trigraph bytes.  */
 161                   break;
 162                 }
 163             }
 164         }
 165
 166
       /* Slow path: S reads ahead of D, which writes the cleaned text.  */
 167       for (;;)
 168         {
 169           c = *++s;
 170           *++d = c;
 171
 172           if (c == '\n' || c == '\r')
 173             {
 174                   /* Handle DOS line endings.  */
 175               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
 176                 s++;
 177               if (s == buffer->rlimit)
 178                 break;
 179
 180               /* Escaped?  */
 181               p = d;
 182               while (p != buffer->next_line && is_nvspace (p[-1]))
 183                 p--;
 184               if (p == buffer->next_line || p[-1] != '\\')
 185                 break;
 186
 187               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
 188               d = p - 2;
 189               buffer->next_line = p - 1;
 190             }
 191           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
 192             {
 193               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
 194               add_line_note (buffer, d, s[2]);  /* Position is taken in the cleaned (destination) text.  */
 195               if (CPP_OPTION (pfile, trigraphs))
 196                 {
 197                   *d = _cpp_trigraph_map[s[2]];
 198                   s += 2;
 199                 }
 200             }
 201         }
 202     }
 203   else
 204     {
       /* from_stage3 buffers get no trigraph or escaped-newline
          processing; only the line end is located.  */
 205       do
 206         s++;
 207       while (*s != '\n' && *s != '\r');
 208       d = (uchar *) s;
 209
 210       /* Handle DOS line endings.  */
 211       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
 212         s++;
 213     }
 214
 215  done:
 216   *d = '\n';  /* Terminate the cleaned line, whatever the original line ending was.  */
 217   /* A sentinel note that should never be processed.  */
 218   add_line_note (buffer, d + 1, '\n');
 219   buffer->next_line = s + 1;
 220 }
 221
 222 /* Return true if the trigraph indicated by NOTE should be warned
 223    about in a comment.  */
 224 static bool
 225 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 226 {
 227   const uchar *p;
 228
 229   /* Within comments we don't warn about trigraphs, unless the
 230      trigraph forms an escaped newline, as that may change
 231      behavior.  */
 232   if (note->type != '/')
 233     return false;
 234
 235   /* If -trigraphs, then this was an escaped newline iff the next note
 236      is coincident.  */
 237   if (CPP_OPTION (pfile, trigraphs))
 238     return note[1].pos == note->pos;
 239
 240   /* Otherwise, see if this forms an escaped newline.  */
 241   p = note->pos + 3;
 242   while (is_nvspace (*p))
 243     p++;
 244
 245   /* There might have been escaped newlines between the trigraph and the
 246      newline we found.  Hence the position test.  */
 247   return (*p == '\n' && p < note[1].pos);
 248 }
 249
 250 /* Process the notes created by add_line_note as far as the current
 251    location.  */
 252 void
 253 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 254 {
 255   cpp_buffer *buffer = pfile->buffer;
 256
 257   for (;;)
 258     {
 259       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 260       unsigned int col;
 261
 262       if (note->pos > buffer->cur)
 263         break;
 264
 265       buffer->cur_note++;
 266       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 267
 268       if (note->type == '\\' || note->type == ' ')
 269         {
 270           if (note->type == ' ' && !in_comment)
 271             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 272                                  "backslash and newline separated by space");
 273
 274           if (buffer->next_line > buffer->rlimit)
 275             {
 276               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
 277                                    "backslash-newline at end of file");
 278               /* Prevent "no newline at end of file" warning.  */
 279               buffer->next_line = buffer->rlimit;
 280             }
 281
 282           buffer->line_base = note->pos;
 283           CPP_INCREMENT_LINE (pfile, 0);
 284         }
 285       else if (_cpp_trigraph_map[note->type])
 286         {
 287           if (CPP_OPTION (pfile, warn_trigraphs)
 288               && (!in_comment || warn_in_comment (pfile, note)))
 289             {
 290               if (CPP_OPTION (pfile, trigraphs))
 291                 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 292                                      "trigraph ??%c converted to %c",
 293                                      note->type,
 294                                      (int) _cpp_trigraph_map[note->type]);
 295               else
 296                 {
 297                   cpp_error_with_line
 298                     (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 299                      "trigraph ??%c ignored, use -trigraphs to enable",
 300                      note->type);
 301                 }
 302             }
 303         }
 304       else
 305         abort ();
 306     }
 307 }
 308
 309 /* SDCC _asm specific */
 310 /* Skip an _asm ... _endasm block.  We find the end of the comment by
 311    seeing _endasm.  Returns non-zero if _asm terminated by EOF, zero
 312    otherwise.  */
 313 static int
 314 skip_asm_block (cpp_reader *pfile)
 315 {
 316 #define _ENDASM_STR "endasm"
 317 #define _ENDASM_LEN ((sizeof _ENDASM_STR) - 1)
 318
 319   cpp_buffer *buffer = pfile->buffer;
 320   cppchar_t c = EOF;
 321   int prev_space = 0;
 322   int ret = 1;
 323
 324   while (buffer->cur != buffer->rlimit)
 325     {
 326       prev_space = is_space(c);
 327       c = *buffer->cur++;
 328
 329       if (prev_space && c == '_')
 330         {
 331           if (buffer->cur + _ENDASM_LEN <= buffer->rlimit &&
 332             strncmp((char *)buffer->cur, _ENDASM_STR, _ENDASM_LEN) == 0)
 333             {
 334               buffer->cur += _ENDASM_LEN;
 335               ret = 0;
 336               break;
 337             }
 338         }
 339       else if (c == '\n')
 340         {
 341           unsigned int cols;
 342           --buffer->cur;
 343           _cpp_process_line_notes (pfile, true);
 344           if (buffer->next_line >= buffer->rlimit)
 345             return true;
 346           _cpp_clean_line (pfile);
 347
 348           cols = buffer->next_line - buffer->line_base;
 349           CPP_INCREMENT_LINE (pfile, cols);
 350         }
 351     }
 352
 353   _cpp_process_line_notes (pfile, true);
 354   return ret;
 355 }
 356
 357 /* Skip a C-style block comment.  We find the end of the comment by
 358    seeing if an asterisk is before every '/' we encounter.  Returns
 359    nonzero if comment terminated by EOF, zero otherwise.
 360
 361    Buffer->cur points to the initial asterisk of the comment.  */
 362 bool
 363 _cpp_skip_block_comment (cpp_reader *pfile)
 364 {
 365   cpp_buffer *buffer = pfile->buffer;
 366   const uchar *cur = buffer->cur;
 367   uchar c;
 368
 369   cur++;
 370   if (*cur == '/')
 371     cur++;
 372
 373   for (;;)
 374     {
 375       /* People like decorating comments with '*', so check for '/'
 376          instead for efficiency.  */
 377       c = *cur++;
 378
 379       if (c == '/')
 380         {
 381           if (cur[-2] == '*')
 382             break;
 383
 384           /* Warn about potential nested comments, but not if the '/'
 385              comes immediately before the true comment delimiter.
 386              Don't bother to get it right across escaped newlines.  */
 387           if (CPP_OPTION (pfile, warn_comments)
 388               && cur[0] == '*' && cur[1] != '/')
 389             {
 390               buffer->cur = cur;
 391               cpp_error_with_line (pfile, CPP_DL_WARNING,
 392                                    pfile->line_table->highest_line, CPP_BUF_COL (buffer),
 393                                    "\"/*\" within comment");
 394             }
 395         }
 396       else if (c == '\n')
 397         {
 398           unsigned int cols;
 399           buffer->cur = cur - 1;
 400           _cpp_process_line_notes (pfile, true);
 401           if (buffer->next_line >= buffer->rlimit)
 402             return true;
 403           _cpp_clean_line (pfile);
 404
 405           cols = buffer->next_line - buffer->line_base;
 406           CPP_INCREMENT_LINE (pfile, cols);
 407
 408           cur = buffer->cur;
 409         }
 410     }
 411
 412   buffer->cur = cur;
 413   _cpp_process_line_notes (pfile, true);
 414   return false;
 415 }
 416
 417 /* Skip a C++ line comment, leaving buffer->cur pointing to the
 418    terminating newline.  Handles escaped newlines.  Returns nonzero
 419    if a multiline comment.  */
 420 static int
 421 skip_line_comment (cpp_reader *pfile)
 422 {
 423   cpp_buffer *buffer = pfile->buffer;
 424   unsigned int orig_line = pfile->line_table->highest_line;
 425
 426   while (*buffer->cur != '\n')
 427     buffer->cur++;
 428
 429   _cpp_process_line_notes (pfile, true);
 430   return orig_line != pfile->line_table->highest_line;
 431 }
 432
 433 /* Skips whitespace, saving the next non-whitespace character.  */
 434 static void
 435 skip_whitespace (cpp_reader *pfile, cppchar_t c)
 436 {
 437   cpp_buffer *buffer = pfile->buffer;
 438   bool saw_NUL = false;
 439
 440   do
 441     {
 442       /* Horizontal space always OK.  */
 443       if (c == ' ' || c == '\t')
 444         ;
 445       /* Just \f \v or \0 left.  */
 446       else if (c == '\0')
 447         saw_NUL = true;
 448       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
 449         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
 450                              CPP_BUF_COL (buffer),
 451                              "%s in preprocessing directive",
 452                              c == '\f' ? "form feed" : "vertical tab");
 453
 454       c = *buffer->cur++;
 455     }
 456   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
 457   while (is_nvspace (c));
 458
 459   if (saw_NUL)
 460     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
 461
 462   buffer->cur--;
 463 }
 464
 465 /* See if the characters of a number token are valid in a name (no
 466    '.', '+' or '-').  */
 467 static int
 468 name_p (cpp_reader *pfile, const cpp_string *string)
 469 {
 470   unsigned int i;
 471
 472   for (i = 0; i < string->len; i++)
 473     if (!is_idchar (string->text[i]))
 474       return 0;
 475
 476   return 1;
 477 }
 478
 479 /* After parsing an identifier or other sequence, produce a warning about
 480    sequences not in NFC/NFKC.  */
 481 static void
 482 warn_about_normalization (cpp_reader *pfile,
 483                           const cpp_token *token,
 484                           const struct normalize_state *s)
 485 {
 486   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
 487       && !pfile->state.skipping)
 488     {
 489       /* Make sure that the token is printed using UCNs, even
 490          if we'd otherwise happily print UTF-8.  */
 491       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
 492       size_t sz;
 493
 494       sz = cpp_spell_token (pfile, token, buf, false) - buf;
 495       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
 496         cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
 497                              "`%.*s' is not in NFKC", (int) sz, buf);
 498       else
 499         cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
 500                              "`%.*s' is not in NFC", (int) sz, buf);
 501     }
 502 }
 503
 504 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
 505    an identifier.  FIRST is TRUE if this starts an identifier.  */
 506 static bool
 507 forms_identifier_p (cpp_reader *pfile, int first,
 508                     struct normalize_state *state)
 509 {
 510   cpp_buffer *buffer = pfile->buffer;
 511
 512   if (*buffer->cur == '$')
 513     {
 514       if (!CPP_OPTION (pfile, dollars_in_ident))
 515         return false;
 516
 517       buffer->cur++;
 518       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
 519         {
 520           CPP_OPTION (pfile, warn_dollars) = 0;
 521           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
 522         }
 523
 524       return true;
 525     }
 526
 527   /* Is this a syntactically valid UCN?  */
 528   if (CPP_OPTION (pfile, extended_identifiers)
 529       && *buffer->cur == '\\'
 530       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
 531     {
 532       buffer->cur += 2;
 533       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
 534                           state))
 535         return true;
 536       buffer->cur -= 2;
 537     }
 538
 539   return false;
 540 }
 541
 542 /* Lex an identifier starting at BUFFER->CUR - 1.  */
 543 static cpp_hashnode *
 544 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
 545                 struct normalize_state *nst)
 546 {
 547   cpp_hashnode *result;
 548   const uchar *cur;
 549   unsigned int len;
 550   unsigned int hash = HT_HASHSTEP (0, *base);
 551
 552   cur = pfile->buffer->cur;
 553   if (! starts_ucn)
 554     while (ISIDNUM (*cur))
 555       {
 556         hash = HT_HASHSTEP (hash, *cur);
 557         cur++;
 558       }
 559   pfile->buffer->cur = cur;
 560   if (starts_ucn || forms_identifier_p (pfile, false, nst))
 561     {
 562       /* Slower version for identifiers containing UCNs (or $).  */
 563       do {
 564         while (ISIDNUM (*pfile->buffer->cur))
 565           {
 566             pfile->buffer->cur++;
 567             NORMALIZE_STATE_UPDATE_IDNUM (nst);
 568           }
 569       } while (forms_identifier_p (pfile, false, nst));
 570       result = _cpp_interpret_identifier (pfile, base,
 571                                           pfile->buffer->cur - base);
 572     }
 573   else
 574     {
 575       len = cur - base;
 576       hash = HT_HASHFINISH (hash, len);
 577
 578       result = (cpp_hashnode *)
 579         ht_lookup_with_hash (pfile->hash_table, base, len, hash, HT_ALLOC);
 580     }
 581
 582   /* Rarely, identifiers require diagnostics when lexed.  */
 583   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
 584                         && !pfile->state.skipping, 0))
 585     {
 586       /* It is allowed to poison the same identifier twice.  */
 587       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
 588         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
 589                    NODE_NAME (result));
 590
 591       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
 592          replacement list of a variadic macro.  */
 593       if (result == pfile->spec_nodes.n__VA_ARGS__
 594           && !pfile->state.va_args_ok)
 595         cpp_error (pfile, CPP_DL_PEDWARN,
 596                    "__VA_ARGS__ can only appear in the expansion"
 597                    " of a C99 variadic macro");
 598     }
 599
 600   return result;
 601 }
 602
 603 /* SDCC specific */
 604 /* Pedantic parse a number, beginning with character C, skipping embedded
 605    backslash-newlines.  LEADING_PERIOD is nonzero if there was a "."
 606    before C.  Place the result in NUMBER.  */
 607 static void
 608 pedantic_lex_number (cpp_reader *pfile, cpp_string *number)
 609 {
 610 #define get_effective_char(pfile) (*pfile->buffer->cur++)
 611 #define BACKUP() (--pfile->buffer->cur)
 612
 613   enum num_type_e { NT_DEC, NT_HEX } num_type = NT_DEC;
 614   enum num_part_e { NP_WHOLE, NP_FRACT, NP_EXP, NP_INT_SUFFIX, NP_FLOAT_SUFFIX } num_part = NP_WHOLE;
 615
 616   uchar c = *(pfile->buffer->cur - 1);
 617   struct obstack *stack = &pfile->hash_table->stack;
 618   int len = 0;
 619   int has_whole = 0;
 620   int has_fract = 0;
 621
 622   if ('.' == c)
 623     {
 624       num_part = NP_FRACT;
 625       ++len;
 626       obstack_1grow (stack, '.');
 627       c = get_effective_char(pfile);
 628     }
 629   else
 630     {
 631       if ('0' == c)
 632         {
 633           has_whole = 1;
 634           ++len;
 635           obstack_1grow (stack, c);
 636           c = get_effective_char(pfile);
 637
 638           switch (c)
 639             {
 640             case 'X':
 641             case 'x':
 642               num_type = NT_HEX;
 643               ++len;
 644               obstack_1grow (stack, c);
 645               c = get_effective_char(pfile);
 646               break;
 647
 648             case '.':
 649               num_part = NP_FRACT;
 650               ++len;
 651               obstack_1grow (stack, c);
 652               c = get_effective_char(pfile);
 653               break;
 654             }
 655         }
 656     }
 657
 658   for (; ; )
 659     {
 660       switch (num_part)
 661         {
 662         case NP_WHOLE:
 663           if (NT_DEC == num_type)
 664             {
 665               while (ISDIGIT (c))
 666                 {
 667                   has_whole = 1;
 668                   ++len;
 669                   obstack_1grow (stack, c);
 670                   c = get_effective_char(pfile);
 671                 }
 672
 673               if ('.' == c)
 674                 {
 675                   num_part = NP_FRACT;
 676                   ++len;
 677                   obstack_1grow (stack, c);
 678                   c = get_effective_char(pfile);
 679                   continue;
 680                 }
 681               else if ('E' == c || 'e' == c)
 682                 {
 683                   if (has_whole || has_fract)
 684                   {
 685                     num_part = NP_EXP;
 686                     ++len;
 687                     obstack_1grow (stack, c);
 688                     c = get_effective_char(pfile);
 689                     continue;
 690                   }
 691                   else
 692                     break;
 693                 }
 694             }
 695           else
 696             {
 697               while (ISXDIGIT (c))
 698                 {
 699                   has_whole = 1;
 700                   ++len;
 701                   obstack_1grow (stack, c);
 702                   c = get_effective_char(pfile);
 703                 }
 704
 705               if ('.' == c)
 706                 {
 707                   num_part = NP_FRACT;
 708                   ++len;
 709                   obstack_1grow (stack, c);
 710                   c = get_effective_char(pfile);
 711                   continue;
 712                 }
 713               else if ('P' == c || 'p' == c)
 714                 {
 715                   if (has_whole || has_fract)
 716                     {
 717                       num_part = NP_EXP;
 718                       ++len;
 719                       obstack_1grow (stack, c);
 720                       c = get_effective_char(pfile);
 721                       continue;
 722                     }
 723                   else
 724                     break;
 725                 }
 726             }
 727           num_part = NP_INT_SUFFIX;
 728           continue;
 729
 730         case NP_FRACT:
 731           if (NT_DEC == num_type)
 732             {
 733               while (ISDIGIT (c))
 734                 {
 735                   has_fract = 1;
 736                   ++len;
 737                   obstack_1grow (stack, c);
 738                   c = get_effective_char(pfile);
 739                 }
 740
 741               if ('E' == c || 'e' == c)
 742                 {
 743                   if (has_whole || has_fract)
 744                     {
 745                       num_part = NP_EXP;
 746                       ++len;
 747                       obstack_1grow (stack, c);
 748                       c = get_effective_char(pfile);
 749                       continue;
 750                     }
 751                 }
 752             }
 753           else
 754             {
 755               while (ISXDIGIT (c))
 756                 {
 757                   has_fract = 1;
 758                   ++len;
 759                   obstack_1grow (stack, c);
 760                   c = get_effective_char(pfile);
 761                 }
 762
 763               if ('P' == c || 'p' == c)
 764                 {
 765                   if (has_whole || has_fract)
 766                     {
 767                       num_part = NP_EXP;
 768                       ++len;
 769                       obstack_1grow (stack, c);
 770                       c = get_effective_char(pfile);
 771                       continue;
 772                     }
 773                 }
 774             }
 775           num_part = NP_FLOAT_SUFFIX;
 776           continue;
 777
 778         case NP_EXP:
 779           if ('+' == c || '-' == c)
 780             {
 781               ++len;
 782               obstack_1grow (stack, c);
 783               c = get_effective_char(pfile);
 784             }
 785
 786           while (ISDIGIT (c))
 787             {
 788               ++len;
 789               obstack_1grow (stack, c);
 790               c = get_effective_char(pfile);
 791             }
 792
 793           num_part = NP_FLOAT_SUFFIX;
 794           continue;
 795
 796         case NP_INT_SUFFIX:
 797            if ('L' == c || 'l' == c)
 798             {
 799               uchar prevc = c;
 800
 801               ++len;
 802               obstack_1grow (stack, c);
 803               c = get_effective_char(pfile);
 804
 805               if (c == prevc)
 806                 {
 807                   ++len;
 808                   obstack_1grow (stack, c);
 809                   c = get_effective_char(pfile);
 810                 }
 811             }
 812           else if ('U' == c || 'u' == c)
 813             {
 814               ++len;
 815               obstack_1grow (stack, c);
 816               c = get_effective_char(pfile);
 817             }
 818           break;
 819
 820         case NP_FLOAT_SUFFIX:
 821            if ('F' == c || 'f' == c)
 822             {
 823               ++len;
 824               obstack_1grow (stack, c);
 825               c = get_effective_char(pfile);
 826             }
 827           else if ('L' == c || 'l' == c)
 828             {
 829               ++len;
 830               obstack_1grow (stack, c);
 831               c = get_effective_char(pfile);
 832             }
 833           break;
 834         }
 835       break;
 836     }
 837
 838   /* Step back over the unwanted char.  */
 839   BACKUP ();
 840
 841   number->text = obstack_finish (stack);
 842   number->len = len;
 843 }
 844
 845 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
 846 static void
 847 lex_number (cpp_reader *pfile, cpp_string *number,
 848             struct normalize_state *nst)
 849 {
 850   const uchar *cur;
 851   const uchar *base;
 852   uchar *dest;
 853
 854   base = pfile->buffer->cur - 1;
 855   do
 856     {
 857       cur = pfile->buffer->cur;
 858
 859       /* N.B. ISIDNUM does not include $.  */
 860       while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
 861         {
 862           cur++;
 863           NORMALIZE_STATE_UPDATE_IDNUM (nst);
 864         }
 865
 866       pfile->buffer->cur = cur;
 867     }
 868   while (forms_identifier_p (pfile, false, nst));
 869
 870   number->len = cur - base;
 871   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
 872   memcpy (dest, base, number->len);
 873   dest[number->len] = '\0';
 874   number->text = dest;
 875 }
 876
 877 /* Create a token of type TYPE with a literal spelling.  */
 878 static void
 879 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
 880                 unsigned int len, enum cpp_ttype type)
 881 {
 882   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
 883
 884   memcpy (dest, base, len);
 885   dest[len] = '\0';
 886   token->type = type;
 887   token->val.str.len = len;
 888   token->val.str.text = dest;
 889 }
 890
 891 /* Lexes a string, character constant, or angle-bracketed header file
 892    name.  The stored string contains the spelling, including opening
 893    quote and leading any leading 'L'.  It returns the type of the
 894    literal, or CPP_OTHER if it was not properly terminated.
 895
 896    The spelling is NUL-terminated, but it is not guaranteed that this
 897    is the first NUL since embedded NULs are preserved.  */
 898 static void
 899 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
 900 {
 901   bool saw_NUL = false;
 902   const uchar *cur;
 903   cppchar_t terminator;
 904   enum cpp_ttype type;
 905
 906   cur = base;
 907   terminator = *cur++;
 908   if (terminator == 'L')
 909     terminator = *cur++;
 910   if (terminator == '\"')
 911     type = *base == 'L' ? CPP_WSTRING: CPP_STRING;
 912   else if (terminator == '\'')
 913     type = *base == 'L' ? CPP_WCHAR: CPP_CHAR;
 914   else
 915     terminator = '>', type = CPP_HEADER_NAME;
 916
 917   for (;;)
 918     {
 919       cppchar_t c = *cur++;
 920
 921       /* In #include-style directives, terminators are not escapable.  */
 922       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
 923         cur++;
 924       else if (c == terminator)
 925         break;
 926       else if (c == '\n')
 927         {
 928           cur--;
 929           type = CPP_OTHER;
 930           break;
 931         }
 932       else if (c == '\0')
 933         saw_NUL = true;
 934     }
 935
 936   if (saw_NUL && !pfile->state.skipping)
 937     cpp_error (pfile, CPP_DL_WARNING,
 938                "null character(s) preserved in literal");
 939
 940   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
 941     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
 942                (int) terminator);
 943
 944   pfile->buffer->cur = cur;
 945   create_literal (pfile, token, base, cur - base, type);
 946 }
 947
 948 /* Fixed _WIN32 problem with CR-CR-LF sequences when outputting
 949    comment blocks (when executed with -C option) and
 950    _asm (SDCPP specific) blocks */
 951
 952 /* Count and copy characters from src to dest, excluding CRs:
 953    CRs are automatically generated, because the output is
 954    opened in TEXT mode. If dest == NULL, only count chars */
 955 static unsigned int
 956 copy_text_chars (unsigned char *dest, const unsigned char *src, unsigned int len)
 957 {
 958   unsigned int n = 0;
 959   const unsigned char *p;
 960
 961   for (p = src; p != src + len; ++p)
 962     {
 963       assert(*p != '\0');
 964
 965       if (*p != '\r')
 966         {
 967           if (dest != NULL)
 968             *dest++ = *p;
 969           ++n;
 970         }
 971     }
 972
 973     return n;
 974 }
 975
 976 /* SDCC _asm specific */
 977 /* The stored comment includes the comment start and any terminator.  */
 978 static void
 979 save_asm (cpp_reader *pfile, cpp_token *token, const unsigned char *from)
 980 {
 981 #define _ASM_STR  "_asm"
 982 #define _ASM_LEN  ((sizeof _ASM_STR) - 1)
 983
 984   unsigned char *buffer;
 985   unsigned int text_len, len;
 986
 987   len = pfile->buffer->cur - from;
 988   /* + _ASM_LEN for the initial '_asm'.  */
 989   text_len = copy_text_chars (NULL, from, len) + _ASM_LEN;
 990   buffer = _cpp_unaligned_alloc (pfile, text_len);
 991
 992
 993   token->type = CPP_ASM;
 994   token->val.str.len = text_len;
 995   token->val.str.text = buffer;
 996
 997   memcpy (buffer, _ASM_STR, _ASM_LEN);
 998   copy_text_chars (buffer + _ASM_LEN, from, len);
 999 }
1000
1001 /* The stored comment includes the comment start and any terminator.  */
1002 static void
1003 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1004               cppchar_t type)
1005 {
1006   unsigned char *buffer;
1007   unsigned int len, clen;
1008
1009   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
1010
1011   /* C++ comments probably (not definitely) have moved past a new
1012      line, which we don't want to save in the comment.  */
1013   if (is_vspace (pfile->buffer->cur[-1]))
1014     len--;
1015
1016   /* If we are currently in a directive, then we need to store all
1017      C++ comments as C comments internally, and so we need to
1018      allocate a little extra space in that case.
1019
1020      Note that the only time we encounter a directive here is
1021      when we are saving comments in a "#define".  */
1022   clen = (pfile->state.in_directive && type == '/') ? len + 2 : len;
1023
1024   buffer = _cpp_unaligned_alloc (pfile, clen);
1025
1026   token->type = CPP_COMMENT;
1027   token->val.str.len = clen;
1028   token->val.str.text = buffer;
1029
1030   buffer[0] = '/';
1031   copy_text_chars (buffer + 1, from, len);
1032
1033   /* Finish conversion to a C comment, if necessary.  */
1034   if (pfile->state.in_directive && type == '/')
1035     {
1036       buffer[1] = '*';
1037       buffer[clen - 2] = '*';
1038       buffer[clen - 1] = '/';
1039     }
1040 }
1041
1042 /* Allocate COUNT tokens for RUN.  */
1043 void
1044 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1045 {
1046   run->base = XNEWVEC (cpp_token, count);
1047   run->limit = run->base + count;
1048   run->next = NULL;
1049 }
1050
1051 /* Returns the next tokenrun, or creates one if there is none.  */
1052 static tokenrun *
1053 next_tokenrun (tokenrun *run)
1054 {
1055   if (run->next == NULL)
1056     {
1057       run->next = XNEW (tokenrun);
1058       run->next->prev = run;
1059       _cpp_init_tokenrun (run->next, 250);
1060     }
1061
1062   return run->next;
1063 }
1064
1065 /* Allocate a single token that is invalidated at the same time as the
1066    rest of the tokens on the line.  Has its line and col set to the
1067    same as the last lexed token, so that diagnostics appear in the
1068    right place.  */
1069 cpp_token *
1070 _cpp_temp_token (cpp_reader *pfile)
1071 {
1072   cpp_token *old, *result;
1073
1074   old = pfile->cur_token - 1;
1075   if (pfile->cur_token == pfile->cur_run->limit)
1076     {
1077       pfile->cur_run = next_tokenrun (pfile->cur_run);
1078       pfile->cur_token = pfile->cur_run->base;
1079     }
1080
1081   result = pfile->cur_token++;
1082   result->src_loc = old->src_loc;
1083   return result;
1084 }
1085
1086 /* Lex a token into RESULT (external interface).  Takes care of issues
1087    like directive handling, token lookahead, multiple include
1088    optimization and skipping.  */
1089 const cpp_token *
1090 _cpp_lex_token (cpp_reader *pfile)
1091 {
1092   cpp_token *result;
1093
1094   for (;;)
1095     {
1096       if (pfile->cur_token == pfile->cur_run->limit)
1097         {
1098           pfile->cur_run = next_tokenrun (pfile->cur_run);
1099           pfile->cur_token = pfile->cur_run->base;
1100         }
1101
1102       if (pfile->lookaheads)
1103         {
1104           pfile->lookaheads--;
1105           result = pfile->cur_token++;
1106         }
1107       else
1108         result = _cpp_lex_direct (pfile);
1109
1110       if (result->flags & BOL)
1111         {
1112           /* Is this a directive.  If _cpp_handle_directive returns
1113              false, it is an assembler #.  */
1114           if (result->type == CPP_HASH
1115               /* 6.10.3 p 11: Directives in a list of macro arguments
1116                  gives undefined behavior.  This implementation
1117                  handles the directive as normal.  */
1118               && pfile->state.parsing_args != 1)
1119             {
1120               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
1121                 {
1122                   if (pfile->directive_result.type == CPP_PADDING)
1123                     continue;
1124                   result = &pfile->directive_result;
1125                 }
1126             }
1127           else if (pfile->state.in_deferred_pragma)
1128             result = &pfile->directive_result;
1129
1130           if (pfile->cb.line_change && !pfile->state.skipping)
1131             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
1132         }
1133
1134       /* We don't skip tokens in directives.  */
1135       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
1136         break;
1137
1138       /* Outside a directive, invalidate controlling macros.  At file
1139          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
1140          get here and MI optimization works.  */
1141       pfile->mi_valid = false;
1142
1143       if (!pfile->state.skipping || result->type == CPP_EOF)
1144         break;
1145     }
1146
1147   return result;
1148 }
1149
1150 /* Returns true if a fresh line has been loaded.  */
1151 bool
1152 _cpp_get_fresh_line (cpp_reader *pfile)
1153 {
1154   int return_at_eof;
1155
1156   /* We can't get a new line until we leave the current directive.  */
1157   if (pfile->state.in_directive)
1158     return false;
1159
1160   for (;;)
1161     {
1162       cpp_buffer *buffer = pfile->buffer;
1163
1164       if (!buffer->need_line)
1165         return true;
1166
1167       if (buffer->next_line < buffer->rlimit)
1168         {
1169           _cpp_clean_line (pfile);
1170           return true;
1171         }
1172
1173       /* First, get out of parsing arguments state.  */
1174       if (pfile->state.parsing_args)
1175         return false;
1176
1177       /* End of buffer.  Non-empty files should end in a newline.  */
1178       if (buffer->buf != buffer->rlimit
1179           && buffer->next_line > buffer->rlimit
1180           && !buffer->from_stage3)
1181         {
1182           /* Only warn once.  */
1183           buffer->next_line = buffer->rlimit;
1184           cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1185                                CPP_BUF_COLUMN (buffer, buffer->cur),
1186                                "no newline at end of file");
1187         }
1188
1189       return_at_eof = buffer->return_at_eof;
1190       _cpp_pop_buffer (pfile);
1191       if (pfile->buffer == NULL || return_at_eof)
1192         return false;
1193     }
1194 }
1195
/* Helper for _cpp_lex_direct; relies on the `buffer' and `result'
   variables of the code it is expanded in.  Sets result->type to
   THEN_TYPE and consumes one character if the next input character is
   CHAR; otherwise sets it to ELSE_TYPE and consumes nothing.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      result->type = (ELSE_TYPE);                       \
      if (*buffer->cur == (CHAR))                       \
        {                                               \
          buffer->cur++;                                \
          result->type = (THEN_TYPE);                   \
        }                                               \
    }                                                   \
  while (0)
1204
/* Lex a token into pfile->cur_token, which is also incremented, to
   get diagnostics pointing to the correct location.

   Does not handle issues such as token lookahead, multiple-include
   optimization, directives, skipping etc.  This function is only
   suitable for use by _cpp_lex_token, and in special cases like
   lex_expansion_token which doesn't care for any of these issues.

   When meeting a newline, returns CPP_EOF if parsing a directive,
   otherwise returns to the start of the token buffer if permissible.
   Returns a pointer to the lexed token.  */
cpp_token *
_cpp_lex_direct (cpp_reader *pfile)
{
  cppchar_t c;
  cpp_buffer *buffer;
  const unsigned char *comment_start;
  /* Claim the next token slot; it may be swapped for the first slot
     of the base run below when a fresh line is started.  */
  cpp_token *result = pfile->cur_token++;

  /* (Re)start here whenever a new line may have to be fetched.  */
 fresh_line:
  result->flags = 0;
  buffer = pfile->buffer;
  if (buffer->need_line)
    {
      /* The end of the line ends a deferred pragma; report that as
         its own token.  */
      if (pfile->state.in_deferred_pragma)
        {
          result->type = CPP_PRAGMA_EOL;
          pfile->state.in_deferred_pragma = false;
          if (!pfile->state.pragma_allow_expansion)
            pfile->state.prevent_expansion--;
          return result;
        }
      /* No further line can be supplied: hand back CPP_EOF.  */
      if (!_cpp_get_fresh_line (pfile))
        {
          result->type = CPP_EOF;
          if (!pfile->state.in_directive)
            {
              /* Tell the compiler the line number of the EOF token.  */
              result->src_loc = pfile->line_table->highest_line;
              result->flags = BOL;
            }
          return result;
        }
      /* Unless tokens have to be kept, go back to the first slot of
         the base run and lex into that.  */
      if (!pfile->keep_tokens)
        {
          pfile->cur_run = &pfile->base_run;
          result = pfile->base_run.base;
          pfile->cur_token = result + 1;
        }
      result->flags = BOL;
      if (pfile->state.parsing_args == 2)
        result->flags |= PREV_WHITE;
    }
  /* _cpp_get_fresh_line may have popped buffers, so reload.  */
  buffer = pfile->buffer;
  /* Re-entered after a skipped comment to refresh the token's line.  */
 update_tokens_line:
  result->src_loc = pfile->line_table->highest_line;

  /* Re-entered after horizontal whitespace has been skipped.  */
 skipped_white:
  /* Handle any line note that has been reached, unless the buffer is
     overlaid.  */
  if (buffer->cur >= buffer->notes[buffer->cur_note].pos
      && !pfile->overlaid_buffer)
    {
      _cpp_process_line_notes (pfile, false);
      result->src_loc = pfile->line_table->highest_line;
    }
  c = *buffer->cur++;

  LINEMAP_POSITION_FOR_COLUMN (result->src_loc, pfile->line_table,
                               CPP_BUF_COLUMN (buffer, buffer->cur));

  /* Dispatch on the first character of the token.  */
  switch (c)
    {
    case ' ': case '\t': case '\f': case '\v': case '\0':
      result->flags |= PREV_WHITE;
      skip_whitespace (pfile, c);
      goto skipped_white;

    case '\n':
      /* The line counter is only advanced when the newline is not at
         the very end of the buffer.  */
      if (buffer->cur < buffer->rlimit)
        CPP_INCREMENT_LINE (pfile, 0);
      buffer->need_line = true;
      goto fresh_line;

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->type = CPP_NUMBER;
        /* The pedantic lexer is not given NST, so in that case the
           normalization check below sees the initial state.  */
        if (CPP_OPTION(pfile, pedantic_parse_number))
          pedantic_lex_number (pfile, &result->val.str);
        else
          lex_number (pfile, &result->val.str, &nst);
        warn_about_normalization (pfile, result, &nst);
        break;
      }

    case 'L':
      /* 'L' may introduce wide characters or strings.  */
      if (*buffer->cur == '\'' || *buffer->cur == '"')
        {
          lex_string (pfile, result, buffer->cur - 1);
          break;
        }
      /* Fall through.  */

    case '_':
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    case 's': case 't': case 'u': case 'v': case 'w': case 'x':
    case 'y': case 'z':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    case 'G': case 'H': case 'I': case 'J': case 'K':
    case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
    case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
    case 'Y': case 'Z':
      result->type = CPP_NAME;
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->val.node = lex_identifier (pfile, buffer->cur - 1, false,
                                           &nst);
        warn_about_normalization (pfile, result, &nst);
      }

      /* SDCC _asm specific */
      /* handle _asm ... _endasm ;  */
      /* Only done when the preproc_asm option is 0: everything that
         skip_asm_block consumes becomes a single CPP_ASM token.  */
      if (CPP_OPTION(pfile, preproc_asm) == 0 && result->val.node == pfile->spec_nodes.n__asm)
        {
          comment_start = buffer->cur;
          result->type = CPP_ASM;
          skip_asm_block (pfile);
          /* Save the _asm block as a token in its own right.  */
          save_asm (pfile, result, comment_start);
        }
      /* Convert named operators to their proper types.  */
      else if (result->val.node->flags & NODE_OPERATOR)
        {
          result->flags |= NAMED_OP;
          result->type = (enum cpp_ttype) result->val.node->directive_index;
        }
      break;

    case '\'':
    case '"':
      lex_string (pfile, result, buffer->cur - 1);
      break;

    case '/':
      /* A potential block or line comment.  */
      /* COMMENT_START and C refer to the character after the '/'.  */
      comment_start = buffer->cur;
      c = *buffer->cur;

      if (c == '*')
        {
          if (_cpp_skip_block_comment (pfile))
            cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
        }
      else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
                            || cpp_in_system_header (pfile)))
        {
          /* Warn about comments only if pedantically GNUC89, and not
             in system headers.  */
          if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
              && ! buffer->warned_cplusplus_comments)
            {
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "C++ style comments are not allowed in ISO C90");
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "(this will be reported only once per input file)");
              buffer->warned_cplusplus_comments = 1;
            }

          if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
            cpp_error (pfile, CPP_DL_WARNING, "multi-line comment");
        }
      else if (c == '=')
        {
          buffer->cur++;
          result->type = CPP_DIV_EQ;
          break;
        }
      else
        {
          result->type = CPP_DIV;
          break;
        }

      /* Only the two comment branches get here.  A comment that is
         not being saved counts as whitespace before the next token.  */
      if (!pfile->state.save_comments)
        {
          result->flags |= PREV_WHITE;
          goto update_tokens_line;
        }

      /* Save the comment as a token in its own right.  */
      save_comment (pfile, result, comment_start, c);
      break;

    case '<':
      /* When angled headers are expected, '<' opens a header name.  */
      if (pfile->state.angled_headers)
        {
          lex_string (pfile, result, buffer->cur - 1);
          break;
        }

      result->type = CPP_LESS;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_LESS_EQ;
      else if (*buffer->cur == '<')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
        }
      else if (CPP_OPTION (pfile, digraphs))
        {
          /* Digraphs "<:" and "<%".  */
          if (*buffer->cur == ':')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_SQUARE;
            }
          else if (*buffer->cur == '%')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_BRACE;
            }
        }
      break;

    case '>':
      result->type = CPP_GREATER;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_GREATER_EQ;
      else if (*buffer->cur == '>')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
        }
      break;

    case '%':
      result->type = CPP_MOD;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MOD_EQ;
      else if (CPP_OPTION (pfile, digraphs))
        {
          /* Digraphs "%:", "%:%:" and "%>".  */
          if (*buffer->cur == ':')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_HASH;
              if (*buffer->cur == '%' && buffer->cur[1] == ':')
                buffer->cur += 2, result->type = CPP_PASTE;
            }
          else if (*buffer->cur == '>')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_CLOSE_BRACE;
            }
        }
      break;

    case '.':
      result->type = CPP_DOT;
      /* A '.' followed by a digit starts a number.  */
      if (ISDIGIT (*buffer->cur))
        {
          struct normalize_state nst = INITIAL_NORMALIZE_STATE;
          result->type = CPP_NUMBER;
          if (CPP_OPTION(pfile, pedantic_parse_number))
            pedantic_lex_number (pfile, &result->val.str);
          else
            lex_number (pfile, &result->val.str, &nst);
          warn_about_normalization (pfile, result, &nst);
        }
      else if (*buffer->cur == '.' && buffer->cur[1] == '.')
        buffer->cur += 2, result->type = CPP_ELLIPSIS;
      else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_DOT_STAR;
      break;

    case '+':
      result->type = CPP_PLUS;
      if (*buffer->cur == '+')
        buffer->cur++, result->type = CPP_PLUS_PLUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_PLUS_EQ;
      break;

    case '-':
      result->type = CPP_MINUS;
      if (*buffer->cur == '>')
        {
          buffer->cur++;
          result->type = CPP_DEREF;
          if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
            buffer->cur++, result->type = CPP_DEREF_STAR;
        }
      else if (*buffer->cur == '-')
        buffer->cur++, result->type = CPP_MINUS_MINUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MINUS_EQ;
      break;

    case '&':
      result->type = CPP_AND;
      if (*buffer->cur == '&')
        buffer->cur++, result->type = CPP_AND_AND;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_AND_EQ;
      break;

    case '|':
      result->type = CPP_OR;
      if (*buffer->cur == '|')
        buffer->cur++, result->type = CPP_OR_OR;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_OR_EQ;
      break;

    case ':':
      result->type = CPP_COLON;
      if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_SCOPE;
      else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
        {
          buffer->cur++;
          result->flags |= DIGRAPH;
          result->type = CPP_CLOSE_SQUARE;
        }
      break;

    case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
    case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
    case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
    case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
    case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); break;

    case '?': result->type = CPP_QUERY; break;
    case '~': result->type = CPP_COMPL; break;
    case ',': result->type = CPP_COMMA; break;
    case '(': result->type = CPP_OPEN_PAREN; break;
    case ')': result->type = CPP_CLOSE_PAREN; break;
    case '[': result->type = CPP_OPEN_SQUARE; break;
    case ']': result->type = CPP_CLOSE_SQUARE; break;
    case '{': result->type = CPP_OPEN_BRACE; break;
    case '}': result->type = CPP_CLOSE_BRACE; break;
    case ';': result->type = CPP_SEMICOLON; break;

      /* @ is a punctuator in Objective-C.  */
    case '@': result->type = CPP_ATSIGN; break;

    case '$':
    case '\\':
      {
        /* Un-read the character so that forms_identifier_p can look
           at it (presumably it decides whether a '$' or a UCN starts
           an identifier here -- its definition is not in view).  */
        const uchar *base = --buffer->cur;
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;

        if (forms_identifier_p (pfile, true, &nst))
          {
            result->type = CPP_NAME;
            result->val.node = lex_identifier (pfile, base, true, &nst);
            warn_about_normalization (pfile, result, &nst);
            break;
          }
        /* Not an identifier: consume the character again and let the
           default case turn it into a CPP_OTHER token.  */
        buffer->cur++;
      }
      /* Fall through.  */

    default:
      /* Any other character becomes a one-character CPP_OTHER.  */
      create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
      break;
    }

  return result;
}
1579
1580 /* An upper bound on the number of bytes needed to spell TOKEN.
1581    Does not include preceding whitespace.  */
1582 unsigned int
1583 cpp_token_len (const cpp_token *token)
1584 {
1585   unsigned int len;
1586
1587   switch (TOKEN_SPELL (token))
1588     {
1589     default:            len = 4;                                break;
1590     case SPELL_LITERAL: len = token->val.str.len;               break;
1591     case SPELL_IDENT:   len = NODE_LEN (token->val.node) * 10;  break;
1592     }
1593
1594   return len;
1595 }
1596
/* Decode the UTF-8 sequence that starts at NAME and write it to
   BUFFER as a \U escape with eight lowercase hex digits.  Returns the
   number of bytes read from NAME.  (There are always 10 bytes written
   to BUFFER, and no terminating NUL.)  Aborts if a continuation byte
   is ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned long code_point;
  unsigned int lead;
  int seq_len = 0;
  int idx;
  int shift;

  /* The number of leading one bits in the first byte gives the
     length of the sequence.  */
  for (lead = *name; (lead & 0x80) != 0; lead <<= 1)
    seq_len++;

  /* The remaining low bits of the first byte are payload.  */
  code_point = *name & (0x7F >> seq_len);

  /* Each continuation byte must look like 10xxxxxx and contributes
     six more bits.  */
  for (idx = 1; idx < seq_len; idx++)
    {
      name++;

      /* Ill-formed UTF-8.  */
      if ((*name & 0xC0) != 0x80)
        abort ();

      code_point = (code_point << 6) | (*name & 0x3F);
    }

  *buffer++ = '\\';
  *buffer++ = 'U';
  for (shift = 28; shift >= 0; shift -= 4)
    *buffer++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
1630
1631
1632 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
1633    already contain the enough space to hold the token's spelling.
1634    Returns a pointer to the character after the last character written.
1635    FORSTRING is true if this is to be the spelling after translation
1636    phase 1 (this is different for UCNs).
1637    FIXME: Would be nice if we didn't need the PFILE argument.  */
1638 unsigned char *
1639 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
1640                  unsigned char *buffer, bool forstring)
1641 {
1642   switch (TOKEN_SPELL (token))
1643     {
1644     case SPELL_OPERATOR:
1645       {
1646         const unsigned char *spelling;
1647         unsigned char c;
1648
1649         if (token->flags & DIGRAPH)
1650           spelling
1651             = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1652         else if (token->flags & NAMED_OP)
1653           goto spell_ident;
1654         else
1655           spelling = TOKEN_NAME (token);
1656
1657         while ((c = *spelling++) != '\0')
1658           *buffer++ = c;
1659       }
1660       break;
1661
1662     spell_ident:
1663     case SPELL_IDENT:
1664       if (forstring)
1665         {
1666           memcpy (buffer, NODE_NAME (token->val.node),
1667                   NODE_LEN (token->val.node));
1668           buffer += NODE_LEN (token->val.node);
1669         }
1670       else
1671         {
1672           size_t i;
1673           const unsigned char * name = NODE_NAME (token->val.node);
1674
1675           for (i = 0; i < NODE_LEN (token->val.node); i++)
1676             if (name[i] & ~0x7F)
1677               {
1678                 i += utf8_to_ucn (buffer, name + i) - 1;
1679                 buffer += 10;
1680               }
1681             else
1682               *buffer++ = NODE_NAME (token->val.node)[i];
1683         }
1684       break;
1685
1686     case SPELL_LITERAL:
1687       memcpy (buffer, token->val.str.text, token->val.str.len);
1688       buffer += token->val.str.len;
1689       break;
1690
1691     case SPELL_NONE:
1692       cpp_error (pfile, CPP_DL_ICE,
1693                  "unspellable token %s", TOKEN_NAME (token));
1694       break;
1695     }
1696
1697   return buffer;
1698 }
1699
1700 /* Returns TOKEN spelt as a null-terminated string.  The string is
1701    freed when the reader is destroyed.  Useful for diagnostics.  */
1702 unsigned char *
1703 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
1704 {
1705   unsigned int len = cpp_token_len (token) + 1;
1706   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1707
1708   end = cpp_spell_token (pfile, token, start, false);
1709   end[0] = '\0';
1710
1711   return start;
1712 }
1713
1714 /* Used by C front ends, which really should move to using
1715    cpp_token_as_text.  */
1716 const char *
1717 cpp_type2name (enum cpp_ttype type)
1718 {
1719   return (const char *) token_spellings[type].name;
1720 }
1721
1722 /* Writes the spelling of token to FP, without any preceding space.
1723    Separated from cpp_spell_token for efficiency - to avoid stdio
1724    double-buffering.  */
1725 void
1726 cpp_output_token (const cpp_token *token, FILE *fp)
1727 {
1728   switch (TOKEN_SPELL (token))
1729     {
1730     case SPELL_OPERATOR:
1731       {
1732         const unsigned char *spelling;
1733         int c;
1734
1735         if (token->flags & DIGRAPH)
1736           spelling
1737             = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1738         else if (token->flags & NAMED_OP)
1739           goto spell_ident;
1740         else
1741           spelling = TOKEN_NAME (token);
1742
1743         c = *spelling;
1744         do
1745           putc (c, fp);
1746         while ((c = *++spelling) != '\0');
1747       }
1748       break;
1749
1750     spell_ident:
1751     case SPELL_IDENT:
1752       {
1753         size_t i;
1754         const unsigned char * name = NODE_NAME (token->val.node);
1755
1756         for (i = 0; i < NODE_LEN (token->val.node); i++)
1757           if (name[i] & ~0x7F)
1758             {
1759               unsigned char buffer[10];
1760               i += utf8_to_ucn (buffer, name + i) - 1;
1761               fwrite (buffer, 1, 10, fp);
1762             }
1763           else
1764             fputc (NODE_NAME (token->val.node)[i], fp);
1765       }
1766       break;
1767
1768     case SPELL_LITERAL:
1769       fwrite (token->val.str.text, 1, token->val.str.len, fp);
1770       break;
1771
1772     case SPELL_NONE:
1773       /* An error, most probably.  */
1774       break;
1775     }
1776 }
1777
1778 /* Compare two tokens.  */
1779 int
1780 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
1781 {
1782   if (a->type == b->type && a->flags == b->flags)
1783     switch (TOKEN_SPELL (a))
1784       {
1785       default:                  /* Keep compiler happy.  */
1786       case SPELL_OPERATOR:
1787         return 1;
1788       case SPELL_NONE:
1789         return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1790       case SPELL_IDENT:
1791         return a->val.node == b->val.node;
1792       case SPELL_LITERAL:
1793         return (a->val.str.len == b->val.str.len
1794                 && !memcmp (a->val.str.text, b->val.str.text,
1795                             a->val.str.len));
1796       }
1797
1798   return 0;
1799 }
1800
1801 /* Returns nonzero if a space should be inserted to avoid an
1802    accidental token paste for output.  For simplicity, it is
1803    conservative, and occasionally advises a space where one is not
1804    needed, e.g. "." and ".2".  */
1805 int
1806 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
1807                  const cpp_token *token2)
1808 {
1809   enum cpp_ttype a = token1->type, b = token2->type;
1810   cppchar_t c;
1811
1812   if (token1->flags & NAMED_OP)
1813     a = CPP_NAME;
1814   if (token2->flags & NAMED_OP)
1815     b = CPP_NAME;
1816
1817   c = EOF;
1818   if (token2->flags & DIGRAPH)
1819     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1820   else if (token_spellings[b].category == SPELL_OPERATOR)
1821     c = token_spellings[b].name[0];
1822
1823   /* Quickly get everything that can paste with an '='.  */
1824   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1825     return 1;
1826
1827   switch (a)
1828     {
1829     case CPP_GREATER:   return c == '>';
1830     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
1831     case CPP_PLUS:      return c == '+';
1832     case CPP_MINUS:     return c == '-' || c == '>';
1833     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
1834     case CPP_MOD:       return c == ':' || c == '>';
1835     case CPP_AND:       return c == '&';
1836     case CPP_OR:        return c == '|';
1837     case CPP_COLON:     return c == ':' || c == '>';
1838     case CPP_DEREF:     return c == '*';
1839     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
1840     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
1841     case CPP_NAME:      return ((b == CPP_NUMBER
1842                                  && name_p (pfile, &token2->val.str))
1843                                 || b == CPP_NAME
1844                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
1845     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
1846                                 || c == '.' || c == '+' || c == '-');
1847                                       /* UCNs */
1848     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
1849                                  && b == CPP_NAME)
1850                                 || (CPP_OPTION (pfile, objc)
1851                                     && token1->val.str.text[0] == '@'
1852                                     && (b == CPP_NAME || b == CPP_STRING)));
1853     default:            break;
1854     }
1855
1856   return 0;
1857 }
1858
1859 /* Output all the remaining tokens on the current line, and a newline
1860    character, to FP.  Leading whitespace is removed.  If there are
1861    macros, special token padding is not performed.  */
1862 void
1863 cpp_output_line (cpp_reader *pfile, FILE *fp)
1864 {
1865   const cpp_token *token;
1866
1867   token = cpp_get_token (pfile);
1868   while (token->type != CPP_EOF)
1869     {
1870       cpp_output_token (token, fp);
1871       token = cpp_get_token (pfile);
1872       if (token->flags & PREV_WHITE)
1873         putc (' ', fp);
1874     }
1875
1876   putc ('\n', fp);
1877 }
1878
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will allocate.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the room left in BUFF.

   Every macro argument is parenthesized so that an argument which is
   itself an expression (e.g. a conditional) cannot re-associate with
   the operators of the expansion.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
1893
1894 /* Create a new allocation buffer.  Place the control block at the end
1895    of the buffer, so that buffer overflows will cause immediate chaos.  */
1896 static _cpp_buff *
1897 new_buff (size_t len)
1898 {
1899   _cpp_buff *result;
1900   unsigned char *base;
1901
1902   if (len < MIN_BUFF_SIZE)
1903     len = MIN_BUFF_SIZE;
1904   len = CPP_ALIGN (len);
1905
1906   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
1907   result = (_cpp_buff *) (base + len);
1908   result->base = base;
1909   result->cur = base;
1910   result->limit = base + len;
1911   result->next = NULL;
1912   return result;
1913 }
1914
1915 /* Place a chain of unwanted allocation buffers on the free list.  */
1916 void
1917 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
1918 {
1919   _cpp_buff *end = buff;
1920
1921   while (end->next)
1922     end = end->next;
1923   end->next = pfile->free_buffs;
1924   pfile->free_buffs = buff;
1925 }
1926
1927 /* Return a free buffer of size at least MIN_SIZE.  */
1928 _cpp_buff *
1929 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
1930 {
1931   _cpp_buff *result, **p;
1932
1933   for (p = &pfile->free_buffs;; p = &(*p)->next)
1934     {
1935       size_t size;
1936
1937       if (*p == NULL)
1938         return new_buff (min_size);
1939       result = *p;
1940       size = result->limit - result->base;
1941       /* Return a buffer that's big enough, but don't waste one that's
1942          way too big.  */
1943       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
1944         break;
1945     }
1946
1947   *p = result->next;
1948   result->next = NULL;
1949   result->cur = result->base;
1950   return result;
1951 }
1952
1953 /* Creates a new buffer with enough space to hold the uncommitted
1954    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
1955    the excess bytes to the new buffer.  Chains the new buffer after
1956    BUFF, and returns the new buffer.  */
1957 _cpp_buff *
1958 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
1959 {
1960   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
1961   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
1962
1963   buff->next = new_buff;
1964   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
1965   return new_buff;
1966 }
1967
1968 /* Creates a new buffer with enough space to hold the uncommitted
1969    remaining bytes of the buffer pointed to by BUFF, and at least
1970    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
1971    Chains the new buffer before the buffer pointed to by BUFF, and
1972    updates the pointer to point to the new buffer.  */
1973 void
1974 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
1975 {
1976   _cpp_buff *new_buff, *old_buff = *pbuff;
1977   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
1978
1979   new_buff = _cpp_get_buff (pfile, size);
1980   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
1981   new_buff->next = old_buff;
1982   *pbuff = new_buff;
1983 }
1984
1985 /* Free a chain of buffers starting at BUFF.  */
1986 void
1987 _cpp_free_buff (_cpp_buff *buff)
1988 {
1989   _cpp_buff *next;
1990
1991   for (; buff; buff = next)
1992     {
1993       next = buff->next;
1994       free (buff->base);
1995     }
1996 }
1997
1998 /* Allocate permanent, unaligned storage of length LEN.  */
1999 unsigned char *
2000 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
2001 {
2002   _cpp_buff *buff = pfile->u_buff;
2003   unsigned char *result = buff->cur;
2004
2005   if (len > (size_t) (buff->limit - result))
2006     {
2007       buff = _cpp_get_buff (pfile, len);
2008       buff->next = pfile->u_buff;
2009       pfile->u_buff = buff;
2010       result = buff->cur;
2011     }
2012
2013   buff->cur = result + len;
2014   return result;
2015 }
2016
2017 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2018    That buffer is used for growing allocations when saving macro
2019    replacement lists in a #define, and when parsing an answer to an
2020    assertion in #assert, #unassert or #if (and therefore possibly
2021    whilst expanding macros).  It therefore must not be used by any
2022    code that they might call: specifically the lexer and the guts of
2023    the macro expander.
2024
2025    All existing other uses clearly fit this restriction: storing
2026    registered pragmas during initialization.  */
2027 unsigned char *
2028 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
2029 {
2030   _cpp_buff *buff = pfile->a_buff;
2031   unsigned char *result = buff->cur;
2032
2033   if (len > (size_t) (buff->limit - result))
2034     {
2035       buff = _cpp_get_buff (pfile, len);
2036       buff->next = pfile->a_buff;
2037       pfile->a_buff = buff;
2038       result = buff->cur;
2039     }
2040
2041   buff->cur = result + len;
2042   return result;
2043 }
2044
2045 /* Say which field of TOK is in use.  */
2046
2047 enum cpp_token_fld_kind
2048 cpp_token_val_index (cpp_token *tok)
2049 {
2050   switch (TOKEN_SPELL (tok))
2051     {
2052     case SPELL_IDENT:
2053       return CPP_TOKEN_FLD_NODE;
2054     case SPELL_LITERAL:
2055       return CPP_TOKEN_FLD_STR;
2056     case SPELL_NONE:
2057       if (tok->type == CPP_MACRO_ARG)
2058         return CPP_TOKEN_FLD_ARG_NO;
2059       else if (tok->type == CPP_PADDING)
2060         return CPP_TOKEN_FLD_SOURCE;
2061       else if (tok->type == CPP_PRAGMA)
2062         return CPP_TOKEN_FLD_PRAGMA;
2063       /* else fall through */
2064     default:
2065       return CPP_TOKEN_FLD_NONE;
2066     }
2067 }