git.gag.com Git - fw/sdcc/blob - support/cpp2/libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 2, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; if not, write to the Free Software
  20 Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
  21
#include "config.h"
#include "system.h"
#include "cpplib.h"
#include "internal.h"
#include <assert.h>
#include <stdlib.h>
  27
/* Spelling category of a token type; stored in the CATEGORY field of
   each token_spellings entry.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_IDENT,
  SPELL_LITERAL,
  SPELL_NONE
};
  35
/* One entry of the token_spellings table: the spelling category of a
   token type together with a name string for it.  */
struct token_spelling
{
  enum spell_type category;
  const unsigned char *name;
};
  41
/* Spellings of the digraphs.
   NOTE(review): presumably indexed by digraph token type relative to
   the first digraph by code outside this view -- confirm.  */
static const unsigned char *const digraph_spellings[] =
{ U"%:", U"%:%:", U"<:", U":>", U"<%", U"%>" };
  44
/* Build token_spellings by expanding TTYPE_TABLE: an OP (e, s) entry
   becomes a SPELL_OPERATOR whose name is the spelling S; a TK (e, s)
   entry gets category SPELL_<s> and the stringified enumerator E as
   its name.  OP and TK are only defined for this expansion.  */
#define OP(e, s) { SPELL_OPERATOR, U s  },
#define TK(e, s) { SPELL_ ## s,    U #e },
static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
#undef OP
#undef TK

/* Look up the spelling category, respectively the name, recorded in
   token_spellings for the type of TOKEN.  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  53
/* Forward declarations of file-local helpers.  */
static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
static int skip_line_comment (cpp_reader *);
static void skip_whitespace (cpp_reader *, cppchar_t);
static void lex_string (cpp_reader *, cpp_token *, const uchar *);
static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
static void create_literal (cpp_reader *, cpp_token *, const uchar *,
                            unsigned int, enum cpp_ttype);
static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
static int name_p (cpp_reader *, const cpp_string *);
static tokenrun *next_tokenrun (tokenrun *);

static _cpp_buff *new_buff (size_t);
  66
  67
  68 /* Utility routine:
  69
  70    Compares, the token TOKEN to the NUL-terminated string STRING.
  71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  72 int
  73 cpp_ideq (const cpp_token *token, const char *string)
  74 {
  75   if (token->type != CPP_NAME)
  76     return 0;
  77
  78   return !ustrcmp (NODE_NAME (token->val.node), (const uchar *) string);
  79 }
  80
  81 /* Record a note TYPE at byte POS into the current cleaned logical
  82    line.  */
  83 static void
  84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  85 {
  86   if (buffer->notes_used == buffer->notes_cap)
  87     {
  88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  90                                   buffer->notes_cap);
  91     }
  92
  93   buffer->notes[buffer->notes_used].pos = pos;
  94   buffer->notes[buffer->notes_used].type = type;
  95   buffer->notes_used++;
  96 }
  97
/* Returns with a logical line that contains no escaped newlines or
   trigraphs.  This is a time-critical inner loop.

   The line starts at buffer->next_line.  On return buffer->cur and
   buffer->line_base point at its start, the line is terminated by a
   '\n' written at its end, buffer->next_line points just past the
   consumed input, and the note array holds one note per escaped
   newline and per trigraph seen, followed by a sentinel note.

   S is the read pointer and D the write pointer; the cleaned line is
   written over the input in place.  */
void
_cpp_clean_line (cpp_reader *pfile)
{
  cpp_buffer *buffer;
  const uchar *s;
  uchar c, *d, *p;

  buffer = pfile->buffer;
  /* Start a fresh set of notes for this logical line.  */
  buffer->cur_note = buffer->notes_used = 0;
  buffer->cur = buffer->line_base = buffer->next_line;
  buffer->need_line = false;
  /* One before the line start, because both loops pre-increment S.  */
  s = buffer->next_line - 1;

  if (!buffer->from_stage3)
    {
      /* Short circuit for the common case of an un-escaped line with
         no trigraphs.  The primary win here is by not writing any
         data back to memory until we have to.  */
      for (;;)
        {
          c = *++s;
          if (c == '\n' || c == '\r')
            {
              /* D marks where the terminating '\n' will be stored.  */
              d = (uchar *) s;

              if (s == buffer->rlimit)
                goto done;

              /* DOS line ending? */
              if (c == '\r' && s[1] == '\n')
                s++;

              if (s == buffer->rlimit)
                goto done;

              /* Check for an escaped newline: walk P back over any
                 non-vertical whitespace preceding the line ending and
                 see whether a backslash comes before it.  */
              p = d;
              while (p != buffer->next_line && is_nvspace (p[-1]))
                p--;
              if (p == buffer->next_line || p[-1] != '\\')
                goto done;

              /* Have an escaped newline; process it and proceed to
                 the slow path.  The note is of type ' ' when
                 whitespace separates the backslash from the newline,
                 '\\' otherwise.  D is set so that the next store
                 (*++d) in the slow path overwrites the backslash.
                 buffer->next_line is reused as the lower bound for
                 later backward scans; it gets its final value at
                 "done".  */
              add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
              d = p - 2;
              buffer->next_line = p - 1;
              break;
            }
          if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
            {
              /* Have a trigraph.  We may or may not have to convert
                 it.  Add a line note regardless, for -Wtrigraphs.  */
              add_line_note (buffer, s, s[2]);
              if (CPP_OPTION (pfile, trigraphs))
                {
                  /* We do, and that means we have to switch to the
                     slow path.  The replacement character is stored
                     over the first '?' and S skips the other two
                     characters of the trigraph.  */
                  d = (uchar *) s;
                  *d = _cpp_trigraph_map[s[2]];
                  s += 2;
                  break;
                }
            }
        }


      /* Slow path: copy every character from S to D, dropping escaped
         newlines and replacing trigraphs as we go.  */
      for (;;)
        {
          c = *++s;
          *++d = c;

          if (c == '\n' || c == '\r')
            {
              /* Handle DOS line endings.  */
              if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
                s++;
              if (s == buffer->rlimit)
                break;

              /* Escaped?  Same backward scan as in the fast path, but
                 over the already cleaned output.  */
              p = d;
              while (p != buffer->next_line && is_nvspace (p[-1]))
                p--;
              if (p == buffer->next_line || p[-1] != '\\')
                break;

              add_line_note (buffer, p - 1, p != d ? ' ': '\\');
              d = p - 2;
              buffer->next_line = p - 1;
            }
          else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
            {
              /* Add a note regardless, for the benefit of -Wtrigraphs.  */
              add_line_note (buffer, d, s[2]);
              if (CPP_OPTION (pfile, trigraphs))
                {
                  *d = _cpp_trigraph_map[s[2]];
                  s += 2;
                }
            }
        }
    }
  else
    {
      /* No cleaning is done for this buffer: just find the end of the
         line.  */
      do
        s++;
      while (*s != '\n' && *s != '\r');
      d = (uchar *) s;

      /* Handle DOS line endings.  */
      if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
        s++;
    }

 done:
  /* Terminate the cleaned line (this also replaces a '\r').  */
  *d = '\n';
  /* A sentinel note that should never be processed.  */
  add_line_note (buffer, d + 1, '\n');
  buffer->next_line = s + 1;
}
 221
 222 /* Return true if the trigraph indicated by NOTE should be warned
 223    about in a comment.  */
 224 static bool
 225 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 226 {
 227   const uchar *p;
 228
 229   /* Within comments we don't warn about trigraphs, unless the
 230      trigraph forms an escaped newline, as that may change
 231      behavior.  */
 232   if (note->type != '/')
 233     return false;
 234
 235   /* If -trigraphs, then this was an escaped newline iff the next note
 236      is coincident.  */
 237   if (CPP_OPTION (pfile, trigraphs))
 238     return note[1].pos == note->pos;
 239
 240   /* Otherwise, see if this forms an escaped newline.  */
 241   p = note->pos + 3;
 242   while (is_nvspace (*p))
 243     p++;
 244
 245   /* There might have been escaped newlines between the trigraph and the
 246      newline we found.  Hence the position test.  */
 247   return (*p == '\n' && p < note[1].pos);
 248 }
 249
 250 /* Process the notes created by add_line_note as far as the current
 251    location.  */
 252 void
 253 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 254 {
 255   cpp_buffer *buffer = pfile->buffer;
 256
 257   for (;;)
 258     {
 259       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 260       unsigned int col;
 261
 262       if (note->pos > buffer->cur)
 263         break;
 264
 265       buffer->cur_note++;
 266       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 267
 268       if (note->type == '\\' || note->type == ' ')
 269         {
 270           if (note->type == ' ' && !in_comment)
 271             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 272                                  "backslash and newline separated by space");
 273
 274           if (buffer->next_line > buffer->rlimit)
 275             {
 276               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
 277                                    "backslash-newline at end of file");
 278               /* Prevent "no newline at end of file" warning.  */
 279               buffer->next_line = buffer->rlimit;
 280             }
 281
 282           buffer->line_base = note->pos;
 283           CPP_INCREMENT_LINE (pfile, 0);
 284         }
 285       else if (_cpp_trigraph_map[note->type])
 286         {
 287           if (CPP_OPTION (pfile, warn_trigraphs)
 288               && (!in_comment || warn_in_comment (pfile, note)))
 289             {
 290               if (CPP_OPTION (pfile, trigraphs))
 291                 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 292                                      "trigraph ??%c converted to %c",
 293                                      note->type,
 294                                      (int) _cpp_trigraph_map[note->type]);
 295               else
 296                 {
 297                   cpp_error_with_line
 298                     (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 299                      "trigraph ??%c ignored, use -trigraphs to enable",
 300                      note->type);
 301                 }
 302             }
 303         }
 304       else
 305         abort ();
 306     }
 307 }
 308
 309 /* SDCC _asm specific */
 310 /* Skip an _asm ... _endasm block.  We find the end of the comment by
 311    seeing _endasm.  Returns non-zero if _asm terminated by EOF, zero
 312    otherwise.  */
 313 static int
 314 skip_asm_block (cpp_reader *pfile)
 315 {
 316 #define _ENDASM_STR "endasm"
 317 #define _ENDASM_LEN ((sizeof _ENDASM_STR) - 1)
 318
 319   cpp_buffer *buffer = pfile->buffer;
 320   cppchar_t c = EOF;
 321   int prev_space = 0;
 322   int ret = 1;
 323
 324   while (buffer->cur != buffer->rlimit)
 325     {
 326       prev_space = is_space(c);
 327       c = *buffer->cur++;
 328
 329       if (prev_space && c == '_')
 330         {
 331           if (buffer->cur + _ENDASM_LEN <= buffer->rlimit &&
 332             strncmp(buffer->cur, _ENDASM_STR, _ENDASM_LEN) == 0)
 333             {
 334               buffer->cur += _ENDASM_LEN;
 335               ret = 0;
 336               break;
 337             }
 338         }
 339       else if (c == '\n')
 340         {
 341           unsigned int cols;
 342           --buffer->cur;
 343           _cpp_process_line_notes (pfile, true);
 344           if (buffer->next_line >= buffer->rlimit)
 345             return true;
 346           _cpp_clean_line (pfile);
 347
 348           cols = buffer->next_line - buffer->line_base;
 349           CPP_INCREMENT_LINE (pfile, cols);
 350         }
 351     }
 352
 353   _cpp_process_line_notes (pfile, true);
 354   return ret;
 355 }
 356
/* Skip a C-style block comment.  We find the end of the comment by
   seeing if an asterisk is before every '/' we encounter.  Returns
   nonzero if comment terminated by EOF, zero otherwise.

   Buffer->cur points to the initial asterisk of the comment.  On a
   normal return buffer->cur points just past the closing '/'.  */
bool
_cpp_skip_block_comment (cpp_reader *pfile)
{
  cpp_buffer *buffer = pfile->buffer;
  const uchar *cur = buffer->cur;
  uchar c;

  /* Step over the opening asterisk.  If a '/' follows immediately,
     step over it too: otherwise the cur[-2] test below would pair it
     with the opening asterisk and treat the three characters of the
     opening sequence plus '/' as a complete comment.  */
  cur++;
  if (*cur == '/')
    cur++;

  for (;;)
    {
      /* People like decorating comments with '*', so check for '/'
         instead for efficiency.  */
      c = *cur++;

      if (c == '/')
        {
          /* CUR is already past the '/', so cur[-2] is the character
             before it.  */
          if (cur[-2] == '*')
            break;

          /* Warn about potential nested comments, but not if the '/'
             comes immediately before the true comment delimiter.
             Don't bother to get it right across escaped newlines.  */
          if (CPP_OPTION (pfile, warn_comments)
              && cur[0] == '*' && cur[1] != '/')
            {
              /* Publish the position so the column reported with the
                 warning is computed from it.  */
              buffer->cur = cur;
              cpp_error_with_line (pfile, CPP_DL_WARNING,
                                   pfile->line_table->highest_line, CPP_BUF_COL (buffer),
                                   "\"/*\" within comment");
            }
        }
      else if (c == '\n')
        {
          unsigned int cols;
          /* End of the current logical line: process the notes up to
             the newline, then fetch the next line unless the input is
             exhausted, in which case the comment is unterminated.  */
          buffer->cur = cur - 1;
          _cpp_process_line_notes (pfile, true);
          if (buffer->next_line >= buffer->rlimit)
            return true;
          _cpp_clean_line (pfile);

          cols = buffer->next_line - buffer->line_base;
          CPP_INCREMENT_LINE (pfile, cols);

          /* _cpp_clean_line has reset buffer->cur to the new line.  */
          cur = buffer->cur;
        }
    }

  buffer->cur = cur;
  _cpp_process_line_notes (pfile, true);
  return false;
}
 416
 417 /* Skip a C++ line comment, leaving buffer->cur pointing to the
 418    terminating newline.  Handles escaped newlines.  Returns nonzero
 419    if a multiline comment.  */
 420 static int
 421 skip_line_comment (cpp_reader *pfile)
 422 {
 423   cpp_buffer *buffer = pfile->buffer;
 424   unsigned int orig_line = pfile->line_table->highest_line;
 425
 426   while (*buffer->cur != '\n')
 427     buffer->cur++;
 428
 429   _cpp_process_line_notes (pfile, true);
 430   return orig_line != pfile->line_table->highest_line;
 431 }
 432
 433 /* Skips whitespace, saving the next non-whitespace character.  */
 434 static void
 435 skip_whitespace (cpp_reader *pfile, cppchar_t c)
 436 {
 437   cpp_buffer *buffer = pfile->buffer;
 438   bool saw_NUL = false;
 439
 440   do
 441     {
 442       /* Horizontal space always OK.  */
 443       if (c == ' ' || c == '\t')
 444         ;
 445       /* Just \f \v or \0 left.  */
 446       else if (c == '\0')
 447         saw_NUL = true;
 448       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
 449         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
 450                              CPP_BUF_COL (buffer),
 451                              "%s in preprocessing directive",
 452                              c == '\f' ? "form feed" : "vertical tab");
 453
 454       c = *buffer->cur++;
 455     }
 456   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
 457   while (is_nvspace (c));
 458
 459   if (saw_NUL)
 460     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
 461
 462   buffer->cur--;
 463 }
 464
 465 /* See if the characters of a number token are valid in a name (no
 466    '.', '+' or '-').  */
 467 static int
 468 name_p (cpp_reader *pfile, const cpp_string *string)
 469 {
 470   unsigned int i;
 471
 472   for (i = 0; i < string->len; i++)
 473     if (!is_idchar (string->text[i]))
 474       return 0;
 475
 476   return 1;
 477 }
 478
 479 /* After parsing an identifier or other sequence, produce a warning about
 480    sequences not in NFC/NFKC.  */
 481 static void
 482 warn_about_normalization (cpp_reader *pfile,
 483                           const cpp_token *token,
 484                           const struct normalize_state *s)
 485 {
 486   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
 487       && !pfile->state.skipping)
 488     {
 489       /* Make sure that the token is printed using UCNs, even
 490          if we'd otherwise happily print UTF-8.  */
 491       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
 492       size_t sz;
 493
 494       sz = cpp_spell_token (pfile, token, buf, false) - buf;
 495       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
 496         cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
 497                              "`%.*s' is not in NFKC", (int) sz, buf);
 498       else
 499         cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
 500                              "`%.*s' is not in NFC", (int) sz, buf);
 501     }
 502 }
 503
 504 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
 505    an identifier.  FIRST is TRUE if this starts an identifier.  */
 506 static bool
 507 forms_identifier_p (cpp_reader *pfile, int first,
 508                     struct normalize_state *state)
 509 {
 510   cpp_buffer *buffer = pfile->buffer;
 511
 512   if (*buffer->cur == '$')
 513     {
 514       if (!CPP_OPTION (pfile, dollars_in_ident))
 515         return false;
 516
 517       buffer->cur++;
 518       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
 519         {
 520           CPP_OPTION (pfile, warn_dollars) = 0;
 521           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
 522         }
 523
 524       return true;
 525     }
 526
 527   /* Is this a syntactically valid UCN?  */
 528   if (CPP_OPTION (pfile, extended_identifiers)
 529       && *buffer->cur == '\\'
 530       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
 531     {
 532       buffer->cur += 2;
 533       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
 534                           state))
 535         return true;
 536       buffer->cur -= 2;
 537     }
 538
 539   return false;
 540 }
 541
 542 /* Lex an identifier starting at BUFFER->CUR - 1.  */
 543 static cpp_hashnode *
 544 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
 545                 struct normalize_state *nst)
 546 {
 547   cpp_hashnode *result;
 548   const uchar *cur;
 549   unsigned int len;
 550   unsigned int hash = HT_HASHSTEP (0, *base);
 551
 552   cur = pfile->buffer->cur;
 553   if (! starts_ucn)
 554     while (ISIDNUM (*cur))
 555       {
 556         hash = HT_HASHSTEP (hash, *cur);
 557         cur++;
 558       }
 559   pfile->buffer->cur = cur;
 560   if (starts_ucn || forms_identifier_p (pfile, false, nst))
 561     {
 562       /* Slower version for identifiers containing UCNs (or $).  */
 563       do {
 564         while (ISIDNUM (*pfile->buffer->cur))
 565           {
 566             pfile->buffer->cur++;
 567             NORMALIZE_STATE_UPDATE_IDNUM (nst);
 568           }
 569       } while (forms_identifier_p (pfile, false, nst));
 570       result = _cpp_interpret_identifier (pfile, base,
 571                                           pfile->buffer->cur - base);
 572     }
 573   else
 574     {
 575       len = cur - base;
 576       hash = HT_HASHFINISH (hash, len);
 577
 578       result = (cpp_hashnode *)
 579         ht_lookup_with_hash (pfile->hash_table, base, len, hash, HT_ALLOC);
 580     }
 581
 582   /* Rarely, identifiers require diagnostics when lexed.  */
 583   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
 584                         && !pfile->state.skipping, 0))
 585     {
 586       /* It is allowed to poison the same identifier twice.  */
 587       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
 588         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
 589                    NODE_NAME (result));
 590
 591       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
 592          replacement list of a variadic macro.  */
 593       if (result == pfile->spec_nodes.n__VA_ARGS__
 594           && !pfile->state.va_args_ok)
 595         cpp_error (pfile, CPP_DL_PEDWARN,
 596                    "__VA_ARGS__ can only appear in the expansion"
 597                    " of a C99 variadic macro");
 598     }
 599
 600   return result;
 601 }
 602
 603 /* SDCC specific */
 604 /* Pedantic parse a number, beginning with character C, skipping embedded
 605    backslash-newlines.  LEADING_PERIOD is nonzero if there was a "."
 606    before C.  Place the result in NUMBER.  */
 607 static void
 608 pedantic_lex_number (cpp_reader *pfile, cpp_string *number)
 609 {
 610 #define get_effective_char(pfile) (*pfile->buffer->cur++)
 611 #define BACKUP() (--pfile->buffer->cur)
 612
 613   enum num_type_e { NT_DEC, NT_HEX } num_type = NT_DEC;
 614   enum num_part_e { NP_WHOLE, NP_FRACT, NP_EXP, NP_INT_SUFFIX, NP_FLOAT_SUFFIX } num_part = NP_WHOLE;
 615
 616   uchar c = *(pfile->buffer->cur - 1);
 617   struct obstack *stack = &pfile->hash_table->stack;
 618   cpp_buffer *buffer = pfile->buffer;
 619   int len = 0;
 620   int has_whole = 0;
 621   int has_fract = 0;
 622
 623   if ('.' == c)
 624     {
 625       num_part = NP_FRACT;
 626       ++len;
 627       obstack_1grow (stack, '.');
 628       c = get_effective_char(pfile);
 629     }
 630   else
 631     {
 632       if ('0' == c)
 633         {
 634           has_whole = 1;
 635           ++len;
 636           obstack_1grow (stack, c);
 637           c = get_effective_char(pfile);
 638
 639           switch (c)
 640             {
 641             case 'X':
 642             case 'x':
 643               num_type = NT_HEX;
 644               ++len;
 645               obstack_1grow (stack, c);
 646               c = get_effective_char(pfile);
 647               break;
 648
 649             case '.':
 650               num_part = NP_FRACT;
 651               ++len;
 652               obstack_1grow (stack, c);
 653               c = get_effective_char(pfile);
 654               break;
 655             }
 656         }
 657     }
 658
 659   for (; ; )
 660     {
 661       switch (num_part)
 662         {
 663         case NP_WHOLE:
 664           if (NT_DEC == num_type)
 665             {
 666               while (ISDIGIT (c))
 667                 {
 668                   has_whole = 1;
 669                   ++len;
 670                   obstack_1grow (stack, c);
 671                   c = get_effective_char(pfile);
 672                 }
 673
 674               if ('.' == c)
 675                 {
 676                   num_part = NP_FRACT;
 677                   ++len;
 678                   obstack_1grow (stack, c);
 679                   c = get_effective_char(pfile);
 680                   continue;
 681                 }
 682               else if ('E' == c || 'e' == c)
 683                 {
 684                   if (has_whole || has_fract)
 685                   {
 686                     num_part = NP_EXP;
 687                     ++len;
 688                     obstack_1grow (stack, c);
 689                     c = get_effective_char(pfile);
 690                     continue;
 691                   }
 692                   else
 693                     break;
 694                 }
 695             }
 696           else
 697             {
 698               while (ISXDIGIT (c))
 699                 {
 700                   has_whole = 1;
 701                   ++len;
 702                   obstack_1grow (stack, c);
 703                   c = get_effective_char(pfile);
 704                 }
 705
 706               if ('.' == c)
 707                 {
 708                   num_part = NP_FRACT;
 709                   ++len;
 710                   obstack_1grow (stack, c);
 711                   c = get_effective_char(pfile);
 712                   continue;
 713                 }
 714               else if ('P' == c || 'p' == c)
 715                 {
 716                   if (has_whole || has_fract)
 717                     {
 718                       num_part = NP_EXP;
 719                       ++len;
 720                       obstack_1grow (stack, c);
 721                       c = get_effective_char(pfile);
 722                       continue;
 723                     }
 724                   else
 725                     break;
 726                 }
 727             }
 728           num_part = NP_INT_SUFFIX;
 729           continue;
 730
 731         case NP_FRACT:
 732           if (NT_DEC == num_type)
 733             {
 734               while (ISDIGIT (c))
 735                 {
 736                   has_fract = 1;
 737                   ++len;
 738                   obstack_1grow (stack, c);
 739                   c = get_effective_char(pfile);
 740                 }
 741
 742               if ('E' == c || 'e' == c)
 743                 {
 744                   if (has_whole || has_fract)
 745                     {
 746                       num_part = NP_EXP;
 747                       ++len;
 748                       obstack_1grow (stack, c);
 749                       c = get_effective_char(pfile);
 750                       continue;
 751                     }
 752                 }
 753             }
 754           else
 755             {
 756               while (ISXDIGIT (c))
 757                 {
 758                   has_fract = 1;
 759                   ++len;
 760                   obstack_1grow (stack, c);
 761                   c = get_effective_char(pfile);
 762                 }
 763
 764               if ('P' == c || 'p' == c)
 765                 {
 766                   if (has_whole || has_fract)
 767                     {
 768                       num_part = NP_EXP;
 769                       ++len;
 770                       obstack_1grow (stack, c);
 771                       c = get_effective_char(pfile);
 772                       continue;
 773                     }
 774                 }
 775             }
 776           num_part = NP_FLOAT_SUFFIX;
 777           continue;
 778
 779         case NP_EXP:
 780           if ('+' == c || '-' == c)
 781             {
 782               ++len;
 783               obstack_1grow (stack, c);
 784               c = get_effective_char(pfile);
 785             }
 786
 787           while (ISDIGIT (c))
 788             {
 789               ++len;
 790               obstack_1grow (stack, c);
 791               c = get_effective_char(pfile);
 792             }
 793
 794           num_part = NP_FLOAT_SUFFIX;
 795           continue;
 796
 797         case NP_INT_SUFFIX:
 798            if ('L' == c || 'l' == c)
 799             {
 800               uchar prevc = c;
 801
 802               ++len;
 803               obstack_1grow (stack, c);
 804               c = get_effective_char(pfile);
 805
 806               if (c == prevc)
 807                 {
 808                   ++len;
 809                   obstack_1grow (stack, c);
 810                   c = get_effective_char(pfile);
 811                 }
 812             }
 813           else if ('U' == c || 'u' == c)
 814             {
 815               ++len;
 816               obstack_1grow (stack, c);
 817               c = get_effective_char(pfile);
 818             }
 819           break;
 820
 821         case NP_FLOAT_SUFFIX:
 822            if ('F' == c || 'f' == c)
 823             {
 824               ++len;
 825               obstack_1grow (stack, c);
 826               c = get_effective_char(pfile);
 827             }
 828           else if ('L' == c || 'l' == c)
 829             {
 830               ++len;
 831               obstack_1grow (stack, c);
 832               c = get_effective_char(pfile);
 833             }
 834           break;
 835         }
 836       break;
 837     }
 838
 839   /* Step back over the unwanted char.  */
 840   BACKUP ();
 841
 842   number->text = obstack_finish (stack);
 843   number->len = len;
 844 }
 845
 846 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
 847 static void
 848 lex_number (cpp_reader *pfile, cpp_string *number,
 849             struct normalize_state *nst)
 850 {
 851   const uchar *cur;
 852   const uchar *base;
 853   uchar *dest;
 854
 855   base = pfile->buffer->cur - 1;
 856   do
 857     {
 858       cur = pfile->buffer->cur;
 859
 860       /* N.B. ISIDNUM does not include $.  */
 861       while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
 862         {
 863           cur++;
 864           NORMALIZE_STATE_UPDATE_IDNUM (nst);
 865         }
 866
 867       pfile->buffer->cur = cur;
 868     }
 869   while (forms_identifier_p (pfile, false, nst));
 870
 871   number->len = cur - base;
 872   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
 873   memcpy (dest, base, number->len);
 874   dest[number->len] = '\0';
 875   number->text = dest;
 876 }
 877
 878 /* Create a token of type TYPE with a literal spelling.  */
 879 static void
 880 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
 881                 unsigned int len, enum cpp_ttype type)
 882 {
 883   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
 884
 885   memcpy (dest, base, len);
 886   dest[len] = '\0';
 887   token->type = type;
 888   token->val.str.len = len;
 889   token->val.str.text = dest;
 890 }
 891
 892 /* Lexes a string, character constant, or angle-bracketed header file
 893    name.  The stored string contains the spelling, including opening
 894    quote and leading any leading 'L'.  It returns the type of the
 895    literal, or CPP_OTHER if it was not properly terminated.
 896
 897    The spelling is NUL-terminated, but it is not guaranteed that this
 898    is the first NUL since embedded NULs are preserved.  */
 899 static void
 900 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
 901 {
 902   bool saw_NUL = false;
 903   const uchar *cur;
 904   cppchar_t terminator;
 905   enum cpp_ttype type;
 906
 907   cur = base;
 908   terminator = *cur++;
 909   if (terminator == 'L')
 910     terminator = *cur++;
 911   if (terminator == '\"')
 912     type = *base == 'L' ? CPP_WSTRING: CPP_STRING;
 913   else if (terminator == '\'')
 914     type = *base == 'L' ? CPP_WCHAR: CPP_CHAR;
 915   else
 916     terminator = '>', type = CPP_HEADER_NAME;
 917
 918   for (;;)
 919     {
 920       cppchar_t c = *cur++;
 921
 922       /* In #include-style directives, terminators are not escapable.  */
 923       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
 924         cur++;
 925       else if (c == terminator)
 926         break;
 927       else if (c == '\n')
 928         {
 929           cur--;
 930           type = CPP_OTHER;
 931           break;
 932         }
 933       else if (c == '\0')
 934         saw_NUL = true;
 935     }
 936
 937   if (saw_NUL && !pfile->state.skipping)
 938     cpp_error (pfile, CPP_DL_WARNING,
 939                "null character(s) preserved in literal");
 940
 941   pfile->buffer->cur = cur;
 942   create_literal (pfile, token, base, cur - base, type);
 943 }
 944
 945 /* Fixed _WIN32 problem with CR-CR-LF sequences when outputting
 946    comment blocks (when executed with -C option) and
 947    _asm (SDCPP specific) blocks */
 948
 949 /* Count and copy characters from src to dest, excluding CRs:
 950    CRs are automatically generated, because the output is
 951    opened in TEXT mode. If dest == NULL, only count chars */
 952 static unsigned int
 953 copy_text_chars (char *dest, const char *src, unsigned int len)
 954 {
 955   unsigned int n = 0;
 956   const char *p;
 957
 958   for (p = src; p != src + len; ++p)
 959     {
 960       assert(*p != '\0');
 961
 962       if (*p != '\r')
 963         {
 964           if (dest != NULL)
 965             *dest++ = *p;
 966           ++n;
 967         }
 968     }
 969
 970     return n;
 971 }
 972
 973 /* SDCC _asm specific */
 974 /* The stored comment includes the comment start and any terminator.  */
 975 static void
 976 save_asm (cpp_reader *pfile, cpp_token *token, const unsigned char *from)
 977 {
 978 #define _ASM_STR  "_asm"
 979 #define _ASM_LEN  ((sizeof _ASM_STR) - 1)
 980
 981   unsigned char *buffer;
 982   unsigned int text_len, len;
 983
 984   len = pfile->buffer->cur - from;
 985   /* + _ASM_LEN for the initial '_asm'.  */
 986   text_len = copy_text_chars (NULL, from, len) + _ASM_LEN;
 987   buffer = _cpp_unaligned_alloc (pfile, text_len);
 988
 989
 990   token->type = CPP_ASM;
 991   token->val.str.len = text_len;
 992   token->val.str.text = buffer;
 993
 994   memcpy (buffer, _ASM_STR, _ASM_LEN);
 995   copy_text_chars (buffer + _ASM_LEN, from, len);
 996 }
 997
 998 /* The stored comment includes the comment start and any terminator.  */
 999 static void
1000 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1001               cppchar_t type)
1002 {
1003   unsigned char *buffer;
1004   unsigned int len, clen;
1005
1006   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
1007
1008   /* C++ comments probably (not definitely) have moved past a new
1009      line, which we don't want to save in the comment.  */
1010   if (is_vspace (pfile->buffer->cur[-1]))
1011     len--;
1012
1013   /* If we are currently in a directive, then we need to store all
1014      C++ comments as C comments internally, and so we need to
1015      allocate a little extra space in that case.
1016
1017      Note that the only time we encounter a directive here is
1018      when we are saving comments in a "#define".  */
1019   clen = (pfile->state.in_directive && type == '/') ? len + 2 : len;
1020
1021   buffer = _cpp_unaligned_alloc (pfile, clen);
1022
1023   token->type = CPP_COMMENT;
1024   token->val.str.len = clen;
1025   token->val.str.text = buffer;
1026
1027   buffer[0] = '/';
1028   copy_text_chars (buffer + 1, from, len);
1029
1030   /* Finish conversion to a C comment, if necessary.  */
1031   if (pfile->state.in_directive && type == '/')
1032     {
1033       buffer[1] = '*';
1034       buffer[clen - 2] = '*';
1035       buffer[clen - 1] = '/';
1036     }
1037 }
1038
1039 /* Allocate COUNT tokens for RUN.  */
1040 void
1041 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1042 {
1043   run->base = XNEWVEC (cpp_token, count);
1044   run->limit = run->base + count;
1045   run->next = NULL;
1046 }
1047
1048 /* Returns the next tokenrun, or creates one if there is none.  */
1049 static tokenrun *
1050 next_tokenrun (tokenrun *run)
1051 {
1052   if (run->next == NULL)
1053     {
1054       run->next = XNEW (tokenrun);
1055       run->next->prev = run;
1056       _cpp_init_tokenrun (run->next, 250);
1057     }
1058
1059   return run->next;
1060 }
1061
1062 /* Allocate a single token that is invalidated at the same time as the
1063    rest of the tokens on the line.  Has its line and col set to the
1064    same as the last lexed token, so that diagnostics appear in the
1065    right place.  */
1066 cpp_token *
1067 _cpp_temp_token (cpp_reader *pfile)
1068 {
1069   cpp_token *old, *result;
1070
1071   old = pfile->cur_token - 1;
1072   if (pfile->cur_token == pfile->cur_run->limit)
1073     {
1074       pfile->cur_run = next_tokenrun (pfile->cur_run);
1075       pfile->cur_token = pfile->cur_run->base;
1076     }
1077
1078   result = pfile->cur_token++;
1079   result->src_loc = old->src_loc;
1080   return result;
1081 }
1082
1083 /* Lex a token into RESULT (external interface).  Takes care of issues
1084    like directive handling, token lookahead, multiple include
1085    optimization and skipping.  */
1086 const cpp_token *
1087 _cpp_lex_token (cpp_reader *pfile)
1088 {
1089   cpp_token *result;
1090
1091   for (;;)
1092     {
1093       if (pfile->cur_token == pfile->cur_run->limit)
1094         {
1095           pfile->cur_run = next_tokenrun (pfile->cur_run);
1096           pfile->cur_token = pfile->cur_run->base;
1097         }
1098
1099       if (pfile->lookaheads)
1100         {
1101           pfile->lookaheads--;
1102           result = pfile->cur_token++;
1103         }
1104       else
1105         result = _cpp_lex_direct (pfile);
1106
1107       if (result->flags & BOL)
1108         {
1109           /* Is this a directive.  If _cpp_handle_directive returns
1110              false, it is an assembler #.  */
1111           if (result->type == CPP_HASH
1112               /* 6.10.3 p 11: Directives in a list of macro arguments
1113                  gives undefined behavior.  This implementation
1114                  handles the directive as normal.  */
1115               && pfile->state.parsing_args != 1
1116               && _cpp_handle_directive (pfile, result->flags & PREV_WHITE))
1117             {
1118               if (pfile->directive_result.type == CPP_PADDING)
1119                 continue;
1120               else
1121                 {
1122                   result = &pfile->directive_result;
1123                   break;
1124                 }
1125             }
1126
1127           if (pfile->cb.line_change && !pfile->state.skipping)
1128             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
1129         }
1130
1131       /* We don't skip tokens in directives.  */
1132       if (pfile->state.in_directive)
1133         break;
1134
1135       /* Outside a directive, invalidate controlling macros.  At file
1136          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
1137          get here and MI optimization works.  */
1138       pfile->mi_valid = false;
1139
1140       if (!pfile->state.skipping || result->type == CPP_EOF)
1141         break;
1142     }
1143
1144   return result;
1145 }
1146
1147 /* Returns true if a fresh line has been loaded.  */
1148 bool
1149 _cpp_get_fresh_line (cpp_reader *pfile)
1150 {
1151   int return_at_eof;
1152
1153   /* We can't get a new line until we leave the current directive.  */
1154   if (pfile->state.in_directive)
1155     return false;
1156
1157   for (;;)
1158     {
1159       cpp_buffer *buffer = pfile->buffer;
1160
1161       if (!buffer->need_line)
1162         return true;
1163
1164       if (buffer->next_line < buffer->rlimit)
1165         {
1166           _cpp_clean_line (pfile);
1167           return true;
1168         }
1169
1170       /* First, get out of parsing arguments state.  */
1171       if (pfile->state.parsing_args)
1172         return false;
1173
1174       /* End of buffer.  Non-empty files should end in a newline.  */
1175       if (buffer->buf != buffer->rlimit
1176           && buffer->next_line > buffer->rlimit
1177           && !buffer->from_stage3)
1178         {
1179           /* Only warn once.  */
1180           buffer->next_line = buffer->rlimit;
1181           cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1182                                CPP_BUF_COLUMN (buffer, buffer->cur),
1183                                "no newline at end of file");
1184         }
1185
1186       return_at_eof = buffer->return_at_eof;
1187       _cpp_pop_buffer (pfile);
1188       if (pfile->buffer == NULL || return_at_eof)
1189         return false;
1190     }
1191 }
1192
/* If the next character of the local BUFFER is CHAR, consume it and
   give the local RESULT the type THEN_TYPE; otherwise consume nothing
   and give it ELSE_TYPE.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == (CHAR))                       \
        {                                               \
          buffer->cur++;                                \
          result->type = (THEN_TYPE);                   \
        }                                               \
      else                                              \
        result->type = (ELSE_TYPE);                     \
    }                                                   \
  while (0)
1201
/* Lex a token into pfile->cur_token, which is also incremented, to
   get diagnostics pointing to the correct location.

   Does not handle issues such as token lookahead, multiple-include
   optimization, directives, skipping etc.  This function is only
   suitable for use by _cpp_lex_token, and in special cases like
   lex_expansion_token which doesn't care for any of these issues.

   When meeting a newline, returns CPP_EOF if parsing a directive,
   otherwise returns to the start of the token buffer if permissible.
   Returns a pointer to the lexed token.

   Three labels drive the function: fresh_line restarts on a new
   line, update_tokens_line re-reads the token's line after a skipped
   comment, and skipped_white resumes after horizontal whitespace.  */
cpp_token *
_cpp_lex_direct (cpp_reader *pfile)
{
  cppchar_t c;
  cpp_buffer *buffer;
  const unsigned char *comment_start;
  cpp_token *result = pfile->cur_token++;

 fresh_line:
  result->flags = 0;
  buffer = pfile->buffer;
  if (buffer->need_line)
    {
      if (!_cpp_get_fresh_line (pfile))
        {
          result->type = CPP_EOF;
          if (!pfile->state.in_directive)
            {
              /* Tell the compiler the line number of the EOF token.  */
              result->src_loc = pfile->line_table->highest_line;
              result->flags = BOL;
            }
          return result;
        }
      /* Unless tokens must be kept, restart at the beginning of the
         base token run for the new line.  */
      if (!pfile->keep_tokens)
        {
          pfile->cur_run = &pfile->base_run;
          result = pfile->base_run.base;
          pfile->cur_token = result + 1;
        }
      result->flags = BOL;
      if (pfile->state.parsing_args == 2)
        result->flags |= PREV_WHITE;
    }
  /* _cpp_get_fresh_line may have popped buffers, so reload.  */
  buffer = pfile->buffer;
 update_tokens_line:
  result->src_loc = pfile->line_table->highest_line;

 skipped_white:
  if (buffer->cur >= buffer->notes[buffer->cur_note].pos
      && !pfile->overlaid_buffer)
    {
      _cpp_process_line_notes (pfile, false);
      result->src_loc = pfile->line_table->highest_line;
    }
  c = *buffer->cur++;

  LINEMAP_POSITION_FOR_COLUMN (result->src_loc, pfile->line_table,
                               CPP_BUF_COLUMN (buffer, buffer->cur));

  switch (c)
    {
    case ' ': case '\t': case '\f': case '\v': case '\0':
      result->flags |= PREV_WHITE;
      skip_whitespace (pfile, c);
      goto skipped_white;

    case '\n':
      if (buffer->cur < buffer->rlimit)
        CPP_INCREMENT_LINE (pfile, 0);
      buffer->need_line = true;
      goto fresh_line;

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->type = CPP_NUMBER;
        if (CPP_OPTION(pfile, pedantic_parse_number))
          pedantic_lex_number (pfile, &result->val.str);
        else
          lex_number (pfile, &result->val.str, &nst);
        warn_about_normalization (pfile, result, &nst);
        break;
      }

    case 'L':
      /* 'L' may introduce wide characters or strings.  */
      if (*buffer->cur == '\'' || *buffer->cur == '"')
        {
          lex_string (pfile, result, buffer->cur - 1);
          break;
        }
      /* Fall through.  */

    case '_':
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    case 's': case 't': case 'u': case 'v': case 'w': case 'x':
    case 'y': case 'z':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    case 'G': case 'H': case 'I': case 'J': case 'K':
    case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
    case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
    case 'Y': case 'Z':
      result->type = CPP_NAME;
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->val.node = lex_identifier (pfile, buffer->cur - 1, false,
                                           &nst);
        warn_about_normalization (pfile, result, &nst);
      }

      /* SDCC _asm specific */
      /* handle _asm ... _endasm ;  */
      if (CPP_OPTION(pfile, preproc_asm) == 0 && result->val.node == pfile->spec_nodes.n__asm)
        {
          /* The saved text starts right after the _asm identifier.  */
          comment_start = buffer->cur;
          result->type = CPP_ASM;
          skip_asm_block (pfile);
          /* Save the _asm block as a token in its own right.  */
          save_asm (pfile, result, comment_start);
        }
      /* Convert named operators to their proper types.  */
      else if (result->val.node->flags & NODE_OPERATOR)
        {
          result->flags |= NAMED_OP;
          result->type = (enum cpp_ttype) result->val.node->directive_index;
        }
      break;

    case '\'':
    case '"':
      lex_string (pfile, result, buffer->cur - 1);
      break;

    case '/':
      /* A potential block or line comment.  */
      comment_start = buffer->cur;
      c = *buffer->cur;

      if (c == '*')
        {
          if (_cpp_skip_block_comment (pfile))
            cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
        }
      else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
                            || cpp_in_system_header (pfile)))
        {
          /* Warn about comments only if pedantically GNUC89, and not
             in system headers.  */
          if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
              && ! buffer->warned_cplusplus_comments)
            {
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "C++ style comments are not allowed in ISO C90");
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "(this will be reported only once per input file)");
              buffer->warned_cplusplus_comments = 1;
            }

          if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
            cpp_error (pfile, CPP_DL_WARNING, "multi-line comment");
        }
      else if (c == '=')
        {
          buffer->cur++;
          result->type = CPP_DIV_EQ;
          break;
        }
      else
        {
          result->type = CPP_DIV;
          break;
        }

      /* Only the two comment branches reach this point.  A comment
         that is not saved counts as whitespace before the next
         token.  */
      if (!pfile->state.save_comments)
        {
          result->flags |= PREV_WHITE;
          goto update_tokens_line;
        }

      /* Save the comment as a token in its own right.  */
      save_comment (pfile, result, comment_start, c);
      break;

    case '<':
      if (pfile->state.angled_headers)
        {
          lex_string (pfile, result, buffer->cur - 1);
          break;
        }

      result->type = CPP_LESS;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_LESS_EQ;
      else if (*buffer->cur == '<')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
        }
      else if (*buffer->cur == '?' && CPP_OPTION (pfile, cplusplus))
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_MIN_EQ, CPP_MIN);
        }
      else if (CPP_OPTION (pfile, digraphs))
        {
          if (*buffer->cur == ':')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_SQUARE;
            }
          else if (*buffer->cur == '%')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_BRACE;
            }
        }
      break;

    case '>':
      result->type = CPP_GREATER;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_GREATER_EQ;
      else if (*buffer->cur == '>')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
        }
      else if (*buffer->cur == '?' && CPP_OPTION (pfile, cplusplus))
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_MAX_EQ, CPP_MAX);
        }
      break;

    case '%':
      result->type = CPP_MOD;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MOD_EQ;
      else if (CPP_OPTION (pfile, digraphs))
        {
          if (*buffer->cur == ':')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_HASH;
              if (*buffer->cur == '%' && buffer->cur[1] == ':')
                buffer->cur += 2, result->type = CPP_PASTE;
            }
          else if (*buffer->cur == '>')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_CLOSE_BRACE;
            }
        }
      break;

    case '.':
      result->type = CPP_DOT;
      if (ISDIGIT (*buffer->cur))
        {
          struct normalize_state nst = INITIAL_NORMALIZE_STATE;
          result->type = CPP_NUMBER;
          if (CPP_OPTION(pfile, pedantic_parse_number))
            pedantic_lex_number (pfile, &result->val.str);
          else
            lex_number (pfile, &result->val.str, &nst);
          warn_about_normalization (pfile, result, &nst);
        }
      else if (*buffer->cur == '.' && buffer->cur[1] == '.')
        buffer->cur += 2, result->type = CPP_ELLIPSIS;
      else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_DOT_STAR;
      break;

    case '+':
      result->type = CPP_PLUS;
      if (*buffer->cur == '+')
        buffer->cur++, result->type = CPP_PLUS_PLUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_PLUS_EQ;
      break;

    case '-':
      result->type = CPP_MINUS;
      if (*buffer->cur == '>')
        {
          buffer->cur++;
          result->type = CPP_DEREF;
          if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
            buffer->cur++, result->type = CPP_DEREF_STAR;
        }
      else if (*buffer->cur == '-')
        buffer->cur++, result->type = CPP_MINUS_MINUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MINUS_EQ;
      break;

    case '&':
      result->type = CPP_AND;
      if (*buffer->cur == '&')
        buffer->cur++, result->type = CPP_AND_AND;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_AND_EQ;
      break;

    case '|':
      result->type = CPP_OR;
      if (*buffer->cur == '|')
        buffer->cur++, result->type = CPP_OR_OR;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_OR_EQ;
      break;

    case ':':
      result->type = CPP_COLON;
      if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_SCOPE;
      else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
        {
          buffer->cur++;
          result->flags |= DIGRAPH;
          result->type = CPP_CLOSE_SQUARE;
        }
      break;

    case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
    case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
    case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
    case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
    case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); break;

    case '?': result->type = CPP_QUERY; break;
    case '~': result->type = CPP_COMPL; break;
    case ',': result->type = CPP_COMMA; break;
    case '(': result->type = CPP_OPEN_PAREN; break;
    case ')': result->type = CPP_CLOSE_PAREN; break;
    case '[': result->type = CPP_OPEN_SQUARE; break;
    case ']': result->type = CPP_CLOSE_SQUARE; break;
    case '{': result->type = CPP_OPEN_BRACE; break;
    case '}': result->type = CPP_CLOSE_BRACE; break;
    case ';': result->type = CPP_SEMICOLON; break;

      /* @ is a punctuator in Objective-C.  */
    case '@': result->type = CPP_ATSIGN; break;

    case '$':
    case '\\':
      {
        /* Back up so that forms_identifier_p sees the character.  */
        const uchar *base = --buffer->cur;
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;

        if (forms_identifier_p (pfile, true, &nst))
          {
            result->type = CPP_NAME;
            result->val.node = lex_identifier (pfile, base, true, &nst);
            warn_about_normalization (pfile, result, &nst);
            break;
          }
        /* Not an identifier start: consume the character again.  */
        buffer->cur++;
      }
      /* Fall through.  */

    default:
      /* Any other character becomes a one-character CPP_OTHER.  */
      create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
      break;
    }

  return result;
}
1578
1579 /* An upper bound on the number of bytes needed to spell TOKEN.
1580    Does not include preceding whitespace.  */
1581 unsigned int
1582 cpp_token_len (const cpp_token *token)
1583 {
1584   unsigned int len;
1585
1586   switch (TOKEN_SPELL (token))
1587     {
1588     default:            len = 4;                                break;
1589     case SPELL_LITERAL: len = token->val.str.len;               break;
1590     case SPELL_IDENT:   len = NODE_LEN (token->val.node) * 10;  break;
1591     }
1592
1593   return len;
1594 }
1595
/* Decode the UTF-8 sequence that starts at NAME and write it to
   BUFFER as a \U escape with eight lower-case hex digits, so exactly
   10 bytes are written and no terminating NUL is added.  Returns the
   number of bytes of NAME that were consumed.  Aborts if a byte after
   the first is not a UTF-8 continuation byte.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned long code_point;
  unsigned int lead;
  int seq_len = 0;
  int idx;
  int shift;

  /* The number of leading one bits in the first byte is the length
     of the sequence.  */
  lead = *name;
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* Payload bits of the first byte, then six from each of the rest.  */
  code_point = *name & (0x7F >> seq_len);
  for (idx = 1; idx < seq_len; idx++)
    {
      const unsigned char cont = name[idx];

      /* Ill-formed UTF-8.  */
      if ((cont & 0xC0) != 0x80)
        abort ();

      code_point = (code_point << 6) | (cont & 0x3F);
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  for (idx = 2, shift = 28; shift >= 0; idx++, shift -= 4)
    buffer[idx] = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
1629
1630
1631 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
1632    already contain the enough space to hold the token's spelling.
1633    Returns a pointer to the character after the last character written.
1634    FORSTRING is true if this is to be the spelling after translation
1635    phase 1 (this is different for UCNs).
1636    FIXME: Would be nice if we didn't need the PFILE argument.  */
1637 unsigned char *
1638 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
1639                  unsigned char *buffer, bool forstring)
1640 {
1641   switch (TOKEN_SPELL (token))
1642     {
1643     case SPELL_OPERATOR:
1644       {
1645         const unsigned char *spelling;
1646         unsigned char c;
1647
1648         if (token->flags & DIGRAPH)
1649           spelling
1650             = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1651         else if (token->flags & NAMED_OP)
1652           goto spell_ident;
1653         else
1654           spelling = TOKEN_NAME (token);
1655
1656         while ((c = *spelling++) != '\0')
1657           *buffer++ = c;
1658       }
1659       break;
1660
1661     spell_ident:
1662     case SPELL_IDENT:
1663       if (forstring)
1664         {
1665           memcpy (buffer, NODE_NAME (token->val.node),
1666                   NODE_LEN (token->val.node));
1667           buffer += NODE_LEN (token->val.node);
1668         }
1669       else
1670         {
1671           size_t i;
1672           const unsigned char * name = NODE_NAME (token->val.node);
1673
1674           for (i = 0; i < NODE_LEN (token->val.node); i++)
1675             if (name[i] & ~0x7F)
1676               {
1677                 i += utf8_to_ucn (buffer, name + i) - 1;
1678                 buffer += 10;
1679               }
1680             else
1681               *buffer++ = NODE_NAME (token->val.node)[i];
1682         }
1683       break;
1684
1685     case SPELL_LITERAL:
1686       memcpy (buffer, token->val.str.text, token->val.str.len);
1687       buffer += token->val.str.len;
1688       break;
1689
1690     case SPELL_NONE:
1691       cpp_error (pfile, CPP_DL_ICE,
1692                  "unspellable token %s", TOKEN_NAME (token));
1693       break;
1694     }
1695
1696   return buffer;
1697 }
1698
1699 /* Returns TOKEN spelt as a null-terminated string.  The string is
1700    freed when the reader is destroyed.  Useful for diagnostics.  */
1701 unsigned char *
1702 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
1703 {
1704   unsigned int len = cpp_token_len (token) + 1;
1705   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1706
1707   end = cpp_spell_token (pfile, token, start, false);
1708   end[0] = '\0';
1709
1710   return start;
1711 }
1712
1713 /* Used by C front ends, which really should move to using
1714    cpp_token_as_text.  */
1715 const char *
1716 cpp_type2name (enum cpp_ttype type)
1717 {
1718   return (const char *) token_spellings[type].name;
1719 }
1720
1721 /* Writes the spelling of token to FP, without any preceding space.
1722    Separated from cpp_spell_token for efficiency - to avoid stdio
1723    double-buffering.  */
1724 void
1725 cpp_output_token (const cpp_token *token, FILE *fp)
1726 {
1727   switch (TOKEN_SPELL (token))
1728     {
1729     case SPELL_OPERATOR:
1730       {
1731         const unsigned char *spelling;
1732         int c;
1733
1734         if (token->flags & DIGRAPH)
1735           spelling
1736             = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1737         else if (token->flags & NAMED_OP)
1738           goto spell_ident;
1739         else
1740           spelling = TOKEN_NAME (token);
1741
1742         c = *spelling;
1743         do
1744           putc (c, fp);
1745         while ((c = *++spelling) != '\0');
1746       }
1747       break;
1748
1749     spell_ident:
1750     case SPELL_IDENT:
1751       {
1752         size_t i;
1753         const unsigned char * name = NODE_NAME (token->val.node);
1754
1755         for (i = 0; i < NODE_LEN (token->val.node); i++)
1756           if (name[i] & ~0x7F)
1757             {
1758               unsigned char buffer[10];
1759               i += utf8_to_ucn (buffer, name + i) - 1;
1760               fwrite (buffer, 1, 10, fp);
1761             }
1762           else
1763             fputc (NODE_NAME (token->val.node)[i], fp);
1764       }
1765       break;
1766
1767     case SPELL_LITERAL:
1768       fwrite (token->val.str.text, 1, token->val.str.len, fp);
1769       break;
1770
1771     case SPELL_NONE:
1772       /* An error, most probably.  */
1773       break;
1774     }
1775 }
1776
1777 /* Compare two tokens.  */
1778 int
1779 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
1780 {
1781   if (a->type == b->type && a->flags == b->flags)
1782     switch (TOKEN_SPELL (a))
1783       {
1784       default:                  /* Keep compiler happy.  */
1785       case SPELL_OPERATOR:
1786         return 1;
1787       case SPELL_NONE:
1788         return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1789       case SPELL_IDENT:
1790         return a->val.node == b->val.node;
1791       case SPELL_LITERAL:
1792         return (a->val.str.len == b->val.str.len
1793                 && !memcmp (a->val.str.text, b->val.str.text,
1794                             a->val.str.len));
1795       }
1796
1797   return 0;
1798 }
1799
1800 /* Returns nonzero if a space should be inserted to avoid an
1801    accidental token paste for output.  For simplicity, it is
1802    conservative, and occasionally advises a space where one is not
1803    needed, e.g. "." and ".2".  */
1804 int
1805 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
1806                  const cpp_token *token2)
1807 {
1808   enum cpp_ttype a = token1->type, b = token2->type;
1809   cppchar_t c;
1810
1811   if (token1->flags & NAMED_OP)
1812     a = CPP_NAME;
1813   if (token2->flags & NAMED_OP)
1814     b = CPP_NAME;
1815
1816   c = EOF;
1817   if (token2->flags & DIGRAPH)
1818     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1819   else if (token_spellings[b].category == SPELL_OPERATOR)
1820     c = token_spellings[b].name[0];
1821
1822   /* Quickly get everything that can paste with an '='.  */
1823   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1824     return 1;
1825
1826   switch (a)
1827     {
1828     case CPP_GREATER:   return c == '>' || c == '?';
1829     case CPP_LESS:      return c == '<' || c == '?' || c == '%' || c == ':';
1830     case CPP_PLUS:      return c == '+';
1831     case CPP_MINUS:     return c == '-' || c == '>';
1832     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
1833     case CPP_MOD:       return c == ':' || c == '>';
1834     case CPP_AND:       return c == '&';
1835     case CPP_OR:        return c == '|';
1836     case CPP_COLON:     return c == ':' || c == '>';
1837     case CPP_DEREF:     return c == '*';
1838     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
1839     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
1840     case CPP_NAME:      return ((b == CPP_NUMBER
1841                                  && name_p (pfile, &token2->val.str))
1842                                 || b == CPP_NAME
1843                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
1844     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
1845                                 || c == '.' || c == '+' || c == '-');
1846                                       /* UCNs */
1847     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
1848                                  && b == CPP_NAME)
1849                                 || (CPP_OPTION (pfile, objc)
1850                                     && token1->val.str.text[0] == '@'
1851                                     && (b == CPP_NAME || b == CPP_STRING)));
1852     default:            break;
1853     }
1854
1855   return 0;
1856 }
1857
1858 /* Output all the remaining tokens on the current line, and a newline
1859    character, to FP.  Leading whitespace is removed.  If there are
1860    macros, special token padding is not performed.  */
1861 void
1862 cpp_output_line (cpp_reader *pfile, FILE *fp)
1863 {
1864   const cpp_token *token;
1865
1866   token = cpp_get_token (pfile);
1867   while (token->type != CPP_EOF)
1868     {
1869       cpp_output_token (token, fp);
1870       token = cpp_get_token (pfile);
1871       if (token->flags & PREV_WHITE)
1872         putc (' ', fp);
1873     }
1874
1875   putc ('\n', fp);
1876 }
1877
/* Memory buffers.  Changing the constant and the two sizing macros
   below can have a dramatic effect on performance.  The values here
   are reasonable defaults, but might be tuned.  If you adjust them,
   be sure to test across a range of uses of cpplib, including heavy
   nested function-like macro expansion.  Also check the change in
   peak memory usage (NJAMD is a good tool for this).

   MIN_BUFF_SIZE is the smallest payload size ever allocated.

   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free-list buffer
   that will be handed out for a request of MIN_SIZE bytes.

   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   BUFF must be extended: MIN_EXTRA plus twice the number of bytes
   between BUFF->cur and BUFF->limit.  Every macro argument is
   parenthesized so that an expression argument (for instance a
   conditional expression) cannot change the grouping of the
   expansion.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

/* A request for zero bytes must still be allowed to reuse a
   minimum-sized buffer.  */
#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
1892
1893 /* Create a new allocation buffer.  Place the control block at the end
1894    of the buffer, so that buffer overflows will cause immediate chaos.  */
1895 static _cpp_buff *
1896 new_buff (size_t len)
1897 {
1898   _cpp_buff *result;
1899   unsigned char *base;
1900
1901   if (len < MIN_BUFF_SIZE)
1902     len = MIN_BUFF_SIZE;
1903   len = CPP_ALIGN (len);
1904
1905   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
1906   result = (_cpp_buff *) (base + len);
1907   result->base = base;
1908   result->cur = base;
1909   result->limit = base + len;
1910   result->next = NULL;
1911   return result;
1912 }
1913
1914 /* Place a chain of unwanted allocation buffers on the free list.  */
1915 void
1916 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
1917 {
1918   _cpp_buff *end = buff;
1919
1920   while (end->next)
1921     end = end->next;
1922   end->next = pfile->free_buffs;
1923   pfile->free_buffs = buff;
1924 }
1925
1926 /* Return a free buffer of size at least MIN_SIZE.  */
1927 _cpp_buff *
1928 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
1929 {
1930   _cpp_buff *result, **p;
1931
1932   for (p = &pfile->free_buffs;; p = &(*p)->next)
1933     {
1934       size_t size;
1935
1936       if (*p == NULL)
1937         return new_buff (min_size);
1938       result = *p;
1939       size = result->limit - result->base;
1940       /* Return a buffer that's big enough, but don't waste one that's
1941          way too big.  */
1942       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
1943         break;
1944     }
1945
1946   *p = result->next;
1947   result->next = NULL;
1948   result->cur = result->base;
1949   return result;
1950 }
1951
1952 /* Creates a new buffer with enough space to hold the uncommitted
1953    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
1954    the excess bytes to the new buffer.  Chains the new buffer after
1955    BUFF, and returns the new buffer.  */
1956 _cpp_buff *
1957 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
1958 {
1959   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
1960   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
1961
1962   buff->next = new_buff;
1963   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
1964   return new_buff;
1965 }
1966
1967 /* Creates a new buffer with enough space to hold the uncommitted
1968    remaining bytes of the buffer pointed to by BUFF, and at least
1969    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
1970    Chains the new buffer before the buffer pointed to by BUFF, and
1971    updates the pointer to point to the new buffer.  */
1972 void
1973 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
1974 {
1975   _cpp_buff *new_buff, *old_buff = *pbuff;
1976   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
1977
1978   new_buff = _cpp_get_buff (pfile, size);
1979   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
1980   new_buff->next = old_buff;
1981   *pbuff = new_buff;
1982 }
1983
1984 /* Free a chain of buffers starting at BUFF.  */
1985 void
1986 _cpp_free_buff (_cpp_buff *buff)
1987 {
1988   _cpp_buff *next;
1989
1990   for (; buff; buff = next)
1991     {
1992       next = buff->next;
1993       free (buff->base);
1994     }
1995 }
1996
1997 /* Allocate permanent, unaligned storage of length LEN.  */
1998 unsigned char *
1999 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
2000 {
2001   _cpp_buff *buff = pfile->u_buff;
2002   unsigned char *result = buff->cur;
2003
2004   if (len > (size_t) (buff->limit - result))
2005     {
2006       buff = _cpp_get_buff (pfile, len);
2007       buff->next = pfile->u_buff;
2008       pfile->u_buff = buff;
2009       result = buff->cur;
2010     }
2011
2012   buff->cur = result + len;
2013   return result;
2014 }
2015
2016 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2017    That buffer is used for growing allocations when saving macro
2018    replacement lists in a #define, and when parsing an answer to an
2019    assertion in #assert, #unassert or #if (and therefore possibly
2020    whilst expanding macros).  It therefore must not be used by any
2021    code that they might call: specifically the lexer and the guts of
2022    the macro expander.
2023
2024    All existing other uses clearly fit this restriction: storing
2025    registered pragmas during initialization.  */
2026 unsigned char *
2027 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
2028 {
2029   _cpp_buff *buff = pfile->a_buff;
2030   unsigned char *result = buff->cur;
2031
2032   if (len > (size_t) (buff->limit - result))
2033     {
2034       buff = _cpp_get_buff (pfile, len);
2035       buff->next = pfile->a_buff;
2036       pfile->a_buff = buff;
2037       result = buff->cur;
2038     }
2039
2040   buff->cur = result + len;
2041   return result;
2042 }
2043
2044 /* Say which field of TOK is in use.  */
2045
2046 enum cpp_token_fld_kind
2047 cpp_token_val_index (cpp_token *tok)
2048 {
2049   switch (TOKEN_SPELL (tok))
2050     {
2051     case SPELL_IDENT:
2052       return CPP_TOKEN_FLD_NODE;
2053     case SPELL_LITERAL:
2054       return CPP_TOKEN_FLD_STR;
2055     case SPELL_NONE:
2056       if (tok->type == CPP_MACRO_ARG)
2057         return CPP_TOKEN_FLD_ARG_NO;
2058       else if (tok->type == CPP_PADDING)
2059         return CPP_TOKEN_FLD_SOURCE;
2060       else if (tok->type == CPP_PRAGMA)
2061         return CPP_TOKEN_FLD_STR;
2062       /* else fall through */
2063     default:
2064       return CPP_TOKEN_FLD_NONE;
2065     }
2066 }