The data contained in this repository can be downloaded to your computer using one of several clients.
Please see the documentation of your version control software client for more information.

Please select the desired protocol below to get the URL.

This URL has Read-Only access.

Statistics
| Branch: | Revision:

main_repo / deps / v8 / src / scanner.cc @ f230a1cf

History | View | Annotate | Download (37 KB)

1
// Copyright 2011 the V8 project authors. All rights reserved.
2
// Redistribution and use in source and binary forms, with or without
3
// modification, are permitted provided that the following conditions are
4
// met:
5
//
6
//     * Redistributions of source code must retain the above copyright
7
//       notice, this list of conditions and the following disclaimer.
8
//     * Redistributions in binary form must reproduce the above
9
//       copyright notice, this list of conditions and the following
10
//       disclaimer in the documentation and/or other materials provided
11
//       with the distribution.
12
//     * Neither the name of Google Inc. nor the names of its
13
//       contributors may be used to endorse or promote products derived
14
//       from this software without specific prior written permission.
15
//
16
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27

    
28
// Features shared by parsing and pre-parsing scanners.
29

    
30
#include <cmath>
31

    
32
#include "scanner.h"
33

    
34
#include "../include/v8stdint.h"
35
#include "char-predicates-inl.h"
36
#include "conversions-inl.h"
37
#include "list-inl.h"
38

    
39
namespace v8 {
40
namespace internal {
41

    
42
// ----------------------------------------------------------------------------
43
// Scanner
44

    
45
// Constructs a scanner that is not yet attached to an input stream; the
// stream is supplied later through Initialize(). No octal literal position
// is recorded initially and all harmony_* flags start out disabled.
Scanner::Scanner(UnicodeCache* unicode_cache)
    : unicode_cache_(unicode_cache),
      octal_pos_(Location::invalid()),
      harmony_scoping_(false),
      harmony_modules_(false),
      harmony_numeric_literals_(false) { }
51

    
52

    
53
// Attaches the scanner to |source| and primes it: leading whitespace is
// skipped and the first token is scanned into the lookahead slot.
void Scanner::Initialize(Utf16CharacterStream* source) {
  source_ = source;
  // Need to capture identifiers in order to recognize "get" and "set"
  // in object literals.
  Init();
  // Skip initial whitespace allowing HTML comment ends just like
  // after a newline and scan first token.
  has_line_terminator_before_next_ = true;
  SkipWhiteSpace();
  Scan();
}
64

    
65

    
66
// Reads exactly |expected_length| hexadecimal digits, starting at the current
// character, and returns their combined value.
//
// If a non-hex character is met before |expected_length| digits were read,
// every digit consumed so far is pushed back onto the input and -1 is
// returned. |expected_length| must be in [0, 4]: the push-back buffer holds
// four characters, and the bound also prevents overflow of the result.
uc32 Scanner::ScanHexNumber(int expected_length) {
  ASSERT(expected_length >= 0);
  ASSERT(expected_length <= 4);  // prevent overflow

  uc32 digits[4] = { 0, 0, 0, 0 };
  uc32 x = 0;
  for (int i = 0; i < expected_length; i++) {
    digits[i] = c0_;
    int d = HexValue(c0_);
    if (d < 0) {
      // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
      // should be illegal, but other JS VMs just return the
      // non-escaped version of the original character.

      // Push back digits that we have advanced past, most recent first.
      for (int j = i - 1; j >= 0; j--) {
        PushBack(digits[j]);
      }
      return -1;
    }
    x = x * 16 + d;
    Advance();
  }

  return x;
}
91

    
92

    
93
// Ensure that tokens can be stored in a byte.
94
STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);
95

    
96
// Table of one-character tokens, by character (0x00..0x7f only).
97
static const byte one_char_tokens[] = {
98
  Token::ILLEGAL,
99
  Token::ILLEGAL,
100
  Token::ILLEGAL,
101
  Token::ILLEGAL,
102
  Token::ILLEGAL,
103
  Token::ILLEGAL,
104
  Token::ILLEGAL,
105
  Token::ILLEGAL,
106
  Token::ILLEGAL,
107
  Token::ILLEGAL,
108
  Token::ILLEGAL,
109
  Token::ILLEGAL,
110
  Token::ILLEGAL,
111
  Token::ILLEGAL,
112
  Token::ILLEGAL,
113
  Token::ILLEGAL,
114
  Token::ILLEGAL,
115
  Token::ILLEGAL,
116
  Token::ILLEGAL,
117
  Token::ILLEGAL,
118
  Token::ILLEGAL,
119
  Token::ILLEGAL,
120
  Token::ILLEGAL,
121
  Token::ILLEGAL,
122
  Token::ILLEGAL,
123
  Token::ILLEGAL,
124
  Token::ILLEGAL,
125
  Token::ILLEGAL,
126
  Token::ILLEGAL,
127
  Token::ILLEGAL,
128
  Token::ILLEGAL,
129
  Token::ILLEGAL,
130
  Token::ILLEGAL,
131
  Token::ILLEGAL,
132
  Token::ILLEGAL,
133
  Token::ILLEGAL,
134
  Token::ILLEGAL,
135
  Token::ILLEGAL,
136
  Token::ILLEGAL,
137
  Token::ILLEGAL,
138
  Token::LPAREN,       // 0x28
139
  Token::RPAREN,       // 0x29
140
  Token::ILLEGAL,
141
  Token::ILLEGAL,
142
  Token::COMMA,        // 0x2c
143
  Token::ILLEGAL,
144
  Token::ILLEGAL,
145
  Token::ILLEGAL,
146
  Token::ILLEGAL,
147
  Token::ILLEGAL,
148
  Token::ILLEGAL,
149
  Token::ILLEGAL,
150
  Token::ILLEGAL,
151
  Token::ILLEGAL,
152
  Token::ILLEGAL,
153
  Token::ILLEGAL,
154
  Token::ILLEGAL,
155
  Token::ILLEGAL,
156
  Token::COLON,        // 0x3a
157
  Token::SEMICOLON,    // 0x3b
158
  Token::ILLEGAL,
159
  Token::ILLEGAL,
160
  Token::ILLEGAL,
161
  Token::CONDITIONAL,  // 0x3f
162
  Token::ILLEGAL,
163
  Token::ILLEGAL,
164
  Token::ILLEGAL,
165
  Token::ILLEGAL,
166
  Token::ILLEGAL,
167
  Token::ILLEGAL,
168
  Token::ILLEGAL,
169
  Token::ILLEGAL,
170
  Token::ILLEGAL,
171
  Token::ILLEGAL,
172
  Token::ILLEGAL,
173
  Token::ILLEGAL,
174
  Token::ILLEGAL,
175
  Token::ILLEGAL,
176
  Token::ILLEGAL,
177
  Token::ILLEGAL,
178
  Token::ILLEGAL,
179
  Token::ILLEGAL,
180
  Token::ILLEGAL,
181
  Token::ILLEGAL,
182
  Token::ILLEGAL,
183
  Token::ILLEGAL,
184
  Token::ILLEGAL,
185
  Token::ILLEGAL,
186
  Token::ILLEGAL,
187
  Token::ILLEGAL,
188
  Token::ILLEGAL,
189
  Token::LBRACK,     // 0x5b
190
  Token::ILLEGAL,
191
  Token::RBRACK,     // 0x5d
192
  Token::ILLEGAL,
193
  Token::ILLEGAL,
194
  Token::ILLEGAL,
195
  Token::ILLEGAL,
196
  Token::ILLEGAL,
197
  Token::ILLEGAL,
198
  Token::ILLEGAL,
199
  Token::ILLEGAL,
200
  Token::ILLEGAL,
201
  Token::ILLEGAL,
202
  Token::ILLEGAL,
203
  Token::ILLEGAL,
204
  Token::ILLEGAL,
205
  Token::ILLEGAL,
206
  Token::ILLEGAL,
207
  Token::ILLEGAL,
208
  Token::ILLEGAL,
209
  Token::ILLEGAL,
210
  Token::ILLEGAL,
211
  Token::ILLEGAL,
212
  Token::ILLEGAL,
213
  Token::ILLEGAL,
214
  Token::ILLEGAL,
215
  Token::ILLEGAL,
216
  Token::ILLEGAL,
217
  Token::ILLEGAL,
218
  Token::ILLEGAL,
219
  Token::ILLEGAL,
220
  Token::ILLEGAL,
221
  Token::LBRACE,       // 0x7b
222
  Token::ILLEGAL,
223
  Token::RBRACE,       // 0x7d
224
  Token::BIT_NOT,      // 0x7e
225
  Token::ILLEGAL
226
};
227

    
228

    
229
// Advances the scanner by one token: the previous lookahead token becomes the
// current token and a fresh lookahead token is produced. Returns the (new)
// current token.
//
// Characters that form a complete token on their own (per one_char_tokens)
// are handled inline without calling Scan(); everything else goes through
// Scan().
Token::Value Scanner::Next() {
  current_ = next_;
  has_line_terminator_before_next_ = false;
  has_multiline_comment_before_next_ = false;
  // The unsigned cast also rejects negative values of c0_.
  if (static_cast<unsigned>(c0_) <= 0x7f) {
    Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]);
    if (token != Token::ILLEGAL) {
      int pos = source_pos();
      next_.token = token;
      next_.location.beg_pos = pos;
      next_.location.end_pos = pos + 1;
      Advance();
      return current_.token;
    }
  }
  Scan();
  return current_.token;
}
247

    
248

    
249
// Returns true if |c| is U+FEFF or U+FFFE.
static inline bool IsByteOrderMark(uc32 c) {
  // The Unicode value U+FFFE is guaranteed never to be assigned as a
  // Unicode character; this implies that in a Unicode context the
  // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
  // character expressed in little-endian byte order (since it could
  // not be a U+FFFE character expressed in big-endian byte
  // order). Nevertheless, we check for it to be compatible with
  // Spidermonkey.
  return c == 0xFEFF || c == 0xFFFE;
}
259

    
260

    
261
bool Scanner::SkipWhiteSpace() {
262
  int start_position = source_pos();
263

    
264
  while (true) {
265
    // We treat byte-order marks (BOMs) as whitespace for better
266
    // compatibility with Spidermonkey and other JavaScript engines.
267
    while (unicode_cache_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) {
268
      // IsWhiteSpace() includes line terminators!
269
      if (unicode_cache_->IsLineTerminator(c0_)) {
270
        // Ignore line terminators, but remember them. This is necessary
271
        // for automatic semicolon insertion.
272
        has_line_terminator_before_next_ = true;
273
      }
274
      Advance();
275
    }
276

    
277
    // If there is an HTML comment end '-->' at the beginning of a
278
    // line (with only whitespace in front of it), we treat the rest
279
    // of the line as a comment. This is in line with the way
280
    // SpiderMonkey handles it.
281
    if (c0_ == '-' && has_line_terminator_before_next_) {
282
      Advance();
283
      if (c0_ == '-') {
284
        Advance();
285
        if (c0_ == '>') {
286
          // Treat the rest of the line as a comment.
287
          SkipSingleLineComment();
288
          // Continue skipping white space after the comment.
289
          continue;
290
        }
291
        PushBack('-');  // undo Advance()
292
      }
293
      PushBack('-');  // undo Advance()
294
    }
295
    // Return whether or not we skipped any characters.
296
    return source_pos() != start_position;
297
  }
298
}
299

    
300

    
301
// Consumes the current character and then everything up to, but not
// including, the next line terminator (or the end of input). Always returns
// Token::WHITESPACE.
Token::Value Scanner::SkipSingleLineComment() {
  Advance();

  // The line terminator at the end of the line is not considered
  // to be part of the single-line comment; it is recognized
  // separately by the lexical grammar and becomes part of the
  // stream of input elements for the syntactic grammar (see
  // ECMA-262, section 7.4).
  while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
    Advance();
  }

  return Token::WHITESPACE;
}
315

    
316

    
317
// Skips a "/* ... */" comment. Must be called with the current character on
// the '*' that follows the opening '/'.
//
// Returns Token::WHITESPACE once the closing "*/" is found, or Token::ILLEGAL
// if the input ends first. Sets has_multiline_comment_before_next_ if the
// comment contains a line terminator.
Token::Value Scanner::SkipMultiLineComment() {
  ASSERT(c0_ == '*');
  Advance();

  while (c0_ >= 0) {
    uc32 ch = c0_;
    Advance();
    if (unicode_cache_->IsLineTerminator(ch)) {
      // Following ECMA-262, section 7.4, a comment containing
      // a newline will make the comment count as a line-terminator.
      has_multiline_comment_before_next_ = true;
    }
    // If we have reached the end of the multi-line comment, we
    // consume the '/' and insert a whitespace. This way all
    // multi-line comments are treated as whitespace.
    if (ch == '*' && c0_ == '/') {
      c0_ = ' ';
      return Token::WHITESPACE;
    }
  }

  // Unterminated multi-line comment.
  return Token::ILLEGAL;
}
341

    
342

    
343
// Called after a '<' has been consumed and the current character is '!'.
// If the input continues with "--" the rest of the line is skipped as a
// comment and the result of SkipSingleLineComment() is returned. Otherwise
// the consumed characters are pushed back, leaving the scanner on the '!',
// and Token::LT is returned.
Token::Value Scanner::ScanHtmlComment() {
  // Check for <!-- comments.
  ASSERT(c0_ == '!');
  Advance();
  if (c0_ == '-') {
    Advance();
    if (c0_ == '-') return SkipSingleLineComment();
    PushBack('-');  // undo Advance()
  }
  PushBack('!');  // undo Advance()
  ASSERT(c0_ == '!');
  return Token::LT;
}
356

    
357

    
358
void Scanner::Scan() {
359
  next_.literal_chars = NULL;
360
  Token::Value token;
361
  do {
362
    // Remember the position of the next token
363
    next_.location.beg_pos = source_pos();
364

    
365
    switch (c0_) {
366
      case ' ':
367
      case '\t':
368
        Advance();
369
        token = Token::WHITESPACE;
370
        break;
371

    
372
      case '\n':
373
        Advance();
374
        has_line_terminator_before_next_ = true;
375
        token = Token::WHITESPACE;
376
        break;
377

    
378
      case '"': case '\'':
379
        token = ScanString();
380
        break;
381

    
382
      case '<':
383
        // < <= << <<= <!--
384
        Advance();
385
        if (c0_ == '=') {
386
          token = Select(Token::LTE);
387
        } else if (c0_ == '<') {
388
          token = Select('=', Token::ASSIGN_SHL, Token::SHL);
389
        } else if (c0_ == '!') {
390
          token = ScanHtmlComment();
391
        } else {
392
          token = Token::LT;
393
        }
394
        break;
395

    
396
      case '>':
397
        // > >= >> >>= >>> >>>=
398
        Advance();
399
        if (c0_ == '=') {
400
          token = Select(Token::GTE);
401
        } else if (c0_ == '>') {
402
          // >> >>= >>> >>>=
403
          Advance();
404
          if (c0_ == '=') {
405
            token = Select(Token::ASSIGN_SAR);
406
          } else if (c0_ == '>') {
407
            token = Select('=', Token::ASSIGN_SHR, Token::SHR);
408
          } else {
409
            token = Token::SAR;
410
          }
411
        } else {
412
          token = Token::GT;
413
        }
414
        break;
415

    
416
      case '=':
417
        // = == ===
418
        Advance();
419
        if (c0_ == '=') {
420
          token = Select('=', Token::EQ_STRICT, Token::EQ);
421
        } else {
422
          token = Token::ASSIGN;
423
        }
424
        break;
425

    
426
      case '!':
427
        // ! != !==
428
        Advance();
429
        if (c0_ == '=') {
430
          token = Select('=', Token::NE_STRICT, Token::NE);
431
        } else {
432
          token = Token::NOT;
433
        }
434
        break;
435

    
436
      case '+':
437
        // + ++ +=
438
        Advance();
439
        if (c0_ == '+') {
440
          token = Select(Token::INC);
441
        } else if (c0_ == '=') {
442
          token = Select(Token::ASSIGN_ADD);
443
        } else {
444
          token = Token::ADD;
445
        }
446
        break;
447

    
448
      case '-':
449
        // - -- --> -=
450
        Advance();
451
        if (c0_ == '-') {
452
          Advance();
453
          if (c0_ == '>' && has_line_terminator_before_next_) {
454
            // For compatibility with SpiderMonkey, we skip lines that
455
            // start with an HTML comment end '-->'.
456
            token = SkipSingleLineComment();
457
          } else {
458
            token = Token::DEC;
459
          }
460
        } else if (c0_ == '=') {
461
          token = Select(Token::ASSIGN_SUB);
462
        } else {
463
          token = Token::SUB;
464
        }
465
        break;
466

    
467
      case '*':
468
        // * *=
469
        token = Select('=', Token::ASSIGN_MUL, Token::MUL);
470
        break;
471

    
472
      case '%':
473
        // % %=
474
        token = Select('=', Token::ASSIGN_MOD, Token::MOD);
475
        break;
476

    
477
      case '/':
478
        // /  // /* /=
479
        Advance();
480
        if (c0_ == '/') {
481
          token = SkipSingleLineComment();
482
        } else if (c0_ == '*') {
483
          token = SkipMultiLineComment();
484
        } else if (c0_ == '=') {
485
          token = Select(Token::ASSIGN_DIV);
486
        } else {
487
          token = Token::DIV;
488
        }
489
        break;
490

    
491
      case '&':
492
        // & && &=
493
        Advance();
494
        if (c0_ == '&') {
495
          token = Select(Token::AND);
496
        } else if (c0_ == '=') {
497
          token = Select(Token::ASSIGN_BIT_AND);
498
        } else {
499
          token = Token::BIT_AND;
500
        }
501
        break;
502

    
503
      case '|':
504
        // | || |=
505
        Advance();
506
        if (c0_ == '|') {
507
          token = Select(Token::OR);
508
        } else if (c0_ == '=') {
509
          token = Select(Token::ASSIGN_BIT_OR);
510
        } else {
511
          token = Token::BIT_OR;
512
        }
513
        break;
514

    
515
      case '^':
516
        // ^ ^=
517
        token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
518
        break;
519

    
520
      case '.':
521
        // . Number
522
        Advance();
523
        if (IsDecimalDigit(c0_)) {
524
          token = ScanNumber(true);
525
        } else {
526
          token = Token::PERIOD;
527
        }
528
        break;
529

    
530
      case ':':
531
        token = Select(Token::COLON);
532
        break;
533

    
534
      case ';':
535
        token = Select(Token::SEMICOLON);
536
        break;
537

    
538
      case ',':
539
        token = Select(Token::COMMA);
540
        break;
541

    
542
      case '(':
543
        token = Select(Token::LPAREN);
544
        break;
545

    
546
      case ')':
547
        token = Select(Token::RPAREN);
548
        break;
549

    
550
      case '[':
551
        token = Select(Token::LBRACK);
552
        break;
553

    
554
      case ']':
555
        token = Select(Token::RBRACK);
556
        break;
557

    
558
      case '{':
559
        token = Select(Token::LBRACE);
560
        break;
561

    
562
      case '}':
563
        token = Select(Token::RBRACE);
564
        break;
565

    
566
      case '?':
567
        token = Select(Token::CONDITIONAL);
568
        break;
569

    
570
      case '~':
571
        token = Select(Token::BIT_NOT);
572
        break;
573

    
574
      default:
575
        if (unicode_cache_->IsIdentifierStart(c0_)) {
576
          token = ScanIdentifierOrKeyword();
577
        } else if (IsDecimalDigit(c0_)) {
578
          token = ScanNumber(false);
579
        } else if (SkipWhiteSpace()) {
580
          token = Token::WHITESPACE;
581
        } else if (c0_ < 0) {
582
          token = Token::EOS;
583
        } else {
584
          token = Select(Token::ILLEGAL);
585
        }
586
        break;
587
    }
588

    
589
    // Continue scanning for tokens as long as we're just skipping
590
    // whitespace.
591
  } while (token == Token::WHITESPACE);
592

    
593
  next_.location.end_pos = source_pos();
594
  next_.token = token;
595
}
596

    
597

    
598
void Scanner::SeekForward(int pos) {
599
  // After this call, we will have the token at the given position as
600
  // the "next" token. The "current" token will be invalid.
601
  if (pos == next_.location.beg_pos) return;
602
  int current_pos = source_pos();
603
  ASSERT_EQ(next_.location.end_pos, current_pos);
604
  // Positions inside the lookahead token aren't supported.
605
  ASSERT(pos >= current_pos);
606
  if (pos != current_pos) {
607
    source_->SeekForward(pos - source_->pos());
608
    Advance();
609
    // This function is only called to seek to the location
610
    // of the end of a function (at the "}" token). It doesn't matter
611
    // whether there was a line terminator in the part we skip.
612
    has_line_terminator_before_next_ = false;
613
    has_multiline_comment_before_next_ = false;
614
  }
615
  Scan();
616
}
617

    
618

    
619
bool Scanner::ScanEscape() {
620
  uc32 c = c0_;
621
  Advance();
622

    
623
  // Skip escaped newlines.
624
  if (unicode_cache_->IsLineTerminator(c)) {
625
    // Allow CR+LF newlines in multiline string literals.
626
    if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
627
    // Allow LF+CR newlines in multiline string literals.
628
    if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
629
    return true;
630
  }
631

    
632
  switch (c) {
633
    case '\'':  // fall through
634
    case '"' :  // fall through
635
    case '\\': break;
636
    case 'b' : c = '\b'; break;
637
    case 'f' : c = '\f'; break;
638
    case 'n' : c = '\n'; break;
639
    case 'r' : c = '\r'; break;
640
    case 't' : c = '\t'; break;
641
    case 'u' : {
642
      c = ScanHexNumber(4);
643
      if (c < 0) return false;
644
      break;
645
    }
646
    case 'v' : c = '\v'; break;
647
    case 'x' : {
648
      c = ScanHexNumber(2);
649
      if (c < 0) return false;
650
      break;
651
    }
652
    case '0' :  // fall through
653
    case '1' :  // fall through
654
    case '2' :  // fall through
655
    case '3' :  // fall through
656
    case '4' :  // fall through
657
    case '5' :  // fall through
658
    case '6' :  // fall through
659
    case '7' : c = ScanOctalEscape(c, 2); break;
660
  }
661

    
662
  // According to ECMA-262, section 7.8.4, characters not covered by the
663
  // above cases should be illegal, but they are commonly handled as
664
  // non-escaped characters by JS VMs.
665
  AddLiteralChar(c);
666
  return true;
667
}
668

    
669

    
670
// Octal escapes of the forms '\0xx' and '\xxx' are not a part of
671
// ECMA-262. Other JS VMs support them.
672
uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
673
  uc32 x = c - '0';
674
  int i = 0;
675
  for (; i < length; i++) {
676
    int d = c0_ - '0';
677
    if (d < 0 || d > 7) break;
678
    int nx = x * 8 + d;
679
    if (nx >= 256) break;
680
    x = nx;
681
    Advance();
682
  }
683
  // Anything except '\0' is an octal escape sequence, illegal in strict mode.
684
  // Remember the position of octal escape sequences so that an error
685
  // can be reported later (in strict mode).
686
  // We don't report the error immediately, because the octal escape can
687
  // occur before the "use strict" directive.
688
  if (c != '0' || i > 0) {
689
    octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
690
  }
691
  return x;
692
}
693

    
694

    
695
// Scans a string literal. The current character is taken as the opening
// quote and the same character must close the literal. Returns Token::STRING
// on success. Returns Token::ILLEGAL if the input ends or a line terminator
// appears before the closing quote, if a backslash is the last character of
// the input, or if ScanEscape() rejects an escape sequence.
Token::Value Scanner::ScanString() {
  uc32 quote = c0_;
  Advance();  // consume quote

  LiteralScope literal(this);
  while (c0_ != quote && c0_ >= 0
         && !unicode_cache_->IsLineTerminator(c0_)) {
    uc32 c = c0_;
    Advance();
    if (c == '\\') {
      if (c0_ < 0 || !ScanEscape()) return Token::ILLEGAL;
    } else {
      AddLiteralChar(c);
    }
  }
  if (c0_ != quote) return Token::ILLEGAL;
  literal.Complete();

  Advance();  // consume quote
  return Token::STRING;
}
716

    
717

    
718
void Scanner::ScanDecimalDigits() {
719
  while (IsDecimalDigit(c0_))
720
    AddLiteralCharAdvance();
721
}
722

    
723

    
724
// Scans a numeric literal and returns Token::NUMBER, or Token::ILLEGAL if the
// literal is malformed.
//
// |seen_period| is true when the caller has already consumed a leading '.';
// in that case the current character is the first digit of the fraction.
// Otherwise the current character is the first digit of the number. Hex
// literals are always recognized; "0o"/"0b" literals only when
// harmony_numeric_literals_ is set. For a leading-zero (implicit) octal
// literal the position is recorded in octal_pos_.
Token::Value Scanner::ScanNumber(bool seen_period) {
  ASSERT(IsDecimalDigit(c0_));  // the first digit of the number or the fraction

  enum { DECIMAL, HEX, OCTAL, IMPLICIT_OCTAL, BINARY } kind = DECIMAL;

  LiteralScope literal(this);
  if (seen_period) {
    // we have already seen a decimal point of the float
    AddLiteralChar('.');
    ScanDecimalDigits();  // we know we have at least one digit

  } else {
    // if the first character is '0' we must check for octals and hex
    if (c0_ == '0') {
      int start_pos = source_pos();  // For reporting octal positions.
      AddLiteralCharAdvance();

      // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or
      // an octal number.
      if (c0_ == 'x' || c0_ == 'X') {
        // hex number
        kind = HEX;
        AddLiteralCharAdvance();
        if (!IsHexDigit(c0_)) {
          // we must have at least one hex digit after 'x'/'X'
          return Token::ILLEGAL;
        }
        while (IsHexDigit(c0_)) {
          AddLiteralCharAdvance();
        }
      } else if (harmony_numeric_literals_ && (c0_ == 'o' || c0_ == 'O')) {
        kind = OCTAL;
        AddLiteralCharAdvance();
        if (!IsOctalDigit(c0_)) {
          // we must have at least one octal digit after 'o'/'O'
          return Token::ILLEGAL;
        }
        while (IsOctalDigit(c0_)) {
          AddLiteralCharAdvance();
        }
      } else if (harmony_numeric_literals_ && (c0_ == 'b' || c0_ == 'B')) {
        kind = BINARY;
        AddLiteralCharAdvance();
        if (!IsBinaryDigit(c0_)) {
          // we must have at least one binary digit after 'b'/'B'
          return Token::ILLEGAL;
        }
        while (IsBinaryDigit(c0_)) {
          AddLiteralCharAdvance();
        }
      } else if ('0' <= c0_ && c0_ <= '7') {
        // (possible) octal number
        kind = IMPLICIT_OCTAL;
        while (true) {
          if (c0_ == '8' || c0_ == '9') {
            // Not octal after all; the remaining digits are picked up by
            // the decimal handling below.
            kind = DECIMAL;
            break;
          }
          if (c0_  < '0' || '7'  < c0_) {
            // Octal literal finished.
            octal_pos_ = Location(start_pos, source_pos());
            break;
          }
          AddLiteralCharAdvance();
        }
      }
    }

    // Parse decimal digits and allow trailing fractional part.
    if (kind == DECIMAL) {
      ScanDecimalDigits();  // optional
      if (c0_ == '.') {
        AddLiteralCharAdvance();
        ScanDecimalDigits();  // optional
      }
    }
  }

  // scan exponent, if any
  if (c0_ == 'e' || c0_ == 'E') {
    ASSERT(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
    if (kind != DECIMAL) return Token::ILLEGAL;
    // scan exponent
    AddLiteralCharAdvance();
    if (c0_ == '+' || c0_ == '-')
      AddLiteralCharAdvance();
    if (!IsDecimalDigit(c0_)) {
      // we must have at least one decimal digit after 'e'/'E'
      return Token::ILLEGAL;
    }
    ScanDecimalDigits();
  }

  // The source character immediately following a numeric literal must
  // not be an identifier start or a decimal digit; see ECMA-262
  // section 7.8.3, page 17 (note that we read only one decimal digit
  // if the value is 0).
  if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_))
    return Token::ILLEGAL;

  literal.Complete();

  return Token::NUMBER;
}
828

    
829

    
830
// Scans a "\uXXXX" escape inside an identifier. The current character is
// consumed first (the callers in this file invoke it while positioned on the
// backslash). Returns the value of the four hex digits, or -1 if the next
// character is not 'u' or the hex digits are malformed; in the latter case
// the 'u' is pushed back as well.
uc32 Scanner::ScanIdentifierUnicodeEscape() {
  Advance();
  if (c0_ != 'u') return -1;
  Advance();
  uc32 result = ScanHexNumber(4);
  if (result < 0) PushBack('u');
  return result;
}
838

    
839

    
840
// ----------------------------------------------------------------------------
841
// Keyword Matcher
842

    
843
// List of all keywords, grouped by first character. KEYWORD_GROUP(ch) is
// expanded once at the start of each group and KEYWORD(string, token) once
// per keyword. The token expressions for "export", "import" and "let" refer
// to the names harmony_modules and harmony_scoping, which must be in scope
// wherever the list is expanded.
#define KEYWORDS(KEYWORD_GROUP, KEYWORD)                            \
  KEYWORD_GROUP('b')                                                \
  KEYWORD("break", Token::BREAK)                                    \
  KEYWORD_GROUP('c')                                                \
  KEYWORD("case", Token::CASE)                                      \
  KEYWORD("catch", Token::CATCH)                                    \
  KEYWORD("class", Token::FUTURE_RESERVED_WORD)                     \
  KEYWORD("const", Token::CONST)                                    \
  KEYWORD("continue", Token::CONTINUE)                              \
  KEYWORD_GROUP('d')                                                \
  KEYWORD("debugger", Token::DEBUGGER)                              \
  KEYWORD("default", Token::DEFAULT)                                \
  KEYWORD("delete", Token::DELETE)                                  \
  KEYWORD("do", Token::DO)                                          \
  KEYWORD_GROUP('e')                                                \
  KEYWORD("else", Token::ELSE)                                      \
  KEYWORD("enum", Token::FUTURE_RESERVED_WORD)                      \
  KEYWORD("export", harmony_modules                                 \
                    ? Token::EXPORT : Token::FUTURE_RESERVED_WORD)  \
  KEYWORD("extends", Token::FUTURE_RESERVED_WORD)                   \
  KEYWORD_GROUP('f')                                                \
  KEYWORD("false", Token::FALSE_LITERAL)                            \
  KEYWORD("finally", Token::FINALLY)                                \
  KEYWORD("for", Token::FOR)                                        \
  KEYWORD("function", Token::FUNCTION)                              \
  KEYWORD_GROUP('i')                                                \
  KEYWORD("if", Token::IF)                                          \
  KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD)         \
  KEYWORD("import", harmony_modules                                 \
                    ? Token::IMPORT : Token::FUTURE_RESERVED_WORD)  \
  KEYWORD("in", Token::IN)                                          \
  KEYWORD("instanceof", Token::INSTANCEOF)                          \
  KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD)          \
  KEYWORD_GROUP('l')                                                \
  KEYWORD("let", harmony_scoping                                    \
                 ? Token::LET : Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD_GROUP('n')                                                \
  KEYWORD("new", Token::NEW)                                        \
  KEYWORD("null", Token::NULL_LITERAL)                              \
  KEYWORD_GROUP('p')                                                \
  KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD)            \
  KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD)            \
  KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD)          \
  KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD)             \
  KEYWORD_GROUP('r')                                                \
  KEYWORD("return", Token::RETURN)                                  \
  KEYWORD_GROUP('s')                                                \
  KEYWORD("static", Token::FUTURE_STRICT_RESERVED_WORD)             \
  KEYWORD("super", Token::FUTURE_RESERVED_WORD)                     \
  KEYWORD("switch", Token::SWITCH)                                  \
  KEYWORD_GROUP('t')                                                \
  KEYWORD("this", Token::THIS)                                      \
  KEYWORD("throw", Token::THROW)                                    \
  KEYWORD("true", Token::TRUE_LITERAL)                              \
  KEYWORD("try", Token::TRY)                                        \
  KEYWORD("typeof", Token::TYPEOF)                                  \
  KEYWORD_GROUP('v')                                                \
  KEYWORD("var", Token::VAR)                                        \
  KEYWORD("void", Token::VOID)                                      \
  KEYWORD_GROUP('w')                                                \
  KEYWORD("while", Token::WHILE)                                    \
  KEYWORD("with", Token::WITH)                                      \
  KEYWORD_GROUP('y')                                                \
  KEYWORD("yield", Token::YIELD)
907

    
908

    
909
// Classifies the |input_length| characters at |input| as a keyword token or
// as Token::IDENTIFIER. |harmony_scoping| and |harmony_modules| select the
// token returned for "let" and for "export"/"import" respectively (they are
// referenced by name from inside the KEYWORDS list).
//
// The switch on the first character is generated from KEYWORDS: each
// KEYWORD_GROUP becomes a `case`, and each KEYWORD becomes a length check
// followed by a character-by-character comparison of positions 1..9.
static Token::Value KeywordOrIdentifierToken(const char* input,
                                             int input_length,
                                             bool harmony_scoping,
                                             bool harmony_modules) {
  ASSERT(input_length >= 1);
  const int kMinLength = 2;
  const int kMaxLength = 10;
  // No keyword is shorter than kMinLength or longer than kMaxLength (this is
  // enforced per keyword by the STATIC_ASSERTs below).
  if (input_length < kMinLength || input_length > kMaxLength) {
    return Token::IDENTIFIER;
  }
  switch (input[0]) {
    default:
#define KEYWORD_GROUP_CASE(ch)                                \
      break;                                                  \
    case ch:
#define KEYWORD(keyword, token)                               \
    {                                                         \
      /* 'keyword' is a char array, so sizeof(keyword) is */  \
      /* strlen(keyword) plus 1 for the NUL char. */          \
      const int keyword_length = sizeof(keyword) - 1;         \
      STATIC_ASSERT(keyword_length >= kMinLength);            \
      STATIC_ASSERT(keyword_length <= kMaxLength);            \
      if (input_length == keyword_length &&                   \
          input[1] == keyword[1] &&                           \
          (keyword_length <= 2 || input[2] == keyword[2]) &&  \
          (keyword_length <= 3 || input[3] == keyword[3]) &&  \
          (keyword_length <= 4 || input[4] == keyword[4]) &&  \
          (keyword_length <= 5 || input[5] == keyword[5]) &&  \
          (keyword_length <= 6 || input[6] == keyword[6]) &&  \
          (keyword_length <= 7 || input[7] == keyword[7]) &&  \
          (keyword_length <= 8 || input[8] == keyword[8]) &&  \
          (keyword_length <= 9 || input[9] == keyword[9])) {  \
        return token;                                         \
      }                                                       \
    }
    KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
  }
  return Token::IDENTIFIER;
}
948

    
949

    
950
// Scans an identifier or keyword starting at the current character |c0_|.
//
// Returns Token::ILLEGAL when the identifier begins with an escape that does
// not produce a legal identifier start character. As soon as a backslash is
// seen the scan is delegated to ScanIdentifierSuffix(). Otherwise the
// collected literal is completed and, if it is ASCII, looked up in the
// keyword table; non-ASCII literals are reported as Token::IDENTIFIER.
Token::Value Scanner::ScanIdentifierOrKeyword() {
  ASSERT(unicode_cache_->IsIdentifierStart(c0_));
  LiteralScope literal(this);

  // Identifier that starts with an escape sequence.
  if (c0_ == '\\') {
    uc32 escaped = ScanIdentifierUnicodeEscape();
    // A negative result or an escaped backslash (no recursive escapes) is
    // rejected before consulting the unicode cache.
    if (escaped < 0 || escaped == '\\') return Token::ILLEGAL;
    // Only legal identifier start characters are allowed.
    if (!unicode_cache_->IsIdentifierStart(escaped)) return Token::ILLEGAL;
    AddLiteralChar(escaped);
    return ScanIdentifierSuffix(&literal);
  }

  // Plain start character: consume it, then record it.
  uc32 start_char = c0_;
  Advance();
  AddLiteralChar(start_char);

  // Consume the remaining identifier characters.
  while (unicode_cache_->IsIdentifierPart(c0_)) {
    // A backslash hands the rest of the identifier to the suffix scanner.
    if (c0_ == '\\') return ScanIdentifierSuffix(&literal);
    uc32 part_char = c0_;
    Advance();
    AddLiteralChar(part_char);
  }

  literal.Complete();

  // The keyword lookup is only attempted on ASCII literals.
  if (!next_.literal_chars->is_ascii()) return Token::IDENTIFIER;

  Vector<const char> ascii_chars = next_.literal_chars->ascii_literal();
  return KeywordOrIdentifierToken(ascii_chars.start(),
                                  ascii_chars.length(),
                                  harmony_scoping_,
                                  harmony_modules_);
}
994

    
995

    
996
// Scans the remainder of an identifier, decoding unicode escapes as they
// appear, and completes |literal| on success.
//
// Returns Token::ILLEGAL (without completing |literal|) if an escape does
// not yield a legal identifier part character; otherwise always returns
// Token::IDENTIFIER.
Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal) {
  while (unicode_cache_->IsIdentifierPart(c0_)) {
    if (c0_ != '\\') {
      // Ordinary character: record it, then move on.
      AddLiteralChar(c0_);
      Advance();
      continue;
    }

    uc32 escaped = ScanIdentifierUnicodeEscape();
    // Only legal identifier part characters are allowed; a negative result
    // or an escaped backslash is rejected outright.
    if (escaped < 0 || escaped == '\\') return Token::ILLEGAL;
    if (!unicode_cache_->IsIdentifierPart(escaped)) return Token::ILLEGAL;
    AddLiteralChar(escaped);
  }
  literal->Complete();

  return Token::IDENTIFIER;
}
1017

    
1018

    
1019
bool Scanner::ScanRegExpPattern(bool seen_equal) {
1020
  // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
1021
  bool in_character_class = false;
1022

    
1023
  // Previous token is either '/' or '/=', in the second case, the
1024
  // pattern starts at =.
1025
  next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
1026
  next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
1027

    
1028
  // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
1029
  // the scanner should pass uninterpreted bodies to the RegExp
1030
  // constructor.
1031
  LiteralScope literal(this);
1032
  if (seen_equal) {
1033
    AddLiteralChar('=');
1034
  }
1035

    
1036
  while (c0_ != '/' || in_character_class) {
1037
    if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
1038
    if (c0_ == '\\') {  // Escape sequence.
1039
      AddLiteralCharAdvance();
1040
      if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
1041
      AddLiteralCharAdvance();
1042
      // If the escape allows more characters, i.e., \x??, \u????, or \c?,
1043
      // only "safe" characters are allowed (letters, digits, underscore),
1044
      // otherwise the escape isn't valid and the invalid character has
1045
      // its normal meaning. I.e., we can just continue scanning without
1046
      // worrying whether the following characters are part of the escape
1047
      // or not, since any '/', '\\' or '[' is guaranteed to not be part
1048
      // of the escape sequence.
1049

    
1050
      // TODO(896): At some point, parse RegExps more throughly to capture
1051
      // octal esacpes in strict mode.
1052
    } else {  // Unescaped character.
1053
      if (c0_ == '[') in_character_class = true;
1054
      if (c0_ == ']') in_character_class = false;
1055
      AddLiteralCharAdvance();
1056
    }
1057
  }
1058
  Advance();  // consume '/'
1059

    
1060
  literal.Complete();
1061

    
1062
  return true;
1063
}
1064

    
1065

    
1066
bool Scanner::ScanLiteralUnicodeEscape() {
1067
  ASSERT(c0_ == '\\');
1068
  uc32 chars_read[6] = {'\\', 'u', 0, 0, 0, 0};
1069
  Advance();
1070
  int i = 1;
1071
  if (c0_ == 'u') {
1072
    i++;
1073
    while (i < 6) {
1074
      Advance();
1075
      if (!IsHexDigit(c0_)) break;
1076
      chars_read[i] = c0_;
1077
      i++;
1078
    }
1079
  }
1080
  if (i < 6) {
1081
    // Incomplete escape. Undo all advances and return false.
1082
    while (i > 0) {
1083
      i--;
1084
      PushBack(chars_read[i]);
1085
    }
1086
    return false;
1087
  }
1088
  // Complete escape. Add all chars to current literal buffer.
1089
  for (int i = 0; i < 6; i++) {
1090
    AddLiteralChar(chars_read[i]);
1091
  }
1092
  return true;
1093
}
1094

    
1095

    
1096
bool Scanner::ScanRegExpFlags() {
1097
  // Scan regular expression flags.
1098
  LiteralScope literal(this);
1099
  while (unicode_cache_->IsIdentifierPart(c0_)) {
1100
    if (c0_ != '\\') {
1101
      AddLiteralCharAdvance();
1102
    } else {
1103
      if (!ScanLiteralUnicodeEscape()) {
1104
        break;
1105
      }
1106
      Advance();
1107
    }
1108
  }
1109
  literal.Complete();
1110

    
1111
  next_.location.end_pos = source_pos() - 1;
1112
  return true;
1113
}
1114

    
1115

    
1116
// Records an ASCII symbol by forwarding its bytes to AddSymbol() with the
// ASCII flag set; returns whatever AddSymbol() returns.
int DuplicateFinder::AddAsciiSymbol(Vector<const char> key, int value) {
  const bool kIsAscii = true;
  Vector<const byte> key_bytes = Vector<const byte>::cast(key);
  return AddSymbol(key_bytes, kIsAscii, value);
}
1119

    
1120

    
1121
// Records a UTF-16 symbol by forwarding its bytes to AddSymbol() with the
// ASCII flag cleared; returns whatever AddSymbol() returns.
int DuplicateFinder::AddUtf16Symbol(Vector<const uint16_t> key, int value) {
  const bool kIsAscii = false;
  Vector<const byte> key_bytes = Vector<const byte>::cast(key);
  return AddSymbol(key_bytes, kIsAscii, value);
}
1124

    
1125

    
1126
int DuplicateFinder::AddSymbol(Vector<const byte> key,
1127
                               bool is_ascii,
1128
                               int value) {
1129
  uint32_t hash = Hash(key, is_ascii);
1130
  byte* encoding = BackupKey(key, is_ascii);
1131
  HashMap::Entry* entry = map_.Lookup(encoding, hash, true);
1132
  int old_value = static_cast<int>(reinterpret_cast<intptr_t>(entry->value));
1133
  entry->value =
1134
    reinterpret_cast<void*>(static_cast<intptr_t>(value | old_value));
1135
  return old_value;
1136
}
1137

    
1138

    
1139
// Records a number literal under the canonical spelling of its value, so
// that different spellings of the same number share one key.
//
// Literals that IsNumberCanonical() accepts are added as written. All
// others are converted to a double and back to a string; values that are
// not finite are keyed as "Infinity". Returns whatever AddSymbol() returns.
int DuplicateFinder::AddNumber(Vector<const char> key, int value) {
  ASSERT(key.length() > 0);
  // Fast path: the literal is already in canonical form.
  if (IsNumberCanonical(key)) return AddAsciiSymbol(key, value);

  const int conversion_flags =
      ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY;
  const double number =
      StringToDouble(unicode_constants_, key, conversion_flags, 0.0);

  // Start from the non-finite spelling and replace it for finite values.
  const char* canonical = "Infinity";
  int canonical_length = 8;  // strlen("Infinity");
  if (std::isfinite(number)) {
    canonical = DoubleToCString(number,
                                Vector<char>(number_buffer_, kBufferSize));
    canonical_length = StrLength(canonical);
  }

  Vector<const byte> canonical_key(
      reinterpret_cast<const byte*>(canonical), canonical_length);
  return AddSymbol(canonical_key, true, value);
}
1161

    
1162

    
1163
// Tests for a safe approximation of number literals that are already in
// canonical form: at most 15 characters, an integer part that is either a
// single zero or a non-empty run of digits without a leading zero, and, if
// there is a fraction, at least one fraction digit with no trailing zero.
//
// The test may only err towards 'false': AddNumber() sends rejected
// literals through a full string->double->string conversion, which is
// always correct. Erring towards 'true' would key two spellings of the
// same value differently and hide a duplicate.
bool DuplicateFinder::IsNumberCanonical(Vector<const char> number) {
  const int kMaxCanonicalLength = 15;
  const int length = number.length();
  // An empty literal has no canonical form (and must not be indexed).
  if (length == 0 || length > kMaxCanonicalLength) return false;

  int pos = 0;
  if (number[pos] == '0') {
    // A leading zero is only canonical as the whole integer part.
    pos++;
  } else {
    while (pos < length &&
           static_cast<unsigned>(number[pos] - '0') <= ('9' - '0')) pos++;
    // A literal without integer digits, such as ".5", is not canonical:
    // the same value converted from a double is spelled "0.5".
    if (pos == 0) return false;
  }
  if (length == pos) return true;
  if (number[pos] != '.') return false;
  pos++;

  // Stays true for an empty fraction ("1."), which is not canonical either.
  bool invalid_last_digit = true;
  while (pos < length) {
    // Non-digits wrap around to values above 9 in the unsigned byte.
    byte digit = number[pos] - '0';
    if (digit > '9' - '0') return false;
    invalid_last_digit = (digit == 0);
    pos++;
  }
  return !invalid_last_digit;
}
1189

    
1190

    
1191
// Computes a primitive hash of |key|, almost identical to the one used for
// strings, except that it is seeded with the key's length and ASCII-ness.
uint32_t DuplicateFinder::Hash(Vector<const byte> key, bool is_ascii) {
  const int byte_count = key.length();
  // Seed: the length in the upper bits, the ASCII flag in bit 0.
  int seed = byte_count << 1;
  if (is_ascii) seed |= 1;

  uint32_t hash = seed;
  for (int index = 0; index < byte_count; index++) {
    hash += key[index];
    hash *= 1025;
    hash ^= (hash >> 6);
  }
  return hash;
}
1203

    
1204

    
1205
bool DuplicateFinder::Match(void* first, void* second) {
1206
  // Decode lengths.
1207
  // Length + ASCII-bit is encoded as base 128, most significant heptet first,
1208
  // with a 8th bit being non-zero while there are more heptets.
1209
  // The value encodes the number of bytes following, and whether the original
1210
  // was ASCII.
1211
  byte* s1 = reinterpret_cast<byte*>(first);
1212
  byte* s2 = reinterpret_cast<byte*>(second);
1213
  uint32_t length_ascii_field = 0;
1214
  byte c1;
1215
  do {
1216
    c1 = *s1;
1217
    if (c1 != *s2) return false;
1218
    length_ascii_field = (length_ascii_field << 7) | (c1 & 0x7f);
1219
    s1++;
1220
    s2++;
1221
  } while ((c1 & 0x80) != 0);
1222
  int length = static_cast<int>(length_ascii_field >> 1);
1223
  return memcmp(s1, s2, length) == 0;
1224
}
1225

    
1226

    
1227
// Copies |bytes| into the backing store, prefixed with a variable-length
// header, and returns a pointer to the start of the stored sequence.
//
// The header encodes "(length << 1) | ascii-bit" in base 128, most
// significant heptet first. Every header byte except the last, least
// significant, one has bit 0x80 set.
byte* DuplicateFinder::BackupKey(Vector<const byte> bytes,
                                 bool is_ascii) {
  uint32_t ascii_length = (bytes.length() << 1) | (is_ascii ? 1 : 0);
  backing_store_.StartSequence();

  // Emit the higher heptets, most significant first. A heptet is written
  // exactly when the value reaches into it, which skips leading zeros.
  for (int shift = 28; shift >= 7; shift -= 7) {
    if (ascii_length >= (1u << shift)) {
      backing_store_.Add(static_cast<byte>((ascii_length >> shift) | 0x80u));
    }
  }
  // The final heptet carries no continuation bit.
  backing_store_.Add(static_cast<byte>(ascii_length & 0x7f));

  backing_store_.AddBlock(bytes);
  return backing_store_.EndSequence().start();
}
1250

    
1251
} }  // namespace v8::internal