The data contained in this repository can be downloaded to your computer using one of several clients.
Please see the documentation of your version control software client for more information.
Please select the desired protocol below to get the URL.
This URL has Read-Only access.
main_repo / deps / v8 / src / scanner.cc @ 40c0f755
History | View | Annotate | Download (21.8 KB)
1 |
// Copyright 2006-2008 the V8 project authors. All rights reserved.
|
---|---|
2 |
// Redistribution and use in source and binary forms, with or without
|
3 |
// modification, are permitted provided that the following conditions are
|
4 |
// met:
|
5 |
//
|
6 |
// * Redistributions of source code must retain the above copyright
|
7 |
// notice, this list of conditions and the following disclaimer.
|
8 |
// * Redistributions in binary form must reproduce the above
|
9 |
// copyright notice, this list of conditions and the following
|
10 |
// disclaimer in the documentation and/or other materials provided
|
11 |
// with the distribution.
|
12 |
// * Neither the name of Google Inc. nor the names of its
|
13 |
// contributors may be used to endorse or promote products derived
|
14 |
// from this software without specific prior written permission.
|
15 |
//
|
16 |
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
17 |
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
18 |
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
19 |
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
20 |
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
21 |
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
22 |
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
23 |
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
24 |
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
25 |
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
26 |
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
27 |
|
28 |
#include "v8.h" |
29 |
|
30 |
#include "ast.h" |
31 |
#include "scanner.h" |
32 |
|
33 |
namespace v8 { namespace internal { |
34 |
|
35 |
// ----------------------------------------------------------------------------
|
36 |
// Character predicates
|
37 |
|
38 |
|
39 |
// Definitions of Scanner's static character-class predicate members. The
// scanner code in this file queries them with .get(c) to classify the
// look-ahead character.
// NOTE(review): the meaning of the 128 template argument is not visible
// here -- confirm against the unibrow::Predicate declaration.
unibrow::Predicate<IdentifierStart, 128> Scanner::kIsIdentifierStart;
|
40 |
unibrow::Predicate<IdentifierPart, 128> Scanner::kIsIdentifierPart;
|
41 |
unibrow::Predicate<unibrow::LineTerminator, 128> Scanner::kIsLineTerminator;
|
42 |
unibrow::Predicate<unibrow::WhiteSpace, 128> Scanner::kIsWhiteSpace;
|
43 |
|
44 |
|
45 |
// Definition of Scanner's static utf8_decoder_ member.
StaticResource<Scanner::Utf8Decoder> Scanner::utf8_decoder_; |
46 |
|
47 |
|
48 |
// ----------------------------------------------------------------------------
|
49 |
// UTF8Buffer
|
50 |
|
51 |
// Constructs an empty buffer. data_ is nulled in the initializer list so
// that the DeleteArray(data_) performed by Initialize() never sees an
// indeterminate pointer.
UTF8Buffer::UTF8Buffer()
    : data_(NULL) {
  char* const no_storage = NULL;
  Initialize(no_storage, 0);
}
54 |
|
55 |
|
56 |
// Releases the character array currently held in data_.
UTF8Buffer::~UTF8Buffer() {
  char* const held = data_;
  DeleteArray(held);
}
59 |
|
60 |
|
61 |
// Replaces the buffer's storage with |src|, recording |length| as its
// capacity. The previously held array is freed, and |src| is adopted: the
// destructor and any later Initialize() call pass it to DeleteArray().
void UTF8Buffer::Initialize(char* src, int length) {
  // Drop the old array before adopting the new one.
  DeleteArray(data_);

  size_ = length;
  data_ = src;

  // NOTE(review): Reset() is declared outside this file; presumably it
  // rewinds the write position -- confirm.
  Reset();
}
67 |
|
68 |
|
69 |
// Appends the UTF-8 encoding of |c| at the current write position, growing
// the backing array first when fewer than Utf8::kMaxEncodedSize bytes remain.
// Growth doubles the capacity, with a floor of 1024 bytes.
void UTF8Buffer::AddChar(uc32 c) {
  const int min_size = 1024;
  if (pos_ + static_cast<int>(unibrow::Utf8::kMaxEncodedSize) > size_) {
    int new_size = size_ * 2;
    if (new_size < min_size) {
      new_size = min_size;
    }
    char* new_data = NewArray<char>(new_size);
    // A default-constructed buffer has data_ == NULL; calling memcpy with a
    // null source is undefined even for a zero length, so only copy when
    // there is something to carry over.
    if (pos_ > 0) {
      memcpy(new_data, data_, pos_);
    }
    DeleteArray(data_);
    data_ = new_data;
    size_ = new_size;
  }
  if (static_cast<unsigned>(c) < unibrow::Utf8::kMaxOneByteChar) {
    data_[pos_++] = c;  // common case: 7bit ASCII
  } else {
    // Encode() returns the number of bytes it wrote.
    pos_ += unibrow::Utf8::Encode(&data_[pos_], c);
  }
  ASSERT(pos_ <= size_);
}
89 |
|
90 |
|
91 |
// ----------------------------------------------------------------------------
|
92 |
// UTF16Buffer
|
93 |
|
94 |
|
95 |
// Constructs a buffer at position 0 with no input stream attached;
// Initialize() must supply the stream before Advance() is used.
UTF16Buffer::UTF16Buffer()
    : pos_(0), pushback_buffer_(0), last_(0), stream_(NULL) {}
|
100 |
|
101 |
|
102 |
// Attaches the buffer to a source string and the character stream that
// Advance() reads from, and rewinds the position to 0.
void UTF16Buffer::Initialize(Handle<String> data,
                             unibrow::CharacterStream* input) {
  stream_ = input;
  data_ = data;
  pos_ = 0;
}
108 |
|
109 |
|
110 |
// Returns the result of internal::SubString() applied to the source string
// with the given |start| and |end|.
Handle<String> UTF16Buffer::SubString(int start, int end) {
  Handle<String> slice = internal::SubString(data_, start, end);
  return slice;
}
113 |
|
114 |
|
115 |
// Undoes one Advance(): the character that was current (last_) is saved on
// the pushback buffer so a later Advance() returns it again, |ch| becomes
// the current character, and the position moves back by one.
void UTF16Buffer::PushBack(uc32 ch) {
  pushback_buffer()->Add(last_);
  pos_ -= 1;
  last_ = ch;
}
120 |
|
121 |
|
122 |
// Returns the next character and remembers it in last_. Pushed-back
// characters are replayed first, then the stream is consulted; once both are
// exhausted the result is static_cast<uc32>(-1).
//
// NOTE: Unicode format-control characters are deliberately *not* stripped
// here, although ECMA-262, section 7.1, page 11 says they must be. Keeping
// them matters for Persian / Farsi resources (see
// https://bugzilla.mozilla.org/show_bug.cgi?id=274152) and matches what IE
// and SpiderMonkey do.
uc32 UTF16Buffer::Advance() {
  // The position advances on every call, including past the end of input;
  // note: that increment is currently necessary to avoid a test-parser
  // problem.
  pos_++;

  if (!pushback_buffer()->is_empty()) {
    last_ = pushback_buffer()->RemoveLast();
  } else if (stream_->has_more()) {
    const uc32 fetched = stream_->GetNext();
    last_ = fetched;
  } else {
    last_ = static_cast<uc32>(-1);
  }
  return last_;
}
145 |
|
146 |
|
147 |
// Repositions both the buffer and its stream at |pos|. No pushed-back
// characters may be pending when this is called.
void UTF16Buffer::SeekForward(int pos) {
  ASSERT(pushback_buffer()->is_empty());
  pos_ = pos;
  stream_->Seek(pos);
}
152 |
|
153 |
|
154 |
// ----------------------------------------------------------------------------
|
155 |
// Scanner
|
156 |
|
157 |
// Creates a scanner with the stack-overflow flag cleared. |pre| is stored in
// is_pre_parsing_. Token::Initialize() is invoked on every construction.
Scanner::Scanner(bool pre)
    : stack_overflow_(false),
      is_pre_parsing_(pre) {
  Token::Initialize();
}
160 |
|
161 |
|
162 |
// Prepares the scanner to read |source| through |stream|. |position| is
// stored in position_, which SubString() subtracts from the offsets it is
// given. On return the first token has already been scanned.
void Scanner::Init(Handle<String> source, unibrow::CharacterStream* stream,
                   int position) {
  position_ = position;

  // Hook up the input and start with an empty literal buffer.
  source_.Initialize(source, stream);
  literals_.Reset();

  // Prime the one-character look-ahead (c0_).
  ASSERT(kCharacterLookaheadBufferSize == 1);
  Advance();

  // Skip leading whitespace -- passing true allows an HTML comment end at
  // the very start of the input -- and then scan the first token.
  SkipWhiteSpace(true);
  Scan();
}
180 |
|
181 |
|
182 |
// Returns source_.SubString() for |start| and |end| after subtracting the
// position_ base that was given to Init().
Handle<String> Scanner::SubString(int start, int end) {
  const int relative_start = start - position_;
  const int relative_end = end - position_;
  return source_.SubString(relative_start, relative_end);
}
185 |
|
186 |
|
187 |
// Makes the look-ahead token current, scans a new look-ahead, and returns
// the type of the now-current token. If the stack limit has been exceeded,
// no scanning happens: the overflow flag is set and the look-ahead becomes
// Token::ILLEGAL.
Token::Value Scanner::Next() {
  // BUG 1215673: Find a thread safe way to set a stack limit in
  // pre-parse mode. Otherwise, we cannot safely pre-parse from other
  // threads.
  current_ = next_;

  // The stack check comes before any further scanning.
  StackLimitCheck check;
  if (!check.HasOverflowed()) {
    Scan();
    return current_.token;
  }

  stack_overflow_ = true;
  next_.token = Token::ILLEGAL;
  return current_.token;
}
202 |
|
203 |
|
204 |
void Scanner::StartLiteral() {
|
205 |
next_.literal_pos = literals_.pos(); |
206 |
} |
207 |
|
208 |
|
209 |
// Appends |c| to the literal buffer.
void Scanner::AddChar(uc32 c) {
  this->literals_.AddChar(c);
}
212 |
|
213 |
|
214 |
void Scanner::TerminateLiteral() {
|
215 |
next_.literal_end = literals_.pos(); |
216 |
AddChar(0);
|
217 |
} |
218 |
|
219 |
|
220 |
void Scanner::AddCharAdvance() {
|
221 |
AddChar(c0_); |
222 |
Advance(); |
223 |
} |
224 |
|
225 |
|
226 |
void Scanner::Advance() {
|
227 |
c0_ = source_.Advance(); |
228 |
} |
229 |
|
230 |
|
231 |
// Undoes one Advance(): the source buffer is told to push back, and |ch|
// becomes the look-ahead character again.
void Scanner::PushBack(uc32 ch) {
  this->source_.PushBack(ch);
  this->c0_ = ch;
}
235 |
|
236 |
|
237 |
// Returns true for U+FEFF and for U+FFFE.
//
// U+FFFE is guaranteed never to be assigned as a Unicode character, so in a
// Unicode context the byte pattern 0xFF, 0xFE can only be U+FEFF in
// little-endian byte order, never U+FFFE in big-endian order. It is accepted
// here anyway for compatibility with Spidermonkey.
static inline bool IsByteOrderMark(uc32 c) {
  switch (c) {
    case 0xFEFF:
    case 0xFFFE:
      return true;
    default:
      return false;
  }
}
247 |
|
248 |
|
249 |
void Scanner::SkipWhiteSpace(bool initial) { |
250 |
has_line_terminator_before_next_ = initial; |
251 |
|
252 |
while (true) { |
253 |
// We treat byte-order marks (BOMs) as whitespace for better
|
254 |
// compatibility with Spidermonkey and other JavaScript engines.
|
255 |
while (kIsWhiteSpace.get(c0_) || IsByteOrderMark(c0_)) {
|
256 |
// IsWhiteSpace() includes line terminators!
|
257 |
if (kIsLineTerminator.get(c0_))
|
258 |
// Ignore line terminators, but remember them. This is necessary
|
259 |
// for automatic semicolon insertion.
|
260 |
has_line_terminator_before_next_ = true;
|
261 |
Advance(); |
262 |
} |
263 |
|
264 |
// If there is an HTML comment end '-->' at the beginning of a
|
265 |
// line (with only whitespace in front of it), we treat the rest
|
266 |
// of the line as a comment. This is in line with the way
|
267 |
// SpiderMonkey handles it.
|
268 |
if (c0_ == '-' && has_line_terminator_before_next_) { |
269 |
Advance(); |
270 |
if (c0_ == '-') { |
271 |
Advance(); |
272 |
if (c0_ == '>') { |
273 |
// Treat the rest of the line as a comment.
|
274 |
SkipSingleLineComment(); |
275 |
// Continue skipping white space after the comment.
|
276 |
continue;
|
277 |
} |
278 |
PushBack('-'); // undo Advance() |
279 |
} |
280 |
PushBack('-'); // undo Advance() |
281 |
} |
282 |
return;
|
283 |
} |
284 |
} |
285 |
|
286 |
|
287 |
// Consumes input up to, but not including, the next line terminator or the
// end of input, and returns Token::COMMENT.
//
// The terminating line terminator is not part of the comment: the lexical
// grammar recognizes it separately and it becomes part of the input element
// stream for the syntactic grammar (ECMA-262, section 7.4, page 12).
Token::Value Scanner::SkipSingleLineComment() {
  Advance();

  for (;;) {
    if (c0_ < 0) break;                      // end of input
    if (kIsLineTerminator.get(c0_)) break;   // end of line
    Advance();
  }

  return Token::COMMENT;
}
301 |
|
302 |
|
303 |
// Skips a /* ... */ comment. On entry c0_ is the '*' that opens the comment.
// Returns Token::COMMENT when the closing "*/" is found, or Token::ILLEGAL if
// the input ends first.
Token::Value Scanner::SkipMultiLineComment() {
  ASSERT(c0_ == '*');
  Advance();

  while (c0_ >= 0) {
    // Keep the full code point. Narrowing it to char would make any
    // character whose low byte is 0x2A (e.g. U+262A) compare equal to '*'
    // and end the comment early when followed by '/'.
    uc32 ch = c0_;
    Advance();
    // If we have reached the end of the multi-line comment, we
    // consume the '/' and insert a whitespace. This way all
    // multi-line comments are treated as whitespace - even the ones
    // containing line terminators. This contradicts ECMA-262, section
    // 7.4, page 12, that says that multi-line comments containing
    // line terminators should be treated as a line terminator, but it
    // matches the behaviour of SpiderMonkey and KJS.
    if (ch == '*' && c0_ == '/') {
      c0_ = ' ';
      return Token::COMMENT;
    }
  }

  // Unterminated multi-line comment.
  return Token::ILLEGAL;
}
326 |
|
327 |
|
328 |
// Called with c0_ on the '!' that follows a '<'. If the input continues with
// "--" the rest of the line is skipped as a comment; otherwise the consumed
// characters are pushed back, leaving c0_ on the '!', and Token::LT is
// returned.
Token::Value Scanner::ScanHtmlComment() {
  ASSERT(c0_ == '!');
  Advance();

  const bool first_dash = (c0_ == '-');
  if (first_dash) {
    Advance();
    if (c0_ == '-') {
      return SkipSingleLineComment();
    }
    PushBack('-');  // undo the Advance() past the first dash
  }

  PushBack('!');  // undo the Advance() past the '!'
  ASSERT(c0_ == '!');
  return Token::LT;
}
341 |
|
342 |
|
343 |
void Scanner::Scan() {
|
344 |
Token::Value token; |
345 |
bool has_line_terminator = false; |
346 |
do {
|
347 |
SkipWhiteSpace(has_line_terminator); |
348 |
|
349 |
// Remember the line terminator in previous loop
|
350 |
has_line_terminator = has_line_terminator_before_next(); |
351 |
|
352 |
// Remember the position of the next token
|
353 |
next_.location.beg_pos = source_pos(); |
354 |
|
355 |
token = ScanToken(); |
356 |
} while (token == Token::COMMENT);
|
357 |
|
358 |
next_.location.end_pos = source_pos(); |
359 |
next_.token = token; |
360 |
} |
361 |
|
362 |
|
363 |
// Moves the scanner to |pos| and scans the token found there. The source is
// positioned one character earlier because the Advance() that loads c0_
// moves it forward by one.
void Scanner::SeekForward(int pos) {
  const int before_target = pos - 1;
  source_.SeekForward(before_target);
  Advance();
  Scan();
}
368 |
|
369 |
|
370 |
// Reads exactly |length| hex digits starting at c0_ and returns their value.
// If a non-hex character is met first, every digit consumed so far is pushed
// back and |c| (the escape letter) is returned instead, so that e.g. "\u"
// without valid digits yields "u". |length| must not exceed 4.
uc32 Scanner::ScanHexEscape(uc32 c, int length) {
  ASSERT(length <= 4);  // prevent overflow of the local array

  uc32 consumed[4];
  uc32 value = 0;
  int count = 0;
  while (count < length) {
    consumed[count] = c0_;
    const int nibble = HexValue(c0_);
    if (nibble < 0) {
      // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
      // should be illegal, but other JS VMs just return the
      // non-escaped version of the original character.

      // Push back the digits already read, newest first. The offending
      // character itself is still in c0_ and was never consumed.
      while (--count >= 0) {
        PushBack(consumed[count]);
      }
      // Notice: No handling of error - treat it as "\u"->"u".
      return c;
    }
    value = value * 16 + nibble;
    Advance();
    count++;
  }

  return value;
}
396 |
|
397 |
|
398 |
// Octal escapes of the forms '\0xx' and '\xxx' are not a part of
|
399 |
// ECMA-262. Other JS VMs support them.
|
400 |
// Completes an octal escape whose first digit |c| has already been consumed.
// Up to |length| further octal digits are read, stopping early at a
// non-octal character or when another digit would push the value to 256 or
// beyond. Returns the accumulated value.
uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
  uc32 value = c - '0';
  for (int remaining = length; remaining > 0; remaining--) {
    const int digit = c0_ - '0';
    if (digit < 0 || digit > 7) break;
    const int widened = value * 8 + digit;
    if (widened >= 256) break;
    value = widened;
    Advance();
  }
  return value;
}
412 |
|
413 |
|
414 |
void Scanner::ScanEscape() {
|
415 |
uc32 c = c0_; |
416 |
Advance(); |
417 |
|
418 |
// Skip escaped newlines.
|
419 |
if (kIsLineTerminator.get(c)) {
|
420 |
// Allow CR+LF newlines in multiline string literals.
|
421 |
if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
|
422 |
// Allow LF+CR newlines in multiline string literals.
|
423 |
if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
|
424 |
return;
|
425 |
} |
426 |
|
427 |
switch (c) {
|
428 |
case '\'': // fall through |
429 |
case '"' : // fall through |
430 |
case '\\': break; |
431 |
case 'b' : c = '\b'; break; |
432 |
case 'f' : c = '\f'; break; |
433 |
case 'n' : c = '\n'; break; |
434 |
case 'r' : c = '\r'; break; |
435 |
case 't' : c = '\t'; break; |
436 |
case 'u' : c = ScanHexEscape(c, 4); break; |
437 |
case 'v' : c = '\v'; break; |
438 |
case 'x' : c = ScanHexEscape(c, 2); break; |
439 |
case '0' : // fall through |
440 |
case '1' : // fall through |
441 |
case '2' : // fall through |
442 |
case '3' : // fall through |
443 |
case '4' : // fall through |
444 |
case '5' : // fall through |
445 |
case '6' : // fall through |
446 |
case '7' : c = ScanOctalEscape(c, 2); break; |
447 |
} |
448 |
|
449 |
// According to ECMA-262, 3rd, 7.8.4 (p 18ff) these
|
450 |
// should be illegal, but they are commonly handled
|
451 |
// as non-escaped characters by JS VMs.
|
452 |
AddChar(c); |
453 |
} |
454 |
|
455 |
|
456 |
// Scans a string literal delimited by the quote character in c0_. Returns
// Token::STRING with the decoded contents stored as the literal, or
// Token::ILLEGAL if a line terminator or the end of input comes before the
// closing quote.
Token::Value Scanner::ScanString() {
  const uc32 quote = c0_;
  Advance();  // consume opening quote

  StartLiteral();
  for (;;) {
    if (c0_ == quote) break;
    if (c0_ < 0) break;
    if (kIsLineTerminator.get(c0_)) break;

    const uc32 current = c0_;
    Advance();
    if (current != '\\') {
      AddChar(current);
      continue;
    }
    // A backslash must be followed by something to escape.
    if (c0_ < 0) return Token::ILLEGAL;
    ScanEscape();
  }

  // Anything other than the closing quote here means the literal was cut
  // short.
  if (c0_ != quote) {
    return Token::ILLEGAL;
  }
  TerminateLiteral();

  Advance();  // consume closing quote
  return Token::STRING;
}
479 |
|
480 |
|
481 |
// Consumes the look-ahead character and returns |tok| unchanged.
Token::Value Scanner::Select(Token::Value tok) {
  this->Advance();
  return tok;
}
485 |
|
486 |
|
487 |
// Consumes the look-ahead character, then checks whether the following one
// equals |next|. If so it is consumed too and |then| is returned; otherwise
// it is left in place and |else_| is returned.
Token::Value Scanner::Select(uc32 next, Token::Value then, Token::Value else_) {
  Advance();
  if (c0_ != next) return else_;
  Advance();
  return then;
}
496 |
|
497 |
|
498 |
// Scans a single token beginning at the look-ahead character c0_ and returns
// its type. Comments come back as Token::COMMENT, a negative c0_ produces
// Token::EOS, and an unrecognized character is consumed and reported as
// Token::ILLEGAL.
Token::Value Scanner::ScanToken() {
  switch (c0_) {
    // String literals.
    case '"':
    case '\'':
      return ScanString();

    // < <= << <<= <!--
    case '<':
      Advance();
      switch (c0_) {
        case '=': return Select(Token::LTE);
        case '<': return Select('=', Token::ASSIGN_SHL, Token::SHL);
        case '!': return ScanHtmlComment();
        default:  return Token::LT;
      }

    // > >= >> >>= >>> >>>=
    case '>':
      Advance();
      switch (c0_) {
        case '=':
          return Select(Token::GTE);
        case '>':
          // >> >>= >>> >>>=
          Advance();
          switch (c0_) {
            case '=': return Select(Token::ASSIGN_SAR);
            case '>': return Select('=', Token::ASSIGN_SHR, Token::SHR);
            default:  return Token::SAR;
          }
        default:
          return Token::GT;
      }

    // = == ===
    case '=':
      Advance();
      switch (c0_) {
        case '=': return Select('=', Token::EQ_STRICT, Token::EQ);
        default:  return Token::ASSIGN;
      }

    // ! != !==
    case '!':
      Advance();
      switch (c0_) {
        case '=': return Select('=', Token::NE_STRICT, Token::NE);
        default:  return Token::NOT;
      }

    // + ++ +=
    case '+':
      Advance();
      switch (c0_) {
        case '+': return Select(Token::INC);
        case '=': return Select(Token::ASSIGN_ADD);
        default:  return Token::ADD;
      }

    // - -- -=
    case '-':
      Advance();
      switch (c0_) {
        case '-': return Select(Token::DEC);
        case '=': return Select(Token::ASSIGN_SUB);
        default:  return Token::SUB;
      }

    // * *=
    case '*':
      return Select('=', Token::ASSIGN_MUL, Token::MUL);

    // % %=
    case '%':
      return Select('=', Token::ASSIGN_MOD, Token::MOD);

    // / // /* /=
    case '/':
      Advance();
      switch (c0_) {
        case '/': return SkipSingleLineComment();
        case '*': return SkipMultiLineComment();
        case '=': return Select(Token::ASSIGN_DIV);
        default:  return Token::DIV;
      }

    // & && &=
    case '&':
      Advance();
      switch (c0_) {
        case '&': return Select(Token::AND);
        case '=': return Select(Token::ASSIGN_BIT_AND);
        default:  return Token::BIT_AND;
      }

    // | || |=
    case '|':
      Advance();
      switch (c0_) {
        case '|': return Select(Token::OR);
        case '=': return Select(Token::ASSIGN_BIT_OR);
        default:  return Token::BIT_OR;
      }

    // ^ ^=
    case '^':
      return Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);

    // . Number
    case '.':
      Advance();
      if (IsDecimalDigit(c0_)) {
        return ScanNumber(true);
      }
      return Token::PERIOD;

    // Single-character punctuators.
    case ':': return Select(Token::COLON);
    case ';': return Select(Token::SEMICOLON);
    case ',': return Select(Token::COMMA);
    case '(': return Select(Token::LPAREN);
    case ')': return Select(Token::RPAREN);
    case '[': return Select(Token::LBRACK);
    case ']': return Select(Token::RBRACK);
    case '{': return Select(Token::LBRACE);
    case '}': return Select(Token::RBRACE);
    case '?': return Select(Token::CONDITIONAL);
    case '~': return Select(Token::BIT_NOT);

    default:
      if (kIsIdentifierStart.get(c0_)) {
        return ScanIdentifier();
      }
      if (IsDecimalDigit(c0_)) {
        return ScanNumber(false);
      }
      if (c0_ < 0) {
        return Token::EOS;
      }
      return Select(Token::ILLEGAL);
  }

  UNREACHABLE();
  return Token::ILLEGAL;
}
637 |
|
638 |
|
639 |
// Returns true if any decimal digits were scanned, returns false otherwise.
|
640 |
void Scanner::ScanDecimalDigits() {
|
641 |
while (IsDecimalDigit(c0_))
|
642 |
AddCharAdvance(); |
643 |
} |
644 |
|
645 |
|
646 |
// Scans a numeric literal whose first digit is in c0_. |seen_period| is true
// when the caller has already consumed a leading '.', in which case c0_ is
// the first fraction digit. Returns Token::NUMBER with the literal's text
// stored, or Token::ILLEGAL for a malformed number.
Token::Value Scanner::ScanNumber(bool seen_period) {
  ASSERT(IsDecimalDigit(c0_));  // the first digit of the number or the fraction

  enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;

  StartLiteral();
  if (seen_period) {
    // The decimal point was consumed by the caller; put it into the literal.
    AddChar('.');
    ScanDecimalDigits();  // we know we have at least one digit
  } else {
    // A leading '0' may introduce a hex or an octal number.
    if (c0_ == '0') {
      AddCharAdvance();

      // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number
      if (c0_ == 'x' || c0_ == 'X') {
        kind = HEX;
        AddCharAdvance();
        // we must have at least one hex digit after 'x'/'X'
        if (!IsHexDigit(c0_)) {
          return Token::ILLEGAL;
        }
        do {
          AddCharAdvance();
        } while (IsHexDigit(c0_));
      } else if ('0' <= c0_ && c0_ <= '7') {
        // (possible) octal number: take all octal digits, and fall back to
        // decimal if an '8' or '9' follows them.
        kind = OCTAL;
        while ('0' <= c0_ && c0_ <= '7') {
          AddCharAdvance();
        }
        if (c0_ == '8' || c0_ == '9') {
          kind = DECIMAL;
        }
      }
    }

    // Parse decimal digits and allow trailing fractional part.
    if (kind == DECIMAL) {
      ScanDecimalDigits();  // optional
      if (c0_ == '.') {
        AddCharAdvance();
        ScanDecimalDigits();  // optional
      }
    }
  }

  // scan exponent, if any
  if (c0_ == 'e' || c0_ == 'E') {
    ASSERT(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
    if (kind == OCTAL) {
      return Token::ILLEGAL;  // no exponent for octals allowed
    }
    AddCharAdvance();
    if (c0_ == '+' || c0_ == '-') {
      AddCharAdvance();
    }
    // we must have at least one decimal digit after 'e'/'E'
    if (!IsDecimalDigit(c0_)) {
      return Token::ILLEGAL;
    }
    ScanDecimalDigits();
  }
  TerminateLiteral();

  // The source character immediately following a numeric literal must
  // not be an identifier start or a decimal digit; see ECMA-262
  // section 7.8.3, page 17 (note that we read only one decimal digit
  // if the value is 0).
  const bool bad_follower =
      IsDecimalDigit(c0_) || kIsIdentifierStart.get(c0_);
  if (bad_follower) {
    return Token::ILLEGAL;
  }

  return Token::NUMBER;
}
721 |
|
722 |
|
723 |
// Decodes a \uXXXX escape inside an identifier; callers invoke it with c0_
// on the backslash. Returns the decoded character, or
// unibrow::Utf8::kBadChar if the backslash is not followed by 'u' or if the
// escape decodes to a backslash.
uc32 Scanner::ScanIdentifierUnicodeEscape() {
  Advance();  // step past the current character
  if (c0_ != 'u') {
    return unibrow::Utf8::kBadChar;
  }

  Advance();  // step past the 'u'
  const uc32 decoded = ScanHexEscape('u', 4);

  // We do not allow a unicode escape sequence to start another
  // unicode escape sequence.
  if (decoded == '\\') {
    return unibrow::Utf8::kBadChar;
  }
  return decoded;
}
733 |
|
734 |
|
735 |
// Scans an identifier or keyword starting at c0_. Unicode escapes are
// decoded into the literal. Returns Token::ILLEGAL if an escape decodes to a
// character that is not allowed at its position, Token::IDENTIFIER for plain
// identifiers, and otherwise whatever Token::Lookup() reports for the
// literal text.
Token::Value Scanner::ScanIdentifier() {
  ASSERT(kIsIdentifierStart.get(c0_));

  bool saw_escape = false;

  StartLiteral();

  // First character: it must be a legal identifier start, whether written
  // directly or as an escape.
  if (c0_ != '\\') {
    AddCharAdvance();
  } else {
    saw_escape = true;
    const uc32 decoded = ScanIdentifierUnicodeEscape();
    if (!kIsIdentifierStart.get(decoded)) return Token::ILLEGAL;
    AddChar(decoded);
  }

  // Remaining characters: each must be a legal identifier part.
  while (kIsIdentifierPart.get(c0_)) {
    if (c0_ != '\\') {
      AddCharAdvance();
      continue;
    }
    saw_escape = true;
    const uc32 decoded = ScanIdentifierUnicodeEscape();
    if (!kIsIdentifierPart.get(decoded)) return Token::ILLEGAL;
    AddChar(decoded);
  }
  TerminateLiteral();

  // We don't have any 1-letter keywords (this is probably a common case),
  // and an identifier containing unicode escapes must not be resolved to a
  // keyword either.
  const bool length_is_one = (next_.literal_end - next_.literal_pos) == 1;
  if (length_is_one || saw_escape) {
    return Token::IDENTIFIER;
  }

  return Token::Lookup(&literals_.data()[next_.literal_pos]);
}
776 |
|
777 |
|
778 |
|
779 |
// Returns true if |buffer| holds a non-empty identifier written without
// escapes: an identifier-start character followed only by identifier-part
// characters. The stream is consumed up to the first offending character, or
// completely on success.
bool Scanner::IsIdentifier(unibrow::CharacterStream* buffer) {
  if (buffer->has_more() && kIsIdentifierStart.get(buffer->GetNext())) {
    for (;;) {
      if (!buffer->has_more()) return true;
      if (!kIsIdentifierPart.get(buffer->GetNext())) return false;
    }
  }
  return false;
}
788 |
|
789 |
|
790 |
bool Scanner::ScanRegExpPattern(bool seen_equal) { |
791 |
// Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
|
792 |
bool in_character_class = false; |
793 |
|
794 |
// Previous token is either '/' or '/=', in the second case, the
|
795 |
// pattern starts at =.
|
796 |
next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1); |
797 |
next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0); |
798 |
|
799 |
// Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
|
800 |
// the scanner should pass uninterpreted bodies to the RegExp
|
801 |
// constructor.
|
802 |
StartLiteral(); |
803 |
if (seen_equal)
|
804 |
AddChar('=');
|
805 |
|
806 |
while (c0_ != '/' || in_character_class) { |
807 |
if (kIsLineTerminator.get(c0_) || c0_ < 0) |
808 |
return false; |
809 |
if (c0_ == '\\') { // escaped character |
810 |
AddCharAdvance(); |
811 |
if (kIsLineTerminator.get(c0_) || c0_ < 0) |
812 |
return false; |
813 |
AddCharAdvance(); |
814 |
} else { // unescaped character |
815 |
if (c0_ == '[') |
816 |
in_character_class = true;
|
817 |
if (c0_ == ']') |
818 |
in_character_class = false;
|
819 |
AddCharAdvance(); |
820 |
} |
821 |
} |
822 |
Advance(); // consume '/'
|
823 |
|
824 |
TerminateLiteral(); |
825 |
|
826 |
return true; |
827 |
} |
828 |
|
829 |
bool Scanner::ScanRegExpFlags() {
|
830 |
// Scan regular expression flags.
|
831 |
StartLiteral(); |
832 |
while (kIsIdentifierPart.get(c0_)) {
|
833 |
if (c0_ == '\\') { |
834 |
uc32 c = ScanIdentifierUnicodeEscape(); |
835 |
if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) { |
836 |
// We allow any escaped character, unlike the restriction on
|
837 |
// IdentifierPart when it is used to build an IdentifierName.
|
838 |
AddChar(c); |
839 |
continue;
|
840 |
} |
841 |
} |
842 |
AddCharAdvance(); |
843 |
} |
844 |
TerminateLiteral(); |
845 |
|
846 |
next_.location.end_pos = source_pos() - 1;
|
847 |
return true; |
848 |
} |
849 |
|
850 |
} } // namespace v8::internal
|