The data contained in this repository can be downloaded to your computer using one of several clients.
Please see the documentation of your version control software client for more information.

Please select the desired protocol below to get the URL.

This URL has Read-Only access.

Statistics
| Branch: | Revision:

main_repo / deps / v8 / src / unicode.h @ 40c0f755

History | View | Annotate | Download (8.56 KB)

1
// Copyright 2007-2008 the V8 project authors. All rights reserved.
2
// Redistribution and use in source and binary forms, with or without
3
// modification, are permitted provided that the following conditions are
4
// met:
5
//
6
//     * Redistributions of source code must retain the above copyright
7
//       notice, this list of conditions and the following disclaimer.
8
//     * Redistributions in binary form must reproduce the above
9
//       copyright notice, this list of conditions and the following
10
//       disclaimer in the documentation and/or other materials provided
11
//       with the distribution.
12
//     * Neither the name of Google Inc. nor the names of its
13
//       contributors may be used to endorse or promote products derived
14
//       from this software without specific prior written permission.
15
//
16
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27

    
28
#ifndef __UNIBROW_H__
29
#define __UNIBROW_H__
30

    
31
#include <sys/types.h>
32

    
33
/**
34
 * \file
35
 * Definitions and convenience functions for working with Unicode.
36
 */
37

    
38
namespace unibrow {
39

    
40
// Character value type used by the declarations in this header.
typedef unsigned uchar;
// A single 8-bit storage unit.
typedef unsigned char byte;

/**
 * Upper bound on the number of characters produced when the case of one
 * character is converted.
 */
static const int kMaxMappingSize = 4;
48

    
49
template <class T, int size = 256>
50
class Predicate {
51
 public:
52
  inline Predicate() { }
53
  inline bool get(uchar c);
54
 private:
55
  friend class Test;
56
  bool CalculateValue(uchar c);
57
  struct CacheEntry {
58
    inline CacheEntry() : code_point_(0), value_(0) { }
59
    inline CacheEntry(uchar code_point, bool value)
60
      : code_point_(code_point),
61
        value_(value) { }
62
    uchar code_point_ : 21;
63
    bool value_ : 1;
64
  };
65
  static const int kSize = size;
66
  static const int kMask = kSize - 1;
67
  CacheEntry entries_[kSize];
68
};
69

    
70
// A cache used in case conversion.  It caches the value for characters
71
// that either have no mapping or map to a single character independent
72
// of context.  Characters that map to more than one character or that
73
// map differently depending on context are always looked up.
74
template <class T, int size = 256>
75
class Mapping {
76
 public:
77
  inline Mapping() { }
78
  inline int get(uchar c, uchar n, uchar* result);
79
 private:
80
  friend class Test;
81
  int CalculateValue(uchar c, uchar n, uchar* result);
82
  struct CacheEntry {
83
    inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
84
    inline CacheEntry(uchar code_point, signed offset)
85
      : code_point_(code_point),
86
        offset_(offset) { }
87
    uchar code_point_;
88
    signed offset_;
89
    static const int kNoChar = (1 << 21) - 1;
90
  };
91
  static const int kSize = size;
92
  static const int kMask = kSize - 1;
93
  CacheEntry entries_[kSize];
94
};
95

    
96
class UnicodeData {
97
 private:
98
  friend class Test;
99
  static int GetByteCount();
100
  static uchar kMaxCodePoint;
101
};
102

    
103
// --- U t f   8 ---
104

    
105
/**
 * A lightweight (data, length) pair describing a run of input.  Data is
 * whatever handle type the caller uses for the payload (this file
 * instantiates it with const char*).  The handle is stored by value and is
 * never dereferenced by this class.
 */
template <typename Data>
class Buffer {
 public:
  inline Buffer(Data data, unsigned length) : data_(data), length_(length) { }
  // The default buffer holds a zero handle and a length of zero.
  inline Buffer() : data_(0), length_(0) { }
  // The accessors do not modify the buffer, so they are const-qualified;
  // this lets them be called on const Buffer objects and const references.
  Data data() const { return data_; }
  unsigned length() const { return length_; }
 private:
  Data data_;
  unsigned length_;
};
116

    
117
class Utf8 {
118
 public:
119
  static inline uchar Length(uchar chr);
120
  static inline unsigned Encode(char* out, uchar c);
121
  static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
122
      unsigned capacity, unsigned* chars_read, unsigned* offset);
123
  static const uchar kBadChar = 0xFFFD;
124
  static const unsigned kMaxEncodedSize   = 4;
125
  static const unsigned kMaxOneByteChar   = 0x7f;
126
  static const unsigned kMaxTwoByteChar   = 0x7ff;
127
  static const unsigned kMaxThreeByteChar = 0xffff;
128
  static const unsigned kMaxFourByteChar  = 0x1fffff;
129

    
130
 private:
131
  template <unsigned s> friend class Utf8InputBuffer;
132
  friend class Test;
133
  static inline uchar ValueOf(const byte* str,
134
                              unsigned length,
135
                              unsigned* cursor);
136
  static uchar CalculateValue(const byte* str,
137
                              unsigned length,
138
                              unsigned* cursor);
139
};
140

    
141
// --- C h a r a c t e r   S t r e a m ---
142

    
143
class CharacterStream {
144
 public:
145
  inline uchar GetNext();
146
  inline bool has_more() { return remaining_ != 0; }
147
  // Note that default implementation is not efficient.
148
  virtual void Seek(unsigned);
149
  unsigned Length();
150
  virtual ~CharacterStream() { }
151
  static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity,
152
      unsigned& offset);
153
  static inline bool EncodeAsciiCharacter(uchar c, byte* buffer,
154
      unsigned capacity, unsigned& offset);
155
  static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer,
156
      unsigned capacity, unsigned& offset);
157
  static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset);
158
  virtual void Rewind() = 0;
159
 protected:
160
  virtual void FillBuffer() = 0;
161
  // The number of characters left in the current buffer
162
  unsigned remaining_;
163
  // The current offset within the buffer
164
  unsigned cursor_;
165
  // The buffer containing the decoded characters.
166
  const byte* buffer_;
167
};
168

    
169
// --- I n p u t   B u f f e r ---
170

    
171
/**
172
 * Provides efficient access to encoded characters in strings.  It
173
 * does so by reading characters one block at a time, rather than one
174
 * character at a time, which gives string implementations an
175
 * opportunity to optimize the decoding.
176
 */
177
template <class Reader, class Input = Reader*, unsigned kSize = 256>
178
class InputBuffer : public CharacterStream {
179
 public:
180
  virtual void Rewind();
181
  inline void Reset(Input input);
182
  void Seek(unsigned position);
183
  inline void Reset(unsigned position, Input input);
184
 protected:
185
  InputBuffer() { }
186
  explicit InputBuffer(Input input) { Reset(input); }
187
  virtual void FillBuffer();
188

    
189
  // A custom offset that can be used by the string implementation to
190
  // mark progress within the encoded string.
191
  unsigned offset_;
192
  // The input string
193
  Input input_;
194
  // To avoid heap allocation, we keep an internal buffer to which
195
  // the encoded string can write its characters.  The string
196
  // implementation is free to decide whether it wants to use this
197
  // buffer or not.
198
  byte util_buffer_[kSize];
199
};
200

    
201
// --- U t f 8   I n p u t   B u f f e r ---
202

    
203
template <unsigned s = 256>
204
class Utf8InputBuffer : public InputBuffer<Utf8, Buffer<const char*>, s> {
205
 public:
206
  inline Utf8InputBuffer() { }
207
  inline Utf8InputBuffer(const char* data, unsigned length);
208
  inline void Reset(const char* data, unsigned length) {
209
    InputBuffer<Utf8, Buffer<const char*>, s>::Reset(
210
        Buffer<const char*>(data, length));
211
  }
212
};
213

    
214
struct Uppercase {
215
  static bool Is(uchar c);
216
};
217
struct Lowercase {
218
  static bool Is(uchar c);
219
};
220
struct Letter {
221
  static bool Is(uchar c);
222
};
223
struct Space {
224
  static bool Is(uchar c);
225
};
226
struct Number {
227
  static bool Is(uchar c);
228
};
229
struct WhiteSpace {
230
  static bool Is(uchar c);
231
};
232
struct LineTerminator {
233
  static bool Is(uchar c);
234
};
235
struct CombiningMark {
236
  static bool Is(uchar c);
237
};
238
struct ConnectorPunctuation {
239
  static bool Is(uchar c);
240
};
241
struct ToLowercase {
242
  static const int kMaxWidth = 3;
243
  static int Convert(uchar c,
244
                     uchar n,
245
                     uchar* result,
246
                     bool* allow_caching_ptr);
247
};
248
struct ToUppercase {
249
  static const int kMaxWidth = 3;
250
  static int Convert(uchar c,
251
                     uchar n,
252
                     uchar* result,
253
                     bool* allow_caching_ptr);
254
};
255
struct Ecma262Canonicalize {
256
  static const int kMaxWidth = 1;
257
  static int Convert(uchar c,
258
                     uchar n,
259
                     uchar* result,
260
                     bool* allow_caching_ptr);
261
};
262
struct Ecma262UnCanonicalize {
263
  static const int kMaxWidth = 4;
264
  static int Convert(uchar c,
265
                     uchar n,
266
                     uchar* result,
267
                     bool* allow_caching_ptr);
268
};
269
struct CanonicalizationRange {
270
  static const int kMaxWidth = 1;
271
  static int Convert(uchar c,
272
                     uchar n,
273
                     uchar* result,
274
                     bool* allow_caching_ptr);
275
};
276

    
277
}  // namespace unibrow
278

    
279
#endif  // __UNIBROW_H__