-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcodepoints.cpp
130 lines (122 loc) · 2.36 KB
/
codepoints.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#include "codepoints.hpp"
using namespace std;
// Copies `text` into a `string` by handing its [begin, end) iterator range to
// the string's range constructor, so each element is converted to `char`.
// NOTE(review): as the name suggests, the caller is presumably vouching that
// every element fits in eight bits; nothing here checks that -- confirm at
// call sites.
string ASSUME_EIGHT_BIT(const i7_string&text) {
  string narrowed(text.begin(), text.end());
  return narrowed;
}
// Reports whether `codepoint` belongs to the whitespace set used by this file.
// The set was drawn up with reference to Unicode 6.0, Chapter 4.6, with one
// addition: U+007C (`|'), which is an Inform 7 paragraph break.
bool is_i7_whitespace(i7_codepoint codepoint) {
  // U+0009..U+000D: tab, line feed, vertical tab, form feed, carriage return.
  if (0x0009 <= codepoint && codepoint <= 0x000D) {
    return true;
  }
  // U+2000..U+200A: the run of fixed-width spaces (EN QUAD .. HAIR SPACE).
  if (0x2000 <= codepoint && codepoint <= 0x200A) {
    return true;
  }
  // Everything else in the set is an isolated code point.
  return codepoint == 0x0020     // SPACE
         || codepoint == 0x007C  // `|', an Inform 7 paragraph break.
         || codepoint == 0x0085  // NEXT LINE
         || codepoint == 0x00A0  // NO-BREAK SPACE
         || codepoint == 0x1680  // OGHAM SPACE MARK
         || codepoint == 0x180E  // MONGOLIAN VOWEL SEPARATOR
         || codepoint == 0x2028  // LINE SEPARATOR
         || codepoint == 0x2029  // PARAGRAPH SEPARATOR
         || codepoint == 0x202F  // NARROW NO-BREAK SPACE
         || codepoint == 0x205F  // MEDIUM MATHEMATICAL SPACE
         || codepoint == 0x3000; // IDEOGRAPHIC SPACE
}
// Reports whether `codepoint` is one of the fourteen ASCII code points this
// file classifies as punctuation.  The set is expressed as the contiguous
// runs it happens to form; note that U+002C (comma) sits in the gap between
// two runs and is deliberately not matched, exactly as in the explicit list
// this replaces.
bool is_i7_punctuation(i7_codepoint codepoint) {
  // U+0021..U+0022: ! "
  if (0x0021 <= codepoint && codepoint <= 0x0022) {
    return true;
  }
  // U+0027..U+002B: ' ( ) * +
  if (0x0027 <= codepoint && codepoint <= 0x002B) {
    return true;
  }
  // U+002D..U+002E: - .
  if (0x002D <= codepoint && codepoint <= 0x002E) {
    return true;
  }
  // U+003A..U+003B: : ;
  if (0x003A <= codepoint && codepoint <= 0x003B) {
    return true;
  }
  // U+005B..U+005D: [ \ ]
  return 0x005B <= codepoint && codepoint <= 0x005D;
}
// A "letter" is defined by exclusion: any code point that is not whitespace
// (per is_i7_whitespace), not punctuation (per is_i7_punctuation), and not
// TERMINATOR_CODEPOINT.  The checks run in that order and stop at the first
// one that rules the code point out.
bool is_i7_letter(i7_codepoint codepoint) {
  if (is_i7_whitespace(codepoint)) {
    return false;
  }
  if (is_i7_punctuation(codepoint)) {
    return false;
  }
  return codepoint != TERMINATOR_CODEPOINT;
}
// Reports whether `codepoint` lies in the inclusive range '0'..'9'.  Only
// those ten characters are accepted; no other code points are considered.
bool is_i7_digit(i7_codepoint codepoint) {
  if (codepoint < '0') {
    return false;
  }
  return codepoint <= '9';
}
// Reports whether `codepoint` is one of the ten upper-case letters
// A C D E I M N O T U -- i.e. the distinct letters of the word
// "DOCUMENTATION".  Lower-case letters are not matched.
// NOTE(review): presumably used to recognise a delimiter keyword spelled with
// these letters; the matching itself happens elsewhere -- confirm at callers.
bool is_i7_lexical_delimiter_letter(i7_codepoint codepoint) {
  return codepoint == 'A' || codepoint == 'C' || codepoint == 'D' ||
         codepoint == 'E' || codepoint == 'I' || codepoint == 'M' ||
         codepoint == 'N' || codepoint == 'O' || codepoint == 'T' ||
         codepoint == 'U';
}
// Maps "fancy" Unicode code points onto simpler literals and returns every
// other code point unchanged.  The table follows ni's @<Return Unicode fancy
// equivalents as simpler literals@>:
//   U+0085, U+2028, U+2029          -> '\n'
//   U+00A0, U+2000..U+200A          -> ' '
//   U+2010..U+2014                  -> '-'
//   U+2018, U+2019                  -> '\''
//   U+201C, U+201D                  -> '"'
i7_codepoint i7_normalize(i7_codepoint codepoint) {
  // Nothing below U+0085 is remapped, so the common case leaves immediately.
  if (codepoint < 0x85) {
    return codepoint;
  }
  // NEXT LINE plus the line and paragraph separators become a newline.
  if (codepoint == 0x85 || codepoint == 0x2028 || codepoint == 0x2029) {
    return '\n';
  }
  // No-break space and the U+2000..U+200A space variants become a space.
  if (codepoint == 0xA0 || (0x2000 <= codepoint && codepoint <= 0x200A)) {
    return ' ';
  }
  // Hyphens, rules, and dashes become an ASCII hyphen-minus.
  if (0x2010 <= codepoint && codepoint <= 0x2014) {
    return '-';
  }
  // Curly single quotes become an apostrophe.
  if (0x2018 <= codepoint && codepoint <= 0x2019) {
    return '\'';
  }
  // Curly double quotes become a straight double quote.
  if (0x201C <= codepoint && codepoint <= 0x201D) {
    return '"';
  }
  return codepoint;
}