1 |
#ifndef INC_CharScanner_hpp__ |
2 |
#define INC_CharScanner_hpp__ |
3 |
|
4 |
/* ANTLR Translator Generator |
5 |
* Project led by Terence Parr at http://www.jGuru.com |
6 |
* Software rights: http://www.antlr.org/license.html |
7 |
* |
8 |
* $Id$ |
9 |
*/ |
10 |
|
11 |
#include <antlr/config.hpp> |
12 |
|
13 |
#include <map> |
14 |
|
15 |
#ifdef HAS_NOT_CCTYPE_H |
16 |
#include <ctype.h> |
17 |
#else |
18 |
#include <cctype> |
19 |
#endif |
20 |
|
21 |
#if ( _MSC_VER == 1200 ) |
22 |
// VC6 seems to need this |
23 |
// note that this is not a standard C++ include file. |
24 |
# include <stdio.h> |
25 |
#endif |
26 |
|
27 |
#include <antlr/TokenStream.hpp> |
28 |
#include <antlr/RecognitionException.hpp> |
29 |
#include <antlr/SemanticException.hpp> |
30 |
#include <antlr/MismatchedCharException.hpp> |
31 |
#include <antlr/InputBuffer.hpp> |
32 |
#include <antlr/BitSet.hpp> |
33 |
#include <antlr/LexerSharedInputState.hpp> |
34 |
|
35 |
#ifdef ANTLR_CXX_SUPPORTS_NAMESPACE |
36 |
namespace antlr { |
37 |
#endif |
38 |
|
39 |
class ANTLR_API CharScanner; |
40 |
|
41 |
ANTLR_C_USING(tolower) |
42 |
|
43 |
#ifdef ANTLR_REALLY_NO_STRCASECMP |
44 |
// Apparently, neither strcasecmp nor stricmp is standard, and Codewarrior |
45 |
// on the mac has neither... |
46 |
/** Case-insensitive comparison of two NUL-terminated C strings.
 *
 * Fallback for platforms that provide neither strcasecmp() nor stricmp().
 * Characters are folded with tolower() and compared one by one.
 *
 * @param s1 first string (must be NUL-terminated)
 * @param s2 second string (must be NUL-terminated)
 * @return -1 if s1 sorts before s2, 1 if it sorts after, 0 if both are
 *         equal ignoring case.
 */
inline int strcasecmp(const char *s1, const char *s2)
{
	for (;;)
	{
		// Convert through unsigned char: passing a negative plain char to
		// tolower() is undefined, and keeping the folded values as int makes
		// bytes >= 0x80 order after ASCII regardless of the signedness of char.
		const int c1 = tolower(static_cast<unsigned char>(*s1++));
		const int c2 = tolower(static_cast<unsigned char>(*s2++));

		if (c1 < c2) return -1;
		if (c1 > c2) return 1;
		// Both characters are equal here; a NUL means both strings ended.
		if (c1 == 0) return 0;
	}
}
57 |
#else |
58 |
#ifdef NO_STRCASECMP |
59 |
ANTLR_C_USING(stricmp) |
60 |
#else |
61 |
ANTLR_C_USING(strcasecmp) |
62 |
#endif |
63 |
#endif |
64 |
|
65 |
/** Functor for the literals map |
66 |
*/ |
67 |
class ANTLR_API CharScannerLiteralsLess : public ANTLR_USE_NAMESPACE(std)binary_function<ANTLR_USE_NAMESPACE(std)string,ANTLR_USE_NAMESPACE(std)string,bool> { |
68 |
private: |
69 |
const CharScanner* scanner; |
70 |
public: |
71 |
#ifdef NO_TEMPLATE_PARTS |
72 |
CharScannerLiteralsLess() {} // not really used, definition to appease MSVC |
73 |
#endif |
74 |
CharScannerLiteralsLess(const CharScanner* theScanner) |
75 |
: scanner(theScanner) |
76 |
{ |
77 |
} |
78 |
bool operator() (const ANTLR_USE_NAMESPACE(std)string& x,const ANTLR_USE_NAMESPACE(std)string& y) const; |
79 |
// defaults are good enough.. |
80 |
// CharScannerLiteralsLess(const CharScannerLiteralsLess&); |
81 |
// CharScannerLiteralsLess& operator=(const CharScannerLiteralsLess&); |
82 |
}; |
83 |
|
84 |
/** Superclass of generated lexers |
85 |
*/ |
86 |
class ANTLR_API CharScanner : public TokenStream { |
87 |
protected: |
88 |
typedef RefToken (*factory_type)(); |
89 |
public: |
90 |
CharScanner(InputBuffer& cb, bool case_sensitive ); |
91 |
CharScanner(InputBuffer* cb, bool case_sensitive ); |
92 |
CharScanner(const LexerSharedInputState& state, bool case_sensitive ); |
93 |
|
94 |
virtual ~CharScanner() |
95 |
{ |
96 |
} |
97 |
|
98 |
virtual int LA(unsigned int i); |
99 |
|
100 |
virtual void append(char c) |
101 |
{ |
102 |
if (saveConsumedInput) |
103 |
{ |
104 |
size_t l = text.length(); |
105 |
|
106 |
if ((l%256) == 0) |
107 |
text.reserve(l+256); |
108 |
|
109 |
text.replace(l,0,&c,1); |
110 |
} |
111 |
} |
112 |
|
113 |
virtual void append(const ANTLR_USE_NAMESPACE(std)string& s) |
114 |
{ |
115 |
if( saveConsumedInput ) |
116 |
text += s; |
117 |
} |
118 |
|
119 |
virtual void commit() |
120 |
{ |
121 |
inputState->getInput().commit(); |
122 |
} |
123 |
|
124 |
/** called by the generated lexer to do error recovery, override to |
125 |
* customize the behaviour. |
126 |
*/ |
127 |
virtual void recover(const RecognitionException& ex, const BitSet& tokenSet) |
128 |
{ |
129 |
consume(); |
130 |
consumeUntil(tokenSet); |
131 |
} |
132 |
|
133 |
virtual void consume() |
134 |
{ |
135 |
if (inputState->guessing == 0) |
136 |
{ |
137 |
int c = LA(1); |
138 |
if (caseSensitive) |
139 |
{ |
140 |
append(c); |
141 |
} |
142 |
else |
143 |
{ |
144 |
// use input.LA(), not LA(), to get original case |
145 |
// CharScanner.LA() would toLower it. |
146 |
append(inputState->getInput().LA(1)); |
147 |
} |
148 |
|
149 |
// RK: in a sense I don't like this automatic handling. |
150 |
if (c == '\t') |
151 |
tab(); |
152 |
else |
153 |
inputState->column++; |
154 |
} |
155 |
inputState->getInput().consume(); |
156 |
} |
157 |
|
158 |
/** Consume chars until one matches the given char */ |
159 |
virtual void consumeUntil(int c) |
160 |
{ |
161 |
for(;;) |
162 |
{ |
163 |
int la_1 = LA(1); |
164 |
if( la_1 == EOF_CHAR || la_1 == c ) |
165 |
break; |
166 |
consume(); |
167 |
} |
168 |
} |
169 |
|
170 |
/** Consume chars until one matches the given set */ |
171 |
virtual void consumeUntil(const BitSet& set) |
172 |
{ |
173 |
for(;;) |
174 |
{ |
175 |
int la_1 = LA(1); |
176 |
if( la_1 == EOF_CHAR || set.member(la_1) ) |
177 |
break; |
178 |
consume(); |
179 |
} |
180 |
} |
181 |
|
182 |
/// Mark the current position and return a id for it |
183 |
virtual unsigned int mark() |
184 |
{ |
185 |
return inputState->getInput().mark(); |
186 |
} |
187 |
/// Rewind the scanner to a previously marked position |
188 |
virtual void rewind(unsigned int pos) |
189 |
{ |
190 |
inputState->getInput().rewind(pos); |
191 |
} |
192 |
|
193 |
/// See if input contains character 'c' throw MismatchedCharException if not |
194 |
virtual void match(int c) |
195 |
{ |
196 |
int la_1 = LA(1); |
197 |
if ( la_1 != c ) |
198 |
throw MismatchedCharException(la_1, c, false, this); |
199 |
consume(); |
200 |
} |
201 |
|
202 |
/** See if input contains element from bitset b |
203 |
* throw MismatchedCharException if not |
204 |
*/ |
205 |
virtual void match(const BitSet& b) |
206 |
{ |
207 |
int la_1 = LA(1); |
208 |
|
209 |
if ( !b.member(la_1) ) |
210 |
throw MismatchedCharException( la_1, b, false, this ); |
211 |
consume(); |
212 |
} |
213 |
|
214 |
/** See if input contains string 's' throw MismatchedCharException if not |
215 |
* @note the string cannot match EOF |
216 |
*/ |
217 |
virtual void match( const char* s ) |
218 |
{ |
219 |
while( *s != '\0' ) |
220 |
{ |
221 |
// the & 0xFF is here to prevent sign extension lateron |
222 |
int la_1 = LA(1), c = (*s++ & 0xFF); |
223 |
|
224 |
if ( la_1 != c ) |
225 |
throw MismatchedCharException(la_1, c, false, this); |
226 |
|
227 |
consume(); |
228 |
} |
229 |
} |
230 |
/** See if input contains string 's' throw MismatchedCharException if not |
231 |
* @note the string cannot match EOF |
232 |
*/ |
233 |
virtual void match(const ANTLR_USE_NAMESPACE(std)string& s) |
234 |
{ |
235 |
size_t len = s.length(); |
236 |
|
237 |
for (size_t i = 0; i < len; i++) |
238 |
{ |
239 |
// the & 0xFF is here to prevent sign extension lateron |
240 |
int la_1 = LA(1), c = (s[i] & 0xFF); |
241 |
|
242 |
if ( la_1 != c ) |
243 |
throw MismatchedCharException(la_1, c, false, this); |
244 |
|
245 |
consume(); |
246 |
} |
247 |
} |
248 |
/** See if input does not contain character 'c' |
249 |
* throw MismatchedCharException if not |
250 |
*/ |
251 |
virtual void matchNot(int c) |
252 |
{ |
253 |
int la_1 = LA(1); |
254 |
|
255 |
if ( la_1 == c ) |
256 |
throw MismatchedCharException(la_1, c, true, this); |
257 |
|
258 |
consume(); |
259 |
} |
260 |
/** See if input contains character in range c1-c2 |
261 |
* throw MismatchedCharException if not |
262 |
*/ |
263 |
virtual void matchRange(int c1, int c2) |
264 |
{ |
265 |
int la_1 = LA(1); |
266 |
|
267 |
if ( la_1 < c1 || la_1 > c2 ) |
268 |
throw MismatchedCharException(la_1, c1, c2, false, this); |
269 |
|
270 |
consume(); |
271 |
} |
272 |
|
273 |
virtual bool getCaseSensitive() const |
274 |
{ |
275 |
return caseSensitive; |
276 |
} |
277 |
|
278 |
virtual void setCaseSensitive(bool t) |
279 |
{ |
280 |
caseSensitive = t; |
281 |
} |
282 |
|
283 |
virtual bool getCaseSensitiveLiterals() const=0; |
284 |
|
285 |
/// Get the line the scanner currently is in (starts at 1) |
286 |
virtual int getLine() const |
287 |
{ |
288 |
return inputState->line; |
289 |
} |
290 |
|
291 |
/// set the line number |
292 |
virtual void setLine(int l) |
293 |
{ |
294 |
inputState->line = l; |
295 |
} |
296 |
|
297 |
/// Get the column the scanner currently is in (starts at 1) |
298 |
virtual int getColumn() const |
299 |
{ |
300 |
return inputState->column; |
301 |
} |
302 |
/// set the column number |
303 |
virtual void setColumn(int c) |
304 |
{ |
305 |
inputState->column = c; |
306 |
} |
307 |
|
308 |
/// get the filename for the file currently used |
309 |
virtual const ANTLR_USE_NAMESPACE(std)string& getFilename() const |
310 |
{ |
311 |
return inputState->filename; |
312 |
} |
313 |
/// Set the filename the scanner is using (used in error messages) |
314 |
virtual void setFilename(const ANTLR_USE_NAMESPACE(std)string& f) |
315 |
{ |
316 |
inputState->filename = f; |
317 |
} |
318 |
|
319 |
virtual bool getCommitToPath() const |
320 |
{ |
321 |
return commitToPath; |
322 |
} |
323 |
|
324 |
virtual void setCommitToPath(bool commit) |
325 |
{ |
326 |
commitToPath = commit; |
327 |
} |
328 |
|
329 |
/** return a copy of the current text buffer */ |
330 |
virtual const ANTLR_USE_NAMESPACE(std)string& getText() const |
331 |
{ |
332 |
return text; |
333 |
} |
334 |
|
335 |
virtual void setText(const ANTLR_USE_NAMESPACE(std)string& s) |
336 |
{ |
337 |
text = s; |
338 |
} |
339 |
|
340 |
virtual void resetText() |
341 |
{ |
342 |
text = ""; |
343 |
inputState->tokenStartColumn = inputState->column; |
344 |
inputState->tokenStartLine = inputState->line; |
345 |
} |
346 |
|
347 |
virtual RefToken getTokenObject() const |
348 |
{ |
349 |
return _returnToken; |
350 |
} |
351 |
|
352 |
/** Used to keep track of line breaks, needs to be called from |
353 |
* within generated lexers when a \n \r is encountered. |
354 |
*/ |
355 |
virtual void newline() |
356 |
{ |
357 |
++inputState->line; |
358 |
inputState->column = 1; |
359 |
} |
360 |
|
361 |
/** Advance the current column number by an appropriate amount according |
362 |
* to the tabsize. This method needs to be explicitly called from the |
363 |
* lexer rules encountering tabs. |
364 |
*/ |
365 |
virtual void tab() |
366 |
{ |
367 |
int c = getColumn(); |
368 |
int nc = ( ((c-1)/tabsize) + 1) * tabsize + 1; // calculate tab stop |
369 |
setColumn( nc ); |
370 |
} |
371 |
/// set the tabsize. Returns the old tabsize |
372 |
int setTabsize( int size ) |
373 |
{ |
374 |
int oldsize = tabsize; |
375 |
tabsize = size; |
376 |
return oldsize; |
377 |
} |
378 |
/// Return the tabsize used by the scanner |
379 |
int getTabSize() const |
380 |
{ |
381 |
return tabsize; |
382 |
} |
383 |
|
384 |
/** Report exception errors caught in nextToken() */ |
385 |
virtual void reportError(const RecognitionException& e); |
386 |
|
387 |
/** Parser error-reporting function can be overridden in subclass */ |
388 |
virtual void reportError(const ANTLR_USE_NAMESPACE(std)string& s); |
389 |
|
390 |
/** Parser warning-reporting function can be overridden in subclass */ |
391 |
virtual void reportWarning(const ANTLR_USE_NAMESPACE(std)string& s); |
392 |
|
393 |
virtual InputBuffer& getInputBuffer() |
394 |
{ |
395 |
return inputState->getInput(); |
396 |
} |
397 |
|
398 |
virtual LexerSharedInputState getInputState() |
399 |
{ |
400 |
return inputState; |
401 |
} |
402 |
|
403 |
/** set the input state for the lexer. |
404 |
* @note state is a reference counted object, hence no reference */ |
405 |
virtual void setInputState(LexerSharedInputState state) |
406 |
{ |
407 |
inputState = state; |
408 |
} |
409 |
|
410 |
/// Set the factory for created tokens |
411 |
virtual void setTokenObjectFactory(factory_type factory) |
412 |
{ |
413 |
tokenFactory = factory; |
414 |
} |
415 |
|
416 |
/** Test the token text against the literals table |
417 |
* Override this method to perform a different literals test |
418 |
*/ |
419 |
virtual int testLiteralsTable(int ttype) const |
420 |
{ |
421 |
ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess>::const_iterator i = literals.find(text); |
422 |
if (i != literals.end()) |
423 |
ttype = (*i).second; |
424 |
return ttype; |
425 |
} |
426 |
|
427 |
/** Test the text passed in against the literals table |
428 |
* Override this method to perform a different literals test |
429 |
* This is used primarily when you want to test a portion of |
430 |
* a token |
431 |
*/ |
432 |
virtual int testLiteralsTable(const ANTLR_USE_NAMESPACE(std)string& txt,int ttype) const |
433 |
{ |
434 |
ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess>::const_iterator i = literals.find(txt); |
435 |
if (i != literals.end()) |
436 |
ttype = (*i).second; |
437 |
return ttype; |
438 |
} |
439 |
|
440 |
/// Override this method to get more specific case handling |
441 |
virtual int toLower(int c) const |
442 |
{ |
443 |
// test on EOF_CHAR for buggy (?) STLPort tolower (or HPUX tolower?) |
444 |
// also VC++ 6.0 does this. (see fix 422 (is reverted by this fix) |
445 |
// this one is more structural. Maybe make this configurable. |
446 |
return (c == EOF_CHAR ? EOF_CHAR : tolower(c)); |
447 |
} |
448 |
|
449 |
/** This method is called by YourLexer::nextToken() when the lexer has |
450 |
* hit EOF condition. EOF is NOT a character. |
451 |
* This method is not called if EOF is reached during |
452 |
* syntactic predicate evaluation or during evaluation |
453 |
* of normal lexical rules, which presumably would be |
454 |
* an IOException. This traps the "normal" EOF condition. |
455 |
* |
456 |
* uponEOF() is called after the complete evaluation of |
457 |
* the previous token and only if your parser asks |
458 |
* for another token beyond that last non-EOF token. |
459 |
* |
460 |
* You might want to throw token or char stream exceptions |
461 |
* like: "Heh, premature eof" or a retry stream exception |
462 |
* ("I found the end of this file, go back to referencing file"). |
463 |
*/ |
464 |
virtual void uponEOF() |
465 |
{ |
466 |
} |
467 |
|
468 |
/// Methods used to change tracing behavior |
469 |
virtual void traceIndent(); |
470 |
virtual void traceIn(const char* rname); |
471 |
virtual void traceOut(const char* rname); |
472 |
|
473 |
#ifndef NO_STATIC_CONSTS |
474 |
static const int EOF_CHAR = EOF; |
475 |
#else |
476 |
enum { |
477 |
EOF_CHAR = EOF |
478 |
}; |
479 |
#endif |
480 |
protected: |
481 |
ANTLR_USE_NAMESPACE(std)string text; ///< Text of current token |
482 |
/// flag indicating wether consume saves characters |
483 |
bool saveConsumedInput; |
484 |
factory_type tokenFactory; ///< Factory for tokens |
485 |
bool caseSensitive; ///< Is this lexer case sensitive |
486 |
ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess> literals; // set by subclass |
487 |
|
488 |
RefToken _returnToken; ///< used to return tokens w/o using return val |
489 |
|
490 |
/// Input state, gives access to input stream, shared among different lexers |
491 |
LexerSharedInputState inputState; |
492 |
|
493 |
/** Used during filter mode to indicate that path is desired. |
494 |
* A subsequent scan error will report an error as usual |
495 |
* if acceptPath=true; |
496 |
*/ |
497 |
bool commitToPath; |
498 |
|
499 |
int tabsize; ///< tab size the scanner uses. |
500 |
|
501 |
/// Create a new RefToken of type t |
502 |
virtual RefToken makeToken(int t) |
503 |
{ |
504 |
RefToken tok = tokenFactory(); |
505 |
tok->setType(t); |
506 |
tok->setColumn(inputState->tokenStartColumn); |
507 |
tok->setLine(inputState->tokenStartLine); |
508 |
return tok; |
509 |
} |
510 |
|
511 |
/** Tracer class, used when -traceLexer is passed to antlr |
512 |
*/ |
513 |
class Tracer { |
514 |
private: |
515 |
CharScanner* parser; |
516 |
const char* text; |
517 |
|
518 |
Tracer(const Tracer& other); // undefined |
519 |
Tracer& operator=(const Tracer& other); // undefined |
520 |
public: |
521 |
Tracer( CharScanner* p,const char* t ) |
522 |
: parser(p), text(t) |
523 |
{ |
524 |
parser->traceIn(text); |
525 |
} |
526 |
~Tracer() |
527 |
{ |
528 |
parser->traceOut(text); |
529 |
} |
530 |
}; |
531 |
|
532 |
int traceDepth; |
533 |
private: |
534 |
CharScanner( const CharScanner& other ); // undefined |
535 |
CharScanner& operator=( const CharScanner& other ); // undefined |
536 |
|
537 |
#ifndef NO_STATIC_CONSTS |
538 |
static const int NO_CHAR = 0; |
539 |
#else |
540 |
enum { |
541 |
NO_CHAR = 0 |
542 |
}; |
543 |
#endif |
544 |
}; |
545 |
|
546 |
inline int CharScanner::LA(unsigned int i) |
547 |
{ |
548 |
int c = inputState->getInput().LA(i); |
549 |
|
550 |
if ( caseSensitive ) |
551 |
return c; |
552 |
else |
553 |
return toLower(c); // VC 6 tolower bug caught in toLower. |
554 |
} |
555 |
|
556 |
inline bool CharScannerLiteralsLess::operator() (const ANTLR_USE_NAMESPACE(std)string& x,const ANTLR_USE_NAMESPACE(std)string& y) const |
557 |
{ |
558 |
if (scanner->getCaseSensitiveLiterals()) |
559 |
return ANTLR_USE_NAMESPACE(std)less<ANTLR_USE_NAMESPACE(std)string>()(x,y); |
560 |
else |
561 |
{ |
562 |
#ifdef NO_STRCASECMP |
563 |
return (stricmp(x.c_str(),y.c_str())<0); |
564 |
#else |
565 |
return (strcasecmp(x.c_str(),y.c_str())<0); |
566 |
#endif |
567 |
} |
568 |
} |
569 |
|
570 |
#ifdef ANTLR_CXX_SUPPORTS_NAMESPACE |
571 |
} |
572 |
#endif |
573 |
|
574 |
#endif //INC_CharScanner_hpp__ |