1 |
#ifndef INC_CharScanner_hpp__ |
2 |
#define INC_CharScanner_hpp__ |
3 |
|
4 |
/* ANTLR Translator Generator |
5 |
* Project led by Terence Parr at http://www.jGuru.com |
6 |
* Software rights: http://www.antlr.org/license.html |
7 |
* |
8 |
* $Id$ |
9 |
*/ |
10 |
|
11 |
#include <antlr/config.hpp> |
12 |
#include <cstdio> |
13 |
#include <map> |
14 |
#include <cstring> |
15 |
|
16 |
#ifdef HAS_NOT_CCTYPE_H |
17 |
#include <ctype.h> |
18 |
#else |
19 |
#include <cctype> |
20 |
#endif |
21 |
|
22 |
#if ( _MSC_VER == 1200 ) |
23 |
// VC6 seems to need this |
24 |
// note that this is not a standard C++ include file. |
25 |
# include <stdio.h> |
26 |
#endif |
27 |
|
28 |
#include <antlr/TokenStream.hpp> |
29 |
#include <antlr/RecognitionException.hpp> |
30 |
#include <antlr/SemanticException.hpp> |
31 |
#include <antlr/MismatchedCharException.hpp> |
32 |
#include <antlr/InputBuffer.hpp> |
33 |
#include <antlr/BitSet.hpp> |
34 |
#include <antlr/LexerSharedInputState.hpp> |
35 |
|
36 |
#ifdef ANTLR_CXX_SUPPORTS_NAMESPACE |
37 |
namespace antlr { |
38 |
#endif |
39 |
|
40 |
class ANTLR_API CharScanner; |
41 |
|
42 |
ANTLR_C_USING(tolower) |
43 |
|
44 |
#ifdef ANTLR_REALLY_NO_STRCASECMP |
45 |
// Apparently, neither strcasecmp nor stricmp is standard, and Codewarrior |
46 |
// on the mac has neither... |
47 |
/** Fallback case-insensitive comparison of two NUL-terminated strings,
 * used when the platform provides neither strcasecmp nor stricmp.
 *
 * Characters are folded with tolower() and compared as unsigned char
 * values, which is also how POSIX strcasecmp orders bytes >= 0x80.
 *
 * @param s1 first NUL-terminated string
 * @param s2 second NUL-terminated string
 * @return -1 if s1 sorts before s2, 1 if it sorts after, 0 if both are
 *         equal ignoring case
 */
inline int strcasecmp(const char *s1, const char *s2)
{
	while (true)
	{
		// tolower() is only defined for EOF and values representable as
		// unsigned char; a plain (possibly negative) char must be converted
		// first. Keeping the results as int avoids a signed comparison.
		const int c1 = tolower(static_cast<unsigned char>(*s1++));
		const int c2 = tolower(static_cast<unsigned char>(*s2++));

		if (c1 < c2) return -1;
		if (c1 > c2) return 1;
		// c1 == c2 here; a shared terminator means the strings are equal
		if (c1 == 0) return 0;
	}
}
58 |
#else |
59 |
#ifdef NO_STRCASECMP |
60 |
ANTLR_C_USING(stricmp) |
61 |
#else |
62 |
ANTLR_C_USING(strcasecmp) |
63 |
#endif |
64 |
#endif |
65 |
|
66 |
/** Functor for the literals map |
67 |
*/ |
68 |
class ANTLR_API CharScannerLiteralsLess : public ANTLR_USE_NAMESPACE(std)binary_function<ANTLR_USE_NAMESPACE(std)string,ANTLR_USE_NAMESPACE(std)string,bool> { |
69 |
private: |
70 |
const CharScanner* scanner; |
71 |
public: |
72 |
#ifdef NO_TEMPLATE_PARTS |
73 |
CharScannerLiteralsLess() {} // not really used, definition to appease MSVC |
74 |
#endif |
75 |
CharScannerLiteralsLess(const CharScanner* theScanner) |
76 |
: scanner(theScanner) |
77 |
{ |
78 |
} |
79 |
bool operator() (const ANTLR_USE_NAMESPACE(std)string& x,const ANTLR_USE_NAMESPACE(std)string& y) const; |
80 |
// defaults are good enough.. |
81 |
// CharScannerLiteralsLess(const CharScannerLiteralsLess&); |
82 |
// CharScannerLiteralsLess& operator=(const CharScannerLiteralsLess&); |
83 |
}; |
84 |
|
85 |
/** Superclass of generated lexers |
86 |
*/ |
87 |
class ANTLR_API CharScanner : public TokenStream { |
88 |
protected: |
89 |
typedef RefToken (*factory_type)(); |
90 |
public: |
91 |
CharScanner(InputBuffer& cb, bool case_sensitive ); |
92 |
CharScanner(InputBuffer* cb, bool case_sensitive ); |
93 |
CharScanner(const LexerSharedInputState& state, bool case_sensitive ); |
94 |
|
95 |
virtual ~CharScanner() |
96 |
{ |
97 |
} |
98 |
|
99 |
virtual int LA(unsigned int i); |
100 |
|
101 |
virtual void append(char c) |
102 |
{ |
103 |
if (saveConsumedInput) |
104 |
{ |
105 |
size_t l = text.length(); |
106 |
|
107 |
if ((l%256) == 0) |
108 |
text.reserve(l+256); |
109 |
|
110 |
text.replace(l,0,&c,1); |
111 |
} |
112 |
} |
113 |
|
114 |
virtual void append(const ANTLR_USE_NAMESPACE(std)string& s) |
115 |
{ |
116 |
if( saveConsumedInput ) |
117 |
text += s; |
118 |
} |
119 |
|
120 |
virtual void commit() |
121 |
{ |
122 |
inputState->getInput().commit(); |
123 |
} |
124 |
|
125 |
/** called by the generated lexer to do error recovery, override to |
126 |
* customize the behaviour. |
127 |
*/ |
128 |
virtual void recover(const RecognitionException& ex, const BitSet& tokenSet) |
129 |
{ |
130 |
consume(); |
131 |
consumeUntil(tokenSet); |
132 |
} |
133 |
|
134 |
virtual void consume() |
135 |
{ |
136 |
if (inputState->guessing == 0) |
137 |
{ |
138 |
int c = LA(1); |
139 |
if (caseSensitive) |
140 |
{ |
141 |
append(c); |
142 |
} |
143 |
else |
144 |
{ |
145 |
// use input.LA(), not LA(), to get original case |
146 |
// CharScanner.LA() would toLower it. |
147 |
append(inputState->getInput().LA(1)); |
148 |
} |
149 |
|
150 |
// RK: in a sense I don't like this automatic handling. |
151 |
if (c == '\t') |
152 |
tab(); |
153 |
else |
154 |
inputState->column++; |
155 |
} |
156 |
inputState->getInput().consume(); |
157 |
} |
158 |
|
159 |
/** Consume chars until one matches the given char */ |
160 |
virtual void consumeUntil(int c) |
161 |
{ |
162 |
for(;;) |
163 |
{ |
164 |
int la_1 = LA(1); |
165 |
if( la_1 == EOF_CHAR || la_1 == c ) |
166 |
break; |
167 |
consume(); |
168 |
} |
169 |
} |
170 |
|
171 |
/** Consume chars until one matches the given set */ |
172 |
virtual void consumeUntil(const BitSet& set) |
173 |
{ |
174 |
for(;;) |
175 |
{ |
176 |
int la_1 = LA(1); |
177 |
if( la_1 == EOF_CHAR || set.member(la_1) ) |
178 |
break; |
179 |
consume(); |
180 |
} |
181 |
} |
182 |
|
183 |
/// Mark the current position and return a id for it |
184 |
virtual unsigned int mark() |
185 |
{ |
186 |
return inputState->getInput().mark(); |
187 |
} |
188 |
/// Rewind the scanner to a previously marked position |
189 |
virtual void rewind(unsigned int pos) |
190 |
{ |
191 |
inputState->getInput().rewind(pos); |
192 |
} |
193 |
|
194 |
/// See if input contains character 'c' throw MismatchedCharException if not |
195 |
virtual void match(int c) |
196 |
{ |
197 |
int la_1 = LA(1); |
198 |
if ( la_1 != c ) |
199 |
throw MismatchedCharException(la_1, c, false, this); |
200 |
consume(); |
201 |
} |
202 |
|
203 |
/** See if input contains element from bitset b |
204 |
* throw MismatchedCharException if not |
205 |
*/ |
206 |
virtual void match(const BitSet& b) |
207 |
{ |
208 |
int la_1 = LA(1); |
209 |
|
210 |
if ( !b.member(la_1) ) |
211 |
throw MismatchedCharException( la_1, b, false, this ); |
212 |
consume(); |
213 |
} |
214 |
|
215 |
/** See if input contains string 's' throw MismatchedCharException if not |
216 |
* @note the string cannot match EOF |
217 |
*/ |
218 |
virtual void match( const char* s ) |
219 |
{ |
220 |
while( *s != '\0' ) |
221 |
{ |
222 |
// the & 0xFF is here to prevent sign extension lateron |
223 |
int la_1 = LA(1), c = (*s++ & 0xFF); |
224 |
|
225 |
if ( la_1 != c ) |
226 |
throw MismatchedCharException(la_1, c, false, this); |
227 |
|
228 |
consume(); |
229 |
} |
230 |
} |
231 |
/** See if input contains string 's' throw MismatchedCharException if not |
232 |
* @note the string cannot match EOF |
233 |
*/ |
234 |
virtual void match(const ANTLR_USE_NAMESPACE(std)string& s) |
235 |
{ |
236 |
size_t len = s.length(); |
237 |
|
238 |
for (size_t i = 0; i < len; i++) |
239 |
{ |
240 |
// the & 0xFF is here to prevent sign extension lateron |
241 |
int la_1 = LA(1), c = (s[i] & 0xFF); |
242 |
|
243 |
if ( la_1 != c ) |
244 |
throw MismatchedCharException(la_1, c, false, this); |
245 |
|
246 |
consume(); |
247 |
} |
248 |
} |
249 |
/** See if input does not contain character 'c' |
250 |
* throw MismatchedCharException if not |
251 |
*/ |
252 |
virtual void matchNot(int c) |
253 |
{ |
254 |
int la_1 = LA(1); |
255 |
|
256 |
if ( la_1 == c ) |
257 |
throw MismatchedCharException(la_1, c, true, this); |
258 |
|
259 |
consume(); |
260 |
} |
261 |
/** See if input contains character in range c1-c2 |
262 |
* throw MismatchedCharException if not |
263 |
*/ |
264 |
virtual void matchRange(int c1, int c2) |
265 |
{ |
266 |
int la_1 = LA(1); |
267 |
|
268 |
if ( la_1 < c1 || la_1 > c2 ) |
269 |
throw MismatchedCharException(la_1, c1, c2, false, this); |
270 |
|
271 |
consume(); |
272 |
} |
273 |
|
274 |
virtual bool getCaseSensitive() const |
275 |
{ |
276 |
return caseSensitive; |
277 |
} |
278 |
|
279 |
virtual void setCaseSensitive(bool t) |
280 |
{ |
281 |
caseSensitive = t; |
282 |
} |
283 |
|
284 |
virtual bool getCaseSensitiveLiterals() const=0; |
285 |
|
286 |
/// Get the line the scanner currently is in (starts at 1) |
287 |
virtual int getLine() const |
288 |
{ |
289 |
return inputState->line; |
290 |
} |
291 |
|
292 |
/// set the line number |
293 |
virtual void setLine(int l) |
294 |
{ |
295 |
inputState->line = l; |
296 |
} |
297 |
|
298 |
/// Get the column the scanner currently is in (starts at 1) |
299 |
virtual int getColumn() const |
300 |
{ |
301 |
return inputState->column; |
302 |
} |
303 |
/// set the column number |
304 |
virtual void setColumn(int c) |
305 |
{ |
306 |
inputState->column = c; |
307 |
} |
308 |
|
309 |
/// get the filename for the file currently used |
310 |
virtual const ANTLR_USE_NAMESPACE(std)string& getFilename() const |
311 |
{ |
312 |
return inputState->filename; |
313 |
} |
314 |
/// Set the filename the scanner is using (used in error messages) |
315 |
virtual void setFilename(const ANTLR_USE_NAMESPACE(std)string& f) |
316 |
{ |
317 |
inputState->filename = f; |
318 |
} |
319 |
|
320 |
virtual bool getCommitToPath() const |
321 |
{ |
322 |
return commitToPath; |
323 |
} |
324 |
|
325 |
virtual void setCommitToPath(bool commit) |
326 |
{ |
327 |
commitToPath = commit; |
328 |
} |
329 |
|
330 |
/** return a copy of the current text buffer */ |
331 |
virtual const ANTLR_USE_NAMESPACE(std)string& getText() const |
332 |
{ |
333 |
return text; |
334 |
} |
335 |
|
336 |
virtual void setText(const ANTLR_USE_NAMESPACE(std)string& s) |
337 |
{ |
338 |
text = s; |
339 |
} |
340 |
|
341 |
virtual void resetText() |
342 |
{ |
343 |
text = ""; |
344 |
inputState->tokenStartColumn = inputState->column; |
345 |
inputState->tokenStartLine = inputState->line; |
346 |
} |
347 |
|
348 |
virtual RefToken getTokenObject() const |
349 |
{ |
350 |
return _returnToken; |
351 |
} |
352 |
|
353 |
/** Used to keep track of line breaks, needs to be called from |
354 |
* within generated lexers when a \n \r is encountered. |
355 |
*/ |
356 |
virtual void newline() |
357 |
{ |
358 |
++inputState->line; |
359 |
inputState->column = 1; |
360 |
} |
361 |
|
362 |
/** Advance the current column number by an appropriate amount according |
363 |
* to the tabsize. This method needs to be explicitly called from the |
364 |
* lexer rules encountering tabs. |
365 |
*/ |
366 |
virtual void tab() |
367 |
{ |
368 |
int c = getColumn(); |
369 |
int nc = ( ((c-1)/tabsize) + 1) * tabsize + 1; // calculate tab stop |
370 |
setColumn( nc ); |
371 |
} |
372 |
/// set the tabsize. Returns the old tabsize |
373 |
int setTabsize( int size ) |
374 |
{ |
375 |
int oldsize = tabsize; |
376 |
tabsize = size; |
377 |
return oldsize; |
378 |
} |
379 |
/// Return the tabsize used by the scanner |
380 |
int getTabSize() const |
381 |
{ |
382 |
return tabsize; |
383 |
} |
384 |
|
385 |
/** Report exception errors caught in nextToken() */ |
386 |
virtual void reportError(const RecognitionException& e); |
387 |
|
388 |
/** Parser error-reporting function can be overridden in subclass */ |
389 |
virtual void reportError(const ANTLR_USE_NAMESPACE(std)string& s); |
390 |
|
391 |
/** Parser warning-reporting function can be overridden in subclass */ |
392 |
virtual void reportWarning(const ANTLR_USE_NAMESPACE(std)string& s); |
393 |
|
394 |
virtual InputBuffer& getInputBuffer() |
395 |
{ |
396 |
return inputState->getInput(); |
397 |
} |
398 |
|
399 |
virtual LexerSharedInputState getInputState() |
400 |
{ |
401 |
return inputState; |
402 |
} |
403 |
|
404 |
/** set the input state for the lexer. |
405 |
* @note state is a reference counted object, hence no reference */ |
406 |
virtual void setInputState(LexerSharedInputState state) |
407 |
{ |
408 |
inputState = state; |
409 |
} |
410 |
|
411 |
/// Set the factory for created tokens |
412 |
virtual void setTokenObjectFactory(factory_type factory) |
413 |
{ |
414 |
tokenFactory = factory; |
415 |
} |
416 |
|
417 |
/** Test the token text against the literals table |
418 |
* Override this method to perform a different literals test |
419 |
*/ |
420 |
virtual int testLiteralsTable(int ttype) const |
421 |
{ |
422 |
ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess>::const_iterator i = literals.find(text); |
423 |
if (i != literals.end()) |
424 |
ttype = (*i).second; |
425 |
return ttype; |
426 |
} |
427 |
|
428 |
/** Test the text passed in against the literals table |
429 |
* Override this method to perform a different literals test |
430 |
* This is used primarily when you want to test a portion of |
431 |
* a token |
432 |
*/ |
433 |
virtual int testLiteralsTable(const ANTLR_USE_NAMESPACE(std)string& txt,int ttype) const |
434 |
{ |
435 |
ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess>::const_iterator i = literals.find(txt); |
436 |
if (i != literals.end()) |
437 |
ttype = (*i).second; |
438 |
return ttype; |
439 |
} |
440 |
|
441 |
/// Override this method to get more specific case handling |
442 |
virtual int toLower(int c) const |
443 |
{ |
444 |
// test on EOF_CHAR for buggy (?) STLPort tolower (or HPUX tolower?) |
445 |
// also VC++ 6.0 does this. (see fix 422 (is reverted by this fix) |
446 |
// this one is more structural. Maybe make this configurable. |
447 |
return (c == EOF_CHAR ? EOF_CHAR : tolower(c)); |
448 |
} |
449 |
|
450 |
/** This method is called by YourLexer::nextToken() when the lexer has |
451 |
* hit EOF condition. EOF is NOT a character. |
452 |
* This method is not called if EOF is reached during |
453 |
* syntactic predicate evaluation or during evaluation |
454 |
* of normal lexical rules, which presumably would be |
455 |
* an IOException. This traps the "normal" EOF condition. |
456 |
* |
457 |
* uponEOF() is called after the complete evaluation of |
458 |
* the previous token and only if your parser asks |
459 |
* for another token beyond that last non-EOF token. |
460 |
* |
461 |
* You might want to throw token or char stream exceptions |
462 |
* like: "Heh, premature eof" or a retry stream exception |
463 |
* ("I found the end of this file, go back to referencing file"). |
464 |
*/ |
465 |
virtual void uponEOF() |
466 |
{ |
467 |
} |
468 |
|
469 |
/// Methods used to change tracing behavior |
470 |
virtual void traceIndent(); |
471 |
virtual void traceIn(const char* rname); |
472 |
virtual void traceOut(const char* rname); |
473 |
|
474 |
#ifndef NO_STATIC_CONSTS |
475 |
static const int EOF_CHAR = EOF; |
476 |
#else |
477 |
enum { |
478 |
EOF_CHAR = EOF |
479 |
}; |
480 |
#endif |
481 |
protected: |
482 |
ANTLR_USE_NAMESPACE(std)string text; ///< Text of current token |
483 |
/// flag indicating wether consume saves characters |
484 |
bool saveConsumedInput; |
485 |
factory_type tokenFactory; ///< Factory for tokens |
486 |
bool caseSensitive; ///< Is this lexer case sensitive |
487 |
ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess> literals; // set by subclass |
488 |
|
489 |
RefToken _returnToken; ///< used to return tokens w/o using return val |
490 |
|
491 |
/// Input state, gives access to input stream, shared among different lexers |
492 |
LexerSharedInputState inputState; |
493 |
|
494 |
/** Used during filter mode to indicate that path is desired. |
495 |
* A subsequent scan error will report an error as usual |
496 |
* if acceptPath=true; |
497 |
*/ |
498 |
bool commitToPath; |
499 |
|
500 |
int tabsize; ///< tab size the scanner uses. |
501 |
|
502 |
/// Create a new RefToken of type t |
503 |
virtual RefToken makeToken(int t) |
504 |
{ |
505 |
RefToken tok = tokenFactory(); |
506 |
tok->setType(t); |
507 |
tok->setColumn(inputState->tokenStartColumn); |
508 |
tok->setLine(inputState->tokenStartLine); |
509 |
return tok; |
510 |
} |
511 |
|
512 |
/** Tracer class, used when -traceLexer is passed to antlr |
513 |
*/ |
514 |
class Tracer { |
515 |
private: |
516 |
CharScanner* parser; |
517 |
const char* text; |
518 |
|
519 |
Tracer(const Tracer& other); // undefined |
520 |
Tracer& operator=(const Tracer& other); // undefined |
521 |
public: |
522 |
Tracer( CharScanner* p,const char* t ) |
523 |
: parser(p), text(t) |
524 |
{ |
525 |
parser->traceIn(text); |
526 |
} |
527 |
~Tracer() |
528 |
{ |
529 |
parser->traceOut(text); |
530 |
} |
531 |
}; |
532 |
|
533 |
int traceDepth; |
534 |
private: |
535 |
CharScanner( const CharScanner& other ); // undefined |
536 |
CharScanner& operator=( const CharScanner& other ); // undefined |
537 |
|
538 |
#ifndef NO_STATIC_CONSTS |
539 |
static const int NO_CHAR = 0; |
540 |
#else |
541 |
enum { |
542 |
NO_CHAR = 0 |
543 |
}; |
544 |
#endif |
545 |
}; |
546 |
|
547 |
inline int CharScanner::LA(unsigned int i) |
548 |
{ |
549 |
int c = inputState->getInput().LA(i); |
550 |
|
551 |
if ( caseSensitive ) |
552 |
return c; |
553 |
else |
554 |
return toLower(c); // VC 6 tolower bug caught in toLower. |
555 |
} |
556 |
|
557 |
inline bool CharScannerLiteralsLess::operator() (const ANTLR_USE_NAMESPACE(std)string& x,const ANTLR_USE_NAMESPACE(std)string& y) const |
558 |
{ |
559 |
if (scanner->getCaseSensitiveLiterals()) |
560 |
return ANTLR_USE_NAMESPACE(std)less<ANTLR_USE_NAMESPACE(std)string>()(x,y); |
561 |
else |
562 |
{ |
563 |
#ifdef NO_STRCASECMP |
564 |
return (stricmp(x.c_str(),y.c_str())<0); |
565 |
#else |
566 |
return (strcasecmp(x.c_str(),y.c_str())<0); |
567 |
#endif |
568 |
} |
569 |
} |
570 |
|
571 |
#ifdef ANTLR_CXX_SUPPORTS_NAMESPACE |
572 |
} |
573 |
#endif |
574 |
|
575 |
#endif //INC_CharScanner_hpp__ |