1 |
#ifndef INC_CharScanner_hpp__ |
2 |
#define INC_CharScanner_hpp__ |
3 |
|
4 |
/* ANTLR Translator Generator |
5 |
* Project led by Terence Parr at http://www.jGuru.com |
6 |
* Software rights: http://www.antlr.org/license.html |
7 |
* |
8 |
* $Id$ |
9 |
*/ |
10 |
|
11 |
#include <antlr/config.hpp> |
12 |
|
13 |
#include <map> |
14 |
#include <cstdio> |
15 |
|
16 |
#ifdef HAS_NOT_CCTYPE_H |
17 |
#include <ctype.h> |
18 |
#else |
19 |
#include <cctype> |
20 |
#endif |
21 |
|
22 |
#if ( _MSC_VER == 1200 ) |
23 |
// VC6 seems to need this |
24 |
// note that this is not a standard C++ include file. |
25 |
# include <stdio.h> |
26 |
#endif |
27 |
|
28 |
#include <antlr/TokenStream.hpp> |
29 |
#include <antlr/RecognitionException.hpp> |
30 |
#include <antlr/SemanticException.hpp> |
31 |
#include <antlr/MismatchedCharException.hpp> |
32 |
#include <antlr/InputBuffer.hpp> |
33 |
#include <antlr/BitSet.hpp> |
34 |
#include <antlr/LexerSharedInputState.hpp> |
35 |
|
36 |
#ifdef ANTLR_CXX_SUPPORTS_NAMESPACE |
37 |
namespace antlr { |
38 |
#endif |
39 |
|
40 |
class ANTLR_API CharScanner; |
41 |
|
42 |
ANTLR_C_USING(tolower) |
43 |
|
44 |
#if !defined(HAVE_STRCASECMP) && defined(HAVE_STRICMP) && !defined(stricmp) |
45 |
#define strcasecmp stricmp |
46 |
#endif |
47 |
#if !defined(HAVE_STRNCASECMP) && defined(HAVE_STRNICMP) && !defined(strnicmp) |
48 |
#define strncasecmp strnicmp |
49 |
#endif |
50 |
|
51 |
|
52 |
#if !defined(HAVE_STRCASECMP) && !defined(HAVE_STRICMP) |
53 |
/** Fallback case-insensitive C-string comparison, compiled only when the
 * platform offers neither strcasecmp nor stricmp.
 *
 * Both strings are walked in lock-step; each character is lower-cased with
 * tolower() before being compared.
 *
 * @param s1 first NUL-terminated string (must not be null)
 * @param s2 second NUL-terminated string (must not be null)
 * @return -1 if s1 sorts before s2, 1 if it sorts after, 0 if they are
 *         equal when case is ignored
 */
inline int strcasecmp(const char *s1, const char *s2)
{
    while (true)
    {
        // Convert through unsigned char: handing tolower() a negative value
        // (a plain char >= 0x80 on signed-char platforms) is undefined, and
        // the ordering has to be done on the unsigned character values.
        const int c1 = tolower(static_cast<unsigned char>(*s1++));
        const int c2 = tolower(static_cast<unsigned char>(*s2++));

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;
        if (c1 == 0)    // both strings ended at the same position
            return 0;
    }
}
64 |
#endif |
65 |
|
66 |
/** Functor for the literals map |
67 |
*/ |
68 |
class ANTLR_API CharScannerLiteralsLess : public ANTLR_USE_NAMESPACE(std)binary_function<ANTLR_USE_NAMESPACE(std)string,ANTLR_USE_NAMESPACE(std)string,bool> { |
69 |
private: |
70 |
const CharScanner* scanner; |
71 |
public: |
72 |
#ifdef NO_TEMPLATE_PARTS |
73 |
CharScannerLiteralsLess() {} // not really used, definition to appease MSVC |
74 |
#endif |
75 |
CharScannerLiteralsLess(const CharScanner* theScanner) |
76 |
: scanner(theScanner) |
77 |
{ |
78 |
} |
79 |
bool operator() (const ANTLR_USE_NAMESPACE(std)string& x,const ANTLR_USE_NAMESPACE(std)string& y) const; |
80 |
// defaults are good enough.. |
81 |
// CharScannerLiteralsLess(const CharScannerLiteralsLess&); |
82 |
// CharScannerLiteralsLess& operator=(const CharScannerLiteralsLess&); |
83 |
}; |
84 |
|
85 |
/** Superclass of generated lexers |
86 |
*/ |
87 |
class ANTLR_API CharScanner : public TokenStream { |
88 |
protected: |
89 |
typedef RefToken (*factory_type)(); |
90 |
public: |
91 |
CharScanner(InputBuffer& cb, bool case_sensitive ); |
92 |
CharScanner(InputBuffer* cb, bool case_sensitive ); |
93 |
CharScanner(const LexerSharedInputState& state, bool case_sensitive ); |
94 |
|
95 |
virtual ~CharScanner() |
96 |
{ |
97 |
} |
98 |
|
99 |
virtual int LA(unsigned int i); |
100 |
|
101 |
virtual void append(char c) |
102 |
{ |
103 |
if (saveConsumedInput) |
104 |
{ |
105 |
size_t l = text.length(); |
106 |
|
107 |
if ((l%256) == 0) |
108 |
text.reserve(l+256); |
109 |
|
110 |
text.replace(l,0,&c,1); |
111 |
} |
112 |
} |
113 |
|
114 |
virtual void append(const ANTLR_USE_NAMESPACE(std)string& s) |
115 |
{ |
116 |
if( saveConsumedInput ) |
117 |
text += s; |
118 |
} |
119 |
|
120 |
virtual void commit() |
121 |
{ |
122 |
inputState->getInput().commit(); |
123 |
} |
124 |
|
125 |
virtual void consume() |
126 |
{ |
127 |
if (inputState->guessing == 0) |
128 |
{ |
129 |
int c = LA(1); |
130 |
if (caseSensitive) |
131 |
{ |
132 |
append(c); |
133 |
} |
134 |
else |
135 |
{ |
136 |
// use input.LA(), not LA(), to get original case |
137 |
// CharScanner.LA() would toLower it. |
138 |
append(inputState->getInput().LA(1)); |
139 |
} |
140 |
|
141 |
// RK: in a sense I don't like this automatic handling. |
142 |
if (c == '\t') |
143 |
tab(); |
144 |
else |
145 |
inputState->column++; |
146 |
} |
147 |
inputState->getInput().consume(); |
148 |
} |
149 |
|
150 |
/** Consume chars until one matches the given char */ |
151 |
virtual void consumeUntil(int c) |
152 |
{ |
153 |
for(;;) |
154 |
{ |
155 |
int la_1 = LA(1); |
156 |
if( la_1 == EOF_CHAR || la_1 == c ) |
157 |
break; |
158 |
consume(); |
159 |
} |
160 |
} |
161 |
|
162 |
/** Consume chars until one matches the given set */ |
163 |
virtual void consumeUntil(const BitSet& set) |
164 |
{ |
165 |
for(;;) |
166 |
{ |
167 |
int la_1 = LA(1); |
168 |
if( la_1 == EOF_CHAR || set.member(la_1) ) |
169 |
break; |
170 |
consume(); |
171 |
} |
172 |
} |
173 |
|
174 |
/// Mark the current position and return a id for it |
175 |
virtual unsigned int mark() |
176 |
{ |
177 |
return inputState->getInput().mark(); |
178 |
} |
179 |
/// Rewind the scanner to a previously marked position |
180 |
virtual void rewind(unsigned int pos) |
181 |
{ |
182 |
inputState->getInput().rewind(pos); |
183 |
} |
184 |
|
185 |
/// See if input contains character 'c' throw MismatchedCharException if not |
186 |
virtual void match(int c) |
187 |
{ |
188 |
int la_1 = LA(1); |
189 |
if ( la_1 != c ) |
190 |
throw MismatchedCharException(la_1, c, false, this); |
191 |
consume(); |
192 |
} |
193 |
|
194 |
/** See if input contains element from bitset b |
195 |
* throw MismatchedCharException if not |
196 |
*/ |
197 |
virtual void match(const BitSet& b) |
198 |
{ |
199 |
int la_1 = LA(1); |
200 |
|
201 |
if ( !b.member(la_1) ) |
202 |
throw MismatchedCharException( la_1, b, false, this ); |
203 |
consume(); |
204 |
} |
205 |
|
206 |
/** See if input contains string 's' throw MismatchedCharException if not |
207 |
* @note the string cannot match EOF |
208 |
*/ |
209 |
virtual void match( const char* s ) |
210 |
{ |
211 |
while( *s != '\0' ) |
212 |
{ |
213 |
// the & 0xFF is here to prevent sign extension lateron |
214 |
int la_1 = LA(1), c = (*s++ & 0xFF); |
215 |
|
216 |
if ( la_1 != c ) |
217 |
throw MismatchedCharException(la_1, c, false, this); |
218 |
|
219 |
consume(); |
220 |
} |
221 |
} |
222 |
/** See if input contains string 's' throw MismatchedCharException if not |
223 |
* @note the string cannot match EOF |
224 |
*/ |
225 |
virtual void match(const ANTLR_USE_NAMESPACE(std)string& s) |
226 |
{ |
227 |
size_t len = s.length(); |
228 |
|
229 |
for (size_t i = 0; i < len; i++) |
230 |
{ |
231 |
// the & 0xFF is here to prevent sign extension lateron |
232 |
int la_1 = LA(1), c = (s[i] & 0xFF); |
233 |
|
234 |
if ( la_1 != c ) |
235 |
throw MismatchedCharException(la_1, c, false, this); |
236 |
|
237 |
consume(); |
238 |
} |
239 |
} |
240 |
/** See if input does not contain character 'c' |
241 |
* throw MismatchedCharException if not |
242 |
*/ |
243 |
virtual void matchNot(int c) |
244 |
{ |
245 |
int la_1 = LA(1); |
246 |
|
247 |
if ( la_1 == c ) |
248 |
throw MismatchedCharException(la_1, c, true, this); |
249 |
|
250 |
consume(); |
251 |
} |
252 |
/** See if input contains character in range c1-c2 |
253 |
* throw MismatchedCharException if not |
254 |
*/ |
255 |
virtual void matchRange(int c1, int c2) |
256 |
{ |
257 |
int la_1 = LA(1); |
258 |
|
259 |
if ( la_1 < c1 || la_1 > c2 ) |
260 |
throw MismatchedCharException(la_1, c1, c2, false, this); |
261 |
|
262 |
consume(); |
263 |
} |
264 |
|
265 |
virtual bool getCaseSensitive() const |
266 |
{ |
267 |
return caseSensitive; |
268 |
} |
269 |
|
270 |
virtual void setCaseSensitive(bool t) |
271 |
{ |
272 |
caseSensitive = t; |
273 |
} |
274 |
|
275 |
virtual bool getCaseSensitiveLiterals() const=0; |
276 |
|
277 |
/// Get the line the scanner currently is in (starts at 1) |
278 |
virtual int getLine() const |
279 |
{ |
280 |
return inputState->line; |
281 |
} |
282 |
|
283 |
/// set the line number |
284 |
virtual void setLine(int l) |
285 |
{ |
286 |
inputState->line = l; |
287 |
} |
288 |
|
289 |
/// Get the column the scanner currently is in (starts at 1) |
290 |
virtual int getColumn() const |
291 |
{ |
292 |
return inputState->column; |
293 |
} |
294 |
/// set the column number |
295 |
virtual void setColumn(int c) |
296 |
{ |
297 |
inputState->column = c; |
298 |
} |
299 |
|
300 |
/// get the filename for the file currently used |
301 |
virtual const ANTLR_USE_NAMESPACE(std)string& getFilename() const |
302 |
{ |
303 |
return inputState->filename; |
304 |
} |
305 |
/// Set the filename the scanner is using (used in error messages) |
306 |
virtual void setFilename(const ANTLR_USE_NAMESPACE(std)string& f) |
307 |
{ |
308 |
inputState->filename = f; |
309 |
} |
310 |
|
311 |
virtual bool getCommitToPath() const |
312 |
{ |
313 |
return commitToPath; |
314 |
} |
315 |
|
316 |
virtual void setCommitToPath(bool commit) |
317 |
{ |
318 |
commitToPath = commit; |
319 |
} |
320 |
|
321 |
/** return a copy of the current text buffer */ |
322 |
virtual const ANTLR_USE_NAMESPACE(std)string& getText() const |
323 |
{ |
324 |
return text; |
325 |
} |
326 |
|
327 |
virtual void setText(const ANTLR_USE_NAMESPACE(std)string& s) |
328 |
{ |
329 |
text = s; |
330 |
} |
331 |
|
332 |
virtual void resetText() |
333 |
{ |
334 |
text = ""; |
335 |
inputState->tokenStartColumn = inputState->column; |
336 |
inputState->tokenStartLine = inputState->line; |
337 |
} |
338 |
|
339 |
virtual RefToken getTokenObject() const |
340 |
{ |
341 |
return _returnToken; |
342 |
} |
343 |
|
344 |
/** Used to keep track of line breaks, needs to be called from |
345 |
* within generated lexers when a \n \r is encountered. |
346 |
*/ |
347 |
virtual void newline() |
348 |
{ |
349 |
++inputState->line; |
350 |
inputState->column = 1; |
351 |
} |
352 |
|
353 |
/** Advance the current column number by an appropriate amount according |
354 |
* to the tabsize. This method needs to be explicitly called from the |
355 |
* lexer rules encountering tabs. |
356 |
*/ |
357 |
virtual void tab() |
358 |
{ |
359 |
int c = getColumn(); |
360 |
int nc = ( ((c-1)/tabsize) + 1) * tabsize + 1; // calculate tab stop |
361 |
setColumn( nc ); |
362 |
} |
363 |
/// set the tabsize. Returns the old tabsize |
364 |
int setTabsize( int size ) |
365 |
{ |
366 |
int oldsize = tabsize; |
367 |
tabsize = size; |
368 |
return oldsize; |
369 |
} |
370 |
/// Return the tabsize used by the scanner |
371 |
int getTabSize() const |
372 |
{ |
373 |
return tabsize; |
374 |
} |
375 |
|
376 |
/** Report exception errors caught in nextToken() */ |
377 |
virtual void reportError(const RecognitionException& e); |
378 |
|
379 |
/** Parser error-reporting function can be overridden in subclass */ |
380 |
virtual void reportError(const ANTLR_USE_NAMESPACE(std)string& s); |
381 |
|
382 |
/** Parser warning-reporting function can be overridden in subclass */ |
383 |
virtual void reportWarning(const ANTLR_USE_NAMESPACE(std)string& s); |
384 |
|
385 |
virtual InputBuffer& getInputBuffer() |
386 |
{ |
387 |
return inputState->getInput(); |
388 |
} |
389 |
|
390 |
virtual LexerSharedInputState getInputState() |
391 |
{ |
392 |
return inputState; |
393 |
} |
394 |
|
395 |
/** set the input state for the lexer. |
396 |
* @note state is a reference counted object, hence no reference */ |
397 |
virtual void setInputState(LexerSharedInputState state) |
398 |
{ |
399 |
inputState = state; |
400 |
} |
401 |
|
402 |
/// Set the factory for created tokens |
403 |
virtual void setTokenObjectFactory(factory_type factory) |
404 |
{ |
405 |
tokenFactory = factory; |
406 |
} |
407 |
|
408 |
/** Test the token text against the literals table |
409 |
* Override this method to perform a different literals test |
410 |
*/ |
411 |
virtual int testLiteralsTable(int ttype) const |
412 |
{ |
413 |
ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess>::const_iterator i = literals.find(text); |
414 |
if (i != literals.end()) |
415 |
ttype = (*i).second; |
416 |
return ttype; |
417 |
} |
418 |
|
419 |
/** Test the text passed in against the literals table |
420 |
* Override this method to perform a different literals test |
421 |
* This is used primarily when you want to test a portion of |
422 |
* a token |
423 |
*/ |
424 |
virtual int testLiteralsTable(const ANTLR_USE_NAMESPACE(std)string& txt,int ttype) const |
425 |
{ |
426 |
ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess>::const_iterator i = literals.find(txt); |
427 |
if (i != literals.end()) |
428 |
ttype = (*i).second; |
429 |
return ttype; |
430 |
} |
431 |
|
432 |
/// Override this method to get more specific case handling |
433 |
virtual int toLower(int c) const |
434 |
{ |
435 |
// test on EOF_CHAR for buggy (?) STLPort tolower (or HPUX tolower?) |
436 |
// also VC++ 6.0 does this. (see fix 422 (is reverted by this fix) |
437 |
// this one is more structural. Maybe make this configurable. |
438 |
return (c == EOF_CHAR ? EOF_CHAR : tolower(c)); |
439 |
} |
440 |
|
441 |
/** This method is called by YourLexer::nextToken() when the lexer has |
442 |
* hit EOF condition. EOF is NOT a character. |
443 |
* This method is not called if EOF is reached during |
444 |
* syntactic predicate evaluation or during evaluation |
445 |
* of normal lexical rules, which presumably would be |
446 |
* an IOException. This traps the "normal" EOF condition. |
447 |
* |
448 |
* uponEOF() is called after the complete evaluation of |
449 |
* the previous token and only if your parser asks |
450 |
* for another token beyond that last non-EOF token. |
451 |
* |
452 |
* You might want to throw token or char stream exceptions |
453 |
* like: "Heh, premature eof" or a retry stream exception |
454 |
* ("I found the end of this file, go back to referencing file"). |
455 |
*/ |
456 |
virtual void uponEOF() |
457 |
{ |
458 |
} |
459 |
|
460 |
/// Methods used to change tracing behavior |
461 |
virtual void traceIndent(); |
462 |
virtual void traceIn(const char* rname); |
463 |
virtual void traceOut(const char* rname); |
464 |
|
465 |
#ifndef NO_STATIC_CONSTS |
466 |
static const int EOF_CHAR = EOF; |
467 |
#else |
468 |
enum { |
469 |
EOF_CHAR = EOF |
470 |
}; |
471 |
#endif |
472 |
protected: |
473 |
ANTLR_USE_NAMESPACE(std)string text; ///< Text of current token |
474 |
/// flag indicating wether consume saves characters |
475 |
bool saveConsumedInput; |
476 |
factory_type tokenFactory; ///< Factory for tokens |
477 |
bool caseSensitive; ///< Is this lexer case sensitive |
478 |
ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess> literals; // set by subclass |
479 |
|
480 |
RefToken _returnToken; ///< used to return tokens w/o using return val |
481 |
|
482 |
/// Input state, gives access to input stream, shared among different lexers |
483 |
LexerSharedInputState inputState; |
484 |
|
485 |
/** Used during filter mode to indicate that path is desired. |
486 |
* A subsequent scan error will report an error as usual |
487 |
* if acceptPath=true; |
488 |
*/ |
489 |
bool commitToPath; |
490 |
|
491 |
int tabsize; ///< tab size the scanner uses. |
492 |
|
493 |
/// Create a new RefToken of type t |
494 |
virtual RefToken makeToken(int t) |
495 |
{ |
496 |
RefToken tok = tokenFactory(); |
497 |
tok->setType(t); |
498 |
tok->setColumn(inputState->tokenStartColumn); |
499 |
tok->setLine(inputState->tokenStartLine); |
500 |
return tok; |
501 |
} |
502 |
|
503 |
/** Tracer class, used when -traceLexer is passed to antlr |
504 |
*/ |
505 |
class Tracer { |
506 |
private: |
507 |
CharScanner* parser; |
508 |
const char* text; |
509 |
|
510 |
Tracer(const Tracer& other); // undefined |
511 |
Tracer& operator=(const Tracer& other); // undefined |
512 |
public: |
513 |
Tracer( CharScanner* p,const char* t ) |
514 |
: parser(p), text(t) |
515 |
{ |
516 |
parser->traceIn(text); |
517 |
} |
518 |
~Tracer() |
519 |
{ |
520 |
parser->traceOut(text); |
521 |
} |
522 |
}; |
523 |
|
524 |
int traceDepth; |
525 |
private: |
526 |
CharScanner( const CharScanner& other ); // undefined |
527 |
CharScanner& operator=( const CharScanner& other ); // undefined |
528 |
|
529 |
#ifndef NO_STATIC_CONSTS |
530 |
static const int NO_CHAR = 0; |
531 |
#else |
532 |
enum { |
533 |
NO_CHAR = 0 |
534 |
}; |
535 |
#endif |
536 |
}; |
537 |
|
538 |
inline int CharScanner::LA(unsigned int i) |
539 |
{ |
540 |
int c = inputState->getInput().LA(i); |
541 |
|
542 |
if ( caseSensitive ) |
543 |
return c; |
544 |
else |
545 |
return toLower(c); // VC 6 tolower bug caught in toLower. |
546 |
} |
547 |
|
548 |
inline bool CharScannerLiteralsLess::operator() (const ANTLR_USE_NAMESPACE(std)string& x,const ANTLR_USE_NAMESPACE(std)string& y) const |
549 |
{ |
550 |
if (scanner->getCaseSensitiveLiterals()) |
551 |
return ANTLR_USE_NAMESPACE(std)less<ANTLR_USE_NAMESPACE(std)string>()(x,y); |
552 |
else |
553 |
{ |
554 |
#ifdef NO_STRCASECMP |
555 |
return (stricmp(x.c_str(),y.c_str())<0); |
556 |
#else |
557 |
return (strcasecmp(x.c_str(),y.c_str())<0); |
558 |
#endif |
559 |
} |
560 |
} |
561 |
|
562 |
#ifdef ANTLR_CXX_SUPPORTS_NAMESPACE |
563 |
} |
564 |
#endif |
565 |
|
566 |
#endif //INC_CharScanner_hpp__ |