import collections
import re

# Record produced for every lexeme: token type, matched text, 1-based line
# number and 0-based column within that line.
Token = collections.namedtuple('Token', ['typ', 'value', 'line', 'column'])


def tokenize(s):
    """Lazily split the string *s* into ``Token`` tuples.

    Recognised lexemes are numbers (integer or decimal), ``:=``, ``;``,
    alphabetic identifiers and the arithmetic operators ``+ - * /``.
    Spaces and tabs are skipped; newlines are not emitted but advance the
    line counter. An identifier whose text is one of the reserved words
    (IF, THEN, ENDIF, FOR, NEXT, GOSUB, RETURN) gets that word as its type
    instead of ``'ID'``.

    Args:
        s: The source text to scan.

    Yields:
        Token(typ, value, line, column) for each lexeme, in input order.
        ``line`` starts at 1; ``column`` is the 0-based offset from the
        start of the token's line.

    Raises:
        RuntimeError: When scanning stops before the end of *s* because no
            pattern matches at the current position. Since this is a
            generator, the error surfaces while iterating, after all
            preceding tokens have been yielded.
    """
    keywords = {'IF', 'THEN', 'ENDIF', 'FOR', 'NEXT', 'GOSUB', 'RETURN'}
    # Raw strings with single backslashes: doubling them would make the
    # patterns match literal backslash characters instead of digits,
    # newlines and tabs.
    token_specification = [
        ('NUMBER', r'\d+(?:\.\d*)?'),  # Integer or decimal number
        ('ASSIGN', r':='),             # Assignment operator
        ('END', r';'),                 # Statement terminator
        ('ID', r'[A-Za-z]+'),          # Identifiers
        ('OP', r'[+*\/\-]'),           # Arithmetic operators
        ('NEWLINE', r'\n'),            # Line endings
        ('SKIP', r'[ \t]'),            # Skip over spaces and tabs
    ]
    # One alternation of named groups; the name of the group that matched
    # (mo.lastgroup) identifies the token type.
    tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
    get_token = re.compile(tok_regex).match
    line = 1
    pos = line_start = 0
    mo = get_token(s)
    while mo is not None:
        typ = mo.lastgroup
        if typ == 'NEWLINE':
            # The next line begins right after the newline character, so
            # columns are 0-based on every line, not only the first one.
            line_start = mo.end()
            line += 1
        elif typ != 'SKIP':
            val = mo.group(typ)
            if typ == 'ID' and val in keywords:
                typ = val
            yield Token(typ, val, line, mo.start() - line_start)
        pos = mo.end()
        mo = get_token(s, pos)
    # The loop ends when nothing matches at pos; stopping short of the end
    # means an unrecognised character was hit.
    if pos != len(s):
        raise RuntimeError('Unexpected character %r on line %d' % (s[pos], line))
# Demo input for tokenize(): a leading ':=' followed by a small IF block.
# Written as adjacent string pieces, one per source line of the sample.
statements = (
    ':=\n'
    'IF quantity THEN\n'
    'total := total + price * quantity;\n'
    'tax := price * 0.05;\n'
    'ENDIF;\n'
)

# Print each token produced for the demo input, one per line.
for token in tokenize(statements):
    print(token)
# This snippet comes from http://www.codesnippet.cn/detail/140620134040.html
# Source: http://www.codesnippet.cn/detail/140620134040.html