bratlex.py
2.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#!/usr/bin/env python
'''
Tokenisation for the brat stand-off format.
Example, test tokenisation on a collection:
find . -name '*.ann' | parallel cat | ./bratlex.py
Author: Pontus Stenetorp <pontus stenetorp se>
Version: 2011-07-11
'''
try:
import ply.lex as lex
except ImportError:
# We need to add ply to path
from sys import path as sys_path
from os.path import join as path_join
from os.path import dirname
sys_path.append(path_join(dirname(__file__), '../lib/ply-3.4'))
import ply.lex as lex
# Names of every token type this lexer can emit, assembled from groups that
# share a role in the stand-off format.
tokens = (
    # Primitives
    ('COLON', 'NEWLINE', 'SPACE', 'TAB', 'WILDCARD')
    # Identifiers
    + ('COMMENT_ID', 'EVENT_ID', 'MODIFIER_ID')
    + ('RELATION_ID', 'TEXT_BOUND_ID')
    # Values
    + ('INTEGER', 'TYPE')
    # Special-case for freetext
    + ('FREETEXT',)
)
# One additional lexer state, 'freetext', declared as exclusive; the
# t_freetext_* rules further down in this file belong to it.
states = (('freetext', 'exclusive'),)

# Single-character tokens, given directly as regular-expression strings.
t_COLON = r':'
# NOTE(review): the space is escaped presumably because ply compiles its
# patterns in verbose mode, where a bare space is ignored - confirm.
t_SPACE = r'\ '
t_WILDCARD = r'\*'
# Token rule: comment identifier, a '#' followed by one or more digits.
def t_COMMENT_ID(t):
    r'\#[0-9]+'
    # The raw string above is the rule's regular expression (read by ply from
    # the docstring), so it has to remain the first statement of the function.
    return t
# Token rule: event identifier, an 'E' followed by one or more digits.
def t_EVENT_ID(t):
    r'E[0-9]+'
    # The raw string above is the rule's regular expression (read by ply from
    # the docstring), so it has to remain the first statement of the function.
    return t
# Token rule: modifier identifier, an 'M' followed by one or more digits.
def t_MODIFIER_ID(t):
    r'M[0-9]+'
    # The raw string above is the rule's regular expression (read by ply from
    # the docstring), so it has to remain the first statement of the function.
    return t
# Token rule: relation identifier, an 'R' followed by one or more digits.
def t_RELATION_ID(t):
    r'R[0-9]+'
    # The raw string above is the rule's regular expression (read by ply from
    # the docstring), so it has to remain the first statement of the function.
    return t
# Token rule: text-bound identifier, a 'T' followed by one or more digits.
def t_TEXT_BOUND_ID(t):
    r'T[0-9]+'
    # The raw string above is the rule's regular expression (read by ply from
    # the docstring), so it has to remain the first statement of the function.
    return t
# Token rule: a line break met outside of free text. Besides emitting the
# token it updates the per-line bookkeeping kept on the lexer object.
def t_NEWLINE(t):
    r'\n'
    # Increment the lexer's line count
    t.lexer.lineno += 1
    # Reset the count of tabs on this line
    t.lexer.line_tab_count = 0
    return t
# Token rule: a tab met outside of free text. The second tab on a line
# switches the lexer into the 'freetext' state.
def t_TAB(t):
    r'\t'
    # Increment the number of tabs we have seen on this line
    t.lexer.line_tab_count += 1
    if t.lexer.line_tab_count == 2:
        # From here on the t_freetext_* rules are the ones applied
        t.lexer.begin('freetext')
    return t
# Token rule: a run of digits; the token value is converted from text to int.
def t_INTEGER(t):
    r'\d+'
    # Replace the matched text with its numeric value
    t.value = int(t.value)
    return t
# Token rule: a type name, i.e. an upper-case letter followed by any number of
# letters, underscores or hyphens.
def t_TYPE(t):
    r'[A-Z][A-Za-z_-]*'
    # The raw string above is the rule's regular expression (read by ply from
    # the docstring), so it has to remain the first statement of the function.
    return t
# Token rule for the 'freetext' state: any run of characters that contains
# neither a newline nor a tab.
def t_freetext_FREETEXT(t):
    r'[^\n\t]+'
    # The raw string above is the rule's regular expression (read by ply from
    # the docstring), so it has to remain the first statement of the function.
    return t
# Token rule for the 'freetext' state: a tab ends the free text and puts the
# lexer back into the INITIAL state.
def t_freetext_TAB(t):
    r'\t'
    # End freetext mode, return to INITIAL
    t.lexer.begin('INITIAL')
    return t
# Token rule for the 'freetext' state: a line break ends the free text,
# updates the per-line bookkeeping and puts the lexer back into INITIAL.
def t_freetext_NEWLINE(t):
    r'\n'
    # Increment the lexer's line count
    t.lexer.lineno += 1
    # Reset the count of tabs on this line
    t.lexer.line_tab_count = 0
    # End freetext mode, return to INITIAL
    t.lexer.begin('INITIAL')
    return t
# Error handling rule
def t_error(t):
    # Called for input that no rule matches. The offending character is
    # t.value[0]; it is reported on standard output and then lexing is aborted.
    #
    # Raises:
    #     Exception: always, carrying the same text that was printed.
    message = "Illegal character '%s'" % t.value[0]
    # A single parenthesised argument prints identically on Python 2 and 3
    print(message)
    # No skip-and-continue: an illegal character is treated as fatal
    raise Exception(message)
# Error rule for the 'freetext' state: handled exactly like an error in the
# default state by delegating to t_error.
def t_freetext_error(t):
    return t_error(t)
# Build the module-level lexer. This has to come after every rule definition;
# per the ply documentation lex.lex() collects `tokens`, `states` and the t_*
# rules from the calling module.
lexer = lex.lex()
# Tab counter for the current line: t_TAB increments it and the NEWLINE rules
# reset it, so it must exist before the first token is read.
lexer.line_tab_count = 0
if __name__ == '__main__':
    # Command-line test driver: tokenise standard input line by line and print
    # every token produced. An illegal character aborts the run through the
    # exception raised by t_error.
    from sys import stdin

    for line in stdin:
        lexer.input(line)
        for tok in lexer:
            # A single parenthesised argument prints identically on Python 2 and 3
            print(tok)