mecab.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
MeCab wrapper for brat
http://mecab.sourceforge.net/
Author: Pontus Stenetorp <pontus stenetorp se>
Version: 2011-05-17
'''
from os.path import dirname
from os.path import join as path_join
from re import compile as re_compile
from re import DOTALL

### Constants
# TODO: EXTERNAL_DIR_PATH really should be specified elsewhere
EXTERNAL_DIR_PATH = path_join(dirname(__file__), '..', '..', 'external')
MECAB_PYTHON_PATH = path_join(EXTERNAL_DIR_PATH, 'mecab-python-0.98')
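
# Each Wakati token is a run that starts with a non-space character and ends
# at the single separator space Wakati inserts after it (or at end of output)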
WAKATI_REGEX = re_compile(r'(\S.*?)(?:(?:(?<!\s)\s|$))', DOTALL)
###

try:
    import MeCab as mecab
except ImportError:
    # We probably haven't added the path yet
    from sys import path as sys_path
    sys_path.append(MECAB_PYTHON_PATH)
    import MeCab as mecab

# Boundaries are of the form: [start, end]
def token_offsets_gen(text):
    # Parse in Wakati format
    tagger = mecab.Tagger('-O wakati')

    # Parse into Wakati format, MeCab only takes utf-8
    parse = tagger.parse(text.encode('utf-8'))

    # Remember to decode or you WILL get the number of bytes
    parse = parse.decode('utf-8')

    # Wakati inserts spaces, but only after non-space tokens.
    # We find these iteratively and then allow additional spaces to be treated
    # as separate tokens.

    # XXX: MeCab strips newlines, so we need to re-align ourselves against
    #   the original text
    last_end = 0
    for tok in (m.group(1) for m in WAKATI_REGEX.finditer(parse)):
        start = text.find(tok, last_end)
        end = start + len(tok)
        yield [start, end]
        last_end = end

if __name__ == '__main__':
    # Minor test: Is it a duck? Maybe?
    sentence = u'鴨かも?'
    token_offsets = [t for t in token_offsets_gen(sentence)]
    # End offsets are exclusive, so slice with them directly
    segmented = [sentence[start:end] for start, end in token_offsets]

    print '\t'.join((sentence, unicode(token_offsets), '|'.join(segmented)))
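
For reference, here is a minimal sketch of the same wakati-based offset recovery under Python 3, assuming the mecab-python3 package (with a dictionary such as unidic-lite) is installed; those package names and token_offsets_gen_py3 are illustrative assumptions, not part of brat.

# Minimal Python 3 sketch; assumes `pip install mecab-python3 unidic-lite`
from re import compile as re_compile
from re import DOTALL

import MeCab

WAKATI_REGEX = re_compile(r'(\S.*?)(?:(?:(?<!\s)\s|$))', DOTALL)

def token_offsets_gen_py3(text):
    # mecab-python3 accepts and returns str, so no encode/decode is needed
    tagger = MeCab.Tagger('-Owakati')
    parse = tagger.parse(text)
    last_end = 0
    for m in WAKATI_REGEX.finditer(parse):
        tok = m.group(1)
        # Re-align the token against the original text (MeCab drops newlines)
        start = text.find(tok, last_end)
        end = start + len(tok)
        yield [start, end]
        last_end = end

if __name__ == '__main__':
    sentence = '鴨かも?'
    offsets = list(token_offsets_gen_py3(sentence))
    print('|'.join(sentence[start:end] for start, end in offsets))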