Commit a6443fde71219cfa91058232eb1b0f039b1a5661
1 parent: df3ada33
poprawki w budowniczym - żeby w końcu działało np. dla "dwuipółletni"
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@129 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 5 changed files with 25 additions and 22 deletions
CMakeLists.txt
... | ... | @@ -36,7 +36,7 @@ if ("${INPUT_DICTIONARIES}" STREQUAL "") |
36 | 36 | if ("${EMPTY_INPUT_DICTIONARY}" STREQUAL "TRUE") |
37 | 37 | set (INPUT_DICTIONARIES ${PROJECT_SOURCE_DIR}/input/empty.txt) |
38 | 38 | else () |
39 | - set (INPUT_DICTIONARIES "${PROJECT_SOURCE_DIR}/input/sgjp-hom.tab,${PROJECT_SOURCE_DIR}/input/dodatki.tab") | |
39 | + set (INPUT_DICTIONARIES "${PROJECT_SOURCE_DIR}/input/PoliMorfSmall.tab,${PROJECT_SOURCE_DIR}/input/dodatki.tab") | |
40 | 40 | endif () |
41 | 41 | endif () |
42 | 42 | |
... | ... |
fsabuilder/buildanalyzer.sh
1 | 1 | #!/bin/bash |
2 | 2 | |
3 | -python buildfsa.py --input-files=../input/sgjp-hom.tab,../input/dodatki.tab \ | |
4 | - --tagset-file=../input/polimorf.tagset \ | |
5 | - --segments-file=../input/segmenty.dat \ | |
3 | +python buildfsa.py --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab \ | |
4 | + --tagset-file=/tmp/polimorf-sgjp.tagset \ | |
5 | + --segments-file=/tmp/segmenty.dat \ | |
6 | 6 | --analyzer \ |
7 | 7 | --serialization-method=SIMPLE \ |
8 | 8 | --trim-supneg \ |
... | ... |
fsabuilder/morfeuszbuilder/tagset/segtypes.py
... | ... | @@ -7,6 +7,11 @@ import re |
7 | 7 | import logging |
8 | 8 | from morfeuszbuilder.utils import exceptions |
9 | 9 | |
10 | +def _cutHomonymFromLemma(lemma): | |
11 | + if lemma: | |
12 | + lemma = lemma.split(':')[0] if lemma and len(lemma) > 1 else lemma | |
13 | + return lemma | |
14 | + | |
10 | 15 | class Segtypes(object): |
11 | 16 | |
12 | 17 | def __init__(self, tagset, segrulesConfigFile): |
... | ... | @@ -33,6 +38,8 @@ class Segtypes(object): |
33 | 38 | |
34 | 39 | print self.segnum2Segtype |
35 | 40 | |
41 | +# self._debugSegnums() | |
42 | + | |
36 | 43 | def _validate(self, msg, lineNum, cond): |
37 | 44 | if not cond: |
38 | 45 | raise exceptions.ConfigFileException(self.filename, lineNum, msg) |
... | ... | @@ -171,15 +178,16 @@ class Segtypes(object): |
171 | 178 | return self.segtype2Segnum[segTypeString] |
172 | 179 | |
173 | 180 | def lexeme2Segnum(self, lemma, tagnum): |
181 | + lemma = _cutHomonymFromLemma(lemma) | |
174 | 182 | res = self._lemmaTagnum2Segnum.get((lemma, tagnum), None) |
175 | - if not res: | |
183 | + if res is None: | |
176 | 184 | res = self._tagnum2Segnum.get(tagnum, None) |
177 | 185 | return res |
178 | 186 | |
179 | 187 | class SegtypePattern(object): |
180 | 188 | |
181 | 189 | def __init__(self, lemma, pattern, segnum): |
182 | - self.lemma = lemma.split(':')[0] if lemma and len(lemma) > 1 else lemma | |
190 | + self.lemma = _cutHomonymFromLemma(lemma) | |
183 | 191 | self.pattern = pattern |
184 | 192 | self.segnum = segnum |
185 | 193 | |
... | ... | @@ -189,8 +197,7 @@ class SegtypePattern(object): |
189 | 197 | patterns2Match = [] |
190 | 198 | patterns2Match.append(self.pattern.replace('%', '.*')) |
191 | 199 | patterns2Match.append(re.sub(r'\:\%$', '', self.pattern).replace('%', '.*')) |
192 | - if lemma: | |
193 | - lemma = lemma.split(':')[0] if lemma and len(lemma) > 1 else lemma | |
200 | + lemma = _cutHomonymFromLemma(lemma) | |
194 | 201 | if (self.lemma is None or self.lemma == lemma) \ |
195 | 202 | and any([re.match(p, tag) for p in patterns2Match]): |
196 | 203 | return self.segnum |
... | ... |
input/PoliMorfSmall.tab
... | ... | @@ -697,3 +697,13 @@ biało biały adja pospolita |
697 | 697 | czerwony czerwony:homo1 adj:sg:acc:m3:pos pospolita |
698 | 698 | czerwony czerwony:homo2 adj:sg:acc:m3:pos pospolita |
699 | 699 | że +że qub pospolita |
700 | +wielo wiele:n1 num:comp | |
701 | +pół pół num:comp | |
702 | +pół pół num:pl:nom.gen.dat.inst.acc.loc.voc:m1.m2.m3.f.n1.n2.p1.p2:rec | |
703 | +i i:i interj | |
704 | +i i:j conj | |
705 | +i i:q qub | |
706 | +dwu dwa num:comp | |
707 | +dwu dwa num:pl:gen.dat.loc:m1.m2.m3.n2.f:congr | |
708 | +dwu dwa num:pl:inst:m1.m2.m3.n2:congr | |
709 | +dwu dwa num:pl:nom.acc.voc:m1:rec | |
... | ... |
nbproject/configurations.xml
... | ... | @@ -112,8 +112,6 @@ |
112 | 112 | <item path="build/default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> |
113 | 113 | </item> |
114 | 114 | <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4"> |
115 | - <ccTool flags="1"> | |
116 | - </ccTool> | |
117 | 115 | </item> |
118 | 116 | <item path="build/morfeusz/morfeuszJAVA_wrap.cxx" |
119 | 117 | ex="false" |
... | ... | @@ -394,26 +392,18 @@ |
394 | 392 | </ccTool> |
395 | 393 | </item> |
396 | 394 | <item path="morfeusz/charset/CaseConverter.cpp" ex="false" tool="1" flavor2="4"> |
397 | - <ccTool flags="1"> | |
398 | - </ccTool> | |
399 | 395 | </item> |
400 | 396 | <item path="morfeusz/charset/CharsetConverter.cpp" |
401 | 397 | ex="false" |
402 | 398 | tool="1" |
403 | 399 | flavor2="4"> |
404 | - <ccTool flags="1"> | |
405 | - </ccTool> | |
406 | 400 | </item> |
407 | 401 | <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4"> |
408 | - <ccTool flags="1"> | |
409 | - </ccTool> | |
410 | 402 | </item> |
411 | 403 | <item path="morfeusz/charset/conversion_tables.cpp" |
412 | 404 | ex="false" |
413 | 405 | tool="1" |
414 | 406 | flavor2="4"> |
415 | - <ccTool flags="1"> | |
416 | - </ccTool> | |
417 | 407 | </item> |
418 | 408 | <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4"> |
419 | 409 | <ccTool flags="1"> |
... | ... | @@ -510,12 +500,8 @@ |
510 | 500 | ex="false" |
511 | 501 | tool="1" |
512 | 502 | flavor2="4"> |
513 | - <ccTool flags="1"> | |
514 | - </ccTool> | |
515 | 503 | </item> |
516 | 504 | <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4"> |
517 | - <ccTool flags="1"> | |
518 | - </ccTool> | |
519 | 505 | </item> |
520 | 506 | <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4"> |
521 | 507 | <ccTool flags="0"> |
... | ... |