Commit a6443fde71219cfa91058232eb1b0f039b1a5661 (1 parent: df3ada33)
poprawki w budowniczym - żeby w końcu działało np. dla "dwuipółletni"
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@129 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 5 changed files with 25 additions and 22 deletions
CMakeLists.txt
| ... | ... | @@ -36,7 +36,7 @@ if ("${INPUT_DICTIONARIES}" STREQUAL "") |
| 36 | 36 | if ("${EMPTY_INPUT_DICTIONARY}" STREQUAL "TRUE") |
| 37 | 37 | set (INPUT_DICTIONARIES ${PROJECT_SOURCE_DIR}/input/empty.txt) |
| 38 | 38 | else () |
| 39 | - set (INPUT_DICTIONARIES "${PROJECT_SOURCE_DIR}/input/sgjp-hom.tab,${PROJECT_SOURCE_DIR}/input/dodatki.tab") | |
| 39 | + set (INPUT_DICTIONARIES "${PROJECT_SOURCE_DIR}/input/PoliMorfSmall.tab,${PROJECT_SOURCE_DIR}/input/dodatki.tab") | |
| 40 | 40 | endif () |
| 41 | 41 | endif () |
| 42 | 42 | |
| ... | ... |
fsabuilder/buildanalyzer.sh
| 1 | 1 | #!/bin/bash |
| 2 | 2 | |
| 3 | -python buildfsa.py --input-files=../input/sgjp-hom.tab,../input/dodatki.tab \ | |
| 4 | - --tagset-file=../input/polimorf.tagset \ | |
| 5 | - --segments-file=../input/segmenty.dat \ | |
| 3 | +python buildfsa.py --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab \ | |
| 4 | + --tagset-file=/tmp/polimorf-sgjp.tagset \ | |
| 5 | + --segments-file=/tmp/segmenty.dat \ | |
| 6 | 6 | --analyzer \ |
| 7 | 7 | --serialization-method=SIMPLE \ |
| 8 | 8 | --trim-supneg \ |
| ... | ... |
fsabuilder/morfeuszbuilder/tagset/segtypes.py
| ... | ... | @@ -7,6 +7,11 @@ import re |
| 7 | 7 | import logging |
| 8 | 8 | from morfeuszbuilder.utils import exceptions |
| 9 | 9 | |
| 10 | +def _cutHomonymFromLemma(lemma): | |
| 11 | + if lemma: | |
| 12 | + lemma = lemma.split(':')[0] if lemma and len(lemma) > 1 else lemma | |
| 13 | + return lemma | |
| 14 | + | |
| 10 | 15 | class Segtypes(object): |
| 11 | 16 | |
| 12 | 17 | def __init__(self, tagset, segrulesConfigFile): |
| ... | ... | @@ -33,6 +38,8 @@ class Segtypes(object): |
| 33 | 38 | |
| 34 | 39 | print self.segnum2Segtype |
| 35 | 40 | |
| 41 | +# self._debugSegnums() | |
| 42 | + | |
| 36 | 43 | def _validate(self, msg, lineNum, cond): |
| 37 | 44 | if not cond: |
| 38 | 45 | raise exceptions.ConfigFileException(self.filename, lineNum, msg) |
| ... | ... | @@ -171,15 +178,16 @@ class Segtypes(object): |
| 171 | 178 | return self.segtype2Segnum[segTypeString] |
| 172 | 179 | |
| 173 | 180 | def lexeme2Segnum(self, lemma, tagnum): |
| 181 | + lemma = _cutHomonymFromLemma(lemma) | |
| 174 | 182 | res = self._lemmaTagnum2Segnum.get((lemma, tagnum), None) |
| 175 | - if not res: | |
| 183 | + if res is None: | |
| 176 | 184 | res = self._tagnum2Segnum.get(tagnum, None) |
| 177 | 185 | return res |
| 178 | 186 | |
| 179 | 187 | class SegtypePattern(object): |
| 180 | 188 | |
| 181 | 189 | def __init__(self, lemma, pattern, segnum): |
| 182 | - self.lemma = lemma.split(':')[0] if lemma and len(lemma) > 1 else lemma | |
| 190 | + self.lemma = _cutHomonymFromLemma(lemma) | |
| 183 | 191 | self.pattern = pattern |
| 184 | 192 | self.segnum = segnum |
| 185 | 193 | |
| ... | ... | @@ -189,8 +197,7 @@ class SegtypePattern(object): |
| 189 | 197 | patterns2Match = [] |
| 190 | 198 | patterns2Match.append(self.pattern.replace('%', '.*')) |
| 191 | 199 | patterns2Match.append(re.sub(r'\:\%$', '', self.pattern).replace('%', '.*')) |
| 192 | - if lemma: | |
| 193 | - lemma = lemma.split(':')[0] if lemma and len(lemma) > 1 else lemma | |
| 200 | + lemma = _cutHomonymFromLemma(lemma) | |
| 194 | 201 | if (self.lemma is None or self.lemma == lemma) \ |
| 195 | 202 | and any([re.match(p, tag) for p in patterns2Match]): |
| 196 | 203 | return self.segnum |
| ... | ... |
input/PoliMorfSmall.tab
| ... | ... | @@ -697,3 +697,13 @@ biało biały adja pospolita |
| 697 | 697 | czerwony czerwony:homo1 adj:sg:acc:m3:pos pospolita |
| 698 | 698 | czerwony czerwony:homo2 adj:sg:acc:m3:pos pospolita |
| 699 | 699 | że +że qub pospolita |
| 700 | +wielo wiele:n1 num:comp | |
| 701 | +pół pół num:comp | |
| 702 | +pół pół num:pl:nom.gen.dat.inst.acc.loc.voc:m1.m2.m3.f.n1.n2.p1.p2:rec | |
| 703 | +i i:i interj | |
| 704 | +i i:j conj | |
| 705 | +i i:q qub | |
| 706 | +dwu dwa num:comp | |
| 707 | +dwu dwa num:pl:gen.dat.loc:m1.m2.m3.n2.f:congr | |
| 708 | +dwu dwa num:pl:inst:m1.m2.m3.n2:congr | |
| 709 | +dwu dwa num:pl:nom.acc.voc:m1:rec | |
| ... | ... |
nbproject/configurations.xml
| ... | ... | @@ -112,8 +112,6 @@ |
| 112 | 112 | <item path="build/default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> |
| 113 | 113 | </item> |
| 114 | 114 | <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4"> |
| 115 | - <ccTool flags="1"> | |
| 116 | - </ccTool> | |
| 117 | 115 | </item> |
| 118 | 116 | <item path="build/morfeusz/morfeuszJAVA_wrap.cxx" |
| 119 | 117 | ex="false" |
| ... | ... | @@ -394,26 +392,18 @@ |
| 394 | 392 | </ccTool> |
| 395 | 393 | </item> |
| 396 | 394 | <item path="morfeusz/charset/CaseConverter.cpp" ex="false" tool="1" flavor2="4"> |
| 397 | - <ccTool flags="1"> | |
| 398 | - </ccTool> | |
| 399 | 395 | </item> |
| 400 | 396 | <item path="morfeusz/charset/CharsetConverter.cpp" |
| 401 | 397 | ex="false" |
| 402 | 398 | tool="1" |
| 403 | 399 | flavor2="4"> |
| 404 | - <ccTool flags="1"> | |
| 405 | - </ccTool> | |
| 406 | 400 | </item> |
| 407 | 401 | <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4"> |
| 408 | - <ccTool flags="1"> | |
| 409 | - </ccTool> | |
| 410 | 402 | </item> |
| 411 | 403 | <item path="morfeusz/charset/conversion_tables.cpp" |
| 412 | 404 | ex="false" |
| 413 | 405 | tool="1" |
| 414 | 406 | flavor2="4"> |
| 415 | - <ccTool flags="1"> | |
| 416 | - </ccTool> | |
| 417 | 407 | </item> |
| 418 | 408 | <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4"> |
| 419 | 409 | <ccTool flags="1"> |
| ... | ... | @@ -510,12 +500,8 @@ |
| 510 | 500 | ex="false" |
| 511 | 501 | tool="1" |
| 512 | 502 | flavor2="4"> |
| 513 | - <ccTool flags="1"> | |
| 514 | - </ccTool> | |
| 515 | 503 | </item> |
| 516 | 504 | <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4"> |
| 517 | - <ccTool flags="1"> | |
| 518 | - </ccTool> | |
| 519 | 505 | </item> |
| 520 | 506 | <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4"> |
| 521 | 507 | <ccTool flags="0"> |
| ... | ... |