Commit 2f308cb1e0849b51145883f2b46db45c41f33574

Authored by Wojciech Jaworski
1 parent 766cb2a4

dodanie zasobów z jednostkami wielosłownymi

Too many changes to show.

To preserve performance only 6 of 12 files are displayed.

morphology2/TODO
... ... @@ -2,3 +2,4 @@ zintegrować z modelem probabilistycznym i dokończyć
2 2 dodać tagger
3 3 usunąć reguły praet z dołączonym aglutynatem!
4 4 i usunąć excluded interps z subsyntax
  5 +dodać morfeusz_suplementy wydobyte z zasobów MWE
... ...
subsyntax/ENIAM_MWE.ml
... ... @@ -21,227 +21,197 @@ open Xstd
21 21 open ENIAMsubsyntaxTypes
22 22 open ENIAMtokenizerTypes
23 23  
24   -let load_dict dict filename =
  24 +type sel = V of string | S of string | G
  25 +
  26 +type t =
  27 + L of string * string * sel list
  28 + | O of string
  29 + | D of string * string
  30 +
  31 +let process_interp lemma interp =
  32 + match Xstring.split ":" interp with
  33 + cat :: interp -> L(lemma,cat,Xlist.map interp (function
  34 + "$c" -> S "c"
  35 + | "$n" -> S "n"
  36 + | "$g" -> S "g"
  37 + | "$d" -> S "d"
  38 + | "$C" -> S "C"
  39 + | "_" -> G
  40 + | s -> if String.get s 0 = '$' then failwith ("process_interp: " ^ s) else V s))
  41 + | _ -> failwith "process_interp"
  42 +
  43 +let load_mwe_dict dict filename =
25 44 File.fold_tab filename dict (fun dict -> function
26   - [orth; lemma; interp] ->
27   - let s = List.hd (Str.split_delim (Str.regexp " ") orth) in
28   - StringMap.add_inc dict s [orth,lemma,interp] (fun l -> (orth,lemma,interp) :: l)
  45 + [orths; lemma; interp] ->
  46 + let orths = Xstring.split " " orths in
  47 + if orths = [] then failwith "load_mwe_dict" else
  48 + let s = List.hd orths in
  49 + let orths = Xlist.map orths (fun s -> O s) in
  50 + let lemma,cat,interp = match process_interp lemma interp with
  51 + L(lemma,cat,interp) -> lemma,cat,interp
  52 + | _ -> failwith "load_mwe_dict2" in
  53 + StringMap.add_inc dict s [orths,lemma,cat,interp] (fun l -> (orths,lemma,cat,interp) :: l)
29 54 | l -> failwith ("load_mwe_dict '" ^ String.concat "\t" l ^ "'"))
30 55  
31   -let mwe_dict =
32   - let dict = load_dict StringMap.empty brev_filename in
33   - let dict = try load_dict dict fixed_filename with _ -> (prerr_endline ("ENIAMsubsyntax file " ^ fixed_filename ^ " not found"); dict) in
34   -(* let dict = load_dict dict complete_entries_filename in*)
35   - let dict = load_dict dict mwe_filename in
36   - dict
37   -
38   -let preselect_dict orths dict =
39   - StringSet.fold orths [] (fun rules orth ->
40   - try
41   - let l = StringMap.find dict orth in
42   - Xlist.fold l rules (fun rules (orth,lemma,interp) ->
43   - (* print_endline ("preselect_dict: " ^ orth); *)
44   - let match_list = Str.split (Str.regexp " ") orth in
45   - let b = Xlist.fold match_list true (fun b s ->
46   - (* if not (StringSet.mem orths s) then print_endline s; *)
47   - StringSet.mem orths s && b) in
48   - if b then (match_list,lemma,interp) :: rules else rules)
49   - with Not_found -> rules)
50   -
51   -
52   -(*
53   -type matching = {
54   - prefix: tokens list;
55   - matched: token_record list;
56   - suffix: tokens list;
57   - pattern: pat list;
58   - command: token_record list -> token;
59   - last: int
60   - }
61   -
62   -let rec find_abr_pattern_tail matchings found = function
63   - [] -> found
64   - | token :: l ->
65   - let matchings,found = Xlist.fold matchings ([],found) (fun (matchings,found) matching ->
66   - match matching.pattern with
67   - [pat] ->
68   - let matchings = if token.beg <= matching.last then matching :: matchings else matchings in
69   - if PrePatterns.match_token (pat,token.token) && token.beg = matching.last then
70   - matchings, {matching with matched = token :: matching.matched; last=token.next; pattern=[]} :: found else
71   - matchings, found
72   - | pat :: pattern ->
73   - let matchings = if token.beg <= matching.last then matching :: matchings else matchings in
74   - if PrePatterns.match_token (pat,token.token) && token.beg = matching.last then
75   - {matching with matched = token :: matching.matched; last=token.next; pattern=pattern} :: matchings, found else
76   - matchings, found
77   - | [] -> matchings, matching :: found) in
78   - if matchings = [] then found else find_abr_pattern_tail matchings found l
79   -
80   -let rec find_abr_pattern all_matchings found = function
81   - token :: l ->
82   - let matchings = Xlist.fold all_matchings [] (fun matchings matching ->
83   - match matching.pattern with
84   - pat :: pattern ->
85   - (if PrePatterns.match_token (pat,token.token) then
86   - [{matching with matched = token :: matching.matched; last=token.next; pattern=pattern}] else []) @ matchings
87   - | _ -> failwith "find_abr_pattern: ni") in
88   - let found = if matchings = [] then found else find_abr_pattern_tail matchings found l in
89   - find_abr_pattern all_matchings found l
90   - | [] -> found
91   -
92   -let rec make_abr_orth = function
93   - [] -> ""
94   - | [t] -> t.orth
95   - | t :: l -> if t.beg + t.len = t.next then t.orth ^ (make_abr_orth l) else t.orth ^ " " ^ (make_abr_orth l)
96   -
97   -let find_abr_patterns patterns tokens =
98   - let found = find_abr_pattern (Xlist.map patterns (fun pattern ->
99   - {prefix=[]; matched=[]; suffix=[]; pattern=pattern; command=(fun _ -> Symbol ""); last=0})) [] tokens in
100   - Xlist.rev_map found (fun matching ->
101   - let t1 = List.hd (List.rev matching.matched) in
102   - let t2 = List.hd matching.matched in
103   - t1.beg,
104   - t2.beg + t2.len - t1.beg,
105   - t2.next,
106   - make_abr_orth (List.rev matching.matched))
107   -
108   -let split_interp line gloss interp =
109   - if interp = "xxx" then [gloss, "xxx"] else
110   - Xlist.map (Str.split (Str.regexp " ") interp) (fun s ->
111   - match Str.split (Str.regexp "|") s with
112   - [lemma;interp] -> lemma, interp
113   - | _ -> failwith ("bad brev entry: " ^ line))
114   -
115   -let load_brev_dict () =
116   - let lines = File.load_lines "data/brev_20151215.tab" in
117   - List.rev (Xlist.rev_map lines (fun line ->
118   - match Str.split_delim (Str.regexp "\t") line with
119   - [_; orth; gloss; interp; _] -> Str.split (Str.regexp " ") orth, split_interp line gloss interp
120   - | [_; orth; gloss; interp] -> Str.split (Str.regexp " ") orth, split_interp line gloss interp
121   - | _ -> failwith ("load_brev_dict: " ^ line)))
122   -
123   -let parse_lemma lemma =
124   - if lemma = ":" then lemma,"" else
125   - match Str.split (Str.regexp ":") lemma with
126   - [x] -> x,""
127   - | [x;y] -> x,y
128   - | _ -> failwith ("parse_lemma: " ^ lemma)
129   -
130   -let make_orths orth beg len lexeme_postags_list =
131   - let n = Xlist.size lexeme_postags_list in
132   - let orth_list =
133   - if n = 1 then [orth,beg,len] else
134   - List.rev (Int.fold 1 n [] (fun l i ->
135   - (orth ^ "_" ^ string_of_int i,
136   - (if i=1 then beg else beg+len-n+i-1),
137   - if i=1 then len-n+1 else 1) :: l)) in
138   - List.rev (Xlist.fold (List.combine orth_list lexeme_postags_list) [] (fun orth_list ((orth,beg,len),(lemma,postags)) ->
139   - (orth, fst (parse_lemma lemma), ENIAMtokens.parse_postags postags, beg, len) :: orth_list))
140   -
141   -let brev_dict = load_brev_dict ()
142   -
143   -(* FIXME: trzeba zmienić reprezentację skrótów nazw własnych: przenieść do mwe,
144   - Gdy skrót jest częścią nazwy własnej powinien być dalej przetwarzalny *)
145   -let process_brev paths (*tokens*) = paths
146   -(* let paths = Xlist.fold brev_dict paths (fun paths (pattern,lexeme_postags_list) ->
147   - let matchings_found = find_abr_patterns [Xlist.map pattern (fun pat -> O pat)] tokens in
148   - Xlist.fold matchings_found paths (fun paths (beg,len,next,orth) ->
149   - let orths = make_orths orth beg len lexeme_postags_list in
150   - ENIAMpaths.add_path paths beg next orths)) in
151   - paths*)
152   -
153   -let rec preselect_mwe_dict_token set = function
154   - SmallLetter orth -> StringSet.add set orth
155   - | CapLetter(orth,lc) -> StringSet.add set orth
156   - | AllSmall orth -> StringSet.add set orth
157   - | AllCap(orth,lc,lc2) -> StringSet.add set orth
158   - | FirstCap(orth,lc,_,_) -> StringSet.add set orth
159   - | SomeCap orth -> StringSet.add set orth
160   - | Symbol orth -> StringSet.add set orth
161   - | Dig(v,"dig") -> StringSet.add set v
162   - | Other2 orth -> StringSet.add set orth
163   - | _ -> set
164   -
165   -let rec preselect_mwe_dict_tokens set = function
166   - Token t -> preselect_mwe_dict_token set t.token
167   - | Seq l -> Xlist.fold l set preselect_mwe_dict_tokens
168   - | Variant l -> Xlist.fold l set preselect_mwe_dict_tokens
169   -
170   -let preselect_mwe_dict mwe_dict tokens =
171   - let set = Xlist.fold tokens StringSet.empty preselect_mwe_dict_tokens in
172   - let set = StringSet.fold set StringSet.empty (fun set orth ->
173   - try
174   - let l = StringMap.find mwe_dict orth in
175   - Xlist.fold l set StringSet.add
176   - with Not_found -> set) in
177   -(* StringSet.iter set print_endline; *)
178   - StringSet.fold set [] (fun l s ->
179   - match Str.split_delim (Str.regexp "\t") s with
180   - [lemma; interp; sense] ->
181   - (match Str.split_delim (Str.regexp ":") interp with
182   - orths :: tags -> (Str.split (Str.regexp " ") orths, lemma, String.concat ":" tags, sense) :: l
183   - | _ -> failwith "preselect_mwe_dict")
184   - | _ -> failwith "preselect_mwe_dict")
185   -
186   -let simplify_lemma lemma =
187   - match Str.split (Str.regexp "-") lemma with
188   - [x;"1"] -> x
189   - | [x;"2"] -> x
190   - | [x;"3"] -> x
191   - | [x;"4"] -> x
192   - | [x;"5"] -> x
193   - | _ -> lemma
194   -
195   -let mwe_dict = load_mwe_dict ()
196   -
197   -let process_mwe paths (*tokens*) = paths
198   -(* let mwe_dict = preselect_mwe_dict mwe_dict tokens in
199   - let paths = Xlist.fold mwe_dict paths (fun paths (pattern,lexeme,interp,sense) ->
200   - let matchings_found = find_abr_patterns [Xlist.map pattern (fun pat -> O pat)] tokens in
201   - Xlist.fold matchings_found paths (fun paths (beg,len,next,orth) ->
202   - let orths = make_orths orth beg len [simplify_lemma lexeme,interp] in
203   - ENIAMpaths.add_path paths beg next orths)) in
204   - paths*)
205   -*)
  56 +let process_orth = function
  57 + [Lexer.T lemma; Lexer.B("(",")",[Lexer.T interp])] -> process_interp lemma interp
  58 + | [Lexer.T orth] -> O orth
  59 + | [Lexer.B("{","}",l); Lexer.B("(",")",[Lexer.T interp])] -> process_interp (Lexer.string_of_token_list l) interp
  60 + | [Lexer.B("{","}",l)] -> O(Lexer.string_of_token_list l)
  61 + | tokens -> failwith ("process_orth: " ^ Lexer.string_of_token_list tokens)
  62 +
  63 +let load_mwe_dict2 (dict,dict2) filename =
  64 + File.fold_tab filename (dict,dict2) (fun (dict,dict2) -> function
  65 + [orths; lemma] ->
  66 + (* print_endline (orths ^ "\t" ^ lemma); *)
  67 + let tokens = Lexer.split "(\\|)\\|{\\|}\\| " orths in
  68 + (* print_endline ("load_dict2 1: " ^ Lexer.string_of_token_list tokens); *)
  69 + let tokens = Lexer.find_brackets ["{","}";"(",")"] [] tokens in
  70 + (* print_endline ("load_dict2 2: " ^ Lexer.string_of_token_list tokens); *)
  71 + let orths = List.rev (Xlist.rev_map (Lexer.split_symbol (Lexer.T " ") [] tokens) process_orth) in
  72 + let tokens = Lexer.split "(\\|)\\|{\\|}" lemma in
  73 + (* print_endline ("load_dict2 3: " ^ Lexer.string_of_token_list tokens); *)
  74 + let tokens = Lexer.find_brackets ["{","}";"(",")"] [] tokens in
  75 + (* print_endline ("load_dict2 4: " ^ Lexer.string_of_token_list tokens); *)
  76 + let lemma,cat,interp = match process_orth tokens with
  77 + L(lemma,cat,interp) -> lemma,cat,interp
  78 + | _ -> failwith "load_mwe_dict2" in
  79 + if orths = [] then failwith "load_mwe_dict2" else
  80 + (match List.hd orths with
  81 + L(s,_,_) -> dict, StringMap.add_inc dict2 s [orths,lemma,cat,interp] (fun l -> (orths,lemma,cat,interp) :: l)
  82 + | O s -> StringMap.add_inc dict s [orths,lemma,cat,interp] (fun l -> (orths,lemma,cat,interp) :: l), dict2
  83 + | D _ -> failwith "load_mwe_dict2")
  84 + | l -> failwith ("load_mwe_dict2 '" ^ String.concat "\t" l ^ "'"))
  85 +
  86 +let mwe_dict,mwe_dict2 =
  87 + let dict = load_mwe_dict StringMap.empty brev_filename in
  88 + let dict = try load_mwe_dict dict fixed_filename with _ -> (prerr_endline ("ENIAMsubsyntax file " ^ fixed_filename ^ " not found"); dict) in
  89 + let dict = load_mwe_dict dict mwe_filename in
  90 + let dict,dict2 = load_mwe_dict2 (dict,StringMap.empty) sejf_filename in
  91 + let dict,dict2 = load_mwe_dict2 (dict,dict2) sejfek_filename in
  92 + let dict,dict2 = load_mwe_dict2 (dict,dict2) sawa_filename in
  93 + dict,dict2
206 94  
207 95 let get_orths paths =
208 96 IntMap.fold paths StringSet.empty (fun orths _ map ->
209 97 IntMap.fold map orths (fun orths _ l ->
210   - Xlist.fold l orths (fun orths t ->
  98 + TokenEnvSet.fold l orths (fun orths t ->
211 99 StringSet.add orths (ENIAMtokens.get_orth t.token))))
212 100  
  101 +let get_lemmas paths =
  102 + IntMap.fold paths StringSet.empty (fun orths _ map ->
  103 + IntMap.fold map orths (fun orths _ l ->
  104 + TokenEnvSet.fold l orths (fun orths t ->
  105 + StringSet.add orths (ENIAMtokens.get_lemma t.token))))
  106 +
213 107 let get_intnum_orths paths =
214 108 IntMap.fold paths StringMap.empty (fun orths _ map ->
215 109 IntMap.fold map orths (fun orths _ l ->
216   - Xlist.fold l orths (fun orths t ->
  110 + TokenEnvSet.fold l orths (fun orths t ->
217 111 match t.token with
218 112 Dig(lemma,"intnum") -> StringMap.add_inc orths (ENIAMtokens.get_orth t.token) (StringSet.singleton lemma) (fun set -> StringSet.add set lemma)
219 113 | _ -> orths)))
220 114  
221   -let rec match_path_rec map found (t:token_env) rev = function
222   - [] -> (t :: rev) :: found
  115 +let preselect orths lemmas rules l =
  116 + Xlist.fold l rules (fun rules (match_list,lemma,cat,interp) ->
  117 + let b = Xlist.fold match_list true (fun b -> function
  118 + O s -> StringSet.mem orths s && b
  119 + | L(s,_,_) -> StringSet.mem lemmas s && b
  120 + | D(s,_) -> failwith "preselect") in
  121 + if b then (match_list,lemma,cat,interp) :: rules else rules)
  122 +
  123 +let preselect_dict orths lemmas dict rules =
  124 + StringSet.fold orths rules (fun rules orth ->
  125 + try
  126 + preselect orths lemmas rules (StringMap.find dict orth)
  127 + with Not_found -> rules)
  128 +
  129 +let preselect_dict2 orths lemmas dict2 rules =
  130 + StringSet.fold lemmas rules (fun rules lemma ->
  131 + try
  132 + preselect orths lemmas rules (StringMap.find dict2 lemma)
  133 + with Not_found -> rules)
  134 +
  135 +let add_ordnum_rules orths rules =
  136 + StringMap.fold orths rules (fun rules orth lemmas ->
  137 + StringSet.fold lemmas rules (fun rules lemma ->
  138 + (* Printf.printf "%s %s\n%!" orth lemma; *)
  139 + ([D(orth,"intnum");O "."],lemma,"ordnum",[]) :: rules))
  140 +
  141 +let select_rules paths mwe_dict mwe_dict2 =
  142 + let orths = get_orths paths in
  143 + let lemmas = get_lemmas paths in
  144 + let intnum_orths = get_intnum_orths paths in
  145 + let rules = preselect_dict orths lemmas mwe_dict [] in
  146 + let rules = preselect_dict2 orths lemmas mwe_dict2 rules in
  147 + let rules = add_ordnum_rules intnum_orths rules in
  148 + rules
  149 +
  150 +let rec check_interp sels = function
  151 + [],[] -> true
  152 + | s :: interp, ["_"] :: interp2 -> check_interp sels (interp,interp2)
  153 + | V s :: interp, l2 :: interp2 -> if Xlist.mem l2 s then check_interp sels (interp,interp2) else false
  154 + | S s :: interp, l2 :: interp2 ->
  155 + (try
  156 + let l = Xlist.assoc sels s in
  157 + let b = Xlist.fold l false (fun b s -> Xlist.mem l2 s || b) in
  158 + if b then check_interp sels (interp,interp2) else false
  159 + with Not_found -> check_interp sels (interp,interp2))
  160 + | G :: interp, l2 :: interp2 -> check_interp sels (interp,interp2)
  161 + | _ -> failwith "check_interp"
  162 +
  163 +let rec get_sels sels = function
  164 + [],[] -> sels
  165 + | s :: interp, ["_"] :: interp2 -> get_sels sels (interp,interp2)
  166 + | V s :: interp, l2 :: interp2 -> get_sels sels (interp,interp2)
  167 + | S s :: interp, l2 :: interp2 ->
  168 + (try
  169 + let l = Xlist.assoc sels s in
  170 + let sels = List.remove_assoc s sels in
  171 + let l = Xlist.fold l [] (fun l s -> if Xlist.mem l2 s then s :: l else l) in
  172 + get_sels ((s,l) :: sels) (interp,interp2)
  173 + with Not_found -> get_sels ((s,l2) :: sels) (interp,interp2))
  174 + | G :: interp, l2 :: interp2 -> get_sels sels (interp,interp2)
  175 + | _ -> failwith "get_sels"
  176 +
  177 +let rec match_path_rec map found (t:token_env) sels rev = function
  178 + [] -> (t :: rev, sels) :: found
223 179 | s :: l ->
224 180 let map2 = try IntMap.find map t.next with Not_found -> IntMap.empty in
225 181 let found2 = IntMap.fold map2 [] (fun found2 _ l ->
226   - Xlist.fold l found2 (fun found2 new_t ->
227   - if ENIAMtokens.get_orth new_t.token = s then new_t :: found2 else found2)) in
228   - Xlist.fold found2 found (fun found new_t -> match_path_rec map found new_t (t :: rev) l)
  182 + TokenEnvSet.fold l found2 (fun found2 new_t ->
  183 + match s,new_t.token with
  184 + O s, token -> if ENIAMtokens.get_orth token = s then (new_t,sels) :: found2 else found2
  185 + | L(s,cat,interp), Lemma(s2,cat2,interps2) ->
  186 + Xlist.fold interps2 found2 (fun found2 interp2 ->
  187 + if s=s2 && cat=cat2 && check_interp sels (interp,interp2) then
  188 + (new_t,get_sels sels (interp,interp2)) :: found2 else found2)
  189 + | D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (new_t,sels) :: found2 else found2
  190 + | _ -> found2)) in
  191 + Xlist.fold found2 found (fun found (new_t,sels) -> match_path_rec map found new_t sels (t :: rev) l)
229 192  
230 193 let match_path map = function
231 194 [] -> failwith "match_path"
232 195 | s :: l ->
233 196 let found = IntMap.fold map [] (fun found i map2 ->
234 197 IntMap.fold map2 found (fun found j l ->
235   - Xlist.fold l found (fun found t ->
236   - if ENIAMtokens.get_orth t.token = s then t :: found else found))) in
237   - Xlist.fold found [] (fun found t -> match_path_rec map found t [] l)
  198 + TokenEnvSet.fold l found (fun found t ->
  199 + match s,t.token with
  200 + O s, token -> if ENIAMtokens.get_orth token = s then (t,[]) :: found else found
  201 + | L(s,cat,interp), Lemma(s2,cat2,interps2) ->
  202 + Xlist.fold interps2 found (fun found interp2 ->
  203 + if s=s2 && cat=cat2 && check_interp [] (interp,interp2) then
  204 + (t,get_sels [] (interp,interp2)) :: found else found)
  205 + | D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (t,[]) :: found else found
  206 + | _ -> found))) in
  207 + Xlist.fold found [] (fun found (t,sels) -> match_path_rec map found t sels [] l)
238 208  
239 209 let concat_orths l =
240 210 let s = String.concat "" (Xlist.map l (fun t -> t.orth ^ (if t.beg+t.len=t.next then "" else " "))) in
241 211 let n = Xstring.size s in
242 212 if String.get s (n-1) = ' ' then String.sub s 0 (n-1) else s
243 213  
244   -let create_token (matching:token_env list) lemma interp = (* FIXME: problem z nazwami własnymi *)
  214 +let create_token (matching:token_env list) sels lemma cat interp = (* FIXME: problem z nazwami własnymi *)
245 215 let l = List.rev matching in
246 216 let beg = (List.hd l).beg in
247 217 let t = List.hd matching in
... ... @@ -251,40 +221,39 @@ let create_token (matching:token_env list) lemma interp = (* FIXME: problem z na
251 221 beg=beg;
252 222 len=len;
253 223 next=t.next;
254   - token=ENIAMtokens.make_lemma (lemma,interp);
  224 + token=Lemma(lemma,cat,[Xlist.map interp (function
  225 + S s -> (try Xlist.assoc sels s with Not_found -> ["_"])
  226 + | V s -> [s]
  227 + | G -> ["_"])]);
255 228 weight=0.; (* FIXME: dodać wagi do konkretnych reguł i uwzględnić wagi maczowanych tokenów *)
256 229 attrs=ENIAMtokens.merge_attrs l}
257 230  
258 231 let add_token paths t =
259 232 let map = try IntMap.find paths t.beg with Not_found -> IntMap.empty in
260   - let map = IntMap.add_inc map t.next [t] (fun l -> t :: l) in
  233 + let map = IntMap.add_inc map t.next (TokenEnvSet.singleton t) (fun set -> TokenEnvSet.add set t) in
261 234 IntMap.add paths t.beg map
262 235  
263   -let apply_rule paths (match_list,lemma,interp) =
  236 +let apply_rule paths (match_list,lemma,cat,interp) =
264 237 (* print_endline ("apply_rule: " ^ lemma); *)
265 238 let matchings_found = match_path paths match_list in
266   - Xlist.fold matchings_found paths (fun paths matching ->
  239 + Xlist.fold matchings_found paths (fun paths (matching,sels) ->
267 240 try
268   - let token = create_token matching lemma interp in
  241 + let token = create_token matching sels lemma cat interp in
269 242 add_token paths token
270 243 with Not_found -> paths)
271 244  
272   -(* FIXME: reguły dla ordnum powinny maczować część mowy a nie tylko orth *)
273   -let add_ordnum_rules rules paths =
274   - let orths = get_intnum_orths paths in
275   - StringMap.fold orths rules (fun rules orth lemmas ->
276   - StringSet.fold lemmas rules (fun rules lemma ->
277   - (* Printf.printf "%s %s\n%!" orth lemma; *)
278   - ([orth;"."],lemma,"ordnum") :: rules))
279   -
280 245 let process (paths,last) =
281 246 let paths = Xlist.fold paths IntMap.empty add_token in
282   - let orths = get_orths paths in
283   - let rules = preselect_dict orths mwe_dict in
284   - let rules = add_ordnum_rules rules paths in
  247 + let rules = select_rules paths mwe_dict mwe_dict2 in
  248 + let paths = Xlist.fold rules paths apply_rule in
  249 + let rules = select_rules paths mwe_dict mwe_dict2 in
  250 + let paths = Xlist.fold rules paths apply_rule in
  251 + let rules = select_rules paths mwe_dict mwe_dict2 in
  252 + let paths = Xlist.fold rules paths apply_rule in
  253 + let rules = select_rules paths mwe_dict mwe_dict2 in
285 254 let paths = Xlist.fold rules paths apply_rule in
286 255 let paths = IntMap.fold paths [] (fun paths _ map ->
287 256 IntMap.fold map paths (fun paths _ l ->
288   - Xlist.fold l paths (fun paths t ->
  257 + TokenEnvSet.fold l paths (fun paths t ->
289 258 t :: paths))) in
290 259 ENIAMpaths.sort (paths,last)
... ...
subsyntax/ENIAMsubsyntaxTypes.ml
... ... @@ -48,6 +48,9 @@ let brev_filename = resource_path ^ "/subsyntax/brev.tab"
48 48 let fixed_filename = resource_path ^ "/Walenty/fixed.tab"
49 49 let complete_entries_filename = resource_path ^ "/subsyntax/complete_entries.tab"
50 50 let mwe_filename = resource_path ^ "/subsyntax/mwe.tab"
  51 +let sawa_filename = resource_path ^ "/subsyntax/SAWA.dic"
  52 +let sejf_filename = resource_path ^ "/subsyntax/SEJF.dic"
  53 +let sejfek_filename = resource_path ^ "/subsyntax/SEJFEK.dic"
51 54  
52 55 let lemma_frequencies_filename = resource_path ^ "/subsyntax/NKJP1M-lemma-freq.tab"
53 56  
... ...
subsyntax/TODO
1   -- dokończyć rozpoznawanie MWE
2   -- dodać zasoby MWE
3 1 - rozpoznawanie MWE ze Słowosieci
  2 +- kompresowanie tokenów mających identyczne lematy (albo po przetworzeniu, albo kompresowanie interpretacji przed rozpoznaniem mwe)
4 3  
5 4 - jak przetwarzać num:comp
6 5 - przetwarzanie liczebników złożonych np dwadzieścia jeden, jedna druga
... ...
subsyntax/makefile
... ... @@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt
3 3 OCAMLDEP=ocamldep
4 4 INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam
5 5 OCAMLFLAGS=$(INCLUDES) -g
6   -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa
  6 +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa #eniam-subsyntax.cmxa
7 7 INSTALLDIR=`ocamlc -where`/eniam
8 8  
9 9 SOURCES= ENIAMsubsyntaxTypes.ml ENIAMsubsyntaxStringOf.ml ENIAMsubsyntaxHTMLof.ml ENIAMsubsyntaxXMLof.ml ENIAMsubsyntaxGraphOf.ml ENIAMpaths.ml ENIAM_MWE.ml ENIAMsentences.ml ENIAMsubsyntax.ml
... ... @@ -32,8 +32,8 @@ eniam-subsyntax.cma: $(SOURCES)
32 32 eniam-subsyntax.cmxa: $(SOURCES)
33 33 ocamlopt -linkall -a -o eniam-subsyntax.cmxa $(INCLUDES) $^
34 34  
35   -test: test.ml
36   - $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) test.ml
  35 +test: $(SOURCES) test.ml
  36 + $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $(SOURCES) test.ml
37 37  
38 38 interface: interface.ml
39 39 $(OCAMLOPT) -o subsyntax $(OCAMLOPTFLAGS) interface.ml
... ...
subsyntax/resources/README
1   -File NKJP1M-lemma-freq.tab in this folder was created on the basis of
  1 +File NKJP1M-lemma-freq.tab in this folder was created on the basis of
2 2  
3 3 NKJP1M: the manually annotated 1-million word subcorpus sampled
4 4 from texts of a subset of the National Corpus of Polish.
5 5 version 1.2
6 6  
7   -File proper_names_sgjp_polimorf_20151020.tab in this folder were created on the basis of
  7 +File proper_names_sgjp_polimorf_20151020.tab in this folder was created on the basis of
8 8  
9 9 SGJP: Grammatical Dictionary of Polish, version 20151020
10 10 Copyright © 2007–2015 Zygmunt Saloni, Włodzimierz Gruszczyński, Marcin
11 11 Woliński, Robert Wołosz, Danuta Skowrońska
12 12  
13   -and also on the basis of
  13 +and also on the basis of
14 14 PoliMorf, version 20151020
  15 +
  16 +File SEJF.dic was created on the basis of
  17 +Grammatical Lexicon of Polish Phraseology
  18 +Copyright © Monika Czerepowicka, Agata Savary
  19 +Copyright © Institute of Computer Science Polish Academy of Sciences
  20 +The data are available under the CC BY-SA license.
  21 +
  22 +File SEJFEK.dic was created on the basis of
  23 +Grammatical Lexicon of Polish Economic Phraseology
  24 +Copyright © Filip Makowiecki, Agata Savary
  25 +Copyright © Institute of Computer Science Polish Academy of Sciences
  26 +The data are available under the CC BY-SA license.
  27 +
  28 +File SAWA.dic was created on the basis of
  29 +Grammatical Lexicon of Warsaw Urban Proper Names
  30 +Copyright © Małgorzata Marciniak, Celina Heliasz, Joanna Rabiega-Wiśniewska, Piotr Sikora, Marcin Woliński, Agata Savary
  31 +Copyright © Institute of Computer Science Polish Academy of Sciences
  32 +The data are available under the CC BY-SA license.
... ...