Commit 2f308cb1e0849b51145883f2b46db45c41f33574
1 parent 766cb2a4
adding resources with multi-word units
Showing 12 changed files with 27932 additions and 223 deletions.
Too many changes to show: to preserve performance, only 6 of 12 files are displayed.
morphology2/TODO
subsyntax/ENIAM_MWE.ml
... | ... | @@ -21,227 +21,197 @@ open Xstd |
21 | 21 | open ENIAMsubsyntaxTypes |
22 | 22 | open ENIAMtokenizerTypes |
23 | 23 | |
24 | -let load_dict dict filename = | |
24 | +type sel = V of string | S of string | G | |
25 | + | |
26 | +type t = | |
27 | + L of string * string * sel list | |
28 | + | O of string | |
29 | + | D of string * string | |
30 | + | |
31 | +let process_interp lemma interp = | |
32 | + match Xstring.split ":" interp with | |
33 | + cat :: interp -> L(lemma,cat,Xlist.map interp (function | |
34 | + "$c" -> S "c" | |
35 | + | "$n" -> S "n" | |
36 | + | "$g" -> S "g" | |
37 | + | "$d" -> S "d" | |
38 | + | "$C" -> S "C" | |
39 | + | "_" -> G | |
40 | + | s -> if String.get s 0 = '$' then failwith ("process_interp: " ^ s) else V s)) | |
41 | + | _ -> failwith "process_interp" | |
42 | + | |
43 | +let load_mwe_dict dict filename = | |
25 | 44 | File.fold_tab filename dict (fun dict -> function |
26 | - [orth; lemma; interp] -> | |
27 | - let s = List.hd (Str.split_delim (Str.regexp " ") orth) in | |
28 | - StringMap.add_inc dict s [orth,lemma,interp] (fun l -> (orth,lemma,interp) :: l) | |
45 | + [orths; lemma; interp] -> | |
46 | + let orths = Xstring.split " " orths in | |
47 | + if orths = [] then failwith "load_mwe_dict" else | |
48 | + let s = List.hd orths in | |
49 | + let orths = Xlist.map orths (fun s -> O s) in | |
50 | + let lemma,cat,interp = match process_interp lemma interp with | |
51 | + L(lemma,cat,interp) -> lemma,cat,interp | |
52 | + | _ -> failwith "load_mwe_dict2" in | |
53 | + StringMap.add_inc dict s [orths,lemma,cat,interp] (fun l -> (orths,lemma,cat,interp) :: l) | |
29 | 54 | | l -> failwith ("load_mwe_dict '" ^ String.concat "\t" l ^ "'")) |
30 | 55 | |
31 | -let mwe_dict = | |
32 | - let dict = load_dict StringMap.empty brev_filename in | |
33 | - let dict = try load_dict dict fixed_filename with _ -> (prerr_endline ("ENIAMsubsyntax file " ^ fixed_filename ^ " not found"); dict) in | |
34 | -(* let dict = load_dict dict complete_entries_filename in*) | |
35 | - let dict = load_dict dict mwe_filename in | |
36 | - dict | |
37 | - | |
38 | -let preselect_dict orths dict = | |
39 | - StringSet.fold orths [] (fun rules orth -> | |
40 | - try | |
41 | - let l = StringMap.find dict orth in | |
42 | - Xlist.fold l rules (fun rules (orth,lemma,interp) -> | |
43 | - (* print_endline ("preselect_dict: " ^ orth); *) | |
44 | - let match_list = Str.split (Str.regexp " ") orth in | |
45 | - let b = Xlist.fold match_list true (fun b s -> | |
46 | - (* if not (StringSet.mem orths s) then print_endline s; *) | |
47 | - StringSet.mem orths s && b) in | |
48 | - if b then (match_list,lemma,interp) :: rules else rules) | |
49 | - with Not_found -> rules) | |
50 | - | |
51 | - | |
52 | -(* | |
53 | -type matching = { | |
54 | - prefix: tokens list; | |
55 | - matched: token_record list; | |
56 | - suffix: tokens list; | |
57 | - pattern: pat list; | |
58 | - command: token_record list -> token; | |
59 | - last: int | |
60 | - } | |
61 | - | |
62 | -let rec find_abr_pattern_tail matchings found = function | |
63 | - [] -> found | |
64 | - | token :: l -> | |
65 | - let matchings,found = Xlist.fold matchings ([],found) (fun (matchings,found) matching -> | |
66 | - match matching.pattern with | |
67 | - [pat] -> | |
68 | - let matchings = if token.beg <= matching.last then matching :: matchings else matchings in | |
69 | - if PrePatterns.match_token (pat,token.token) && token.beg = matching.last then | |
70 | - matchings, {matching with matched = token :: matching.matched; last=token.next; pattern=[]} :: found else | |
71 | - matchings, found | |
72 | - | pat :: pattern -> | |
73 | - let matchings = if token.beg <= matching.last then matching :: matchings else matchings in | |
74 | - if PrePatterns.match_token (pat,token.token) && token.beg = matching.last then | |
75 | - {matching with matched = token :: matching.matched; last=token.next; pattern=pattern} :: matchings, found else | |
76 | - matchings, found | |
77 | - | [] -> matchings, matching :: found) in | |
78 | - if matchings = [] then found else find_abr_pattern_tail matchings found l | |
79 | - | |
80 | -let rec find_abr_pattern all_matchings found = function | |
81 | - token :: l -> | |
82 | - let matchings = Xlist.fold all_matchings [] (fun matchings matching -> | |
83 | - match matching.pattern with | |
84 | - pat :: pattern -> | |
85 | - (if PrePatterns.match_token (pat,token.token) then | |
86 | - [{matching with matched = token :: matching.matched; last=token.next; pattern=pattern}] else []) @ matchings | |
87 | - | _ -> failwith "find_abr_pattern: ni") in | |
88 | - let found = if matchings = [] then found else find_abr_pattern_tail matchings found l in | |
89 | - find_abr_pattern all_matchings found l | |
90 | - | [] -> found | |
91 | - | |
92 | -let rec make_abr_orth = function | |
93 | - [] -> "" | |
94 | - | [t] -> t.orth | |
95 | - | t :: l -> if t.beg + t.len = t.next then t.orth ^ (make_abr_orth l) else t.orth ^ " " ^ (make_abr_orth l) | |
96 | - | |
97 | -let find_abr_patterns patterns tokens = | |
98 | - let found = find_abr_pattern (Xlist.map patterns (fun pattern -> | |
99 | - {prefix=[]; matched=[]; suffix=[]; pattern=pattern; command=(fun _ -> Symbol ""); last=0})) [] tokens in | |
100 | - Xlist.rev_map found (fun matching -> | |
101 | - let t1 = List.hd (List.rev matching.matched) in | |
102 | - let t2 = List.hd matching.matched in | |
103 | - t1.beg, | |
104 | - t2.beg + t2.len - t1.beg, | |
105 | - t2.next, | |
106 | - make_abr_orth (List.rev matching.matched)) | |
107 | - | |
108 | -let split_interp line gloss interp = | |
109 | - if interp = "xxx" then [gloss, "xxx"] else | |
110 | - Xlist.map (Str.split (Str.regexp " ") interp) (fun s -> | |
111 | - match Str.split (Str.regexp "|") s with | |
112 | - [lemma;interp] -> lemma, interp | |
113 | - | _ -> failwith ("bad brev entry: " ^ line)) | |
114 | - | |
115 | -let load_brev_dict () = | |
116 | - let lines = File.load_lines "data/brev_20151215.tab" in | |
117 | - List.rev (Xlist.rev_map lines (fun line -> | |
118 | - match Str.split_delim (Str.regexp "\t") line with | |
119 | - [_; orth; gloss; interp; _] -> Str.split (Str.regexp " ") orth, split_interp line gloss interp | |
120 | - | [_; orth; gloss; interp] -> Str.split (Str.regexp " ") orth, split_interp line gloss interp | |
121 | - | _ -> failwith ("load_brev_dict: " ^ line))) | |
122 | - | |
123 | -let parse_lemma lemma = | |
124 | - if lemma = ":" then lemma,"" else | |
125 | - match Str.split (Str.regexp ":") lemma with | |
126 | - [x] -> x,"" | |
127 | - | [x;y] -> x,y | |
128 | - | _ -> failwith ("parse_lemma: " ^ lemma) | |
129 | - | |
130 | -let make_orths orth beg len lexeme_postags_list = | |
131 | - let n = Xlist.size lexeme_postags_list in | |
132 | - let orth_list = | |
133 | - if n = 1 then [orth,beg,len] else | |
134 | - List.rev (Int.fold 1 n [] (fun l i -> | |
135 | - (orth ^ "_" ^ string_of_int i, | |
136 | - (if i=1 then beg else beg+len-n+i-1), | |
137 | - if i=1 then len-n+1 else 1) :: l)) in | |
138 | - List.rev (Xlist.fold (List.combine orth_list lexeme_postags_list) [] (fun orth_list ((orth,beg,len),(lemma,postags)) -> | |
139 | - (orth, fst (parse_lemma lemma), ENIAMtokens.parse_postags postags, beg, len) :: orth_list)) | |
140 | - | |
141 | -let brev_dict = load_brev_dict () | |
142 | - | |
143 | -(* FIXME: trzeba zmienić reprezentację skrótów nazw własnych: przenieść do mwe, | |
144 | - Gdy skrót jest częścią nazwy własnej powinien być dalej przetwarzalny *) | |
145 | -let process_brev paths (*tokens*) = paths | |
146 | -(* let paths = Xlist.fold brev_dict paths (fun paths (pattern,lexeme_postags_list) -> | |
147 | - let matchings_found = find_abr_patterns [Xlist.map pattern (fun pat -> O pat)] tokens in | |
148 | - Xlist.fold matchings_found paths (fun paths (beg,len,next,orth) -> | |
149 | - let orths = make_orths orth beg len lexeme_postags_list in | |
150 | - ENIAMpaths.add_path paths beg next orths)) in | |
151 | - paths*) | |
152 | - | |
153 | -let rec preselect_mwe_dict_token set = function | |
154 | - SmallLetter orth -> StringSet.add set orth | |
155 | - | CapLetter(orth,lc) -> StringSet.add set orth | |
156 | - | AllSmall orth -> StringSet.add set orth | |
157 | - | AllCap(orth,lc,lc2) -> StringSet.add set orth | |
158 | - | FirstCap(orth,lc,_,_) -> StringSet.add set orth | |
159 | - | SomeCap orth -> StringSet.add set orth | |
160 | - | Symbol orth -> StringSet.add set orth | |
161 | - | Dig(v,"dig") -> StringSet.add set v | |
162 | - | Other2 orth -> StringSet.add set orth | |
163 | - | _ -> set | |
164 | - | |
165 | -let rec preselect_mwe_dict_tokens set = function | |
166 | - Token t -> preselect_mwe_dict_token set t.token | |
167 | - | Seq l -> Xlist.fold l set preselect_mwe_dict_tokens | |
168 | - | Variant l -> Xlist.fold l set preselect_mwe_dict_tokens | |
169 | - | |
170 | -let preselect_mwe_dict mwe_dict tokens = | |
171 | - let set = Xlist.fold tokens StringSet.empty preselect_mwe_dict_tokens in | |
172 | - let set = StringSet.fold set StringSet.empty (fun set orth -> | |
173 | - try | |
174 | - let l = StringMap.find mwe_dict orth in | |
175 | - Xlist.fold l set StringSet.add | |
176 | - with Not_found -> set) in | |
177 | -(* StringSet.iter set print_endline; *) | |
178 | - StringSet.fold set [] (fun l s -> | |
179 | - match Str.split_delim (Str.regexp "\t") s with | |
180 | - [lemma; interp; sense] -> | |
181 | - (match Str.split_delim (Str.regexp ":") interp with | |
182 | - orths :: tags -> (Str.split (Str.regexp " ") orths, lemma, String.concat ":" tags, sense) :: l | |
183 | - | _ -> failwith "preselect_mwe_dict") | |
184 | - | _ -> failwith "preselect_mwe_dict") | |
185 | - | |
186 | -let simplify_lemma lemma = | |
187 | - match Str.split (Str.regexp "-") lemma with | |
188 | - [x;"1"] -> x | |
189 | - | [x;"2"] -> x | |
190 | - | [x;"3"] -> x | |
191 | - | [x;"4"] -> x | |
192 | - | [x;"5"] -> x | |
193 | - | _ -> lemma | |
194 | - | |
195 | -let mwe_dict = load_mwe_dict () | |
196 | - | |
197 | -let process_mwe paths (*tokens*) = paths | |
198 | -(* let mwe_dict = preselect_mwe_dict mwe_dict tokens in | |
199 | - let paths = Xlist.fold mwe_dict paths (fun paths (pattern,lexeme,interp,sense) -> | |
200 | - let matchings_found = find_abr_patterns [Xlist.map pattern (fun pat -> O pat)] tokens in | |
201 | - Xlist.fold matchings_found paths (fun paths (beg,len,next,orth) -> | |
202 | - let orths = make_orths orth beg len [simplify_lemma lexeme,interp] in | |
203 | - ENIAMpaths.add_path paths beg next orths)) in | |
204 | - paths*) | |
205 | -*) | |
56 | +let process_orth = function | |
57 | + [Lexer.T lemma; Lexer.B("(",")",[Lexer.T interp])] -> process_interp lemma interp | |
58 | + | [Lexer.T orth] -> O orth | |
59 | + | [Lexer.B("{","}",l); Lexer.B("(",")",[Lexer.T interp])] -> process_interp (Lexer.string_of_token_list l) interp | |
60 | + | [Lexer.B("{","}",l)] -> O(Lexer.string_of_token_list l) | |
61 | + | tokens -> failwith ("process_orth: " ^ Lexer.string_of_token_list tokens) | |
62 | + | |
63 | +let load_mwe_dict2 (dict,dict2) filename = | |
64 | + File.fold_tab filename (dict,dict2) (fun (dict,dict2) -> function | |
65 | + [orths; lemma] -> | |
66 | + (* print_endline (orths ^ "\t" ^ lemma); *) | |
67 | + let tokens = Lexer.split "(\\|)\\|{\\|}\\| " orths in | |
68 | + (* print_endline ("load_dict2 1: " ^ Lexer.string_of_token_list tokens); *) | |
69 | + let tokens = Lexer.find_brackets ["{","}";"(",")"] [] tokens in | |
70 | + (* print_endline ("load_dict2 2: " ^ Lexer.string_of_token_list tokens); *) | |
71 | + let orths = List.rev (Xlist.rev_map (Lexer.split_symbol (Lexer.T " ") [] tokens) process_orth) in | |
72 | + let tokens = Lexer.split "(\\|)\\|{\\|}" lemma in | |
73 | + (* print_endline ("load_dict2 3: " ^ Lexer.string_of_token_list tokens); *) | |
74 | + let tokens = Lexer.find_brackets ["{","}";"(",")"] [] tokens in | |
75 | + (* print_endline ("load_dict2 4: " ^ Lexer.string_of_token_list tokens); *) | |
76 | + let lemma,cat,interp = match process_orth tokens with | |
77 | + L(lemma,cat,interp) -> lemma,cat,interp | |
78 | + | _ -> failwith "load_mwe_dict2" in | |
79 | + if orths = [] then failwith "load_mwe_dict2" else | |
80 | + (match List.hd orths with | |
81 | + L(s,_,_) -> dict, StringMap.add_inc dict2 s [orths,lemma,cat,interp] (fun l -> (orths,lemma,cat,interp) :: l) | |
82 | + | O s -> StringMap.add_inc dict s [orths,lemma,cat,interp] (fun l -> (orths,lemma,cat,interp) :: l), dict2 | |
83 | + | D _ -> failwith "load_mwe_dict2") | |
84 | + | l -> failwith ("load_mwe_dict2 '" ^ String.concat "\t" l ^ "'")) | |
85 | + | |
86 | +let mwe_dict,mwe_dict2 = | |
87 | + let dict = load_mwe_dict StringMap.empty brev_filename in | |
88 | + let dict = try load_mwe_dict dict fixed_filename with _ -> (prerr_endline ("ENIAMsubsyntax file " ^ fixed_filename ^ " not found"); dict) in | |
89 | + let dict = load_mwe_dict dict mwe_filename in | |
90 | + let dict,dict2 = load_mwe_dict2 (dict,StringMap.empty) sejf_filename in | |
91 | + let dict,dict2 = load_mwe_dict2 (dict,dict2) sejfek_filename in | |
92 | + let dict,dict2 = load_mwe_dict2 (dict,dict2) sawa_filename in | |
93 | + dict,dict2 | |
206 | 94 | |
207 | 95 | let get_orths paths = |
208 | 96 | IntMap.fold paths StringSet.empty (fun orths _ map -> |
209 | 97 | IntMap.fold map orths (fun orths _ l -> |
210 | - Xlist.fold l orths (fun orths t -> | |
98 | + TokenEnvSet.fold l orths (fun orths t -> | |
211 | 99 | StringSet.add orths (ENIAMtokens.get_orth t.token)))) |
212 | 100 | |
101 | +let get_lemmas paths = | |
102 | + IntMap.fold paths StringSet.empty (fun orths _ map -> | |
103 | + IntMap.fold map orths (fun orths _ l -> | |
104 | + TokenEnvSet.fold l orths (fun orths t -> | |
105 | + StringSet.add orths (ENIAMtokens.get_lemma t.token)))) | |
106 | + | |
213 | 107 | let get_intnum_orths paths = |
214 | 108 | IntMap.fold paths StringMap.empty (fun orths _ map -> |
215 | 109 | IntMap.fold map orths (fun orths _ l -> |
216 | - Xlist.fold l orths (fun orths t -> | |
110 | + TokenEnvSet.fold l orths (fun orths t -> | |
217 | 111 | match t.token with |
218 | 112 | Dig(lemma,"intnum") -> StringMap.add_inc orths (ENIAMtokens.get_orth t.token) (StringSet.singleton lemma) (fun set -> StringSet.add set lemma) |
219 | 113 | | _ -> orths))) |
220 | 114 | |
221 | -let rec match_path_rec map found (t:token_env) rev = function | |
222 | - [] -> (t :: rev) :: found | |
115 | +let preselect orths lemmas rules l = | |
116 | + Xlist.fold l rules (fun rules (match_list,lemma,cat,interp) -> | |
117 | + let b = Xlist.fold match_list true (fun b -> function | |
118 | + O s -> StringSet.mem orths s && b | |
119 | + | L(s,_,_) -> StringSet.mem lemmas s && b | |
120 | + | D(s,_) -> failwith "preselect") in | |
121 | + if b then (match_list,lemma,cat,interp) :: rules else rules) | |
122 | + | |
123 | +let preselect_dict orths lemmas dict rules = | |
124 | + StringSet.fold orths rules (fun rules orth -> | |
125 | + try | |
126 | + preselect orths lemmas rules (StringMap.find dict orth) | |
127 | + with Not_found -> rules) | |
128 | + | |
129 | +let preselect_dict2 orths lemmas dict2 rules = | |
130 | + StringSet.fold lemmas rules (fun rules lemma -> | |
131 | + try | |
132 | + preselect orths lemmas rules (StringMap.find dict2 lemma) | |
133 | + with Not_found -> rules) | |
134 | + | |
135 | +let add_ordnum_rules orths rules = | |
136 | + StringMap.fold orths rules (fun rules orth lemmas -> | |
137 | + StringSet.fold lemmas rules (fun rules lemma -> | |
138 | + (* Printf.printf "%s %s\n%!" orth lemma; *) | |
139 | + ([D(orth,"intnum");O "."],lemma,"ordnum",[]) :: rules)) | |
140 | + | |
141 | +let select_rules paths mwe_dict mwe_dict2 = | |
142 | + let orths = get_orths paths in | |
143 | + let lemmas = get_lemmas paths in | |
144 | + let intnum_orths = get_intnum_orths paths in | |
145 | + let rules = preselect_dict orths lemmas mwe_dict [] in | |
146 | + let rules = preselect_dict2 orths lemmas mwe_dict2 rules in | |
147 | + let rules = add_ordnum_rules intnum_orths rules in | |
148 | + rules | |
149 | + | |
150 | +let rec check_interp sels = function | |
151 | + [],[] -> true | |
152 | + | s :: interp, ["_"] :: interp2 -> check_interp sels (interp,interp2) | |
153 | + | V s :: interp, l2 :: interp2 -> if Xlist.mem l2 s then check_interp sels (interp,interp2) else false | |
154 | + | S s :: interp, l2 :: interp2 -> | |
155 | + (try | |
156 | + let l = Xlist.assoc sels s in | |
157 | + let b = Xlist.fold l false (fun b s -> Xlist.mem l2 s || b) in | |
158 | + if b then check_interp sels (interp,interp2) else false | |
159 | + with Not_found -> check_interp sels (interp,interp2)) | |
160 | + | G :: interp, l2 :: interp2 -> check_interp sels (interp,interp2) | |
161 | + | _ -> failwith "check_interp" | |
162 | + | |
163 | +let rec get_sels sels = function | |
164 | + [],[] -> sels | |
165 | + | s :: interp, ["_"] :: interp2 -> get_sels sels (interp,interp2) | |
166 | + | V s :: interp, l2 :: interp2 -> get_sels sels (interp,interp2) | |
167 | + | S s :: interp, l2 :: interp2 -> | |
168 | + (try | |
169 | + let l = Xlist.assoc sels s in | |
170 | + let sels = List.remove_assoc s sels in | |
171 | + let l = Xlist.fold l [] (fun l s -> if Xlist.mem l2 s then s :: l else l) in | |
172 | + get_sels ((s,l) :: sels) (interp,interp2) | |
173 | + with Not_found -> get_sels ((s,l2) :: sels) (interp,interp2)) | |
174 | + | G :: interp, l2 :: interp2 -> get_sels sels (interp,interp2) | |
175 | + | _ -> failwith "get_sels" | |
176 | + | |
177 | +let rec match_path_rec map found (t:token_env) sels rev = function | |
178 | + [] -> (t :: rev, sels) :: found | |
223 | 179 | | s :: l -> |
224 | 180 | let map2 = try IntMap.find map t.next with Not_found -> IntMap.empty in |
225 | 181 | let found2 = IntMap.fold map2 [] (fun found2 _ l -> |
226 | - Xlist.fold l found2 (fun found2 new_t -> | |
227 | - if ENIAMtokens.get_orth new_t.token = s then new_t :: found2 else found2)) in | |
228 | - Xlist.fold found2 found (fun found new_t -> match_path_rec map found new_t (t :: rev) l) | |
182 | + TokenEnvSet.fold l found2 (fun found2 new_t -> | |
183 | + match s,new_t.token with | |
184 | + O s, token -> if ENIAMtokens.get_orth token = s then (new_t,sels) :: found2 else found2 | |
185 | + | L(s,cat,interp), Lemma(s2,cat2,interps2) -> | |
186 | + Xlist.fold interps2 found2 (fun found2 interp2 -> | |
187 | + if s=s2 && cat=cat2 && check_interp sels (interp,interp2) then | |
188 | + (new_t,get_sels sels (interp,interp2)) :: found2 else found2) | |
189 | + | D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (new_t,sels) :: found2 else found2 | |
190 | + | _ -> found2)) in | |
191 | + Xlist.fold found2 found (fun found (new_t,sels) -> match_path_rec map found new_t sels (t :: rev) l) | |
229 | 192 | |
230 | 193 | let match_path map = function |
231 | 194 | [] -> failwith "match_path" |
232 | 195 | | s :: l -> |
233 | 196 | let found = IntMap.fold map [] (fun found i map2 -> |
234 | 197 | IntMap.fold map2 found (fun found j l -> |
235 | - Xlist.fold l found (fun found t -> | |
236 | - if ENIAMtokens.get_orth t.token = s then t :: found else found))) in | |
237 | - Xlist.fold found [] (fun found t -> match_path_rec map found t [] l) | |
198 | + TokenEnvSet.fold l found (fun found t -> | |
199 | + match s,t.token with | |
200 | + O s, token -> if ENIAMtokens.get_orth token = s then (t,[]) :: found else found | |
201 | + | L(s,cat,interp), Lemma(s2,cat2,interps2) -> | |
202 | + Xlist.fold interps2 found (fun found interp2 -> | |
203 | + if s=s2 && cat=cat2 && check_interp [] (interp,interp2) then | |
204 | + (t,get_sels [] (interp,interp2)) :: found else found) | |
205 | + | D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (t,[]) :: found else found | |
206 | + | _ -> found))) in | |
207 | + Xlist.fold found [] (fun found (t,sels) -> match_path_rec map found t sels [] l) | |
238 | 208 | |
239 | 209 | let concat_orths l = |
240 | 210 | let s = String.concat "" (Xlist.map l (fun t -> t.orth ^ (if t.beg+t.len=t.next then "" else " "))) in |
241 | 211 | let n = Xstring.size s in |
242 | 212 | if String.get s (n-1) = ' ' then String.sub s 0 (n-1) else s |
243 | 213 | |
244 | -let create_token (matching:token_env list) lemma interp = (* FIXME: problem z nazwami własnymi *) | |
214 | +let create_token (matching:token_env list) sels lemma cat interp = (* FIXME: problem z nazwami własnymi *) | |
245 | 215 | let l = List.rev matching in |
246 | 216 | let beg = (List.hd l).beg in |
247 | 217 | let t = List.hd matching in |
... | ... | @@ -251,40 +221,39 @@ let create_token (matching:token_env list) lemma interp = (* FIXME: problem z na |
251 | 221 | beg=beg; |
252 | 222 | len=len; |
253 | 223 | next=t.next; |
254 | - token=ENIAMtokens.make_lemma (lemma,interp); | |
224 | + token=Lemma(lemma,cat,[Xlist.map interp (function | |
225 | + S s -> (try Xlist.assoc sels s with Not_found -> ["_"]) | |
226 | + | V s -> [s] | |
227 | + | G -> ["_"])]); | |
255 | 228 | weight=0.; (* FIXME: dodać wagi do konkretnych reguł i uwzględnić wagi maczowanych tokenów *) |
256 | 229 | attrs=ENIAMtokens.merge_attrs l} |
257 | 230 | |
258 | 231 | let add_token paths t = |
259 | 232 | let map = try IntMap.find paths t.beg with Not_found -> IntMap.empty in |
260 | - let map = IntMap.add_inc map t.next [t] (fun l -> t :: l) in | |
233 | + let map = IntMap.add_inc map t.next (TokenEnvSet.singleton t) (fun set -> TokenEnvSet.add set t) in | |
261 | 234 | IntMap.add paths t.beg map |
262 | 235 | |
263 | -let apply_rule paths (match_list,lemma,interp) = | |
236 | +let apply_rule paths (match_list,lemma,cat,interp) = | |
264 | 237 | (* print_endline ("apply_rule: " ^ lemma); *) |
265 | 238 | let matchings_found = match_path paths match_list in |
266 | - Xlist.fold matchings_found paths (fun paths matching -> | |
239 | + Xlist.fold matchings_found paths (fun paths (matching,sels) -> | |
267 | 240 | try |
268 | - let token = create_token matching lemma interp in | |
241 | + let token = create_token matching sels lemma cat interp in | |
269 | 242 | add_token paths token |
270 | 243 | with Not_found -> paths) |
271 | 244 | |
272 | -(* FIXME: reguły dla ordnum powinny maczować część mowy a nie tylko orth *) | |
273 | -let add_ordnum_rules rules paths = | |
274 | - let orths = get_intnum_orths paths in | |
275 | - StringMap.fold orths rules (fun rules orth lemmas -> | |
276 | - StringSet.fold lemmas rules (fun rules lemma -> | |
277 | - (* Printf.printf "%s %s\n%!" orth lemma; *) | |
278 | - ([orth;"."],lemma,"ordnum") :: rules)) | |
279 | - | |
280 | 245 | let process (paths,last) = |
281 | 246 | let paths = Xlist.fold paths IntMap.empty add_token in |
282 | - let orths = get_orths paths in | |
283 | - let rules = preselect_dict orths mwe_dict in | |
284 | - let rules = add_ordnum_rules rules paths in | |
247 | + let rules = select_rules paths mwe_dict mwe_dict2 in | |
248 | + let paths = Xlist.fold rules paths apply_rule in | |
249 | + let rules = select_rules paths mwe_dict mwe_dict2 in | |
250 | + let paths = Xlist.fold rules paths apply_rule in | |
251 | + let rules = select_rules paths mwe_dict mwe_dict2 in | |
252 | + let paths = Xlist.fold rules paths apply_rule in | |
253 | + let rules = select_rules paths mwe_dict mwe_dict2 in | |
285 | 254 | let paths = Xlist.fold rules paths apply_rule in |
286 | 255 | let paths = IntMap.fold paths [] (fun paths _ map -> |
287 | 256 | IntMap.fold map paths (fun paths _ l -> |
288 | - Xlist.fold l paths (fun paths t -> | |
257 | + TokenEnvSet.fold l paths (fun paths t -> | |
289 | 258 | t :: paths))) in |
290 | 259 | ENIAMpaths.sort (paths,last) |
... | ... |
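The hunk above replaces the old orth-only MWE matching with a small pattern language: a rule is a list of patterns (O for an exact orthographic form, L for a lemma with a category and a feature pattern, D for a Dig token such as an intnum), and the shared S-slots enforce case/number/gender agreement across the matched tokens via check_interp and get_sels. Below is a minimal standalone sketch of that idea using only the OCaml standard library (no Xstd, no ENIAM modules); the phrase "biały kruk", the feature values in the test, and the simplified check function are illustrative assumptions, not quotes from the ENIAM code or resources.

(* Standalone sketch of the pattern language used by the new ENIAM_MWE.ml.
   Standard library only; not the ENIAM API. *)

type sel =
  | V of string   (* fixed feature value, e.g. "pos" *)
  | S of string   (* shared agreement slot: $c, $n, $g, $d, $C in the source *)
  | G             (* wildcard "_" *)

type pat =
  | L of string * string * sel list  (* match a Lemma: lemma, category, features *)
  | O of string                      (* match an orthographic form *)
  | D of string * string             (* match a Dig token, e.g. ("3","intnum") *)

(* "cat:f1:f2:..." -> L(lemma,cat,[...]), in the spirit of process_interp. *)
let process_interp lemma interp =
  match String.split_on_char ':' interp with
  | cat :: feats ->
      L (lemma, cat,
         List.map (function
           | "_" -> G
           | ("$c" | "$n" | "$g" | "$d" | "$C") as s -> S (String.sub s 1 1)
           | s when s <> "" && s.[0] = '$' -> failwith ("process_interp: " ^ s)
           | s -> V s)
           feats)
  | [] -> failwith "process_interp"

(* Agreement check in the spirit of check_interp: a V value must occur in the
   token's value list, an S slot must stay consistent with what it was bound
   to earlier in the match, G and "_" always succeed. *)
let rec check sels = function
  | [], [] -> true
  | _ :: pat, ["_"] :: vals -> check sels (pat, vals)
  | V v :: pat, l :: vals -> List.mem v l && check sels (pat, vals)
  | S slot :: pat, l :: vals ->
      (match List.assoc_opt slot sels with
       | Some bound -> List.exists (fun v -> List.mem v l) bound && check sels (pat, vals)
       | None -> check sels (pat, vals))
  | G :: pat, _ :: vals -> check sels (pat, vals)
  | _ -> failwith "check"

let () =
  (* Two illustrative rules: a hypothetical fixed adjective+noun phrase, and
     the shape produced by add_ordnum_rules (an intnum token followed by "."). *)
  let _fixed_phrase =
    [ process_interp "biały" "adj:$n:$c:m3:pos";
      L ("kruk", "subst", [S "n"; S "c"; V "m3"]) ] in
  let _ordnum = [ D ("3", "intnum"); O "." ] in
  (* With slot "c" already bound to gen, a gen token passes and a nom token fails. *)
  let sels = [ "c", ["gen"] ] in
  assert (check sels ([S "n"; S "c"; V "m3"], [["sg"]; ["gen"]; ["m3"]]));
  assert (not (check sels ([S "n"; S "c"; V "m3"], [["sg"]; ["nom"]; ["m3"]])));
  print_endline "agreement sketch ok"

In the real code, match_path_rec threads the accumulated slot bindings (sels) from token to token, which is why a rule like the fixed phrase above can only match token sequences whose case, number and gender agree.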
subsyntax/ENIAMsubsyntaxTypes.ml
... | ... | @@ -48,6 +48,9 @@ let brev_filename = resource_path ^ "/subsyntax/brev.tab" |
48 | 48 | let fixed_filename = resource_path ^ "/Walenty/fixed.tab" |
49 | 49 | let complete_entries_filename = resource_path ^ "/subsyntax/complete_entries.tab" |
50 | 50 | let mwe_filename = resource_path ^ "/subsyntax/mwe.tab" |
51 | +let sawa_filename = resource_path ^ "/subsyntax/SAWA.dic" | |
52 | +let sejf_filename = resource_path ^ "/subsyntax/SEJF.dic" | |
53 | +let sejfek_filename = resource_path ^ "/subsyntax/SEJFEK.dic" | |
51 | 54 | |
52 | 55 | let lemma_frequencies_filename = resource_path ^ "/subsyntax/NKJP1M-lemma-freq.tab" |
53 | 56 | |
... | ... |
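The three paths added here point at the SEJF, SEJFEK and SAWA lexicons consumed by load_mwe_dict2 above. Judging from that function's lexer pipeline (Lexer.split, find_brackets, split_symbol, process_orth), an entry has two tab-separated columns: a space-separated list of surface tokens, in which {...} groups a multi-word token and a trailing (...) attaches an interpretation, and a lemma column of the form lemma(cat:features). The sketch below is a rough, standard-library-only splitter for the first column; it only mimics the bracket handling in spirit (the real parsing goes through ENIAM's Lexer module), and both the function name split_orths and the sample entry are invented for illustration rather than quoted from the real .dic files.

(* Sketch: split the orths column of a *.dic entry into top-level chunks,
   keeping "{...}" groups and "(...)" interpretations attached to their token. *)
let split_orths s =
  let buf = Buffer.create 16 in
  let chunks = ref [] in
  let depth = ref 0 in               (* nesting level of { } and ( ) *)
  let flush () =
    if Buffer.length buf > 0 then begin
      chunks := Buffer.contents buf :: !chunks;
      Buffer.clear buf
    end in
  String.iter (fun c ->
    match c with
    | '{' | '(' -> incr depth; Buffer.add_char buf c
    | '}' | ')' -> decr depth; Buffer.add_char buf c
    | ' ' when !depth = 0 -> flush ()          (* token boundary *)
    | _ -> Buffer.add_char buf c) s;
  flush ();
  List.rev !chunks

let () =
  (* Invented entry in the SAWA/SEJF style: a braced multi-word token with an
     interpretation, followed by two plain tokens. *)
  split_orths "{Pałac Kultury}(subst:$c) i Nauki"
  |> List.iter print_endline
  (* prints:
       {Pałac Kultury}(subst:$c)
       i
       Nauki *)

process_orth then turns each chunk into one of the pattern constructors: a braced group with an interpretation becomes an L pattern over the whole multi-word form, while a plain chunk becomes an O pattern.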
subsyntax/TODO
1 | -- finish MWE recognition | |
2 | -- add MWE resources | |
3 | 1 | - recognition of MWEs from Słowosieć |
2 | +- compressing tokens that have identical lemmas (either after processing, or by compressing interpretations before recognizing MWEs) | |
4 | 3 | |
5 | 4 | - how to process num:comp |
6 | 5 | - processing of compound numerals, e.g. dwadzieścia jeden (twenty-one), jedna druga (one half) |
... | ... |
subsyntax/makefile
... | ... | @@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt |
3 | 3 | OCAMLDEP=ocamldep |
4 | 4 | INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam |
5 | 5 | OCAMLFLAGS=$(INCLUDES) -g |
6 | -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa | |
6 | +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa #eniam-subsyntax.cmxa | |
7 | 7 | INSTALLDIR=`ocamlc -where`/eniam |
8 | 8 | |
9 | 9 | SOURCES= ENIAMsubsyntaxTypes.ml ENIAMsubsyntaxStringOf.ml ENIAMsubsyntaxHTMLof.ml ENIAMsubsyntaxXMLof.ml ENIAMsubsyntaxGraphOf.ml ENIAMpaths.ml ENIAM_MWE.ml ENIAMsentences.ml ENIAMsubsyntax.ml |
... | ... | @@ -32,8 +32,8 @@ eniam-subsyntax.cma: $(SOURCES) |
32 | 32 | eniam-subsyntax.cmxa: $(SOURCES) |
33 | 33 | ocamlopt -linkall -a -o eniam-subsyntax.cmxa $(INCLUDES) $^ |
34 | 34 | |
35 | -test: test.ml | |
36 | - $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) test.ml | |
35 | +test: $(SOURCES) test.ml | |
36 | + $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $(SOURCES) test.ml | |
37 | 37 | |
38 | 38 | interface: interface.ml |
39 | 39 | $(OCAMLOPT) -o subsyntax $(OCAMLOPTFLAGS) interface.ml |
... | ... |
subsyntax/resources/README
1 | -File NKJP1M-lemma-freq.tab in this folder was created on the basis of | |
1 | +File NKJP1M-lemma-freq.tab in this folder was created on the basis of | |
2 | 2 | |
3 | 3 | NKJP1M: the manually annotated 1-million word subcorpus sampled |
4 | 4 | from texts of a subset of the National Corpus of Polish. |
5 | 5 | version 1.2 |
6 | 6 | |
7 | -File proper_names_sgjp_polimorf_20151020.tab in this folder were created on the basis of | |
7 | +File proper_names_sgjp_polimorf_20151020.tab in this folder was created on the basis of | |
8 | 8 | |
9 | 9 | SGJP: Grammatical Dictionary of Polish, version 20151020 |
10 | 10 | Copyright © 2007–2015 Zygmunt Saloni, Włodzimierz Gruszczyński, Marcin |
11 | 11 | Woliński, Robert Wołosz, Danuta Skowrońska |
12 | 12 | |
13 | -and also on the basis of | |
13 | +and also on the basis of | |
14 | 14 | PoliMorf, version 20151020 |
15 | + | |
16 | +File SEJF.dic was created on the basis of | |
17 | +Grammatical Lexicon of Polish Phraseology | |
18 | +Copyright © Monika Czerepowicka, Agata Savary | |
19 | +Copyright © Institute of Computer Science Polish Academy of Sciences | |
20 | +The data are available under the CC BY-SA license. | |
21 | + | |
22 | +File SEJFEK.dic was created on the basis of | |
23 | +Grammatical Lexicon of Polish Economic Phraseology | |
24 | +Copyright © Filip Makowiecki, Agata Savary | |
25 | +Copyright © Institute of Computer Science Polish Academy of Sciences | |
26 | +The data are available under the CC BY-SA license. | |
27 | + | |
28 | +File SAWA.dic was created on the basis of | |
29 | +Grammatical Lexicon of Warsaw Urban Proper Names | |
30 | +Copyright © Małgorzata Marciniak, Celina Heliasz, Joanna Rabiega-Wiśniewska, Piotr Sikora, Marcin Woliński, Agata Savary | |
31 | +Copyright © Institute of Computer Science Polish Academy of Sciences | |
32 | +The data are available under the CC BY-SA license. | |
... | ... |