Commit 2f308cb1e0849b51145883f2b46db45c41f33574
1 parent 766cb2a4
adding resources with multi-word units
Showing 12 changed files with 27932 additions and 223 deletions.
Too many changes to show. To preserve performance only 6 of 12 files are displayed.
morphology2/TODO
@@ -2,3 +2,4 @@ zintegrować z modelem probabilistycznym i dokończyć
 dodać tagger
 usunąć reguły praet z dołączonym aglutynatem!
 i usunąć excluded interps z subsyntax
+dodać morfeusz_suplementy wydobyte z zasobów MWE
subsyntax/ENIAM_MWE.ml
@@ -21,227 +21,197 @@ open Xstd
 open ENIAMsubsyntaxTypes
 open ENIAMtokenizerTypes

-let load_dict dict filename =
+type sel = V of string | S of string | G
+
+type t =
+    L of string * string * sel list
+  | O of string
+  | D of string * string
+
+let process_interp lemma interp =
+  match Xstring.split ":" interp with
+    cat :: interp -> L(lemma,cat,Xlist.map interp (function
+        "$c" -> S "c"
+      | "$n" -> S "n"
+      | "$g" -> S "g"
+      | "$d" -> S "d"
+      | "$C" -> S "C"
+      | "_" -> G
+      | s -> if String.get s 0 = '$' then failwith ("process_interp: " ^ s) else V s))
+  | _ -> failwith "process_interp"
+
+let load_mwe_dict dict filename =
   File.fold_tab filename dict (fun dict -> function
-      [orth; lemma; interp] ->
-        let s = List.hd (Str.split_delim (Str.regexp " ") orth) in
-        StringMap.add_inc dict s [orth,lemma,interp] (fun l -> (orth,lemma,interp) :: l)
+      [orths; lemma; interp] ->
+        let orths = Xstring.split " " orths in
+        if orths = [] then failwith "load_mwe_dict" else
+        let s = List.hd orths in
+        let orths = Xlist.map orths (fun s -> O s) in
+        let lemma,cat,interp = match process_interp lemma interp with
+            L(lemma,cat,interp) -> lemma,cat,interp
+          | _ -> failwith "load_mwe_dict2" in
+        StringMap.add_inc dict s [orths,lemma,cat,interp] (fun l -> (orths,lemma,cat,interp) :: l)
     | l -> failwith ("load_mwe_dict '" ^ String.concat "\t" l ^ "'"))

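Note on the pattern encoding introduced above: the interpretation string of an entry is split on ":"; the first field becomes the grammatical category and the remaining fields become sel constraints ($c/$n/$g/$d/$C are shared agreement slots, "_" matches anything, any other field is a fixed value). The standalone sketch below mimics that decoding with the OCaml standard library only; decode_interp and the sample string "subst:$n:$c:$g" are illustrative, not taken from the ENIAM sources, and unlike process_interp it accepts any "$x" slot name.

    (* Standalone sketch (stdlib only); the real process_interp above uses
       Xstring.split and whitelists the $c/$n/$g/$d/$C slots. *)
    type sel = V of string | S of string | G

    let decode_interp interp =
      match String.split_on_char ':' interp with
      | cat :: fields ->
          cat,
          List.map
            (fun f ->
              if f = "_" then G
              else if String.length f > 1 && f.[0] = '$' then
                S (String.sub f 1 (String.length f - 1))
              else V f)
            fields
      | [] -> failwith "decode_interp"

    let () =
      let cat, sels = decode_interp "subst:$n:$c:$g" in
      Printf.printf "%s with %d agreement slots\n" cat
        (List.length (List.filter (function S _ -> true | _ -> false) sels))
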
-let mwe_dict =
-  let dict = load_dict StringMap.empty brev_filename in
-  let dict = try load_dict dict fixed_filename with _ -> (prerr_endline ("ENIAMsubsyntax file " ^ fixed_filename ^ " not found"); dict) in
-(* let dict = load_dict dict complete_entries_filename in*)
-  let dict = load_dict dict mwe_filename in
-  dict
-
-let preselect_dict orths dict =
-  StringSet.fold orths [] (fun rules orth ->
-    try
-      let l = StringMap.find dict orth in
-      Xlist.fold l rules (fun rules (orth,lemma,interp) ->
-        (* print_endline ("preselect_dict: " ^ orth); *)
-        let match_list = Str.split (Str.regexp " ") orth in
-        let b = Xlist.fold match_list true (fun b s ->
-          (* if not (StringSet.mem orths s) then print_endline s; *)
-          StringSet.mem orths s && b) in
-        if b then (match_list,lemma,interp) :: rules else rules)
-    with Not_found -> rules)
-
-
-(*
-type matching = {
-    prefix: tokens list;
-    matched: token_record list;
-    suffix: tokens list;
-    pattern: pat list;
-    command: token_record list -> token;
-    last: int
-  }
-
-let rec find_abr_pattern_tail matchings found = function
-    [] -> found
-  | token :: l ->
-      let matchings,found = Xlist.fold matchings ([],found) (fun (matchings,found) matching ->
-        match matching.pattern with
-          [pat] ->
-            let matchings = if token.beg <= matching.last then matching :: matchings else matchings in
-            if PrePatterns.match_token (pat,token.token) && token.beg = matching.last then
-              matchings, {matching with matched = token :: matching.matched; last=token.next; pattern=[]} :: found else
-              matchings, found
-        | pat :: pattern ->
-            let matchings = if token.beg <= matching.last then matching :: matchings else matchings in
-            if PrePatterns.match_token (pat,token.token) && token.beg = matching.last then
-              {matching with matched = token :: matching.matched; last=token.next; pattern=pattern} :: matchings, found else
-              matchings, found
-        | [] -> matchings, matching :: found) in
-      if matchings = [] then found else find_abr_pattern_tail matchings found l
-
-let rec find_abr_pattern all_matchings found = function
-    token :: l ->
-      let matchings = Xlist.fold all_matchings [] (fun matchings matching ->
-        match matching.pattern with
-          pat :: pattern ->
-            (if PrePatterns.match_token (pat,token.token) then
-              [{matching with matched = token :: matching.matched; last=token.next; pattern=pattern}] else []) @ matchings
-        | _ -> failwith "find_abr_pattern: ni") in
-      let found = if matchings = [] then found else find_abr_pattern_tail matchings found l in
-      find_abr_pattern all_matchings found l
-  | [] -> found
-
-let rec make_abr_orth = function
-    [] -> ""
-  | [t] -> t.orth
-  | t :: l -> if t.beg + t.len = t.next then t.orth ^ (make_abr_orth l) else t.orth ^ " " ^ (make_abr_orth l)
-
-let find_abr_patterns patterns tokens =
-  let found = find_abr_pattern (Xlist.map patterns (fun pattern ->
-    {prefix=[]; matched=[]; suffix=[]; pattern=pattern; command=(fun _ -> Symbol ""); last=0})) [] tokens in
-  Xlist.rev_map found (fun matching ->
-    let t1 = List.hd (List.rev matching.matched) in
-    let t2 = List.hd matching.matched in
-    t1.beg,
-    t2.beg + t2.len - t1.beg,
-    t2.next,
-    make_abr_orth (List.rev matching.matched))
-
-let split_interp line gloss interp =
-  if interp = "xxx" then [gloss, "xxx"] else
-  Xlist.map (Str.split (Str.regexp " ") interp) (fun s ->
-    match Str.split (Str.regexp "|") s with
-      [lemma;interp] -> lemma, interp
-    | _ -> failwith ("bad brev entry: " ^ line))
-
-let load_brev_dict () =
-  let lines = File.load_lines "data/brev_20151215.tab" in
-  List.rev (Xlist.rev_map lines (fun line ->
-    match Str.split_delim (Str.regexp "\t") line with
-      [_; orth; gloss; interp; _] -> Str.split (Str.regexp " ") orth, split_interp line gloss interp
-    | [_; orth; gloss; interp] -> Str.split (Str.regexp " ") orth, split_interp line gloss interp
-    | _ -> failwith ("load_brev_dict: " ^ line)))
-
-let parse_lemma lemma =
-  if lemma = ":" then lemma,"" else
-  match Str.split (Str.regexp ":") lemma with
-    [x] -> x,""
-  | [x;y] -> x,y
-  | _ -> failwith ("parse_lemma: " ^ lemma)
-
-let make_orths orth beg len lexeme_postags_list =
-  let n = Xlist.size lexeme_postags_list in
-  let orth_list =
-    if n = 1 then [orth,beg,len] else
-    List.rev (Int.fold 1 n [] (fun l i ->
-      (orth ^ "_" ^ string_of_int i,
-       (if i=1 then beg else beg+len-n+i-1),
-       if i=1 then len-n+1 else 1) :: l)) in
-  List.rev (Xlist.fold (List.combine orth_list lexeme_postags_list) [] (fun orth_list ((orth,beg,len),(lemma,postags)) ->
-    (orth, fst (parse_lemma lemma), ENIAMtokens.parse_postags postags, beg, len) :: orth_list))
-
-let brev_dict = load_brev_dict ()
-
-(* FIXME: trzeba zmienić reprezentację skrótów nazw własnych: przenieść do mwe,
-   Gdy skrót jest częścią nazwy własnej powinien być dalej przetwarzalny *)
-let process_brev paths (*tokens*) = paths
-(* let paths = Xlist.fold brev_dict paths (fun paths (pattern,lexeme_postags_list) ->
-    let matchings_found = find_abr_patterns [Xlist.map pattern (fun pat -> O pat)] tokens in
-    Xlist.fold matchings_found paths (fun paths (beg,len,next,orth) ->
-      let orths = make_orths orth beg len lexeme_postags_list in
-      ENIAMpaths.add_path paths beg next orths)) in
-  paths*)
-
-let rec preselect_mwe_dict_token set = function
-    SmallLetter orth -> StringSet.add set orth
-  | CapLetter(orth,lc) -> StringSet.add set orth
-  | AllSmall orth -> StringSet.add set orth
-  | AllCap(orth,lc,lc2) -> StringSet.add set orth
-  | FirstCap(orth,lc,_,_) -> StringSet.add set orth
-  | SomeCap orth -> StringSet.add set orth
-  | Symbol orth -> StringSet.add set orth
-  | Dig(v,"dig") -> StringSet.add set v
-  | Other2 orth -> StringSet.add set orth
-  | _ -> set
-
-let rec preselect_mwe_dict_tokens set = function
-    Token t -> preselect_mwe_dict_token set t.token
-  | Seq l -> Xlist.fold l set preselect_mwe_dict_tokens
-  | Variant l -> Xlist.fold l set preselect_mwe_dict_tokens
-
-let preselect_mwe_dict mwe_dict tokens =
-  let set = Xlist.fold tokens StringSet.empty preselect_mwe_dict_tokens in
-  let set = StringSet.fold set StringSet.empty (fun set orth ->
-    try
-      let l = StringMap.find mwe_dict orth in
-      Xlist.fold l set StringSet.add
-    with Not_found -> set) in
-(* StringSet.iter set print_endline; *)
-  StringSet.fold set [] (fun l s ->
-    match Str.split_delim (Str.regexp "\t") s with
-      [lemma; interp; sense] ->
-        (match Str.split_delim (Str.regexp ":") interp with
-          orths :: tags -> (Str.split (Str.regexp " ") orths, lemma, String.concat ":" tags, sense) :: l
-        | _ -> failwith "preselect_mwe_dict")
-    | _ -> failwith "preselect_mwe_dict")
-
-let simplify_lemma lemma =
-  match Str.split (Str.regexp "-") lemma with
-    [x;"1"] -> x
-  | [x;"2"] -> x
-  | [x;"3"] -> x
-  | [x;"4"] -> x
-  | [x;"5"] -> x
-  | _ -> lemma
-
-let mwe_dict = load_mwe_dict ()
-
-let process_mwe paths (*tokens*) = paths
-(* let mwe_dict = preselect_mwe_dict mwe_dict tokens in
-  let paths = Xlist.fold mwe_dict paths (fun paths (pattern,lexeme,interp,sense) ->
-    let matchings_found = find_abr_patterns [Xlist.map pattern (fun pat -> O pat)] tokens in
-    Xlist.fold matchings_found paths (fun paths (beg,len,next,orth) ->
-      let orths = make_orths orth beg len [simplify_lemma lexeme,interp] in
-      ENIAMpaths.add_path paths beg next orths)) in
-  paths*)
-*)
+let process_orth = function
+    [Lexer.T lemma; Lexer.B("(",")",[Lexer.T interp])] -> process_interp lemma interp
+  | [Lexer.T orth] -> O orth
+  | [Lexer.B("{","}",l); Lexer.B("(",")",[Lexer.T interp])] -> process_interp (Lexer.string_of_token_list l) interp
+  | [Lexer.B("{","}",l)] -> O(Lexer.string_of_token_list l)
+  | tokens -> failwith ("process_orth: " ^ Lexer.string_of_token_list tokens)
+
+let load_mwe_dict2 (dict,dict2) filename =
+  File.fold_tab filename (dict,dict2) (fun (dict,dict2) -> function
+      [orths; lemma] ->
+        (* print_endline (orths ^ "\t" ^ lemma); *)
+        let tokens = Lexer.split "(\\|)\\|{\\|}\\| " orths in
+        (* print_endline ("load_dict2 1: " ^ Lexer.string_of_token_list tokens); *)
+        let tokens = Lexer.find_brackets ["{","}";"(",")"] [] tokens in
+        (* print_endline ("load_dict2 2: " ^ Lexer.string_of_token_list tokens); *)
+        let orths = List.rev (Xlist.rev_map (Lexer.split_symbol (Lexer.T " ") [] tokens) process_orth) in
+        let tokens = Lexer.split "(\\|)\\|{\\|}" lemma in
+        (* print_endline ("load_dict2 3: " ^ Lexer.string_of_token_list tokens); *)
+        let tokens = Lexer.find_brackets ["{","}";"(",")"] [] tokens in
+        (* print_endline ("load_dict2 4: " ^ Lexer.string_of_token_list tokens); *)
+        let lemma,cat,interp = match process_orth tokens with
+            L(lemma,cat,interp) -> lemma,cat,interp
+          | _ -> failwith "load_mwe_dict2" in
+        if orths = [] then failwith "load_mwe_dict2" else
+        (match List.hd orths with
+            L(s,_,_) -> dict, StringMap.add_inc dict2 s [orths,lemma,cat,interp] (fun l -> (orths,lemma,cat,interp) :: l)
+          | O s -> StringMap.add_inc dict s [orths,lemma,cat,interp] (fun l -> (orths,lemma,cat,interp) :: l), dict2
+          | D _ -> failwith "load_mwe_dict2")
+    | l -> failwith ("load_mwe_dict2 '" ^ String.concat "\t" l ^ "'"))
+
+let mwe_dict,mwe_dict2 =
+  let dict = load_mwe_dict StringMap.empty brev_filename in
+  let dict = try load_mwe_dict dict fixed_filename with _ -> (prerr_endline ("ENIAMsubsyntax file " ^ fixed_filename ^ " not found"); dict) in
+  let dict = load_mwe_dict dict mwe_filename in
+  let dict,dict2 = load_mwe_dict2 (dict,StringMap.empty) sejf_filename in
+  let dict,dict2 = load_mwe_dict2 (dict,dict2) sejfek_filename in
+  let dict,dict2 = load_mwe_dict2 (dict,dict2) sawa_filename in
+  dict,dict2

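The mwe_dict/mwe_dict2 pair built above indexes every multi-word entry under its first pattern element: a literal orth goes into mwe_dict, while a lemma-based first element goes into mwe_dict2, so preselection can start either from surface forms or from lemmas seen in the sentence. A minimal sketch of that routing, with the stdlib Map standing in for StringMap, a locally re-declared pattern type, and an invented sample entry:

    (* Sketch only: pat is re-declared locally, the entry is invented. *)
    module SMap = Map.Make (String)

    type pat = O of string | L of string * string * string list

    let add_entry (dict, dict2) ((pattern, _lemma) as entry) =
      let add key map =
        SMap.update key
          (function None -> Some [entry] | Some l -> Some (entry :: l))
          map
      in
      match pattern with
      | O s :: _ -> add s dict, dict2           (* keyed by first orth *)
      | L (s, _, _) :: _ -> dict, add s dict2   (* keyed by first lemma *)
      | [] -> failwith "add_entry: empty pattern"

    let () =
      let dict, dict2 =
        add_entry (SMap.empty, SMap.empty)
          ([L ("aleja", "subst", []); O "Jerozolimskie"], "Aleje Jerozolimskie")
      in
      Printf.printf "orth-keyed: %d entries, lemma-keyed: %d entries\n"
        (SMap.cardinal dict) (SMap.cardinal dict2)
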
 let get_orths paths =
   IntMap.fold paths StringSet.empty (fun orths _ map ->
     IntMap.fold map orths (fun orths _ l ->
-      Xlist.fold l orths (fun orths t ->
+      TokenEnvSet.fold l orths (fun orths t ->
         StringSet.add orths (ENIAMtokens.get_orth t.token))))

+let get_lemmas paths =
+  IntMap.fold paths StringSet.empty (fun orths _ map ->
+    IntMap.fold map orths (fun orths _ l ->
+      TokenEnvSet.fold l orths (fun orths t ->
+        StringSet.add orths (ENIAMtokens.get_lemma t.token))))
+
 let get_intnum_orths paths =
   IntMap.fold paths StringMap.empty (fun orths _ map ->
     IntMap.fold map orths (fun orths _ l ->
-      Xlist.fold l orths (fun orths t ->
+      TokenEnvSet.fold l orths (fun orths t ->
         match t.token with
           Dig(lemma,"intnum") -> StringMap.add_inc orths (ENIAMtokens.get_orth t.token) (StringSet.singleton lemma) (fun set -> StringSet.add set lemma)
         | _ -> orths)))

-let rec match_path_rec map found (t:token_env) rev = function
-    [] -> (t :: rev) :: found
+let preselect orths lemmas rules l =
+  Xlist.fold l rules (fun rules (match_list,lemma,cat,interp) ->
+    let b = Xlist.fold match_list true (fun b -> function
+        O s -> StringSet.mem orths s && b
+      | L(s,_,_) -> StringSet.mem lemmas s && b
+      | D(s,_) -> failwith "preselect") in
+    if b then (match_list,lemma,cat,interp) :: rules else rules)
+
+let preselect_dict orths lemmas dict rules =
+  StringSet.fold orths rules (fun rules orth ->
+    try
+      preselect orths lemmas rules (StringMap.find dict orth)
+    with Not_found -> rules)
+
+let preselect_dict2 orths lemmas dict2 rules =
+  StringSet.fold lemmas rules (fun rules lemma ->
+    try
+      preselect orths lemmas rules (StringMap.find dict2 lemma)
+    with Not_found -> rules)
+
+let add_ordnum_rules orths rules =
+  StringMap.fold orths rules (fun rules orth lemmas ->
+    StringSet.fold lemmas rules (fun rules lemma ->
+      (* Printf.printf "%s %s\n%!" orth lemma; *)
+      ([D(orth,"intnum");O "."],lemma,"ordnum",[]) :: rules))
+
+let select_rules paths mwe_dict mwe_dict2 =
+  let orths = get_orths paths in
+  let lemmas = get_lemmas paths in
+  let intnum_orths = get_intnum_orths paths in
+  let rules = preselect_dict orths lemmas mwe_dict [] in
+  let rules = preselect_dict2 orths lemmas mwe_dict2 rules in
+  let rules = add_ordnum_rules intnum_orths rules in
+  rules
+
+let rec check_interp sels = function
+    [],[] -> true
+  | s :: interp, ["_"] :: interp2 -> check_interp sels (interp,interp2)
+  | V s :: interp, l2 :: interp2 -> if Xlist.mem l2 s then check_interp sels (interp,interp2) else false
+  | S s :: interp, l2 :: interp2 ->
+      (try
+        let l = Xlist.assoc sels s in
+        let b = Xlist.fold l false (fun b s -> Xlist.mem l2 s || b) in
+        if b then check_interp sels (interp,interp2) else false
+      with Not_found -> check_interp sels (interp,interp2))
+  | G :: interp, l2 :: interp2 -> check_interp sels (interp,interp2)
+  | _ -> failwith "check_interp"
+
+let rec get_sels sels = function
+    [],[] -> sels
+  | s :: interp, ["_"] :: interp2 -> get_sels sels (interp,interp2)
+  | V s :: interp, l2 :: interp2 -> get_sels sels (interp,interp2)
+  | S s :: interp, l2 :: interp2 ->
+      (try
+        let l = Xlist.assoc sels s in
+        let sels = List.remove_assoc s sels in
+        let l = Xlist.fold l [] (fun l s -> if Xlist.mem l2 s then s :: l else l) in
+        get_sels ((s,l) :: sels) (interp,interp2)
+      with Not_found -> get_sels ((s,l2) :: sels) (interp,interp2))
+  | G :: interp, l2 :: interp2 -> get_sels sels (interp,interp2)
+  | _ -> failwith "get_sels"
+
+let rec match_path_rec map found (t:token_env) sels rev = function
+    [] -> (t :: rev, sels) :: found
   | s :: l ->
       let map2 = try IntMap.find map t.next with Not_found -> IntMap.empty in
       let found2 = IntMap.fold map2 [] (fun found2 _ l ->
-        Xlist.fold l found2 (fun found2 new_t ->
-          if ENIAMtokens.get_orth new_t.token = s then new_t :: found2 else found2)) in
-      Xlist.fold found2 found (fun found new_t -> match_path_rec map found new_t (t :: rev) l)
+        TokenEnvSet.fold l found2 (fun found2 new_t ->
+          match s,new_t.token with
+            O s, token -> if ENIAMtokens.get_orth token = s then (new_t,sels) :: found2 else found2
+          | L(s,cat,interp), Lemma(s2,cat2,interps2) ->
+              Xlist.fold interps2 found2 (fun found2 interp2 ->
+                if s=s2 && cat=cat2 && check_interp sels (interp,interp2) then
+                  (new_t,get_sels sels (interp,interp2)) :: found2 else found2)
+          | D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (new_t,sels) :: found2 else found2
+          | _ -> found2)) in
+      Xlist.fold found2 found (fun found (new_t,sels) -> match_path_rec map found new_t sels (t :: rev) l)

 let match_path map = function
     [] -> failwith "match_path"
   | s :: l ->
       let found = IntMap.fold map [] (fun found i map2 ->
         IntMap.fold map2 found (fun found j l ->
-          Xlist.fold l found (fun found t ->
-            if ENIAMtokens.get_orth t.token = s then t :: found else found))) in
-      Xlist.fold found [] (fun found t -> match_path_rec map found t [] l)
+          TokenEnvSet.fold l found (fun found t ->
+            match s,t.token with
+              O s, token -> if ENIAMtokens.get_orth token = s then (t,[]) :: found else found
+            | L(s,cat,interp), Lemma(s2,cat2,interps2) ->
+                Xlist.fold interps2 found (fun found interp2 ->
+                  if s=s2 && cat=cat2 && check_interp [] (interp,interp2) then
+                    (t,get_sels [] (interp,interp2)) :: found else found)
+            | D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (t,[]) :: found else found
+            | _ -> found))) in
+      Xlist.fold found [] (fun found (t,sels) -> match_path_rec map found t sels [] l)

 let concat_orths l =
   let s = String.concat "" (Xlist.map l (fun t -> t.orth ^ (if t.beg+t.len=t.next then "" else " "))) in
   let n = Xstring.size s in
   if String.get s (n-1) = ' ' then String.sub s 0 (n-1) else s

-let create_token (matching:token_env list) lemma interp = (* FIXME: problem z nazwami własnymi *)
+let create_token (matching:token_env list) sels lemma cat interp = (* FIXME: problem z nazwami własnymi *)
   let l = List.rev matching in
   let beg = (List.hd l).beg in
   let t = List.hd matching in
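The S-slots used in this hunk make the matched tokens agree: check_interp tests whether a candidate token's tag values are compatible with the values already recorded for a slot, and get_sels narrows the recorded values as the match advances. Below is a toy, stdlib-only illustration of that narrowing step; it is a simplified re-creation, not the ENIAM functions, and the slot and value names are examples.

    (* narrow returns None when agreement fails, otherwise the updated slots *)
    let narrow sels slot values =
      match List.assoc_opt slot sels with
      | None -> Some ((slot, values) :: sels)
      | Some prev ->
          let common = List.filter (fun v -> List.mem v values) prev in
          if common = [] then None
          else Some ((slot, common) :: List.remove_assoc slot sels)

    let () =
      match narrow [] "c" ["nom"; "acc"] with
      | None -> print_endline "no match"
      | Some sels ->
          (match narrow sels "c" ["acc"; "gen"] with
           | Some sels ->
               Printf.printf "case narrowed to %s\n"
                 (String.concat "," (List.assoc "c" sels))
           | None -> print_endline "no match")
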
@@ -251,40 +221,39 @@ let create_token (matching:token_env list) lemma interp = (* FIXME: problem z na
     beg=beg;
     len=len;
     next=t.next;
-    token=ENIAMtokens.make_lemma (lemma,interp);
+    token=Lemma(lemma,cat,[Xlist.map interp (function
+        S s -> (try Xlist.assoc sels s with Not_found -> ["_"])
+      | V s -> [s]
+      | G -> ["_"])]);
     weight=0.; (* FIXME: dodać wagi do konkretnych reguł i uwzględnić wagi maczowanych tokenów *)
     attrs=ENIAMtokens.merge_attrs l}

 let add_token paths t =
   let map = try IntMap.find paths t.beg with Not_found -> IntMap.empty in
-  let map = IntMap.add_inc map t.next [t] (fun l -> t :: l) in
+  let map = IntMap.add_inc map t.next (TokenEnvSet.singleton t) (fun set -> TokenEnvSet.add set t) in
   IntMap.add paths t.beg map

-let apply_rule paths (match_list,lemma,interp) =
+let apply_rule paths (match_list,lemma,cat,interp) =
   (* print_endline ("apply_rule: " ^ lemma); *)
   let matchings_found = match_path paths match_list in
-  Xlist.fold matchings_found paths (fun paths matching ->
+  Xlist.fold matchings_found paths (fun paths (matching,sels) ->
     try
-      let token = create_token matching lemma interp in
+      let token = create_token matching sels lemma cat interp in
       add_token paths token
     with Not_found -> paths)

-(* FIXME: reguły dla ordnum powinny maczować część mowy a nie tylko orth *)
-let add_ordnum_rules rules paths =
-  let orths = get_intnum_orths paths in
-  StringMap.fold orths rules (fun rules orth lemmas ->
-    StringSet.fold lemmas rules (fun rules lemma ->
-      (* Printf.printf "%s %s\n%!" orth lemma; *)
-      ([orth;"."],lemma,"ordnum") :: rules))
-
 let process (paths,last) =
   let paths = Xlist.fold paths IntMap.empty add_token in
-  let orths = get_orths paths in
-  let rules = preselect_dict orths mwe_dict in
-  let rules = add_ordnum_rules rules paths in
+  let rules = select_rules paths mwe_dict mwe_dict2 in
+  let paths = Xlist.fold rules paths apply_rule in
+  let rules = select_rules paths mwe_dict mwe_dict2 in
+  let paths = Xlist.fold rules paths apply_rule in
+  let rules = select_rules paths mwe_dict mwe_dict2 in
+  let paths = Xlist.fold rules paths apply_rule in
+  let rules = select_rules paths mwe_dict mwe_dict2 in
   let paths = Xlist.fold rules paths apply_rule in
   let paths = IntMap.fold paths [] (fun paths _ map ->
     IntMap.fold map paths (fun paths _ l ->
-      Xlist.fold l paths (fun paths t ->
+      TokenEnvSet.fold l paths (fun paths t ->
         t :: paths))) in
   ENIAMpaths.sort (paths,last)
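The rewritten process reselects and reapplies the rules four times in a row, so a multi-word token built in one pass can itself take part in a longer match in the next pass. The repeated select/apply shape can be read as a bounded iteration of one step over the path graph; the sketch below is illustrative only, with a toy step standing in for "select_rules then apply_rule".

    let rec iterate n step state =
      if n <= 0 then state else iterate (n - 1) step (step state)

    let () =
      (* toy step standing in for one select_rules/apply_rule pass *)
      let passes = iterate 4 (fun n -> n + 1) 0 in
      Printf.printf "applied %d select/apply passes\n" passes
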
subsyntax/ENIAMsubsyntaxTypes.ml
@@ -48,6 +48,9 @@ let brev_filename = resource_path ^ "/subsyntax/brev.tab"
 let fixed_filename = resource_path ^ "/Walenty/fixed.tab"
 let complete_entries_filename = resource_path ^ "/subsyntax/complete_entries.tab"
 let mwe_filename = resource_path ^ "/subsyntax/mwe.tab"
+let sawa_filename = resource_path ^ "/subsyntax/SAWA.dic"
+let sejf_filename = resource_path ^ "/subsyntax/SEJF.dic"
+let sejfek_filename = resource_path ^ "/subsyntax/SEJFEK.dic"

 let lemma_frequencies_filename = resource_path ^ "/subsyntax/NKJP1M-lemma-freq.tab"

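The three .dic constants added here point at the SEJF, SEJFEK and SAWA lexicons consumed by load_mwe_dict2 in ENIAM_MWE.ml above; each line is expected to carry a tab-separated surface pattern and lemma description, with optional (...) interpretation and {...} grouping markers. A rough, stdlib-only sketch of that line shape; parse_dic_line and the sample entry are invented, and the real annotation syntax of the lexicons may differ.

    let parse_dic_line line =
      match String.split_on_char '\t' line with
      | [orths; lemma] -> String.split_on_char ' ' orths, lemma
      | _ -> failwith ("parse_dic_line: " ^ line)

    let () =
      let orths, lemma = parse_dic_line "na(prep) czas(subst:$c)\tna czas(adv)" in
      Printf.printf "%d pattern words, lemma field: %s\n" (List.length orths) lemma
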
subsyntax/TODO
-- dokończyć rozpoznawanie MWE
-- dodać zasoby MWE
 - rozpoznawanie MWE ze Słowosieci
+- kompresowanie tokenów mających indentyczne lematy (albo po przetworzeniu, albo kompresowanie interpretacji przed rozpoznaniem mwe)

 - jak przetwarzać num:comp
 - przetwarzanie liczebników złożonych np dwadzieścia jeden, jedna druga
subsyntax/makefile
@@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt
 OCAMLDEP=ocamldep
 INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam
 OCAMLFLAGS=$(INCLUDES) -g
-OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa
+OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa #eniam-subsyntax.cmxa
 INSTALLDIR=`ocamlc -where`/eniam

 SOURCES= ENIAMsubsyntaxTypes.ml ENIAMsubsyntaxStringOf.ml ENIAMsubsyntaxHTMLof.ml ENIAMsubsyntaxXMLof.ml ENIAMsubsyntaxGraphOf.ml ENIAMpaths.ml ENIAM_MWE.ml ENIAMsentences.ml ENIAMsubsyntax.ml
@@ -32,8 +32,8 @@ eniam-subsyntax.cma: $(SOURCES)
 eniam-subsyntax.cmxa: $(SOURCES)
 	ocamlopt -linkall -a -o eniam-subsyntax.cmxa $(INCLUDES) $^

-test: test.ml
-	$(OCAMLOPT) -o test $(OCAMLOPTFLAGS) test.ml
+test: $(SOURCES) test.ml
+	$(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $(SOURCES) test.ml

 interface: interface.ml
 	$(OCAMLOPT) -o subsyntax $(OCAMLOPTFLAGS) interface.ml
subsyntax/resources/README
-File NKJP1M-lemma-freq.tab in this folder was created on the basis of
+File NKJP1M-lemma-freq.tab in this folder was created on the basis of

 NKJP1M: the manually annotated 1-million word subcorpus sampled
 from texts of a subset of the National Corpus of Polish.
 version 1.2

-File proper_names_sgjp_polimorf_20151020.tab in this folder were created on the basis of
+File proper_names_sgjp_polimorf_20151020.tab in this folder were created on the basis of

 SGJP: Grammatical Dictionary of Polish, version 20151020
 Copyright © 2007–2015 Zygmunt Saloni, Włodzimierz Gruszczyński, Marcin
 Woliński, Robert Wołosz, Danuta Skowrońska

-and also on the basis of
+and also on the basis of
 PoliMorf, version 20151020
+
+File SEJF.dic is created on the basis of
+Grammatical Lexicon of Polish Phraseology
+Copyright © Monika Czerepowicka, Agata Savary
+Copyright © Institute of Computer Science Polish Academy of Sciences
+The data are available under the CC BY-SA license.
+
+File SEJFEK.dic is created on the basis of
+Grammatical Lexicon of Polish Economic Phraseology
+Copyright © Filip Makowiecki, Agata Savary
+Copyright © Institute of Computer Science Polish Academy of Sciences
+The data are available under the CC BY-SA license.
+
+File SAWA.dic is created on the basis of
+Grammatical Lexicon of Warsaw Urban Proper Names
+Copyright © Małgorzata Marciniak, Celina Heliasz, Joanna Rabiega-Wiśniewska, Piotr Sikora, Marcin Woliński, Agata Savary
+Copyright © Institute of Computer Science Polish Academy of Sciences
+The data are available under the CC BY-SA license.