Commit 2f308cb1e0849b51145883f2b46db45c41f33574

Authored by Wojciech Jaworski
1 parent 766cb2a4

dodanie zasobów z jednostkami wielosłownymi

Too many changes to show.

To preserve performance only 6 of 12 files are displayed.

morphology2/TODO
@@ -2,3 +2,4 @@ zintegrować z modelem probabilistycznym i dokończyć @@ -2,3 +2,4 @@ zintegrować z modelem probabilistycznym i dokończyć
2 dodać tagger 2 dodać tagger
3 usunąć reguły praet z dołączonym aglutynatem! 3 usunąć reguły praet z dołączonym aglutynatem!
4 i usunąć excluded interps z subsyntax 4 i usunąć excluded interps z subsyntax
  5 +dodać morfeusz_suplementy wydobyte z zasobów MWE
subsyntax/ENIAM_MWE.ml
@@ -21,227 +21,197 @@ open Xstd @@ -21,227 +21,197 @@ open Xstd
21 open ENIAMsubsyntaxTypes 21 open ENIAMsubsyntaxTypes
22 open ENIAMtokenizerTypes 22 open ENIAMtokenizerTypes
23 23
24 -let load_dict dict filename = 24 +type sel = V of string | S of string | G
  25 +
  26 +type t =
  27 + L of string * string * sel list
  28 + | O of string
  29 + | D of string * string
  30 +
  31 +let process_interp lemma interp =
  32 + match Xstring.split ":" interp with
  33 + cat :: interp -> L(lemma,cat,Xlist.map interp (function
  34 + "$c" -> S "c"
  35 + | "$n" -> S "n"
  36 + | "$g" -> S "g"
  37 + | "$d" -> S "d"
  38 + | "$C" -> S "C"
  39 + | "_" -> G
  40 + | s -> if String.get s 0 = '$' then failwith ("process_interp: " ^ s) else V s))
  41 + | _ -> failwith "process_interp"
  42 +
  43 +let load_mwe_dict dict filename =
25 File.fold_tab filename dict (fun dict -> function 44 File.fold_tab filename dict (fun dict -> function
26 - [orth; lemma; interp] ->  
27 - let s = List.hd (Str.split_delim (Str.regexp " ") orth) in  
28 - StringMap.add_inc dict s [orth,lemma,interp] (fun l -> (orth,lemma,interp) :: l) 45 + [orths; lemma; interp] ->
  46 + let orths = Xstring.split " " orths in
  47 + if orths = [] then failwith "load_mwe_dict" else
  48 + let s = List.hd orths in
  49 + let orths = Xlist.map orths (fun s -> O s) in
  50 + let lemma,cat,interp = match process_interp lemma interp with
  51 + L(lemma,cat,interp) -> lemma,cat,interp
  52 + | _ -> failwith "load_mwe_dict2" in
  53 + StringMap.add_inc dict s [orths,lemma,cat,interp] (fun l -> (orths,lemma,cat,interp) :: l)
29 | l -> failwith ("load_mwe_dict '" ^ String.concat "\t" l ^ "'")) 54 | l -> failwith ("load_mwe_dict '" ^ String.concat "\t" l ^ "'"))
30 55
31 -let mwe_dict =  
32 - let dict = load_dict StringMap.empty brev_filename in  
33 - let dict = try load_dict dict fixed_filename with _ -> (prerr_endline ("ENIAMsubsyntax file " ^ fixed_filename ^ " not found"); dict) in  
34 -(* let dict = load_dict dict complete_entries_filename in*)  
35 - let dict = load_dict dict mwe_filename in  
36 - dict  
37 -  
38 -let preselect_dict orths dict =  
39 - StringSet.fold orths [] (fun rules orth ->  
40 - try  
41 - let l = StringMap.find dict orth in  
42 - Xlist.fold l rules (fun rules (orth,lemma,interp) ->  
43 - (* print_endline ("preselect_dict: " ^ orth); *)  
44 - let match_list = Str.split (Str.regexp " ") orth in  
45 - let b = Xlist.fold match_list true (fun b s ->  
46 - (* if not (StringSet.mem orths s) then print_endline s; *)  
47 - StringSet.mem orths s && b) in  
48 - if b then (match_list,lemma,interp) :: rules else rules)  
49 - with Not_found -> rules)  
50 -  
51 -  
52 -(*  
53 -type matching = {  
54 - prefix: tokens list;  
55 - matched: token_record list;  
56 - suffix: tokens list;  
57 - pattern: pat list;  
58 - command: token_record list -> token;  
59 - last: int  
60 - }  
61 -  
62 -let rec find_abr_pattern_tail matchings found = function  
63 - [] -> found  
64 - | token :: l ->  
65 - let matchings,found = Xlist.fold matchings ([],found) (fun (matchings,found) matching ->  
66 - match matching.pattern with  
67 - [pat] ->  
68 - let matchings = if token.beg <= matching.last then matching :: matchings else matchings in  
69 - if PrePatterns.match_token (pat,token.token) && token.beg = matching.last then  
70 - matchings, {matching with matched = token :: matching.matched; last=token.next; pattern=[]} :: found else  
71 - matchings, found  
72 - | pat :: pattern ->  
73 - let matchings = if token.beg <= matching.last then matching :: matchings else matchings in  
74 - if PrePatterns.match_token (pat,token.token) && token.beg = matching.last then  
75 - {matching with matched = token :: matching.matched; last=token.next; pattern=pattern} :: matchings, found else  
76 - matchings, found  
77 - | [] -> matchings, matching :: found) in  
78 - if matchings = [] then found else find_abr_pattern_tail matchings found l  
79 -  
80 -let rec find_abr_pattern all_matchings found = function  
81 - token :: l ->  
82 - let matchings = Xlist.fold all_matchings [] (fun matchings matching ->  
83 - match matching.pattern with  
84 - pat :: pattern ->  
85 - (if PrePatterns.match_token (pat,token.token) then  
86 - [{matching with matched = token :: matching.matched; last=token.next; pattern=pattern}] else []) @ matchings  
87 - | _ -> failwith "find_abr_pattern: ni") in  
88 - let found = if matchings = [] then found else find_abr_pattern_tail matchings found l in  
89 - find_abr_pattern all_matchings found l  
90 - | [] -> found  
91 -  
92 -let rec make_abr_orth = function  
93 - [] -> ""  
94 - | [t] -> t.orth  
95 - | t :: l -> if t.beg + t.len = t.next then t.orth ^ (make_abr_orth l) else t.orth ^ " " ^ (make_abr_orth l)  
96 -  
97 -let find_abr_patterns patterns tokens =  
98 - let found = find_abr_pattern (Xlist.map patterns (fun pattern ->  
99 - {prefix=[]; matched=[]; suffix=[]; pattern=pattern; command=(fun _ -> Symbol ""); last=0})) [] tokens in  
100 - Xlist.rev_map found (fun matching ->  
101 - let t1 = List.hd (List.rev matching.matched) in  
102 - let t2 = List.hd matching.matched in  
103 - t1.beg,  
104 - t2.beg + t2.len - t1.beg,  
105 - t2.next,  
106 - make_abr_orth (List.rev matching.matched))  
107 -  
108 -let split_interp line gloss interp =  
109 - if interp = "xxx" then [gloss, "xxx"] else  
110 - Xlist.map (Str.split (Str.regexp " ") interp) (fun s ->  
111 - match Str.split (Str.regexp "|") s with  
112 - [lemma;interp] -> lemma, interp  
113 - | _ -> failwith ("bad brev entry: " ^ line))  
114 -  
115 -let load_brev_dict () =  
116 - let lines = File.load_lines "data/brev_20151215.tab" in  
117 - List.rev (Xlist.rev_map lines (fun line ->  
118 - match Str.split_delim (Str.regexp "\t") line with  
119 - [_; orth; gloss; interp; _] -> Str.split (Str.regexp " ") orth, split_interp line gloss interp  
120 - | [_; orth; gloss; interp] -> Str.split (Str.regexp " ") orth, split_interp line gloss interp  
121 - | _ -> failwith ("load_brev_dict: " ^ line)))  
122 -  
123 -let parse_lemma lemma =  
124 - if lemma = ":" then lemma,"" else  
125 - match Str.split (Str.regexp ":") lemma with  
126 - [x] -> x,""  
127 - | [x;y] -> x,y  
128 - | _ -> failwith ("parse_lemma: " ^ lemma)  
129 -  
130 -let make_orths orth beg len lexeme_postags_list =  
131 - let n = Xlist.size lexeme_postags_list in  
132 - let orth_list =  
133 - if n = 1 then [orth,beg,len] else  
134 - List.rev (Int.fold 1 n [] (fun l i ->  
135 - (orth ^ "_" ^ string_of_int i,  
136 - (if i=1 then beg else beg+len-n+i-1),  
137 - if i=1 then len-n+1 else 1) :: l)) in  
138 - List.rev (Xlist.fold (List.combine orth_list lexeme_postags_list) [] (fun orth_list ((orth,beg,len),(lemma,postags)) ->  
139 - (orth, fst (parse_lemma lemma), ENIAMtokens.parse_postags postags, beg, len) :: orth_list))  
140 -  
141 -let brev_dict = load_brev_dict ()  
142 -  
143 -(* FIXME: trzeba zmienić reprezentację skrótów nazw własnych: przenieść do mwe,  
144 - Gdy skrót jest częścią nazwy własnej powinien być dalej przetwarzalny *)  
145 -let process_brev paths (*tokens*) = paths  
146 -(* let paths = Xlist.fold brev_dict paths (fun paths (pattern,lexeme_postags_list) ->  
147 - let matchings_found = find_abr_patterns [Xlist.map pattern (fun pat -> O pat)] tokens in  
148 - Xlist.fold matchings_found paths (fun paths (beg,len,next,orth) ->  
149 - let orths = make_orths orth beg len lexeme_postags_list in  
150 - ENIAMpaths.add_path paths beg next orths)) in  
151 - paths*)  
152 -  
153 -let rec preselect_mwe_dict_token set = function  
154 - SmallLetter orth -> StringSet.add set orth  
155 - | CapLetter(orth,lc) -> StringSet.add set orth  
156 - | AllSmall orth -> StringSet.add set orth  
157 - | AllCap(orth,lc,lc2) -> StringSet.add set orth  
158 - | FirstCap(orth,lc,_,_) -> StringSet.add set orth  
159 - | SomeCap orth -> StringSet.add set orth  
160 - | Symbol orth -> StringSet.add set orth  
161 - | Dig(v,"dig") -> StringSet.add set v  
162 - | Other2 orth -> StringSet.add set orth  
163 - | _ -> set  
164 -  
165 -let rec preselect_mwe_dict_tokens set = function  
166 - Token t -> preselect_mwe_dict_token set t.token  
167 - | Seq l -> Xlist.fold l set preselect_mwe_dict_tokens  
168 - | Variant l -> Xlist.fold l set preselect_mwe_dict_tokens  
169 -  
170 -let preselect_mwe_dict mwe_dict tokens =  
171 - let set = Xlist.fold tokens StringSet.empty preselect_mwe_dict_tokens in  
172 - let set = StringSet.fold set StringSet.empty (fun set orth ->  
173 - try  
174 - let l = StringMap.find mwe_dict orth in  
175 - Xlist.fold l set StringSet.add  
176 - with Not_found -> set) in  
177 -(* StringSet.iter set print_endline; *)  
178 - StringSet.fold set [] (fun l s ->  
179 - match Str.split_delim (Str.regexp "\t") s with  
180 - [lemma; interp; sense] ->  
181 - (match Str.split_delim (Str.regexp ":") interp with  
182 - orths :: tags -> (Str.split (Str.regexp " ") orths, lemma, String.concat ":" tags, sense) :: l  
183 - | _ -> failwith "preselect_mwe_dict")  
184 - | _ -> failwith "preselect_mwe_dict")  
185 -  
186 -let simplify_lemma lemma =  
187 - match Str.split (Str.regexp "-") lemma with  
188 - [x;"1"] -> x  
189 - | [x;"2"] -> x  
190 - | [x;"3"] -> x  
191 - | [x;"4"] -> x  
192 - | [x;"5"] -> x  
193 - | _ -> lemma  
194 -  
195 -let mwe_dict = load_mwe_dict ()  
196 -  
197 -let process_mwe paths (*tokens*) = paths  
198 -(* let mwe_dict = preselect_mwe_dict mwe_dict tokens in  
199 - let paths = Xlist.fold mwe_dict paths (fun paths (pattern,lexeme,interp,sense) ->  
200 - let matchings_found = find_abr_patterns [Xlist.map pattern (fun pat -> O pat)] tokens in  
201 - Xlist.fold matchings_found paths (fun paths (beg,len,next,orth) ->  
202 - let orths = make_orths orth beg len [simplify_lemma lexeme,interp] in  
203 - ENIAMpaths.add_path paths beg next orths)) in  
204 - paths*)  
205 -*) 56 +let process_orth = function
  57 + [Lexer.T lemma; Lexer.B("(",")",[Lexer.T interp])] -> process_interp lemma interp
  58 + | [Lexer.T orth] -> O orth
  59 + | [Lexer.B("{","}",l); Lexer.B("(",")",[Lexer.T interp])] -> process_interp (Lexer.string_of_token_list l) interp
  60 + | [Lexer.B("{","}",l)] -> O(Lexer.string_of_token_list l)
  61 + | tokens -> failwith ("process_orth: " ^ Lexer.string_of_token_list tokens)
  62 +
  63 +let load_mwe_dict2 (dict,dict2) filename =
  64 + File.fold_tab filename (dict,dict2) (fun (dict,dict2) -> function
  65 + [orths; lemma] ->
  66 + (* print_endline (orths ^ "\t" ^ lemma); *)
  67 + let tokens = Lexer.split "(\\|)\\|{\\|}\\| " orths in
  68 + (* print_endline ("load_dict2 1: " ^ Lexer.string_of_token_list tokens); *)
  69 + let tokens = Lexer.find_brackets ["{","}";"(",")"] [] tokens in
  70 + (* print_endline ("load_dict2 2: " ^ Lexer.string_of_token_list tokens); *)
  71 + let orths = List.rev (Xlist.rev_map (Lexer.split_symbol (Lexer.T " ") [] tokens) process_orth) in
  72 + let tokens = Lexer.split "(\\|)\\|{\\|}" lemma in
  73 + (* print_endline ("load_dict2 3: " ^ Lexer.string_of_token_list tokens); *)
  74 + let tokens = Lexer.find_brackets ["{","}";"(",")"] [] tokens in
  75 + (* print_endline ("load_dict2 4: " ^ Lexer.string_of_token_list tokens); *)
  76 + let lemma,cat,interp = match process_orth tokens with
  77 + L(lemma,cat,interp) -> lemma,cat,interp
  78 + | _ -> failwith "load_mwe_dict2" in
  79 + if orths = [] then failwith "load_mwe_dict2" else
  80 + (match List.hd orths with
  81 + L(s,_,_) -> dict, StringMap.add_inc dict2 s [orths,lemma,cat,interp] (fun l -> (orths,lemma,cat,interp) :: l)
  82 + | O s -> StringMap.add_inc dict s [orths,lemma,cat,interp] (fun l -> (orths,lemma,cat,interp) :: l), dict2
  83 + | D _ -> failwith "load_mwe_dict2")
  84 + | l -> failwith ("load_mwe_dict2 '" ^ String.concat "\t" l ^ "'"))
  85 +
  86 +let mwe_dict,mwe_dict2 =
  87 + let dict = load_mwe_dict StringMap.empty brev_filename in
  88 + let dict = try load_mwe_dict dict fixed_filename with _ -> (prerr_endline ("ENIAMsubsyntax file " ^ fixed_filename ^ " not found"); dict) in
  89 + let dict = load_mwe_dict dict mwe_filename in
  90 + let dict,dict2 = load_mwe_dict2 (dict,StringMap.empty) sejf_filename in
  91 + let dict,dict2 = load_mwe_dict2 (dict,dict2) sejfek_filename in
  92 + let dict,dict2 = load_mwe_dict2 (dict,dict2) sawa_filename in
  93 + dict,dict2
206 94
207 let get_orths paths = 95 let get_orths paths =
208 IntMap.fold paths StringSet.empty (fun orths _ map -> 96 IntMap.fold paths StringSet.empty (fun orths _ map ->
209 IntMap.fold map orths (fun orths _ l -> 97 IntMap.fold map orths (fun orths _ l ->
210 - Xlist.fold l orths (fun orths t -> 98 + TokenEnvSet.fold l orths (fun orths t ->
211 StringSet.add orths (ENIAMtokens.get_orth t.token)))) 99 StringSet.add orths (ENIAMtokens.get_orth t.token))))
212 100
  101 +let get_lemmas paths =
  102 + IntMap.fold paths StringSet.empty (fun orths _ map ->
  103 + IntMap.fold map orths (fun orths _ l ->
  104 + TokenEnvSet.fold l orths (fun orths t ->
  105 + StringSet.add orths (ENIAMtokens.get_lemma t.token))))
  106 +
213 let get_intnum_orths paths = 107 let get_intnum_orths paths =
214 IntMap.fold paths StringMap.empty (fun orths _ map -> 108 IntMap.fold paths StringMap.empty (fun orths _ map ->
215 IntMap.fold map orths (fun orths _ l -> 109 IntMap.fold map orths (fun orths _ l ->
216 - Xlist.fold l orths (fun orths t -> 110 + TokenEnvSet.fold l orths (fun orths t ->
217 match t.token with 111 match t.token with
218 Dig(lemma,"intnum") -> StringMap.add_inc orths (ENIAMtokens.get_orth t.token) (StringSet.singleton lemma) (fun set -> StringSet.add set lemma) 112 Dig(lemma,"intnum") -> StringMap.add_inc orths (ENIAMtokens.get_orth t.token) (StringSet.singleton lemma) (fun set -> StringSet.add set lemma)
219 | _ -> orths))) 113 | _ -> orths)))
220 114
221 -let rec match_path_rec map found (t:token_env) rev = function  
222 - [] -> (t :: rev) :: found 115 +let preselect orths lemmas rules l =
  116 + Xlist.fold l rules (fun rules (match_list,lemma,cat,interp) ->
  117 + let b = Xlist.fold match_list true (fun b -> function
  118 + O s -> StringSet.mem orths s && b
  119 + | L(s,_,_) -> StringSet.mem lemmas s && b
  120 + | D(s,_) -> failwith "preselect") in
  121 + if b then (match_list,lemma,cat,interp) :: rules else rules)
  122 +
  123 +let preselect_dict orths lemmas dict rules =
  124 + StringSet.fold orths rules (fun rules orth ->
  125 + try
  126 + preselect orths lemmas rules (StringMap.find dict orth)
  127 + with Not_found -> rules)
  128 +
  129 +let preselect_dict2 orths lemmas dict2 rules =
  130 + StringSet.fold lemmas rules (fun rules lemma ->
  131 + try
  132 + preselect orths lemmas rules (StringMap.find dict2 lemma)
  133 + with Not_found -> rules)
  134 +
  135 +let add_ordnum_rules orths rules =
  136 + StringMap.fold orths rules (fun rules orth lemmas ->
  137 + StringSet.fold lemmas rules (fun rules lemma ->
  138 + (* Printf.printf "%s %s\n%!" orth lemma; *)
  139 + ([D(orth,"intnum");O "."],lemma,"ordnum",[]) :: rules))
  140 +
  141 +let select_rules paths mwe_dict mwe_dict2 =
  142 + let orths = get_orths paths in
  143 + let lemmas = get_lemmas paths in
  144 + let intnum_orths = get_intnum_orths paths in
  145 + let rules = preselect_dict orths lemmas mwe_dict [] in
  146 + let rules = preselect_dict2 orths lemmas mwe_dict2 rules in
  147 + let rules = add_ordnum_rules intnum_orths rules in
  148 + rules
  149 +
  150 +let rec check_interp sels = function
  151 + [],[] -> true
  152 + | s :: interp, ["_"] :: interp2 -> check_interp sels (interp,interp2)
  153 + | V s :: interp, l2 :: interp2 -> if Xlist.mem l2 s then check_interp sels (interp,interp2) else false
  154 + | S s :: interp, l2 :: interp2 ->
  155 + (try
  156 + let l = Xlist.assoc sels s in
  157 + let b = Xlist.fold l false (fun b s -> Xlist.mem l2 s || b) in
  158 + if b then check_interp sels (interp,interp2) else false
  159 + with Not_found -> check_interp sels (interp,interp2))
  160 + | G :: interp, l2 :: interp2 -> check_interp sels (interp,interp2)
  161 + | _ -> failwith "check_interp"
  162 +
  163 +let rec get_sels sels = function
  164 + [],[] -> sels
  165 + | s :: interp, ["_"] :: interp2 -> get_sels sels (interp,interp2)
  166 + | V s :: interp, l2 :: interp2 -> get_sels sels (interp,interp2)
  167 + | S s :: interp, l2 :: interp2 ->
  168 + (try
  169 + let l = Xlist.assoc sels s in
  170 + let sels = List.remove_assoc s sels in
  171 + let l = Xlist.fold l [] (fun l s -> if Xlist.mem l2 s then s :: l else l) in
  172 + get_sels ((s,l) :: sels) (interp,interp2)
  173 + with Not_found -> get_sels ((s,l2) :: sels) (interp,interp2))
  174 + | G :: interp, l2 :: interp2 -> get_sels sels (interp,interp2)
  175 + | _ -> failwith "get_sels"
  176 +
  177 +let rec match_path_rec map found (t:token_env) sels rev = function
  178 + [] -> (t :: rev, sels) :: found
223 | s :: l -> 179 | s :: l ->
224 let map2 = try IntMap.find map t.next with Not_found -> IntMap.empty in 180 let map2 = try IntMap.find map t.next with Not_found -> IntMap.empty in
225 let found2 = IntMap.fold map2 [] (fun found2 _ l -> 181 let found2 = IntMap.fold map2 [] (fun found2 _ l ->
226 - Xlist.fold l found2 (fun found2 new_t ->  
227 - if ENIAMtokens.get_orth new_t.token = s then new_t :: found2 else found2)) in  
228 - Xlist.fold found2 found (fun found new_t -> match_path_rec map found new_t (t :: rev) l) 182 + TokenEnvSet.fold l found2 (fun found2 new_t ->
  183 + match s,new_t.token with
  184 + O s, token -> if ENIAMtokens.get_orth token = s then (new_t,sels) :: found2 else found2
  185 + | L(s,cat,interp), Lemma(s2,cat2,interps2) ->
  186 + Xlist.fold interps2 found2 (fun found2 interp2 ->
  187 + if s=s2 && cat=cat2 && check_interp sels (interp,interp2) then
  188 + (new_t,get_sels sels (interp,interp2)) :: found2 else found2)
  189 + | D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (new_t,sels) :: found2 else found2
  190 + | _ -> found2)) in
  191 + Xlist.fold found2 found (fun found (new_t,sels) -> match_path_rec map found new_t sels (t :: rev) l)
229 192
230 let match_path map = function 193 let match_path map = function
231 [] -> failwith "match_path" 194 [] -> failwith "match_path"
232 | s :: l -> 195 | s :: l ->
233 let found = IntMap.fold map [] (fun found i map2 -> 196 let found = IntMap.fold map [] (fun found i map2 ->
234 IntMap.fold map2 found (fun found j l -> 197 IntMap.fold map2 found (fun found j l ->
235 - Xlist.fold l found (fun found t ->  
236 - if ENIAMtokens.get_orth t.token = s then t :: found else found))) in  
237 - Xlist.fold found [] (fun found t -> match_path_rec map found t [] l) 198 + TokenEnvSet.fold l found (fun found t ->
  199 + match s,t.token with
  200 + O s, token -> if ENIAMtokens.get_orth token = s then (t,[]) :: found else found
  201 + | L(s,cat,interp), Lemma(s2,cat2,interps2) ->
  202 + Xlist.fold interps2 found (fun found interp2 ->
  203 + if s=s2 && cat=cat2 && check_interp [] (interp,interp2) then
  204 + (t,get_sels [] (interp,interp2)) :: found else found)
  205 + | D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (t,[]) :: found else found
  206 + | _ -> found))) in
  207 + Xlist.fold found [] (fun found (t,sels) -> match_path_rec map found t sels [] l)
238 208
239 let concat_orths l = 209 let concat_orths l =
240 let s = String.concat "" (Xlist.map l (fun t -> t.orth ^ (if t.beg+t.len=t.next then "" else " "))) in 210 let s = String.concat "" (Xlist.map l (fun t -> t.orth ^ (if t.beg+t.len=t.next then "" else " "))) in
241 let n = Xstring.size s in 211 let n = Xstring.size s in
242 if String.get s (n-1) = ' ' then String.sub s 0 (n-1) else s 212 if String.get s (n-1) = ' ' then String.sub s 0 (n-1) else s
243 213
244 -let create_token (matching:token_env list) lemma interp = (* FIXME: problem z nazwami własnymi *) 214 +let create_token (matching:token_env list) sels lemma cat interp = (* FIXME: problem z nazwami własnymi *)
245 let l = List.rev matching in 215 let l = List.rev matching in
246 let beg = (List.hd l).beg in 216 let beg = (List.hd l).beg in
247 let t = List.hd matching in 217 let t = List.hd matching in
@@ -251,40 +221,39 @@ let create_token (matching:token_env list) lemma interp = (* FIXME: problem z na @@ -251,40 +221,39 @@ let create_token (matching:token_env list) lemma interp = (* FIXME: problem z na
251 beg=beg; 221 beg=beg;
252 len=len; 222 len=len;
253 next=t.next; 223 next=t.next;
254 - token=ENIAMtokens.make_lemma (lemma,interp); 224 + token=Lemma(lemma,cat,[Xlist.map interp (function
  225 + S s -> (try Xlist.assoc sels s with Not_found -> ["_"])
  226 + | V s -> [s]
  227 + | G -> ["_"])]);
255 weight=0.; (* FIXME: dodać wagi do konkretnych reguł i uwzględnić wagi maczowanych tokenów *) 228 weight=0.; (* FIXME: dodać wagi do konkretnych reguł i uwzględnić wagi maczowanych tokenów *)
256 attrs=ENIAMtokens.merge_attrs l} 229 attrs=ENIAMtokens.merge_attrs l}
257 230
258 let add_token paths t = 231 let add_token paths t =
259 let map = try IntMap.find paths t.beg with Not_found -> IntMap.empty in 232 let map = try IntMap.find paths t.beg with Not_found -> IntMap.empty in
260 - let map = IntMap.add_inc map t.next [t] (fun l -> t :: l) in 233 + let map = IntMap.add_inc map t.next (TokenEnvSet.singleton t) (fun set -> TokenEnvSet.add set t) in
261 IntMap.add paths t.beg map 234 IntMap.add paths t.beg map
262 235
263 -let apply_rule paths (match_list,lemma,interp) = 236 +let apply_rule paths (match_list,lemma,cat,interp) =
264 (* print_endline ("apply_rule: " ^ lemma); *) 237 (* print_endline ("apply_rule: " ^ lemma); *)
265 let matchings_found = match_path paths match_list in 238 let matchings_found = match_path paths match_list in
266 - Xlist.fold matchings_found paths (fun paths matching -> 239 + Xlist.fold matchings_found paths (fun paths (matching,sels) ->
267 try 240 try
268 - let token = create_token matching lemma interp in 241 + let token = create_token matching sels lemma cat interp in
269 add_token paths token 242 add_token paths token
270 with Not_found -> paths) 243 with Not_found -> paths)
271 244
272 -(* FIXME: reguły dla ordnum powinny maczować część mowy a nie tylko orth *)  
273 -let add_ordnum_rules rules paths =  
274 - let orths = get_intnum_orths paths in  
275 - StringMap.fold orths rules (fun rules orth lemmas ->  
276 - StringSet.fold lemmas rules (fun rules lemma ->  
277 - (* Printf.printf "%s %s\n%!" orth lemma; *)  
278 - ([orth;"."],lemma,"ordnum") :: rules))  
279 -  
280 let process (paths,last) = 245 let process (paths,last) =
281 let paths = Xlist.fold paths IntMap.empty add_token in 246 let paths = Xlist.fold paths IntMap.empty add_token in
282 - let orths = get_orths paths in  
283 - let rules = preselect_dict orths mwe_dict in  
284 - let rules = add_ordnum_rules rules paths in 247 + let rules = select_rules paths mwe_dict mwe_dict2 in
  248 + let paths = Xlist.fold rules paths apply_rule in
  249 + let rules = select_rules paths mwe_dict mwe_dict2 in
  250 + let paths = Xlist.fold rules paths apply_rule in
  251 + let rules = select_rules paths mwe_dict mwe_dict2 in
  252 + let paths = Xlist.fold rules paths apply_rule in
  253 + let rules = select_rules paths mwe_dict mwe_dict2 in
285 let paths = Xlist.fold rules paths apply_rule in 254 let paths = Xlist.fold rules paths apply_rule in
286 let paths = IntMap.fold paths [] (fun paths _ map -> 255 let paths = IntMap.fold paths [] (fun paths _ map ->
287 IntMap.fold map paths (fun paths _ l -> 256 IntMap.fold map paths (fun paths _ l ->
288 - Xlist.fold l paths (fun paths t -> 257 + TokenEnvSet.fold l paths (fun paths t ->
289 t :: paths))) in 258 t :: paths))) in
290 ENIAMpaths.sort (paths,last) 259 ENIAMpaths.sort (paths,last)
subsyntax/ENIAMsubsyntaxTypes.ml
@@ -48,6 +48,9 @@ let brev_filename = resource_path ^ "/subsyntax/brev.tab" @@ -48,6 +48,9 @@ let brev_filename = resource_path ^ "/subsyntax/brev.tab"
48 let fixed_filename = resource_path ^ "/Walenty/fixed.tab" 48 let fixed_filename = resource_path ^ "/Walenty/fixed.tab"
49 let complete_entries_filename = resource_path ^ "/subsyntax/complete_entries.tab" 49 let complete_entries_filename = resource_path ^ "/subsyntax/complete_entries.tab"
50 let mwe_filename = resource_path ^ "/subsyntax/mwe.tab" 50 let mwe_filename = resource_path ^ "/subsyntax/mwe.tab"
  51 +let sawa_filename = resource_path ^ "/subsyntax/SAWA.dic"
  52 +let sejf_filename = resource_path ^ "/subsyntax/SEJF.dic"
  53 +let sejfek_filename = resource_path ^ "/subsyntax/SEJFEK.dic"
51 54
52 let lemma_frequencies_filename = resource_path ^ "/subsyntax/NKJP1M-lemma-freq.tab" 55 let lemma_frequencies_filename = resource_path ^ "/subsyntax/NKJP1M-lemma-freq.tab"
53 56
subsyntax/TODO
1 -- dokończyć rozpoznawanie MWE  
2 -- dodać zasoby MWE  
3 - rozpoznawanie MWE ze Słowosieci 1 - rozpoznawanie MWE ze Słowosieci
  2 +- kompresowanie tokenów mających identyczne lematy (albo po przetworzeniu, albo kompresowanie interpretacji przed rozpoznaniem mwe)
4 3
5 - jak przetwarzać num:comp 4 - jak przetwarzać num:comp
6 - przetwarzanie liczebników złożonych np dwadzieścia jeden, jedna druga 5 - przetwarzanie liczebników złożonych np dwadzieścia jeden, jedna druga
subsyntax/makefile
@@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt @@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt
3 OCAMLDEP=ocamldep 3 OCAMLDEP=ocamldep
4 INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam 4 INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam
5 OCAMLFLAGS=$(INCLUDES) -g 5 OCAMLFLAGS=$(INCLUDES) -g
6 -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa 6 +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa #eniam-subsyntax.cmxa
7 INSTALLDIR=`ocamlc -where`/eniam 7 INSTALLDIR=`ocamlc -where`/eniam
8 8
9 SOURCES= ENIAMsubsyntaxTypes.ml ENIAMsubsyntaxStringOf.ml ENIAMsubsyntaxHTMLof.ml ENIAMsubsyntaxXMLof.ml ENIAMsubsyntaxGraphOf.ml ENIAMpaths.ml ENIAM_MWE.ml ENIAMsentences.ml ENIAMsubsyntax.ml 9 SOURCES= ENIAMsubsyntaxTypes.ml ENIAMsubsyntaxStringOf.ml ENIAMsubsyntaxHTMLof.ml ENIAMsubsyntaxXMLof.ml ENIAMsubsyntaxGraphOf.ml ENIAMpaths.ml ENIAM_MWE.ml ENIAMsentences.ml ENIAMsubsyntax.ml
@@ -32,8 +32,8 @@ eniam-subsyntax.cma: $(SOURCES) @@ -32,8 +32,8 @@ eniam-subsyntax.cma: $(SOURCES)
32 eniam-subsyntax.cmxa: $(SOURCES) 32 eniam-subsyntax.cmxa: $(SOURCES)
33 ocamlopt -linkall -a -o eniam-subsyntax.cmxa $(INCLUDES) $^ 33 ocamlopt -linkall -a -o eniam-subsyntax.cmxa $(INCLUDES) $^
34 34
35 -test: test.ml  
36 - $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) test.ml 35 +test: $(SOURCES) test.ml
  36 + $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $(SOURCES) test.ml
37 37
38 interface: interface.ml 38 interface: interface.ml
39 $(OCAMLOPT) -o subsyntax $(OCAMLOPTFLAGS) interface.ml 39 $(OCAMLOPT) -o subsyntax $(OCAMLOPTFLAGS) interface.ml
subsyntax/resources/README
1 -File NKJP1M-lemma-freq.tab in this folder was created on the basis of 1 +File NKJP1M-lemma-freq.tab in this folder was created on the basis of
2 2
3 NKJP1M: the manually annotated 1-million word subcorpus sampled 3 NKJP1M: the manually annotated 1-million word subcorpus sampled
4 from texts of a subset of the National Corpus of Polish. 4 from texts of a subset of the National Corpus of Polish.
5 version 1.2 5 version 1.2
6 6
7 -File proper_names_sgjp_polimorf_20151020.tab in this folder was created on the basis of 7 +File proper_names_sgjp_polimorf_20151020.tab in this folder was created on the basis of
8 8
9 SGJP: Grammatical Dictionary of Polish, version 20151020 9 SGJP: Grammatical Dictionary of Polish, version 20151020
10 Copyright © 2007–2015 Zygmunt Saloni, Włodzimierz Gruszczyński, Marcin 10 Copyright © 2007–2015 Zygmunt Saloni, Włodzimierz Gruszczyński, Marcin
11 Woliński, Robert Wołosz, Danuta Skowrońska 11 Woliński, Robert Wołosz, Danuta Skowrońska
12 12
13 -and also on the basis of 13 +and also on the basis of
14 PoliMorf, version 20151020 14 PoliMorf, version 20151020
  15 +
  16 +File SEJF.dic was created on the basis of
  17 +Grammatical Lexicon of Polish Phraseology
  18 +Copyright © Monika Czerepowicka, Agata Savary
  19 +Copyright © Institute of Computer Science Polish Academy of Sciences
  20 +The data are available under the CC BY-SA license.
  21 +
  22 +File SEJFEK.dic was created on the basis of
  23 +Grammatical Lexicon of Polish Economic Phraseology
  24 +Copyright © Filip Makowiecki, Agata Savary
  25 +Copyright © Institute of Computer Science Polish Academy of Sciences
  26 +The data are available under the CC BY-SA license.
  27 +
  28 +File SAWA.dic was created on the basis of
  29 +Grammatical Lexicon of Warsaw Urban Proper Names
  30 +Copyright © Małgorzata Marciniak, Celina Heliasz, Joanna Rabiega-Wiśniewska, Piotr Sikora, Marcin Woliński, Agata Savary
  31 +Copyright © Institute of Computer Science Polish Academy of Sciences
  32 +The data are available under the CC BY-SA license.