NKJP.ml
18.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
(*TYPE DEFINITIONS*)
type segm = {
id_seg: string; (* -; [segm_]1.1-seg; [morph_]1.1-seg; [senses_]1.1-seg *)
pos: int;
length: int option;
orth: string option;
disamb: string option;
sense: string option}
type sentence = {
id_s: string; (* -; [segm_]1.57-s; [morph_]1.57-s; [senses_]1.57-s *)
segments: segm list}
type abs = {
id_ab: string; (* [txt_]1.1-ab; -; -; - *)
contents: string option;
sentences: sentence list}
type text = {
id_source: string;
id_p: string; (* [txt_]1[-div]; [segm_]1[-p]; [morph_]1[-p]; [senses_]1[-p] *)
abs: abs list}
type 'a fold = string -> source:string list -> channel:string list -> 'a -> ('a -> text -> 'a) -> 'a
(*STRING CONVERSION*)
let string_of_some_int = function
| None -> ""
| Some x -> string_of_int x
let string_of_some = function
| None -> ""
| Some x -> x
let string_of_segm seg =
seg.id_seg ^ "; " ^
(string_of_int seg.pos) ^ "; " ^
(string_of_some_int seg.length) ^ "; " ^
string_of_some seg.orth ^ "; " ^
string_of_some seg.disamb ^ "; " ^
string_of_some seg.sense ^ "; "
let string_of_sentence s =
List.fold_left (fun x y -> x ^ "\t \t \t" ^ (string_of_segm y) ^ "\n") (s.id_s ^ ";\n") s.segments
let string_of_abs a =
List.fold_left (fun x y -> x ^ "\t \t" ^ (string_of_sentence y) ^ "\n") (a.id_ab ^ ";\n" ^ (string_of_some a.contents) ^ ";\n") a.sentences
let string_of_text t =
List.fold_left (fun x y -> x ^ "\t" ^ (string_of_abs y) ^ "\n") (t.id_source ^ ";\n" ^ t.id_p ^ ";\n") t.abs
(*COMPARISONS*)
let compare_dotted a b =
let fst id = int_of_string (NKJPxmlbasics.before_split id "\\.") in
let snd id = int_of_string (NKJPxmlbasics.after_split (NKJPxmlbasics.before_split id "-") "\\.") in
if compare (fst a) (fst b) <> 0 then compare (fst a) (fst b)
else compare (snd a) (snd b)
let compare_string a b =
if a = b then 0 else
if a < b then -1 else 1
let compare_segm id_ab1 id_ab2 segm1 segm2 =
if compare_dotted id_ab1 id_ab2 <> 0 then compare_dotted id_ab1 id_ab2
else compare segm1.pos segm2.pos
let compare_sentence id_ab1 id_ab2 sentence1 sentence2 = compare_segm id_ab1 id_ab2 (List.hd sentence1.segments) (List.hd sentence2.segments)
let compare_flat_abs abs1 abs2 =
match abs1.sentences, abs2.sentences with
| {segments = hd1 :: []; _}::[], {segments = hd2 :: []; _}::[] ->
compare_segm abs1.id_ab abs2.id_ab hd1 hd2
| _, _ -> failwith "compare_flat_abs"
let compare_abs abs1 abs2 =
let num x = int_of_string (NKJPxmlbasics.after_split (NKJPxmlbasics.before_split x "-") "\\.") in
let num1 = num abs1.id_ab in
let num2 = num abs2.id_ab in
if num1 = num2 then 0
else if num1 < num2 then -1 else 1
let compare_text text1 text2 =
if text1.id_source = text2.id_source then compare (int_of_string text1.id_p) (int_of_string text2.id_p)
else if text1.id_source < text2.id_source then -1 else 1
(*given node of xml file it compares its possible ids from xml:id and corresp and returns
the id when values mathes else prints to stdout where values are incorrect*)
let compare_id_corresp file source =
let id = NKJPxmlbasics.find_attribute source "xml:id" in
let corresp = NKJPxmlbasics.find_attribute source "corresp" in
let id_trun = NKJPxmlbasics.before_split (NKJPxmlbasics.after_split id "_") "-" in
let corresp_trun = NKJPxmlbasics.before_split (NKJPxmlbasics.after_split corresp "_") "-" in
if id_trun = corresp_trun then Some id_trun
else begin
Printf.printf "compare_id_corresp, invalid input at %s %s %s %s %s \n" file id corresp id_trun corresp_trun;
None
end
(*MERGES*)
let merge_list compare_el merge_el list1 list2 =
let sort_list1 = List.sort compare_el list1 in
let sort_list2 = List.sort compare_el list2
in
let rec submerge acc = function
| _, [] -> acc
| [], temp -> temp@acc
| hd1::tl1, hd2::tl2 ->
let c = compare_el hd1 hd2 in
if c = 0 then submerge ((merge_el hd1 hd2)::acc) (hd1::tl1, tl2)
else if c < 0 then submerge acc (tl1, hd2::tl2)
else submerge (hd2::acc) (hd1::tl1, tl2)
in
List.sort compare_el (submerge [] (sort_list1, sort_list2))
let implication a = function
| None -> a
| Some x ->
match a with
| Some y -> Some x
| None -> Some x
let merge_segm segm1 segm2 =
assert(segm1.id_seg = segm2.id_seg);
{
id_seg = segm1.id_seg;
pos = segm1.pos;
length = implication segm1.length segm2.length;
orth = implication segm1.orth segm2.orth;
disamb = implication segm1.disamb segm2.disamb;
sense = implication segm1.sense segm2.sense;
}
let merge_sentence id_ab1 id_ab2 sentence1 sentence2 =
assert(sentence1.id_s = sentence2.id_s);
{
id_s = sentence1.id_s;
segments = merge_list (compare_segm id_ab1 id_ab2) merge_segm sentence1.segments sentence2.segments
}
let merge_abs abs1 abs2 =
assert(abs1.id_ab = abs2.id_ab);
{
id_ab = abs1.id_ab;
contents = implication abs1.contents abs2.contents;
sentences = merge_list (compare_sentence abs1.id_ab abs2.id_ab) (merge_sentence abs1.id_ab abs2.id_ab) abs1.sentences abs2.sentences
}
let merge_text ~flat text1 text2 =
assert (text1.id_source = text2.id_source);
assert (text1.id_p = text2.id_p);
let abs_comparison = if flat then compare_flat_abs else compare_abs in
let abs = merge_list abs_comparison merge_abs text1.abs text2.abs in
{
id_source = text1.id_source;
id_p = text1.id_p;
abs
}
let merge_flat_text text1 text2 =
assert (text1.id_source = text2.id_source);
assert (text1.id_p = text2.id_p);
let abs = merge_list compare_flat_abs merge_abs text1.abs text2.abs in
{
id_source = text1.id_source;
id_p = text1.id_p;
abs
}
let merge_file = merge_list compare_text (merge_text ~flat:false)
let merge_flat_file = merge_list compare_text (merge_text ~flat:true)
let unflatten s =
match List.rev (List.sort compare_flat_abs s) with
| [] -> []
| hd::tl ->
let exporter x y =
match x with
| [] -> assert false
| hd::tl ->
if hd.id_ab = y.id_ab then
let sentences = match hd.sentences, y.sentences with
| [], _
| _, [] -> assert false
| hd1::tl1, hd2::[] ->
if hd1.id_s = hd2.id_s
then {id_s = hd1.id_s; segments = hd2.segments @ hd1.segments}::tl1
else hd2::hd1::tl1
| _, _ -> assert false
in
{id_ab = hd.id_ab; contents = implication hd.contents y.contents; sentences} :: tl
else y::hd::tl
in
List.fold_left exporter [hd] tl
let unflatten_file file =
let exporter = function {id_source; id_p; abs = flat_abs} -> {id_source; id_p; abs = unflatten flat_abs}
in
List.map exporter file
let flatten = function
| [] -> assert false
| l ->
let exporter x y =
(List.map (fun z -> {id_ab = x.id_ab; contents=None; sentences=[z]}) x.sentences) @ y
in
List.fold_right exporter l []
(*SYSTEM UTILITIES*)
module StringMap = Map.Make(String);;
let create_map l =
List.fold_left (fun x1 x2 -> (StringMap.add x2 true x1)) StringMap.empty l
let validate path name ~source_validate ~channel_validate =
try
let header = Xml.parse_file (path ^ name ^ "/header.xml") in
let info_list = NKJPxmlbasics.go_to header ["teiHeader";"profileDesc";"textClass";"catRef"] in
let source_predicate x = (NKJPxmlbasics.find_attribute x "scheme") = "#taxonomy-NKJP-type" in
let channel_predicate x = (NKJPxmlbasics.find_attribute x "scheme") = "#taxonomy-NKJP-channel" in
let source_info = List.hd (List.filter source_predicate info_list) in
let channel_info = List.hd (List.filter channel_predicate info_list) in
let source_res = NKJPxmlbasics.after_split (NKJPxmlbasics.find_attribute source_info "target") "_" in
let channel_res = NKJPxmlbasics.after_split (NKJPxmlbasics.find_attribute channel_info "target") "_" in
source_validate source_res && channel_validate channel_res
with
| Xml.Error err -> begin
print_endline name;
print_endline (Xml.error err);
false
end
let get_dirnames path ~source ~channel =
let source_map = create_map source in
let channel_map = create_map channel in
let source_check x =
if source = [] then true
else StringMap.mem x source_map
in
let channel_check x =
if channel = [] then true
else StringMap.mem x channel_map
in
List.fold_left
(fun folder_list folder ->
if
Sys.is_directory (path ^ folder) &&
validate path folder ~source_validate:source_check ~channel_validate:channel_check
then folder :: folder_list
else folder_list)
[]
(Array.to_list (Sys.readdir path))
(*FOLDS*)
let fold_folder_left path s f ending ~source ~channel=
let folder_list = get_dirnames path ~source ~channel in
List.fold_left
(fun x y ->
f x (y, ( Xml.parse_file (path ^ y ^ ending)))
) s folder_list
let fold_folder_right path f s ending ~source ~channel =
let folder_list = get_dirnames path ~source:source ~channel:channel in
List.fold_right
(fun x y ->
f (x, ( Xml.parse_file (path ^ x ^ ending))) y
) folder_list s
let fold_export path ~source ~channel s f export_function ending =
let exporter =
fun x1 y1 ->
List.fold_left f x1 (export_function path y1)
in
fold_folder_left path s exporter ending ~source ~channel
(*UNIVERSAL EXPORTS*)
let notRejected seg = List.for_all (fun x -> x <> ("nkjp:rejected", "true")) (Xml.attribs seg)
let rec filterRejected = function
| Xml.Element (tag_name, _, _) as seg when tag_name="seg" -> if notRejected seg then [seg] else []
| Xml.Element (tag_name, _, children) when tag_name="s" || tag_name="choice" || tag_name="nkjp:paren" ->
List.fold_left (fun x y -> x @ (filterRejected y)) [] children
| _ -> failwith "filterRejected: given node is not seg s choice nor nkjp:paren"
let export_type_text name source export_type_abslist =
match source with
| Xml.Element (tag_name, _, children) when tag_name="p" ->
(match compare_id_corresp name source with
| None -> None
| Some id_p ->
let t = List.sort compare_abs (export_type_abslist children) in
Some { id_source = name; id_p ; abs = t})
| _ -> failwith "export_segm_p: given node is not p"
let export_ab export_seg = function
| Xml.Element (tag_name, _, children) as source when tag_name="s" ->
(let id_s = compare_id_corresp "not_aviable" source in
match id_s with
| None -> failwith "export_ab: missing id"
| Some id ->
let exporter x =
let id_ab = Xml.attrib x "ab" in
let sentences = [{ id_s = id; segments = [export_seg x]}] in
{id_ab; contents = None; sentences}
in
List.map exporter (filterRejected source))
| _ -> failwith "export_ab: given node is not s"
let export_seg_fs export_fs = function
| Xml.Element (tag_name, _, children) as source when tag_name="seg" ->
(match compare_id_corresp "not_aviable" source with
| Some id_seg ->
let id_seg = id_seg ^ "-seg" in
let pos = int_of_string (Xml.attrib source "pos") in
(match children with
| [] ->
{ id_seg; pos; length = None;
orth = None; disamb = None; sense = None}
| hd::[] -> export_fs pos id_seg hd
| _ -> failwith "too much fs")
| None -> failwith "export_seg")
| _ -> failwith "given node is not seg"
(*TEXT*)
let export_text_ab source =
match source with
| Xml.Element(tag_name, _, children) when tag_name="ab" -> {
id_ab = NKJPxmlbasics.after_split (NKJPxmlbasics.find_attribute source "xml:id") "_";
contents = NKJPxmlbasics.find_text source;
sentences = []}
| _ -> failwith "export_text_ab: given node is not ab"
let export_text_div name parsed_text =
match parsed_text with
| Xml.Element (tag_name, _, children) when tag_name="div" ->
let id_div = NKJPxmlbasics.before_split (NKJPxmlbasics.after_split (NKJPxmlbasics.find_attribute parsed_text "xml:id") "_") "-" in
let ab_list = List.map export_text_ab children
in {
id_source = name;
id_p = id_div;
abs = ab_list}
| _ -> failwith "export_text_div: given node is not div"
let export_text_file path (name, parsed_text) =
let div_list = NKJPxmlbasics.go_to parsed_text ["teiCorpus";"TEI";"text";"body";"div"]
in
List.fold_right (fun x y -> (export_text_div name x)::y) div_list []
let fold_text path ~source ~channel s f =
(fold_export path ~source ~channel s f export_text_file "/text.xml")
(*SEGM*)
let rec extract_ab_id = function
| Xml.Element (tag_name, _, children) as source when tag_name="seg" ->
let corresp = NKJPxmlbasics.find_attribute source "corresp" in
NKJPxmlbasics.after_split( NKJPxmlbasics.before_split (NKJPxmlbasics.after_split corresp "(") ",") "_"
| _ -> failwith "extract_ab_id"
let export_segm_seg ~id_s = function
| Xml.Element (tag_name, _, children) as source when tag_name="seg" ->
let corresp = NKJPxmlbasics.find_attribute source "corresp" in
let id_seg = NKJPxmlbasics.after_split (NKJPxmlbasics.find_attribute source "xml:id") "_" in
let pos = NKJPxmlbasics.before_split (NKJPxmlbasics.after_split corresp "ab,") "," in
let len = NKJPxmlbasics.before_split (NKJPxmlbasics.after_split corresp ",") ")" in
let id_ab = extract_ab_id source in
let segment = { id_seg; pos = int_of_string pos; length = Some (int_of_string len);
orth = None; disamb = None; sense = None; }
in
[{
id_ab;
contents = None;
sentences = [{ id_s; segments = [ segment ] }]
}]
| _ -> failwith "export_seg: given node is not seg"
let export_segm_s_ab = function
| Xml.Element (tag_name, _, children) as source when tag_name="s" ->
let temp_id_s = NKJPxmlbasics.find_attribute source "xml:id" in
let id_s = NKJPxmlbasics.before_split (NKJPxmlbasics.after_split temp_id_s "_") "-" in
List.flatten (List.map (export_segm_seg ~id_s) (filterRejected source))
| _ -> failwith "export_s: given node is not s"
let export_segm_slist ~flat sourcelist =
(if flat then flatten else unflatten) (List.flatten (List.map export_segm_s_ab sourcelist))
let export_segm_file ~flat path (name, parsed_text) =
let p_list = NKJPxmlbasics.go_to parsed_text ["teiCorpus";"TEI";"text";"body";"p"] in
(*exports every p to some text and if it's not none value adds it to a list*)
let exporter x y =
match export_type_text name x (export_segm_slist ~flat) with
| None -> y
| Some v -> v::y
in
let segm = List.fold_right exporter p_list [] in
let parsed_text = Xml.parse_file (path^name^"/text.xml") in
let text = export_text_file path (name, parsed_text)
in
List.sort compare_text (merge_file text segm)
let fold_segm path ~source ~channel s f =
(fold_export path ~source ~channel s f (export_segm_file ~flat:false) "/ann_segmentation.xml")
(*MORPH*)
let grandchildren = function
| Xml.Element (_, _, children) ->
let temp x y = match x with
| Xml.Element (_, _, children) -> children @ y
| _ -> y
in
List.fold_right temp children []
| _ -> failwith "grandchildren"
let export_morph_f ({id_seg; pos; length; orth; disamb; sense} as acc) = function
| Xml.Element (tag_name, _, children) as source when tag_name="f" ->
(match NKJPxmlbasics.find_attribute source "name" with
| "orth" ->
if (List.tl children = []) then
{ id_seg; pos; length; orth = implication (NKJPxmlbasics.find_text (List.hd children)) orth; disamb; sense }
else assert false
| "disamb" ->
(match List.filter (fun x -> NKJPxmlbasics.find_attribute x "name" = "interpretation") (grandchildren source) with
| [] -> assert false
| hd::tl ->
if tl = [] then
{ id_seg; pos; length; orth; disamb = implication (NKJPxmlbasics.find_text hd) disamb; sense }
else assert false)
| _ -> acc )
| _ -> failwith "export_morph_f"
let export_morph_fs pos id_seg = function
| Xml.Element (tag_name, _, children) when tag_name="fs" ->
let empty = { id_seg; pos; length = None; orth = None; disamb = None; sense = None } in
List.fold_left export_morph_f empty children
| _ -> failwith "export_morph_fs"
let export_morph_seg = export_seg_fs export_morph_fs
let export_morph_ab = export_ab export_morph_seg
let export_morph_p id_source = function
| Xml.Element (tag_name, _, children) as source when tag_name="p" ->
let abs = List.fold_right (fun x y -> export_morph_ab x @ y) children [] in
(match compare_id_corresp "not_aviable" source with
| Some id_p -> { id_source; id_p; abs }
| None -> failwith "export_morph_p")
| _ -> failwith "export_morph_p"
let export_morph_file ~flat path (name, parsed_text) =
let parsed_segm = Xml.parse_file (path^name^"/ann_segmentation.xml") in
let segm = export_segm_file ~flat:true path (name, parsed_segm) in
let p_list = NKJPxmlbasics.go_to parsed_text ["teiCorpus";"TEI";"text";"body";"p"] in
let morph = List.map (fun x -> (export_morph_p name x)) p_list in
if flat then merge_flat_file segm morph else unflatten_file (merge_flat_file segm morph)
let fold_morph path ~source ~channel s f =
(fold_export path ~source ~channel s f (export_morph_file ~flat:false) "/ann_morphosyntax.xml")
(*SENSE*)
let export_sense_seg = function
| Xml.Element (tag_name, _, _) as source when tag_name = "seg" ->
(match compare_id_corresp "not_aviable" source with
| Some id_seg ->
(let id_seg = id_seg ^ "-seg" in
let sense =
(match grandchildren source with
| [] -> None
| hd::[] -> Some (NKJPxmlbasics.after_split (NKJPxmlbasics.find_attribute hd "fVal") "#")
| _ -> failwith "export_sense_seg: too much fs") in
let pos = int_of_string (Xml.attrib source "pos") in
{id_seg; pos; length = None; orth = None; disamb = None; sense})
| None -> failwith "export_sense_seg")
| _ -> failwith "export_sense_seg"
let export_sense_ab = export_ab export_sense_seg
let export_sense_p id_source = function
| Xml.Element (tag_name, _, children) as source when tag_name="p" ->
let abs = List.fold_right (fun x y -> export_sense_ab x @ y) children []in
(match compare_id_corresp "not_aviable" source with
| Some id_p -> { id_source; id_p; abs }
| None -> failwith "export_sense_p")
| _ -> failwith "export_sense_p"
let export_sense_file path (name, parsed_text) =
let parsed_morph = Xml.parse_file (path^name^"/ann_morphosyntax.xml") in
let morph = export_morph_file ~flat:true path (name, parsed_morph) in
let p_list = NKJPxmlbasics.go_to parsed_text ["teiCorpus";"TEI";"text";"body";"p"] in
let sense = List.map (fun x -> (export_sense_p name x)) p_list in
unflatten_file (merge_flat_file morph sense)
let fold_sense path ~source ~channel s f =
(fold_export path ~source ~channel s f export_sense_file "/ann_senses.xml")