nouns.ml
1.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
open Xstd
open PreTypes
let wynik = ref []
let get_paths = function
DepSentence paths -> Array.to_list paths
| _ -> failwith "get_paths"
let if_cat cats = function
Lemma(_,cat,_) -> List.exists (fun x -> x = cat) cats
| _ -> false
let if_interps interps token =
let interp = match token with
Lemma(_,_,i) -> i
| _ -> [[[]]] in
let if_interp nr value =
List.exists (fun x ->
try
List.exists (fun y ->
y = value) (List.nth x nr)
with _ -> false
) interp in
Xlist.fold interps true (fun acc (nr,value) -> acc && (if_interp nr value))
let lemma_string = function
Lemma(l,c,_) -> l ^ " " ^ c
| _ -> failwith "lemma_string"
let get_noun_part paths tokens i id super label =
List.iteri (fun i2 (id2,super2,label2) ->
if super2 = i &&
if_cat ["subst"; "ppron3"; "depr"; "ger"] (ExtArray.get tokens id2).token &&
if_interps [1,"gen"] (ExtArray.get tokens id2).token
then wynik := ((lemma_string (ExtArray.get tokens id).token) ^ " " ^
(ExtArray.get tokens id2).orth ^ " " ^
(lemma_string (ExtArray.get tokens id2).token) ^ "\n")
:: !wynik
) paths
let parse_for_nouns paths tokens =
List.iteri (fun i (id,super,label) ->
if if_cat ["subst"; "ppron3"; "depr"; "ger"] (ExtArray.get tokens id).token
then (get_noun_part paths tokens i id super label)) paths
let process_conll_corpus_for_nouns filename =
let oc = open_out "../miscellaneous/nouns_skladnica.txt" in
let corpus = File.file_in filename (fun file -> CONLL.load_corpus file) in
Xlist.iter corpus (fun (p_record, tokens) -> parse_for_nouns (get_paths p_record.psentence) tokens);
Xlist.iter (List.sort compare !wynik) (output_string oc); flush oc