Commit 584f96578ef06859f8178daa45954162be0229cf
1 parent
9043cc39
wczytywanie header.xml
Showing
1 changed file
with
25 additions
and
7 deletions
NKJP2/ENIAM_NKJP.ml
... | ... | @@ -19,6 +19,16 @@ |
19 | 19 | |
20 | 20 | open Xstd |
21 | 21 | |
(* Parses the file [path ^ name ^ "/header.xml"] and returns the pair
   [(typ, channel)]: the "target" attribute values of the two [catRef]
   elements whose "scheme" attributes are "#taxonomy-NKJP-type" and
   "#taxonomy-NKJP-channel" respectively.

   The document must have exactly the shape matched below (element order,
   attribute order and empty attribute lists included); any other shape
   raises [Failure "load_header"]. Errors raised by [Xml.parse_file] itself
   are not caught here. *)
let load_header path name =
  let header_file = path ^ name ^ "/header.xml" in
  match Xml.parse_file header_file with
  | Xml.Element ("teiHeader", _, [
      Xml.Element ("fileDesc", [], _);
      Xml.Element ("profileDesc", [], [text_class]);
      Xml.Element ("revisionDesc", _, _)]) ->
      (* The classification lives in the single child of profileDesc. *)
      (match text_class with
       | Xml.Element ("textClass", [], [
           Xml.Element ("catRef", ["scheme", "#taxonomy-NKJP-type"; "target", typ], []);
           Xml.Element ("catRef", ["scheme", "#taxonomy-NKJP-channel"; "target", channel], [])]) ->
           (typ, channel)
       | _ -> failwith "load_header")
  | _ -> failwith "load_header"
31 | + | |
(* Lists the entries of [path] that are directories and returns their names
   after passing them through [Xlist.sort] with the polymorphic [compare].

   [path] is concatenated directly with each entry name, so it is expected
   to end with a directory separator. *)
let get_folders path =
  let entries = Array.to_list (Sys.readdir path) in
  (* Accumulate only the entries that are directories. *)
  let keep_directory acc entry =
    if Sys.is_directory (path ^ entry) then entry :: acc else acc in
  let folders = Xlist.fold entries [] keep_directory in
  Xlist.sort folders compare
... | ... | @@ -83,17 +93,24 @@ let load_segmentation path name = |
83 | 93 | List.rev (Xlist.rev_map entries load_segm_entry) |
84 | 94 | | _ -> failwith "load_segmentation" |
85 | 95 | |
86 | -let rec load_morph_token = function | |
(* Extracts the interpretation string from a "tool_report" feature
   structure: an [fs] element with attributes "feats" and "type" (in that
   order), containing a "choice" feature followed by an "interpretation"
   feature whose only child is a [string] element holding PCData.

   Returns that PCData text. Any other shape raises [Failure] with the
   message "load_disamb: " followed by the pretty-printed offending XML. *)
let load_disamb xml =
  match xml with
  | Xml.Element ("fs", ["feats", _; "type", "tool_report"], [
      Xml.Element ("f", ["fVal", _; "name", "choice"], _);
      Xml.Element ("f", ["name", "interpretation"], [
        Xml.Element ("string", [], [Xml.PCData interp])])]) ->
      interp
  | _ -> failwith ("load_disamb: " ^ Xml.to_string_fmt xml)
102 | + | |
(* Decodes one morphosyntactic token: a [seg] element with attributes
   "corresp" and "xml:id" (in that order) wrapping a single "morph" feature
   structure.

   Returns [(corresp, id_seg, orth, interp)], where [orth] is the PCData of
   the "orth" feature and [interp] is the result of [load_disamb] applied to
   the only child of the "disamb" feature.

   Two feature layouts are accepted: orth / interps / disamb, and the same
   with an extra "nps" feature (binary value "true") right after "orth".
   Anything else raises [Failure] with the message "load_morph_token: "
   followed by the pretty-printed offending XML. *)
let load_morph_token xml =
  match xml with
  | Xml.Element ("seg", ["corresp", corresp; "xml:id", id_seg], [
      Xml.Element ("fs", ["type", "morph"], features)]) ->
      (match features with
       | [Xml.Element ("f", ["name", "orth"], [Xml.Element ("string", [], [Xml.PCData orth])]);
          Xml.Element ("f", ["name", "interps"], _);
          Xml.Element ("f", ["name", "disamb"], [disamb])]
       | [Xml.Element ("f", ["name", "orth"], [Xml.Element ("string", [], [Xml.PCData orth])]);
          Xml.Element ("f", ["name", "nps"], [Xml.Element ("binary", ["value", "true"], [])]);
          Xml.Element ("f", ["name", "interps"], _);
          Xml.Element ("f", ["name", "disamb"], [disamb])] ->
           (corresp, id_seg, orth, load_disamb disamb)
       | _ -> failwith ("load_morph_token: " ^ Xml.to_string_fmt xml))
  | _ -> failwith ("load_morph_token: " ^ Xml.to_string_fmt xml)
97 | 114 | |
98 | 115 | let load_morph_sentence = function |
99 | 116 | Xml.Element("s",["corresp",corresp;"xml:id",id_s],tokens) -> |
... | ... | @@ -120,9 +137,10 @@ let _ = |
120 | 137 | let names = get_folders nkjp_path in |
121 | 138 | Xlist.iter names (fun name -> |
122 | 139 | print_endline name; |
140 | + let header = load_text nkjp_path name in | |
123 | 141 | (* let text = load_text nkjp_path name in *) |
124 | 142 | (* let segmentation = load_segmentation nkjp_path name in *) |
125 | - let morphosyntax = load_morphosyntax nkjp_path name in | |
143 | + (* let morphosyntax = load_morphosyntax nkjp_path name in *) | |
126 | 144 | ()) |
127 | 145 | |
128 | 146 | (* |
... | ... |