Commit cdcea6621a46bb7412051ca37b9ef7c516a53ba0
1 parent
6c86906e
scalenie zdań i akapitów w NKJP1M
Showing
1 changed file
with
36 additions
and
614 deletions
NKJP2/ENIAM_NKJP.ml
... | ... | @@ -199,10 +199,16 @@ let rec merge_tokens name id_p rev = function |
199 | 199 | let id_div,id_ab,beg,len = parse_seg_corresp corresp in( |
200 | 200 | (* if id_div <> id_p then (*failwith*)print_endline (Printf.sprintf "merge_tokens 4: %s %d %s" name id_p corresp); (*else*) *) |
201 | 201 | let lemma,cat,interp = parse_disamb disamb in |
202 | - merge_tokens name id_p ((id_div,id_ab,beg,nps,len,orth,lemma,cat,interp) :: rev) (segmentation,morphosyntax)) | |
202 | + merge_tokens name id_p ((id_div,id_ab,(beg,len,nps,orth,lemma,cat,interp)) :: rev) (segmentation,morphosyntax)) | |
203 | 203 | | [],[] -> List.rev rev |
204 | 204 | | _ -> failwith "merge_tokens 1" |
205 | 205 | |
(* [split_sentences id_div id_ab rev rev2 l] cuts the list [l] of
   [(id_div, id_ab, token)] triples into maximal consecutive runs that share
   the same [(id_div, id_ab)] key.

   - [id_div], [id_ab]: key of the run currently being collected.
   - [rev]: tokens already collected for the current run, most recent first.
     Whatever is passed here ends up at the front of the first run, so a
     caller that seeds [rev] with a token must not also leave that token in
     [l], otherwise it appears twice.
   - [rev2]: runs that are already closed, most recent first.

   Returns the runs in input order as [(id_div, id_ab, tokens)] triples, with
   the tokens of each run in input order. The current run is always emitted,
   even when it is empty (e.g. [rev = []] and [l = []]).
   The function is tail-recursive. *)
let rec split_sentences id_div id_ab rev rev2 segments =
  match segments with
  | [] ->
      (* Input exhausted: close the run in progress and restore input order. *)
      let groups = (id_div, id_ab, List.rev rev) :: rev2 in
      List.rev groups
  | (next_div, next_ab, token) :: rest ->
      let same_run = id_div = next_div && id_ab = next_ab in
      if same_run then split_sentences id_div id_ab (token :: rev) rev2 rest
      else
        (* Key changed: close the current run and start a new one. *)
        let closed = (id_div, id_ab, List.rev rev) in
        split_sentences next_div next_ab [token] (closed :: rev2) rest
211 | + | |
206 | 212 | let rec merge_sentences name id_p rev = function |
207 | 213 | ({corref=""; prefix="segm"; numbers=[id_segm_p;id_segm_s]; suffix="s"},segm_tokens) :: segmentation, |
208 | 214 | ({corref="ann_segmentation.xml"; prefix="segm"; numbers=[c_segm_p;c_segm_s]; suffix="s"}, |
... | ... | @@ -211,10 +217,34 @@ let rec merge_sentences name id_p rev = function |
211 | 217 | if id_segm_p <> c_segm_p || id_segm_p <> id_morph_p then failwith "merge_sentences 2" else |
212 | 218 | if id_segm_s <> c_segm_s || c_segm_s <> id_morph_s then failwith "merge_sentences 3" else |
213 | 219 | let tokens = merge_tokens name id_p [] (segm_tokens,morph_tokens) in |
214 | - merge_sentences name id_p ((id_segm_p,id_segm_s,tokens) :: rev) (segmentation,morphosyntax) | |
220 | + let id_s = string_of_int id_segm_p ^ "." ^ string_of_int id_segm_s in | |
221 | + if tokens = [] then failwith "merge_sentences 4" else | |
222 | + let id_div,id_ab,token = List.hd tokens in | |
223 | + let l = match split_sentences id_div id_ab [token] [] tokens with | |
224 | + [id_div,id_ab,tokens] -> [id_div,id_ab,id_s,tokens] | |
225 | + | [id_div1,id_ab1,tokens1;id_div2,id_ab2,tokens2] -> [id_div2,id_ab2,id_s^"b",tokens2;id_div1,id_ab1,id_s^"a",tokens1] | |
226 | + | [id_div1,id_ab1,tokens1;id_div2,id_ab2,tokens2;id_div3,id_ab3,tokens3] -> [id_div3,id_ab3,id_s^"c",tokens3;id_div2,id_ab2,id_s^"b",tokens2;id_div1,id_ab1,id_s^"a",tokens1] | |
227 | + | _ -> failwith (Printf.sprintf "merge_sentences 5: %s %d %d" name id_div id_ab) in | |
228 | + merge_sentences name id_p (l @ rev) (segmentation,morphosyntax) | |
215 | 229 | | [],[] -> List.rev rev |
216 | 230 | | _ -> failwith "merge_sentences" |
217 | 231 | |
(* [merge_paragraph id_div id_ab rev sentences] takes from the front of
   [sentences] every [(id_div2, id_ab2, id_s, tokens)] entry whose
   [(id_div2, id_ab2)] equals [(id_div, id_ab)].

   - [rev]: [(id_s, tokens)] pairs already collected, most recent first.

   Returns [(collected, remaining)]: [collected] holds the [(id_s, tokens)]
   pairs in input order (including those passed in through [rev]), and
   [remaining] is the untouched suffix of [sentences], starting at the first
   entry with a different key. The function is tail-recursive. *)
let rec merge_paragraph id_div id_ab rev sentences =
  match sentences with
  | [] -> List.rev rev, []
  | (sent_div, sent_ab, id_s, tokens) :: rest
    when id_div = sent_div && id_ab = sent_ab ->
      merge_paragraph id_div id_ab ((id_s, tokens) :: rev) rest
  | remaining ->
      (* First entry belongs to another paragraph: stop and hand back the rest. *)
      List.rev rev, remaining
237 | + | |
(* [merge_paragraphs name id_p rev (paragraphs, sentences)] pairs every
   paragraph with the consecutive run of sentences that carry its
   [(id_div, id_ab)] key.

   - [name], [id_p]: used only to build the error message of the final case.
   - [rev]: [(paragraph, sentences)] pairs already built, most recent first.
   - [paragraphs]: list of [(header, paragraph)] where [header] must match
     [{corref=""; prefix="txt"; numbers=[id_div;id_ab]; suffix="ab"}].
   - [sentences]: list of [(id_div, id_ab, id_s, tokens)].

   Returns the list of [(paragraph, [(id_s, tokens); ...])] in input order.

   Raises [Failure "merge_paragraphs 1"] when the first remaining sentence
   does not belong to the current paragraph, i.e. when its [id_div] OR its
   [id_ab] differs. The previous condition used [&&] and therefore accepted
   a sentence that matched on only one of the two ids, attaching it to the
   wrong paragraph; [merge_paragraph] compares the same pair with [||].
   Raises [Failure "merge_paragraphs 2: ..."] when the two lists do not run
   out together or a paragraph header does not have the expected shape. *)
let rec merge_paragraphs name id_p rev = function
    ({corref=""; prefix="txt"; numbers=[id_div;id_ab]; suffix="ab"},paragraph) :: paragraphs,
    (id_div2,id_ab2,id_s,tokens) :: sentences ->
      (* The head sentence must carry exactly this paragraph's key. *)
      if id_div <> id_div2 || id_ab <> id_ab2 then failwith "merge_paragraphs 1" else
      (* Seed the run with the head sentence, then collect its followers. *)
      let l,sentences = merge_paragraph id_div id_ab [id_s,tokens] sentences in
      (* Printf.printf "%d.%d: %s\n" id_div id_ab (String.concat " " (Xlist.map l fst)); *)
      merge_paragraphs name id_p ((paragraph,l) :: rev) (paragraphs,sentences)
  | [],[] -> List.rev rev
  | _ -> failwith ("merge_paragraphs 2: " ^ name ^ " " ^ string_of_int id_p)
247 | + | |
218 | 248 | let rec merge_entries name rev = function |
219 | 249 | ({corref=""; prefix="txt"; numbers=[id_div]; suffix="div"},paragraphs) :: text, |
220 | 250 | ({corref="text.xml"; prefix="txt"; numbers=[c_div]; suffix="div"}, |
... | ... | @@ -223,7 +253,8 @@ let rec merge_entries name rev = function |
223 | 253 | {corref=""; prefix="morph"; numbers=[id_morph_p]; suffix="p"},morph_sentences) :: morphosyntax -> |
224 | 254 | if id_div <> c_div || c_div <> id_segm_p || id_segm_p <> c_segm_p || c_segm_p <> id_morph_p then failwith "merge_entries 2" else |
225 | 255 | let sentences = merge_sentences name id_div [] (segm_sentences,morph_sentences) in |
226 | - merge_entries name ((id_div,paragraphs,sentences) :: rev) (text,segmentation,morphosyntax) | |
256 | + let paragraphs = merge_paragraphs name id_div [] (paragraphs,sentences) in | |
257 | + merge_entries name ((id_div,paragraphs) :: rev) (text,segmentation,morphosyntax) | |
227 | 258 | | [],[],[] -> List.rev rev |
228 | 259 | | _ -> failwith "merge_entries" |
229 | 260 | |
... | ... | @@ -232,6 +263,7 @@ let nkjp_path = "../../NLP resources/NKJP-PodkorpusMilionowy-1.2/" |
232 | 263 | let _ = |
233 | 264 | let names = get_folders nkjp_path in |
234 | 265 | Xlist.iter names (fun name -> |
266 | + if name = "030-2-000000012" then () else | |
235 | 267 | (* print_endline name; *) |
236 | 268 | let typ,channel = load_header nkjp_path name in |
237 | 269 | (* print_endline typ; *) |
... | ... | @@ -271,7 +303,7 @@ frekwencje kanałów |
271 | 303 | 28 prasa_tygodnik |
272 | 304 | |
273 | 305 | frekwencje łączne typów-kanałów |
274 | - 127 fakt ksiazka | |
306 | + 127 fakt ksiazka | |
275 | 307 | 56 inf-por ksiazka |
276 | 308 | 283 konwers mowiony |
277 | 309 | 2 listy ksiazka |
... | ... | @@ -292,613 +324,3 @@ frekwencje łączne typów-kanałów |
292 | 324 | 387 urzed prasa_inne |
293 | 325 | |
294 | 326 | *) |
295 | -(* | |
296 | - | |
297 | -type id = {hash: bool; suffix: string; numbers: int list} | |
298 | - | |
299 | -let empty_id = {hash = false; suffix = ""; numbers = []} | |
300 | - | |
301 | -let parse_id s = | |
302 | - if String.length s = 0 then empty_id else | |
303 | - if String.length s < 6 then failwith "za krótkie id" else | |
304 | - let hash,s = if (String.get s 0) = '#' then true, String.sub s 1 (String.length s - 1) else false, s in | |
305 | - if String.sub s 0 4 <> "wal_" then failwith "id nie ma wal" else | |
306 | - let s = String.sub s 4 (String.length s - 4) in | |
307 | - let s,suf = match Str.split (Str.regexp "-") s with | |
308 | - [s;suf] -> s,suf | |
309 | - | _ -> failwith ("parse_id: zła ilość '-' " ^ s) in | |
310 | - let id = {hash = hash; suffix = suf; numbers = try Xlist.map (Xstring.split "\\." s) int_of_string with _ -> failwith ("parse_id: " ^ s)} in | |
311 | - id | |
312 | - | |
313 | -let string_of_id id = | |
314 | - (if id.hash then "#" else "") ^ "wal_" ^ (String.concat "." (Xlist.map id.numbers string_of_int)) ^ "-" ^ id.suffix | |
315 | - | |
316 | -type tei = | |
317 | - Symbol of string | |
318 | - | TEIstring of string | |
319 | - | Binary of bool | |
320 | - | Numeric of int | |
321 | - | F of string * tei | |
322 | - | Fset of string * tei list | |
323 | - | Fs of string * tei list | |
324 | - | Id of id | |
325 | - | SameAs of id * string | |
326 | - | |
327 | -let rec tei_to_string = function | |
328 | - Symbol s -> Printf.sprintf "Symbol %s" s | |
329 | - | TEIstring s -> Printf.sprintf "String %s" s | |
330 | - | Binary b -> Printf.sprintf "Binary %s" (string_of_bool b) | |
331 | - | Numeric n -> Printf.sprintf "Numeric %d" n | |
332 | - | F(s,t) -> Printf.sprintf "F(%s,%s)" s (tei_to_string t) | |
333 | - | Fset(s,l) -> Printf.sprintf "Fset(%s,[%s])" s (String.concat ";" (Xlist.map l tei_to_string)) | |
334 | - | Fs(s,l) -> Printf.sprintf "Fs(%s,[%s])" s (String.concat ";" (Xlist.map l tei_to_string)) | |
335 | - | Id id -> Printf.sprintf "Id(%s)" (string_of_id id) | |
336 | - | SameAs(id,s) -> Printf.sprintf "F(Id,%s)" s | |
337 | - | |
338 | -let rec parse_tei = function | |
339 | - Xml.Element("f",["name",name],[Xml.Element("vColl",["org","set"],set)]) -> | |
340 | - Fset(name,List.rev (Xlist.map set parse_tei)) | |
341 | - | Xml.Element("f", ["name",name],[]) -> Fset(name,[]) | |
342 | - | Xml.Element("f", ["name",name],[tei]) -> F(name,parse_tei tei) | |
343 | - | Xml.Element("f", ["name",name],set) -> Fset(name,List.rev (Xlist.map set parse_tei)) | |
344 | - | Xml.Element("fs", ["type",name], l) -> Fs(name,List.rev (Xlist.rev_map l parse_tei)) | |
345 | - | Xml.Element("fs", ["xml:id",id;"type",name], l) -> Fs(name,Id(parse_id id) :: List.rev (Xlist.rev_map l parse_tei)) | |
346 | - | Xml.Element("symbol",["value",value],[]) -> Symbol value | |
347 | - | Xml.Element("string",[], [Xml.PCData s]) -> TEIstring s | |
348 | - | Xml.Element("string",[], []) -> TEIstring "" | |
349 | - | Xml.Element("binary",["value",value],[]) -> Binary(try bool_of_string value with _ -> failwith "parse_tei") | |
350 | - | Xml.Element("numeric",["value",value],[]) -> Numeric(try int_of_string value with _ -> failwith "parse_tei") | |
351 | - | Xml.Element("fs", ["sameAs", same_as; "type",name], []) -> SameAs(parse_id same_as,name) | |
352 | - | Xml.Element("fs", ["sameAs", same_as], []) -> SameAs(parse_id same_as,"") | |
353 | - | xml -> failwith ("parse_tei: " ^ Xml.to_string_fmt xml) | |
354 | - | |
355 | -let parse_gf = function | |
356 | - "subj" -> SUBJ | |
357 | - | "obj" -> OBJ | |
358 | - | s -> failwith ("parse_gf: " ^ s) | |
359 | - | |
360 | -let parse_control arg = function | |
361 | - "controller" -> {arg with cr="1" :: arg.cr} | |
362 | - | "controllee" -> {arg with ce="1" :: arg.cr} | |
363 | - | "controller2" -> {arg with cr="2" :: arg.cr} | |
364 | - | "controllee2" -> {arg with ce="2" :: arg.cr} | |
365 | - | s -> failwith ("parse_control: " ^ s) | |
366 | - | |
367 | -let parse_case = function | |
368 | - "nom" -> Case "nom" | |
369 | - | "gen" -> Case "gen" | |
370 | - | "dat" -> Case "dat" | |
371 | - | "acc" -> Case "acc" | |
372 | - | "inst" -> Case "inst" | |
373 | - | "loc" -> Case "loc" | |
374 | - | "str" -> Str | |
375 | - | "pred" -> Case "pred" | |
376 | - | "part" -> Part | |
377 | - | "postp" -> Case "postp" | |
378 | - | "agr" -> CaseAgr | |
379 | - | s -> failwith ("parse_case: " ^ s) | |
380 | - | |
381 | -let parse_aspect = function | |
382 | - "perf" -> Aspect "perf" | |
383 | - | "imperf" -> Aspect "imperf" | |
384 | - | "_" -> AspectUndef | |
385 | - | "" -> AspectNA | |
386 | - | s -> failwith ("parse_aspect: " ^ s) | |
387 | - | |
388 | -let parse_negation = function | |
389 | - "_" -> NegationUndef | |
390 | - | "neg" -> Negation | |
391 | - | "aff" -> Aff | |
392 | - | "" -> NegationNA | |
393 | - | s -> failwith ("parse_negation: " ^ s) | |
394 | - | |
395 | -let parse_number = function | |
396 | - "sg" -> Number "sg" | |
397 | - | "pl" -> Number "pl" | |
398 | - | "agr" -> NumberAgr | |
399 | - | "_" -> NumberUndef | |
400 | - | s -> failwith ("parse_number: " ^ s) | |
401 | - | |
402 | -let parse_gender = function | |
403 | - "m1" -> Gender "m1" | |
404 | - | "m3" -> Gender "m3" | |
405 | - | "n" -> Genders["n1";"n2"] | |
406 | - | "f" -> Gender "f" | |
407 | - | "m1.n" -> Genders["m1";"n1";"n2"] | |
408 | - | "_" -> GenderUndef | |
409 | - | "agr" -> GenderAgr | |
410 | - | s -> failwith ("parse_gender: " ^ s) | |
411 | - | |
412 | -let parse_grad = function | |
413 | - "pos" -> Grad "pos" | |
414 | - | "com" -> Grad "com" | |
415 | - | "sup" -> Grad "sup" | |
416 | - | "_" -> GradUndef | |
417 | - | s -> failwith ("parse_grad: " ^ s) | |
418 | - | |
419 | -let rec parse_restr = function | |
420 | - "natr" -> Natr | |
421 | - | "atr" -> Atr | |
422 | - | "ratr" -> Ratr | |
423 | - | "atr1" -> Atr1 | |
424 | - | "ratr1" -> Ratr1 | |
425 | - | s -> failwith ("parse_restr: " ^ s) | |
426 | - | |
427 | - | |
428 | -let parse_comp = function | |
429 | - "int" -> Int,[] | |
430 | - | "rel" -> Rel,[] | |
431 | - | "co" -> CompTypeUndef,[Comp "co"] (* subst qub prep comp *) | |
432 | - | "kto" -> CompTypeUndef,[Comp "kto"] (* subst *) | |
433 | - | "ile" -> CompTypeUndef,[Comp "ile"] (* num adv *) | |
434 | - | "jaki" -> CompTypeUndef,[Comp "jaki"] (* adj *) | |
435 | - | "który" -> CompTypeUndef,[Comp "który"] (* adj *) | |
436 | - | "czyj" -> CompTypeUndef,[Comp "czyj"] (* adj *) | |
437 | - | "jak" -> CompTypeUndef,[Comp "jak"] (* prep conj adv *) | |
438 | - | "kiedy" -> CompTypeUndef,[Comp "kiedy"] (* comp adv *) | |
439 | - | "gdzie" -> CompTypeUndef,[Comp "gdzie"] (* qub adv *) | |
440 | - | "odkąd" -> CompTypeUndef,[Comp "odkąd"] (* adv *) | |
441 | - | "skąd" -> CompTypeUndef,[Comp "skąd"] (* adv *) | |
442 | - | "dokąd" -> CompTypeUndef,[Comp "dokąd"] (* adv *) | |
443 | - | "którędy" -> CompTypeUndef,[Comp "którędy"] (* adv *) | |
444 | - | "dlaczego" -> CompTypeUndef,[Comp "dlaczego"] (* adv *) | |
445 | - | "czemu" -> CompTypeUndef,[Comp "czemu"] (* adv *) | |
446 | - | "czy" -> CompTypeUndef,[Comp "czy"] (* qub conj *) | |
447 | - | "jakby" -> CompTypeUndef,[Comp "jakby"] (* qub comp *) | |
448 | - | "jakoby" -> CompTypeUndef,[Comp "jakoby"] (* qub comp *) | |
449 | - | "gdy" -> CompTypeUndef,[Gdy] (* adv; gdyby: qub comp *) | |
450 | - | "dopóki" -> CompTypeUndef,[Comp "dopóki"] (* comp *) | |
451 | - | "zanim" -> CompTypeUndef,[Comp "zanim"] (* comp *) | |
452 | - | "jeśli" -> CompTypeUndef,[Comp "jeśli"] (* comp *) | |
453 | - | "żeby2" -> CompTypeUndef,[Zeby] | |
454 | - | "żeby" -> CompTypeUndef,[Comp "żeby"] (* qub comp *) | |
455 | - | "że" -> CompTypeUndef,[Comp "że"] (* qub comp *) | |
456 | - | "aż" -> CompTypeUndef,[Comp "aż"] (* qub comp *) | |
457 | - | "bo" -> CompTypeUndef,[Comp "bo"] (* qub comp *) | |
458 | - | s -> failwith ("parse_comp: " ^ s) | |
459 | - | |
460 | -let load_type_constrains = function | |
461 | - | Symbol value -> | |
462 | - (match parse_comp value with | |
463 | - CompTypeUndef,[c] -> c | |
464 | - | _ -> failwith "load_type_constrains") | |
465 | - | xml -> failwith ("load_type_constrains:\n " ^ tei_to_string xml) | |
466 | - | |
467 | -let load_ctype = function | |
468 | - | F("type",Fs("type_def", x)) -> | |
469 | - (match x with | |
470 | - | [F("conjunction",Symbol value)] -> parse_comp value | |
471 | - | [F("conjunction",Symbol value);Fset("constraints",set)] -> | |
472 | - (match parse_comp value with | |
473 | - CompTypeUndef, _ -> failwith "load_ctype" | |
474 | - | ctype,[] -> ctype, List.rev (Xlist.rev_map set load_type_constrains) | |
475 | - | _ -> failwith "load_ctype") | |
476 | - | l -> failwith ("load_ctype 2:\n " ^ String.concat "\n" (Xlist.map l tei_to_string))) | |
477 | - | xml -> failwith ("load_ctype:\n " ^ tei_to_string xml) | |
478 | - | |
479 | -let load_lemmas_set = function | |
480 | - | TEIstring mstring -> mstring | |
481 | - | xml -> failwith ("load_lemmas_set:\n " ^ tei_to_string xml) | |
482 | - | |
483 | -let check_lemma s = | |
484 | - match Str.full_split (Str.regexp "(\\|)") s with | |
485 | - [Str.Text s] -> Lexeme s | |
486 | - | [Str.Text "E"; Str.Delim "("; Str.Text g; Str.Delim ")"] -> Elexeme(parse_gender g) | |
487 | - | _ -> failwith "check_lemma" | |
488 | - | |
489 | -let make_lemma = function | |
490 | - | _,_,[lemma] -> check_lemma lemma | |
491 | - | "XOR","concat",lemmas -> XOR(Xlist.map lemmas check_lemma) | |
492 | - | "OR","coord",lemmas -> ORcoord(Xlist.map lemmas check_lemma) | |
493 | - | "OR","concat",lemmas -> ORconcat(Xlist.map lemmas check_lemma) | |
494 | - | _ -> failwith "make_lemma" | |
495 | - | |
496 | -let process_lex_phrase lemma = function | |
497 | - NP(case),number,GenderUndef,GradUndef,NegationUndef,ReflUndef -> [SUBST(number,case),lemma] | |
498 | - | PrepNP(prep,case),number,GenderUndef,GradUndef,NegationUndef,ReflUndef -> [PREP case,Lexeme prep;SUBST(number,case),lemma] | |
499 | - | AdjP(case),number,gender,grad,NegationUndef,ReflUndef -> [ADJ(number,case,gender,grad),lemma] | |
500 | - | PrepAdjP(prep,case),number,gender,grad,NegationUndef,ReflUndef -> [PREP case,Lexeme prep;ADJ(number,case,gender,grad),lemma] | |
501 | - | InfP(aspect),NumberUndef,GenderUndef,GradUndef,negation,ReflTrue -> [INF(aspect,negation),lemma;QUB,Lexeme "się"] | |
502 | - | InfP(aspect),NumberUndef,GenderUndef,GradUndef,negation,refl -> [INF(aspect,negation),lemma] | |
503 | - | PpasP(case),number,gender,GradUndef,negation,ReflUndef -> [PPAS(number,case,gender,AspectUndef,negation),lemma] | |
504 | - | PrepPpasP(prep,case),number,gender,GradUndef,negation,ReflUndef -> [PREP case,Lexeme prep;PPAS(number,case,gender,AspectUndef,negation),lemma] | |
505 | - | PactP(case),number,gender,GradUndef,negation,ReflTrue -> [PACT(number,case,gender,AspectUndef,negation),lemma;QUB,Lexeme "się"] | |
506 | - | PactP(case),number,gender,GradUndef,negation,refl -> [PACT(number,case,gender,AspectUndef,negation),lemma] | |
507 | - | PrepGerP(prep,case),number,GenderUndef,GradUndef,negation,ReflTrue -> [PREP case,Lexeme prep;GER(number,case,GenderUndef,AspectUndef,negation),lemma;QUB,Lexeme "się"] | |
508 | - | PrepGerP(prep,case),number,GenderUndef,GradUndef,negation,refl -> [PREP case,Lexeme prep;GER(number,case,GenderUndef,AspectUndef,negation),lemma] | |
509 | - | Qub,NumberUndef,GenderUndef,GradUndef,NegationUndef,ReflUndef -> [QUB,lemma] | |
510 | - | AdvP(mode),NumberUndef,GenderUndef,grad,NegationUndef,ReflUndef -> [ADV grad,lemma] | |
511 | - | phrase,number,gender,grad,negation,reflex -> | |
512 | - Printf.printf "%s %s %s %s %s %s\n" (ENIAMwalStringOf.phrase phrase) (ENIAMwalStringOf.number number) | |
513 | - (ENIAMwalStringOf.gender gender) (ENIAMwalStringOf.grad grad) (ENIAMwalStringOf.negation negation) (ENIAMwalStringOf.refl reflex); [] | |
514 | - | |
515 | -let new_schema r cr ce morfs = | |
516 | - {psn_id=(-1); gf=r; role=""; role_attr=""; mode=[]; sel_prefs=[]; cr=cr; ce=ce; morfs=morfs} | |
517 | - | |
518 | -let rec process_lex lex = function | |
519 | - | Phrase(ComparP prep),arguments,Lexeme "",Lexeme "" -> | |
520 | - LexPhrase([COMPAR,Lexeme prep],(Ratrs,Xlist.map arguments (fun morf -> new_schema ARG [] [] [morf]))) | |
521 | - | PhraseAbbr(Xp mode,[argument]),_,_,_ -> | |
522 | - let lex = {lex with lex_argument=argument; lex_mode=mode :: lex.lex_mode} in | |
523 | - process_lex lex (lex.lex_argument,lex.lex_arguments,lex.lex_lemma,lex.lex_numeral_lemma) | |
524 | - (* | PhraseAbbr(Advp mode,[]),[],lemma,Lexeme "" -> | |
525 | - let poss = process_lex_phrase lemma (AdvP,lex.lex_number,lex.lex_gender,lex.lex_degree,lex.lex_negation,lex.lex_reflex) in | |
526 | - LexPhrase(poss,lex.lex_modification) *) | |
527 | - | Phrase (NumP(case)),[],lemma,num_lemma -> LexPhrase([NUM(case,GenderUndef),num_lemma;SUBST(NumberUndef,CaseUndef),lemma],lex.lex_modification) | |
528 | - | Phrase (PrepNumP(prep,case)),[],lemma,num_lemma -> LexPhrase([PREP case,Lexeme prep;NUM(case,GenderUndef),num_lemma;SUBST(NumberUndef,CaseUndef),lemma],lex.lex_modification) | |
529 | - | PhraseComp(Cp,(ctype,[Comp comp])),[],lemma,Lexeme "" -> | |
530 | - if lex.lex_reflex = ReflTrue then LexPhrase([COMP ctype,Lexeme comp;PERS(lex.lex_negation),lemma;QUB,Lexeme "się"],lex.lex_modification) | |
531 | - else LexPhrase([COMP ctype,Lexeme comp;PERS(lex.lex_negation),lemma],lex.lex_modification) | |
532 | - | PhraseComp(Cp,(ctype,[Comp comp1;Comp comp2])),[],lemma,Lexeme "" -> | |
533 | - if lex.lex_reflex = ReflTrue then LexPhrase([COMP ctype,XOR[Lexeme comp1;Lexeme comp2];PERS(lex.lex_negation),lemma;QUB,Lexeme "się"],lex.lex_modification) | |
534 | - else LexPhrase([COMP ctype,XOR[Lexeme comp1;Lexeme comp2];PERS(lex.lex_negation),lemma],lex.lex_modification) | |
535 | - | Phrase phrase,[],lemma,Lexeme "" -> | |
536 | - let poss = process_lex_phrase lemma (phrase,lex.lex_number,lex.lex_gender,lex.lex_degree,lex.lex_negation,lex.lex_reflex) in | |
537 | - LexPhrase(poss,lex.lex_modification) | |
538 | - | (argument,arguments,lemma,numeral_lemma) -> | |
539 | - let s = Printf.sprintf "%s [%s] %s %s\n" (ENIAMwalStringOf.morf argument) | |
540 | - (String.concat ";" (Xlist.map arguments ENIAMwalStringOf.morf)) | |
541 | - (ENIAMwalStringOf.lex lemma) (ENIAMwalStringOf.lex numeral_lemma) in | |
542 | - failwith ("process_lex: " ^ s) | |
543 | - | |
544 | -(* UWAGA: refl_id może się zmienić wraz z wersją Walentego *) | |
545 | -let refl_id = 25 | |
546 | -let refl_position = {empty_position with role="Lemma"; mode=["lemma"]; morfs=[MorfId refl_id]} | |
547 | - | |
548 | -let rec load_category = function | |
549 | - | F("category",Fs("category_def",x)) -> | |
550 | - (match x with | |
551 | - | [F("name",Symbol value)] -> value, [] | |
552 | - | [F("name",Symbol value);Fset("constraints",set)] -> | |
553 | - value, List.rev (Xlist.rev_map set (load_phrase (ref []))) | |
554 | - | l -> failwith ("load_category 2:\n " ^ String.concat "\n" (Xlist.map l tei_to_string))) | |
555 | - | xml -> failwith ("load_category:\n " ^ tei_to_string xml) | |
556 | - | |
557 | -and load_modification_def = function (*pomocnicza do load_lex *) | |
558 | - | [F("type",Symbol value)] -> parse_restr value, [] | |
559 | - | [F("type",Symbol value); Fset("positions",set)] -> | |
560 | - parse_restr value, List.rev (Xlist.rev_map set (load_position (-1) (-1) (ref IntMap.empty))) | |
561 | - | x -> Printf.printf "%s\n" (tei_to_string (List.hd x)); | |
562 | - failwith "load_modification_def:\n" | |
563 | - | |
564 | -and load_lex arg xml = match xml with | |
565 | - | F("argument",set) -> | |
566 | - let mode = ref [] in | |
567 | - let a = load_phrase mode set in | |
568 | - {arg with lex_argument = a; lex_mode = !mode} | |
569 | - | Fset("arguments",set) -> | |
570 | - {arg with lex_arguments=List.rev (Xlist.rev_map set (load_phrase (ref [])))} | |
571 | - | F("modification",Fs("modification_def",x)) -> {arg with lex_modification = load_modification_def x} | |
572 | - | F("lemma",Fs("lemma_def",[F("selection_mode",Symbol value1); | |
573 | - F("cooccurrence",Symbol value2); | |
574 | - Fset("lemmas",lemmas)])) -> | |
575 | - {arg with lex_lemma = make_lemma (value1, value2, List.rev (Xlist.rev_map lemmas load_lemmas_set))} | |
576 | - | F("numeral_lemma",Fs("numeral_lemma_def",[F("selection_mode",Symbol value1); | |
577 | - F("cooccurrence",Symbol value2); | |
578 | - Fset("lemmas",lemmas)])) -> | |
579 | - {arg with lex_numeral_lemma = make_lemma (value1, value2, List.rev (Xlist.rev_map lemmas load_lemmas_set))} | |
580 | - | F("negation",Symbol value) -> {arg with lex_negation = parse_negation value} | |
581 | - | F("degree",Symbol value) -> {arg with lex_degree = parse_grad value} | |
582 | - | F("number",Symbol value) -> {arg with lex_number = parse_number value} | |
583 | - | F("reflex",Binary true) -> {arg with lex_reflex = ReflTrue} | |
584 | - | F("reflex",Binary false) -> {arg with lex_reflex = ReflFalse} | |
585 | - | Fset("reflex",[]) -> {arg with lex_reflex = ReflEmpty} | |
586 | - | Fset("gender",[Symbol value]) -> {arg with lex_gender = parse_gender value} | |
587 | - | xml -> | |
588 | - Printf.printf "%s\n" (tei_to_string xml); | |
589 | - failwith "load_lex:\n " | |
590 | - | |
591 | -and load_phrase mode = function | |
592 | - | Fs("np",[F("case",Symbol a)]) -> Phrase (NP(parse_case a)); | |
593 | - | Fs("prepnp", [F("preposition",Symbol a);F("case",Symbol b)]) -> Phrase (PrepNP(a, parse_case b)) | |
594 | - | Fs("adjp", [F("case",Symbol a)]) -> Phrase (AdjP(parse_case a)) | |
595 | - | Fs("prepadjp", [F("preposition",Symbol a);F("case",Symbol b)]) -> Phrase (PrepAdjP(a, parse_case b)) | |
596 | - | Fs("comprepnp", [e;F("complex_preposition",TEIstring a)]) -> Phrase (ComprepNP(a)) | |
597 | - | Fs("comprepnp", [F("complex_preposition",TEIstring a)]) -> Phrase (ComprepNP(a)) | |
598 | - | Fs("cp", [a]) -> PhraseComp(Cp,load_ctype a) | |
599 | - | Fs("ncp", [F("case",Symbol a);b]) -> PhraseComp(Ncp(parse_case a),load_ctype b) | |
600 | - | Fs("prepncp", [F("preposition",Symbol a);F("case",Symbol b);c]) -> PhraseComp(Prepncp(a, parse_case b),load_ctype c) | |
601 | - | Fs("infp", [F("aspect",Symbol a)]) -> Phrase (InfP(parse_aspect a)) | |
602 | - | Fs("xp", [a]) -> let x,y = load_category a in mode:=x :: !mode; PhraseAbbr(Xp x,y) | |
603 | - | Fs("xp", [e;a]) -> let x,y = load_category a in mode:=x :: !mode; PhraseAbbr(Xp x,y) | |
604 | - | Fs("advp", [F("category",Symbol a)]) -> mode:=a :: !mode; Phrase(AdvP(a)) | |
605 | - | Fs("advp", [e;F("category",Symbol a)]) -> mode:=a :: !mode; Phrase(AdvP(a)) | |
606 | - | Fs("nonch", []) -> mode:="nonch" :: !mode; PhraseAbbr(Nonch,[]) | |
607 | - | Fs("or", []) -> Phrase Or | |
608 | - | Fs("refl", []) -> mode:="refl" :: !mode; LexPhrase([QUB,Lexeme "się"],(Natr,[])) | |
609 | - | Fs("E", []) -> E Null | |
610 | - | Fs("lex", x) -> | |
611 | - let lex = Xlist.fold x empty_lex load_lex in | |
612 | - mode := lex.lex_mode @ !mode; | |
613 | - process_lex lex (lex.lex_argument,lex.lex_arguments,lex.lex_lemma,lex.lex_numeral_lemma) | |
614 | - | Fs("fixed", [F("argument",a);F("string",TEIstring b)]) -> Phrase (FixedP((*snd (load_phrase a),*)b)) | |
615 | - | Fs("possp", [e]) -> mode:="possp" :: !mode; PhraseAbbr(Possp,[]) | |
616 | - | Fs("possp", []) -> mode:="possp" :: !mode; PhraseAbbr(Possp,[]) | |
617 | - | Fs("recip", []) -> mode:="recip" :: !mode; LexPhrase([QUB,Lexeme "się"],(Natr,[])) | |
618 | - | Fs("distrp", [e]) -> mode:="distrp" :: !mode; PhraseAbbr(Distrp,[]) | |
619 | - | Fs("distrp", []) -> mode:="distrp" :: !mode; PhraseAbbr(Distrp,[]) | |
620 | - | Fs("compar", [F("compar_category",Symbol value)]) -> Phrase(ComparP value) | |
621 | - | Fs("gerp", [F("case",Symbol a)]) -> Phrase (GerP(parse_case a)) | |
622 | - | Fs("prepgerp", [F("preposition",Symbol a);F("case",Symbol b)]) -> Phrase (PrepGerP(a, parse_case b)) | |
623 | - | Fs("nump", [F("case",Symbol a)]) -> Phrase (NumP(parse_case a)) | |
624 | - | Fs("prepnump", [F("preposition",Symbol a);F("case",Symbol b)]) -> Phrase (PrepNumP(a, parse_case b)) | |
625 | - | Fs("ppasp", [F("case",Symbol a)]) -> Phrase (PpasP(parse_case a)) | |
626 | - | Fs("prepppasp", [F("preposition",Symbol a);F("case",Symbol b)]) -> Phrase (PrepPpasP(a, parse_case b)) | |
627 | - | Fs("qub", []) -> Phrase Qub | |
628 | - | Fs("pactp", [F("case",Symbol a)]) -> Phrase (PactP(parse_case a)) | |
629 | - | Fs("adverb",[F("adverb",Symbol s)]) -> LexPhrase([ADV (Grad "pos"),Lexeme s],(Natr,[])) | |
630 | - | xml -> failwith ("load_phrase match:\n " ^ tei_to_string xml) | |
631 | - | |
632 | -and load_phrase_id ent sch psn phrases mode = function | |
633 | - | Fs(morf,Id{hash=false; numbers=[ent_id;sch_id;psn_id;id]; suffix="phr"} :: l) -> | |
634 | - if ent_id = ent && sch_id = sch && psn_id = psn then | |
635 | - let morf = load_phrase mode (Fs(morf, l)) in | |
636 | - phrases := IntMap.add_inc (!phrases) id morf (fun morf2 -> if morf = morf2 then morf else failwith "load_phrase_id"); | |
637 | - MorfId id | |
638 | - else failwith (Printf.sprintf "load_phrase %d %d" ent ent_id) | |
639 | - | Fs(morf, l) -> load_phrase mode (Fs(morf, l)) | |
640 | - | _ -> failwith "load_phrase_id" | |
641 | - | |
642 | - | |
643 | -and load_control arg = function | |
644 | - | Symbol value -> parse_control arg value | |
645 | - | xml -> failwith ("load_control:\n " ^ tei_to_string xml) | |
646 | - | |
647 | -and load_position_info ent sch phrases arg = function | |
648 | - | F("function",Symbol value) -> {arg with gf = parse_gf value} | |
649 | - | Fset("phrases",phrases_set) -> | |
650 | - let mode = ref [] in | |
651 | - let morfs = List.rev (Xlist.rev_map phrases_set (load_phrase_id ent sch arg.psn_id phrases mode)) in | |
652 | - {arg with morfs = morfs; mode = StringSet.to_list (StringSet.of_list (!mode))} | |
653 | - | Fset("control",control_set) -> Xlist.fold control_set arg load_control | |
654 | - | Id{hash=false; numbers=[ent_id;sch_id;id]; suffix="psn"} -> | |
655 | - if ent_id = ent && sch_id = sch then {arg with psn_id = id} | |
656 | - else failwith (Printf.sprintf "load_position_info %d %d" ent ent_id) | |
657 | - | xml -> failwith ("load_position_info:\n " ^ tei_to_string xml) | |
658 | - | |
659 | -and load_position ent sch phrases = function | |
660 | - | Fs("position", listt) -> | |
661 | - Xlist.fold listt empty_position (load_position_info ent sch phrases) | |
662 | - | xml -> failwith ("load_position:\n " ^ tei_to_string xml) | |
663 | - | |
664 | -let parse_opinion = function | |
665 | - "cer" -> Pewny | |
666 | - | "col" -> Potoczny | |
667 | - | "unc" -> Watpliwy | |
668 | - | "dat" -> Archaiczny | |
669 | - | "bad" -> Zly | |
670 | - | "vul" -> Wulgarny | |
671 | - | "unk" -> Nieokreslony | |
672 | - | "met" -> Metaforyczny | |
673 | - | "dom" -> Dziedzinowy | |
674 | - | "rar" -> Sporadyczny | |
675 | - | "wątpliwy" -> Watpliwy | |
676 | - | "dobry" -> Pewny | |
677 | - | "zły" -> Zly | |
678 | - | x -> failwith ("parse_opinion: " ^ x) | |
679 | - | |
680 | -let load_schema_info ent phrases (arg:schema) = function | |
681 | - | F("opinion",Symbol opinion_value) -> {arg with opinion = parse_opinion opinion_value} | |
682 | - | F("inherent_sie",Binary b) -> {arg with reflexiveMark = b} | |
683 | - | F("aspect",Symbol aspect_value) -> {arg with aspect = parse_aspect aspect_value} | |
684 | - | Fset("aspect", []) -> arg | |
685 | - | F("negativity",Symbol negativity_value) -> {arg with negativity = parse_negation negativity_value} | |
686 | - | Fset("negativity",[]) -> arg | |
687 | - | F("predicativity",Binary true) -> {arg with predicativity = PredTrue} | |
688 | - | F("predicativity",Binary false) -> {arg with predicativity = PredFalse} | |
689 | - | Fset("positions", positions) -> | |
690 | - {arg with positions = List.rev (Xlist.rev_map positions (load_position ent arg.sch_id phrases))} | |
691 | - | F("text_rep",TEIstring text_rep) -> {arg with text_rep = text_rep} | |
692 | - | Id{hash=false; numbers=[ent_id;id]; suffix="sch"} -> if ent_id = ent then {arg with sch_id = id} else failwith (Printf.sprintf "load_schema_info %d %d" ent ent_id) | |
693 | - | xml -> failwith ("load_schema_info\n " ^ tei_to_string xml) | |
694 | - | |
695 | -let load_schema ent phrases = function | |
696 | - Fs("schema", schema) -> | |
697 | - let result = {sch_id = (-1); opinion = OpinionUndef; reflexiveMark = false; aspect = AspectUndef; | |
698 | - negativity = NegationUndef; predicativity = PredUndef; positions = []; text_rep=""} in | |
699 | - let result = Xlist.fold schema result (load_schema_info ent phrases) in | |
700 | - result | |
701 | - | xml -> failwith ("load_schema:\n " ^ tei_to_string xml) | |
702 | - | |
703 | -let load_phrases_set ent = function | |
704 | - | SameAs({hash=true; numbers=[ent_id;sch_id;psn_id;phr_id]; suffix="phr"},"phrase") -> | |
705 | - if ent_id <> ent then failwith (Printf.sprintf "load_phrases_set %d %d" ent ent_id) else | |
706 | - sch_id,psn_id,phr_id | |
707 | - | xml -> failwith ("load_phrases_set :\n " ^ tei_to_string xml) | |
708 | - | |
709 | -let load_example_info ent arg = function | |
710 | - | F("meaning",SameAs({hash=true; numbers=[ent_id;id]; suffix="mng"},"lexical_unit")) -> | |
711 | - if ent_id = ent then {arg with meaning = id} else failwith (Printf.sprintf "load_example_info %d %d" ent ent_id) | |
712 | - | Fset("phrases",phrases_set) -> | |
713 | - {arg with phrases = List.rev (Xlist.rev_map phrases_set (load_phrases_set ent))} | |
714 | - | F("sentence",TEIstring sentence_string) -> {arg with sentence = sentence_string} | |
715 | - | F("source",Symbol source_value) -> {arg with source = source_value} | |
716 | - | F("opinion",Symbol opinion_value) -> {arg with opinion = parse_opinion opinion_value} | |
717 | - | F("note",TEIstring note_string) -> {arg with note = note_string} | |
718 | - | Id{hash=false; numbers=[ent_id;id]; suffix="exm"} -> if ent_id = ent then {arg with exm_id = id} else failwith (Printf.sprintf "load_example_info %d %d" ent ent_id) | |
719 | - | xml -> failwith ("load_example_info: \n " ^ tei_to_string xml) | |
720 | - | |
(** [load_example ent tei] reads one example node.

    For [Fs("example", features)] it folds [load_example_info ent] over
    [features], starting from a record with placeholder values
    ([exm_id = -1], [meaning = -1], empty strings, no phrases,
    [OpinionUndef]).

    @raise Failure for any other node. *)
let load_example ent tei =
  match tei with
  | Fs ("example", features) ->
    let blank = {
      exm_id = (-1); meaning = (-1); phrases = []; sentence = "";
      source = ""; opinion = OpinionUndef; note = "";
    } in
    Xlist.fold features blank (load_example_info ent)
  | other -> failwith ("load_example: \n " ^ tei_to_string other)
728 | - | |
(** [load_self_prefs_sets name ent frm tei] converts one member of the
    feature set called [name] into a selectional-preference value.

    - [Numeric v] is accepted only when [name = "synsets"] and gives
      [SynsetId v];
    - [Symbol v] is accepted only when [name = "predefs"] and gives
      [Predef v];
    - a ["relation"] structure is accepted only when [name = "relations"]
      and its target identifier starts with [ent; frm]; it gives
      [RelationArgId (type, arg_id)].

    @raise Failure in every other case. *)
let load_self_prefs_sets name ent frm tei =
  match name, tei with
  | "synsets", Numeric v -> SynsetId v
  | "predefs", Symbol v -> Predef v
  | _, (Numeric _ | Symbol _) -> failwith "load_self_prefs_sets"
  | _, Fs ("relation", [
      F ("type", Symbol rel);
      F ("to", SameAs ({hash=true; numbers=[ent_id; frm_id; arg_id]; suffix="arg"}, "argument"))]) ->
    if ent_id = ent && frm_id = frm && name = "relations"
    then RelationArgId (rel, arg_id)
    else failwith (Printf.sprintf "load_self_prefs_sets %d %d" ent ent_id)
  | _, other -> failwith ("load_self_prefs_sets: \n " ^ tei_to_string other)
736 | - | |
(** [load_argument_self_prefs ent frm tei] converts every member of an
    [Fset(name, members)] node with [load_self_prefs_sets name ent frm],
    keeping the members' order.

    @raise Failure for any node that is not an [Fset]. *)
let load_argument_self_prefs ent frm tei =
  match tei with
  | Fset (name, members) ->
    Xlist.rev_map members (load_self_prefs_sets name ent frm) |> List.rev
  | other -> failwith ("load_argument_self_prefs: \n " ^ tei_to_string other)
741 | - | |
(** [load_argument_info ent frm arg tei] is one fold step that copies a
    single feature of a frame argument into the record [arg].

    Handled features: ["role"], ["role_attribute"], ["sel_prefs"] (the
    groups are loaded with [load_argument_self_prefs] and concatenated in
    order) and the argument's own [Id] (suffix ["arg"]).

    @raise Failure when the identifier does not start with [ent; frm], or
    when the node matches none of the handled shapes. *)
let load_argument_info ent frm arg tei =
  match tei with
  | F ("role", Symbol r) -> {arg with role = r}
  | F ("role_attribute", Symbol attr) -> {arg with role_attribute = attr}
  | F ("sel_prefs", Fs ("sel_prefs_groups", groups)) ->
    let prefs =
      Xlist.rev_map groups (load_argument_self_prefs ent frm)
      |> List.rev
      |> List.flatten in
    {arg with sel_prefs = prefs}
  (* | Id id -> {arg with arg_id = id} *)
  | Id {hash=false; numbers=[ent_id; frm_id; id]; suffix="arg"} ->
    if ent_id <> ent || frm_id <> frm
    then failwith (Printf.sprintf "load_argument_info %d %d" ent ent_id)
    else {arg with arg_id = id}
  | other -> failwith ("load_argument_info :\n " ^ tei_to_string other)
752 | - | |
(** [load_arguments_set ent frm tei] reads one argument node.

    For [Fs("argument", features)] it folds [load_argument_info ent frm]
    over [features], starting from a record with [arg_id = -1], empty role
    strings and no selectional preferences.

    @raise Failure for any other node. *)
let load_arguments_set ent frm tei =
  match tei with
  | Fs ("argument", features) ->
    let blank = {arg_id = (-1); role = ""; role_attribute = ""; sel_prefs = []} in
    Xlist.fold features blank (load_argument_info ent frm)
  | other -> failwith ("load_arguments_set :\n " ^ tei_to_string other)
759 | - | |
(** [load_meanings_set ent tei] decodes a reference to a lexical unit.

    Accepts a [SameAs] node of kind ["lexical_unit"] whose identifier has
    [hash = true], suffix ["mng"] and exactly two numbers, and returns the
    second number.

    @raise Failure when the first number differs from [ent], or when the
    node has any other shape. *)
let load_meanings_set ent tei =
  match tei with
  | SameAs ({hash=true; numbers=[ent_id; mng]; suffix="mng"}, "lexical_unit") ->
    if ent_id <> ent
    then failwith (Printf.sprintf "load_meanings_set %d %d" ent ent_id)
    else mng
  | other -> failwith ("load_meanings_set :\n " ^ tei_to_string other)
764 | - | |
(** [load_frame ent tei] reads one frame node.

    The node must be [Fs("frame", _)] with exactly four children in this
    order: the frame [Id] (suffix ["frm"]), the ["opinion"] symbol, the
    ["meanings"] set and the ["arguments"] set. Meanings are loaded with
    [load_meanings_set ent], arguments with [load_arguments_set ent id]
    where [id] is the frame's own number; both keep their input order.

    @raise Failure when the identifier's first number differs from [ent],
    or when the node has any other shape. *)
let load_frame ent tei =
  match tei with
  | Fs ("frame", [
      Id {hash=false; numbers=[ent_id; id]; suffix="frm"};
      F ("opinion", Symbol op);
      Fset ("meanings", meaning_refs);
      Fset ("arguments", argument_nodes)]) ->
    if ent_id = ent then
      {frm_id = id;
       opinion = parse_opinion op;
       meanings = Xlist.rev_map meaning_refs (load_meanings_set ent) |> List.rev;
       arguments = Xlist.rev_map argument_nodes (load_arguments_set ent id) |> List.rev}
    else failwith (Printf.sprintf "load_frame %d %d" ent ent_id)
  | other -> failwith ("load_frame :\n " ^ tei_to_string other)
777 | - | |
(** [load_meaning_info ent arg tei] is one fold step that copies a single
    feature of a lexical unit into the record [arg].

    Handled features: ["name"], ["variant"], ["plwnluid"], ["gloss"] and
    the unit's own [Id] (suffix ["mng"]).

    @raise Failure when the identifier's first number differs from [ent],
    or when the node matches none of the handled shapes. *)
let load_meaning_info ent arg tei =
  match tei with
  | F ("name", TEIstring s) -> {arg with name = s}
  | F ("variant", TEIstring s) -> {arg with variant = s}
  | F ("plwnluid", Numeric n) -> {arg with plwnluid = n}
  | F ("gloss", TEIstring s) -> {arg with gloss = s}
  | Id {hash=false; numbers=[ent_id; mng]; suffix="mng"} ->
    if ent_id <> ent
    then failwith (Printf.sprintf "load_meaning_info %d %d" ent ent_id)
    else {arg with mng_id = mng}
  | other -> failwith ("load_meaning_info:\n " ^ tei_to_string other)
785 | - | |
786 | - | |
(** [load_meaning ent tei] reads one lexical-unit node by folding
    [load_meaning_info ent] over the features of [Fs("lexical_unit", _)],
    starting from [empty_meaning].

    @raise Failure for any other node. *)
let load_meaning ent tei =
  match tei with
  | Fs ("lexical_unit", features) ->
    let step = load_meaning_info ent in
    Xlist.fold features empty_meaning step
  | other -> failwith ("load_meaning:\n " ^ tei_to_string other)
791 | - | |
(** [load_alter_connection ent tei] reads one connection between a frame
    argument and a set of phrases.

    Returns [(sch_id, frm_id, connection)]: [frm_id] and the connection's
    [argument] come from the argument reference; [sch_id] is the single
    schema number shared by all phrase references; the connection's
    [phrases] pairs each position number with the phrase numbers collected
    for it through [IntMap.add_inc].

    @raise Failure when the argument reference does not start with [ent],
    when the phrase references do not name exactly one schema, or when the
    node has any other shape. *)
let load_alter_connection ent tei =
  match tei with
  | Fs ("connection", [
      F ("argument", SameAs ({hash=true; numbers=[ent_id; frm_id; arg_id]; suffix="arg"}, "argument"));
      Fset ("phrases", refs)]) ->
    if ent_id <> ent then failwith (Printf.sprintf "load_alter_connection %d %d" ent ent_id) else
    (* One fold step: file the phrase under its position and remember
       which schema it names. *)
    let collect (by_position, schemata) phrase_ref =
      let sch_id, psn_id, phr_id = load_phrases_set ent phrase_ref in
      IntMap.add_inc by_position psn_id [phr_id] (fun l -> phr_id :: l),
      IntSet.add schemata sch_id in
    let by_position, schemata =
      Xlist.fold refs (IntMap.empty, IntSet.empty) collect in
    let schema_count = IntSet.size schemata in
    if schema_count <> 1 then failwith (Printf.sprintf "load_alter_connection: |sch_set|=%d" schema_count) else
    let grouped =
      IntMap.fold by_position [] (fun acc psn phrs -> (psn, phrs) :: acc) in
    IntSet.min_elt schemata, frm_id, {argument = arg_id; phrases = grouped}
  | other -> failwith ("load_alter_connections: \n " ^ tei_to_string other)
805 | - | |
(** [load_alternations ent tei] reads one alternation node.

    Every member of the ["connections"] set is loaded with
    [load_alter_connection ent]. All connections must name the same schema
    and the same frame; those two numbers become the [schema] and [frame]
    fields, and the connections are stored in their input order.

    @raise Failure when the connections name zero or several schemata or
    frames, or when the node is not
    [Fs("alternation", [Fset("connections", _)])]. *)
let load_alternations ent = function
  | Fs("alternation",[Fset("connections",connections_set)]) ->
    let conns,sch_set,frm_set = Xlist.fold connections_set ([],IntSet.empty,IntSet.empty) (fun (conns,sch_set,frm_set) conn ->
      let sch_id,frm_id,conn = load_alter_connection ent conn in
      conn :: conns, IntSet.add sch_set sch_id, IntSet.add frm_set frm_id) in
    if IntSet.size sch_set <> 1 then failwith (Printf.sprintf "load_alternations: |sch_set|=%d" (IntSet.size sch_set)) else
    (* Report the size of frm_set itself: sch_set is known to have size 1
       here, so printing it would hide the actual number of frames. *)
    if IntSet.size frm_set <> 1 then failwith (Printf.sprintf "load_alternations: |frm_set|=%d" (IntSet.size frm_set)) else
    {schema=IntSet.min_elt sch_set; frame=IntSet.min_elt frm_set; connections=List.rev conns}
  | xml -> failwith ("load_alternations: \n " ^ tei_to_string xml)
815 | - | |
(** [load_entry phrases xml] converts one [<entry xml:id="...">] element
    into an entry record.

    The [xml:id] attribute must parse (via [parse_id]) to an identifier
    with [hash = false], suffix ["ent"] and a single number, which becomes
    [ent_id]. The children are then folded into [empty_entry]: a [<form>]
    child fills [form_orth] and [form_pos]; every other child is parsed
    with [parse_tei] and must be one of the layers ["syntactic_layer"],
    ["examples_layer"], ["semantic_layer"], ["meanings_layer"],
    ["connections_layer"] or ["general_info"].
    [phrases] is only forwarded to [load_schema].

    @raise Failure when the identifier, a child or the element itself has
    an unexpected shape. *)
let load_entry phrases = function
  | Xml.Element("entry",["xml:id",id], l) ->
    (* print_endline id; *)
    let ent = match parse_id id with
        {hash=false; numbers=[n]; suffix="ent"} -> n
      (* The message names this function and the offending id. *)
      | _ -> failwith ("load_entry: unexpected entry id " ^ id) in
    let entry = {empty_entry with ent_id = ent} in
    Xlist.fold l entry (fun e -> function
        Xml.Element("form", [], [
          Xml.Element("orth",[],[Xml.PCData orth]);
          Xml.Element("pos",[],[Xml.PCData pos])]) -> (*print_endline orth;*) {e with form_orth=orth; form_pos=pos}
      | xml -> (match parse_tei xml with
          | Fs("syntactic_layer", [Fset("schemata",schemata_set)]) -> {e with schemata = List.rev (Xlist.rev_map schemata_set (load_schema ent phrases))}
          | Fs("examples_layer", [Fset("examples",examples_set)]) -> {e with examples = List.rev (Xlist.rev_map examples_set (load_example ent))}
          | Fs("semantic_layer", [Fset("frames",frame_set)]) -> {e with frames = List.rev (Xlist.rev_map frame_set (load_frame ent))}
          | Fs("meanings_layer", [Fset("meanings",meanings_set)]) -> {e with meanings = List.rev (Xlist.rev_map meanings_set (load_meaning ent))}
          | Fs("connections_layer",[Fset("alternations",alternations)]) -> {e with alternations = List.rev (Xlist.rev_map alternations (load_alternations ent))}
          | Fs("general_info",[F("status",TEIstring status)]) -> {e with status=status}
          | tei -> failwith ("load_entry: \n" ^ tei_to_string tei)))
  | xml -> failwith ("load_entry: \n" ^ Xml.to_string_fmt xml)
836 | - | |
(** [load_walenty filename] parses the XML file [filename] and loads every
    entry found under [TEI/text/body] with [load_entry].

    Returns the entries in document order together with the final content
    of the [phrases] reference that was handed to [load_entry] (it starts
    as [IntMap.empty]).

    @raise Failure ["load_walenty"] when the document does not consist of a
    [teiHeader] followed by [text/body]. *)
let load_walenty filename =
  match Xml.parse_file filename with
  | Xml.Element ("TEI", _, [
      Xml.Element ("teiHeader", _, _);
      Xml.Element ("text", [], [Xml.Element ("body", [], entries)])]) ->
    let phrases = ref IntMap.empty in
    (* The entries must be loaded before [phrases] is dereferenced. *)
    let walenty = Xlist.rev_map entries (load_entry phrases) |> List.rev in
    walenty, !phrases
  | _ -> failwith "load_walenty"
848 | - | |
(** [correct_expansion positions] rewrites a two-position expansion into a
    list of phrases. Only two shapes are accepted:

    - a fixed phrase followed by any position: a single [LexPhrase] for the
      fixed word, with the second position attached under [Ratr];
    - a bare (["Natr"]) lexical phrase for ["własny"] followed by a position
      with exactly two morfs: those two morfs, then the ["własny"] phrase
      with the second position attached under [Atr].

    @raise Failure ["correct_expansion"] for any other input. *)
let correct_expansion positions =
  match positions with
  | [{gf=ARG; cr=[]; ce=[]; morfs=[Phrase (FixedP word)]}; dependent] ->
    [LexPhrase ([FIXED, Lexeme word], (Ratr, [dependent]))]
  | [{gf=ARG; cr=[]; ce=[]; morfs=[LexPhrase ([pos, Lexeme "własny"], (Natr, []))]};
     ({morfs=[first; second]} as dependent)] ->
    [first; second; LexPhrase ([pos, Lexeme "własny"], (Atr, [dependent]))]
  | _ -> failwith "correct_expansion"
854 | - | |
(** [load_expansion tei] reads one ["expansion"] node and returns the
    phrases it expands to. The ["opinion"] symbol is matched but not used.

    - With a one-element ["phrases"] set, the phrase is loaded with
      [load_phrase] and returned as a singleton list.
    - With a ["positions"] set, the positions are loaded in order with
      [load_position (-1) (-1)] and passed to [correct_expansion].

    @raise Failure for any other node. *)
let load_expansion tei =
  match tei with
  | Fs ("expansion", [F ("opinion", Symbol _); Fset ("phrases", [p])]) ->
    [load_phrase (ref []) p]
  | Fs ("expansion", [F ("opinion", Symbol _); Fset ("positions", set)]) ->
    Xlist.rev_map set (load_position (-1) (-1) (ref IntMap.empty))
    |> List.rev
    |> correct_expansion
  | other -> failwith ("load_expansion: \n" ^ tei_to_string other)
859 | - | |
(** [load_rentry xml] converts one [<entry xml:id="...">] element holding
    exactly two children (a phrase and its expansions) into
    [(id, morf, expansions)].

    The [xml:id] attribute must parse (via [parse_id]) to an identifier
    with [hash = false], suffix ["exp"] and a single number. The first
    child is loaded with [load_phrase]; the second must be a
    ["phrase_type_expansions"] structure whose ["expansions"] feature is
    either a set (each member is loaded with [load_expansion] and the
    results are concatenated in order) or a single expansion.

    @raise Failure when the identifier, the expansions structure or the
    element itself has an unexpected shape. *)
let load_rentry = function
  | Xml.Element("entry",["xml:id",id], [phrase;exp]) ->
    let exp_id = match parse_id id with
        {hash=false; numbers=[n]; suffix="exp"} -> n
      (* The messages below name this function rather than "process_meanings"
         or "load_entry", so a failure can be traced to the right loader. *)
      | _ -> failwith ("load_rentry: unexpected entry id " ^ id) in
    let morf = load_phrase (ref []) (parse_tei phrase) in
    let expansions = match parse_tei exp with
      | Fs("phrase_type_expansions", [Fset("expansions",expansions)]) -> List.flatten (List.rev (Xlist.rev_map expansions load_expansion))
      | Fs("phrase_type_expansions", [F("expansions",expansion)]) -> load_expansion expansion
      | tei -> failwith ("load_rentry: \n" ^ tei_to_string tei) in
    exp_id,morf,expansions
  | xml -> failwith ("load_rentry: \n" ^ Xml.to_string_fmt xml)
872 | - | |
(** Hand-written expansion entries that [load_expands] puts in front of the
    ones read from file. They use the negative ids [-2] and [-3]. Each
    expansion is a one-word [LexPhrase] with no dependents ([Natr, []]). *)
let expands_supplement =
  (* A one-word lexical phrase without dependents. *)
  let bare pos lemma = LexPhrase ([pos, Lexeme lemma], (Natr, [])) in
  let noun = bare (SUBST (NumberUndef, Str)) in
  let adverb = bare (ADV (Grad "pos")) in
  [ (-2), PhraseAbbr (Nonch, []), List.map noun ["co"; "coś"; "nic"; "to"];
    (-3), Phrase (AdvP "pron"), List.map adverb ["tak"; "jak"] ]
884 | - | |
(** [load_expands filename] parses the XML file [filename] and loads every
    entry found under [TEI/text/body] with [load_rentry]. The result is
    [expands_supplement] followed by the loaded entries in document order.

    @raise Failure ["load_expands"] when the document does not consist of a
    [teiHeader] followed by [text/body]. *)
let load_expands filename =
  match Xml.parse_file filename with
  | Xml.Element("TEI", _,
                [Xml.Element("teiHeader",_,_) ;
                 Xml.Element("text",[],[Xml.Element("body",[],entries)])]) ->
    expands_supplement @ List.rev (Xlist.rev_map entries load_rentry)
  (* Name this function in the message; it previously said "load_walenty". *)
  | _ -> failwith "load_expands"
894 | - | |
(** Association list from a subtype tag (["int"], ["rel"]) to the words
    listed for it. *)
let subtypes =
  let int_words = [
    "co"; "czemu"; "czy"; "czyj"; "dlaczego"; "dokąd"; "gdzie"; "ile";
    "jak"; "jaki"; "kiedy"; "kto"; "którędy"; "który"; "odkąd"; "skąd";
    "jakoby";
  ] in
  let rel_words = [
    "co"; "dokąd"; "gdzie"; "jak"; "jakby"; "jaki"; "jakoby"; "kiedy";
    "kto"; "którędy"; "który"; "odkąd"; "skąd";
  ] in
  [("int", int_words); ("rel", rel_words)]
902 | - | |
(** Association list from a word to the words listed as its equivalents. *)
let equivs = [
  ("jak", ["niczym"]);
  ("przeciw", ["przeciwko"]);
]
904 | -*) | |
... | ... |