Commit bc99576298bc25ab6e1962b0e95aab6c70f494e8

Authored by Wojciech Jaworski
1 parent 4c6f5e84

rozpoczęcie dostosowywania preprocessingu w formacie conll

Showing 1 changed file with 20 additions and 20 deletions
pre/preProcessing.ml
... ... @@ -582,9 +582,7 @@ let parse query (*next_id*) =
582 582 let paths = PreLemmatization.combine_interps paths in
583 583 (* print_endline (PrePaths.to_string paths); *)*)
584 584  
585   -(* let parse_conll paths next_id =
586   - (* print_endline "a11"; *)
587   - let paths = PreMWE.process paths in
  585 +let parse_conll tokens paths next_id =
588 586 (* print_endline "a12"; *)
589 587 let paths = find_proper_names paths in
590 588 (* print_endline "a13"; *)
... ... @@ -608,7 +606,7 @@ let parse query (*next_id*) =
608 606 let paths = prepare_indexes paths in
609 607 let paths = PrePaths.sort paths in
610 608 (* print_endline "a18"; *)
611   - paths, next_id *)
  609 + paths, next_id
612 610  
613 611 let make_ids tokens paths =
614 612 Xlist.rev_map paths (fun t ->
... ... @@ -621,32 +619,34 @@ let make_chart paths last =
621 619 chart.(beg) <- (id,next) :: chart.(beg));
622 620 chart
623 621  
  (* split_into_sentences: turns the token paths of one paragraph into the
     value returned by PreSentences.extract_sentences.
     Steps, in order: assign ids with make_ids, re-index with
     PreSentences.prepare_indexes (which also yields `last`), build a chart
     with make_chart, then run find_sentence, find_quoted_sentences and
     find_query over that chart before extracting the sentences.
     The three find_* calls are sequenced with `;`, so only their effects
     matter here; presumably they update `chart` and/or `tokens` in place -
     TODO confirm against PreSentences.
     `par` is the paragraph as an array of UTF-8 characters with one empty
     string added at each end.
     NOTE(review): only `paths` is declared as a parameter, yet the body
     reads `tokens` and `paragraph`, and the visible call sites invoke this
     function as `split_into_sentences tokens paths`. Unless both names are
     bound at top level elsewhere, this definition does not type-check as
     shown; the parameter list probably needs `tokens` (and `paragraph`)
     added - confirm together with the callers. *)
  622 +let split_into_sentences paths =
  623 + let paths = make_ids tokens paths in
  624 + let paths,last = PreSentences.prepare_indexes paths in
  625 + let chart = make_chart paths last in
  626 + let par = Array.of_list ([""] @ Xunicode.utf8_chars_of_utf8_string paragraph @ [""]) in
  627 + PreSentences.find_sentence par tokens chart last;
  628 + PreSentences.find_quoted_sentences par tokens chart last;
  629 + PreSentences.find_query par tokens chart last;
  630 + PreSentences.extract_sentences tokens chart last
  631 +
(* parse_text: builds the structured representation of an input text.
   - RawText query: creates a token array (ExtArray.make 100 empty_token),
     adds one empty_token up front, splits the query on "\n", runs `parse`
     on each paragraph and segments the result with split_into_sentences.
     Each paragraph becomes AltParagraph[Raw ...; Struct ...]; the result is
     AltText[Raw ...; Struct, StructText(List.rev paragraphs, tokens)].
   - AltText[Raw, RawText query; CONLL, StructText(...)] (new side of this
     diff): parses `query`, segments it, and returns an AltText whose single
     paragraph carries the ENIAM sentences next to a CONLL paragraph that is
     rebuilt from the matched `dep_paths`.
     NOTE(review): this branch looks unfinished. `parse_conll tokens paths;`
     uses `paths` although the visible pattern binds only `dep_paths`,
     passes two arguments where parse_conll is declared with three
     (tokens paths next_id), and drops the result with `;`. Confirm the
     intended call before relying on this branch.
   - Any other input raises Failure "parse_text: not implemented". *)
624 632 let parse_text = function
625 633 RawText query ->
626 634 let tokens = ExtArray.make 100 empty_token in
627 635 let _ = ExtArray.add tokens empty_token in (* id=0 is reserved for pro; FIXME: is this still up to date? *)
628 636 let paragraphs = Xlist.map (Xstring.split "\n" query) (fun paragraph ->
629 637 let paths,_ = parse paragraph in
630   - let paths = make_ids tokens paths in
631   - let paths,last = PreSentences.prepare_indexes paths in
632   - let chart = make_chart paths last in
633   - let par = Array.of_list ([""] @ Xunicode.utf8_chars_of_utf8_string paragraph @ [""]) in
634   - PreSentences.find_sentence par tokens chart last;
635   - PreSentences.find_quoted_sentences par tokens chart last;
636   - PreSentences.find_query par tokens chart last;
637   - let sentences = PreSentences.extract_sentences tokens chart last in
  638 + let sentences = split_into_sentences tokens paths in
638 639 AltParagraph[Raw,RawParagraph paragraph; Struct,StructParagraph sentences]) in
639 640 AltText[Raw,RawText query; Struct,StructText(List.rev paragraphs, tokens)]
640   -(* | AltText[Raw,RawText query;CONLL,StructText([
641   - StructParagraph[{psentence = AltSentence[Raw, RawSentence text; CONLL, StructSentence(_,paths,last)]} as p]],_)] ->
642   - let (cpaths,clast), next_id = parse_conll (paths,last) first_id in
643   - let (paths,last), next_id = parse query next_id in
644   - let sentences, next_id = split_into_sentences query paths last next_id in
  641 + | AltText[Raw,RawText query;CONLL,StructText([
  642 + StructParagraph[{psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence(_,dep_paths)]} as p]],tokens)] ->
  643 + parse_conll tokens paths;
  644 + let paths,_ = parse query in
  645 + let sentences = split_into_sentences tokens paths in
645 646 let conll = StructParagraph[{p with psentence = AltSentence[Raw, RawSentence text;
646   - Mate, StructSentence("M",cpaths,clast); CONLL, StructSentence("C",cpaths,clast)]}] in
647   - (* print_endline "parse_text 6"; *)
  647 + Mate, StructSentence("M",dep_paths); CONLL, StructSentence("C",dep_paths)]}] in
648 648 AltText[Raw,RawText query; Struct, StructText([
649   - AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]],next_id)]*)
  649 + AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]],tokens)]
650 650 | _ -> failwith "parse_text: not implemented"
651 651  
652 652 let rec main_loop in_chan out_chan =
... ...