sentenceCompare.ml
7.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
open PreTypes
let initSize = 16 (* Initial size of token array *)
let fileName = "fold_morph.txt" (* Name of the files to be parsed *)
let rec getSome (x : 'a option) : 'a =
match x with
| Some a -> a
| None -> failwith "Unexpected None\n"
(* Returns a list equal to l without its n first elements *)
let rec drop (l : 'a list) (n : int) : 'a list =
if n < 0 then failwith "Cannot drop a negative number of elements"
else if n = 0 then l
else match l with
| h::t -> drop t (n - 1)
| [] -> failwith "drop exceeds the list's length"
(* Returns a list of n first elements of l *)
let take (l : 'a list) (n : int) : 'a list =
if n < 0 then failwith "Cannot take a negative number of elements"
else let rec loop n l a =
if n = 0 then List.rev a
else match l with
| h::t -> loop (n - 1) t (h::a)
| [] -> failwith "take exceeds the list's length" in
loop n l []
(* Returns a sublist of l starting at start of length len *)
let sublist (l : 'a list) (start : int) (len : int) : 'a list =
take (drop l start) len
(* Returns a sublist of a unicode string s starting at start of length len *)
let unicodeSub (s : string) (start: int) (len : int) : string =
let is = Xunicode.unicode_chars_of_utf8_string s in
let subIs = sublist is start len in
let bs = List.map Xunicode.string_of_uchar subIs in
let ss = List.map Bytes.to_string bs in
String.concat "" ss
(* Determines the indentation of a line s. One level of indentation is marked with a single tab. Tabs are separated by spaces *)
let indent (s : string) : int =
if s = "" then 0
else let rec loop (s: string) (pos: int) : int =
if s.[pos] = '\t' then loop s (pos + 2)
else pos / 2 in
loop s 0
(* Skips to the next paragraph i.e. after running skipToParagraph the next line should be the text of a paragraph *)
let rec skipToParagraph (i: in_channel) : unit =
let s = input_line i in
if indent s <> 1 then skipToParagraph i
(* Returns a string equal to s without its pre first and post last elements *)
let cutString (s : string) (pre : int) (post : int) : string =
String.sub s pre (String.length s - pre - post)
(* Reads beg from a line in the following format:
<segId>; <beg>; <len>; <form>; <interp>; ;*)
let segBeg (s : string) : int =
int_of_string (List.nth (Str.split (Str.regexp "; ") s) 1)
(* Reads len from a line in the following format:
<segId>; <beg>; <len>; <form>; <interp>; ;*)
let segLen (s : string) : int =
int_of_string (List.nth (Str.split (Str.regexp "; ") s) 2)
(* Return the raw text of a sentence *)
let rec rawSentenceString (s : sentence) : string option =
match s with
| RawSentence rs -> Some rs
| StructSentence (_, _) -> None
| DepSentence _ -> None
| QuotedSentences _ -> None
| AltSentence asnt ->
let rec loop asnt =
match asnt with
| [] -> None
| (_, snt)::t ->
let s = rawSentenceString snt in
if s = None then loop t
else s in
loop asnt
(* Returns the raw text of a paragraph *)
let rec rawParagraphString (p : paragraph) : string option =
match p with
| RawParagraph rp -> Some rp
| StructParagraph _ -> None
| AltParagraph ap ->
let rec loop ap =
match ap with
| [] -> None
| (_, p)::t ->
let s = rawParagraphString p in
if s = None then loop t
else s in
loop ap
(* Return the raw text of a text *)
let rec rawTextString (t : text) : string option =
match t with
| RawText rt -> Some rt
| StructText (_, _) -> None
| AltText at ->
let rec loop at =
match at with
| [] -> None
| (_, t)::tail ->
let s = rawTextString t in
if s = None then loop tail
else s in
loop at
(* Returns a list of all the sentences in a paragraph *)
let rec paragraphSentences (p: paragraph) : sentence list =
match p with
| RawParagraph _ -> []
| StructParagraph prs -> List.map (fun pr -> pr.psentence) prs
| AltParagraph ap -> List.fold_left (fun ss (_, prg) -> ss@(paragraphSentences prg)) [] ap
(* Returns a list of all the sentences in a text *)
let rec textSentences (t: text) : sentence list =
match t with
| RawText _ -> []
| StructText (ps, _) -> List.fold_left (fun ss p -> ss@(paragraphSentences p)) [] ps
| AltText at -> List.fold_left (fun ss (_, txt) -> ss@(textSentences txt)) [] at
(* Reads a sentence from channel i assuming text is the raw text of the sentences paragraph *)
let input_sentence (i: in_channel) (text: string) : paragraph_record option =
let rec loop i l =
let s = input_line i in
if indent s = 3 then loop i ((cutString s 5 0)::l)
else l in
let s = input_line i in
if indent s <> 2 then None
else
(
let id = cutString s 3 1 in
let revSeg = loop i [] in
let lstBeg = segBeg (List.hd revSeg) in
let lstLen = segLen (List.hd revSeg) in
let seg = List.rev revSeg in
let fstBeg = segBeg (List.hd seg) in
let len = lstBeg + lstLen - fstBeg in
let rawSen = RawSentence (unicodeSub text fstBeg len) in
Some {pid=id; pbeg=fstBeg; plen=len; pnext=(-1); psentence=AltSentence[(Raw, rawSen)]; pfile_prefix=""}
)
(* Reads a paragraph from channel i *)
let input_paragraph (i: in_channel) : paragraph =
let text = cutString (input_line i) 0 1 in
let rec loop i l =
let senOpt = input_sentence i text in
match senOpt with
| Some sen -> loop i (sen::l)
| None -> List.rev l in
AltParagraph [Raw, RawParagraph text; Struct, StructParagraph (loop i [])]
(* Reads a text from channel i *)
let input_text (i: in_channel) : text =
let rec loop i l =
match
skipToParagraph i;
input_paragraph i
with
| p -> loop i (p::l)
| exception End_of_file -> List.rev l in
let emptyArray = ExtArray.make initSize empty_token in
let l = loop i [] in
let t = List.fold_left (fun t p -> t ^ (getSome (rawParagraphString p)) ^ " ") "" l in
AltText [Raw, RawText t; Struct, StructText (l, emptyArray)]
let rec string_of_sentence (s: paragraph_record) : string =
getSome (rawSentenceString s.psentence)
(* Alternative conversion showing the sentence structure *)
(*let rec tmp ps = match ps with
| RawSentence rs -> "Raw: " ^ rs ^ " "
| StructSentence (_, _) -> "Struct "
| DepSentence _ -> "Dep "
| QuotedSentences prs -> "Quoted: (" ^ (List.fold_left (fun a pr -> a ^ string_of_sentence pr) "" prs) ^ ") "
| AltSentence asnt -> List.fold_left (fun a (_, snt) -> a ^ (tmp snt)) "" asnt in
tmp s.psentence ^ "\n"*)
let rec string_of_paragraph (p: paragraph) : string =
match p with
| RawParagraph rp -> rp ^ "\n"
| StructParagraph sp -> List.fold_left (fun s sen -> s ^ (string_of_sentence sen) ^ "\n") "" sp
| AltParagraph ap -> List.fold_left (fun s (_, p) -> s ^ (string_of_paragraph p)) "" ap
let rec string_of_text (t: text) : string =
match t with
| RawText rt -> rt ^ "\n"
| StructText (ps, _) -> List.fold_left (fun s p -> s ^ (string_of_paragraph p) ^ "\n") "" ps
| AltText ap -> List.fold_left (fun s (_, t) -> s ^ (string_of_text t)) "" ap
let equalSentences (s0: sentence) (s1: sentence) : bool =
let str0 = getSome (rawSentenceString s0) in
let str1 = getSome (rawSentenceString s1) in
str0 = str1
(* Prints an error if l1 does not contain all the sentences in l0 *)
let rec subsentences (l0 : sentence list) (l1 : sentence list) : unit =
match l0 with
| [] -> ()
| h0::t0 -> match l1 with
| [] -> print_string ("MISSING SENTENCE:\n" ^ (getSome (rawSentenceString h0)) ^ "\n")
| h1::t1 ->
if equalSentences h0 h1 then subsentences t0 t1
else subsentences (h0::t0) t1
let compareFile (s: string) : unit =
let i = open_in s in
let t0 = input_text i in
let rt = (getSome (rawTextString t0)) in
try
let t1 = PreProcessing.parse_text (RawText rt) in
let s0 = textSentences t0 in
let s1 = textSentences t1 in
subsentences s0 s1
with
| Invalid_argument _ -> ()
| Failure msg -> print_string (msg ^ "\n");
close_in i
let rec compareDir (dir: string) : unit =
print_string (dir ^ "\n");
let process s =
let absS = (dir ^ "/" ^ s) in
if s = fileName then compareFile absS
else if Sys.is_directory absS then compareDir absS in
Array.iter process (Sys.readdir dir)
let _ =
if Array.length Sys.argv < 2 then print_string ("Please provide a directory containing " ^ fileName ^ " files.\n")
else compareDir Sys.argv.(1)