ConvertTiger2CoNLL.java
2.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
/**
*
*/
package is2.util;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.StringTokenizer;
/**
* @author Dr. Bernd Bohnet, 17.01.2010
*
* This class converts a treebank file whose sentences are delimited by
* #BOS / #EOS lines (the TIGER export layout, judging by the class name)
* into a tab-separated CoNLL-style file with one token per line.
*/
public class ConvertTiger2CoNLL {

    /**
     * Command-line entry point. Parses the arguments with {@code OptionsSuper}
     * and, if a train file was given, converts it and reports the number of
     * sentences written on standard error.
     *
     * @param args command-line arguments, handed to {@code OptionsSuper}
     * @throws IOException if the input or output file cannot be opened
     */
    public static void main(String[] args) throws IOException {
        OptionsSuper options = new OptionsSuper(args, null);
        if (options.trainfile != null) {
            System.err.println(
                    "included sentences " + clean(options.trainfile, options.outfile, options.start, options.count));
        } else {
            System.err.println("Please proivde the file name -train <file-name>");
        }
    }

    /**
     * Converts the sentences of {@code file} whose 1-based sentence number lies
     * in {@code [start, start + numberOfSentences)} and writes them to
     * {@code outFile}. Both files are read/written as UTF-8.
     *
     * @param file              path of the input file
     * @param outFile           path of the output file (overwritten)
     * @param start             number of the first sentence to include; the
     *                          first {@code #BOS} line in the file is sentence 1
     * @param numberOfSentences size of the sentence window
     * @return the number of sentences written, or -1 if an I/O error occurred
     *         while converting (the stack trace is printed in that case)
     * @throws IOException if the input or output file cannot be opened
     */
    private static int clean(String file, String outFile, int start, int numberOfSentences) throws IOException {
        // Widen to long: with a very large sentence count the int sum would
        // wrap around to a negative bound and exclude every sentence.
        final long end = (long) start + numberOfSentences;

        System.err.println("writting to " + outFile);
        System.err.println("start " + start + " to " + end);

        BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"), 32768);
        try {
            BufferedWriter writer = new BufferedWriter(
                    new java.io.OutputStreamWriter(new java.io.FileOutputStream(outFile), "UTF-8"), 32768);
            try {
                int cnt = convert(reader, writer, start, end);
                // Flush and close inside the try so that a failure to persist
                // the output is still reported as -1.
                writer.flush();
                writer.close();
                reader.close();
                return cnt;
            } catch (IOException e) {
                e.printStackTrace();
                return -1;
            } finally {
                // No-op after a successful close; releases the file on error.
                closeQuietly(writer);
            }
        } finally {
            closeQuietly(reader);
        }
    }

    /**
     * Reads the input line by line and writes every token line of the
     * sentences inside the window; a blank line is written after each
     * converted sentence.
     *
     * @return the number of sentences written
     */
    private static int convert(BufferedReader reader, BufferedWriter writer, int start, long end)
            throws IOException {
        boolean inSentence = false; // true between the #BOS and #EOS of a sentence inside the window
        int id = 1;                 // token id within the current sentence
        int snt = 0;                // number of #BOS lines seen so far
        int cnt = 0;                // number of sentences written

        String line;
        while ((line = reader.readLine()) != null) {
            if (line.startsWith("#BOS")) {
                snt++;
                if (snt >= end) {
                    break; // past the window, nothing further can be included
                }
                // Decide here, before any #EOS is seen, whether the sentence
                // is wanted, so an empty out-of-window sentence is not counted.
                inSentence = snt >= start;
                id = 1;
                continue;
            }
            if (!inSentence) {
                continue; // header lines, skipped sentences, text between sentences
            }
            if (line.startsWith("#EOS")) {
                inSentence = false;
                cnt++;
                writer.newLine();
                continue;
            }
            if (line.startsWith("#5") || line.startsWith("#6") || line.startsWith("#7")) {
                // NOTE(review): presumably the non-terminal node lines of the
                // export format; they get no output row and consume no token id.
                continue;
            }
            writeToken(writer, id, line);
            id++;
        }
        return cnt;
    }

    /**
     * Writes one output row: the token id, then the first four non-empty
     * tab-separated fields of {@code line} (a {@code _} placeholder column
     * follows the 2nd, 3rd and 4th field, and {@code .} is replaced by
     * {@code |} in the 4th), then nine {@code _} placeholder columns.
     * Fields beyond the fourth are dropped.
     */
    private static void writeToken(BufferedWriter writer, int id, String line) throws IOException {
        writer.write("" + id + "\t");
        // StringTokenizer skips runs of consecutive tabs, so empty fields
        // need no separate collapsing step.
        StringTokenizer fields = new StringTokenizer(line, "\t");
        for (int column = 0; fields.hasMoreTokens(); column++) {
            String field = fields.nextToken();
            switch (column) {
            case 0:
                writer.write(field + "\t");
                break;
            case 1:
            case 2:
                writer.write(field + "\t_\t");
                break;
            case 3:
                writer.write(field.replace(".", "|") + "\t_\t");
                break;
            default:
                break; // remaining input fields are not copied
            }
        }
        writer.write("_\t_\t_\t_\t_\t_\t_\t_\t_");
        writer.newLine();
    }

    /**
     * Closes {@code resource}; a failure is printed but not propagated so
     * that it cannot replace the result or exception of the conversion.
     */
    private static void closeQuietly(java.io.Closeable resource) {
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}