ConvertTiger2CoNLL.java
2.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
/**
*
*/
package is2.util;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.StringTokenizer;
/**
 * @author Dr. Bernd Bohnet, 17.01.2010
 *
 * Converts a treebank export file whose sentences are delimited by
 * {@code #BOS} / {@code #EOS} lines into tab-separated, CoNLL-style token rows
 * (one row per token, one blank line after each sentence).
 *
 * NOTE(review): judging by the class name the input is the TIGER export format
 * and the first four input columns are word form, lemma, POS tag and morphology;
 * confirm against the corpus documentation.
 */
public class ConvertTiger2CoNLL {

    /** Placeholder columns appended to every token row after the converted fields. */
    private static final String PLACEHOLDER_COLUMNS = "_\t_\t_\t_\t_\t_\t_\t_\t_";

    /**
     * Command-line entry point. Reads the input file, output file, first sentence
     * and sentence count from {@link OptionsSuper} and reports the number of
     * converted sentences on standard error.
     *
     * @param args command-line arguments, passed on to {@link OptionsSuper}
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    public static void main (String[] args) throws IOException {
        OptionsSuper options = new OptionsSuper(args,null);

        if (options.trainfile != null) {
            int included = clean(options.trainfile, options.outfile, options.start, options.count);
            System.err.println("included sentences " + included);
        } else {
            System.err.println("Please proivde the file name -train <file-name>");
        }
    }

    /**
     * Converts the sentences of {@code file} that fall into the requested window
     * and writes them to {@code outFile}, both encoded as UTF-8.
     *
     * Sentences are numbered from 1 in the order of their {@code #BOS} lines; a
     * sentence is converted when {@code start <= number < start + numberOfSentences}.
     * Reading stops as soon as the first sentence beyond the window begins.
     * Lines starting with {@code #5}, {@code #6} or {@code #7} are skipped
     * (presumably the non-terminal node rows of the export format - TODO confirm).
     *
     * @param file              path of the input file
     * @param outFile           path of the output file; an existing file is overwritten
     * @param start             number of the first sentence to convert
     * @param numberOfSentences size of the sentence window
     * @return the number of sentences written to {@code outFile}
     * @throws IOException if reading or writing fails; both streams are closed in every case
     */
    private static int clean(String file, String outFile, int start, int numberOfSentences) throws IOException {
        // Computed as long so that a very large sentence count cannot overflow
        // into a negative upper bound and silently produce an empty output.
        long end = (long) start + numberOfSentences;

        System.err.println("writting to "+outFile);
        System.err.println("start "+start+" to "+end);

        // try-with-resources closes (and thereby flushes) both streams even when an
        // exception interrupts the conversion.
        try (BufferedReader reader = new BufferedReader(
                     new InputStreamReader(new FileInputStream(file),"UTF-8"),32768);
             BufferedWriter writer = new BufferedWriter(
                     new java.io.OutputStreamWriter(new java.io.FileOutputStream(outFile),"UTF-8"),32768)) {

            boolean inSentence = false; // true between a #BOS line and its #EOS line
            int tokenId = 1;            // 1-based token position inside the current sentence
            int sentence = 0;           // number of #BOS lines seen so far
            int written = 0;            // number of sentences written to the output

            String line;
            while ((line = reader.readLine()) != null) {
                if (line.startsWith("#BOS")) {
                    inSentence = true;
                    tokenId = 1;
                    sentence++;
                    continue;
                }

                // Beyond the window: nothing that follows can be part of the output.
                if (sentence >= end) {
                    break;
                }
                // Before the window: ignore everything up to the next #BOS. This check
                // precedes the #EOS handling so that an empty out-of-window sentence
                // is neither counted nor echoed as a blank line.
                if (sentence < start) {
                    continue;
                }

                if (line.startsWith("#EOS") && inSentence) {
                    inSentence = false;
                    written++;
                    writer.newLine(); // blank line separates sentences
                    continue;
                }

                if (line.startsWith("#5") || line.startsWith("#6") || line.startsWith("#7")) {
                    continue;
                }

                if (inSentence) {
                    writeToken(writer, tokenId, line);
                    tokenId++;
                }
            }

            writer.flush();
            return written;
        }
    }

    /**
     * Writes one token row: the token id, the first four tab-separated fields of
     * {@code line} interleaved with {@code _} placeholders, and the trailing
     * placeholder columns. Fields beyond the fourth are dropped.
     *
     * @param writer destination of the row
     * @param id     1-based position of the token in its sentence
     * @param line   raw input line describing the token
     * @throws IOException if writing fails
     */
    private static void writeToken(BufferedWriter writer, int id, String line) throws IOException {
        // StringTokenizer never returns empty tokens, so runs of tabs in the input
        // are collapsed without any extra preprocessing.
        StringTokenizer fields = new StringTokenizer(line, "\t");

        writer.write(id + "\t");
        for (int column = 0; fields.hasMoreTokens(); column++) {
            String field = fields.nextToken();
            switch (column) {
                case 0:
                    writer.write(field + "\t");
                    break;
                case 1:
                case 2:
                    writer.write(field + "\t_\t");
                    break;
                case 3:
                    // Sub-values of the fourth field are separated by '.' in the
                    // input and by '|' in the output.
                    writer.write(field.replace(".", "|") + "\t_\t");
                    break;
                default:
                    break; // remaining input columns are not carried over
            }
        }
        writer.write(PLACEHOLDER_COLUMNS);
        writer.newLine();
    }
}