ConvertTiger2CoNLL.java 2.77 KB
/**
 *
 */
package is2.util;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.StringTokenizer;

/**
 * @author Dr. Bernd Bohnet, 17.01.2010
 *
 *         Converts a treebank file whose sentences are delimited by
 *         {@code #BOS} / {@code #EOS} marker lines (presumably the TIGER export
 *         format -- TODO confirm) into tab-separated, CoNLL-style token lines.
 *         Only the first four fields of every token line are kept; all other
 *         output columns are filled with the placeholder {@code _}.
 */
public class ConvertTiger2CoNLL {

	/** No {@code #BOS} marker has been read yet. */
	private static final int STATE_BEFORE_FIRST_SENTENCE = 0;

	/** Between a {@code #BOS} and its {@code #EOS}: lines are converted and written. */
	private static final int STATE_IN_SENTENCE = 1;

	/** The {@code #EOS} of a converted sentence has been read. */
	private static final int STATE_AFTER_SENTENCE = 2;

	/** The current sentence lies outside the requested range; its lines are dropped. */
	private static final int STATE_SKIP = 3;

	/** Placeholder columns appended to every converted token line. */
	private static final String PLACEHOLDER_COLUMNS = "_\t_\t_\t_\t_\t_\t_\t_\t_";

	/**
	 * Command line entry point. Reads the input file, output file, first sentence
	 * and sentence count from {@link OptionsSuper} and reports the number of
	 * converted sentences on {@code System.err}.
	 *
	 * @param args
	 *            command line arguments, handed to {@link OptionsSuper}
	 * @throws IOException
	 *             if the input cannot be read or the output cannot be written
	 */
	public static void main(String[] args) throws IOException {

		OptionsSuper options = new OptionsSuper(args, null);

		if (options.trainfile != null) {
			System.err.println(
					"included sentences " + clean(options.trainfile, options.outfile, options.start, options.count));
		} else {
			System.err.println("Please proivde the file name -train <file-name>");
		}

	}

	/**
	 * Converts the sentences of {@code file} whose running number (the first
	 * {@code #BOS} in the file is sentence 1) lies in
	 * {@code [start, start + numberOfSentences)} and writes them to
	 * {@code outFile}. Both files are read and written as UTF-8. Reading stops as
	 * soon as a sentence beyond the range is reached.
	 *
	 * @param file
	 *            path of the input file
	 * @param outFile
	 *            path of the output file; it is created or overwritten
	 * @param start
	 *            number of the first sentence to convert
	 * @param numberOfSentences
	 *            size of the sentence window beginning at {@code start}
	 * @return the number of sentences that were closed by an {@code #EOS} marker
	 *         and written
	 * @throws IOException
	 *             if reading or writing fails; both streams are closed before the
	 *             exception leaves this method
	 */
	private static int clean(String file, String outFile, int start, int numberOfSentences) throws IOException {

		// Widen before adding so that a huge sentence count combined with a positive
		// start cannot wrap around to a negative upper bound.
		final long end = (long) start + numberOfSentences;

		System.err.println("writting to " + outFile);
		System.err.println("start " + start + " to " + end);

		// try-with-resources closes (and thereby flushes) the writer and then the
		// reader on every exit path, including failures in the middle of the file.
		try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"),
				32768);
				BufferedWriter writer = new BufferedWriter(
						new java.io.OutputStreamWriter(new java.io.FileOutputStream(outFile), "UTF-8"), 32768)) {

			int state = STATE_BEFORE_FIRST_SENTENCE;
			int id = 1, snt = 0, cnt = 0;
			String l;

			while ((l = reader.readLine()) != null) {

				if (l.startsWith("#BOS")) {
					state = STATE_IN_SENTENCE;
					id = 1; // token ids restart with every sentence
					snt++;
					continue;
				}
				if (l.startsWith("#EOS") && state == STATE_IN_SENTENCE) {
					state = STATE_AFTER_SENTENCE;
					cnt++;

					// an empty line separates sentences in the output
					writer.newLine();
				}

				final boolean pastEnd = end <= snt;
				if (start > snt || pastEnd) {
					state = STATE_SKIP;
				}

				// Lines starting with #5, #6 or #7 are never converted and do not
				// consume a token id (presumably non-terminal nodes -- TODO confirm).
				if (l.startsWith("#5") || l.startsWith("#6") || l.startsWith("#7")) {
					continue;
				}
				if (pastEnd) {
					break;
				}

				if (state == STATE_SKIP) {
					continue;
				}

				if (state == STATE_IN_SENTENCE) {
					writeTokenLine(writer, id, l);
				}
				id++;
			}

			return cnt;
		}

	}

	/**
	 * Writes one converted token line: the token id, the first four tab-separated
	 * fields of {@code line} (the second, third and fourth each followed by a
	 * {@code _} column, the fourth with every {@code .} replaced by {@code |}) and
	 * finally the fixed placeholder columns. Further fields are dropped. Empty
	 * fields need no special treatment because {@link StringTokenizer} skips runs
	 * of consecutive delimiters.
	 *
	 * @param writer
	 *            destination of the converted line
	 * @param id
	 *            id of the token within its sentence
	 * @param line
	 *            the raw input line
	 * @throws IOException
	 *             if writing fails
	 */
	private static void writeTokenLine(BufferedWriter writer, int id, String line) throws IOException {

		writer.write(id + "\t");

		StringTokenizer fields = new StringTokenizer(line, "\t");
		for (int column = 0; fields.hasMoreTokens(); column++) {
			String field = fields.nextToken();
			switch (column) {
			case 0:
				writer.write(field + "\t");
				break;
			case 1:
			case 2:
				writer.write(field + "\t_\t");
				break;
			case 3:
				writer.write(field.replace(".", "|") + "\t_\t");
				break;
			default:
				// fields beyond the fourth are not part of the output
				break;
			}
		}
		writer.write(PLACEHOLDER_COLUMNS);
		writer.newLine();
	}

}