ConvertTiger2CoNLL.java 2.72 KB
/**
 * 
 */
package is2.util;


import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.StringTokenizer;

/**
 * @author Dr. Bernd Bohnet, 17.01.2010
 * 
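 *  This class converts a Tiger treebank file (sentences delimited by #BOS / #EOS lines)
 *  into a tab-separated, CoNLL-style file: every token line is prefixed with a running
 *  word id, its first four columns are kept, and all remaining columns are filled with "_".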
 */
public class ConvertTiger2CoNLL {

	/** Character encoding used for both the input and the output file. */
	private static final String ENCODING = "UTF-8";

	/** Buffer size (in chars) of the buffered reader and writer. */
	private static final int BUFFER_SIZE = 32768;

	/** No "#BOS" line has been seen yet, or the last sentence was closed by "#EOS". */
	private static final int OUTSIDE_SENTENCE = 0;

	/** Inside a sentence that lies in the requested range; its token lines are written. */
	private static final int IN_SENTENCE = 1;

	/** Inside a sentence that lies before the requested range; its lines are ignored. */
	private static final int SKIPPED_SENTENCE = 3;

	/**
	 * Command-line entry point. Reads the input file, output file, first sentence and
	 * sentence count from the parsed options and prints the number of converted
	 * sentences to standard error.
	 *
	 * @param args command-line arguments, handed to {@code OptionsSuper} unchanged
	 * @throws IOException if the input cannot be read or the output cannot be written
	 */
	public static void main (String[] args) throws IOException {

		OptionsSuper options = new OptionsSuper(args,null);

		if (options.trainfile != null) {
			int included = clean(options.trainfile, options.outfile, options.start, options.count);
			System.err.println("included sentences "+included);
		} else {
			System.err.println("Please proivde the file name -train <file-name>");
		}
	}

	/**
	 * Converts the sentences of {@code file} whose 1-based running number {@code n}
	 * satisfies {@code start <= n < start + numberOfSentences} and writes them to
	 * {@code outFile}.
	 *
	 * Both streams are always closed, also when the conversion fails. I/O errors are
	 * propagated to the caller instead of being reported as a {@code -1} result, so a
	 * truncated output file can no longer be mistaken for a successful run.
	 *
	 * @param file path of the input file
	 * @param outFile path of the file to write; an existing file is overwritten
	 * @param start number of the first sentence to include (the first sentence of the file is 1)
	 * @param numberOfSentences size of the sentence-number window starting at {@code start}
	 * @return the number of sentences written
	 * @throws IOException if reading the input or writing the output fails
	 */
	private static int clean(String file, String outFile, int start, int numberOfSentences) throws IOException {

		// Computed as long so that a very large sentence count cannot overflow
		// and silently turn the window into an empty one.
		long end = (long) start + numberOfSentences;

		System.err.println("writting to "+outFile);
		System.err.println("start "+start+" to "+end);

		BufferedReader reader = new BufferedReader(
				new InputStreamReader(new FileInputStream(file), ENCODING), BUFFER_SIZE);
		try {
			BufferedWriter writer = new BufferedWriter(
					new java.io.OutputStreamWriter(new java.io.FileOutputStream(outFile), ENCODING), BUFFER_SIZE);
			try {
				int converted = convert(reader, writer, start, end);
				writer.flush();
				return converted;
			} finally {
				writer.close();
			}
		} finally {
			reader.close();
		}
	}

	/**
	 * Copies the token lines of all sentences inside the window {@code [start, end)}
	 * from {@code reader} to {@code writer}; each sentence is terminated by an empty line.
	 *
	 * @param reader source of the input lines
	 * @param writer destination of the converted lines
	 * @param start number of the first sentence to include
	 * @param end number of the first sentence that is no longer included
	 * @return the number of sentences written
	 * @throws IOException if reading or writing fails
	 */
	private static int convert(BufferedReader reader, BufferedWriter writer, int start, long end) throws IOException {

		int state = OUTSIDE_SENTENCE;
		int id = 1;        // running word id within the current sentence
		int sentence = 0;  // number of "#BOS" lines seen so far
		int converted = 0; // number of sentences written

		String line;
		while ((line = reader.readLine()) != null) {

			if (line.startsWith("#BOS")) {
				id = 1;
				sentence++;
				// Sentence numbers only grow, so nothing after the window can be in range.
				if (sentence >= end) break;
				// The range is decided here, before any "#EOS" can be seen, so that an
				// empty sentence outside the window is neither counted nor written.
				state = (sentence >= start) ? IN_SENTENCE : SKIPPED_SENTENCE;
				continue;
			}

			// File header, text between sentences and skipped sentences produce no output.
			if (state != IN_SENTENCE) continue;

			if (line.startsWith("#EOS")) {
				state = OUTSIDE_SENTENCE;
				converted++;
				writer.newLine();
				continue;
			}

			// Lines starting with "#5", "#6" or "#7" are dropped and do not consume a word id
			// (presumably the non-terminal node lines of the Tiger export format).
			if (line.startsWith("#5") || line.startsWith("#6") || line.startsWith("#7")) continue;

			writer.write(toConllLine(id, line));
			writer.newLine();
			id++;
		}
		return converted;
	}

	/**
	 * Builds the output line for one token line.
	 *
	 * The result consists of the word id, the first four tab-separated fields of
	 * {@code line} (presumably word form, lemma, part of speech and morphology), with a
	 * "_" placeholder after the second, third and fourth field, followed by nine more
	 * "_" columns. In the fourth field every "." is replaced by "|". Fields beyond the
	 * fourth are discarded.
	 *
	 * @param id word id to put into the first column
	 * @param line token line of the input file
	 * @return the converted line without a line terminator
	 */
	private static String toConllLine(int id, String line) {

		StringBuilder out = new StringBuilder();
		out.append(id).append('\t');

		// StringTokenizer never returns empty tokens, so runs of tabs used for column
		// alignment are treated as a single separator without any preprocessing.
		StringTokenizer fields = new StringTokenizer(line, "\t");
		for (int column = 0; fields.hasMoreTokens(); column++) {
			String field = fields.nextToken();
			switch (column) {
			case 0:
				out.append(field).append('\t');
				break;
			case 1:
			case 2:
				out.append(field).append("\t_\t");
				break;
			case 3:
				out.append(field.replace(".", "|")).append("\t_\t");
				break;
			default:
				// remaining input columns are not carried over
				break;
			}
		}
		out.append("_\t_\t_\t_\t_\t_\t_\t_\t_");
		return out.toString();
	}

}