CONLLWriter06.java 4.63 KB
package is2.io;

import is2.data.SentenceData09;
import is2.util.DB;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.StringTokenizer;


public class CONLLWriter06  {

	public static final String DASH = "_";
	
	protected BufferedWriter writer;

	public CONLLWriter06 () { }
	
	
	
	public static void main(String args[]) throws IOException {
		
		
		if (args.length==2) {
		File f = new File(args[0]);
		File f2 = new File(args[1]);
	//	BufferedReader bf = new BufferedReader(new FileInputStream(new File(args[0]),"UTF-8"),32768);
		BufferedReader ir = new BufferedReader(new InputStreamReader(new FileInputStream(f),"ISO-8859"),32768);
		BufferedWriter br = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f2),"UTF-8"));;
		boolean found =false;
		boolean tab =false;
		while(true) {
			String l = ir.readLine();
			if (l==null) break;
			String x =l.trim();
			if (x.endsWith("\t")) tab=true;
			br.write(x);
			br.newLine();
			if (!l.equals(x)) found =true; 
			
		}
		ir.close();
		br.flush();
		br.close();

		if (found) DB.println("found diff. found tab? "+tab);
		} else if (args.length==3) {
			File f1 = new File(args[1]);
			File f2 = new File(args[2]);
			
			BufferedReader ir1 = new BufferedReader(new InputStreamReader(new FileInputStream(f1),"ISO-8859"),32768);
			BufferedReader ir2 = new BufferedReader(new InputStreamReader(new FileInputStream(f2),"UTF-8"),32768);

			int line =0, alltabs1=0,alltabs2=0;
			while(true) {
				String l1 = ir1.readLine();
				String l2 = ir2.readLine();

				if (l1==null && l2!=null) DB.println("files do not end at the same line ");
				if (l1!=null && l2==null) DB.println("files do not end at the same line ");
				if (l1==null ) break;
				StringTokenizer t1 = new StringTokenizer(l1,"\t");
				StringTokenizer t2 = new StringTokenizer(l2,"\t");
				int tabs1=0;
				while(t1.hasMoreTokens()) {

					t1.nextElement();
					tabs1++;
					alltabs1++;
				}
				
				int tabs2=0;
				while(t2.hasMoreTokens()) {

					t2.nextElement();
					tabs2++;
					alltabs2++;
				}
				line ++;
				if (tabs1!=tabs2) {
					DB.println("number of tabs different in line "+line+" file1-tabs "+tabs1+" file2-tabs "+tabs2);
					System.exit(0);
				}
				
				
			}
			DB.println("checked lines "+line+" with tabs in file 1 "+alltabs1+" in file2 "+alltabs2);
			
		} else {
			File f = new File(args[0]);
			String[] dir =f.list();
			for(String fx :dir) {
				BufferedReader ir = new BufferedReader(new InputStreamReader(new FileInputStream(args[0]+File.separatorChar+fx),"UTF-8"),32768);
				System.out.println("check file "+fx);
				while(true) {
					String l = ir.readLine();
					if (l==null) break;
					if (l.endsWith("\t")) {
						DB.println("found tab in file "+fx);
						break;
					}
				}				
				ir.close();
			}
		}
		
	}
	
	
//	public int version = CONLLReader09.TASK08;
	
	public CONLLWriter06 (String file) {
		
		try {
			writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file),"UTF-8"));
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
	
	public CONLLWriter06(String outfile, int formatTask) {
		this(outfile);
	//	version = formatTask;
	}

	public void write(SentenceData09 inst) throws IOException {

		for (int i=0; i<inst.length(); i++) {
			
					
				writer.write(Integer.toString(i+1)); writer.write('\t');	// id
				writer.write(inst.forms[i]);     writer.write('\t'); 	// form
				
				if (inst.lemmas!=null && inst.lemmas[i]!=null) {
					writer.write(inst.lemmas[i]);   
				}
				else writer.write(DASH);									// lemma
				writer.write('\t');
				
//				writer.write(DASH);	// cpos
//				writer.write('\t');
				

				writer.write(inst.gpos[i]); // cpos has to be included
				writer.write('\t'); 
				
				writer.write(inst.gpos[i]); // gpos
				writer.write('\t');  
							
				
				if (inst.ofeats[i].isEmpty()||inst.ofeats[i].equals(" ")) writer.write(DASH); 
				else writer.write(inst.ofeats[i]);  
				writer.write('\t');  
				 
				
				//writer.write(DASH); writer.write('\t'); 					// pfeat
				
				writer.write(Integer.toString(inst.heads[i]));  writer.write('\t');  // head
				
				if (inst.labels[i]!=null) writer.write(inst.labels[i]); 	// rel                  
				else writer.write(DASH); 
				writer.write('\t');
				
				writer.write(DASH); 
				writer.write('\t');
				
				writer.write(DASH); 
				writer.write('\t');

						
				writer.newLine();
		}
		writer.newLine();

	}



	public void finishWriting () throws IOException {
		writer.flush();
		writer.close();
	}




}