package is2.io; import is2.data.SentenceData09; import is2.util.DB; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.util.StringTokenizer; public class CONLLWriter06 { public static final String DASH = "_"; protected BufferedWriter writer; public CONLLWriter06 () { } public static void main(String args[]) throws IOException { if (args.length==2) { File f = new File(args[0]); File f2 = new File(args[1]); // BufferedReader bf = new BufferedReader(new FileInputStream(new File(args[0]),"UTF-8"),32768); BufferedReader ir = new BufferedReader(new InputStreamReader(new FileInputStream(f),"ISO-8859"),32768); BufferedWriter br = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f2),"UTF-8"));; boolean found =false; boolean tab =false; while(true) { String l = ir.readLine(); if (l==null) break; String x =l.trim(); if (x.endsWith("\t")) tab=true; br.write(x); br.newLine(); if (!l.equals(x)) found =true; } ir.close(); br.flush(); br.close(); if (found) DB.println("found diff. found tab? "+tab); } else if (args.length==3) { File f1 = new File(args[1]); File f2 = new File(args[2]); BufferedReader ir1 = new BufferedReader(new InputStreamReader(new FileInputStream(f1),"ISO-8859"),32768); BufferedReader ir2 = new BufferedReader(new InputStreamReader(new FileInputStream(f2),"UTF-8"),32768); int line =0, alltabs1=0,alltabs2=0; while(true) { String l1 = ir1.readLine(); String l2 = ir2.readLine(); if (l1==null && l2!=null) DB.println("files do not end at the same line "); if (l1!=null && l2==null) DB.println("files do not end at the same line "); if (l1==null ) break; StringTokenizer t1 = new StringTokenizer(l1,"\t"); StringTokenizer t2 = new StringTokenizer(l2,"\t"); int tabs1=0; while(t1.hasMoreTokens()) { t1.nextElement(); tabs1++; alltabs1++; } int tabs2=0; while(t2.hasMoreTokens()) { t2.nextElement(); tabs2++; alltabs2++; } line ++; if (tabs1!=tabs2) { DB.println("number of tabs different in line "+line+" file1-tabs "+tabs1+" file2-tabs "+tabs2); System.exit(0); } } DB.println("checked lines "+line+" with tabs in file 1 "+alltabs1+" in file2 "+alltabs2); } else { File f = new File(args[0]); String[] dir =f.list(); for(String fx :dir) { BufferedReader ir = new BufferedReader(new InputStreamReader(new FileInputStream(args[0]+File.separatorChar+fx),"UTF-8"),32768); System.out.println("check file "+fx); while(true) { String l = ir.readLine(); if (l==null) break; if (l.endsWith("\t")) { DB.println("found tab in file "+fx); break; } } ir.close(); } } } // public int version = CONLLReader09.TASK08; public CONLLWriter06 (String file) { try { writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file),"UTF-8")); } catch (Exception e) { e.printStackTrace(); } } public CONLLWriter06(String outfile, int formatTask) { this(outfile); // version = formatTask; } public void write(SentenceData09 inst) throws IOException { for (int i=0; i<inst.length(); i++) { writer.write(Integer.toString(i+1)); writer.write('\t'); // id writer.write(inst.forms[i]); writer.write('\t'); // form if (inst.lemmas!=null && inst.lemmas[i]!=null) { writer.write(inst.lemmas[i]); } else writer.write(DASH); // lemma writer.write('\t'); // writer.write(DASH); // cpos // writer.write('\t'); writer.write(inst.gpos[i]); // cpos has to be included writer.write('\t'); writer.write(inst.gpos[i]); // gpos writer.write('\t'); if (inst.ofeats[i].isEmpty()||inst.ofeats[i].equals(" ")) writer.write(DASH); else writer.write(inst.ofeats[i]); writer.write('\t'); //writer.write(DASH); writer.write('\t'); // pfeat writer.write(Integer.toString(inst.heads[i])); writer.write('\t'); // head if (inst.labels[i]!=null) writer.write(inst.labels[i]); // rel else writer.write(DASH); writer.write('\t'); writer.write(DASH); writer.write('\t'); writer.write(DASH); writer.write('\t'); writer.newLine(); } writer.newLine(); } public void finishWriting () throws IOException { writer.flush(); writer.close(); } }