package is2.io; import is2.data.SentenceData09; import is2.util.DB; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.Writer; import java.util.StringTokenizer; public class CONLLWriter09 extends IOGenerals { int format =0; public static final String DASH = "_"; public static final boolean NO_ROOT = true, ROOT = false; protected BufferedWriter writer; public CONLLWriter09 () { try { writer = new BufferedWriter(new OutputStreamWriter(System.out,"UTF8")); } catch (Exception e) { e.printStackTrace(); } } public static void main(String args[]) throws IOException { if (args.length==2) { File f = new File(args[0]); File f2 = new File(args[1]); BufferedReader ir = new BufferedReader(new InputStreamReader(new FileInputStream(f),"UTF-8"),32768); BufferedWriter br = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f2),"UTF8"));; boolean found =false; boolean tab =false; while(true) { String l = ir.readLine(); if (l==null) break; String x =l.trim(); if (x.endsWith("\t")) tab=true; br.write(x); br.newLine(); if (!l.equals(x)) found =true; } ir.close(); br.flush(); br.close(); if (found) DB.println("found diff. found tab? "+tab); } else if (args.length==3) { File f1 = new File(args[1]); File f2 = new File(args[2]); BufferedReader ir1 = new BufferedReader(new InputStreamReader(new FileInputStream(f1),"UTF-8"),32768); BufferedReader ir2 = new BufferedReader(new InputStreamReader(new FileInputStream(f2),"UTF-8"),32768); int line =0, alltabs1=0,alltabs2=0; while(true) { String l1 = ir1.readLine(); String l2 = ir2.readLine(); if (l1==null && l2!=null) DB.println("files do not end at the same line "); if (l1!=null && l2==null) DB.println("files do not end at the same line "); if (l1==null ) break; StringTokenizer t1 = new StringTokenizer(l1,"\t"); StringTokenizer t2 = new StringTokenizer(l2,"\t"); int tabs1=0; while(t1.hasMoreTokens()) { t1.nextElement(); tabs1++; alltabs1++; } int tabs2=0; while(t2.hasMoreTokens()) { t2.nextElement(); tabs2++; alltabs2++; } line ++; if (tabs1!=tabs2) { DB.println("number of tabs different in line "+line+" file1-tabs "+tabs1+" file2-tabs "+tabs2); System.exit(0); } } DB.println("checked lines "+line+" with tabs in file 1 "+alltabs1+" in file2 "+alltabs2); } else { File f = new File(args[0]); String[] dir =f.list(); for(String fx :dir) { BufferedReader ir = new BufferedReader(new InputStreamReader(new FileInputStream(args[0]+File.separatorChar+fx),"UTF-8"),32768); System.out.println("check file "+fx); while(true) { String l = ir.readLine(); if (l==null) break; if (l.endsWith("\t")) { DB.println("found tab in file "+fx); break; } } ir.close(); } } } public CONLLWriter09 (String file) { try { writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file),"UTF8")); } catch (Exception e) { e.printStackTrace(); } } public CONLLWriter09 (Writer writer) { this.writer = new BufferedWriter(writer); } public CONLLWriter09(String outfile, int formatTask) { this(outfile); } public void write(SentenceData09 inst) throws IOException { write(inst, NO_ROOT); } /** * * @param inst * @param root true: remove root node * @throws IOException */ public void write(SentenceData09 inst, boolean root) throws IOException { int i, mod; if(root&&(inst.forms[0].startsWith("<root")||(inst.lemmas[0]!=null&&inst.lemmas[0].startsWith("<root")))){ i=1; mod=0; } else { i=0; mod=1; } //=()?1:0; if (format == this.F_ONE_LINE) { boolean first =true; for (; i<inst.length(); i++) { if (first ){ first=false; } else writer.write(" "); writer.write(inst.plemmas[i]); } writer.newLine(); return ; } for (; i<inst.length(); i++) { if (inst.id==null|| inst.id[i]==null) {writer.write(Integer.toString(i+mod)); writer.write('\t');} // id else { writer.write(inst.id[i]); writer.write('\t');} writer.write(inst.forms[i]); writer.write('\t'); // form if (inst.lemmas!=null && inst.lemmas[i]!=null) { writer.write(inst.lemmas[i]); } else writer.write(DASH); // lemma writer.write('\t'); if (inst.plemmas!=null && inst.plemmas[i]!=null) writer.write(inst.plemmas[i]); else writer.write(DASH); // plemma writer.write('\t'); if (inst.gpos[i]!=null) writer.write(inst.gpos[i]); // gpos else writer.write(DASH); writer.write('\t'); if (inst.ppos!=null && inst.ppos[i]!=null) writer.write(inst.ppos[i]); else writer.write(DASH); // ppos writer.write('\t'); if (inst.ofeats!=null&& inst.ofeats[i]!=null) writer.write(inst.ofeats[i]); else writer.write(DASH); writer.write('\t'); //writer.write(DASH); writer.write('\t'); // feat if (inst.pfeats!=null&&inst.pfeats[i]!=null) { //System.out.println(""+inst.pfeats[i]); writer.write(inst.pfeats[i]); } else writer.write(DASH); writer.write('\t'); writer.write(Integer.toString(inst.heads[i])); writer.write('\t'); // head if (inst.pheads!=null ) writer.write(Integer.toString(inst.pheads[i])); else writer.write(DASH); writer.write('\t'); // phead if (inst.labels[i]!=null) writer.write(inst.labels[i]); // rel else writer.write(DASH); writer.write('\t'); if (inst.plabels!=null &&inst.plabels[i]!=null) writer.write(inst.plabels[i]); // rel else writer.write(DASH); writer.write('\t'); if (inst.fillp!=null && inst.fillp[i]!=null) writer.write(inst.fillp[i]); // fill p else { writer.write(DASH); } // writer.write('\t'); if (inst.sem==null) { writer.write('\t'); writer.write(DASH); } else { boolean foundPred =false; // print the predicate for (int p =0;p< inst.sem.length;p++) { if (inst.semposition[p]==i) { foundPred=true; // System.out.println("write pred "+inst.sem[p] ); writer.write('\t'); writer.write(inst.sem[p]); // if (inst.sem[p].startsWith(".")) DB.println("error "+inst.sem[p]); } } if (!foundPred ) { writer.write('\t'); writer.write(DASH); // writer.write('\t'); // writer.write(DASH); } // print the arguments for (int p =0;p< inst.sem.length;p++) { boolean found =false; if (inst.arg!=null &&inst.arg.length>p&&inst.arg[p]!=null) for(int a = 0; a<inst.arg[p].length;a++) { if (i==inst.argposition[p][a]) { writer.write('\t'); writer.write(inst.arg[p][a]); found = true; break; } } if (!found) { writer.write('\t'); writer.write(DASH); } } } writer.newLine(); } writer.newLine(); writer.flush(); } public void finishWriting () throws IOException { writer.flush(); writer.close(); } /** * Sets the output format such as CoNLL or one line for the lemmata of the sentence (see F_xxxx constants). * @param formatTask */ public void setOutputFormat(int formatTask) { format =formatTask; } }