/** * */ package is2.io; import is2.data.PSTree; import is2.util.DB; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Stack; import java.util.StringTokenizer; /** * @author Dr. Bernd Bohnet, 17.01.2011 * * Reads a sentences in Penn Tree Bank bracket style and return sentences. */ public class TigerReader implements PSReader { BufferedReader inputReader; ArrayList<File> psFiles = new ArrayList<File>(); ArrayList<PSTree> psCache = new ArrayList<PSTree>(); String filter[] = null; int startFilter =-1; int endFilter =-1; public TigerReader() {} public TigerReader(String file ) { try { inputReader = new BufferedReader(new InputStreamReader(new FileInputStream(file),"ISO-8859-1"),32768); } catch (Exception e) { e.printStackTrace(); } } /** * @param ps */ @Override public void startReading(String file, String[] filter) { try { this.filter =filter; startFilter =filter==null?-1:1; endFilter =filter==null?-1:1; inputReader = new BufferedReader(new InputStreamReader(new FileInputStream(file),"ISO-8859-1"),32768); } catch (Exception e) { e.printStackTrace(); } } public static class Line { String form; String lemma; String morph; String pos; int parent; String edge; } static int stop=0; /** * @return */ public PSTree getNext() { PSTree ps = null; String l =null; ArrayList<Line> lines = new ArrayList<Line>(); try { int state=1, terminals=0, nonterminals=0; while((l = inputReader.readLine())!=null) { if (startFilter==1 && l.startsWith("#BOS "+filter[0]) ) { System.out.println("found start "+l); startFilter=2; } if (endFilter==1 && l.startsWith("#EOS "+filter[1]) ){ System.out.println("found end "+l); endFilter=2; } if (startFilter==1||endFilter==2) continue; if (l.startsWith("#BOS")) { state=2; continue; } if (l.startsWith("#500")) state=3; if (l.startsWith("#EOS")) state=4; if (state<2) continue; if ( state==4) { ps = new PSTree(); ps.create(terminals, nonterminals); // System.out.println("terminals "+terminals); //build ps tree int cnt=0; // ps.entries[0] =CONLLReader09.ROOT; // ps.head[0]=-1; int root=-1; for(Line line : lines) { /* if (cnt==terminals) { // insert root root =cnt; cnt++; } */ ps.entries[cnt] = line.form; if (cnt<terminals) ps.pos[cnt] = line.pos; else ps.entries[cnt] =line.pos; ps.lemmas[cnt] = line.lemma; ps.head[cnt] = line.parent==0?lines.size()-1:line.parent>=500?line.parent-500+terminals:line.parent; // ps.head[cnt] = line.parent==0?lines.size()-1:line.parent>=500?line.parent-500+terminals:line.parent; ps.morph[cnt]=line.morph; cnt++; } if (root==-1) root= terminals; ps.head[cnt-1]=0; // root ps.terminalCount=terminals; lines.clear(); state=1; /* for(int k=0;k<ps.head.length;k++) { if (ps.head[k]<terminals && k!=root) { ps.head[k]=root; // DB.println("error "+k+" "+ps.head[k]); } } */ // System.out.println(""+ps.toString()); // if (stop++ == 4)System.exit(0); return ps; } StringTokenizer t = new StringTokenizer(l,"\t"); int tc=0; Line line = new Line(); lines.add(line); while(t.hasMoreTokens()) { String token = t.nextToken(); if (token.equals("\t"))continue; if (tc==0) { if (token.startsWith("#5")||token.startsWith("#6") ) { nonterminals++; } else { terminals++; //change it back to the wrong format since the conll stuff was derived from this. // if (token.equals("durchblicken")) token="durchblikken"; line.form = token; } } else if (tc==1) { line.lemma=token; } else if (tc==2) { line.pos=token; } else if (tc==3) { line.morph=token; } else if (tc==4) { line.edge=token; } else if (tc==5) { line.parent=Integer.parseInt(token); } if (token.length()>0)tc++; } // read till #EOS } } catch(Exception e) { e.printStackTrace(); } return ps; } /** * @param tree */ private void removeTraces(ArrayList<Object> tree) { Stack<ArrayList<Object>> s = new Stack<ArrayList<Object>>(); s.push(tree); ArrayList<Object> list =null; while (!s.isEmpty()) { ArrayList<Object> last =list; list = s.pop(); for(int k=0;k<list.size();k++) { Object o = list.get(k); if(o instanceof String) { String t = (String)o; if ((t.endsWith("-1")||t.endsWith("-2")||t.endsWith("-3")||t.endsWith("-4")) && list.size()>(k+1)) { t = t.substring(0, t.length()-2); list.set(k, t); } if (t.startsWith("-NONE-")) { // remove the bigger surrounding phrase, e.g. (NP (-NONE- *)) if (last.size()==2 && last.get(0) instanceof String && last.contains(list)) { ArrayList<Object> rest = remove(tree, last); if (rest!=null && rest.size()==1){ rest = remove(tree, rest); } } // remove the phrase only, e.g. (NP (AP nice small) (-NONE- *)) else { // there might a phrase with two empty elements (VP (-NONE- *) (-NONE- ...)) // System.out.println("last "+last+" list "+list ); ArrayList<Object> rest = remove(tree, list); removeTraces(rest); if (rest.size()==1) { rest = remove(tree, rest); if (rest!=null && rest.size()==1){ System.out.println("rest "+rest); System.exit(0); } } } continue; } } if (o instanceof ArrayList) { s.push((ArrayList<Object>)o); } } } } /** * Remove from tree p * @param tree phrase structure tree * @param p elment to remove */ private ArrayList<Object> remove(ArrayList<Object> tree, Object p) { Stack<ArrayList<Object>> s = new Stack<ArrayList<Object>>(); s.push(tree); while (!s.isEmpty()) { ArrayList<Object> list = s.pop(); for(int k=0;k<list.size();k++) { Object o = list.get(k); if (o == p) { list.remove(p); return list ; } if (o instanceof ArrayList) { s.push((ArrayList<Object>)o); } } } return null; } /** * Count the terminals * @param current * @return */ private int countTerminals(ArrayList<Object> current) { int count =0; boolean found =false, all =true ; for(Object o : current) { if (o instanceof String) found =true; else { all =false; if (o instanceof ArrayList) count +=countTerminals((ArrayList<Object>)o); } } if (found && all) { // System.out.println(""+current); count++; } return count; } /** * Count the terminals * @param current * @return */ private int insert(PSTree ps, ArrayList<Object> current, Integer terminal, Integer xxx, int head) { boolean found =false, all =true; String term =null; String pos =null; for(Object o : current) { if (o instanceof String) { if (found) term =(String)o; if (!found) pos =(String)o; found =true; } else { all =false; // if (o instanceof ArrayList) count +=countTerminals((ArrayList<Object>)o); } } if (found && all) { if(term.equals("-LRB-")) term="("; if(term.equals("-RRB-")) term=")"; if(term.equals("-LCB-")) term="{"; if(term.equals("-RCB-")) term="}"; if(term.contains("1\\/2-year")) term=term.replace("\\/", "/"); if(term.contains("1\\/2-foot-tall")) term=term.replace("\\/", "/"); ps.entries[ps.terminalCount] =term; ps.pos[ps.terminalCount]=pos; ps.head[ps.terminalCount]=head; // System.out.println("terminal "+term+" "+ps.terminal+" head "+head); ps.terminalCount ++; } else if (found && ! all) { if(pos.startsWith("NP-SBJ")) pos="NP-SBJ"; if(pos.startsWith("WHNP")) pos="WHNP"; ps.entries[ps.non] =pos; ps.head[ps.non]=head; // System.out.println("non terminal "+pos+" "+ps.non+" head "+ head); int non =ps.non ++; for (Object o : current) { if (o instanceof ArrayList) { insert(ps,(ArrayList<Object>)o,terminal,ps.non, non); } } } if(!all && !found)for (Object o : current) { if (o instanceof ArrayList) { insert(ps,(ArrayList<Object>)o,terminal,0, ps.non-1); } } return terminal; } /** * Count the terminals * @param current * @return */ private int countNonTerminals(ArrayList<Object> current) { int count =0; boolean found =false, all =true ; for(Object o : current) { if (o instanceof String) found =true; else { all =false; if (o instanceof ArrayList) count +=countNonTerminals((ArrayList<Object>)o); } } if (found && !all) count++; return count; } }