package pl.waw.ipipan.zil.core.md;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import pl.waw.ipipan.zil.core.md.detection.Detector;
import pl.waw.ipipan.zil.core.md.detection.head.HeadDetector;
import pl.waw.ipipan.zil.core.md.detection.nominal.NominalMentionDetector;
import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector;
import pl.waw.ipipan.zil.core.md.entities.Text;
import pl.waw.ipipan.zil.core.md.io.tei.TeiLoader;
import pl.waw.ipipan.zil.core.md.io.tei.TeiSaver;
import pl.waw.ipipan.zil.core.md.io.thrift.ThriftLoader;
import pl.waw.ipipan.zil.core.md.io.thrift.ThriftSaver;
import pl.waw.ipipan.zil.multiservice.thrift.types.MultiserviceException;
import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText;
import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException;
import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.Map;

public class Main {

    private static final Logger logger = LoggerFactory.getLogger(Main.class);

    private static final boolean GZIP_OUTPUT = true;
    private static final String DEFAULT_HEAD_MODEL = "/head_model.bin";
    private static final String DEFAULT_NOMINAL_MENTION_MODEL = "/nominal_model.bin";
    private static final String DEFAULT_ZERO_SUBJECT_MODEL = "/zero_subject_model.bin";
    private static final String DEFAULT_VERBS_VALENCE = "/walenty_verbs.txt";
    private static final String DEFAULT_NOUNS_VALENCE = "/walenty_nouns.txt";

    private static HeadDetector headModel;
    private static NominalMentionDetector nominalMentionModel;
    private static ZeroSubjectDetector zeroSubjectModel;
    
    /** Types of valence dictionaries loaded from the Walenty lexicon. */
    public enum ValenceDicts {
        VerbsValence,
        NounsValence
    }

    /** For each dictionary type, a map from lemma to the list of its valence schemata. */
    private static final Map<ValenceDicts, Map<String, ArrayList<String>>> valence =
            new EnumMap<>(ValenceDicts.class);

    static {
        // Load the default detection models bundled on the classpath.
        InputStream headDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_HEAD_MODEL);
        headModel = new HeadDetector(headDetectionModelStream);

        InputStream nominalMentionDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_NOMINAL_MENTION_MODEL);
        nominalMentionModel = new NominalMentionDetector(nominalMentionDetectionModelStream);

        InputStream zeroSubjectDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_ZERO_SUBJECT_MODEL);
        zeroSubjectModel = new ZeroSubjectDetector(zeroSubjectDetectionModelStream);

        // Load the default Walenty valence dictionaries for verbs and nouns.
        InputStream walentyVerbsStream = Main.class.getResourceAsStream(DEFAULT_VERBS_VALENCE);
        valence.put(ValenceDicts.VerbsValence, readWalenty(walentyVerbsStream));

        InputStream walentyNounsStream = Main.class.getResourceAsStream(DEFAULT_NOUNS_VALENCE);
        valence.put(ValenceDicts.NounsValence, readWalenty(walentyNounsStream));
    }
    
    
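    /**
     * Read a Walenty valence dictionary from its colon-separated text dump and build a map from
     * lemma to the list of its valence schemata. Lines starting with {@code %} are treated as
     * comments; the lemma is taken from the first field and the schema from the sixth.
     * <p>An illustrative (simplified, hypothetical) input line:
     * <pre>{@code
     * myć: : : imperf: : {np(str)} + {refl}
     * }</pre>
     *
     * @param walentySchemataStream stream with the Walenty schemata in text format
     * @return map from lemma to its valence schemata
     */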
    public static Map<String, ArrayList<String>> readWalenty(InputStream walentySchemataStream) {
        Map<String, ArrayList<String>> map = new HashMap<>();
        try (BufferedReader br = new BufferedReader(new InputStreamReader(walentySchemataStream))) {
            String line;
            boolean firstLine = true;
            while ((line = br.readLine()) != null) {
                if (firstLine) {
                    line = line.replace("\uFEFF", ""); // remove BOM character
                    firstLine = false;
                }

                // Lines starting with "%" are comments.
                if (line.startsWith("%"))
                    continue;

                String[] lineParts = line.split(":");
                String lemma = lineParts[0].trim();
                String schema = lineParts[5].trim();

                if (schema.isEmpty())
                    continue;

                // Single-word lemmas whose schema contains a reflexive or reciprocal position
                // are stored with the "się" marker appended.
                String[] lemmaParts = lemma.split(" ");
                if (lemmaParts.length == 1 && schemaContainsSie(schema)) {
                    lemma = lemma + " się";
                }

                ArrayList<String> schemata = map.get(lemma);
                if (schemata == null) {
                    schemata = new ArrayList<>();
                    map.put(lemma, schemata);
                }
                schemata.add(schema);
            }
        } catch (IOException ex) {
            throw new RuntimeException("Unable to read Walenty schemata.", ex);
        }
        return map;
    }
    
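    /**
     * Check whether the given valence schema contains a reflexive ({@code refl}) or reciprocal
     * ({@code recip}) position, i.e. whether the lemma is used with the marker "się".
     */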
    private static boolean schemaContainsSie(String schema) {
        // Positions in a schema are separated by " + "; each position lists its admissible
        // phrase types between enclosing delimiters, separated by semicolons.
        for (String position : schema.split("\\s\\+\\s")) {
            position = position.trim();
            position = position.substring(1, position.length() - 1); // strip the enclosing delimiters
            for (String phrT : position.split(";")) {
                if (phrT.equals("refl") || phrT.equals("recip")) {
                    return true;
                }
            }
        }

        return false;
    }

    private Main() {
    }

    /**
     * Main method for detecting mentions in a corpus encoded in the TEI format.
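     * <p>Typical invocations (jar name and paths are illustrative):
     * <pre>{@code
     * java -cp md.jar pl.waw.ipipan.zil.core.md.Main input_dir result_dir
     * java -cp md.jar pl.waw.ipipan.zil.core.md.Main input_dir result_dir head.bin nominal.bin zero.bin
     * }</pre>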
     *
     * @param args arguments
     */
    public static void main(String[] args) {

        if (args.length != 2 && args.length != 5) {
            logger.error("Wrong usage! Should be: " + Main.class.getSimpleName()
                    + " input_dir result_dir [head_model nominal_mention_model zero_subject_model]");
            return;
        }

        File inputDir = new File(args[0]);
        File outputDir = new File(args[1]);

        if (!inputDir.isDirectory()) {
            logger.error(inputDir + " is not a directory!");
            return;
        }
        if (!outputDir.isDirectory()) {
            logger.error(outputDir + " is not a directory!");
            return;
        }
        
        if (args.length == 5) {
            // Override the default classpath models with the ones given on the command line.
            try {
                InputStream headDetectionModelStream = new FileInputStream(new File(args[2]));
                headModel = new HeadDetector(headDetectionModelStream);

                InputStream nominalMentionsDetectionModelStream = new FileInputStream(new File(args[3]));
                nominalMentionModel = new NominalMentionDetector(nominalMentionsDetectionModelStream);

                InputStream zeroSubjectDetectionModelStream = new FileInputStream(new File(args[4]));
                zeroSubjectModel = new ZeroSubjectDetector(zeroSubjectDetectionModelStream);
            } catch (IOException e) {
                logger.error("Unable to load model: " + e, e);
                return;
            }
        }

        int all = 0;
        int errors = 0;
        for (File teiDir : IOUtils.getNKJPDirs(inputDir)) {
            all++;
            try {
                File targetDir = createTargetTextDir(inputDir, outputDir, teiDir);
                TEICorpusText teiText = TeiLoader.readTeiText(teiDir);
                annotateTeiText(teiText, teiDir);
                TeiSaver.saveTeiText(teiText, targetDir, GZIP_OUTPUT);
            } catch (IOException e) {
                logger.error("Error processing text in dir: " + teiDir + ". Error details: " + e.getLocalizedMessage(), e);
                errors++;
            }
        }

        logger.info(all + " texts processed successfully.");
        if (errors > 0)
            logger.info(errors + " texts not processed.");
        logger.info(ZeroSubjectDetector.verbsWithoutSubject + " verbs with zero subject detected.");
        logger.info(ZeroSubjectDetector.verbsWithSubject + " verbs with explicit subject detected.");
    }

    /**
     * Find relative path of text directory in the corpus directory and create
     * similar directory structure in the output corpus directory.
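     * <p>For example (illustrative paths): with {@code inputCorpusDir=/corpus},
     * {@code outputCorpusDir=/out} and {@code textDir=/corpus/subdir/text1},
     * the created target directory is {@code /out/subdir/text1}.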
     *
     * @param inputCorpusDir  input corpus directory
     * @param outputCorpusDir output corpus directory
     * @param textDir         input text dir
     * @return target text dir
     * @throws IOException when the target directory cannot be created
     */
    private static File createTargetTextDir(File inputCorpusDir, File outputCorpusDir, File textDir) throws IOException {
        String relativeDirPath = textDir.toString().substring(inputCorpusDir.toString().length());
        File targetDir = new File(outputCorpusDir, relativeDirPath);
        targetDir.mkdirs();
        if (!targetDir.exists() || !targetDir.isDirectory())
            throw new IOException("Failed to create output directory at: " + targetDir);
        return targetDir;
    }

    /**
     * Find mentions in a Thrift text and update that text with mention
     * annotations.
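     * <p>A minimal usage sketch (how the {@link TText} is obtained is up to the caller,
     * e.g. from the Multiservice processing chain):
     * <pre>{@code
     * TText thriftText = ...; // text with the layers required by the detector
     * Main.annotateThriftText(thriftText); // thriftText now carries mention annotations
     * }</pre>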
     *
     * @param thriftText text to annotate with mentions
     * @throws MultiserviceException when an error occurs
     */
    public static void annotateThriftText(TText thriftText) throws MultiserviceException {
        Text responseText = ThriftLoader.loadTextFromThrift(thriftText);
        Detector.findMentionsInText(responseText, headModel, zeroSubjectModel, nominalMentionModel, valence);
        ThriftSaver.updateThriftText(responseText, thriftText);
    }

    /**
     * Find mentions in a TEI text and update that text with mention
     * annotations. This method does not save the TEI text to disk.
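     * <p>A minimal usage sketch (illustrative paths), mirroring what {@link #main(String[])} does:
     * <pre>{@code
     * File textDir = new File("corpus/text1");
     * TEICorpusText teiText = TeiLoader.readTeiText(textDir);
     * Main.annotateTeiText(teiText, textDir);
     * TeiSaver.saveTeiText(teiText, new File("out/text1"), true);
     * }</pre>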
     *
     * @param teiText text to annotate with mentions
     * @param textDir directory containing the source TEI text
     * @throws TEIException when an error occurs
     */
    public static void annotateTeiText(TEICorpusText teiText, File textDir) throws TEIException {
        Text responseText = TeiLoader.loadTextFromTei(teiText, textDir);
        Detector.findMentionsInText(responseText, headModel, zeroSubjectModel, nominalMentionModel, valence);
        TeiSaver.updateTeiText(responseText, teiText);
    }

}