Commit e1126cdba70bd5287871ebbe89e9ae6635bb5a01

Authored by Mateusz Kopeć
0 parents

rough draft

Showing 28 changed files with 2105 additions and 0 deletions
.gitignore 0 → 100644
  1 +++ a/.gitignore
  1 +# Created by .ignore support plugin (hsz.mobi)
  2 +### Java template
  3 +*.class
  4 +target/
  5 +
  6 +# Mobile Tools for Java (J2ME)
  7 +.mtj.tmp/
  8 +
  9 +# Package Files #
  10 +*.jar
  11 +*.war
  12 +*.ear
  13 +
  14 +# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
  15 +hs_err_pid*
  16 +
  17 +.idea
  18 +*.iml
0 19 \ No newline at end of file
... ...
nicolas-cli/pom.xml 0 → 100644
  1 +++ a/nicolas-cli/pom.xml
  1 +<?xml version="1.0" encoding="UTF-8"?>
  2 +<project xmlns="http://maven.apache.org/POM/4.0.0"
  3 + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  4 + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  5 + <modelVersion>4.0.0</modelVersion>
  6 + <parent>
  7 + <artifactId>nicolas-container</artifactId>
  8 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  9 + <version>1.0-SNAPSHOT</version>
  10 + </parent>
  11 +
  12 + <artifactId>nicolas-cli</artifactId>
  13 +
  14 +</project>
0 15 \ No newline at end of file
... ...
nicolas-core/pom.xml 0 → 100644
  1 +++ a/nicolas-core/pom.xml
  1 +<?xml version="1.0" encoding="UTF-8"?>
  2 +<project xmlns="http://maven.apache.org/POM/4.0.0"
  3 + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  4 + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  5 + <modelVersion>4.0.0</modelVersion>
  6 + <parent>
  7 + <artifactId>nicolas-container</artifactId>
  8 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  9 + <version>1.0-SNAPSHOT</version>
  10 + </parent>
  11 +
  12 + <artifactId>nicolas</artifactId>
  13 +
  14 + <dependencies>
  15 + <dependency>
  16 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  17 + <artifactId>nicolas-model</artifactId>
  18 + <version>${project.version}</version>
  19 + <scope>runtime</scope>
  20 + </dependency>
  21 + </dependencies>
  22 +</project>
0 23 \ No newline at end of file
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Constants.java 0 → 100644
  1 +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Constants.java
  1 +package pl.waw.ipipan.zil.summ.nicolas;
  2 +
  3 +import weka.classifiers.Classifier;
  4 +import weka.classifiers.trees.RandomForest;
  5 +
  6 +
  7 +public class Constants {
  8 +
  9 + public static final String MENTIONS_MODEL_PATH = "mentions_model.bin";
  10 + public static final String SENTENCES_MODEL_PATH = "sentences_model.bin";
  11 + public static final String MENTIONS_DATASET_PATH = "mentions_train.arff";
  12 + public static final String SENTENCES_DATASET_PATH = "sentences_train.arff";
  13 +
  14 + private Constants() {
  15 + }
  16 +
  17 + public static Classifier getClassifier() {
  18 + RandomForest classifier = new RandomForest();
  19 + classifier.setNumIterations(250);
  20 + classifier.setSeed(0);
  21 + classifier.setNumExecutionSlots(8);
  22 + return classifier;
  23 + }
  24 +
  25 + // currently identical to getClassifier(); kept separate so the sentence model can be tuned independently
  26 + public static Classifier getSentencesClassifier() {
  27 + RandomForest classifier = new RandomForest();
  28 + classifier.setNumIterations(250);
  29 + classifier.setSeed(0);
  30 + classifier.setNumExecutionSlots(8);
  31 + return classifier;
  32 + }
  33 +}
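
A minimal usage sketch for the shared factory above (it mirrors TrainModel.java later in this commit; trainingInstances is assumed to be a Weka Instances with the class attribute at index 0):

    Classifier classifier = Constants.getClassifier();
    classifier.buildClassifier(trainingInstances);
    try (ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(Constants.MENTIONS_MODEL_PATH))) {
        oos.writeObject(classifier); // read back later via Utils.loadClassifier
    }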
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java 0 → 100644
  1 +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
  1 +package pl.waw.ipipan.zil.summ.nicolas;
  2 +
  3 +import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  4 +
  5 +public class Nicolas {
  6 +
  7 + public String summarizeThrift(TText text, int targetTokenCount) {
  8 + return "test nicolas"; // placeholder: the summarization pipeline is not wired in yet
  9 + }
  10 +
  11 +}
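
The stub above returns a constant; a rough sketch of how the pieces elsewhere in this commit could plug in here (it mirrors ApplyModel2 and is not part of the commit):

    Classifier mentionClassifier = Utils.loadClassifier(Constants.MENTIONS_MODEL_PATH);
    Set<TMention> goodMentions = MentionModel.detectGoodMentions(mentionClassifier, new MentionFeatureExtractor(), text);
    // ...then rank sentences with the sentence model and append them until targetTokenCount is reached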
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Utils.java 0 → 100644
  1 +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Utils.java
  1 +package pl.waw.ipipan.zil.summ.nicolas;
  2 +
  3 +import com.google.common.base.Charsets;
  4 +import com.google.common.collect.Lists;
  5 +import com.google.common.collect.Maps;
  6 +import com.google.common.collect.Sets;
  7 +import com.google.common.io.Files;
  8 +import org.slf4j.Logger;
  9 +import org.slf4j.LoggerFactory;
  10 +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
  11 +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
  12 +import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  13 +import pl.waw.ipipan.zil.multiservice.thrift.types.TToken;
  14 +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
  15 +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionScorer;
  16 +import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
  17 +import weka.classifiers.Classifier;
  18 +import weka.core.Attribute;
  19 +import weka.core.DenseInstance;
  20 +import weka.core.Instance;
  21 +import weka.core.Instances;
  22 +
  23 +import java.io.File;
  24 +import java.io.FileInputStream;
  25 +import java.io.IOException;
  26 +import java.io.ObjectInputStream;
  27 +import java.util.*;
  28 +import java.util.function.Function;
  29 +import java.util.stream.Collectors;
  30 +
  31 +import static java.util.stream.Collectors.toList;
  32 +
  33 +public class Utils {
  34 +
  35 + private static final Logger LOG = LoggerFactory.getLogger(Utils.class);
  36 +
  37 + private static final String DATASET_NAME = "Dataset";
  38 +
  39 + public static Map<TMention, Instance> extractInstancesFromMentions(TText preprocessedText, MentionFeatureExtractor featureExtractor) {
  40 + List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
  41 + Map<TMention, Map<Attribute, Double>> mention2features = featureExtractor.calculateFeatures(preprocessedText);
  42 +
  43 + LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each mention.");
  44 + Map<TMention, Instance> mention2instance = Maps.newHashMap();
  45 + for (TMention tMention : sentences.stream().flatMap(s -> s.getMentions().stream()).collect(toList())) {
  46 + Instance instance = new DenseInstance(featureExtractor.getAttributesList().size());
  47 + Map<Attribute, Double> mentionFeatures = mention2features.get(tMention);
  48 + for (Attribute attribute : featureExtractor.getAttributesList()) {
  49 + instance.setValue(attribute, mentionFeatures.get(attribute));
  50 + }
  51 + mention2instance.put(tMention, instance);
  52 + }
  53 + return mention2instance;
  54 + }
  55 +
  56 + public static Map<TSentence, Instance> extractInstancesFromSentences(TText preprocessedText, SentenceFeatureExtractor featureExtractor, Set<TMention> goodMentions) {
  57 + List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
  58 + Map<TSentence, Map<Attribute, Double>> sentence2features = featureExtractor.calculateFeatures(preprocessedText, goodMentions);
  59 +
  60 + LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each sentence.");
  61 + Map<TSentence, Instance> sentence2instance = Maps.newHashMap();
  62 + for (TSentence sentence : sentences) {
  63 + Instance instance = new DenseInstance(featureExtractor.getAttributesList().size());
  64 + Map<Attribute, Double> sentenceFeatures = sentence2features.get(sentence);
  65 + for (Attribute attribute : featureExtractor.getAttributesList()) {
  66 + instance.setValue(attribute, sentenceFeatures.get(attribute));
  67 + }
  68 + sentence2instance.put(sentence, instance);
  69 + }
  70 + return sentence2instance;
  71 + }
  72 +
  73 + public static Instances createNewInstances(ArrayList<Attribute> attributesList) {
  74 + Instances instances = new Instances(DATASET_NAME, attributesList, 0);
  75 + instances.setClassIndex(0);
  76 + return instances;
  77 + }
  78 +
  79 + public static Classifier loadClassifier(String path) throws IOException, ClassNotFoundException {
  80 + LOG.info("Loading classifier...");
  81 + try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(path))) {
  82 + Classifier classifier = (Classifier) ois.readObject();
  83 + LOG.info("Done. " + classifier.toString());
  84 + return classifier;
  85 + }
  86 + }
  87 +
  88 + public static Map<String, TText> loadPreprocessedTexts(String path) {
  89 + Map<String, TText> id2text = Maps.newHashMap();
  90 + for (File processedFullTextFile : new File(path).listFiles()) {
  91 + TText processedFullText = loadThrifted(processedFullTextFile);
  92 + id2text.put(processedFullTextFile.getName().split("\\.")[0], processedFullText);
  93 + }
  94 + LOG.info(id2text.size() + " preprocessed texts found.");
  95 + return id2text;
  96 + }
  97 +
  98 +
  99 + public static TText loadThrifted(File originalFile) {
  100 + try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(originalFile))) {
  101 + return (TText) ois.readObject();
  102 + } catch (ClassNotFoundException | IOException e) {
  103 + LOG.error("Error reading serialized file " + originalFile, e);
  104 + return null;
  105 + }
  106 + }
  107 +
  108 + public static List<String> tokenize(String text) {
  109 + return Arrays.asList(text.split("[^\\p{L}0-9]+"));
  110 + }
  111 +
  112 + public static List<String> tokenizeOnWhitespace(String text) {
  113 + return Arrays.asList(text.split(" +"));
  114 + }
  115 +
  116 + public static Map<TMention, String> loadMention2HeadOrth(List<TSentence> sents) {
  117 + Map<TMention, String> mention2orth = Maps.newHashMap();
  118 + for (TSentence s : sents) {
  119 + Map<String, String> tokId2orth = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, TToken::getOrth));
  120 + Map<String, Boolean> tokId2nps = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, TToken::isNoPrecedingSpace));
  121 +
  122 + for (TMention m : s.getMentions()) {
  123 + StringBuilder mentionOrth = new StringBuilder();
  124 + for (String tokId : m.getHeadIds()) {
  125 + if (!tokId2nps.get(tokId))
  126 + mentionOrth.append(" ");
  127 + mentionOrth.append(tokId2orth.get(tokId));
  128 + }
  129 + mention2orth.put(m, mentionOrth.toString().trim());
  130 + }
  131 + }
  132 + return mention2orth;
  133 + }
  134 +
  135 + private static final Collection<String> STOPWORDS = Sets.newHashSet("i", "się", "to", "co");
  140 +
  141 + public static Map<TMention, String> loadMention2Orth(List<TSentence> sents) {
  142 + Map<TMention, String> mention2orth = Maps.newHashMap();
  143 + for (TSentence s : sents) {
  144 + Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity()));
  145 +
  146 + for (TMention m : s.getMentions()) {
  147 + StringBuilder mentionOrth = new StringBuilder();
  148 + for (String tokId : m.getChildIds()) {
  149 + TToken token = tokId2tok.get(tokId);
  150 + if (STOPWORDS.contains(token.getChosenInterpretation().getBase().toLowerCase())) {
  151 + continue;
  152 + }
  153 +
  154 + if (!token.isNoPrecedingSpace())
  155 + mentionOrth.append(" ");
  156 + mentionOrth.append(token.getOrth());
  157 + }
  158 + mention2orth.put(m, mentionOrth.toString().trim());
  159 + }
  160 + }
  161 + return mention2orth;
  162 + }
  163 +
  164 + public static Map<TMention, String> loadMention2Base(List<TSentence> sents) {
  165 + Map<TMention, String> mention2base = Maps.newHashMap();
  166 + for (TSentence s : sents) {
  167 + Map<String, String> tokId2base = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, tok -> tok.getChosenInterpretation().getBase()));
  168 +
  169 + for (TMention m : s.getMentions()) {
  170 + StringBuilder mentionBase = new StringBuilder();
  171 + for (String tokId : m.getChildIds()) {
  172 + mentionBase.append(" ");
  173 + mentionBase.append(tokId2base.get(tokId));
  174 + }
  175 + mention2base.put(m, mentionBase.toString().toLowerCase().trim());
  176 + }
  177 + }
  178 + return mention2base;
  179 + }
  180 +
  181 + public static String loadSentence2Orth(TSentence sentence) {
  182 + StringBuilder sb = new StringBuilder();
  183 + for (TToken token : sentence.getTokens()) {
  184 + if (!token.isNoPrecedingSpace())
  185 + sb.append(" ");
  186 + sb.append(token.getOrth());
  187 + }
  188 + return sb.toString().trim();
  189 + }
  190 +
  191 + public static Set<TMention> loadGoldGoodMentions(String id, TText text, boolean dev) throws IOException {
  192 + String optimalSummary = Files.toString(new File("src/main/resources/optimal_summaries/" + (dev ? "dev" : "test") + "/" + id + "_theoretic_ub_rouge_1.txt"), Charsets.UTF_8);
  193 +
  194 + MentionScorer scorer = new MentionScorer();
  195 + Map<TMention, Double> mention2score = scorer.calculateMentionScores(optimalSummary, text);
  196 +
  197 + mention2score.keySet().removeIf(tMention -> mention2score.get(tMention) != 1.0);
  198 + return mention2score.keySet();
  199 + }
  200 +}
0 201 \ No newline at end of file
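
For clarity, the two tokenizers above differ in what they strip (the behavior follows directly from the regexes; the example input is invented):

    Utils.tokenize("Ala ma kota, nie?");             // [Ala, ma, kota, nie]: splits on any non-letter, non-digit run
    Utils.tokenizeOnWhitespace("Ala ma kota, nie?"); // [Ala, ma, kota,, nie?]: keeps punctuation attached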
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel2.java 0 → 100644
  1 +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel2.java
  1 +package pl.waw.ipipan.zil.summ.nicolas.apply;
  2 +
  3 +import com.google.common.collect.Lists;
  4 +import com.google.common.collect.Maps;
  5 +import com.google.common.collect.Sets;
  6 +import org.slf4j.Logger;
  7 +import org.slf4j.LoggerFactory;
  8 +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
  9 +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
  10 +import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  11 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
  12 +import pl.waw.ipipan.zil.summ.nicolas.Utils;
  13 +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
  14 +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel;
  15 +import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
  16 +import weka.classifiers.Classifier;
  17 +import weka.core.Instance;
  18 +import weka.core.Instances;
  19 +
  20 +import java.io.BufferedWriter;
  21 +import java.io.File;
  22 +import java.io.FileWriter;
  23 +import java.util.*;
  24 +
  25 +import static java.util.stream.Collectors.toList;
  26 +
  27 +public class ApplyModel2 {
  28 +
  29 + private static final Logger LOG = LoggerFactory.getLogger(ApplyModel2.class);
  30 +
  31 + private static final String TEST_PREPROCESSED_DATA_PATH = "src/main/resources/preprocessed_full_texts/test";
  32 + private static final String TARGET_DIR = "summaries";
  33 +
  34 + public static void main(String[] args) throws Exception {
  35 + Classifier mentionClassifier = Utils.loadClassifier(Constants.MENTIONS_MODEL_PATH);
  36 + MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor();
  37 +
  38 + Classifier sentenceClassifier = Utils.loadClassifier(Constants.SENTENCES_MODEL_PATH);
  39 + SentenceFeatureExtractor sentenceFeatureExtractor = new SentenceFeatureExtractor();
  40 +
  41 + Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(TEST_PREPROCESSED_DATA_PATH);
  42 + int i = 1;
  43 + double avgSize = 0;
  44 + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {
  45 + TText text = entry.getValue();
  46 +
  47 + Set<TMention> goodMentions
  48 + = MentionModel.detectGoodMentions(mentionClassifier, featureExtractor, text);
  49 +
  50 + int targetSize = calculateTargetSize(text);
  51 + String summary = calculateSummary(text, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor);
  52 + int size = Utils.tokenize(summary).size();
  53 + avgSize += size;
  54 + try (BufferedWriter bw = new BufferedWriter(new FileWriter(new File(TARGET_DIR, entry.getKey() + "_emily3.txt")))) {
  55 + bw.append(summary);
  56 + }
  57 +
  58 + LOG.info(i++ + "/" + id2preprocessedText.size() + " id: " + entry.getKey());
  59 + }
  60 +
  61 + LOG.info("Avg size:" + avgSize / id2preprocessedText.size());
  62 + }
  63 +
  64 + private static int calculateTargetSize(TText text) {
  65 + List<TSentence> sents = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
  66 + StringBuilder body = new StringBuilder();
  67 + for (TSentence sent : sents)
  68 + body.append(Utils.loadSentence2Orth(sent)).append(' ');
  69 + int tokenCount = Utils.tokenizeOnWhitespace(body.toString().trim()).size();
  70 + return (int) (0.2 * tokenCount);
  71 + }
  72 +
  73 + private static String calculateSummary(TText thrifted, Set<TMention> goodMentions, int targetSize, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception {
  74 + List<TSentence> selectedSentences = selectSummarySentences(thrifted, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor);
  75 +
  76 + StringBuilder sb = new StringBuilder();
  77 + for (TSentence sent : selectedSentences) {
  78 + sb.append(' ').append(Utils.loadSentence2Orth(sent));
  79 + }
  80 + return sb.toString().trim();
  81 + }
  82 +
  83 + private static List<TSentence> selectSummarySentences(TText thrifted, Set<TMention> goodMentions, int targetSize, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception {
  84 +
  85 + List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
  86 +
  87 + Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList());
  88 + Map<TSentence, Instance> sentence2instance = Utils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions);
  89 +
  90 + Map<TSentence, Double> sentence2score = Maps.newHashMap();
  91 + for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) {
  92 + Instance instance = entry.getValue();
  93 + instance.setDataset(instances);
  94 + double score = sentenceClassifier.classifyInstance(instance);
  95 + sentence2score.put(entry.getKey(), score);
  96 + }
  97 +
  98 + List<TSentence> sortedSents = Lists.newArrayList(sents);
  99 + Collections.sort(sortedSents, Comparator.comparing(sentence2score::get).reversed());
  100 +
  101 + int size = 0;
  102 + Random r = new Random(1); // fixed seed: reproducibly decides whether the sentence that crosses the target length is kept
  103 + Set<TSentence> summary = Sets.newHashSet();
  104 + for (TSentence sent : sortedSents) {
  105 + size += Utils.tokenizeOnWhitespace(Utils.loadSentence2Orth(sent)).size();
  106 + if (r.nextDouble() > 0.4 && size > targetSize)
  107 + break;
  108 + summary.add(sent);
  109 + if (size > targetSize)
  110 + break;
  111 + }
  112 + List<TSentence> selectedSentences = Lists.newArrayList();
  113 + for (TSentence sent : sents) {
  114 + if (summary.contains(sent))
  115 + selectedSentences.add(sent);
  116 + }
  117 + return selectedSentences;
  118 + }
  119 +
  120 +}
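
For scale, calculateTargetSize keeps roughly a fifth of the source text: a 1500-token input yields (int) (0.2 * 1500) = 300 summary tokens.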
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureExtractor.java 0 → 100644
  1 +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureExtractor.java
  1 +package pl.waw.ipipan.zil.summ.nicolas.features;
  2 +
  3 +import com.google.common.collect.*;
  4 +import org.slf4j.Logger;
  5 +import org.slf4j.LoggerFactory;
  6 +import weka.core.Attribute;
  7 +
  8 +import java.util.*;
  9 +
  10 +public class FeatureExtractor {
  11 +
  12 + protected static final Logger LOG = LoggerFactory.getLogger(FeatureExtractor.class);
  13 +
  14 + private final List<Attribute> sortedAttributes = Lists.newArrayList();
  15 +
  16 + private final BiMap<String, Attribute> name2attribute = HashBiMap.create();
  17 +
  18 + private final Set<String> normalizedAttributes = Sets.newHashSet();
  19 +
  20 + public ArrayList<Attribute> getAttributesList() {
  21 + return Lists.newArrayList(sortedAttributes);
  22 + }
  23 +
  24 + protected Attribute getAttributeByName(String name) {
  25 + return name2attribute.get(name);
  26 + }
  27 +
  28 + protected void addNumericAttribute(String attributeName) {
  29 + name2attribute.put(attributeName, new Attribute(attributeName));
  30 + }
  31 +
  32 + protected void addBinaryAttribute(String attributeName) {
  33 + name2attribute.put(attributeName, new Attribute(attributeName, Lists.newArrayList("f", "t")));
  34 + }
  35 +
  36 + protected void addNominalAttribute(String attributeName, List<String> values) {
  37 + name2attribute.put(attributeName, new Attribute(attributeName, values));
  38 + }
  39 +
  40 + protected void addNumericAttributeNormalized(String attributeName) {
  41 + addNumericAttribute(attributeName);
  42 + addNumericAttribute(attributeName + "_normalized");
  43 + normalizedAttributes.add(attributeName);
  44 + }
  45 +
  46 + protected void fillSortedAttributes(String scoreAttName) {
  47 + sortedAttributes.addAll(name2attribute.values());
  48 + sortedAttributes.remove(getAttributeByName(scoreAttName));
  49 + Collections.sort(sortedAttributes, (o1, o2) -> name2attribute.inverse().get(o1).compareTo(name2attribute.inverse().get(o2)));
  50 + sortedAttributes.add(0, getAttributeByName(scoreAttName));
  51 + }
  52 +
  53 + protected <T> void addNormalizedAttributeValues(Map<T, Map<Attribute, Double>> entity2attributes) {
  54 + Map<Attribute, Double> attribute2max = Maps.newHashMap();
  55 + Map<Attribute, Double> attribute2min = Maps.newHashMap();
  56 + for (T entity : entity2attributes.keySet()) {
  57 + Map<Attribute, Double> entityAttributes = entity2attributes.get(entity);
  58 + for (String attributeName : normalizedAttributes) {
  59 + Attribute attribute = getAttributeByName(attributeName);
  60 + Double value = entityAttributes.get(attribute);
  61 +
  62 + attribute2max.putIfAbsent(attribute, Double.NEGATIVE_INFINITY); // Double.MIN_VALUE is the smallest positive double, not the most negative value
  63 + attribute2max.compute(attribute, (k, v) -> Math.max(v, value));
  64 +
  65 + attribute2min.putIfAbsent(attribute, Double.MAX_VALUE);
  66 + attribute2min.compute(attribute, (k, v) -> Math.min(v, value));
  67 + }
  68 + }
  69 + for (T mention : entity2attributes.keySet()) {
  70 + Map<Attribute, Double> entityAttributes = entity2attributes.get(mention);
  71 + for (Attribute attribute : attribute2max.keySet()) {
  72 + Attribute normalizedAttribute = getAttributeByName(name2attribute.inverse().get(attribute) + "_normalized");
  73 + entityAttributes.put(normalizedAttribute,
  74 + (entityAttributes.get(attribute) - attribute2min.get(attribute))
  75 + / (attribute2max.get(attribute) - attribute2min.get(attribute)));
  76 + }
  77 + }
  78 + }
  79 +
  80 + protected double toBinary(boolean bool) {
  81 + return bool ? 1.0 : 0.0;
  82 + }
  83 +}
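
addNormalizedAttributeValues applies per-attribute min-max scaling; note that when all entities share the same value, max - min is zero and the division yields NaN. A standalone sketch of the same scaling (values invented):

    double[] values = {2.0, 5.0, 11.0};
    double min = Arrays.stream(values).min().getAsDouble(); // 2.0
    double max = Arrays.stream(values).max().getAsDouble(); // 11.0
    for (double v : values) {
        double normalized = (v - min) / (max - min);        // 0.0, 0.33..., 1.0
    }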
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java 0 → 100644
  1 +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java
  1 +package pl.waw.ipipan.zil.summ.nicolas.features;
  2 +
  3 +import com.google.common.collect.Maps;
  4 +import com.google.common.collect.Sets;
  5 +import pl.waw.ipipan.zil.multiservice.thrift.types.*;
  6 +import pl.waw.ipipan.zil.summ.nicolas.Utils;
  7 +
  8 +import java.util.List;
  9 +import java.util.Map;
  10 +import java.util.Set;
  11 +import java.util.function.Function;
  12 +import java.util.stream.Collectors;
  13 +
  14 +import static java.util.stream.Collectors.toList;
  15 +import static java.util.stream.Collectors.toMap;
  16 +
  17 +/**
  18 + * Created by me2 on 04.04.16.
  19 + */
  20 +public class FeatureHelper {
  21 +
  22 + private final List<TMention> mentions;
  23 + private final Map<String, TMention> mentionId2mention;
  24 + private final Map<TCoreference, List<TMention>> coref2mentions = Maps.newHashMap();
  25 + private final Map<TMention, TCoreference> mention2coref = Maps.newHashMap();
  26 + private final Map<TMention, TSentence> mention2sent = Maps.newHashMap();
  27 + private final Map<TMention, TParagraph> mention2par = Maps.newHashMap();
  28 + private final Map<TMention, String> mention2Orth = Maps.newHashMap();
  29 + private final Map<TMention, String> mention2Base = Maps.newHashMap();
  30 + private final Map<TMention, TToken> mention2head = Maps.newHashMap();
  31 + private final Set<TMention> mentionsInNamedEntities = Sets.newHashSet();
  32 +
  33 + private final Map<TMention, Integer> mention2Index = Maps.newHashMap();
  34 + private final Map<TSentence, Integer> sent2Index = Maps.newHashMap();
  35 + private final Map<TParagraph, Integer> par2Index = Maps.newHashMap();
  36 + private final Map<TSentence, Integer> sent2IndexInPar = Maps.newHashMap();
  37 + private final Map<TMention, Integer> mention2indexInPar = Maps.newHashMap();
  38 + private final Map<TMention, Integer> mention2indexInSent = Maps.newHashMap();
  39 +
  40 +
  41 + public FeatureHelper(TText preprocessedText) {
  42 + mentions = preprocessedText.getParagraphs().stream()
  43 + .flatMap(p -> p.getSentences().stream())
  44 + .flatMap(s -> s.getMentions().stream()).collect(Collectors.toList());
  45 +
  46 + mentionId2mention = mentions.stream().collect(Collectors.toMap(TMention::getId, Function.identity()));
  47 +
  48 + for (TCoreference coref : preprocessedText.getCoreferences()) {
  49 + List<TMention> ments = coref.getMentionIds().stream().map(mentionId2mention::get).collect(toList());
  50 + for (TMention m : ments) {
  51 + mention2coref.put(m, coref);
  52 + }
  53 + coref2mentions.put(coref, ments);
  54 + }
  55 +
  56 + int parIdx = 0;
  57 + int sentIdx = 0;
  58 + int mentionIdx = 0;
  59 + for (TParagraph par : preprocessedText.getParagraphs()) {
  60 + Map<TMention, String> m2o = Utils.loadMention2Orth(par.getSentences());
  61 + mention2Orth.putAll(m2o);
  62 + Map<TMention, String> m2b = Utils.loadMention2Base(par.getSentences());
  63 + mention2Base.putAll(m2b);
  64 +
  65 + int sentIdxInPar = 0;
  66 + int mentionIdxInPar = 0;
  67 + for (TSentence sent : par.getSentences()) {
  68 +
  69 + Map<String, TToken> tokenId2token = sent.getTokens().stream().collect(toMap(TToken::getId, Function.identity()));
  70 +
  71 + Map<String, Set<TNamedEntity>> tokenId2namedEntities = Maps.newHashMap();
  72 + for (TNamedEntity namedEntity : sent.getNames()) {
  73 + for (String childId : namedEntity.getChildIds()) {
  74 + tokenId2namedEntities.putIfAbsent(childId, Sets.newHashSet());
  75 + tokenId2namedEntities.get(childId).add(namedEntity);
  76 + }
  77 + }
  78 +
  79 + int mentionIdxInSent = 0;
  80 + for (TMention mention : sent.getMentions()) {
  81 + mention2sent.put(mention, sent);
  82 + mention2par.put(mention, par);
  83 + mention2Index.put(mention, mentionIdx++);
  84 + mention2indexInSent.put(mention, mentionIdxInSent++);
  85 + mention2indexInPar.put(mention, mentionIdxInPar++);
  86 +
  87 + String firstHeadTokenId = mention.getHeadIds().iterator().next();
  88 + mention2head.put(mention, tokenId2token.get(firstHeadTokenId));
  89 + if (tokenId2namedEntities.containsKey(firstHeadTokenId))
  90 + mentionsInNamedEntities.add(mention);
  91 + }
  92 + sent2Index.put(sent, sentIdx++);
  93 + sent2IndexInPar.put(sent, sentIdxInPar++);
  94 + }
  95 +
  96 + par2Index.put(par, parIdx++);
  97 + }
  98 + }
  99 +
  100 + public List<TMention> getMentions() {
  101 + return mentions;
  102 + }
  103 +
  104 + public int getMentionIndexInChain(TMention mention) {
  105 + return coref2mentions.get(mention2coref.get(mention)).indexOf(mention);
  106 + }
  107 +
  108 + public int getChainLength(TMention mention) {
  109 + return coref2mentions.get(mention2coref.get(mention)).size();
  110 + }
  111 +
  112 + public String getSentenceLastTokenOrth(TSentence sent) {
  113 + return sent.getTokens().get(sent.getTokensSize() - 1).getOrth();
  114 + }
  115 +
  116 + public String getMentionOrth(TMention mention) {
  117 + return mention2Orth.get(mention);
  118 + }
  119 +
  120 + public String getMentionBase(TMention mention) {
  121 + return mention2Base.get(mention);
  122 + }
  123 +
  124 + public int getMentionIndex(TMention mention) {
  125 + return mention2Index.get(mention);
  126 + }
  127 +
  128 + public int getMentionIndexInSent(TMention mention) {
  129 + return mention2indexInSent.get(mention);
  130 + }
  131 +
  132 + public int getMentionIndexInPar(TMention mention) {
  133 + return mention2indexInPar.get(mention);
  134 + }
  135 +
  136 + public int getParIndex(TParagraph paragraph) {
  137 + return par2Index.get(paragraph);
  138 + }
  139 +
  140 + public int getSentIndex(TSentence sent) {
  141 + return sent2Index.get(sent);
  142 + }
  143 +
  144 + public int getSentIndexInPar(TSentence sent) {
  145 + return sent2IndexInPar.get(sent);
  146 + }
  147 +
  148 + public TParagraph getMentionParagraph(TMention mention) {
  149 + return mention2par.get(mention);
  150 + }
  151 +
  152 + public TSentence getMentionSentence(TMention mention) {
  153 + return mention2sent.get(mention);
  154 + }
  155 +
  156 + public TMention getFirstChainMention(TMention mention) {
  157 + return mentionId2mention.get(mention2coref.get(mention).getMentionIdsIterator().next());
  158 + }
  159 +
  160 + public TToken getMentionHeadToken(TMention mention) {
  161 + return mention2head.get(mention);
  162 + }
  163 +
  164 + public boolean isMentionNamedEntity(TMention mention) {
  165 + return mentionsInNamedEntities.contains(mention);
  166 + }
  167 +
  168 + public boolean isNested(TMention mention) {
  169 + return mentions.stream().anyMatch(m -> m != mention && m.getChildIds().containsAll(mention.getChildIds())); // exclude the mention itself, which trivially contains its own tokens
  170 + }
  171 +
  172 + public boolean isNesting(TMention mention) {
  173 + return mentions.stream().anyMatch(m -> m != mention && mention.getChildIds().containsAll(m.getChildIds()));
  174 + }
  175 +
  176 + public Set<TCoreference> getClusters() {
  177 + return coref2mentions.keySet();
  178 + }
  179 +
  180 + public Set<TMention> getCoreferentMentions(TMention tMention) {
  181 + return getMentionCluster(tMention).getMentionIds().stream().map(this.mentionId2mention::get).collect(Collectors.toSet());
  182 + }
  183 +
  184 + public TCoreference getMentionCluster(TMention tMention) {
  185 + return this.mention2coref.get(tMention);
  186 + }
  187 +}
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/Interpretation.java 0 → 100644
  1 +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/Interpretation.java
  1 +package pl.waw.ipipan.zil.summ.nicolas.features;
  2 +
  3 +import pl.waw.ipipan.zil.multiservice.thrift.types.TInterpretation;
  4 +
  5 +
  6 +public class Interpretation {
  7 + private String ctag = "null";
  8 + private String casee = "null"; // "case" is a reserved word in Java
  9 + private String gender = "null";
  10 + private String number = "null";
  11 + private String person = "null";
  12 +
  13 + public Interpretation(TInterpretation chosenInterpretation) {
  14 + ctag = chosenInterpretation.getCtag();
  15 + String[] split = chosenInterpretation.getMsd().split(":");
  16 + switch (ctag) {
  17 + case "ger":
  18 + case "subst":
  19 + case "pact":
  20 + case "ppas":
  21 + case "num":
  22 + case "numcol":
  23 + case "adj":
  24 + number = split[0];
  25 + casee = split[1];
  26 + gender = split[2];
  27 + break;
  28 + case "ppron12":
  29 + case "ppron3":
  30 + number = split[0];
  31 + casee = split[1];
  32 + gender = split[2];
  33 + person = split[3];
  34 + break;
  35 + case "siebie":
  36 + casee = split[0];
  37 + break;
  38 + case "fin":
  39 + case "bedzie":
  40 + case "aglt":
  41 + case "impt":
  42 + number = split[0];
  43 + person = split[1];
  44 + break;
  45 + case "praet":
  46 + case "winien":
  47 + number = split[0];
  48 + gender = split[1];
  49 + break;
  50 + case "prep":
  51 + casee = split[0];
  52 + break;
  53 + default:
  54 + break;
  55 + }
  56 + }
  57 +
  58 + public String getCase() {
  59 + return casee;
  60 + }
  61 +
  62 + public String getGender() {
  63 + return gender;
  64 + }
  65 +
  66 + public String getNumber() {
  67 + return number;
  68 + }
  69 +
  70 + public String getPerson() {
  71 + return person;
  72 + }
  73 +
  74 + public String getCtag() {
  75 + return ctag;
  76 + }
  77 +}
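
A small illustration of the MSD parsing above, assuming the Thrift-generated setters on TInterpretation (values invented):

    TInterpretation raw = new TInterpretation();
    raw.setCtag("subst");
    raw.setMsd("sg:nom:m1");
    Interpretation parsed = new Interpretation(raw);
    // parsed.getNumber() == "sg", parsed.getCase() == "nom", parsed.getGender() == "m1"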
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java 0 → 100644
  1 +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java
  1 +package pl.waw.ipipan.zil.summ.nicolas.mention;
  2 +
  3 +import com.google.common.collect.*;
  4 +import pl.waw.ipipan.zil.multiservice.thrift.types.*;
  5 +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor;
  6 +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
  7 +import pl.waw.ipipan.zil.summ.nicolas.features.Interpretation;
  8 +import weka.core.Attribute;
  9 +
  10 +import java.io.File;
  11 +import java.io.IOException;
  12 +import java.nio.file.Files;
  13 +import java.util.*;
  14 +import java.util.stream.Collectors;
  15 +import java.util.stream.Stream;
  16 +
  17 +
  18 +public class MentionFeatureExtractor extends FeatureExtractor {
  19 +
  20 + private final List<String> frequentBases = Lists.newArrayList();
  21 +
  22 + public MentionFeatureExtractor() {
  23 +
  24 + //coref
  25 + addNumericAttributeNormalized("chain_length");
  26 +
  27 + // text characteristics
  28 + addNumericAttribute("text_char_count"); addNumericAttribute("text_token_count"); // char count is set in calculateFeatures and must be declared
  29 + addNumericAttribute("text_sent_count");
  30 + addNumericAttribute("text_par_count");
  31 + addNumericAttribute("text_mention_count");
  32 + addNumericAttribute("text_cluster_count");
  33 +
  34 + //mention characteristics
  35 + for (String prefix : Lists.newArrayList("mention", "chain_first_mention")) {
  36 + // mention characteristics
  37 + addNumericAttributeNormalized(prefix + "_index");
  38 + addNumericAttributeNormalized(prefix + "_index_in_sent");
  39 + addNumericAttributeNormalized(prefix + "_index_in_par");
  40 + addNumericAttributeNormalized(prefix + "_index_in_chain");
  41 + addBinaryAttribute(prefix + "_capitalized");
  42 + addBinaryAttribute(prefix + "_all_caps");
  43 + addNumericAttributeNormalized(prefix + "_char_count");
  44 + addNumericAttributeNormalized(prefix + "_token_count");
  45 + addBinaryAttribute(prefix + "_is_zero");
  46 + addBinaryAttribute(prefix + "_is_named");
  47 + addBinaryAttribute(prefix + "_is_pronoun");
  48 + addNominalAttribute(prefix + "_ctag", Lists.newArrayList("other", "null", "impt", "subst", "aglt", "ppron3", "ger", "praet", "fin", "num", "interp", "siebie", "brev", "interj", "ppron12", "adj", "burk", "pcon", "bedzie", "adv", "prep", "depr", "xxx", "winien", "conj", "qub", "adja", "ppas", "comp", "pact"));
  49 + addNominalAttribute(prefix + "_person", Lists.newArrayList("other", "null", "pri", "sec", "ter"));
  50 + addNominalAttribute(prefix + "_case", Lists.newArrayList("other", "null", "nom", "acc", "dat", "gen", "loc", "inst", "voc"));
  51 + addNominalAttribute(prefix + "_number", Lists.newArrayList("other", "null", "sg", "pl"));
  52 + addNominalAttribute(prefix + "_gender", Lists.newArrayList("other", "null", "f", "m1", "m2", "m3", "n"));
  53 +
  54 + // relation to other
  55 + addBinaryAttribute(prefix + "_is_nested");
  56 + addBinaryAttribute(prefix + "_is_nesting");
  57 +
  58 + // par characteristics
  59 + addNumericAttributeNormalized(prefix + "_par_idx");
  60 + addNumericAttributeNormalized(prefix + "_par_token_count");
  61 + addNumericAttributeNormalized(prefix + "_par_sent_count");
  62 +
  63 + // sent characteristics
  64 + addNumericAttributeNormalized(prefix + "_sent_token_count");
  65 + addNumericAttributeNormalized(prefix + "_sent_mention_count");
  66 + addNumericAttributeNormalized(prefix + "_sent_idx");
  67 + addNumericAttributeNormalized(prefix + "_sent_idx_in_par");
  68 + addBinaryAttribute(prefix + "_sent_ends_with_dot");
  69 + addBinaryAttribute(prefix + "_sent_ends_with_questionmark");
  70 +
  71 + // frequent bases
  72 + loadFrequentBases();
  73 + for (String base : frequentBases) {
  74 + addBinaryAttribute(prefix + "_" + encodeBase(base));
  75 + }
  76 + }
  77 +
  78 + addNominalAttribute("score", Lists.newArrayList("bad", "good"));
  79 + fillSortedAttributes("score");
  80 + }
  81 +
  82 + private String encodeBase(String base) {
  83 + return "base_equal_" + base.replaceAll(" ", "_").replaceAll("\"", "Q");
  84 + }
  85 +
  86 + private void loadFrequentBases() {
  87 + if (!frequentBases.isEmpty()) return; // the constructor calls this once per prefix; load the file only once
  88 + try (Stream<String> lines = Files.lines(new File("frequent_bases.txt").toPath())) {
  89 + this.frequentBases.addAll(lines.map(String::trim).collect(Collectors.toList()));
  90 + } catch (IOException e) {
  91 + LOG.warn("Could not load frequent bases from frequent_bases.txt", e);
  92 + }
  93 + }
  94 +
  95 + public Map<TMention, Map<Attribute, Double>> calculateFeatures(TText preprocessedText) {
  96 + Map<TMention, Map<Attribute, Double>> result = Maps.newHashMap();
  97 +
  98 + FeatureHelper helper = new FeatureHelper(preprocessedText);
  99 +
  100 + addScoreFeature(result, helper.getMentions());
  101 +
  102 + for (TMention mention : helper.getMentions()) {
  103 + Map<Attribute, Double> attribute2value = result.get(mention);
  104 +
  105 + //mention
  106 + addMentionAttributes(helper, mention, attribute2value, "mention");
  107 +
  108 + //first chain mention
  109 + TMention firstChainMention = helper.getFirstChainMention(mention);
  110 + addMentionAttributes(helper, firstChainMention, attribute2value, "chain_first_mention");
  111 +
  112 + //coref
  113 + attribute2value.put(getAttributeByName("chain_length"), (double) helper.getChainLength(mention));
  114 +
  115 + //text
  116 + List<TParagraph> pars = preprocessedText.getParagraphs();
  117 + List<TSentence> sents = pars.stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList());
  118 + List<TToken> tokens = sents.stream().flatMap(s -> s.getTokens().stream()).collect(Collectors.toList());
  119 + attribute2value.put(getAttributeByName("text_char_count"), tokens.stream().mapToDouble(t -> t.getOrth().length()).sum());
  120 + attribute2value.put(getAttributeByName("text_token_count"), (double) tokens.size());
  121 + attribute2value.put(getAttributeByName("text_sent_count"), (double) sents.size());
  122 + attribute2value.put(getAttributeByName("text_par_count"), (double) pars.size());
  123 + attribute2value.put(getAttributeByName("text_mention_count"), (double) helper.getMentions().size());
  124 + attribute2value.put(getAttributeByName("text_cluster_count"), (double) helper.getClusters().size());
  125 +
  126 + assert (attribute2value.size() == getAttributesList().size());
  127 + }
  128 + addNormalizedAttributeValues(result);
  129 +
  130 + return result;
  131 + }
  132 +
  133 + private void addMentionAttributes(FeatureHelper helper, TMention mention, Map<Attribute, Double> attribute2value, String attributePrefix) {
  134 + // mention characteristics
  135 + attribute2value.put(getAttributeByName(attributePrefix + "_index"), (double) helper.getMentionIndex(mention));
  136 + attribute2value.put(getAttributeByName(attributePrefix + "_index_in_sent"), (double) helper.getMentionIndexInSent(mention));
  137 + attribute2value.put(getAttributeByName(attributePrefix + "_index_in_par"), (double) helper.getMentionIndexInPar(mention));
  138 + attribute2value.put(getAttributeByName(attributePrefix + "_index_in_chain"), (double) helper.getMentionIndexInChain(mention));
  139 + attribute2value.put(getAttributeByName(attributePrefix + "_token_count"), (double) mention.getChildIdsSize());
  140 + attribute2value.put(getAttributeByName(attributePrefix + "_is_zero"), toBinary(mention.isZeroSubject()));
  141 + attribute2value.put(getAttributeByName(attributePrefix + "_is_pronoun"), toBinary(helper.getMentionHeadToken(mention).getChosenInterpretation().getCtag().matches("ppron.*")));
  142 + attribute2value.put(getAttributeByName(attributePrefix + "_is_named"), toBinary(helper.isMentionNamedEntity(mention)));
  143 +
  144 + Interpretation interp = new Interpretation(helper.getMentionHeadToken(mention).getChosenInterpretation());
  145 + addNominalAttributeValue(interp.getCtag(), attribute2value, attributePrefix + "_ctag");
  146 + addNominalAttributeValue(interp.getPerson(), attribute2value, attributePrefix + "_person");
  147 + addNominalAttributeValue(interp.getNumber(), attribute2value, attributePrefix + "_number");
  148 + addNominalAttributeValue(interp.getGender(), attribute2value, attributePrefix + "_gender");
  149 + addNominalAttributeValue(interp.getCase(), attribute2value, attributePrefix + "_case");
  150 +
  151 + // relation to other mentions
  152 + attribute2value.put(getAttributeByName(attributePrefix + "_is_nested"), toBinary(helper.isNested(mention)));
  153 + attribute2value.put(getAttributeByName(attributePrefix + "_is_nesting"), toBinary(helper.isNesting(mention)));
  154 +
  155 + String orth = helper.getMentionOrth(mention);
  156 + attribute2value.put(getAttributeByName(attributePrefix + "_capitalized"), toBinary(orth.length() != 0 && orth.substring(0, 1).toUpperCase().equals(orth.substring(0, 1))));
  157 + attribute2value.put(getAttributeByName(attributePrefix + "_all_caps"), toBinary(orth.toUpperCase().equals(orth)));
  158 + attribute2value.put(getAttributeByName(attributePrefix + "_char_count"), (double) orth.length());
  159 +
  160 + // par characteristics
  161 + TParagraph mentionParagraph = helper.getMentionParagraph(mention);
  162 + attribute2value.put(getAttributeByName(attributePrefix + "_par_idx"), (double) helper.getParIndex(mentionParagraph));
  163 + attribute2value.put(getAttributeByName(attributePrefix + "_par_token_count"), mentionParagraph.getSentences().stream().map(s -> s.getTokens().size()).mapToDouble(s -> s).sum());
  164 + attribute2value.put(getAttributeByName(attributePrefix + "_par_sent_count"), (double) mentionParagraph.getSentences().size());
  165 +
  166 + // sent characteristics
  167 + TSentence mentionSentence = helper.getMentionSentence(mention);
  168 + attribute2value.put(getAttributeByName(attributePrefix + "_sent_token_count"), (double) mentionSentence.getTokensSize());
  169 + attribute2value.put(getAttributeByName(attributePrefix + "_sent_mention_count"), (double) mentionSentence.getMentions().size());
  170 + attribute2value.put(getAttributeByName(attributePrefix + "_sent_idx"), (double) helper.getSentIndex(mentionSentence));
  171 + attribute2value.put(getAttributeByName(attributePrefix + "_sent_idx_in_par"), (double) helper.getSentIndexInPar(mentionSentence));
  172 + attribute2value.put(getAttributeByName(attributePrefix + "_sent_ends_with_dot"), toBinary(helper.getSentenceLastTokenOrth(mentionSentence).equals(".")));
  173 + attribute2value.put(getAttributeByName(attributePrefix + "_sent_ends_with_questionmark"), toBinary(helper.getSentenceLastTokenOrth(mentionSentence).equals("?")));
  174 +
  175 + // frequent bases
  176 + String mentionBase = helper.getMentionBase(mention);
  177 + for (String base : frequentBases) {
  178 + attribute2value.put(getAttributeByName(attributePrefix + "_" + encodeBase(base)), toBinary(mentionBase.equals(base)));
  179 + }
  180 + }
  181 +
  182 + private void addNominalAttributeValue(String value, Map<Attribute, Double> attribute2value, String attributeName) {
  183 + Attribute att = getAttributeByName(attributeName);
  184 + int index = att.indexOfValue(value);
  185 + if (index == -1)
  186 + LOG.warn(value + " not found for attribute " + attributeName);
  187 + attribute2value.put(att, (double) (index == -1 ? att.indexOfValue("other") : index));
  188 + }
  189 +
  190 +
  191 + private void addScoreFeature(Map<TMention, Map<Attribute, Double>> result, List<TMention> mentions) {
  192 + for (TMention m : mentions) {
  193 + Map<Attribute, Double> map = Maps.newHashMap();
  194 + map.put(getAttributeByName("score"), weka.core.Utils.missingValue());
  195 + result.put(m, map);
  196 + }
  197 + }
  198 +
  199 +}
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java 0 → 100644
  1 +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java
  1 +package pl.waw.ipipan.zil.summ.nicolas.mention;
  2 +
  3 +import com.google.common.collect.Sets;
  4 +import org.slf4j.Logger;
  5 +import org.slf4j.LoggerFactory;
  6 +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
  7 +import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  8 +import pl.waw.ipipan.zil.summ.nicolas.Utils;
  9 +import weka.classifiers.Classifier;
  10 +import weka.core.Instance;
  11 +import weka.core.Instances;
  12 +
  13 +import java.util.Map;
  14 +import java.util.Set;
  15 +
  16 +public class MentionModel {
  17 +
  18 + private static final Logger LOG = LoggerFactory.getLogger(MentionModel.class);
  19 +
  20 + public static Set<TMention> detectGoodMentions(Classifier classifier, MentionFeatureExtractor featureExtractor, TText text) throws Exception {
  21 + Set<TMention> goodMentions = Sets.newHashSet();
  22 +
  23 + Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList());
  24 + Map<TMention, Instance> mention2instance = Utils.extractInstancesFromMentions(text, featureExtractor);
  25 + for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) {
  26 + Instance instance = entry.getValue();
  27 + instance.setDataset(instances);
  28 + instance.setClassMissing();
  29 + boolean good = classifier.classifyInstance(instance) > 0.5; // nominal class index: 1.0 means "good"
  30 + if (good)
  31 + goodMentions.add(entry.getKey());
  32 + }
  33 + LOG.info("\t" + goodMentions.size() + "\t" + mention2instance.size());
  34 + return goodMentions;
  35 + }
  36 +
  37 +}
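
A usage sketch built only from APIs in this commit (the input file name is hypothetical):

    Classifier classifier = Utils.loadClassifier(Constants.MENTIONS_MODEL_PATH);
    TText text = Utils.loadThrifted(new File("some_text.bin"));
    Set<TMention> good = MentionModel.detectGoodMentions(classifier, new MentionFeatureExtractor(), text);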
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionScorer.java 0 → 100644
  1 +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionScorer.java
  1 +package pl.waw.ipipan.zil.summ.nicolas.mention;
  2 +
  3 +import com.google.common.collect.HashMultiset;
  4 +import com.google.common.collect.Maps;
  5 +import com.google.common.collect.Multiset;
  6 +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
  7 +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
  8 +import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  9 +import pl.waw.ipipan.zil.summ.nicolas.Utils;
  10 +
  11 +import java.util.Collection;
  12 +import java.util.List;
  13 +import java.util.Map;
  14 +import java.util.stream.Collectors;
  15 +
  16 +public class MentionScorer {
  17 +
  18 +
  19 + public Map<TMention, Double> calculateMentionScores(String optimalSummary, TText text) {
  20 + Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase()));
  21 +
  22 + List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList());
  23 + Map<TMention, String> mention2Orth = Utils.loadMention2Orth(sentences);
  24 +
  25 + return booleanTokenIntersection(mention2Orth, tokenCounts);
  26 + }
  27 +
  28 + private static Map<TMention, Double> booleanTokenIntersection(Map<TMention, String> mention2Orth, Multiset<String> tokenCounts) {
  29 + Map<TMention, Double> mention2score = Maps.newHashMap();
  30 + for (Map.Entry<TMention, String> entry : mention2Orth.entrySet()) {
  31 + TMention mention = entry.getKey();
  32 + String mentionOrth = entry.getValue();
  33 + for (String token : Utils.tokenize(mentionOrth)) {
  34 + if (tokenCounts.contains(token.toLowerCase())) {
  35 + mention2score.put(mention, 1.0);
  36 + break;
  37 + }
  38 + }
  39 + mention2score.putIfAbsent(mention, 0.0);
  40 + }
  41 + return mention2score;
  42 + }
  43 + // alternative scorer, currently unused: at least half of the mention's tokens must appear in the summary
  44 + private static Map<TMention, Double> booleanTokenInclusion(Map<TMention, String> mention2Orth, Multiset<String> tokenCounts) {
  45 + Map<TMention, Double> mention2score = Maps.newHashMap();
  46 + for (Map.Entry<TMention, String> entry : mention2Orth.entrySet()) {
  47 + TMention mention = entry.getKey();
  48 + String mentionOrth = entry.getValue();
  49 + int present = 0;
  50 + for (String token : Utils.tokenize(mentionOrth)) {
  51 + if (tokenCounts.contains(token.toLowerCase())) {
  52 + present++;
  53 + }
  54 + }
  55 + mention2score.putIfAbsent(mention, ((present * 2) >= Utils.tokenize(mentionOrth).size()) ? 1.0 : 0.0);
  56 + }
  57 + return mention2score;
  58 + }
  59 +}
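
To make the scoring concrete (summary and mention strings invented; lower-casing mirrors calculateMentionScores):

    Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize("kot pił mleko"));
    // booleanTokenIntersection scores a mention with orth "czarny kot" as 1.0,
    // because its token "kot" occurs in the summary; a mention "pies" would score 0.0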
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/PrepareTrainingData.java 0 → 100644
  1 +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/PrepareTrainingData.java
  1 +package pl.waw.ipipan.zil.summ.nicolas.mention;
  2 +
  3 +import com.google.common.base.Charsets;
  4 +import com.google.common.collect.Maps;
  5 +import com.google.common.io.Files;
  6 +import org.apache.logging.log4j.LogManager;
  7 +import org.apache.logging.log4j.Logger;
  8 +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
  9 +import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  10 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
  11 +import pl.waw.ipipan.zil.summ.nicolas.Utils;
  12 +import weka.core.Instance;
  13 +import weka.core.Instances;
  14 +import weka.core.converters.ArffSaver;
  15 +
  16 +import java.io.File;
  17 +import java.io.IOException;
  18 +import java.util.Map;
  19 +
  20 +
  21 +public class PrepareTrainingData {
  22 +
  23 + private static final Logger LOG = LogManager.getLogger(PrepareTrainingData.class);
  24 +
  25 + public static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev";
  26 + public static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev";
  27 +
  28 + public static void main(String[] args) throws IOException {
  29 +
  30 + Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(PREPROCESSED_FULL_TEXTS_DIR_PATH);
  31 + Map<String, String> id2optimalSummary = loadOptimalSummaries();
  32 +
  33 + MentionScorer mentionScorer = new MentionScorer();
  34 + MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor();
  35 +
  36 + Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList());
  37 +
  38 + int i = 1;
  39 + for (String textId : id2preprocessedText.keySet()) {
  40 + LOG.info(i++ + "/" + id2preprocessedText.size());
  41 +
  42 + TText preprocessedText = id2preprocessedText.get(textId);
  43 + String optimalSummary = id2optimalSummary.get(textId);
  44 + if (optimalSummary == null)
  45 + continue;
  46 + Map<TMention, Double> mention2score = mentionScorer.calculateMentionScores(optimalSummary, preprocessedText);
  47 +
  48 + Map<TMention, Instance> mention2instance = Utils.extractInstancesFromMentions(preprocessedText, featureExtractor);
  49 + for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) {
  50 + TMention mention = entry.getKey();
  51 + Instance instance = entry.getValue();
  52 + instance.setDataset(instances);
  53 + instance.setClassValue(mention2score.get(mention));
  54 + instances.add(instance);
  55 + }
  56 + }
  57 + saveInstancesToFile(instances);
  58 + }
  59 +
  60 + private static void saveInstancesToFile(Instances instances) throws IOException {
  61 + ArffSaver saver = new ArffSaver();
  62 + saver.setInstances(instances);
  63 + saver.setFile(new File(Constants.MENTIONS_DATASET_PATH));
  64 + saver.writeBatch();
  65 + }
  66 +
  67 + private static Map<String, String> loadOptimalSummaries() throws IOException {
  68 + Map<String, String> id2optimalSummary = Maps.newHashMap();
  69 + for (File optimalSummaryFile : new File(OPTIMAL_SUMMARIES_DIR_PATH).listFiles()) {
  70 + String optimalSummary = Files.toString(optimalSummaryFile, Charsets.UTF_8);
  71 + id2optimalSummary.put(optimalSummaryFile.getName().split("_")[0], optimalSummary);
  72 + }
  73 + LOG.info(id2optimalSummary.size() + " optimal summaries found.");
  74 + return id2optimalSummary;
  75 + }
  76 +
  77 +
  78 +}
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/TrainModel.java 0 → 100644
  1 +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/TrainModel.java
  1 +package pl.waw.ipipan.zil.summ.nicolas.mention;
  2 +
  3 +import org.apache.commons.lang3.time.StopWatch;
  4 +import org.slf4j.Logger;
  5 +import org.slf4j.LoggerFactory;
  6 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
  7 +import weka.classifiers.Classifier;
  8 +import weka.core.Instances;
  9 +import weka.core.converters.ArffLoader;
  10 +
  11 +import java.io.File;
  12 +import java.io.FileOutputStream;
  13 +import java.io.ObjectOutputStream;
  14 +
  15 +
  16 +public class TrainModel {
  17 + private static final Logger LOG = LoggerFactory.getLogger(TrainModel.class);
  18 +
  19 + public static void main(String[] args) throws Exception {
  20 +
  21 + ArffLoader loader = new ArffLoader();
  22 + loader.setFile(new File(Constants.MENTIONS_DATASET_PATH));
  23 + Instances instances = loader.getDataSet();
  24 + instances.setClassIndex(0);
  25 + LOG.info(instances.size() + " instances loaded.");
  26 + LOG.info(instances.numAttributes() + " attributes for each instance.");
  27 +
  28 + StopWatch watch = new StopWatch();
  29 + watch.start();
  30 +
  31 + Classifier classifier = Constants.getClassifier();
  32 +
  33 + LOG.info("Building classifier...");
  34 + classifier.buildClassifier(instances);
  35 + LOG.info("...done.");
  36 +
  37 + try (ObjectOutputStream oos = new ObjectOutputStream(
  38 + new FileOutputStream(Constants.MENTIONS_MODEL_PATH))) {
  39 + oos.writeObject(classifier);
  40 + }
  41 +
  42 + watch.stop();
  43 + LOG.info("Elapsed time: " + watch);
  44 +
  45 + LOG.info(classifier.toString());
  46 + }
  47 +}
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Crossvalidate.java 0 → 100644
  1 +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Crossvalidate.java
  1 +package pl.waw.ipipan.zil.summ.nicolas.mention.test;
  2 +
  3 +import org.apache.commons.lang3.time.StopWatch;
  4 +import org.slf4j.Logger;
  5 +import org.slf4j.LoggerFactory;
  6 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
  7 +import weka.classifiers.Classifier;
  8 +import weka.classifiers.evaluation.Evaluation;
  9 +import weka.core.Instances;
  10 +import weka.core.converters.ArffLoader;
  11 +
  12 +import java.io.File;
  13 +import java.util.Random;
  14 +
  15 +
  16 +public class Crossvalidate {
  17 +
  18 + private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class);
  19 +
  20 + public static void main(String[] args) throws Exception {
  21 +
  22 + ArffLoader loader = new ArffLoader();
  23 + loader.setFile(new File(Constants.MENTIONS_DATASET_PATH));
  24 + Instances instances = loader.getDataSet();
  25 + instances.setClassIndex(0);
  26 + LOG.info(instances.size() + " instances loaded.");
  27 + LOG.info(instances.numAttributes() + " attributes for each instance.");
  28 +
  31 +
  32 + StopWatch watch = new StopWatch();
  33 + watch.start();
  34 +
  35 + Classifier tree = Constants.getClassifier();
  36 +
  37 + Evaluation eval = new Evaluation(instances);
  38 + eval.crossValidateModel(tree, instances, 10, new Random(1));
  39 + LOG.info(eval.toSummaryString());
  40 +
  41 + watch.stop();
  42 + LOG.info("Elapsed time: " + watch);
  43 + }
  44 +}
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Validate.java 0 → 100644
  1 +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Validate.java
  1 +package pl.waw.ipipan.zil.summ.nicolas.mention.test;
  2 +
  3 +import org.apache.commons.lang3.time.StopWatch;
  4 +import org.slf4j.Logger;
  5 +import org.slf4j.LoggerFactory;
  6 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
  7 +import weka.classifiers.Classifier;
  8 +import weka.classifiers.evaluation.Evaluation;
  9 +import weka.core.Instances;
  10 +import weka.core.converters.ArffLoader;
  11 +
  12 +import java.io.File;
  13 +import java.io.FileInputStream;
  14 +import java.io.IOException;
  15 +import java.io.ObjectInputStream;
  16 +
  17 +/**
  18 + * Created by me2 on 05.04.16.
  19 + */
  20 +public class Validate {
  21 + private static final Logger LOG = LoggerFactory.getLogger(Validate.class);
  22 +
  23 + public static void main(String[] args) throws Exception {
  24 +
  25 + ArffLoader loader = new ArffLoader();
  26 + loader.setFile(new File(Constants.MENTIONS_DATASET_PATH));
  27 + Instances instances = loader.getDataSet();
  28 + instances.setClassIndex(0);
  29 + LOG.info(instances.size() + " instances loaded.");
  30 + LOG.info(instances.numAttributes() + " attributes for each instance.");
  31 +
  32 + Classifier classifier = loadClassifier();
  33 +
  34 + StopWatch watch = new StopWatch();
  35 + watch.start();
  36 +
  37 + Evaluation eval = new Evaluation(instances);
  38 + eval.evaluateModel(classifier, instances);
  39 +
  40 + LOG.info(eval.toSummaryString());
  41 +
  42 + watch.stop();
  43 + LOG.info("Elapsed time: " + watch);
  44 + }
  45 +
  46 + private static Classifier loadClassifier() throws IOException, ClassNotFoundException {
  47 + LOG.info("Loading classifier...");
  48 + try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(Constants.MENTIONS_MODEL_PATH))) {
  49 + Classifier classifier = (Classifier) ois.readObject();
  50 + LOG.info("Done. " + classifier.toString());
  51 + return classifier;
  52 + }
  53 + }
  54 +}
... ...
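Note that Validate evaluates the model on the same ARFF file it was trained on, so the numbers describe training fit, not generalization. A sketch of evaluating on a held-out set instead; mentions_test.arff is a hypothetical file name, not part of this commit:

    ArffLoader testLoader = new ArffLoader();
    testLoader.setFile(new File("mentions_test.arff")); // hypothetical held-out set
    Instances testInstances = testLoader.getDataSet();
    testInstances.setClassIndex(0);
    Evaluation eval = new Evaluation(instances); // priors taken from the training set
    eval.evaluateModel(classifier, testInstances);
    LOG.info(eval.toSummaryString());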
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/PrepareTrainingData.java 0 → 100644
  1 +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/PrepareTrainingData.java
  1 +package pl.waw.ipipan.zil.summ.nicolas.sentence;
  2 +
  3 +import com.google.common.base.Charsets;
  4 +import com.google.common.collect.Maps;
  5 +import com.google.common.io.Files;
  6 +import org.apache.logging.log4j.LogManager;
  7 +import org.apache.logging.log4j.Logger;
  8 +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
  9 +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
  10 +import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  11 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
  12 +import pl.waw.ipipan.zil.summ.nicolas.Utils;
  13 +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
  14 +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel;
  15 +import weka.classifiers.Classifier;
  16 +import weka.core.Instance;
  17 +import weka.core.Instances;
  18 +import weka.core.converters.ArffSaver;
  19 +
  20 +import java.io.File;
  21 +import java.io.IOException;
  22 +import java.util.Map;
  23 +import java.util.Set;
  24 +
  25 +
  26 +public class PrepareTrainingData {
  27 +
  28 + private static final Logger LOG = LogManager.getLogger(PrepareTrainingData.class);
  29 +
  30 + private static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev";
  31 + private static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev";
  32 +
  33 + public static void main(String[] args) throws Exception {
  34 +
  35 + Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(PREPROCESSED_FULL_TEXTS_DIR_PATH);
  36 + Map<String, String> id2optimalSummary = loadOptimalSummaries();
  37 +
  38 + SentenceScorer sentenceScorer = new SentenceScorer();
  39 + SentenceFeatureExtractor featureExtractor = new SentenceFeatureExtractor();
  40 +
  41 + Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList());
  42 +
  43 + Classifier classifier = Utils.loadClassifier(Constants.MENTIONS_MODEL_PATH);
  44 + MentionFeatureExtractor mentionFeatureExtractor = new MentionFeatureExtractor();
  45 +
  46 + int i = 1;
  47 + for (String textId : id2preprocessedText.keySet()) {
  48 + LOG.info(i++ + "/" + id2preprocessedText.size());
  49 +
  50 + TText preprocessedText = id2preprocessedText.get(textId);
  51 + String optimalSummary = id2optimalSummary.get(textId);
  52 + if (optimalSummary == null)
  53 + continue;
  54 + Map<TSentence, Double> sentence2score = sentenceScorer.calculateSentenceScores(optimalSummary, preprocessedText);
  55 +
  56 + Set<TMention> goodMentions
  57 + = MentionModel.detectGoodMentions(classifier, mentionFeatureExtractor, preprocessedText);
  58 +            // Alternative: use gold-standard mentions instead of classifier predictions:
  59 +            // Set<TMention> goodMentions = Utils.loadGoldGoodMentions(textId, preprocessedText, true);
  60 +
  61 + Map<TSentence, Instance> sentence2instance = Utils.extractInstancesFromSentences(preprocessedText, featureExtractor, goodMentions);
  62 + for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) {
  63 + TSentence sentence = entry.getKey();
  64 + Instance instance = entry.getValue();
  65 + instance.setDataset(instances);
  66 + instance.setClassValue(sentence2score.get(sentence));
  67 + instances.add(instance);
  68 + }
  69 + }
  70 + saveInstancesToFile(instances);
  71 + }
  72 +
  73 + private static void saveInstancesToFile(Instances instances) throws IOException {
  74 + ArffSaver saver = new ArffSaver();
  75 + saver.setInstances(instances);
  76 + saver.setFile(new File(Constants.SENTENCES_DATASET_PATH));
  77 + saver.writeBatch();
  78 + }
  79 +
  80 + private static Map<String, String> loadOptimalSummaries() throws IOException {
  81 + Map<String, String> id2optimalSummary = Maps.newHashMap();
  82 + for (File optimalSummaryFile : new File(OPTIMAL_SUMMARIES_DIR_PATH).listFiles()) {
  83 + String optimalSummary = Files.toString(optimalSummaryFile, Charsets.UTF_8);
  84 + id2optimalSummary.put(optimalSummaryFile.getName().split("_")[0], optimalSummary);
  85 + }
  86 + LOG.info(id2optimalSummary.size() + " optimal summaries found.");
  87 + return id2optimalSummary;
  88 + }
  89 +
  90 +
  91 +}
... ...
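PrepareTrainingData turns each dev text into regression training rows: sentences are scored against a hand-made optimal summary, features are extracted using mentions predicted by the mention model, and the result is saved as an ARFF file. One caveat: File.listFiles() returns null when the path is not a directory, so loadOptimalSummaries can fail with a NullPointerException on a misconfigured path; a defensive sketch:

    File dir = new File(OPTIMAL_SUMMARIES_DIR_PATH);
    File[] summaryFiles = dir.listFiles();
    if (summaryFiles == null) {
        throw new IOException("Cannot list directory: " + dir.getAbsolutePath());
    }
    for (File optimalSummaryFile : summaryFiles) {
        String optimalSummary = Files.toString(optimalSummaryFile, Charsets.UTF_8);
        id2optimalSummary.put(optimalSummaryFile.getName().split("_")[0], optimalSummary);
    }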
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java 0 → 100644
  1 +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java
  1 +package pl.waw.ipipan.zil.summ.nicolas.sentence;
  2 +
  3 +import com.google.common.collect.Maps;
  4 +import pl.waw.ipipan.zil.multiservice.thrift.types.*;
  5 +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor;
  6 +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
  7 +import weka.core.Attribute;
  8 +
  9 +import java.util.List;
  10 +import java.util.Map;
  11 +import java.util.Set;
  12 +import java.util.stream.Collectors;
  13 +
  14 +public class SentenceFeatureExtractor extends FeatureExtractor {
  15 +
  16 + public SentenceFeatureExtractor() {
  17 +
  18 + addNumericAttributeNormalized("sent_mention_cluster_count");
  19 + addNumericAttributeNormalized("sent_good_mention_cluster_count");
  20 + addNumericAttributeNormalized("sent_good_mention_cluster_good_count");
  21 + addNumericAttributeNormalized("sent_cluster_count");
  22 + addNumericAttributeNormalized("sent_good_cluster_count");
  23 + addNumericAttributeNormalized("sent_mention_count");
  24 + addNumericAttributeNormalized("sent_good_mention_count");
  25 +
  26 + addNumericAttributeNormalized("sent_token_length");
  27 + addNumericAttributeNormalized("sent_idx");
  28 + addNumericAttributeNormalized("sent_idx_in_par");
  29 + addBinaryAttribute("sent_ends_with_dot");
  30 + addBinaryAttribute("sent_ends_with_questionmark");
  31 +
  32 + addNumericAttributeNormalized("par_idx");
  33 + addNumericAttributeNormalized("par_token_count");
  34 + addNumericAttributeNormalized("par_sent_count");
  35 +
  36 + addNumericAttribute("text_token_count");
  37 + addNumericAttribute("text_sent_count");
  38 + addNumericAttribute("text_par_count");
  39 + addNumericAttribute("text_mention_count");
  40 + addNumericAttribute("text_cluster_count");
  41 +        addNumericAttribute("text_char_count");
  42 + addNumericAttribute("score");
  43 + fillSortedAttributes("score");
  44 + }
  45 +
  46 + public Map<TSentence, Map<Attribute, Double>> calculateFeatures(TText preprocessedText, Set<TMention> goodMentions) {
  47 +
  48 + int sentenceIdx = 0;
  49 + int parIdx = 0;
  50 +
  51 + FeatureHelper helper = new FeatureHelper(preprocessedText);
  52 + List<TParagraph> pars = preprocessedText.getParagraphs();
  53 + List<TSentence> sents = pars.stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList());
  54 + List<TToken> tokens = sents.stream().flatMap(s -> s.getTokens().stream()).collect(Collectors.toList());
  55 +
  56 + Map<TSentence, Map<Attribute, Double>> sentence2features = Maps.newLinkedHashMap();
  57 + for (TParagraph paragraph : preprocessedText.getParagraphs()) {
  58 + int sentenceIdxInPar = 0;
  59 + for (TSentence sentence : paragraph.getSentences()) {
  60 + Map<Attribute, Double> feature2value = Maps.newHashMap();
  61 +
  62 + feature2value.put(getAttributeByName("sent_mention_cluster_count"), sentence.getMentions().stream().mapToDouble(helper::getChainLength).sum());
  63 + feature2value.put(getAttributeByName("sent_good_mention_cluster_count"), sentence.getMentions().stream().filter(goodMentions::contains).mapToDouble(helper::getChainLength).sum());
  64 + feature2value.put(getAttributeByName("sent_good_mention_cluster_good_count"), (double) sentence.getMentions().stream().filter(goodMentions::contains).flatMap(m -> helper.getCoreferentMentions(m).stream()).filter(goodMentions::contains).count());
  65 + feature2value.put(getAttributeByName("sent_cluster_count"), (double) sentence.getMentions().stream().map(helper::getMentionCluster).collect(Collectors.toSet()).size());
  66 + feature2value.put(getAttributeByName("sent_good_cluster_count"), (double) sentence.getMentions().stream().filter(goodMentions::contains).map(helper::getMentionCluster).collect(Collectors.toSet()).size());
  67 + feature2value.put(getAttributeByName("sent_mention_count"), (double) sentence.getMentions().size());
  68 + feature2value.put(getAttributeByName("sent_good_mention_count"), (double) sentence.getMentions().stream().filter(goodMentions::contains).count());
  69 +
  70 + feature2value.put(getAttributeByName("sent_token_length"), (double) sentence.getTokens().size());
  71 + feature2value.put(getAttributeByName("sent_idx_in_par"), (double) sentenceIdxInPar);
  72 + feature2value.put(getAttributeByName("sent_idx"), (double) sentenceIdx);
  73 + feature2value.put(getAttributeByName("sent_ends_with_dot"), toBinary(helper.getSentenceLastTokenOrth(sentence).equals(".")));
  74 + feature2value.put(getAttributeByName("sent_ends_with_questionmark"), toBinary(helper.getSentenceLastTokenOrth(sentence).equals("?")));
  75 +
  76 + feature2value.put(getAttributeByName("par_idx"), (double) parIdx);
  77 + feature2value.put(getAttributeByName("par_token_count"), paragraph.getSentences().stream().map(s -> s.getTokens().size()).mapToDouble(s -> s).sum());
  78 + feature2value.put(getAttributeByName("par_sent_count"), (double) paragraph.getSentences().size());
  79 +
  80 + feature2value.put(getAttributeByName("text_char_count"), tokens.stream().mapToDouble(t -> t.getOrth().length()).sum());
  81 + feature2value.put(getAttributeByName("text_token_count"), (double) tokens.size());
  82 + feature2value.put(getAttributeByName("text_sent_count"), (double) sents.size());
  83 + feature2value.put(getAttributeByName("text_par_count"), (double) pars.size());
  84 + feature2value.put(getAttributeByName("text_mention_count"), (double) helper.getMentions().size());
  85 + feature2value.put(getAttributeByName("text_cluster_count"), (double) helper.getClusters().size());
  86 +
  87 + feature2value.put(getAttributeByName("score"), weka.core.Utils.missingValue());
  88 +
  89 +        // every feature name used above must be declared as an attribute in the constructor
  90 + assert (feature2value.size() == getAttributesList().size());
  91 +
  92 + sentence2features.put(sentence, feature2value);
  93 +
  94 + sentenceIdx++;
  95 + sentenceIdxInPar++;
  96 + }
  97 + parIdx++;
  98 + }
  99 + addNormalizedAttributeValues(sentence2features);
  100 +
  101 + return sentence2features;
  102 + }
  103 +}
... ...
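The extractor returns one Attribute-to-value map per sentence; converting such a map into a dataset row is presumably what Utils.extractInstancesFromSentences does. A hedged sketch of that conversion with the standard Weka API (weka.core.DenseInstance):

    // Assumption: roughly how an attribute->value map becomes a Weka instance.
    static Instance toInstance(Map<Attribute, Double> feature2value, Instances dataset) {
        Instance instance = new DenseInstance(dataset.numAttributes());
        instance.setDataset(dataset);
        for (Map.Entry<Attribute, Double> e : feature2value.entrySet()) {
            instance.setValue(e.getKey(), e.getValue());
        }
        return instance;
    }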
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceScorer.java 0 → 100644
  1 +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceScorer.java
  1 +package pl.waw.ipipan.zil.summ.nicolas.sentence;
  2 +
  3 +import com.google.common.collect.HashMultiset;
  4 +import com.google.common.collect.Maps;
  5 +import com.google.common.collect.Multiset;
  6 +import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph;
  7 +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
  8 +import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  9 +import pl.waw.ipipan.zil.summ.nicolas.Utils;
  10 +
  11 +import java.util.List;
  12 +import java.util.Map;
  13 +
  14 +public class SentenceScorer {
  15 + public Map<TSentence, Double> calculateSentenceScores(String optimalSummary, TText preprocessedText) {
  16 + Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase()));
  17 +
  18 + Map<TSentence, Double> sentence2score = Maps.newHashMap();
  19 + for (TParagraph paragraph : preprocessedText.getParagraphs())
  20 + for (TSentence sentence : paragraph.getSentences()) {
  21 + double score = 0.0;
  22 +
  23 + String orth = Utils.loadSentence2Orth(sentence);
  24 + List<String> tokens = Utils.tokenize(orth);
  25 + for (String token : tokens) {
  26 + score += tokenCounts.contains(token.toLowerCase()) ? 1.0 : 0.0;
  27 + }
  28 + sentence2score.put(sentence, score / tokens.size());
  29 + }
  30 + return sentence2score;
  31 + }
  32 +}
... ...
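The score is simply the fraction of a sentence's tokens that also occur in the optimal summary, case-insensitively. A self-contained illustration:

    Multiset<String> summaryTokens =
            HashMultiset.create(Arrays.asList("rząd", "przyjął", "ustawę"));
    List<String> sentenceTokens = Arrays.asList("Rząd", "odrzucił", "ustawę");
    double score = 0.0;
    for (String token : sentenceTokens) {
        score += summaryTokens.contains(token.toLowerCase()) ? 1.0 : 0.0;
    }
    // 2 of 3 tokens occur in the summary, so this prints 0.666...
    System.out.println(score / sentenceTokens.size());

Note that Multiset.contains ignores multiplicity, so repeated summary tokens do not weight the score.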
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/TrainModel.java 0 → 100644
  1 +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/TrainModel.java
  1 +package pl.waw.ipipan.zil.summ.nicolas.sentence;
  2 +
  3 +import org.apache.commons.lang3.time.StopWatch;
  4 +import org.slf4j.Logger;
  5 +import org.slf4j.LoggerFactory;
  6 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
  7 +import weka.classifiers.Classifier;
  8 +import weka.core.Instances;
  9 +import weka.core.converters.ArffLoader;
  10 +
  11 +import java.io.File;
  12 +import java.io.FileOutputStream;
  13 +import java.io.ObjectOutputStream;
  14 +
  15 +
  16 +public class TrainModel {
  17 + private static final Logger LOG = LoggerFactory.getLogger(TrainModel.class);
  18 +
  19 + public static void main(String[] args) throws Exception {
  20 +
  21 + ArffLoader loader = new ArffLoader();
  22 + loader.setFile(new File(Constants.SENTENCES_DATASET_PATH));
  23 + Instances instances = loader.getDataSet();
  24 + instances.setClassIndex(0);
  25 + LOG.info(instances.size() + " instances loaded.");
  26 + LOG.info(instances.numAttributes() + " attributes for each instance.");
  27 +
  28 + StopWatch watch = new StopWatch();
  29 + watch.start();
  30 +
  31 + Classifier classifier = Constants.getSentencesClassifier();
  32 +
  33 + LOG.info("Building classifier...");
  34 + classifier.buildClassifier(instances);
  35 + LOG.info("...done.");
  36 +
  37 + try (ObjectOutputStream oos = new ObjectOutputStream(
  38 + new FileOutputStream(Constants.SENTENCES_MODEL_PATH))) {
  39 + oos.writeObject(classifier);
  40 + }
  41 +
  42 + watch.stop();
  43 + LOG.info("Elapsed time: " + watch);
  44 +
  45 + LOG.info(classifier.toString());
  46 + }
  47 +}
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/test/Crossvalidate.java 0 → 100644
  1 +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/test/Crossvalidate.java
  1 +package pl.waw.ipipan.zil.summ.nicolas.sentence.test;
  2 +
  3 +import org.apache.commons.lang3.time.StopWatch;
  4 +import org.slf4j.Logger;
  5 +import org.slf4j.LoggerFactory;
  6 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
  7 +import weka.classifiers.Classifier;
  8 +import weka.classifiers.evaluation.Evaluation;
  9 +import weka.core.Instances;
  10 +import weka.core.converters.ArffLoader;
  11 +
  12 +import java.io.File;
  13 +import java.util.Random;
  14 +
  15 +
  16 +public class Crossvalidate {
  17 +
  18 + private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class);
  19 +
  20 + public static void main(String[] args) throws Exception {
  21 +
  22 + ArffLoader loader = new ArffLoader();
  23 + loader.setFile(new File(Constants.SENTENCES_DATASET_PATH));
  24 + Instances instances = loader.getDataSet();
  25 + instances.setClassIndex(0);
  26 + LOG.info(instances.size() + " instances loaded.");
  27 + LOG.info(instances.numAttributes() + " attributes for each instance.");
  28 +
  29 + StopWatch watch = new StopWatch();
  30 + watch.start();
  31 +
  32 + Classifier tree = Constants.getSentencesClassifier();
  33 +
  34 + Evaluation eval = new Evaluation(instances);
  35 + eval.crossValidateModel(tree, instances, 10, new Random(1));
  36 + LOG.info(eval.toSummaryString());
  37 +
  38 + watch.stop();
  39 + LOG.info("Elapsed time: " + watch);
  40 + }
  41 +}
... ...
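Unlike the mention task, the sentence class attribute is a numeric score, so toSummaryString() here reports regression metrics (correlation coefficient, mean absolute error, RMSE) instead of accuracy. Individual metrics are also available:

    LOG.info("Correlation: " + eval.correlationCoefficient());
    LOG.info("MAE: " + eval.meanAbsoluteError());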
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/Zero.java 0 → 100644
  1 +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/Zero.java
  1 +package pl.waw.ipipan.zil.summ.nicolas.zero;
  2 +
  3 +import com.google.common.collect.Lists;
  4 +import com.google.common.collect.Maps;
  5 +import com.google.common.collect.Sets;
  6 +import org.apache.commons.csv.CSVFormat;
  7 +import org.apache.commons.csv.CSVPrinter;
  8 +import org.apache.commons.csv.QuoteMode;
  9 +import org.apache.commons.io.IOUtils;
  10 +import pl.waw.ipipan.zil.multiservice.thrift.types.*;
  11 +import pl.waw.ipipan.zil.summ.nicolas.Utils;
  12 +
  13 +import java.io.File;
  14 +import java.io.FileReader;
  15 +import java.io.FileWriter;
  16 +import java.io.IOException;
  17 +import java.util.Arrays;
  18 +import java.util.List;
  19 +import java.util.Map;
  20 +import java.util.Set;
  21 +
  22 +/**
  23 + * Finds nominative mentions whose coreference cluster also has a nominative mention in the preceding sentence, and dumps these candidate zero-subject contexts to a TSV file.
  24 + */
  25 +public class Zero {
  26 +
  27 + private static final String IDS_PATH = "summaries_dev";
  28 + private static final String THRIFTED_PATH = "src/main/resources/preprocessed_full_texts/dev/";
  29 +
  30 + public static void main(String[] args) throws IOException {
  31 +
  32 + Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(THRIFTED_PATH);
  33 + Map<String, List<String>> id2sentIds = loadSentenceIds(IDS_PATH);
  34 +
  35 + int mentionCount = 0;
  36 + int mentionInNom = 0;
  37 + int mentionInNomSequential = 0;
  38 +
  39 + List<List<Object>> rows = Lists.newArrayList();
  40 + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {
  41 + String textId = entry.getKey();
  42 +
  43 +
  44 + TText text = entry.getValue();
  45 + List<String> sentenceIds = id2sentIds.get(textId);
  46 +
  47 +
  48 + Map<String, Set<String>> mentionId2Cluster = Maps.newHashMap();
  49 + for (TCoreference coreference : text.getCoreferences()) {
  50 + for (String mentionId : coreference.getMentionIds()) {
  51 + mentionId2Cluster.put(mentionId, Sets.newHashSet(coreference.getMentionIds()));
  52 + }
  53 + }
  54 +
  55 + Set<String> prevSentenceNominativeMentionIds = Sets.newHashSet();
  56 + TSentence prevSentence = null;
  57 + for (TParagraph p : text.getParagraphs()) {
  58 + Map<TMention, String> tMentionStringMap = Utils.loadMention2Orth(p.getSentences());
  59 +
  60 + for (TSentence sentence : p.getSentences()) {
  61 + if (!sentenceIds.contains(sentence.getId()))
  62 + continue;
  63 + Set<String> currentSentenceNominativeMentionIds = Sets.newHashSet();
  64 +
  65 + Map<String, TToken> tokenId2Token = Maps.newHashMap();
  66 + for (TToken t : sentence.getTokens())
  67 + tokenId2Token.put(t.getId(), t);
  68 +
  69 + for (TMention mention : sentence.getMentions()) {
  70 + mentionCount++;
  71 +
  72 + for (String tokenId : mention.getHeadIds()) {
  73 + TInterpretation interp = tokenId2Token.get(tokenId).getChosenInterpretation();
  74 + if (isInNominative(interp)) {
  75 + mentionInNom++;
  76 +
  77 + currentSentenceNominativeMentionIds.add(mention.getId());
  78 + if (mentionId2Cluster.get(mention.getId()).stream().anyMatch(prevSentenceNominativeMentionIds::contains)) {
  79 + mentionInNomSequential++;
  80 + System.out.println(tMentionStringMap.get(mention)
  81 + + "\n\t" + Utils.loadSentence2Orth(prevSentence)
  82 + + "\n\t" + Utils.loadSentence2Orth(sentence));
  83 +
  84 + List<Object> row = Lists.newArrayList();
  85 + row.add("C");
  86 + row.add(textId);
  87 + row.add(tMentionStringMap.get(mention));
  88 + row.add(Utils.loadSentence2Orth(prevSentence));
  89 + row.add(Utils.loadSentence2Orth(sentence));
  90 + rows.add(row);
  91 + }
  92 + break;
  93 + }
  94 + }
  95 + }
  96 +
  97 + prevSentence = sentence;
  98 + prevSentenceNominativeMentionIds = currentSentenceNominativeMentionIds;
  99 + }
  100 + }
  101 + }
  102 +
  103 + System.out.println(mentionCount + " mentions");
  104 + System.out.println(mentionInNom + " mention in nom");
  105 + System.out.println(mentionInNomSequential + " mention in nom with previous in nom");
  106 +
  107 + try (CSVPrinter csvPrinter = new CSVPrinter(new FileWriter("zeros.tsv"), CSVFormat.DEFAULT.withDelimiter('\t').withEscape('\\').withQuoteMode(QuoteMode.NONE).withQuote('"'))) {
  108 + for (List<Object> row : rows) {
  109 + csvPrinter.printRecord(row);
  110 + }
  111 + }
  112 +
  113 + }
  114 +
  115 + private static boolean isInNominative(TInterpretation interp) {
  116 + return interp.getCtag().equals("subst") && Arrays.stream(interp.getMsd().split(":")).anyMatch(t -> t.equals("nom"));
  117 + }
  118 +
  119 + private static Map<String, List<String>> loadSentenceIds(String idsPath) throws IOException {
  120 + Map<String, List<String>> result = Maps.newHashMap();
  121 + for (File f : new File(idsPath).listFiles()) {
  122 + String id = f.getName().split("_")[0];
  123 +            try (FileReader reader = new FileReader(f)) {
  124 +                result.put(id, IOUtils.readLines(reader));
  125 +            }
  126 +        }
  127 +        return result;
  128 +    }
  129 +}
... ...
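The candidate contexts are written as a TSV file via commons-csv with quoting effectively disabled. A minimal standalone sketch of the same printer configuration (commons-csv 1.3 API):

    CSVFormat format = CSVFormat.DEFAULT.withDelimiter('\t')
            .withEscape('\\').withQuoteMode(QuoteMode.NONE).withQuote('"');
    try (CSVPrinter printer = new CSVPrinter(new FileWriter("example.tsv"), format)) {
        // Columns mirror the rows built above: label, text id, mention, previous sentence, sentence.
        printer.printRecord("C", "textId", "mention", "previous sentence", "current sentence");
    }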
nicolas-model/pom.xml 0 → 100644
  1 +++ a/nicolas-model/pom.xml
  1 +<?xml version="1.0" encoding="UTF-8"?>
  2 +<project xmlns="http://maven.apache.org/POM/4.0.0"
  3 + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  4 + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  5 + <modelVersion>4.0.0</modelVersion>
  6 + <parent>
  7 + <artifactId>nicolas-container</artifactId>
  8 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  9 + <version>1.0-SNAPSHOT</version>
  10 + </parent>
  11 +
  12 + <artifactId>nicolas-model</artifactId>
  13 +
  14 +</project>
0 15 \ No newline at end of file
... ...
nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/frequent_bases.txt 0 → 100644
  1 +++ a/nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/frequent_bases.txt
  1 +on
  2 +to
  3 +co
  4 +rok
  5 +być
  6 +wszystko
  7 +polska
  8 +człowiek
  9 +sobie
  10 +raz
  11 +my
  12 +mieć
  13 +czas
  14 +państwo
  15 +praca
  16 +osoba
  17 +sprawa
  18 +ja
  19 +kraj
  20 +pieniądz
  21 +nikt
  22 +kto
  23 +przykład
  24 +nic
  25 +koniec
  26 +rząd
  27 +prawo
  28 +życie
  29 +miejsce
  30 +móc
  31 +fot
  32 +problem
  33 +władza
  34 +miesiąc
  35 +rzecz
  36 +stan
  37 +świat
  38 +wszyscy
  39 +mówić
  40 +rozmowa
  41 +coś
  42 +sytuacja
  43 +powód
  44 +początek
  45 +wiedzieć
  46 +dzień
  47 +uwaga
  48 +strona
  49 +udział
  50 +in
  51 +musieć
  52 +polityk
  53 +ktoś
  54 +ogół
  55 +polityka
  56 +chcieć
  57 +walka
  58 +zmiana
  59 +decyzja
  60 +ciąg
  61 +m .
  62 +pan
  63 +szansa
  64 +polak
  65 +przypadek
  66 +większość
  67 +pytanie
  68 +wzgląd
  69 +warszawa
  70 +proca
  71 +pomoc
  72 +prezydent
  73 +społeczeństwo
  74 +wynik
  75 +dziecko
  76 +prawda
  77 +związek
  78 +gospodarka
  79 +część
  80 +wojna
  81 +tydzień
  82 +granica
  83 +głos
  84 +przyszłość
  85 +autor
  86 +wybory
  87 +rynek
  88 +cel
  89 +ustawa
  90 +uważać
  91 +ten rok
  92 +droga
  93 +dom
  94 +rys
  95 +myśleć
  96 +firma
  97 +zasada
  98 +fakt
  99 +kolej
  100 +nadzieja
  101 +dolar
  102 +wraz
  103 +miasto
  104 +rozwój
  105 +ten sposób
  106 +europa
  107 +temat
  108 +siła
  109 +rodzina
  110 +minister
  111 +historia
  112 +wpływ
  113 +współpraca
  114 +środek
  115 +informacja
  116 +procent
  117 +wniosek
  118 +unia europejski
  119 +niemcy
  120 +podstawa
  121 +reforma
  122 +partia
  123 +interes
  124 +ten sprawa
  125 +kandydat
  126 +sukces
  127 +sposób
  128 +wątpliwość
  129 +złoty
  130 +sld
  131 +pracownik
  132 +stanowisko
  133 +dyskusja
  134 +telewizja
  135 +pewność
  136 +odpowiedź
  137 +rzeczywistość
  138 +program
  139 +cena
  140 +działanie
  141 +system
  142 +unia
  143 +ręka
  144 +odpowiedzialność
  145 +środowisko
  146 +solidarność
  147 +demokracja
  148 +maić
  149 +ramy
  150 +badanie
  151 +media
  152 +wartość
  153 +wybór
  154 +głowa
  155 +zostać
  156 +usa
  157 +pracować
  158 +porozumienie
  159 +widzieć
  160 +zdanie
  161 +akcja
  162 +wolność
  163 +spotkanie
  164 +przeszłość
  165 +stosunek
  166 +okazja
  167 +prowadzić
  168 +zachód
  169 +kobieta
  170 +obywatel
  171 +sąd
  172 +ubiegły rok
  173 +dziennikarz
  174 +kultura
  175 +grupa
  176 +opinia publiczny
  177 +obrona
  178 +bezpieczeństwo
  179 +opinia
  180 +rzeczpospolita
  181 +dokument
  182 +racja
  183 +szkoła
  184 +góra
  185 +warunek
  186 +organizacja
  187 +oko
  188 +godzina
  189 +tysiąc
  190 +ten czas
  191 +możliwość
  192 +błąd
  193 +ziemia
  194 +parlament
  195 +ten pora
  196 +chwila
  197 +naród
  198 +konflikt
  199 +działalność
  200 +sejm
  201 +powrót
  202 +premier
  203 +działać
  204 +rada
  205 +zdrowie
  206 +wiek
  207 +dodatek
  208 +poziom
  209 +widzenie
  210 +żyć
  211 +powiedzieć
  212 +inwestycja
  213 +rosja
  214 +niemiec
  215 +samochód
  216 +skutek
  217 +punkt
  218 +rola
  219 +mieszkaniec
  220 +wyborca
  221 +koszt
  222 +budżet
  223 +szef
  224 +styczeń
  225 +instytucja
  226 +pełnia
  227 +ulica
  228 +aws
  229 +ochrona
  230 +dostęp
  231 +zagrożenie
  232 +zgoda
  233 +ue
  234 +" rzeczpospolita "
  235 +liczba
  236 +wieś
  237 +połowa
0 238 \ No newline at end of file
... ...
nicolas-train/pom.xml 0 → 100644
  1 +++ a/nicolas-train/pom.xml
  1 +<?xml version="1.0" encoding="UTF-8"?>
  2 +<project xmlns="http://maven.apache.org/POM/4.0.0"
  3 + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  4 + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  5 + <modelVersion>4.0.0</modelVersion>
  6 + <parent>
  7 + <artifactId>nicolas-container</artifactId>
  8 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  9 + <version>1.0-SNAPSHOT</version>
  10 + </parent>
  11 +
  12 + <artifactId>nicolas-train</artifactId>
  13 +
  14 +</project>
0 15 \ No newline at end of file
... ...
nicolas-zero/pom.xml 0 → 100644
  1 +++ a/nicolas-zero/pom.xml
  1 +<?xml version="1.0" encoding="UTF-8"?>
  2 +<project xmlns="http://maven.apache.org/POM/4.0.0"
  3 + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  4 + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  5 + <modelVersion>4.0.0</modelVersion>
  6 + <parent>
  7 + <artifactId>nicolas-container</artifactId>
  8 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  9 + <version>1.0-SNAPSHOT</version>
  10 + </parent>
  11 +
  12 + <artifactId>nicolas-zero</artifactId>
  13 +
  14 +</project>
0 15 \ No newline at end of file
... ...
pom.xml 0 → 100644
  1 +++ a/pom.xml
  1 +<?xml version="1.0" encoding="UTF-8"?>
  2 +<project xmlns="http://maven.apache.org/POM/4.0.0"
  3 + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  4 + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  5 + <modelVersion>4.0.0</modelVersion>
  6 +
  7 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  8 + <artifactId>nicolas-container</artifactId>
  9 + <packaging>pom</packaging>
  10 + <version>1.0-SNAPSHOT</version>
  11 +
  12 + <modules>
  13 + <module>nicolas-core</module>
  14 + <module>nicolas-cli</module>
  15 + <module>nicolas-model</module>
  16 + <module>nicolas-train</module>
  17 + <module>nicolas-zero</module>
  18 + </modules>
  19 +
  20 + <properties>
  21 + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  22 + <java.version.build>1.8</java.version.build>
  23 + </properties>
  24 +
  25 + <prerequisites>
  26 + <maven>3.0.5</maven>
  27 + </prerequisites>
  28 +
  29 + <developers>
  30 + <developer>
  31 + <name>Mateusz Kopeć</name>
  32 + <organization>ICS PAS</organization>
  33 + <email>m.kopec@ipipan.waw.pl</email>
  34 + </developer>
  35 + </developers>
  36 +
  37 + <dependencies>
  38 + <dependency>
  39 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  40 + <artifactId>pscapi</artifactId>
  41 + <version>1.0-SNAPSHOT</version>
  42 + </dependency>
  43 + <dependency>
  44 + <groupId>pl.waw.ipipan.zil.multiservice</groupId>
  45 + <artifactId>utils</artifactId>
  46 + <version>1.0-SNAPSHOT</version>
  47 + </dependency>
  48 +
  49 + <dependency>
  50 + <groupId>org.apache.commons</groupId>
  51 + <artifactId>commons-csv</artifactId>
  52 + <version>1.3</version>
  53 + </dependency>
  54 + <dependency>
  55 + <groupId>com.google.guava</groupId>
  56 + <artifactId>guava</artifactId>
  57 + <version>19.0</version>
  58 + </dependency>
  59 + <dependency>
  60 + <groupId>nz.ac.waikato.cms.weka</groupId>
  61 + <artifactId>weka-dev</artifactId>
  62 + <version>3.9.0</version>
  63 + </dependency>
  64 + <dependency>
  65 + <groupId>org.apache.commons</groupId>
  66 + <artifactId>commons-lang3</artifactId>
  67 + <version>3.4</version>
  68 + </dependency>
  69 + <dependency>
  70 + <groupId>commons-io</groupId>
  71 + <artifactId>commons-io</artifactId>
  72 + <version>2.5</version>
  73 + </dependency>
  74 + </dependencies>
  75 +
  76 +
  77 + <build>
  78 + <plugins>
  79 + <plugin>
  80 + <groupId>org.apache.maven.plugins</groupId>
  81 + <artifactId>maven-compiler-plugin</artifactId>
  82 + <version>3.1</version>
  83 + <configuration>
  84 + <source>${java.version.build}</source>
  85 + <target>${java.version.build}</target>
  86 + </configuration>
  87 + </plugin>
  88 + </plugins>
  89 + </build>
  90 +
  91 + <distributionManagement>
  92 + <repository>
  93 + <id>deployment</id>
  94 + <url>http://maven.nlp.ipipan.waw.pl/content/repositories/releases/</url>
  95 + </repository>
  96 + <snapshotRepository>
  97 + <id>deployment</id>
  98 + <url>http://maven.nlp.ipipan.waw.pl/content/repositories/snapshots/</url>
  99 + </snapshotRepository>
  100 + </distributionManagement>
  101 +</project>
0 102 \ No newline at end of file
... ...
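With this aggregator POM, the whole multi-module project builds from the repository root with a standard Maven invocation (assuming the pscapi and multiservice utils snapshots are resolvable from the local or the listed remote repository):

    mvn clean install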