From c9b1021d986bf2636bfd978885170dedd7d45546 Mon Sep 17 00:00:00 2001 From: Mateusz Kopeć <m.kopec@ipipan.waw.pl> Date: Wed, 12 Apr 2017 23:20:45 +0200 Subject: [PATCH] fix sonar issues --- nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Cli.java | 5 ++++- nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureExtractor.java | 7 +++++-- nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java | 125 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------------------------------------ nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java | 31 +++++++++++++++++++------------ nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java | 12 +++++++----- nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/ResourceUtils.java | 3 ++- nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java | 16 ++++++---------- nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/CorpusHelper.java | 2 +- nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/PathConstants.java | 12 ++++++++---- nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java | 13 +++++-------- nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/MentionScorer.java | 30 +++++++++++++++++------------- nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractGoldSummaries.java | 36 ++++++++++++++++++++---------------- nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractMostFrequentMentions.java | 3 +-- nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java | 16 +++++++++------- nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/TrainAllModels.java | 8 ++------ 15 files changed, 177 insertions(+), 142 deletions(-) diff --git a/nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Cli.java b/nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Cli.java index d3257ea..2eb5527 100644 --- a/nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Cli.java +++ b/nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Cli.java @@ -92,7 +92,10 @@ class Cli { public void validate(String name, String value) { File file = new File(value); try { - file.createNewFile(); + boolean newFile = file.createNewFile(); + if (!newFile) { + LOG.warn("Output file exists and will be overridden."); + } } catch (IOException ex) { throw new ParameterException("Parameter " + name + " should be a valid file path (found " + value + ")", ex); diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureExtractor.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureExtractor.java index 1243c73..4a49879 100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureExtractor.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureExtractor.java @@ -67,11 +67,14 @@ public class FeatureExtractor { } } for (Map<Attribute, Double> entityAttributes : entity2attributes.values()) { - for (Attribute attribute : attribute2max.keySet()) { + for (Map.Entry<Attribute, Double> entry : attribute2max.entrySet()) { + Attribute attribute = entry.getKey(); + Double max = entry.getValue(); + Attribute normalizedAttribute = getAttributeByName(name2attribute.inverse().get(attribute) + "_normalized"); entityAttributes.put(normalizedAttribute, (entityAttributes.get(attribute) - attribute2min.get(attribute)) - / (attribute2max.get(attribute) - attribute2min.get(attribute))); + / (max - attribute2min.get(attribute))); } } } diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java index 0bd02ff..2750afd 100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java @@ -54,49 +54,56 @@ public class FeatureHelper { coref2mentions.put(coref, ments); } - int parIdx = 0; - int sentIdx = 0; - int mentionIdx = 0; + Counters counters = new Counters(); for (TParagraph par : preprocessedText.getParagraphs()) { - Map<TMention, String> m2o = loadMention2Orth(par.getSentences()); - mention2Orth.putAll(m2o); - Map<TMention, String> m2b = loadMention2Base(par.getSentences()); - mention2Base.putAll(m2b); - - int sentIdxInPar = 0; - int mentionIdxInPar = 0; - for (TSentence sent : par.getSentences()) { - - Map<String, TToken> tokenId2token = sent.getTokens().stream().collect(toMap(TToken::getId, Function.identity())); - - Map<String, Set<TNamedEntity>> tokenId2namedEntities = Maps.newHashMap(); - for (TNamedEntity namedEntity : sent.getNames()) { - for (String childId : namedEntity.getChildIds()) { - tokenId2namedEntities.putIfAbsent(childId, Sets.newHashSet()); - tokenId2namedEntities.get(childId).add(namedEntity); - } - } + processParagraph(counters, par); + } + } - int mentionIdxInSent = 0; - for (TMention mention : sent.getMentions()) { - mention2sent.put(mention, sent); - mention2par.put(mention, par); - mention2index.put(mention, mentionIdx++); - mention2firstTokenIndex.put(mention, sent.getTokens().indexOf(tokenId2token.get(mention.getChildIds().iterator().next()))); - mention2indexInSent.put(mention, mentionIdxInSent++); - mention2indexInPar.put(mention, mentionIdxInPar++); - - String firstHeadTokenId = mention.getHeadIds().iterator().next(); - mention2head.put(mention, tokenId2token.get(firstHeadTokenId)); - if (tokenId2namedEntities.containsKey(firstHeadTokenId)) - mentionsInNamedEntities.add(mention); - } - sent2Index.put(sent, sentIdx++); - sent2IndexInPar.put(sent, sentIdxInPar++); + private void processParagraph(Counters counters, TParagraph par) { + Map<TMention, String> m2o = loadMention2Orth(par.getSentences()); + mention2Orth.putAll(m2o); + Map<TMention, String> m2b = loadMention2Base(par.getSentences()); + mention2Base.putAll(m2b); + + int sentIdxInPar = 0; + int mentionIdxInPar = 0; + for (TSentence sent : par.getSentences()) { + + Map<String, TToken> tokenId2token = sent.getTokens().stream().collect(toMap(TToken::getId, Function.identity())); + + Map<String, Set<TNamedEntity>> tokenId2namedEntities = getTokenId2NamedEntities(sent); + + int mentionIdxInSent = 0; + for (TMention mention : sent.getMentions()) { + mention2sent.put(mention, sent); + mention2par.put(mention, par); + mention2index.put(mention, counters.mentionIdx++); + mention2firstTokenIndex.put(mention, sent.getTokens().indexOf(tokenId2token.get(mention.getChildIds().iterator().next()))); + mention2indexInSent.put(mention, mentionIdxInSent++); + mention2indexInPar.put(mention, mentionIdxInPar++); + + String firstHeadTokenId = mention.getHeadIds().iterator().next(); + mention2head.put(mention, tokenId2token.get(firstHeadTokenId)); + if (tokenId2namedEntities.containsKey(firstHeadTokenId)) + mentionsInNamedEntities.add(mention); } + sent2Index.put(sent, counters.sentIdx++); + sent2IndexInPar.put(sent, sentIdxInPar++); + } - par2Index.put(par, parIdx++); + par2Index.put(par, counters.parIdx++); + } + + private Map<String, Set<TNamedEntity>> getTokenId2NamedEntities(TSentence sent) { + Map<String, Set<TNamedEntity>> tokenId2namedEntities = Maps.newHashMap(); + for (TNamedEntity namedEntity : sent.getNames()) { + for (String childId : namedEntity.getChildIds()) { + tokenId2namedEntities.putIfAbsent(childId, Sets.newHashSet()); + tokenId2namedEntities.get(childId).add(namedEntity); + } } + return tokenId2namedEntities; } public List<TMention> getMentions() { @@ -220,31 +227,35 @@ public class FeatureHelper { return mention2sent.get(mention).getTokens().get(idx - 1); } - private static Map<TMention, String> loadMention2Orth(List<TSentence> sents) { + private static Map<TMention, String> loadMention2Orth(List<TSentence> sentences) { Map<TMention, String> mention2orth = Maps.newHashMap(); - for (TSentence s : sents) { - Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity())); + for (TSentence sentence : sentences) { + Map<String, TToken> tokId2tok = sentence.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity())); - for (TMention m : s.getMentions()) { - StringBuilder mentionOrth = new StringBuilder(); - for (String tokId : m.getChildIds()) { - TToken token = tokId2tok.get(tokId); - if (!token.isNoPrecedingSpace()) - mentionOrth.append(" "); - mentionOrth.append(token.getOrth()); - } - mention2orth.put(m, mentionOrth.toString().trim()); + for (TMention mention : sentence.getMentions()) { + mention2orth.put(mention, getMentionOrth(tokId2tok, mention)); } } return mention2orth; } - private static Map<TMention, String> loadMention2Base(List<TSentence> sents) { + private static String getMentionOrth(Map<String, TToken> tokId2tok, TMention m) { + StringBuilder mentionOrth = new StringBuilder(); + for (String tokId : m.getChildIds()) { + TToken token = tokId2tok.get(tokId); + if (!token.isNoPrecedingSpace()) + mentionOrth.append(" "); + mentionOrth.append(token.getOrth()); + } + return mentionOrth.toString().trim(); + } + + private static Map<TMention, String> loadMention2Base(List<TSentence> sentences) { Map<TMention, String> mention2base = Maps.newHashMap(); - for (TSentence s : sents) { - Map<String, String> tokId2base = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, tok -> tok.getChosenInterpretation().getBase())); + for (TSentence sentence : sentences) { + Map<String, String> tokId2base = sentence.getTokens().stream().collect(Collectors.toMap(TToken::getId, tok -> tok.getChosenInterpretation().getBase())); - for (TMention m : s.getMentions()) { + for (TMention m : sentence.getMentions()) { StringBuilder mentionBase = new StringBuilder(); for (String tokId : m.getChildIds()) { mentionBase.append(" "); @@ -255,4 +266,10 @@ public class FeatureHelper { } return mention2base; } + + private class Counters { + int parIdx = 0; + int sentIdx = 0; + int mentionIdx = 0; + } } diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java index 9b2b8b5..7cb379a 100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java @@ -18,6 +18,10 @@ import java.util.stream.Collectors; public class MentionFeatureExtractor extends FeatureExtractor { + private static final String SCORE_ATTRIBUTE_NAME = "score"; + private static final String OTHER_VALUE = "other"; + private static final String NULL_VALUE = "null"; + private final List<String> frequentBases; public MentionFeatureExtractor() throws IOException { @@ -48,10 +52,10 @@ public class MentionFeatureExtractor extends FeatureExtractor { addBinaryAttribute(prefix + "_is_named"); addBinaryAttribute(prefix + "_is_pronoun"); addNominalAttribute(prefix + "_ctag", Constants.POS_TAGS); - addNominalAttribute(prefix + "_person", Lists.newArrayList("other", "null", "pri", "sec", "ter")); - addNominalAttribute(prefix + "_case", Lists.newArrayList("other", "null", "nom", "acc", "dat", "gen", "loc", "inst", "voc")); - addNominalAttribute(prefix + "_number", Lists.newArrayList("other", "null", "sg", "pl")); - addNominalAttribute(prefix + "_gender", Lists.newArrayList("other", "null", "f", "m1", "m2", "m3", "n")); + addNominalAttribute(prefix + "_person", Lists.newArrayList(OTHER_VALUE, NULL_VALUE, "pri", "sec", "ter")); + addNominalAttribute(prefix + "_case", Lists.newArrayList(OTHER_VALUE, NULL_VALUE, "nom", "acc", "dat", "gen", "loc", "inst", "voc")); + addNominalAttribute(prefix + "_number", Lists.newArrayList(OTHER_VALUE, NULL_VALUE, "sg", "pl")); + addNominalAttribute(prefix + "_gender", Lists.newArrayList(OTHER_VALUE, NULL_VALUE, "f", "m1", "m2", "m3", "n")); // relation to other addBinaryAttribute(prefix + "_is_nested"); @@ -76,8 +80,8 @@ public class MentionFeatureExtractor extends FeatureExtractor { } } - addNominalAttribute("score", Lists.newArrayList("bad", "good")); - fillSortedAttributes("score"); + addNominalAttribute(SCORE_ATTRIBUTE_NAME, Lists.newArrayList("bad", "good")); + fillSortedAttributes(SCORE_ATTRIBUTE_NAME); } private String encodeBase(String base) { @@ -143,8 +147,11 @@ public class MentionFeatureExtractor extends FeatureExtractor { attribute2value.put(getAttributeByName(attributePrefix + "_is_nesting"), toBinary(helper.isNesting(mention))); String orth = helper.getMentionOrth(mention); - attribute2value.put(getAttributeByName(attributePrefix + "_capitalized"), toBinary(orth.length() != 0 && orth.substring(0, 1).toUpperCase().equals(orth.substring(0, 1)))); - attribute2value.put(getAttributeByName(attributePrefix + "_all_caps"), toBinary(orth.toUpperCase().equals(orth))); + String firstLetter = orth.substring(0, 1); + String firstLetterUpperCased = firstLetter.toUpperCase(); + attribute2value.put(getAttributeByName(attributePrefix + "_capitalized"), toBinary(orth.length() != 0 && firstLetterUpperCased.equals(firstLetter))); + String upperCased = orth.toUpperCase(); + attribute2value.put(getAttributeByName(attributePrefix + "_all_caps"), toBinary(upperCased.equals(orth))); attribute2value.put(getAttributeByName(attributePrefix + "_char_count"), (double) orth.length()); // par characteristics @@ -159,8 +166,8 @@ public class MentionFeatureExtractor extends FeatureExtractor { attribute2value.put(getAttributeByName(attributePrefix + "_sent_mention_count"), (double) mentionSentence.getMentions().size()); attribute2value.put(getAttributeByName(attributePrefix + "_sent_idx"), (double) helper.getSentIndex(mentionSentence)); attribute2value.put(getAttributeByName(attributePrefix + "_sent_idx_in_par"), (double) helper.getSentIndexInPar(mentionSentence)); - attribute2value.put(getAttributeByName(attributePrefix + "_sent_ends_with_dot"), toBinary(helper.getSentenceLastTokenOrth(mentionSentence).equals("."))); - attribute2value.put(getAttributeByName(attributePrefix + "_sent_ends_with_questionmark"), toBinary(helper.getSentenceLastTokenOrth(mentionSentence).equals("?"))); + attribute2value.put(getAttributeByName(attributePrefix + "_sent_ends_with_dot"), toBinary(".".equals(helper.getSentenceLastTokenOrth(mentionSentence)))); + attribute2value.put(getAttributeByName(attributePrefix + "_sent_ends_with_questionmark"), toBinary("?".equals(helper.getSentenceLastTokenOrth(mentionSentence)))); // frequent bases String mentionBase = helper.getMentionBase(mention); @@ -174,14 +181,14 @@ public class MentionFeatureExtractor extends FeatureExtractor { int index = att.indexOfValue(value); if (index == -1) LOG.warn("{} not found for attribute {}", value, attributeName); - attribute2value.put(att, (double) (index == -1 ? att.indexOfValue("other") : index)); + attribute2value.put(att, (double) (index == -1 ? att.indexOfValue(OTHER_VALUE) : index)); } private void addScoreFeature(Map<TMention, Map<Attribute, Double>> result, List<TMention> mentions) { for (TMention m : mentions) { Map<Attribute, Double> map = Maps.newHashMap(); - map.put(getAttributeByName("score"), weka.core.Utils.missingValue()); + map.put(getAttributeByName(SCORE_ATTRIBUTE_NAME), weka.core.Utils.missingValue()); result.put(m, map); } } diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java index 1f429a5..0ba0f7b 100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java @@ -13,6 +13,8 @@ import java.util.stream.Collectors; public class SentenceFeatureExtractor extends FeatureExtractor { + private static final String SCORE_ATTRIBUTE_NAME = "score"; + public SentenceFeatureExtractor() { addNumericAttributeNormalized("sent_mention_cluster_count"); @@ -39,8 +41,8 @@ public class SentenceFeatureExtractor extends FeatureExtractor { addNumericAttribute("text_mention_count"); addNumericAttribute("text_cluster_count"); - addNumericAttribute("score"); - fillSortedAttributes("score"); + addNumericAttribute(SCORE_ATTRIBUTE_NAME); + fillSortedAttributes(SCORE_ATTRIBUTE_NAME); } public Map<TSentence, Map<Attribute, Double>> calculateFeatures(TText preprocessedText, Set<TMention> goodMentions) { @@ -70,8 +72,8 @@ public class SentenceFeatureExtractor extends FeatureExtractor { feature2value.put(getAttributeByName("sent_token_length"), (double) sentence.getTokens().size()); feature2value.put(getAttributeByName("sent_idx_in_par"), (double) sentenceIdxInPar); feature2value.put(getAttributeByName("sent_idx"), (double) sentenceIdx); - feature2value.put(getAttributeByName("sent_ends_with_dot"), toBinary(helper.getSentenceLastTokenOrth(sentence).equals("."))); - feature2value.put(getAttributeByName("sent_ends_with_questionmark"), toBinary(helper.getSentenceLastTokenOrth(sentence).equals("?"))); + feature2value.put(getAttributeByName("sent_ends_with_dot"), toBinary(".".equals(helper.getSentenceLastTokenOrth(sentence)))); + feature2value.put(getAttributeByName("sent_ends_with_questionmark"), toBinary("?".equals(helper.getSentenceLastTokenOrth(sentence)))); feature2value.put(getAttributeByName("par_idx"), (double) parIdx); feature2value.put(getAttributeByName("par_token_count"), paragraph.getSentences().stream().map(s -> s.getTokens().size()).mapToDouble(s -> s).sum()); @@ -84,7 +86,7 @@ public class SentenceFeatureExtractor extends FeatureExtractor { feature2value.put(getAttributeByName("text_mention_count"), (double) helper.getMentions().size()); feature2value.put(getAttributeByName("text_cluster_count"), (double) helper.getClusters().size()); - feature2value.put(getAttributeByName("score"), weka.core.Utils.missingValue()); + feature2value.put(getAttributeByName(SCORE_ATTRIBUTE_NAME), weka.core.Utils.missingValue()); feature2value.remove(null); diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/ResourceUtils.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/ResourceUtils.java index 5476e17..105494b 100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/ResourceUtils.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/ResourceUtils.java @@ -42,12 +42,13 @@ public class ResourceUtils { } private static List<String> loadUniqueLowercaseSortedNonemptyLinesFromResource(String resourcePath) throws IOException { + Predicate<String> stringIsNonempty = (String s) -> !s.isEmpty(); try (InputStream stream = ResourceUtils.class.getResourceAsStream(resourcePath)) { return IOUtils.readLines(stream, Constants.ENCODING) .stream() .map(String::trim) .map(String::toLowerCase) - .filter(((Predicate<String>) String::isEmpty).negate()) + .filter(stringIsNonempty) .sorted() .distinct() .collect(Collectors.toList()); diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java index d63fe0b..87a52e1 100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java @@ -37,15 +37,8 @@ public class ZeroFeatureExtractor extends FeatureExtractor { private static final String SENTENCE_MENTION_COUNT = "_sentence_mention_count"; private static final String SENTENCE_TOKEN_LENGTH = "_sentence_token_length"; private static final String IS_PAN_OR_PANI = "_is_pan_or_pani"; - - // private static final Set<String> PREV_TOKEN_LEMMAS = Sets.newHashSet( -// "zespół", "tylko", "gdy", ".", ":", "też", "kandydat", "do", "dziś", "bo", "by", "z", "a", "jednak", "jak", "który", "ale", "czy", "i", "się", "rok", "-", "\"", "to", "być", "że", ","); private static final Set<String> PREV_TOKEN_LEMMAS = Sets.newHashSet("to", "z", "do", "o", "czyli", "nie", "\"", "też", "jak", "czy"); - private static final Set<String> NEXT_TOKEN_LEMMAS = Sets.newHashSet(); -// private static final Set<String> NEXT_TOKEN_LEMMAS = Sets.newHashSet( -// "mówić", "ii", "twierdzić", "już", "(", "budzić", "stanowić", "powinien", "do", "stać", "musieć", "stanąć", "móc", "o", "chcieć", "się", "-", "zostać", ":", "?", "i", "na", "z", "mieć", "\"", "to", "w", "nie", "być", ".", ","); - private static final String PREV_TOKEN_LEMMA = "_prev_token_lemma_equal_"; private static final String NEXT_TOKEN_LEMMA = "_next_token_lemma_equal_"; @@ -105,7 +98,10 @@ public class ZeroFeatureExtractor extends FeatureExtractor { candidateFeatures.put(getAttributeByName(SCORE), weka.core.Utils.missingValue()); TMention mention = candidate.getZeroCandidateMention(); - TMention antecedent = candidate.getPreviousSentence().getMentions().stream().filter(ante -> helper.getCoreferentMentions(mention).contains(ante)).findFirst().get(); + TMention antecedent = candidate.getPreviousSentence().getMentions().stream().filter(ante -> helper.getCoreferentMentions(mention).contains(ante)).findFirst().orElse(null); + if (antecedent == null) { + throw new IllegalArgumentException("Mention pair without first element!"); + } addMentionFeatures(helper, candidateFeatures, mention, CANDIDATE_PREFIX); addMentionFeatures(helper, candidateFeatures, antecedent, ANTECEDENT_PREFIX); @@ -165,14 +161,14 @@ public class ZeroFeatureExtractor extends FeatureExtractor { TSentence mentionSentence = helper.getMentionSentence(mention); candidateFeatures.put(getAttributeByName(attributePrefix + SENTENCE_MENTION_COUNT), (double) mentionSentence.getMentions().size()); candidateFeatures.put(getAttributeByName(attributePrefix + SENTENCE_TOKEN_LENGTH), (double) mentionSentence.getTokens().size()); - candidateFeatures.put(getAttributeByName(attributePrefix + SENTENCE_ENDS_WITH_QUESTION_MARK), toBinary(mentionSentence.getTokens().get(mentionSentence.getTokensSize() - 1).getOrth().equals("?"))); + candidateFeatures.put(getAttributeByName(attributePrefix + SENTENCE_ENDS_WITH_QUESTION_MARK), toBinary("?".equals(mentionSentence.getTokens().get(mentionSentence.getTokensSize() - 1).getOrth()))); } private void addNominalAttributeValue(String value, Map<Attribute, Double> attribute2value, String attributeName) { Attribute att = getAttributeByName(attributeName); int index = att.indexOfValue(value); if (index == -1) - LOG.warn(value + "not found for attribute " + attributeName); + LOG.warn("{} not found for attribute {}", value, attributeName); attribute2value.put(att, (double) (index == -1 ? att.indexOfValue("other") : index)); } } diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/CorpusHelper.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/CorpusHelper.java index e938576..4688869 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/CorpusHelper.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/CorpusHelper.java @@ -32,7 +32,7 @@ public class CorpusHelper { } public static List<Summary> getAbstractSummaries(Text text) { - return text.getSummaries().getSummary().stream().filter(summary -> !summary.getType().equals(ABSTRACT_SUMMARY_TYPE) && summary.getRatio().equals(SUMMARY_RATIO)).collect(Collectors.toList()); + return text.getSummaries().getSummary().stream().filter(summary -> summary.getType().equals(ABSTRACT_SUMMARY_TYPE) && summary.getRatio().equals(SUMMARY_RATIO)).collect(Collectors.toList()); } public static Set<String> loadTrainTextIds() throws IOException { diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/PathConstants.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/PathConstants.java index f7c0e1d..d5f76d9 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/PathConstants.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/PathConstants.java @@ -67,15 +67,19 @@ public class PathConstants { } } - public static void downloadFileAndExtract(String url, File targetZipFile, File targetDir) throws IOException, ZipException { + public static void downloadFileAndExtract(String url, File targetZipFile, File targetDir) throws IOException { downloadFile(url, targetZipFile); extractZipFile(targetZipFile, targetDir); } - private static void extractZipFile(File targetZipFile, File targetDir) throws ZipException { + private static void extractZipFile(File targetZipFile, File targetDir) throws IOException { createFolder(targetDir); - ZipFile zipFile = new ZipFile(targetZipFile); - zipFile.extractAll(targetDir.getPath()); + try { + ZipFile zipFile = new ZipFile(targetZipFile); + zipFile.extractAll(targetDir.getPath()); + } catch (ZipException e) { + throw new IOException(e); + } LOG.info("Extracted zip file: {} to dir: {}.", targetZipFile, targetDir); } } diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java index 27bc63a..5fe0270 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java @@ -106,6 +106,10 @@ class Crossvalidate { return Pair.of(acc, name); }).max(Comparator.comparingDouble(Pair::getLeft)); + printBestResult(watch, max); + } + + private static void printBestResult(StopWatch watch, Optional<Pair<Double, String>> max) { LOG.info("#########"); if (max.isPresent()) { LOG.info("Best: " + max.get().getRight() + " : " + max.get().getLeft()); @@ -142,13 +146,6 @@ class Crossvalidate { return Pair.of(acc, name); }).max(Comparator.comparingDouble(Pair::getLeft)); - LOG.info("#########"); - if (max.isPresent()) { - LOG.info("Best: " + max.get().getRight() + " : " + max.get().getLeft()); - } else { - LOG.info("Empty algorithms list"); - } - watch.stop(); - LOG.info("Elapsed time: {}", watch); + printBestResult(watch, max); } } diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/MentionScorer.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/MentionScorer.java index aa341fc..922443d 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/MentionScorer.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/MentionScorer.java @@ -22,33 +22,37 @@ public class MentionScorer { Multiset<String> tokenCounts = HashMultiset.create(TextUtils.tokenize(optimalSummary.toLowerCase())); List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList()); - Map<TMention, String> mention2Orth = loadMention2OrthExcludingStopwords(sentences); + Map<TMention, String> mention2Orth = loadMention2OrthExcludingStoptags(sentences); return booleanTokenIntersection(mention2Orth, tokenCounts); } - private Map<TMention, String> loadMention2OrthExcludingStopwords(List<TSentence> sentences) { + private Map<TMention, String> loadMention2OrthExcludingStoptags(List<TSentence> sentences) { Map<TMention, String> mention2orth = Maps.newHashMap(); for (TSentence sentence : sentences) { Map<String, TToken> tokId2tok = sentence.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity())); for (TMention mention : sentence.getMentions()) { - StringBuilder mentionOrth = new StringBuilder(); - for (String tokId : mention.getChildIds()) { - TToken token = tokId2tok.get(tokId); - if (STOP_POS_TAGS.contains(token.getChosenInterpretation().getCtag())) - continue; - - if (!token.isNoPrecedingSpace()) - mentionOrth.append(" "); - mentionOrth.append(token.getOrth()); - } - mention2orth.put(mention, mentionOrth.toString().trim()); + mention2orth.put(mention, getMentionOrth(tokId2tok, mention)); } } return mention2orth; } + private String getMentionOrth(Map<String, TToken> tokId2tok, TMention mention) { + StringBuilder mentionOrth = new StringBuilder(); + for (String tokId : mention.getChildIds()) { + TToken token = tokId2tok.get(tokId); + if (STOP_POS_TAGS.contains(token.getChosenInterpretation().getCtag())) + continue; + + if (!token.isNoPrecedingSpace()) + mentionOrth.append(" "); + mentionOrth.append(token.getOrth()); + } + return mentionOrth.toString().trim(); + } + private static Map<TMention, Double> booleanTokenIntersection(Map<TMention, String> mention2Orth, Multiset<String> tokenCounts) { Map<TMention, Double> mention2score = Maps.newHashMap(); for (Map.Entry<TMention, String> entry : mention2Orth.entrySet()) { diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractGoldSummaries.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractGoldSummaries.java index 2b311f0..6985061 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractGoldSummaries.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractGoldSummaries.java @@ -17,7 +17,6 @@ import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; public class ExtractGoldSummaries { - private ExtractGoldSummaries() { } @@ -28,25 +27,30 @@ public class ExtractGoldSummaries { File[] files = EXTRACTED_CORPUS_DATA_DIR.listFiles(); if (files != null) { for (File file : files) { - Text text = PSC_IO.readText(file); + extractGoldSummariesFromFile(file); + } + } + } - List<Summary> goldSummaries; + private static void extractGoldSummariesFromFile(File file) throws IOException, JAXBException { + Text text = PSC_IO.readText(file); - boolean isTest = CorpusHelper.isTest(text); - if (isTest) { - goldSummaries = CorpusHelper.getAbstractSummaries(text); - } else { - goldSummaries = CorpusHelper.getExtractSummaries(text); - } + List<Summary> goldSummaries; + File targetDir; - for (Summary summary : goldSummaries) { - File targetDir = isTest ? GOLD_TEST_SUMMARIES_DIR : GOLD_TRAIN_SUMMARIES_DIR; - File targetFile = new File(targetDir, text.getId() + "_" + summary.getAuthor() + ".txt"); + boolean isTest = CorpusHelper.isTest(text); + if (isTest) { + goldSummaries = CorpusHelper.getAbstractSummaries(text); + targetDir = GOLD_TEST_SUMMARIES_DIR; + } else { + goldSummaries = CorpusHelper.getExtractSummaries(text); + targetDir = GOLD_TRAIN_SUMMARIES_DIR; + } - try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(targetFile), Constants.ENCODING)) { - writer.append(summary.getBody()); - } - } + for (Summary summary : goldSummaries) { + File targetFile = new File(targetDir, text.getId() + "_" + summary.getAuthor() + ".txt"); + try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(targetFile), Constants.ENCODING)) { + writer.append(summary.getBody()); } } } diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractMostFrequentMentions.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractMostFrequentMentions.java index 44d8bb7..4b24879 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractMostFrequentMentions.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractMostFrequentMentions.java @@ -9,7 +9,6 @@ import pl.waw.ipipan.zil.summ.nicolas.PathConstants; import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; -import javax.xml.bind.JAXBException; import java.io.BufferedWriter; import java.io.FileWriter; import java.io.IOException; @@ -26,7 +25,7 @@ public class ExtractMostFrequentMentions { private ExtractMostFrequentMentions() { } - public static void main(String[] args) throws IOException, JAXBException { + public static void main(String[] args) throws IOException { List<String> mostFrequentMentionBases = getMostFrequentMentionBases(); try (BufferedWriter bw = new BufferedWriter(new FileWriter(PathConstants.TARGET_MODEL_DIR + Constants.FREQUENT_BASES_RESOURCE_PATH))) { for (String base : mostFrequentMentionBases) { diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java index 9c4f012..6215502 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java @@ -89,7 +89,7 @@ public class PrepareTrainingData { } } - private static void prepareSentencesDataset(Map<String, TText> id2preprocessedText, Map<String, String> id2optimalSummary) throws Exception { + private static void prepareSentencesDataset(Map<String, TText> id2preprocessedText, Map<String, String> id2optimalSummary) throws IOException { SentenceScorer sentenceScorer = new SentenceScorer(); SentenceFeatureExtractor featureExtractor = new SentenceFeatureExtractor(); @@ -97,10 +97,12 @@ public class PrepareTrainingData { Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList()); int i = 1; - for (String textId : id2preprocessedText.keySet()) { + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { logProgress(id2preprocessedText, i++); - TText preprocessedText = id2preprocessedText.get(textId); + String textId = entry.getKey(); + TText preprocessedText = entry.getValue(); + String optimalSummary = id2optimalSummary.get(textId); if (optimalSummary == null) continue; @@ -110,9 +112,9 @@ public class PrepareTrainingData { = loadGoldGoodMentions(textId, preprocessedText, id2optimalSummary); Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(preprocessedText, featureExtractor, goodMentions); - for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { - TSentence sentence = entry.getKey(); - Instance instance = entry.getValue(); + for (Map.Entry<TSentence, Instance> sentenceInstance : sentence2instance.entrySet()) { + TSentence sentence = sentenceInstance.getKey(); + Instance instance = sentenceInstance.getValue(); instance.setDataset(instances); instance.setClassValue(sentence2score.get(sentence)); instances.add(instance); @@ -121,7 +123,7 @@ public class PrepareTrainingData { saveInstancesToFile(instances, SENTENCE_ARFF); } - private static Set<TMention> loadGoldGoodMentions(String id, TText text, Map<String, String> id2optimalSummary) throws IOException { + private static Set<TMention> loadGoldGoodMentions(String id, TText text, Map<String, String> id2optimalSummary) { String optimalSummary = id2optimalSummary.get(id); MentionScorer scorer = new MentionScorer(); diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/TrainAllModels.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/TrainAllModels.java index e6d9588..f8b9d7c 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/TrainAllModels.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/TrainAllModels.java @@ -7,11 +7,10 @@ import pl.waw.ipipan.zil.summ.nicolas.Constants; import pl.waw.ipipan.zil.summ.nicolas.train.model.Settings; import weka.classifiers.Classifier; import weka.core.Instances; +import weka.core.SerializationHelper; import weka.core.converters.ArffLoader; import java.io.File; -import java.io.FileOutputStream; -import java.io.ObjectOutputStream; import java.util.logging.LogManager; import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; @@ -48,10 +47,7 @@ public class TrainAllModels { String target = TARGET_MODEL_DIR + targetPath; LOG.info("Saving classifier at: {}", target); - try (ObjectOutputStream oos = new ObjectOutputStream( - new FileOutputStream(target))) { - oos.writeObject(classifier); - } + SerializationHelper.write(target, classifier); watch.stop(); LOG.info("Elapsed time: {}", watch); -- libgit2 0.22.2