Commit c9b1021d986bf2636bfd978885170dedd7d45546 (1 parent: d3b1a80b)
fix sonar issues
Showing 15 changed files with 177 additions and 142 deletions
nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Cli.java
| ... | ... | @@ -92,7 +92,10 @@ class Cli { |
| 92 | 92 | public void validate(String name, String value) { |
| 93 | 93 | File file = new File(value); |
| 94 | 94 | try { |
| 95 | - file.createNewFile(); | |
| 95 | + boolean newFile = file.createNewFile(); | |
| 96 | + if (!newFile) { | |
| 97 | + LOG.warn("Output file exists and will be overridden."); | |
| 98 | + } | |
| 96 | 99 | } catch (IOException ex) { |
| 97 | 100 | throw new ParameterException("Parameter " + name |
| 98 | 101 | + " should be a valid file path (found " + value + ")", ex); |
| ... | ... |
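Note: checking the boolean returned by File#createNewFile addresses the Sonar rule about ignored status return values; the file may already exist, and the old code discarded that information silently. A minimal standalone sketch of the pattern (class and method names here are illustrative, not from the commit):

    import java.io.File;
    import java.io.IOException;

    public class OutputFileCheck {
        // createNewFile() returns false when the file already exists;
        // ignoring that boolean silently drops the operation status.
        static void ensureWritableTarget(String path) throws IOException {
            File file = new File(path);
            if (!file.createNewFile()) {
                System.err.println("Output file exists and will be overridden.");
            }
        }
    }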
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureExtractor.java
| ... | ... | @@ -67,11 +67,14 @@ public class FeatureExtractor { |
| 67 | 67 | } |
| 68 | 68 | } |
| 69 | 69 | for (Map<Attribute, Double> entityAttributes : entity2attributes.values()) { |
| 70 | - for (Attribute attribute : attribute2max.keySet()) { | |
| 70 | + for (Map.Entry<Attribute, Double> entry : attribute2max.entrySet()) { | |
| 71 | + Attribute attribute = entry.getKey(); | |
| 72 | + Double max = entry.getValue(); | |
| 73 | + | |
| 71 | 74 | Attribute normalizedAttribute = getAttributeByName(name2attribute.inverse().get(attribute) + "_normalized"); |
| 72 | 75 | entityAttributes.put(normalizedAttribute, |
| 73 | 76 | (entityAttributes.get(attribute) - attribute2min.get(attribute)) |
| 74 | - / (attribute2max.get(attribute) - attribute2min.get(attribute))); | |
| 77 | + / (max - attribute2min.get(attribute))); | |
| 75 | 78 | } |
| 76 | 79 | } |
| 77 | 80 | } |
| ... | ... |
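Note: Sonar flags loops that iterate keySet() and then call get() on the same map; entrySet() yields key and value together and saves a hash lookup per iteration. A self-contained illustration of the before/after (map contents invented):

    import java.util.HashMap;
    import java.util.Map;

    public class EntrySetDemo {
        public static void main(String[] args) {
            Map<String, Double> attribute2max = new HashMap<>();
            attribute2max.put("sent_token_length", 42.0);

            // Before: an extra lookup per iteration just to fetch the value.
            for (String attribute : attribute2max.keySet()) {
                System.out.println(attribute + " -> " + attribute2max.get(attribute));
            }

            // After: each entry already carries both key and value.
            for (Map.Entry<String, Double> entry : attribute2max.entrySet()) {
                System.out.println(entry.getKey() + " -> " + entry.getValue());
            }
        }
    }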
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java
| ... | ... | @@ -54,49 +54,56 @@ public class FeatureHelper { |
| 54 | 54 | coref2mentions.put(coref, ments); |
| 55 | 55 | } |
| 56 | 56 | |
| 57 | - int parIdx = 0; | |
| 58 | - int sentIdx = 0; | |
| 59 | - int mentionIdx = 0; | |
| 57 | + Counters counters = new Counters(); | |
| 60 | 58 | for (TParagraph par : preprocessedText.getParagraphs()) { |
| 61 | - Map<TMention, String> m2o = loadMention2Orth(par.getSentences()); | |
| 62 | - mention2Orth.putAll(m2o); | |
| 63 | - Map<TMention, String> m2b = loadMention2Base(par.getSentences()); | |
| 64 | - mention2Base.putAll(m2b); | |
| 65 | - | |
| 66 | - int sentIdxInPar = 0; | |
| 67 | - int mentionIdxInPar = 0; | |
| 68 | - for (TSentence sent : par.getSentences()) { | |
| 69 | - | |
| 70 | - Map<String, TToken> tokenId2token = sent.getTokens().stream().collect(toMap(TToken::getId, Function.identity())); | |
| 71 | - | |
| 72 | - Map<String, Set<TNamedEntity>> tokenId2namedEntities = Maps.newHashMap(); | |
| 73 | - for (TNamedEntity namedEntity : sent.getNames()) { | |
| 74 | - for (String childId : namedEntity.getChildIds()) { | |
| 75 | - tokenId2namedEntities.putIfAbsent(childId, Sets.newHashSet()); | |
| 76 | - tokenId2namedEntities.get(childId).add(namedEntity); | |
| 77 | - } | |
| 78 | - } | |
| 59 | + processParagraph(counters, par); | |
| 60 | + } | |
| 61 | + } | |
| 79 | 62 | |
| 80 | - int mentionIdxInSent = 0; | |
| 81 | - for (TMention mention : sent.getMentions()) { | |
| 82 | - mention2sent.put(mention, sent); | |
| 83 | - mention2par.put(mention, par); | |
| 84 | - mention2index.put(mention, mentionIdx++); | |
| 85 | - mention2firstTokenIndex.put(mention, sent.getTokens().indexOf(tokenId2token.get(mention.getChildIds().iterator().next()))); | |
| 86 | - mention2indexInSent.put(mention, mentionIdxInSent++); | |
| 87 | - mention2indexInPar.put(mention, mentionIdxInPar++); | |
| 88 | - | |
| 89 | - String firstHeadTokenId = mention.getHeadIds().iterator().next(); | |
| 90 | - mention2head.put(mention, tokenId2token.get(firstHeadTokenId)); | |
| 91 | - if (tokenId2namedEntities.containsKey(firstHeadTokenId)) | |
| 92 | - mentionsInNamedEntities.add(mention); | |
| 93 | - } | |
| 94 | - sent2Index.put(sent, sentIdx++); | |
| 95 | - sent2IndexInPar.put(sent, sentIdxInPar++); | |
| 63 | + private void processParagraph(Counters counters, TParagraph par) { | |
| 64 | + Map<TMention, String> m2o = loadMention2Orth(par.getSentences()); | |
| 65 | + mention2Orth.putAll(m2o); | |
| 66 | + Map<TMention, String> m2b = loadMention2Base(par.getSentences()); | |
| 67 | + mention2Base.putAll(m2b); | |
| 68 | + | |
| 69 | + int sentIdxInPar = 0; | |
| 70 | + int mentionIdxInPar = 0; | |
| 71 | + for (TSentence sent : par.getSentences()) { | |
| 72 | + | |
| 73 | + Map<String, TToken> tokenId2token = sent.getTokens().stream().collect(toMap(TToken::getId, Function.identity())); | |
| 74 | + | |
| 75 | + Map<String, Set<TNamedEntity>> tokenId2namedEntities = getTokenId2NamedEntities(sent); | |
| 76 | + | |
| 77 | + int mentionIdxInSent = 0; | |
| 78 | + for (TMention mention : sent.getMentions()) { | |
| 79 | + mention2sent.put(mention, sent); | |
| 80 | + mention2par.put(mention, par); | |
| 81 | + mention2index.put(mention, counters.mentionIdx++); | |
| 82 | + mention2firstTokenIndex.put(mention, sent.getTokens().indexOf(tokenId2token.get(mention.getChildIds().iterator().next()))); | |
| 83 | + mention2indexInSent.put(mention, mentionIdxInSent++); | |
| 84 | + mention2indexInPar.put(mention, mentionIdxInPar++); | |
| 85 | + | |
| 86 | + String firstHeadTokenId = mention.getHeadIds().iterator().next(); | |
| 87 | + mention2head.put(mention, tokenId2token.get(firstHeadTokenId)); | |
| 88 | + if (tokenId2namedEntities.containsKey(firstHeadTokenId)) | |
| 89 | + mentionsInNamedEntities.add(mention); | |
| 96 | 90 | } |
| 91 | + sent2Index.put(sent, counters.sentIdx++); | |
| 92 | + sent2IndexInPar.put(sent, sentIdxInPar++); | |
| 93 | + } | |
| 97 | 94 | |
| 98 | - par2Index.put(par, parIdx++); | |
| 95 | + par2Index.put(par, counters.parIdx++); | |
| 96 | + } | |
| 97 | + | |
| 98 | + private Map<String, Set<TNamedEntity>> getTokenId2NamedEntities(TSentence sent) { | |
| 99 | + Map<String, Set<TNamedEntity>> tokenId2namedEntities = Maps.newHashMap(); | |
| 100 | + for (TNamedEntity namedEntity : sent.getNames()) { | |
| 101 | + for (String childId : namedEntity.getChildIds()) { | |
| 102 | + tokenId2namedEntities.putIfAbsent(childId, Sets.newHashSet()); | |
| 103 | + tokenId2namedEntities.get(childId).add(namedEntity); | |
| 104 | + } | |
| 99 | 105 | } |
| 106 | + return tokenId2namedEntities; | |
| 100 | 107 | } |
| 101 | 108 | |
| 102 | 109 | public List<TMention> getMentions() { |
| ... | ... | @@ -220,31 +227,35 @@ public class FeatureHelper { |
| 220 | 227 | return mention2sent.get(mention).getTokens().get(idx - 1); |
| 221 | 228 | } |
| 222 | 229 | |
| 223 | - private static Map<TMention, String> loadMention2Orth(List<TSentence> sents) { | |
| 230 | + private static Map<TMention, String> loadMention2Orth(List<TSentence> sentences) { | |
| 224 | 231 | Map<TMention, String> mention2orth = Maps.newHashMap(); |
| 225 | - for (TSentence s : sents) { | |
| 226 | - Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity())); | |
| 232 | + for (TSentence sentence : sentences) { | |
| 233 | + Map<String, TToken> tokId2tok = sentence.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity())); | |
| 227 | 234 | |
| 228 | - for (TMention m : s.getMentions()) { | |
| 229 | - StringBuilder mentionOrth = new StringBuilder(); | |
| 230 | - for (String tokId : m.getChildIds()) { | |
| 231 | - TToken token = tokId2tok.get(tokId); | |
| 232 | - if (!token.isNoPrecedingSpace()) | |
| 233 | - mentionOrth.append(" "); | |
| 234 | - mentionOrth.append(token.getOrth()); | |
| 235 | - } | |
| 236 | - mention2orth.put(m, mentionOrth.toString().trim()); | |
| 235 | + for (TMention mention : sentence.getMentions()) { | |
| 236 | + mention2orth.put(mention, getMentionOrth(tokId2tok, mention)); | |
| 237 | 237 | } |
| 238 | 238 | } |
| 239 | 239 | return mention2orth; |
| 240 | 240 | } |
| 241 | 241 | |
| 242 | - private static Map<TMention, String> loadMention2Base(List<TSentence> sents) { | |
| 242 | + private static String getMentionOrth(Map<String, TToken> tokId2tok, TMention m) { | |
| 243 | + StringBuilder mentionOrth = new StringBuilder(); | |
| 244 | + for (String tokId : m.getChildIds()) { | |
| 245 | + TToken token = tokId2tok.get(tokId); | |
| 246 | + if (!token.isNoPrecedingSpace()) | |
| 247 | + mentionOrth.append(" "); | |
| 248 | + mentionOrth.append(token.getOrth()); | |
| 249 | + } | |
| 250 | + return mentionOrth.toString().trim(); | |
| 251 | + } | |
| 252 | + | |
| 253 | + private static Map<TMention, String> loadMention2Base(List<TSentence> sentences) { | |
| 243 | 254 | Map<TMention, String> mention2base = Maps.newHashMap(); |
| 244 | - for (TSentence s : sents) { | |
| 245 | - Map<String, String> tokId2base = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, tok -> tok.getChosenInterpretation().getBase())); | |
| 255 | + for (TSentence sentence : sentences) { | |
| 256 | + Map<String, String> tokId2base = sentence.getTokens().stream().collect(Collectors.toMap(TToken::getId, tok -> tok.getChosenInterpretation().getBase())); | |
| 246 | 257 | |
| 247 | - for (TMention m : s.getMentions()) { | |
| 258 | + for (TMention m : sentence.getMentions()) { | |
| 248 | 259 | StringBuilder mentionBase = new StringBuilder(); |
| 249 | 260 | for (String tokId : m.getChildIds()) { |
| 250 | 261 | mentionBase.append(" "); |
| ... | ... | @@ -255,4 +266,10 @@ public class FeatureHelper { |
| 255 | 266 | } |
| 256 | 267 | return mention2base; |
| 257 | 268 | } |
| 269 | + | |
| 270 | + private class Counters { | |
| 271 | + int parIdx = 0; | |
| 272 | + int sentIdx = 0; | |
| 273 | + int mentionIdx = 0; | |
| 274 | + } | |
| 258 | 275 | } |
| ... | ... |
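Note: the long constructor loop is split into processParagraph and getTokenId2NamedEntities, and the three running indices move into a Counters holder so the extracted method can advance them across paragraphs: Java passes primitives by value, so plain int parameters could not do this. A small sketch of that point (names illustrative); as an aside, Counters never touches the enclosing instance, so it could also be declared static:

    public class CounterDemo {
        static class Counters { int mentionIdx; }

        static void bumpPrimitive(int counter) { counter++; }   // invisible to the caller
        static void bumpHolder(Counters c) { c.mentionIdx++; }  // visible to the caller

        public static void main(String[] args) {
            int plain = 0;
            bumpPrimitive(plain);
            Counters held = new Counters();
            bumpHolder(held);
            System.out.println(plain + " vs " + held.mentionIdx); // prints "0 vs 1"
        }
    }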
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java
| ... | ... | @@ -18,6 +18,10 @@ import java.util.stream.Collectors; |
| 18 | 18 | |
| 19 | 19 | public class MentionFeatureExtractor extends FeatureExtractor { |
| 20 | 20 | |
| 21 | + private static final String SCORE_ATTRIBUTE_NAME = "score"; | |
| 22 | + private static final String OTHER_VALUE = "other"; | |
| 23 | + private static final String NULL_VALUE = "null"; | |
| 24 | + | |
| 21 | 25 | private final List<String> frequentBases; |
| 22 | 26 | |
| 23 | 27 | public MentionFeatureExtractor() throws IOException { |
| ... | ... | @@ -48,10 +52,10 @@ public class MentionFeatureExtractor extends FeatureExtractor { |
| 48 | 52 | addBinaryAttribute(prefix + "_is_named"); |
| 49 | 53 | addBinaryAttribute(prefix + "_is_pronoun"); |
| 50 | 54 | addNominalAttribute(prefix + "_ctag", Constants.POS_TAGS); |
| 51 | - addNominalAttribute(prefix + "_person", Lists.newArrayList("other", "null", "pri", "sec", "ter")); | |
| 52 | - addNominalAttribute(prefix + "_case", Lists.newArrayList("other", "null", "nom", "acc", "dat", "gen", "loc", "inst", "voc")); | |
| 53 | - addNominalAttribute(prefix + "_number", Lists.newArrayList("other", "null", "sg", "pl")); | |
| 54 | - addNominalAttribute(prefix + "_gender", Lists.newArrayList("other", "null", "f", "m1", "m2", "m3", "n")); | |
| 55 | + addNominalAttribute(prefix + "_person", Lists.newArrayList(OTHER_VALUE, NULL_VALUE, "pri", "sec", "ter")); | |
| 56 | + addNominalAttribute(prefix + "_case", Lists.newArrayList(OTHER_VALUE, NULL_VALUE, "nom", "acc", "dat", "gen", "loc", "inst", "voc")); | |
| 57 | + addNominalAttribute(prefix + "_number", Lists.newArrayList(OTHER_VALUE, NULL_VALUE, "sg", "pl")); | |
| 58 | + addNominalAttribute(prefix + "_gender", Lists.newArrayList(OTHER_VALUE, NULL_VALUE, "f", "m1", "m2", "m3", "n")); | |
| 55 | 59 | |
| 56 | 60 | // relation to other |
| 57 | 61 | addBinaryAttribute(prefix + "_is_nested"); |
| ... | ... | @@ -76,8 +80,8 @@ public class MentionFeatureExtractor extends FeatureExtractor { |
| 76 | 80 | } |
| 77 | 81 | } |
| 78 | 82 | |
| 79 | - addNominalAttribute("score", Lists.newArrayList("bad", "good")); | |
| 80 | - fillSortedAttributes("score"); | |
| 83 | + addNominalAttribute(SCORE_ATTRIBUTE_NAME, Lists.newArrayList("bad", "good")); | |
| 84 | + fillSortedAttributes(SCORE_ATTRIBUTE_NAME); | |
| 81 | 85 | } |
| 82 | 86 | |
| 83 | 87 | private String encodeBase(String base) { |
| ... | ... | @@ -143,8 +147,11 @@ public class MentionFeatureExtractor extends FeatureExtractor { |
| 143 | 147 | attribute2value.put(getAttributeByName(attributePrefix + "_is_nesting"), toBinary(helper.isNesting(mention))); |
| 144 | 148 | |
| 145 | 149 | String orth = helper.getMentionOrth(mention); |
| 146 | - attribute2value.put(getAttributeByName(attributePrefix + "_capitalized"), toBinary(orth.length() != 0 && orth.substring(0, 1).toUpperCase().equals(orth.substring(0, 1)))); | |
| 147 | - attribute2value.put(getAttributeByName(attributePrefix + "_all_caps"), toBinary(orth.toUpperCase().equals(orth))); | |
| 150 | + String firstLetter = orth.substring(0, 1); | |
| 151 | + String firstLetterUpperCased = firstLetter.toUpperCase(); | |
| 152 | + attribute2value.put(getAttributeByName(attributePrefix + "_capitalized"), toBinary(orth.length() != 0 && firstLetterUpperCased.equals(firstLetter))); | |
| 153 | + String upperCased = orth.toUpperCase(); | |
| 154 | + attribute2value.put(getAttributeByName(attributePrefix + "_all_caps"), toBinary(upperCased.equals(orth))); | |
| 148 | 155 | attribute2value.put(getAttributeByName(attributePrefix + "_char_count"), (double) orth.length()); |
| 149 | 156 | |
| 150 | 157 | // par characteristics |
| ... | ... | @@ -159,8 +166,8 @@ public class MentionFeatureExtractor extends FeatureExtractor { |
| 159 | 166 | attribute2value.put(getAttributeByName(attributePrefix + "_sent_mention_count"), (double) mentionSentence.getMentions().size()); |
| 160 | 167 | attribute2value.put(getAttributeByName(attributePrefix + "_sent_idx"), (double) helper.getSentIndex(mentionSentence)); |
| 161 | 168 | attribute2value.put(getAttributeByName(attributePrefix + "_sent_idx_in_par"), (double) helper.getSentIndexInPar(mentionSentence)); |
| 162 | - attribute2value.put(getAttributeByName(attributePrefix + "_sent_ends_with_dot"), toBinary(helper.getSentenceLastTokenOrth(mentionSentence).equals("."))); | |
| 163 | - attribute2value.put(getAttributeByName(attributePrefix + "_sent_ends_with_questionmark"), toBinary(helper.getSentenceLastTokenOrth(mentionSentence).equals("?"))); | |
| 169 | + attribute2value.put(getAttributeByName(attributePrefix + "_sent_ends_with_dot"), toBinary(".".equals(helper.getSentenceLastTokenOrth(mentionSentence)))); | |
| 170 | + attribute2value.put(getAttributeByName(attributePrefix + "_sent_ends_with_questionmark"), toBinary("?".equals(helper.getSentenceLastTokenOrth(mentionSentence)))); | |
| 164 | 171 | |
| 165 | 172 | // frequent bases |
| 166 | 173 | String mentionBase = helper.getMentionBase(mention); |
| ... | ... | @@ -174,14 +181,14 @@ public class MentionFeatureExtractor extends FeatureExtractor { |
| 174 | 181 | int index = att.indexOfValue(value); |
| 175 | 182 | if (index == -1) |
| 176 | 183 | LOG.warn("{} not found for attribute {}", value, attributeName); |
| 177 | - attribute2value.put(att, (double) (index == -1 ? att.indexOfValue("other") : index)); | |
| 184 | + attribute2value.put(att, (double) (index == -1 ? att.indexOfValue(OTHER_VALUE) : index)); | |
| 178 | 185 | } |
| 179 | 186 | |
| 180 | 187 | |
| 181 | 188 | private void addScoreFeature(Map<TMention, Map<Attribute, Double>> result, List<TMention> mentions) { |
| 182 | 189 | for (TMention m : mentions) { |
| 183 | 190 | Map<Attribute, Double> map = Maps.newHashMap(); |
| 184 | - map.put(getAttributeByName("score"), weka.core.Utils.missingValue()); | |
| 191 | + map.put(getAttributeByName(SCORE_ATTRIBUTE_NAME), weka.core.Utils.missingValue()); | |
| 185 | 192 | result.put(m, map); |
| 186 | 193 | } |
| 187 | 194 | } |
| ... | ... |
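Note: two recurring fixes appear in this file — string literals hoisted into constants, and literal-first equals comparisons (see the sketch after the next file). One caveat: the refactored code calls orth.substring(0, 1) before the orth.length() != 0 guard runs, so an empty mention orthography would now throw StringIndexOutOfBoundsException, whereas the original short-circuited. A safer ordering, assuming an empty orth is possible (which the original guard suggests):

    public class CapitalizationCheck {
        // Guard before substring so an empty string cannot throw.
        static boolean isCapitalized(String orth) {
            if (orth.isEmpty()) {
                return false;
            }
            String firstLetter = orth.substring(0, 1);
            return firstLetter.equals(firstLetter.toUpperCase());
        }

        public static void main(String[] args) {
            System.out.println(isCapitalized("Warszawa")); // true
            System.out.println(isCapitalized(""));         // false, no exception
        }
    }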
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java
| ... | ... | @@ -13,6 +13,8 @@ import java.util.stream.Collectors; |
| 13 | 13 | |
| 14 | 14 | public class SentenceFeatureExtractor extends FeatureExtractor { |
| 15 | 15 | |
| 16 | + private static final String SCORE_ATTRIBUTE_NAME = "score"; | |
| 17 | + | |
| 16 | 18 | public SentenceFeatureExtractor() { |
| 17 | 19 | |
| 18 | 20 | addNumericAttributeNormalized("sent_mention_cluster_count"); |
| ... | ... | @@ -39,8 +41,8 @@ public class SentenceFeatureExtractor extends FeatureExtractor { |
| 39 | 41 | addNumericAttribute("text_mention_count"); |
| 40 | 42 | addNumericAttribute("text_cluster_count"); |
| 41 | 43 | |
| 42 | - addNumericAttribute("score"); | |
| 43 | - fillSortedAttributes("score"); | |
| 44 | + addNumericAttribute(SCORE_ATTRIBUTE_NAME); | |
| 45 | + fillSortedAttributes(SCORE_ATTRIBUTE_NAME); | |
| 44 | 46 | } |
| 45 | 47 | |
| 46 | 48 | public Map<TSentence, Map<Attribute, Double>> calculateFeatures(TText preprocessedText, Set<TMention> goodMentions) { |
| ... | ... | @@ -70,8 +72,8 @@ public class SentenceFeatureExtractor extends FeatureExtractor { |
| 70 | 72 | feature2value.put(getAttributeByName("sent_token_length"), (double) sentence.getTokens().size()); |
| 71 | 73 | feature2value.put(getAttributeByName("sent_idx_in_par"), (double) sentenceIdxInPar); |
| 72 | 74 | feature2value.put(getAttributeByName("sent_idx"), (double) sentenceIdx); |
| 73 | - feature2value.put(getAttributeByName("sent_ends_with_dot"), toBinary(helper.getSentenceLastTokenOrth(sentence).equals("."))); | |
| 74 | - feature2value.put(getAttributeByName("sent_ends_with_questionmark"), toBinary(helper.getSentenceLastTokenOrth(sentence).equals("?"))); | |
| 75 | + feature2value.put(getAttributeByName("sent_ends_with_dot"), toBinary(".".equals(helper.getSentenceLastTokenOrth(sentence)))); | |
| 76 | + feature2value.put(getAttributeByName("sent_ends_with_questionmark"), toBinary("?".equals(helper.getSentenceLastTokenOrth(sentence)))); | |
| 75 | 77 | |
| 76 | 78 | feature2value.put(getAttributeByName("par_idx"), (double) parIdx); |
| 77 | 79 | feature2value.put(getAttributeByName("par_token_count"), paragraph.getSentences().stream().map(s -> s.getTokens().size()).mapToDouble(s -> s).sum()); |
| ... | ... | @@ -84,7 +86,7 @@ public class SentenceFeatureExtractor extends FeatureExtractor { |
| 84 | 86 | feature2value.put(getAttributeByName("text_mention_count"), (double) helper.getMentions().size()); |
| 85 | 87 | feature2value.put(getAttributeByName("text_cluster_count"), (double) helper.getClusters().size()); |
| 86 | 88 | |
| 87 | - feature2value.put(getAttributeByName("score"), weka.core.Utils.missingValue()); | |
| 89 | + feature2value.put(getAttributeByName(SCORE_ATTRIBUTE_NAME), weka.core.Utils.missingValue()); | |
| 88 | 90 | |
| 89 | 91 | feature2value.remove(null); |
| 90 | 92 | |
| ... | ... |
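Note: the ".".equals(x) form used here and in the previous file is the standard null-safe comparison; it returns false instead of throwing when the other side is null:

    public class LiteralFirstEquals {
        public static void main(String[] args) {
            String lastTokenOrth = null; // e.g. no token orth was extracted
            System.out.println(".".equals(lastTokenOrth)); // false, no NPE
            // lastTokenOrth.equals(".") would throw NullPointerException here
        }
    }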
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/ResourceUtils.java
| ... | ... | @@ -42,12 +42,13 @@ public class ResourceUtils { |
| 42 | 42 | } |
| 43 | 43 | |
| 44 | 44 | private static List<String> loadUniqueLowercaseSortedNonemptyLinesFromResource(String resourcePath) throws IOException { |
| 45 | + Predicate<String> stringIsNonempty = (String s) -> !s.isEmpty(); | |
| 45 | 46 | try (InputStream stream = ResourceUtils.class.getResourceAsStream(resourcePath)) { |
| 46 | 47 | return IOUtils.readLines(stream, Constants.ENCODING) |
| 47 | 48 | .stream() |
| 48 | 49 | .map(String::trim) |
| 49 | 50 | .map(String::toLowerCase) |
| 50 | - .filter(((Predicate<String>) String::isEmpty).negate()) | |
| 51 | + .filter(stringIsNonempty) | |
| 51 | 52 | .sorted() |
| 52 | 53 | .distinct() |
| 53 | 54 | .collect(Collectors.toList()); |
| ... | ... |
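Note: the named predicate replaces the hard-to-read ((Predicate<String>) String::isEmpty).negate() cast. On Java 11+ the same filter reads as Predicate.not(String::isEmpty); the local variable chosen in the commit suggests an older language level, so treat the following as an alternative under that assumption, not as the project's actual target:

    import java.util.List;
    import java.util.function.Predicate;
    import java.util.stream.Collectors;
    import java.util.stream.Stream;

    public class NonEmptyFilter {
        public static void main(String[] args) {
            List<String> lines = Stream.of("Ala", "", "  kot  ")
                    .map(String::trim)
                    .map(String::toLowerCase)
                    .filter(Predicate.not(String::isEmpty)) // Java 11+
                    .sorted()
                    .distinct()
                    .collect(Collectors.toList());
            System.out.println(lines); // [ala, kot]
        }
    }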
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java
| ... | ... | @@ -37,15 +37,8 @@ public class ZeroFeatureExtractor extends FeatureExtractor { |
| 37 | 37 | private static final String SENTENCE_MENTION_COUNT = "_sentence_mention_count"; |
| 38 | 38 | private static final String SENTENCE_TOKEN_LENGTH = "_sentence_token_length"; |
| 39 | 39 | private static final String IS_PAN_OR_PANI = "_is_pan_or_pani"; |
| 40 | - | |
| 41 | - // private static final Set<String> PREV_TOKEN_LEMMAS = Sets.newHashSet( | |
| 42 | -// "zespół", "tylko", "gdy", ".", ":", "też", "kandydat", "do", "dziś", "bo", "by", "z", "a", "jednak", "jak", "który", "ale", "czy", "i", "się", "rok", "-", "\"", "to", "być", "że", ","); | |
| 43 | 40 | private static final Set<String> PREV_TOKEN_LEMMAS = Sets.newHashSet("to", "z", "do", "o", "czyli", "nie", "\"", "też", "jak", "czy"); |
| 44 | - | |
| 45 | 41 | private static final Set<String> NEXT_TOKEN_LEMMAS = Sets.newHashSet(); |
| 46 | -// private static final Set<String> NEXT_TOKEN_LEMMAS = Sets.newHashSet( | |
| 47 | -// "mówić", "ii", "twierdzić", "już", "(", "budzić", "stanowić", "powinien", "do", "stać", "musieć", "stanąć", "móc", "o", "chcieć", "się", "-", "zostać", ":", "?", "i", "na", "z", "mieć", "\"", "to", "w", "nie", "być", ".", ","); | |
| 48 | - | |
| 49 | 42 | private static final String PREV_TOKEN_LEMMA = "_prev_token_lemma_equal_"; |
| 50 | 43 | private static final String NEXT_TOKEN_LEMMA = "_next_token_lemma_equal_"; |
| 51 | 44 | |
| ... | ... | @@ -105,7 +98,10 @@ public class ZeroFeatureExtractor extends FeatureExtractor { |
| 105 | 98 | candidateFeatures.put(getAttributeByName(SCORE), weka.core.Utils.missingValue()); |
| 106 | 99 | |
| 107 | 100 | TMention mention = candidate.getZeroCandidateMention(); |
| 108 | - TMention antecedent = candidate.getPreviousSentence().getMentions().stream().filter(ante -> helper.getCoreferentMentions(mention).contains(ante)).findFirst().get(); | |
| 101 | + TMention antecedent = candidate.getPreviousSentence().getMentions().stream().filter(ante -> helper.getCoreferentMentions(mention).contains(ante)).findFirst().orElse(null); | |
| 102 | + if (antecedent == null) { | |
| 103 | + throw new IllegalArgumentException("Mention pair without first element!"); | |
| 104 | + } | |
| 109 | 105 | |
| 110 | 106 | addMentionFeatures(helper, candidateFeatures, mention, CANDIDATE_PREFIX); |
| 111 | 107 | addMentionFeatures(helper, candidateFeatures, antecedent, ANTECEDENT_PREFIX); |
| ... | ... | @@ -165,14 +161,14 @@ public class ZeroFeatureExtractor extends FeatureExtractor { |
| 165 | 161 | TSentence mentionSentence = helper.getMentionSentence(mention); |
| 166 | 162 | candidateFeatures.put(getAttributeByName(attributePrefix + SENTENCE_MENTION_COUNT), (double) mentionSentence.getMentions().size()); |
| 167 | 163 | candidateFeatures.put(getAttributeByName(attributePrefix + SENTENCE_TOKEN_LENGTH), (double) mentionSentence.getTokens().size()); |
| 168 | - candidateFeatures.put(getAttributeByName(attributePrefix + SENTENCE_ENDS_WITH_QUESTION_MARK), toBinary(mentionSentence.getTokens().get(mentionSentence.getTokensSize() - 1).getOrth().equals("?"))); | |
| 164 | + candidateFeatures.put(getAttributeByName(attributePrefix + SENTENCE_ENDS_WITH_QUESTION_MARK), toBinary("?".equals(mentionSentence.getTokens().get(mentionSentence.getTokensSize() - 1).getOrth()))); | |
| 169 | 165 | } |
| 170 | 166 | |
| 171 | 167 | private void addNominalAttributeValue(String value, Map<Attribute, Double> attribute2value, String attributeName) { |
| 172 | 168 | Attribute att = getAttributeByName(attributeName); |
| 173 | 169 | int index = att.indexOfValue(value); |
| 174 | 170 | if (index == -1) |
| 175 | - LOG.warn(value + "not found for attribute " + attributeName); | |
| 171 | + LOG.warn("{} not found for attribute {}", value, attributeName); | |
| 176 | 172 | attribute2value.put(att, (double) (index == -1 ? att.indexOfValue("other") : index)); |
| 177 | 173 | } |
| 178 | 174 | } |
| ... | ... |
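Note: besides deleting the commented-out lemma sets and parameterizing the log call, this file replaces Optional.get() with orElse(null) plus an explicit check, satisfying Sonar's rule against calling get() without isPresent(). The same intent fits in a single orElseThrow call; a sketch with invented data:

    import java.util.stream.Stream;

    public class FirstAntecedentDemo {
        public static void main(String[] args) {
            String antecedent = Stream.of("on", "Jan")
                    .filter(s -> s.length() > 2)
                    .findFirst()
                    .orElseThrow(() -> new IllegalArgumentException("Mention pair without first element!"));
            System.out.println(antecedent); // Jan
        }
    }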
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/CorpusHelper.java
| ... | ... | @@ -32,7 +32,7 @@ public class CorpusHelper { |
| 32 | 32 | } |
| 33 | 33 | |
| 34 | 34 | public static List<Summary> getAbstractSummaries(Text text) { |
| 35 | - return text.getSummaries().getSummary().stream().filter(summary -> !summary.getType().equals(ABSTRACT_SUMMARY_TYPE) && summary.getRatio().equals(SUMMARY_RATIO)).collect(Collectors.toList()); | |
| 35 | + return text.getSummaries().getSummary().stream().filter(summary -> summary.getType().equals(ABSTRACT_SUMMARY_TYPE) && summary.getRatio().equals(SUMMARY_RATIO)).collect(Collectors.toList()); | |
| 36 | 36 | } |
| 37 | 37 | |
| 38 | 38 | public static Set<String> loadTrainTextIds() throws IOException { |
| ... | ... |
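Note: unlike the style fixes elsewhere in this commit, this one changes behavior: the stray negation made getAbstractSummaries return every summary at the target ratio except the abstract ones; dropping the ! makes the filter match the method name.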
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/PathConstants.java
| ... | ... | @@ -67,15 +67,19 @@ public class PathConstants { |
| 67 | 67 | } |
| 68 | 68 | } |
| 69 | 69 | |
| 70 | - public static void downloadFileAndExtract(String url, File targetZipFile, File targetDir) throws IOException, ZipException { | |
| 70 | + public static void downloadFileAndExtract(String url, File targetZipFile, File targetDir) throws IOException { | |
| 71 | 71 | downloadFile(url, targetZipFile); |
| 72 | 72 | extractZipFile(targetZipFile, targetDir); |
| 73 | 73 | } |
| 74 | 74 | |
| 75 | - private static void extractZipFile(File targetZipFile, File targetDir) throws ZipException { | |
| 75 | + private static void extractZipFile(File targetZipFile, File targetDir) throws IOException { | |
| 76 | 76 | createFolder(targetDir); |
| 77 | - ZipFile zipFile = new ZipFile(targetZipFile); | |
| 78 | - zipFile.extractAll(targetDir.getPath()); | |
| 77 | + try { | |
| 78 | + ZipFile zipFile = new ZipFile(targetZipFile); | |
| 79 | + zipFile.extractAll(targetDir.getPath()); | |
| 80 | + } catch (ZipException e) { | |
| 81 | + throw new IOException(e); | |
| 82 | + } | |
| 79 | 83 | LOG.info("Extracted zip file: {} to dir: {}.", targetZipFile, targetDir); |
| 80 | 84 | } |
| 81 | 85 | } |
| ... | ... |
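Note: zip4j's checked ZipException is translated into IOException, so callers only deal with one checked exception type; this matters in zip4j 1.x, where ZipException does not extend IOException (an assumption here, as is the 1.x import layout below):

    import java.io.File;
    import java.io.IOException;
    import net.lingala.zip4j.core.ZipFile;            // zip4j 1.x package layout (assumed)
    import net.lingala.zip4j.exception.ZipException;

    public class ZipExtractSketch {
        // Exception translation: the library-specific checked exception is
        // wrapped into the standard IOException before crossing the API boundary.
        static void extract(File zip, File targetDir) throws IOException {
            try {
                new ZipFile(zip).extractAll(targetDir.getPath());
            } catch (ZipException e) {
                throw new IOException(e);
            }
        }
    }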
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java
| ... | ... | @@ -106,6 +106,10 @@ class Crossvalidate { |
| 106 | 106 | return Pair.of(acc, name); |
| 107 | 107 | }).max(Comparator.comparingDouble(Pair::getLeft)); |
| 108 | 108 | |
| 109 | + printBestResult(watch, max); | |
| 110 | + } | |
| 111 | + | |
| 112 | + private static void printBestResult(StopWatch watch, Optional<Pair<Double, String>> max) { | |
| 109 | 113 | LOG.info("#########"); |
| 110 | 114 | if (max.isPresent()) { |
| 111 | 115 | LOG.info("Best: " + max.get().getRight() + " : " + max.get().getLeft()); |
| ... | ... | @@ -142,13 +146,6 @@ class Crossvalidate { |
| 142 | 146 | return Pair.of(acc, name); |
| 143 | 147 | }).max(Comparator.comparingDouble(Pair::getLeft)); |
| 144 | 148 | |
| 145 | - LOG.info("#########"); | |
| 146 | - if (max.isPresent()) { | |
| 147 | - LOG.info("Best: " + max.get().getRight() + " : " + max.get().getLeft()); | |
| 148 | - } else { | |
| 149 | - LOG.info("Empty algorithms list"); | |
| 150 | - } | |
| 151 | - watch.stop(); | |
| 152 | - LOG.info("Elapsed time: {}", watch); | |
| 149 | + printBestResult(watch, max); | |
| 153 | 150 | } |
| 154 | 151 | } |
| ... | ... |
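Note: the identical best-result/elapsed-time reporting block existed in both crossvalidation methods; hoisting it into printBestResult removes the duplication Sonar reports for repeated code blocks.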
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/MentionScorer.java
| ... | ... | @@ -22,33 +22,37 @@ public class MentionScorer { |
| 22 | 22 | Multiset<String> tokenCounts = HashMultiset.create(TextUtils.tokenize(optimalSummary.toLowerCase())); |
| 23 | 23 | |
| 24 | 24 | List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList()); |
| 25 | - Map<TMention, String> mention2Orth = loadMention2OrthExcludingStopwords(sentences); | |
| 25 | + Map<TMention, String> mention2Orth = loadMention2OrthExcludingStoptags(sentences); | |
| 26 | 26 | |
| 27 | 27 | return booleanTokenIntersection(mention2Orth, tokenCounts); |
| 28 | 28 | } |
| 29 | 29 | |
| 30 | - private Map<TMention, String> loadMention2OrthExcludingStopwords(List<TSentence> sentences) { | |
| 30 | + private Map<TMention, String> loadMention2OrthExcludingStoptags(List<TSentence> sentences) { | |
| 31 | 31 | Map<TMention, String> mention2orth = Maps.newHashMap(); |
| 32 | 32 | for (TSentence sentence : sentences) { |
| 33 | 33 | Map<String, TToken> tokId2tok = sentence.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity())); |
| 34 | 34 | |
| 35 | 35 | for (TMention mention : sentence.getMentions()) { |
| 36 | - StringBuilder mentionOrth = new StringBuilder(); | |
| 37 | - for (String tokId : mention.getChildIds()) { | |
| 38 | - TToken token = tokId2tok.get(tokId); | |
| 39 | - if (STOP_POS_TAGS.contains(token.getChosenInterpretation().getCtag())) | |
| 40 | - continue; | |
| 41 | - | |
| 42 | - if (!token.isNoPrecedingSpace()) | |
| 43 | - mentionOrth.append(" "); | |
| 44 | - mentionOrth.append(token.getOrth()); | |
| 45 | - } | |
| 46 | - mention2orth.put(mention, mentionOrth.toString().trim()); | |
| 36 | + mention2orth.put(mention, getMentionOrth(tokId2tok, mention)); | |
| 47 | 37 | } |
| 48 | 38 | } |
| 49 | 39 | return mention2orth; |
| 50 | 40 | } |
| 51 | 41 | |
| 42 | + private String getMentionOrth(Map<String, TToken> tokId2tok, TMention mention) { | |
| 43 | + StringBuilder mentionOrth = new StringBuilder(); | |
| 44 | + for (String tokId : mention.getChildIds()) { | |
| 45 | + TToken token = tokId2tok.get(tokId); | |
| 46 | + if (STOP_POS_TAGS.contains(token.getChosenInterpretation().getCtag())) | |
| 47 | + continue; | |
| 48 | + | |
| 49 | + if (!token.isNoPrecedingSpace()) | |
| 50 | + mentionOrth.append(" "); | |
| 51 | + mentionOrth.append(token.getOrth()); | |
| 52 | + } | |
| 53 | + return mentionOrth.toString().trim(); | |
| 54 | + } | |
| 55 | + | |
| 52 | 56 | private static Map<TMention, Double> booleanTokenIntersection(Map<TMention, String> mention2Orth, Multiset<String> tokenCounts) { |
| 53 | 57 | Map<TMention, Double> mention2score = Maps.newHashMap(); |
| 54 | 58 | for (Map.Entry<TMention, String> entry : mention2Orth.entrySet()) { |
| ... | ... |
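Note: the rename from ...ExcludingStopwords to ...ExcludingStoptags matches what the code actually filters on: tokens are skipped by POS tag (STOP_POS_TAGS), not by membership in a stopword list. The orth-building loop is also extracted into getMentionOrth, mirroring the identical extraction in FeatureHelper above.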
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractGoldSummaries.java
| ... | ... | @@ -17,7 +17,6 @@ import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; |
| 17 | 17 | |
| 18 | 18 | public class ExtractGoldSummaries { |
| 19 | 19 | |
| 20 | - | |
| 21 | 20 | private ExtractGoldSummaries() { |
| 22 | 21 | } |
| 23 | 22 | |
| ... | ... | @@ -28,25 +27,30 @@ public class ExtractGoldSummaries { |
| 28 | 27 | File[] files = EXTRACTED_CORPUS_DATA_DIR.listFiles(); |
| 29 | 28 | if (files != null) { |
| 30 | 29 | for (File file : files) { |
| 31 | - Text text = PSC_IO.readText(file); | |
| 30 | + extractGoldSummariesFromFile(file); | |
| 31 | + } | |
| 32 | + } | |
| 33 | + } | |
| 32 | 34 | |
| 33 | - List<Summary> goldSummaries; | |
| 35 | + private static void extractGoldSummariesFromFile(File file) throws IOException, JAXBException { | |
| 36 | + Text text = PSC_IO.readText(file); | |
| 34 | 37 | |
| 35 | - boolean isTest = CorpusHelper.isTest(text); | |
| 36 | - if (isTest) { | |
| 37 | - goldSummaries = CorpusHelper.getAbstractSummaries(text); | |
| 38 | - } else { | |
| 39 | - goldSummaries = CorpusHelper.getExtractSummaries(text); | |
| 40 | - } | |
| 38 | + List<Summary> goldSummaries; | |
| 39 | + File targetDir; | |
| 41 | 40 | |
| 42 | - for (Summary summary : goldSummaries) { | |
| 43 | - File targetDir = isTest ? GOLD_TEST_SUMMARIES_DIR : GOLD_TRAIN_SUMMARIES_DIR; | |
| 44 | - File targetFile = new File(targetDir, text.getId() + "_" + summary.getAuthor() + ".txt"); | |
| 41 | + boolean isTest = CorpusHelper.isTest(text); | |
| 42 | + if (isTest) { | |
| 43 | + goldSummaries = CorpusHelper.getAbstractSummaries(text); | |
| 44 | + targetDir = GOLD_TEST_SUMMARIES_DIR; | |
| 45 | + } else { | |
| 46 | + goldSummaries = CorpusHelper.getExtractSummaries(text); | |
| 47 | + targetDir = GOLD_TRAIN_SUMMARIES_DIR; | |
| 48 | + } | |
| 45 | 49 | |
| 46 | - try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(targetFile), Constants.ENCODING)) { | |
| 47 | - writer.append(summary.getBody()); | |
| 48 | - } | |
| 49 | - } | |
| 50 | + for (Summary summary : goldSummaries) { | |
| 51 | + File targetFile = new File(targetDir, text.getId() + "_" + summary.getAuthor() + ".txt"); | |
| 52 | + try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(targetFile), Constants.ENCODING)) { | |
| 53 | + writer.append(summary.getBody()); | |
| 50 | 54 | } |
| 51 | 55 | } |
| 52 | 56 | } |
| ... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractMostFrequentMentions.java
| ... | ... | @@ -9,7 +9,6 @@ import pl.waw.ipipan.zil.summ.nicolas.PathConstants; |
| 9 | 9 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; |
| 10 | 10 | import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; |
| 11 | 11 | |
| 12 | -import javax.xml.bind.JAXBException; | |
| 13 | 12 | import java.io.BufferedWriter; |
| 14 | 13 | import java.io.FileWriter; |
| 15 | 14 | import java.io.IOException; |
| ... | ... | @@ -26,7 +25,7 @@ public class ExtractMostFrequentMentions { |
| 26 | 25 | private ExtractMostFrequentMentions() { |
| 27 | 26 | } |
| 28 | 27 | |
| 29 | - public static void main(String[] args) throws IOException, JAXBException { | |
| 28 | + public static void main(String[] args) throws IOException { | |
| 30 | 29 | List<String> mostFrequentMentionBases = getMostFrequentMentionBases(); |
| 31 | 30 | try (BufferedWriter bw = new BufferedWriter(new FileWriter(PathConstants.TARGET_MODEL_DIR + Constants.FREQUENT_BASES_RESOURCE_PATH))) { |
| 32 | 31 | for (String base : mostFrequentMentionBases) { |
| ... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java
| ... | ... | @@ -89,7 +89,7 @@ public class PrepareTrainingData { |
| 89 | 89 | } |
| 90 | 90 | } |
| 91 | 91 | |
| 92 | - private static void prepareSentencesDataset(Map<String, TText> id2preprocessedText, Map<String, String> id2optimalSummary) throws Exception { | |
| 92 | + private static void prepareSentencesDataset(Map<String, TText> id2preprocessedText, Map<String, String> id2optimalSummary) throws IOException { | |
| 93 | 93 | |
| 94 | 94 | SentenceScorer sentenceScorer = new SentenceScorer(); |
| 95 | 95 | SentenceFeatureExtractor featureExtractor = new SentenceFeatureExtractor(); |
| ... | ... | @@ -97,10 +97,12 @@ public class PrepareTrainingData { |
| 97 | 97 | Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList()); |
| 98 | 98 | |
| 99 | 99 | int i = 1; |
| 100 | - for (String textId : id2preprocessedText.keySet()) { | |
| 100 | + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | |
| 101 | 101 | logProgress(id2preprocessedText, i++); |
| 102 | 102 | |
| 103 | - TText preprocessedText = id2preprocessedText.get(textId); | |
| 103 | + String textId = entry.getKey(); | |
| 104 | + TText preprocessedText = entry.getValue(); | |
| 105 | + | |
| 104 | 106 | String optimalSummary = id2optimalSummary.get(textId); |
| 105 | 107 | if (optimalSummary == null) |
| 106 | 108 | continue; |
| ... | ... | @@ -110,9 +112,9 @@ public class PrepareTrainingData { |
| 110 | 112 | = loadGoldGoodMentions(textId, preprocessedText, id2optimalSummary); |
| 111 | 113 | |
| 112 | 114 | Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(preprocessedText, featureExtractor, goodMentions); |
| 113 | - for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { | |
| 114 | - TSentence sentence = entry.getKey(); | |
| 115 | - Instance instance = entry.getValue(); | |
| 115 | + for (Map.Entry<TSentence, Instance> sentenceInstance : sentence2instance.entrySet()) { | |
| 116 | + TSentence sentence = sentenceInstance.getKey(); | |
| 117 | + Instance instance = sentenceInstance.getValue(); | |
| 116 | 118 | instance.setDataset(instances); |
| 117 | 119 | instance.setClassValue(sentence2score.get(sentence)); |
| 118 | 120 | instances.add(instance); |
| ... | ... | @@ -121,7 +123,7 @@ public class PrepareTrainingData { |
| 121 | 123 | saveInstancesToFile(instances, SENTENCE_ARFF); |
| 122 | 124 | } |
| 123 | 125 | |
| 124 | - private static Set<TMention> loadGoldGoodMentions(String id, TText text, Map<String, String> id2optimalSummary) throws IOException { | |
| 126 | + private static Set<TMention> loadGoldGoodMentions(String id, TText text, Map<String, String> id2optimalSummary) { | |
| 125 | 127 | String optimalSummary = id2optimalSummary.get(id); |
| 126 | 128 | |
| 127 | 129 | MentionScorer scorer = new MentionScorer(); |
| ... | ... |
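Note: same entrySet() pattern as in FeatureExtractor above, applied to the text-id map; the inner loop variable is renamed to avoid shadowing, and the throws clauses are narrowed from the blanket Exception (and an unthrown IOException) to what the methods can actually raise.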
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/TrainAllModels.java
| ... | ... | @@ -7,11 +7,10 @@ import pl.waw.ipipan.zil.summ.nicolas.Constants; |
| 7 | 7 | import pl.waw.ipipan.zil.summ.nicolas.train.model.Settings; |
| 8 | 8 | import weka.classifiers.Classifier; |
| 9 | 9 | import weka.core.Instances; |
| 10 | +import weka.core.SerializationHelper; | |
| 10 | 11 | import weka.core.converters.ArffLoader; |
| 11 | 12 | |
| 12 | 13 | import java.io.File; |
| 13 | -import java.io.FileOutputStream; | |
| 14 | -import java.io.ObjectOutputStream; | |
| 15 | 14 | import java.util.logging.LogManager; |
| 16 | 15 | |
| 17 | 16 | import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; |
| ... | ... | @@ -48,10 +47,7 @@ public class TrainAllModels { |
| 48 | 47 | |
| 49 | 48 | String target = TARGET_MODEL_DIR + targetPath; |
| 50 | 49 | LOG.info("Saving classifier at: {}", target); |
| 51 | - try (ObjectOutputStream oos = new ObjectOutputStream( | |
| 52 | - new FileOutputStream(target))) { | |
| 53 | - oos.writeObject(classifier); | |
| 54 | - } | |
| 50 | + SerializationHelper.write(target, classifier); | |
| 55 | 51 | |
| 56 | 52 | watch.stop(); |
| 57 | 53 | LOG.info("Elapsed time: {}", watch); |
| ... | ... |
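Note: weka.core.SerializationHelper wraps the stream plumbing the removed ObjectOutputStream code did by hand. A round-trip sketch (the classifier choice and file name are invented; assumes Weka on the classpath):

    import weka.classifiers.Classifier;
    import weka.classifiers.trees.J48;
    import weka.core.SerializationHelper;

    public class SaveModelDemo {
        public static void main(String[] args) throws Exception {
            Classifier classifier = new J48();
            SerializationHelper.write("model.bin", classifier);
            Classifier loaded = (Classifier) SerializationHelper.read("model.bin");
            System.out.println(loaded.getClass().getSimpleName()); // J48
        }
    }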