Commit e1126cdba70bd5287871ebbe89e9ae6635bb5a01 (0 parents)

rough draft

Showing 28 changed files with 2105 additions and 0 deletions.
.gitignore
0 → 100644
+++ a/.gitignore
+# Created by .ignore support plugin (hsz.mobi)
+### Java template
+*.class
+target/
+
+# Mobile Tools for Java (J2ME)
+.mtj.tmp/
+
+# Package Files #
+*.jar
+*.war
+*.ear
+
+# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
+hs_err_pid*
+
+.idea
+*.iml
\ No newline at end of file
nicolas-cli/pom.xml
0 → 100644
+++ a/nicolas-cli/pom.xml
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <parent>
+        <artifactId>nicolas-container</artifactId>
+        <groupId>pl.waw.ipipan.zil.summ</groupId>
+        <version>1.0-SNAPSHOT</version>
+    </parent>
+
+    <artifactId>nicolas-cli</artifactId>
+
+</project>
\ No newline at end of file
nicolas-core/pom.xml
0 → 100644
+++ a/nicolas-core/pom.xml
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <parent>
+        <artifactId>nicolas-container</artifactId>
+        <groupId>pl.waw.ipipan.zil.summ</groupId>
+        <version>1.0-SNAPSHOT</version>
+    </parent>
+
+    <artifactId>nicolas</artifactId>
+
+    <dependencies>
+        <dependency>
+            <groupId>pl.waw.ipipan.zil.summ</groupId>
+            <artifactId>nicolas-model</artifactId>
+            <version>${project.version}</version>
+            <scope>runtime</scope>
+        </dependency>
+    </dependencies>
+</project>
\ No newline at end of file
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Constants.java
0 → 100644
+++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Constants.java
+package pl.waw.ipipan.zil.summ.nicolas;
+
+import weka.classifiers.Classifier;
+import weka.classifiers.trees.RandomForest;
+
+
+public class Constants {
+
+    public static final String MENTIONS_MODEL_PATH = "mentions_model.bin";
+    public static final String SENTENCES_MODEL_PATH = "sentences_model.bin";
+    public static final String MENTIONS_DATASET_PATH = "mentions_train.arff";
+    public static final String SENTENCES_DATASET_PATH = "sentences_train.arff";
+
+    private Constants() {
+    }
+
+    // classifier used for mention selection; currently configured identically
+    // to the sentence classifier below
+    public static Classifier getClassifier() {
+        RandomForest classifier = new RandomForest();
+        classifier.setNumIterations(250);
+        classifier.setSeed(0);
+        classifier.setNumExecutionSlots(8);
+        return classifier;
+    }
+
+    public static Classifier getSentencesClassifier() {
+        RandomForest classifier = new RandomForest();
+        classifier.setNumIterations(250);
+        classifier.setSeed(0);
+        classifier.setNumExecutionSlots(8);
+        return classifier;
+    }
+}
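For orientation, a minimal sketch (not part of this commit) of how such a Weka RandomForest is trained and queried. The relation and attribute names below are illustrative assumptions; the class attribute sits at index 0, as the rest of this commit assumes:

    import pl.waw.ipipan.zil.summ.nicolas.Constants;
    import weka.classifiers.Classifier;
    import weka.core.Attribute;
    import weka.core.DenseInstance;
    import weka.core.Instances;

    import java.util.ArrayList;
    import java.util.Arrays;

    public class ClassifierSketch {
        public static void main(String[] args) throws Exception {
            // class attribute first (index 0), as Utils.createNewInstances assumes
            ArrayList<Attribute> attrs = new ArrayList<>();
            attrs.add(new Attribute("score", Arrays.asList("bad", "good")));
            attrs.add(new Attribute("some_feature")); // illustrative numeric feature
            Instances data = new Instances("Dataset", attrs, 0);
            data.setClassIndex(0);

            // two labelled training rows
            DenseInstance bad = new DenseInstance(2);
            bad.setValue(attrs.get(0), "bad");
            bad.setValue(attrs.get(1), 0.1);
            data.add(bad);
            DenseInstance good = new DenseInstance(2);
            good.setValue(attrs.get(0), "good");
            good.setValue(attrs.get(1), 0.9);
            data.add(good);

            Classifier classifier = Constants.getClassifier();
            classifier.buildClassifier(data);

            // classifyInstance returns the index of the predicted nominal value
            double predicted = classifier.classifyInstance(data.firstInstance());
            System.out.println(data.classAttribute().value((int) predicted));
        }
    }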
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
0 → 100644
+++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
+package pl.waw.ipipan.zil.summ.nicolas;
+
+import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
+
+public class Nicolas {
+
+    // placeholder in this rough draft: the real summarizer is not wired up yet
+    public String summarizeThrift(TText text, int targetTokenCount) {
+        return "test nicolas";
+    }
+
+}
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Utils.java
0 → 100644
+++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Utils.java
+package pl.waw.ipipan.zil.summ.nicolas;
+
+import com.google.common.base.Charsets;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
+import com.google.common.io.Files;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
+import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
+import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
+import pl.waw.ipipan.zil.multiservice.thrift.types.TToken;
+import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
+import pl.waw.ipipan.zil.summ.nicolas.mention.MentionScorer;
+import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
+import weka.classifiers.Classifier;
+import weka.core.Attribute;
+import weka.core.DenseInstance;
+import weka.core.Instance;
+import weka.core.Instances;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.util.*;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+
+import static java.util.stream.Collectors.toList;
+
+public class Utils {
+
+    private static final Logger LOG = LoggerFactory.getLogger(Utils.class);
+
+    private static final String DATASET_NAME = "Dataset";
+
+    public static Map<TMention, Instance> extractInstancesFromMentions(TText preprocessedText, MentionFeatureExtractor featureExtractor) {
+        List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
+        Map<TMention, Map<Attribute, Double>> mention2features = featureExtractor.calculateFeatures(preprocessedText);
+
+        LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each mention.");
+        Map<TMention, Instance> mention2instance = Maps.newHashMap();
+        for (TMention tMention : sentences.stream().flatMap(s -> s.getMentions().stream()).collect(toList())) {
+            Instance instance = new DenseInstance(featureExtractor.getAttributesList().size());
+            Map<Attribute, Double> mentionFeatures = mention2features.get(tMention);
+            for (Attribute attribute : featureExtractor.getAttributesList()) {
+                instance.setValue(attribute, mentionFeatures.get(attribute));
+            }
+            mention2instance.put(tMention, instance);
+        }
+        return mention2instance;
+    }
+
+    public static Map<TSentence, Instance> extractInstancesFromSentences(TText preprocessedText, SentenceFeatureExtractor featureExtractor, Set<TMention> goodMentions) {
+        List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
+        Map<TSentence, Map<Attribute, Double>> sentence2features = featureExtractor.calculateFeatures(preprocessedText, goodMentions);
+
+        LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each sentence.");
+        Map<TSentence, Instance> sentence2instance = Maps.newHashMap();
+        for (TSentence sentence : sentences) {
+            Instance instance = new DenseInstance(featureExtractor.getAttributesList().size());
+            Map<Attribute, Double> sentenceFeatures = sentence2features.get(sentence);
+            for (Attribute attribute : featureExtractor.getAttributesList()) {
+                instance.setValue(attribute, sentenceFeatures.get(attribute));
+            }
+            sentence2instance.put(sentence, instance);
+        }
+        return sentence2instance;
+    }
+
+    public static Instances createNewInstances(ArrayList<Attribute> attributesList) {
+        Instances instances = new Instances(DATASET_NAME, attributesList, 0);
+        instances.setClassIndex(0);
+        return instances;
+    }
+
+    public static Classifier loadClassifier(String path) throws IOException, ClassNotFoundException {
+        LOG.info("Loading classifier...");
+        try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(path))) {
+            Classifier classifier = (Classifier) ois.readObject();
+            LOG.info("Done. " + classifier.toString());
+            return classifier;
+        }
+    }
+
+    public static Map<String, TText> loadPreprocessedTexts(String path) {
+        Map<String, TText> id2text = Maps.newHashMap();
+        for (File processedFullTextFile : new File(path).listFiles()) {
+            TText processedFullText = loadThrifted(processedFullTextFile);
+            id2text.put(processedFullTextFile.getName().split("\\.")[0], processedFullText);
+        }
+        LOG.info(id2text.size() + " preprocessed texts found.");
+        return id2text;
+    }
+
+    public static TText loadThrifted(File originalFile) {
+        try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(originalFile))) {
+            return (TText) ois.readObject();
+        } catch (ClassNotFoundException | IOException e) {
+            LOG.error("Error reading serialized file: " + e);
+            return null;
+        }
+    }
+
+    public static List<String> tokenize(String text) {
+        return Arrays.asList(text.split("[^\\p{L}0-9]+"));
+    }
+
+    public static List<String> tokenizeOnWhitespace(String text) {
+        return Arrays.asList(text.split(" +"));
+    }
+
+    public static Map<TMention, String> loadMention2HeadOrth(List<TSentence> sents) {
+        Map<TMention, String> mention2orth = Maps.newHashMap();
+        for (TSentence s : sents) {
+            Map<String, String> tokId2orth = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, TToken::getOrth));
+            Map<String, Boolean> tokId2nps = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, TToken::isNoPrecedingSpace));
+
+            for (TMention m : s.getMentions()) {
+                StringBuilder mentionOrth = new StringBuilder();
+                for (String tokId : m.getHeadIds()) {
+                    if (!tokId2nps.get(tokId))
+                        mentionOrth.append(" ");
+                    mentionOrth.append(tokId2orth.get(tokId));
+                }
+                mention2orth.put(m, mentionOrth.toString().trim());
+            }
+        }
+        return mention2orth;
+    }
+
+    private static final Collection<String> STOPWORDS = Sets.newHashSet();
+
+    static {
+        STOPWORDS.addAll(Lists.newArrayList("i", "się", "to", "co"));
+    }
+
+    public static Map<TMention, String> loadMention2Orth(List<TSentence> sents) {
+        Map<TMention, String> mention2orth = Maps.newHashMap();
+        for (TSentence s : sents) {
+            Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity()));
+
+            for (TMention m : s.getMentions()) {
+                StringBuilder mentionOrth = new StringBuilder();
+                for (String tokId : m.getChildIds()) {
+                    TToken token = tokId2tok.get(tokId);
+                    if (STOPWORDS.contains(token.getChosenInterpretation().getBase().toLowerCase())) {
+                        continue;
+                    }
+
+                    if (!token.isNoPrecedingSpace())
+                        mentionOrth.append(" ");
+                    mentionOrth.append(token.getOrth());
+                }
+                mention2orth.put(m, mentionOrth.toString().trim());
+            }
+        }
+        return mention2orth;
+    }
+
+    public static Map<TMention, String> loadMention2Base(List<TSentence> sents) {
+        Map<TMention, String> mention2base = Maps.newHashMap();
+        for (TSentence s : sents) {
+            Map<String, String> tokId2base = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, tok -> tok.getChosenInterpretation().getBase()));
+
+            for (TMention m : s.getMentions()) {
+                StringBuilder mentionBase = new StringBuilder();
+                for (String tokId : m.getChildIds()) {
+                    mentionBase.append(" ");
+                    mentionBase.append(tokId2base.get(tokId));
+                }
+                mention2base.put(m, mentionBase.toString().toLowerCase().trim());
+            }
+        }
+        return mention2base;
+    }
+
+    public static String loadSentence2Orth(TSentence sentence) {
+        StringBuilder sb = new StringBuilder();
+        for (TToken token : sentence.getTokens()) {
+            if (!token.isNoPrecedingSpace())
+                sb.append(" ");
+            sb.append(token.getOrth());
+        }
+        return sb.toString().trim();
+    }
+
+    public static Set<TMention> loadGoldGoodMentions(String id, TText text, boolean dev) throws IOException {
+        String optimalSummary = Files.toString(new File("src/main/resources/optimal_summaries/" + (dev ? "dev" : "test") + "/" + id + "_theoretic_ub_rouge_1.txt"), Charsets.UTF_8);
+
+        MentionScorer scorer = new MentionScorer();
+        Map<TMention, Double> mention2score = scorer.calculateMentionScores(optimalSummary, text);
+
+        mention2score.keySet().removeIf(tMention -> mention2score.get(tMention) != 1.0);
+        return mention2score.keySet();
+    }
+}
\ No newline at end of file
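A quick illustration, not part of the commit, of the two tokenizers defined above: tokenize splits on any run of characters that are neither letters nor digits, while tokenizeOnWhitespace splits on spaces only, so punctuation stays attached. The input sentence is hypothetical:

    import pl.waw.ipipan.zil.summ.nicolas.Utils;

    public class TokenizeDemo {
        public static void main(String[] args) {
            System.out.println(Utils.tokenize("Ala ma kota, psa i 2 rybki."));
            // -> [Ala, ma, kota, psa, i, 2, rybki]
            System.out.println(Utils.tokenizeOnWhitespace("Ala ma kota, psa i 2 rybki."));
            // -> [Ala, ma, kota,, psa, i, 2, rybki.]
        }
    }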
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel2.java
0 → 100644
+++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel2.java
+package pl.waw.ipipan.zil.summ.nicolas.apply;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
+import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
+import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
+import pl.waw.ipipan.zil.summ.nicolas.Constants;
+import pl.waw.ipipan.zil.summ.nicolas.Utils;
+import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
+import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel;
+import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
+import weka.classifiers.Classifier;
+import weka.core.Instance;
+import weka.core.Instances;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileWriter;
+import java.util.*;
+
+import static java.util.stream.Collectors.toList;
+
+public class ApplyModel2 {
+
+    private static final Logger LOG = LoggerFactory.getLogger(ApplyModel2.class);
+
+    private static final String TEST_PREPROCESSED_DATA_PATH = "src/main/resources/preprocessed_full_texts/test";
+    private static final String TARGET_DIR = "summaries";
+
+    public static void main(String[] args) throws Exception {
+        Classifier mentionClassifier = Utils.loadClassifier(Constants.MENTIONS_MODEL_PATH);
+        MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor();
+
+        Classifier sentenceClassifier = Utils.loadClassifier(Constants.SENTENCES_MODEL_PATH);
+        SentenceFeatureExtractor sentenceFeatureExtractor = new SentenceFeatureExtractor();
+
+        Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(TEST_PREPROCESSED_DATA_PATH);
+        int i = 1;
+        double avgSize = 0;
+        for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {
+            TText text = entry.getValue();
+
+            Set<TMention> goodMentions
+                    = MentionModel.detectGoodMentions(mentionClassifier, featureExtractor, text);
+
+            int targetSize = calculateTargetSize(text);
+            String summary = calculateSummary(text, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor);
+            int size = Utils.tokenize(summary).size();
+            avgSize += size;
+            try (BufferedWriter bw = new BufferedWriter(new FileWriter(new File(TARGET_DIR, entry.getKey() + "_emily3.txt")))) {
+                bw.append(summary);
+            }
+
+            LOG.info(i++ + "/" + id2preprocessedText.size() + " id: " + entry.getKey());
+        }
+
+        LOG.info("Avg size:" + avgSize / id2preprocessedText.size());
+    }
+
+    // summary budget: 20% of the source text's whitespace-token count
+    private static int calculateTargetSize(TText text) {
+        List<TSentence> sents = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
+        StringBuilder body = new StringBuilder();
+        for (TSentence sent : sents)
+            body.append(Utils.loadSentence2Orth(sent)).append(" ");
+        int tokenCount = Utils.tokenizeOnWhitespace(body.toString().trim()).size();
+        return (int) (0.2 * tokenCount);
+    }
+
+    private static String calculateSummary(TText thrifted, Set<TMention> goodMentions, int targetSize, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception {
+        List<TSentence> selectedSentences = selectSummarySentences(thrifted, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor);
+
+        StringBuilder sb = new StringBuilder();
+        for (TSentence sent : selectedSentences) {
+            sb.append(" ").append(Utils.loadSentence2Orth(sent));
+        }
+        return sb.toString().trim();
+    }
+
+    private static List<TSentence> selectSummarySentences(TText thrifted, Set<TMention> goodMentions, int targetSize, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception {
+
+        List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
+
+        Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList());
+        Map<TSentence, Instance> sentence2instance = Utils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions);
+
+        Map<TSentence, Double> sentence2score = Maps.newHashMap();
+        for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) {
+            Instance instance = entry.getValue();
+            instance.setDataset(instances);
+            double score = sentenceClassifier.classifyInstance(instance);
+            sentence2score.put(entry.getKey(), score);
+        }
+
+        List<TSentence> sortedSents = Lists.newArrayList(sents);
+        Collections.sort(sortedSents, Comparator.comparing(sentence2score::get).reversed());
+
+        int size = 0;
+        Random r = new Random(1);
+        Set<TSentence> summary = Sets.newHashSet();
+        for (TSentence sent : sortedSents) {
+            size += Utils.tokenizeOnWhitespace(Utils.loadSentence2Orth(sent)).size();
+            // once the budget is crossed, drop the boundary sentence with
+            // probability 0.6; otherwise keep it, and stop either way
+            if (r.nextDouble() > 0.4 && size > targetSize)
+                break;
+            summary.add(sent);
+            if (size > targetSize)
+                break;
+        }
+        List<TSentence> selectedSentences = Lists.newArrayList();
+        for (TSentence sent : sents) {
+            if (summary.contains(sent))
+                selectedSentences.add(sent);
+        }
+        return selectedSentences;
+    }
+
+}
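The selection loop above keeps the budget-crossing sentence with probability 0.4; everything else is a plain greedy scan in score order. A deterministic sketch of that scan, under the assumption that the randomized boundary handling is dropped (the class name is illustrative):

    import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
    import pl.waw.ipipan.zil.summ.nicolas.Utils;

    import java.util.ArrayList;
    import java.util.List;

    class GreedySelectionSketch {
        // sentences arrive sorted by classifier score, best first
        static List<TSentence> greedySelect(List<TSentence> sortedByScoreDesc, int targetSize) {
            List<TSentence> selected = new ArrayList<>();
            int size = 0;
            for (TSentence sent : sortedByScoreDesc) {
                selected.add(sent);
                size += Utils.tokenizeOnWhitespace(Utils.loadSentence2Orth(sent)).size();
                if (size > targetSize) {
                    break; // budget of 20% of source tokens reached
                }
            }
            return selected;
        }
    }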
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureExtractor.java
0 → 100644
+++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureExtractor.java
+package pl.waw.ipipan.zil.summ.nicolas.features;
+
+import com.google.common.collect.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import weka.core.Attribute;
+
+import java.util.*;
+
+public class FeatureExtractor {
+
+    protected static final Logger LOG = LoggerFactory.getLogger(FeatureExtractor.class);
+
+    private final List<Attribute> sortedAttributes = Lists.newArrayList();
+
+    private final BiMap<String, Attribute> name2attribute = HashBiMap.create();
+
+    private final Set<String> normalizedAttributes = Sets.newHashSet();
+
+    public ArrayList<Attribute> getAttributesList() {
+        return Lists.newArrayList(sortedAttributes);
+    }
+
+    protected Attribute getAttributeByName(String name) {
+        return name2attribute.get(name);
+    }
+
+    protected void addNumericAttribute(String attributeName) {
+        name2attribute.put(attributeName, new Attribute(attributeName));
+    }
+
+    protected void addBinaryAttribute(String attributeName) {
+        name2attribute.put(attributeName, new Attribute(attributeName, Lists.newArrayList("f", "t")));
+    }
+
+    protected void addNominalAttribute(String attributeName, List<String> values) {
+        name2attribute.put(attributeName, new Attribute(attributeName, values));
+    }
+
+    protected void addNumericAttributeNormalized(String attributeName) {
+        addNumericAttribute(attributeName);
+        addNumericAttribute(attributeName + "_normalized");
+        normalizedAttributes.add(attributeName);
+    }
+
+    protected void fillSortedAttributes(String scoreAttName) {
+        sortedAttributes.addAll(name2attribute.values());
+        sortedAttributes.remove(getAttributeByName(scoreAttName));
+        Collections.sort(sortedAttributes, (o1, o2) -> name2attribute.inverse().get(o1).compareTo(name2attribute.inverse().get(o2)));
+        sortedAttributes.add(0, getAttributeByName(scoreAttName));
+    }
+
+    protected <T> void addNormalizedAttributeValues(Map<T, Map<Attribute, Double>> entity2attributes) {
+        Map<Attribute, Double> attribute2max = Maps.newHashMap();
+        Map<Attribute, Double> attribute2min = Maps.newHashMap();
+        for (T entity : entity2attributes.keySet()) {
+            Map<Attribute, Double> entityAttributes = entity2attributes.get(entity);
+            for (String attributeName : normalizedAttributes) {
+                Attribute attribute = getAttributeByName(attributeName);
+                Double value = entityAttributes.get(attribute);
+
+                // start from the infinities: Double.MIN_VALUE is the smallest
+                // positive double and would break the max for negative values
+                attribute2max.putIfAbsent(attribute, Double.NEGATIVE_INFINITY);
+                attribute2max.compute(attribute, (k, v) -> Math.max(v, value));
+
+                attribute2min.putIfAbsent(attribute, Double.POSITIVE_INFINITY);
+                attribute2min.compute(attribute, (k, v) -> Math.min(v, value));
+            }
+        }
+        for (T mention : entity2attributes.keySet()) {
+            Map<Attribute, Double> entityAttributes = entity2attributes.get(mention);
+            for (Attribute attribute : attribute2max.keySet()) {
+                Attribute normalizedAttribute = getAttributeByName(name2attribute.inverse().get(attribute) + "_normalized");
+                double range = attribute2max.get(attribute) - attribute2min.get(attribute);
+                entityAttributes.put(normalizedAttribute,
+                        range == 0 ? 0.0 : (entityAttributes.get(attribute) - attribute2min.get(attribute)) / range);
+            }
+        }
+    }
+
+    protected double toBinary(boolean bool) {
+        return bool ? 1.0 : 0.0;
+    }
+}
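The *_normalized attributes use plain min-max scaling, (x - min) / (max - min); for example the values {2, 5, 8} map to {0.0, 0.5, 1.0}. A tiny self-contained check:

    public class MinMaxDemo {
        public static void main(String[] args) {
            double[] values = {2, 5, 8};
            double min = 2, max = 8;
            for (double x : values) {
                // (x - min) / (max - min): 2 -> 0.0, 5 -> 0.5, 8 -> 1.0
                System.out.println((x - min) / (max - min));
            }
        }
    }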
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java
0 → 100644
+++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java
+package pl.waw.ipipan.zil.summ.nicolas.features;
+
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
+import pl.waw.ipipan.zil.multiservice.thrift.types.*;
+import pl.waw.ipipan.zil.summ.nicolas.Utils;
+
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+
+import static java.util.stream.Collectors.toList;
+import static java.util.stream.Collectors.toMap;
+
+/**
+ * Created by me2 on 04.04.16.
+ */
+public class FeatureHelper {
+
+    private final List<TMention> mentions;
+    private final Map<String, TMention> mentionId2mention;
+    private final Map<TCoreference, List<TMention>> coref2mentions = Maps.newHashMap();
+    private final Map<TMention, TCoreference> mention2coref = Maps.newHashMap();
+    private final Map<TMention, TSentence> mention2sent = Maps.newHashMap();
+    private final Map<TMention, TParagraph> mention2par = Maps.newHashMap();
+    private final Map<TMention, String> mention2Orth = Maps.newHashMap();
+    private final Map<TMention, String> mention2Base = Maps.newHashMap();
+    private final Map<TMention, TToken> mention2head = Maps.newHashMap();
+    private final Set<TMention> mentionsInNamedEntities = Sets.newHashSet();
+
+    private final Map<TMention, Integer> mention2Index = Maps.newHashMap();
+    private final Map<TSentence, Integer> sent2Index = Maps.newHashMap();
+    private final Map<TParagraph, Integer> par2Index = Maps.newHashMap();
+    private final Map<TSentence, Integer> sent2IndexInPar = Maps.newHashMap();
+    private final Map<TMention, Integer> mention2indexInPar = Maps.newHashMap();
+    private final Map<TMention, Integer> mention2indexInSent = Maps.newHashMap();
+
+
+    public FeatureHelper(TText preprocessedText) {
+        mentions = preprocessedText.getParagraphs().stream()
+                .flatMap(p -> p.getSentences().stream())
+                .flatMap(s -> s.getMentions().stream()).collect(Collectors.toList());
+
+        mentionId2mention = mentions.stream().collect(Collectors.toMap(TMention::getId, Function.identity()));
+
+        for (TCoreference coref : preprocessedText.getCoreferences()) {
+            List<TMention> ments = coref.getMentionIds().stream().map(mentionId2mention::get).collect(toList());
+            for (TMention m : ments) {
+                mention2coref.put(m, coref);
+            }
+            coref2mentions.put(coref, ments);
+        }
+
+        int parIdx = 0;
+        int sentIdx = 0;
+        int mentionIdx = 0;
+        for (TParagraph par : preprocessedText.getParagraphs()) {
+            Map<TMention, String> m2o = Utils.loadMention2Orth(par.getSentences());
+            mention2Orth.putAll(m2o);
+            Map<TMention, String> m2b = Utils.loadMention2Base(par.getSentences());
+            mention2Base.putAll(m2b);
+
+            int sentIdxInPar = 0;
+            int mentionIdxInPar = 0;
+            for (TSentence sent : par.getSentences()) {
+
+                Map<String, TToken> tokenId2token = sent.getTokens().stream().collect(toMap(TToken::getId, Function.identity()));
+
+                Map<String, Set<TNamedEntity>> tokenId2namedEntities = Maps.newHashMap();
+                for (TNamedEntity namedEntity : sent.getNames()) {
+                    for (String childId : namedEntity.getChildIds()) {
+                        tokenId2namedEntities.putIfAbsent(childId, Sets.newHashSet());
+                        tokenId2namedEntities.get(childId).add(namedEntity);
+                    }
+                }
+
+                int mentionIdxInSent = 0;
+                for (TMention mention : sent.getMentions()) {
+                    mention2sent.put(mention, sent);
+                    mention2par.put(mention, par);
+                    mention2Index.put(mention, mentionIdx++);
+                    mention2indexInSent.put(mention, mentionIdxInSent++);
+                    mention2indexInPar.put(mention, mentionIdxInPar++);
+
+                    String firstHeadTokenId = mention.getHeadIds().iterator().next();
+                    mention2head.put(mention, tokenId2token.get(firstHeadTokenId));
+                    if (tokenId2namedEntities.containsKey(firstHeadTokenId))
+                        mentionsInNamedEntities.add(mention);
+                }
+                sent2Index.put(sent, sentIdx++);
+                sent2IndexInPar.put(sent, sentIdxInPar++);
+            }
+
+            par2Index.put(par, parIdx++);
+        }
+    }
+
+    public List<TMention> getMentions() {
+        return mentions;
+    }
+
+    public int getMentionIndexInChain(TMention mention) {
+        return coref2mentions.get(mention2coref.get(mention)).indexOf(mention);
+    }
+
+    public int getChainLength(TMention mention) {
+        return coref2mentions.get(mention2coref.get(mention)).size();
+    }
+
+    public String getSentenceLastTokenOrth(TSentence sent) {
+        return sent.getTokens().get(sent.getTokensSize() - 1).getOrth();
+    }
+
+    public String getMentionOrth(TMention mention) {
+        return mention2Orth.get(mention);
+    }
+
+    public String getMentionBase(TMention mention) {
+        return mention2Base.get(mention);
+    }
+
+    public int getMentionIndex(TMention mention) {
+        return mention2Index.get(mention);
+    }
+
+    public int getMentionIndexInSent(TMention mention) {
+        return mention2indexInSent.get(mention);
+    }
+
+    public int getMentionIndexInPar(TMention mention) {
+        return mention2indexInPar.get(mention);
+    }
+
+    public int getParIndex(TParagraph paragraph) {
+        return par2Index.get(paragraph);
+    }
+
+    public int getSentIndex(TSentence sent) {
+        return sent2Index.get(sent);
+    }
+
+    public int getSentIndexInPar(TSentence sent) {
+        return sent2IndexInPar.get(sent);
+    }
+
+    public TParagraph getMentionParagraph(TMention mention) {
+        return mention2par.get(mention);
+    }
+
+    public TSentence getMentionSentence(TMention mention) {
+        return mention2sent.get(mention);
+    }
+
+    public TMention getFirstChainMention(TMention mention) {
+        return mentionId2mention.get(mention2coref.get(mention).getMentionIdsIterator().next());
+    }
+
+    public TToken getMentionHeadToken(TMention mention) {
+        return mention2head.get(mention);
+    }
+
+    public boolean isMentionNamedEntity(TMention mention) {
+        return mentionsInNamedEntities.contains(mention);
+    }
+
+    public boolean isNested(TMention mention) {
+        // exclude the mention itself, which trivially contains all of its own tokens
+        return mentions.stream().anyMatch(m -> m != mention && m.getChildIds().containsAll(mention.getChildIds()));
+    }
+
+    public boolean isNesting(TMention mention) {
+        return mentions.stream().anyMatch(m -> m != mention && mention.getChildIds().containsAll(m.getChildIds()));
+    }
+
+    public Set<TCoreference> getClusters() {
+        return coref2mentions.keySet();
+    }
+
+    public Set<TMention> getCoreferentMentions(TMention tMention) {
+        return getMentionCluster(tMention).getMentionIds().stream().map(this.mentionId2mention::get).collect(Collectors.toSet());
+    }
+
+    public TCoreference getMentionCluster(TMention tMention) {
+        return this.mention2coref.get(tMention);
+    }
+}
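For intuition on the containsAll-based nesting checks above: a mention whose token ids are a superset of another mention's token ids nests it. The token ids and mention texts below are hypothetical:

    import java.util.Arrays;
    import java.util.List;

    public class NestingDemo {
        public static void main(String[] args) {
            // hypothetical spans, e.g. "prezydent Polski" and "Polski"
            List<String> outer = Arrays.asList("t1", "t2");
            List<String> inner = Arrays.asList("t2");
            System.out.println(outer.containsAll(inner)); // true: outer nests inner
            System.out.println(inner.containsAll(outer)); // false: inner is nested, not nesting
        }
    }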
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/Interpretation.java
0 → 100644
+++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/Interpretation.java
+package pl.waw.ipipan.zil.summ.nicolas.features;
+
+import pl.waw.ipipan.zil.multiservice.thrift.types.TInterpretation;
+
+
+public class Interpretation {
+    private String ctag = "null";
+    private String casee = "null"; // "case" is a reserved word in Java
+    private String gender = "null";
+    private String number = "null";
+    private String person = "null";
+
+    public Interpretation(TInterpretation chosenInterpretation) {
+        ctag = chosenInterpretation.getCtag();
+        // the msd string is positional; which positions mean what depends on the ctag
+        String[] split = chosenInterpretation.getMsd().split(":");
+        switch (ctag) {
+            case "ger":
+            case "subst":
+            case "pact":
+            case "ppas":
+            case "num":
+            case "numcol":
+            case "adj":
+                number = split[0];
+                casee = split[1];
+                gender = split[2];
+                break;
+            case "ppron12":
+            case "ppron3":
+                number = split[0];
+                casee = split[1];
+                gender = split[2];
+                person = split[3];
+                break;
+            case "siebie":
+                casee = split[0];
+                break;
+            case "fin":
+            case "bedzie":
+            case "aglt":
+            case "impt":
+                number = split[0];
+                person = split[1];
+                break;
+            case "praet":
+            case "winien":
+                number = split[0];
+                gender = split[1];
+                break;
+            case "prep":
+                casee = split[0];
+                break;
+            default:
+                break;
+        }
+    }
+
+    public String getCase() {
+        return casee;
+    }
+
+    public String getGender() {
+        return gender;
+    }
+
+    public String getNumber() {
+        return number;
+    }
+
+    public String getPerson() {
+        return person;
+    }
+
+    public String getCtag() {
+        return ctag;
+    }
+}
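A small standalone illustration of the positional msd convention handled by the switch above, with an assumed NKJP-style tag: for ctag "subst" the positions are number:case:gender, so "sg:nom:m1" yields number=sg, case=nom, gender=m1. The msd value is illustrative, not taken from the commit:

    public class MsdDemo {
        public static void main(String[] args) {
            // for ctag "subst" the positions are number:case:gender
            String msd = "sg:nom:m1";
            String[] split = msd.split(":");
            System.out.println("number=" + split[0] + " case=" + split[1] + " gender=" + split[2]);
        }
    }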
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java
0 → 100644
+++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java
+package pl.waw.ipipan.zil.summ.nicolas.mention;
+
+import com.google.common.collect.*;
+import pl.waw.ipipan.zil.multiservice.thrift.types.*;
+import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor;
+import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
+import pl.waw.ipipan.zil.summ.nicolas.features.Interpretation;
+import weka.core.Attribute;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.util.*;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+
+public class MentionFeatureExtractor extends FeatureExtractor {
+
+    private final List<String> frequentBases = Lists.newArrayList();
+
+    public MentionFeatureExtractor() {
+
+        // load once, before the per-prefix loop below (loading inside the loop
+        // would append the same bases twice)
+        loadFrequentBases();
+
+        //coref
+        addNumericAttributeNormalized("chain_length");
+
+        // text characteristics
+        addNumericAttribute("text_char_count"); // set in calculateFeatures below
+        addNumericAttribute("text_token_count");
+        addNumericAttribute("text_sent_count");
+        addNumericAttribute("text_par_count");
+        addNumericAttribute("text_mention_count");
+        addNumericAttribute("text_cluster_count");
+
+        //mention characteristics
+        for (String prefix : Lists.newArrayList("mention", "chain_first_mention")) {
+            // mention characteristics
+            addNumericAttributeNormalized(prefix + "_index");
+            addNumericAttributeNormalized(prefix + "_index_in_sent");
+            addNumericAttributeNormalized(prefix + "_index_in_par");
+            addNumericAttributeNormalized(prefix + "_index_in_chain");
+            addBinaryAttribute(prefix + "_capitalized");
+            addBinaryAttribute(prefix + "_all_caps");
+            addNumericAttributeNormalized(prefix + "_char_count");
+            addNumericAttributeNormalized(prefix + "_token_count");
+            addBinaryAttribute(prefix + "_is_zero");
+            addBinaryAttribute(prefix + "_is_named");
+            addBinaryAttribute(prefix + "_is_pronoun");
+            addNominalAttribute(prefix + "_ctag", Lists.newArrayList("other", "null", "impt", "subst", "aglt", "ppron3", "ger", "praet", "fin", "num", "interp", "siebie", "brev", "interj", "ppron12", "adj", "burk", "pcon", "bedzie", "adv", "prep", "depr", "xxx", "winien", "conj", "qub", "adja", "ppas", "comp", "pact"));
+            addNominalAttribute(prefix + "_person", Lists.newArrayList("other", "null", "pri", "sec", "ter"));
+            addNominalAttribute(prefix + "_case", Lists.newArrayList("other", "null", "nom", "acc", "dat", "gen", "loc", "inst", "voc"));
+            addNominalAttribute(prefix + "_number", Lists.newArrayList("other", "null", "sg", "pl"));
+            addNominalAttribute(prefix + "_gender", Lists.newArrayList("other", "null", "f", "m1", "m2", "m3", "n"));
+
+            // relation to other mentions
+            addBinaryAttribute(prefix + "_is_nested");
+            addBinaryAttribute(prefix + "_is_nesting");
+
+            // par characteristics
+            addNumericAttributeNormalized(prefix + "_par_idx");
+            addNumericAttributeNormalized(prefix + "_par_token_count");
+            addNumericAttributeNormalized(prefix + "_par_sent_count");
+
+            // sent characteristics
+            addNumericAttributeNormalized(prefix + "_sent_token_count");
+            addNumericAttributeNormalized(prefix + "_sent_mention_count");
+            addNumericAttributeNormalized(prefix + "_sent_idx");
+            addNumericAttributeNormalized(prefix + "_sent_idx_in_par");
+            addBinaryAttribute(prefix + "_sent_ends_with_dot");
+            addBinaryAttribute(prefix + "_sent_ends_with_questionmark");
+
+            // frequent bases
+            for (String base : frequentBases) {
+                addBinaryAttribute(prefix + "_" + encodeBase(base));
+            }
+        }
+
+        addNominalAttribute("score", Lists.newArrayList("bad", "good"));
+        fillSortedAttributes("score");
+    }
+
+    private String encodeBase(String base) {
+        return "base_equal_" + base.replaceAll(" ", "_").replaceAll("\"", "Q");
+    }
+
+    private void loadFrequentBases() {
+        try (Stream<String> lines = Files.lines(new File("frequent_bases.txt").toPath())) {
+            this.frequentBases.addAll(lines.map(String::trim).collect(Collectors.toList()));
+        } catch (IOException e) {
+            LOG.warn("Could not load frequent bases from frequent_bases.txt", e);
+        }
+    }
+
+    public Map<TMention, Map<Attribute, Double>> calculateFeatures(TText preprocessedText) {
+        Map<TMention, Map<Attribute, Double>> result = Maps.newHashMap();
+
+        FeatureHelper helper = new FeatureHelper(preprocessedText);
+
+        addScoreFeature(result, helper.getMentions());
+
+        for (TMention mention : helper.getMentions()) {
+            Map<Attribute, Double> attribute2value = result.get(mention);
+
+            //mention
+            addMentionAttributes(helper, mention, attribute2value, "mention");
+
+            //first chain mention
+            TMention firstChainMention = helper.getFirstChainMention(mention);
+            addMentionAttributes(helper, firstChainMention, attribute2value, "chain_first_mention");
+
+            //coref
+            attribute2value.put(getAttributeByName("chain_length"), (double) helper.getChainLength(mention));
+
+            //text
+            List<TParagraph> pars = preprocessedText.getParagraphs();
+            List<TSentence> sents = pars.stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList());
+            List<TToken> tokens = sents.stream().flatMap(s -> s.getTokens().stream()).collect(Collectors.toList());
+            attribute2value.put(getAttributeByName("text_char_count"), tokens.stream().mapToDouble(t -> t.getOrth().length()).sum());
+            attribute2value.put(getAttributeByName("text_token_count"), (double) tokens.size());
+            attribute2value.put(getAttributeByName("text_sent_count"), (double) sents.size());
+            attribute2value.put(getAttributeByName("text_par_count"), (double) pars.size());
+            attribute2value.put(getAttributeByName("text_mention_count"), (double) helper.getMentions().size());
+            attribute2value.put(getAttributeByName("text_cluster_count"), (double) helper.getClusters().size());
+
+            assert (attribute2value.size() == getAttributesList().size());
+        }
+        addNormalizedAttributeValues(result);
+
+        return result;
+    }
+
+    private void addMentionAttributes(FeatureHelper helper, TMention mention, Map<Attribute, Double> attribute2value, String attributePrefix) {
+        // mention characteristics
+        attribute2value.put(getAttributeByName(attributePrefix + "_index"), (double) helper.getMentionIndex(mention));
+        attribute2value.put(getAttributeByName(attributePrefix + "_index_in_sent"), (double) helper.getMentionIndexInSent(mention));
+        attribute2value.put(getAttributeByName(attributePrefix + "_index_in_par"), (double) helper.getMentionIndexInPar(mention));
+        attribute2value.put(getAttributeByName(attributePrefix + "_index_in_chain"), (double) helper.getMentionIndexInChain(mention));
+        attribute2value.put(getAttributeByName(attributePrefix + "_token_count"), (double) mention.getChildIdsSize());
+        attribute2value.put(getAttributeByName(attributePrefix + "_is_zero"), toBinary(mention.isZeroSubject()));
+        attribute2value.put(getAttributeByName(attributePrefix + "_is_pronoun"), toBinary(helper.getMentionHeadToken(mention).getChosenInterpretation().getCtag().matches("ppron.*")));
+        attribute2value.put(getAttributeByName(attributePrefix + "_is_named"), toBinary(helper.isMentionNamedEntity(mention)));
+
+        Interpretation interp = new Interpretation(helper.getMentionHeadToken(mention).getChosenInterpretation());
+        addNominalAttributeValue(interp.getCtag(), attribute2value, attributePrefix + "_ctag");
+        addNominalAttributeValue(interp.getPerson(), attribute2value, attributePrefix + "_person");
+        addNominalAttributeValue(interp.getNumber(), attribute2value, attributePrefix + "_number");
+        addNominalAttributeValue(interp.getGender(), attribute2value, attributePrefix + "_gender");
+        addNominalAttributeValue(interp.getCase(), attribute2value, attributePrefix + "_case");
+
+        // relation to other mentions
+        attribute2value.put(getAttributeByName(attributePrefix + "_is_nested"), toBinary(helper.isNested(mention)));
+        attribute2value.put(getAttributeByName(attributePrefix + "_is_nesting"), toBinary(helper.isNesting(mention)));
+
+        String orth = helper.getMentionOrth(mention);
+        attribute2value.put(getAttributeByName(attributePrefix + "_capitalized"), toBinary(orth.length() != 0 && orth.substring(0, 1).toUpperCase().equals(orth.substring(0, 1))));
+        attribute2value.put(getAttributeByName(attributePrefix + "_all_caps"), toBinary(orth.toUpperCase().equals(orth)));
+        attribute2value.put(getAttributeByName(attributePrefix + "_char_count"), (double) orth.length());
+
+        // par characteristics
+        TParagraph mentionParagraph = helper.getMentionParagraph(mention);
+        attribute2value.put(getAttributeByName(attributePrefix + "_par_idx"), (double) helper.getParIndex(mentionParagraph));
+        attribute2value.put(getAttributeByName(attributePrefix + "_par_token_count"), mentionParagraph.getSentences().stream().map(s -> s.getTokens().size()).mapToDouble(s -> s).sum());
+        attribute2value.put(getAttributeByName(attributePrefix + "_par_sent_count"), (double) mentionParagraph.getSentences().size());
+
+        // sent characteristics
+        TSentence mentionSentence = helper.getMentionSentence(mention);
+        attribute2value.put(getAttributeByName(attributePrefix + "_sent_token_count"), (double) mentionSentence.getTokensSize());
+        attribute2value.put(getAttributeByName(attributePrefix + "_sent_mention_count"), (double) mentionSentence.getMentions().size());
+        attribute2value.put(getAttributeByName(attributePrefix + "_sent_idx"), (double) helper.getSentIndex(mentionSentence));
+        attribute2value.put(getAttributeByName(attributePrefix + "_sent_idx_in_par"), (double) helper.getSentIndexInPar(mentionSentence));
+        attribute2value.put(getAttributeByName(attributePrefix + "_sent_ends_with_dot"), toBinary(helper.getSentenceLastTokenOrth(mentionSentence).equals(".")));
+        attribute2value.put(getAttributeByName(attributePrefix + "_sent_ends_with_questionmark"), toBinary(helper.getSentenceLastTokenOrth(mentionSentence).equals("?")));
+
+        // frequent bases
+        String mentionBase = helper.getMentionBase(mention);
+        for (String base : frequentBases) {
+            attribute2value.put(getAttributeByName(attributePrefix + "_" + encodeBase(base)), toBinary(mentionBase.equals(base)));
+        }
+    }
+
+    private void addNominalAttributeValue(String value, Map<Attribute, Double> attribute2value, String attributeName) {
+        Attribute att = getAttributeByName(attributeName);
+        int index = att.indexOfValue(value);
+        if (index == -1)
+            LOG.warn(value + " not found for attribute " + attributeName);
+        attribute2value.put(att, (double) (index == -1 ? att.indexOfValue("other") : index));
+    }
+
+
+    private void addScoreFeature(Map<TMention, Map<Attribute, Double>> result, List<TMention> mentions) {
+        for (TMention m : mentions) {
+            Map<Attribute, Double> map = Maps.newHashMap();
+            map.put(getAttributeByName("score"), weka.core.Utils.missingValue());
+            result.put(m, map);
+        }
+    }
+
+}
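encodeBase (private above) turns a base form into a safe attribute-name suffix: spaces become underscores and double quotes become Q. A standalone replica with illustrative inputs:

    public class EncodeBaseDemo {
        // same transformation as MentionFeatureExtractor.encodeBase
        static String encodeBase(String base) {
            return "base_equal_" + base.replaceAll(" ", "_").replaceAll("\"", "Q");
        }

        public static void main(String[] args) {
            System.out.println(encodeBase("unia europejska"));   // base_equal_unia_europejska
            System.out.println(encodeBase("\"solidarność\""));   // base_equal_QsolidarnośćQ
        }
    }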
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java
0 → 100644
+++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java
+package pl.waw.ipipan.zil.summ.nicolas.mention;
+
+import com.google.common.collect.Sets;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
+import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
+import pl.waw.ipipan.zil.summ.nicolas.Utils;
+import weka.classifiers.Classifier;
+import weka.core.Instance;
+import weka.core.Instances;
+
+import java.util.Map;
+import java.util.Set;
+
+public class MentionModel {
+
+    private static final Logger LOG = LoggerFactory.getLogger(MentionModel.class);
+
+    public static Set<TMention> detectGoodMentions(Classifier classifier, MentionFeatureExtractor featureExtractor, TText text) throws Exception {
+        Set<TMention> goodMentions = Sets.newHashSet();
+
+        Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList());
+        Map<TMention, Instance> mention2instance = Utils.extractInstancesFromMentions(text, featureExtractor);
+        for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) {
+            Instance instance = entry.getValue();
+            instance.setDataset(instances);
+            instance.setClassMissing();
+            boolean good = classifier.classifyInstance(instance) > 0.5;
+            if (good)
+                goodMentions.add(entry.getKey());
+        }
+        LOG.info("\t" + goodMentions.size() + "\t" + mention2instance.size());
+        return goodMentions;
+    }
+
+}
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionScorer.java
0 → 100644
+++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionScorer.java
+package pl.waw.ipipan.zil.summ.nicolas.mention;
+
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Multiset;
+import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
+import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
+import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
+import pl.waw.ipipan.zil.summ.nicolas.Utils;
+
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+public class MentionScorer {
+
+    public Map<TMention, Double> calculateMentionScores(String optimalSummary, TText text) {
+        Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase()));
+
+        List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList());
+        Map<TMention, String> mention2Orth = Utils.loadMention2Orth(sentences);
+
+        return booleanTokenIntersection(mention2Orth, tokenCounts);
+    }
+
+    // a mention scores 1.0 if any of its tokens occurs in the optimal summary
+    private static Map<TMention, Double> booleanTokenIntersection(Map<TMention, String> mention2Orth, Multiset<String> tokenCounts) {
+        Map<TMention, Double> mention2score = Maps.newHashMap();
+        for (Map.Entry<TMention, String> entry : mention2Orth.entrySet()) {
+            TMention mention = entry.getKey();
+            String mentionOrth = entry.getValue();
+            for (String token : Utils.tokenize(mentionOrth)) {
+                if (tokenCounts.contains(token.toLowerCase())) {
+                    mention2score.put(mention, 1.0);
+                    break;
+                }
+            }
+            mention2score.putIfAbsent(mention, 0.0);
+        }
+        return mention2score;
+    }
+
+    // alternative scorer, unused in this draft: 1.0 if at least half of the
+    // mention's tokens occur in the optimal summary
+    private static Map<TMention, Double> booleanTokenInclusion(Map<TMention, String> mention2Orth, Multiset<String> tokenCounts) {
+        Map<TMention, Double> mention2score = Maps.newHashMap();
+        for (Map.Entry<TMention, String> entry : mention2Orth.entrySet()) {
+            TMention mention = entry.getKey();
+            String mentionOrth = entry.getValue();
+            int present = 0;
+            for (String token : Utils.tokenize(mentionOrth)) {
+                if (tokenCounts.contains(token.toLowerCase())) {
+                    present++;
+                }
+            }
+            mention2score.putIfAbsent(mention, ((present * 2) >= Utils.tokenize(mentionOrth).size()) ? 1.0 : 0.0);
+        }
+        return mention2score;
+    }
+}
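The scorer marks a mention as good (score 1.0) as soon as any of its tokens occurs in the optimal summary. A small demo, with hypothetical tokens, of the Guava Multiset membership test it relies on:

    import com.google.common.collect.HashMultiset;
    import com.google.common.collect.Multiset;

    import java.util.Arrays;

    public class ScoringDemo {
        public static void main(String[] args) {
            // token counts of a hypothetical optimal summary, lower-cased
            Multiset<String> tokenCounts = HashMultiset.create(Arrays.asList("prezydent", "podpisał", "ustawę"));
            System.out.println(tokenCounts.contains("prezydent")); // true -> mention scored 1.0
            System.out.println(tokenCounts.contains("sejm"));      // false
        }
    }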
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/PrepareTrainingData.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/PrepareTrainingData.java | ||
1 | +package pl.waw.ipipan.zil.summ.nicolas.mention; | ||
2 | + | ||
3 | +import com.google.common.base.Charsets; | ||
4 | +import com.google.common.collect.Maps; | ||
5 | +import com.google.common.io.Files; | ||
6 | +import org.apache.logging.log4j.LogManager; | ||
7 | +import org.apache.logging.log4j.Logger; | ||
8 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | ||
9 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
10 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | ||
11 | +import pl.waw.ipipan.zil.summ.nicolas.Utils; | ||
12 | +import weka.core.Instance; | ||
13 | +import weka.core.Instances; | ||
14 | +import weka.core.converters.ArffSaver; | ||
15 | + | ||
16 | +import java.io.File; | ||
17 | +import java.io.IOException; | ||
18 | +import java.util.Map; | ||
19 | + | ||
20 | + | ||
21 | +public class PrepareTrainingData { | ||
22 | + | ||
23 | + private static final Logger LOG = LogManager.getLogger(PrepareTrainingData.class); | ||
24 | + | ||
25 | + public static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev"; | ||
26 | + public static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev"; | ||
27 | + | ||
28 | + public static void main(String[] args) throws IOException { | ||
29 | + | ||
30 | + Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(PREPROCESSED_FULL_TEXTS_DIR_PATH); | ||
31 | + Map<String, String> id2optimalSummary = loadOptimalSummaries(); | ||
32 | + | ||
33 | + MentionScorer mentionScorer = new MentionScorer(); | ||
34 | + MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor(); | ||
35 | + | ||
36 | + Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | ||
37 | + | ||
38 | + int i = 1; | ||
39 | + for (String textId : id2preprocessedText.keySet()) { | ||
40 | + LOG.info(i++ + "/" + id2preprocessedText.size()); | ||
41 | + | ||
42 | + TText preprocessedText = id2preprocessedText.get(textId); | ||
43 | + String optimalSummary = id2optimalSummary.get(textId); | ||
44 | + if (optimalSummary == null) | ||
45 | + continue; | ||
46 | + Map<TMention, Double> mention2score = mentionScorer.calculateMentionScores(optimalSummary, preprocessedText); | ||
47 | + | ||
48 | + Map<TMention, Instance> mention2instance = Utils.extractInstancesFromMentions(preprocessedText, featureExtractor); | ||
49 | + for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) { | ||
50 | + TMention mention = entry.getKey(); | ||
51 | + Instance instance = entry.getValue(); | ||
52 | + instance.setDataset(instances); | ||
53 | + instance.setClassValue(mention2score.get(mention)); | ||
54 | + instances.add(instance); | ||
55 | + } | ||
56 | + } | ||
57 | + saveInstancesToFile(instances); | ||
58 | + } | ||
59 | + | ||
60 | + private static void saveInstancesToFile(Instances instances) throws IOException { | ||
61 | + ArffSaver saver = new ArffSaver(); | ||
62 | + saver.setInstances(instances); | ||
63 | + saver.setFile(new File(Constants.MENTIONS_DATASET_PATH)); | ||
64 | + saver.writeBatch(); | ||
65 | + } | ||
66 | + | ||
67 | + private static Map<String, String> loadOptimalSummaries() throws IOException { | ||
68 | + Map<String, String> id2optimalSummary = Maps.newHashMap(); | ||
69 | + for (File optimalSummaryFile : new File(OPTIMAL_SUMMARIES_DIR_PATH).listFiles()) { | ||
70 | + String optimalSummary = Files.toString(optimalSummaryFile, Charsets.UTF_8); | ||
71 | + id2optimalSummary.put(optimalSummaryFile.getName().split("_")[0], optimalSummary); | ||
72 | + } | ||
73 | + LOG.info(id2optimalSummary.size() + " optimal summaries found."); | ||
74 | + return id2optimalSummary; | ||
75 | + } | ||
76 | + | ||
77 | + | ||
78 | +} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/TrainModel.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/TrainModel.java | ||
1 | +package pl.waw.ipipan.zil.summ.nicolas.mention; | ||
2 | + | ||
3 | +import org.apache.commons.lang3.time.StopWatch; | ||
4 | +import org.slf4j.Logger; | ||
5 | +import org.slf4j.LoggerFactory; | ||
6 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | ||
7 | +import weka.classifiers.Classifier; | ||
8 | +import weka.core.Instances; | ||
9 | +import weka.core.converters.ArffLoader; | ||
10 | + | ||
11 | +import java.io.File; | ||
12 | +import java.io.FileOutputStream; | ||
13 | +import java.io.ObjectOutputStream; | ||
14 | + | ||
15 | + | ||
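 | +/** | ||
 | + * Trains the mention classifier on the mention ARFF dataset and serializes | ||
 | + * the resulting model to disk. | ||
 | + */ | ||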
16 | +public class TrainModel { | ||
17 | + private static final Logger LOG = LoggerFactory.getLogger(TrainModel.class); | ||
18 | + | ||
19 | + public static void main(String[] args) throws Exception { | ||
20 | + | ||
21 | + ArffLoader loader = new ArffLoader(); | ||
22 | + loader.setFile(new File(Constants.MENTIONS_DATASET_PATH)); | ||
23 | + Instances instances = loader.getDataSet(); | ||
24 | + instances.setClassIndex(0); | ||
25 | + LOG.info(instances.size() + " instances loaded."); | ||
26 | + LOG.info(instances.numAttributes() + " attributes for each instance."); | ||
27 | + | ||
28 | + StopWatch watch = new StopWatch(); | ||
29 | + watch.start(); | ||
30 | + | ||
31 | + Classifier classifier = Constants.getClassifier(); | ||
32 | + | ||
33 | + LOG.info("Building classifier..."); | ||
34 | + classifier.buildClassifier(instances); | ||
35 | + LOG.info("...done."); | ||
36 | + | ||
37 | + try (ObjectOutputStream oos = new ObjectOutputStream( | ||
38 | + new FileOutputStream(Constants.MENTIONS_MODEL_PATH))) { | ||
39 | + oos.writeObject(classifier); | ||
40 | + } | ||
41 | + | ||
42 | + watch.stop(); | ||
43 | + LOG.info("Elapsed time: " + watch); | ||
44 | + | ||
45 | + LOG.info(classifier.toString()); | ||
46 | + } | ||
47 | +} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Crossvalidate.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Crossvalidate.java | ||
1 | +package pl.waw.ipipan.zil.summ.nicolas.mention.test; | ||
2 | + | ||
3 | +import org.apache.commons.lang3.time.StopWatch; | ||
4 | +import org.slf4j.Logger; | ||
5 | +import org.slf4j.LoggerFactory; | ||
6 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | ||
7 | +import weka.classifiers.Classifier; | ||
8 | +import weka.classifiers.evaluation.Evaluation; | ||
9 | +import weka.core.Instances; | ||
10 | +import weka.core.converters.ArffLoader; | ||
11 | + | ||
12 | +import java.io.File; | ||
13 | +import java.util.Random; | ||
14 | + | ||
15 | + | ||
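 | +/** | ||
 | + * Runs 10-fold cross-validation of the mention classifier on the mention training set. | ||
 | + */ | ||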
16 | +public class Crossvalidate { | ||
17 | + | ||
18 | + private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class); | ||
19 | + | ||
20 | + public static void main(String[] args) throws Exception { | ||
21 | + | ||
22 | + ArffLoader loader = new ArffLoader(); | ||
23 | + loader.setFile(new File(Constants.MENTIONS_DATASET_PATH)); | ||
24 | + Instances instances = loader.getDataSet(); | ||
25 | + instances.setClassIndex(0); | ||
26 | + LOG.info(instances.size() + " instances loaded."); | ||
27 | + LOG.info(instances.numAttributes() + " attributes for each instance."); | ||
28 | + | ||
32 | + StopWatch watch = new StopWatch(); | ||
33 | + watch.start(); | ||
34 | + | ||
35 | + Classifier tree = Constants.getClassifier(); | ||
36 | + | ||
37 | + Evaluation eval = new Evaluation(instances); | ||
38 | + eval.crossValidateModel(tree, instances, 10, new Random(1)); | ||
39 | + LOG.info(eval.toSummaryString()); | ||
40 | + | ||
41 | + watch.stop(); | ||
42 | + LOG.info("Elapsed time: " + watch); | ||
43 | + } | ||
44 | +} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Validate.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Validate.java | ||
1 | +package pl.waw.ipipan.zil.summ.nicolas.mention.test; | ||
2 | + | ||
3 | +import org.apache.commons.lang3.time.StopWatch; | ||
4 | +import org.slf4j.Logger; | ||
5 | +import org.slf4j.LoggerFactory; | ||
6 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | ||
7 | +import weka.classifiers.Classifier; | ||
8 | +import weka.classifiers.evaluation.Evaluation; | ||
9 | +import weka.core.Instances; | ||
10 | +import weka.core.converters.ArffLoader; | ||
11 | + | ||
12 | +import java.io.File; | ||
13 | +import java.io.FileInputStream; | ||
14 | +import java.io.IOException; | ||
15 | +import java.io.ObjectInputStream; | ||
16 | + | ||
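 | +/** | ||
 | + * Loads the serialized mention model and evaluates it on the full training set | ||
 | + * (a resubstitution sanity check, not a held-out evaluation). | ||
 | + */ | ||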
20 | +public class Validate { | ||
21 | + private static final Logger LOG = LoggerFactory.getLogger(Validate.class); | ||
22 | + | ||
23 | + public static void main(String[] args) throws Exception { | ||
24 | + | ||
25 | + ArffLoader loader = new ArffLoader(); | ||
26 | + loader.setFile(new File(Constants.MENTIONS_DATASET_PATH)); | ||
27 | + Instances instances = loader.getDataSet(); | ||
28 | + instances.setClassIndex(0); | ||
29 | + LOG.info(instances.size() + " instances loaded."); | ||
30 | + LOG.info(instances.numAttributes() + " attributes for each instance."); | ||
31 | + | ||
32 | + Classifier classifier = loadClassifier(); | ||
33 | + | ||
34 | + StopWatch watch = new StopWatch(); | ||
35 | + watch.start(); | ||
36 | + | ||
37 | + Evaluation eval = new Evaluation(instances); | ||
38 | + eval.evaluateModel(classifier, instances); | ||
39 | + | ||
40 | + LOG.info(eval.toSummaryString()); | ||
41 | + | ||
42 | + watch.stop(); | ||
43 | + LOG.info("Elapsed time: " + watch); | ||
44 | + } | ||
45 | + | ||
46 | + private static Classifier loadClassifier() throws IOException, ClassNotFoundException { | ||
47 | + LOG.info("Loading classifier..."); | ||
48 | + try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(Constants.MENTIONS_MODEL_PATH))) { | ||
49 | + Classifier classifier = (Classifier) ois.readObject(); | ||
50 | + LOG.info("Done. " + classifier.toString()); | ||
51 | + return classifier; | ||
52 | + } | ||
53 | + } | ||
54 | +} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/PrepareTrainingData.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/PrepareTrainingData.java | ||
1 | +package pl.waw.ipipan.zil.summ.nicolas.sentence; | ||
2 | + | ||
3 | +import com.google.common.base.Charsets; | ||
4 | +import com.google.common.collect.Maps; | ||
5 | +import com.google.common.io.Files; | ||
6 | +import org.apache.logging.log4j.LogManager; | ||
7 | +import org.apache.logging.log4j.Logger; | ||
8 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | ||
9 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | ||
10 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
11 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | ||
12 | +import pl.waw.ipipan.zil.summ.nicolas.Utils; | ||
13 | +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | ||
14 | +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; | ||
15 | +import weka.classifiers.Classifier; | ||
16 | +import weka.core.Instance; | ||
17 | +import weka.core.Instances; | ||
18 | +import weka.core.converters.ArffSaver; | ||
19 | + | ||
20 | +import java.io.File; | ||
21 | +import java.io.IOException; | ||
22 | +import java.util.Map; | ||
23 | +import java.util.Set; | ||
24 | + | ||
25 | + | ||
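 | +/** | ||
 | + * Builds the sentence training set: mentions predicted as summary-worthy by the | ||
 | + * trained mention model feed the sentence features, and each sentence is labeled | ||
 | + * by SentenceScorer against the text's optimal summary. | ||
 | + */ | ||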
26 | +public class PrepareTrainingData { | ||
27 | + | ||
28 | + private static final Logger LOG = LogManager.getLogger(PrepareTrainingData.class); | ||
29 | + | ||
30 | + private static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev"; | ||
31 | + private static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev"; | ||
32 | + | ||
33 | + public static void main(String[] args) throws Exception { | ||
34 | + | ||
35 | + Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(PREPROCESSED_FULL_TEXTS_DIR_PATH); | ||
36 | + Map<String, String> id2optimalSummary = loadOptimalSummaries(); | ||
37 | + | ||
38 | + SentenceScorer sentenceScorer = new SentenceScorer(); | ||
39 | + SentenceFeatureExtractor featureExtractor = new SentenceFeatureExtractor(); | ||
40 | + | ||
41 | + Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | ||
42 | + | ||
43 | + Classifier classifier = Utils.loadClassifier(Constants.MENTIONS_MODEL_PATH); | ||
44 | + MentionFeatureExtractor mentionFeatureExtractor = new MentionFeatureExtractor(); | ||
45 | + | ||
46 | + int i = 1; | ||
47 | + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | ||
48 | + LOG.info(i++ + "/" + id2preprocessedText.size()); | ||
49 | + | ||
50 | + TText preprocessedText = entry.getValue(); | ||
51 | + String optimalSummary = id2optimalSummary.get(entry.getKey()); | ||
52 | + if (optimalSummary == null) | ||
53 | + continue; | ||
54 | + Map<TSentence, Double> sentence2score = sentenceScorer.calculateSentenceScores(optimalSummary, preprocessedText); | ||
55 | + | ||
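 | + // good mentions are predicted by the trained mention model; the commented-out | ||
 | + // variant below uses gold-standard mentions instead | ||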
56 | + Set<TMention> goodMentions | ||
57 | + = MentionModel.detectGoodMentions(classifier, mentionFeatureExtractor, preprocessedText); | ||
58 | +// Set<TMention> goodMentions | ||
59 | +// = Utils.loadGoldGoodMentions(textId, preprocessedText, true); | ||
60 | + | ||
61 | + Map<TSentence, Instance> sentence2instance = Utils.extractInstancesFromSentences(preprocessedText, featureExtractor, goodMentions); | ||
62 | + for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { | ||
63 | + TSentence sentence = entry.getKey(); | ||
64 | + Instance instance = entry.getValue(); | ||
65 | + instance.setDataset(instances); | ||
66 | + instance.setClassValue(sentence2score.get(sentence)); | ||
67 | + instances.add(instance); | ||
68 | + } | ||
69 | + } | ||
70 | + saveInstancesToFile(instances); | ||
71 | + } | ||
72 | + | ||
73 | + private static void saveInstancesToFile(Instances instances) throws IOException { | ||
74 | + ArffSaver saver = new ArffSaver(); | ||
75 | + saver.setInstances(instances); | ||
76 | + saver.setFile(new File(Constants.SENTENCES_DATASET_PATH)); | ||
77 | + saver.writeBatch(); | ||
78 | + } | ||
79 | + | ||
80 | + private static Map<String, String> loadOptimalSummaries() throws IOException { | ||
81 | + Map<String, String> id2optimalSummary = Maps.newHashMap(); | ||
82 | + for (File optimalSummaryFile : new File(OPTIMAL_SUMMARIES_DIR_PATH).listFiles()) { | ||
83 | + String optimalSummary = Files.toString(optimalSummaryFile, Charsets.UTF_8); | ||
84 | + id2optimalSummary.put(optimalSummaryFile.getName().split("_")[0], optimalSummary); | ||
85 | + } | ||
86 | + LOG.info(id2optimalSummary.size() + " optimal summaries found."); | ||
87 | + return id2optimalSummary; | ||
88 | + } | ||
89 | + | ||
90 | + | ||
91 | +} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java | ||
1 | +package pl.waw.ipipan.zil.summ.nicolas.sentence; | ||
2 | + | ||
3 | +import com.google.common.collect.Maps; | ||
4 | +import pl.waw.ipipan.zil.multiservice.thrift.types.*; | ||
5 | +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; | ||
6 | +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; | ||
7 | +import weka.core.Attribute; | ||
8 | + | ||
9 | +import java.util.List; | ||
10 | +import java.util.Map; | ||
11 | +import java.util.Set; | ||
12 | +import java.util.stream.Collectors; | ||
13 | + | ||
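 | +/** | ||
 | + * Extracts per-sentence features for the sentence ranking model: mention and cluster | ||
 | + * coverage (overall and restricted to good mentions), sentence position and length, | ||
 | + * and paragraph- and text-level statistics. | ||
 | + */ | ||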
14 | +public class SentenceFeatureExtractor extends FeatureExtractor { | ||
15 | + | ||
16 | + public SentenceFeatureExtractor() { | ||
17 | + | ||
18 | + addNumericAttributeNormalized("sent_mention_cluster_count"); | ||
19 | + addNumericAttributeNormalized("sent_good_mention_cluster_count"); | ||
20 | + addNumericAttributeNormalized("sent_good_mention_cluster_good_count"); | ||
21 | + addNumericAttributeNormalized("sent_cluster_count"); | ||
22 | + addNumericAttributeNormalized("sent_good_cluster_count"); | ||
23 | + addNumericAttributeNormalized("sent_mention_count"); | ||
24 | + addNumericAttributeNormalized("sent_good_mention_count"); | ||
25 | + | ||
26 | + addNumericAttributeNormalized("sent_token_length"); | ||
27 | + addNumericAttributeNormalized("sent_idx"); | ||
28 | + addNumericAttributeNormalized("sent_idx_in_par"); | ||
29 | + addBinaryAttribute("sent_ends_with_dot"); | ||
30 | + addBinaryAttribute("sent_ends_with_questionmark"); | ||
31 | + | ||
32 | + addNumericAttributeNormalized("par_idx"); | ||
33 | + addNumericAttributeNormalized("par_token_count"); | ||
34 | + addNumericAttributeNormalized("par_sent_count"); | ||
35 | + | ||
36 | + addNumericAttribute("text_token_count"); | ||
37 | + addNumericAttribute("text_sent_count"); | ||
38 | + addNumericAttribute("text_par_count"); | ||
39 | + addNumericAttribute("text_mention_count"); | ||
40 | + addNumericAttribute("text_cluster_count"); | ||
41 | + | ||
42 | + addNumericAttribute("score"); | ||
43 | + fillSortedAttributes("score"); | ||
44 | + } | ||
45 | + | ||
46 | + public Map<TSentence, Map<Attribute, Double>> calculateFeatures(TText preprocessedText, Set<TMention> goodMentions) { | ||
47 | + | ||
48 | + int sentenceIdx = 0; | ||
49 | + int parIdx = 0; | ||
50 | + | ||
51 | + FeatureHelper helper = new FeatureHelper(preprocessedText); | ||
52 | + List<TParagraph> pars = preprocessedText.getParagraphs(); | ||
53 | + List<TSentence> sents = pars.stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList()); | ||
54 | + List<TToken> tokens = sents.stream().flatMap(s -> s.getTokens().stream()).collect(Collectors.toList()); | ||
55 | + | ||
56 | + Map<TSentence, Map<Attribute, Double>> sentence2features = Maps.newLinkedHashMap(); | ||
57 | + for (TParagraph paragraph : preprocessedText.getParagraphs()) { | ||
58 | + int sentenceIdxInPar = 0; | ||
59 | + for (TSentence sentence : paragraph.getSentences()) { | ||
60 | + Map<Attribute, Double> feature2value = Maps.newHashMap(); | ||
61 | + | ||
62 | + feature2value.put(getAttributeByName("sent_mention_cluster_count"), sentence.getMentions().stream().mapToDouble(helper::getChainLength).sum()); | ||
63 | + feature2value.put(getAttributeByName("sent_good_mention_cluster_count"), sentence.getMentions().stream().filter(goodMentions::contains).mapToDouble(helper::getChainLength).sum()); | ||
64 | + feature2value.put(getAttributeByName("sent_good_mention_cluster_good_count"), (double) sentence.getMentions().stream().filter(goodMentions::contains).flatMap(m -> helper.getCoreferentMentions(m).stream()).filter(goodMentions::contains).count()); | ||
65 | + feature2value.put(getAttributeByName("sent_cluster_count"), (double) sentence.getMentions().stream().map(helper::getMentionCluster).collect(Collectors.toSet()).size()); | ||
66 | + feature2value.put(getAttributeByName("sent_good_cluster_count"), (double) sentence.getMentions().stream().filter(goodMentions::contains).map(helper::getMentionCluster).collect(Collectors.toSet()).size()); | ||
67 | + feature2value.put(getAttributeByName("sent_mention_count"), (double) sentence.getMentions().size()); | ||
68 | + feature2value.put(getAttributeByName("sent_good_mention_count"), (double) sentence.getMentions().stream().filter(goodMentions::contains).count()); | ||
69 | + | ||
70 | + feature2value.put(getAttributeByName("sent_token_length"), (double) sentence.getTokens().size()); | ||
71 | + feature2value.put(getAttributeByName("sent_idx_in_par"), (double) sentenceIdxInPar); | ||
72 | + feature2value.put(getAttributeByName("sent_idx"), (double) sentenceIdx); | ||
73 | + feature2value.put(getAttributeByName("sent_ends_with_dot"), toBinary(helper.getSentenceLastTokenOrth(sentence).equals("."))); | ||
74 | + feature2value.put(getAttributeByName("sent_ends_with_questionmark"), toBinary(helper.getSentenceLastTokenOrth(sentence).equals("?"))); | ||
75 | + | ||
76 | + feature2value.put(getAttributeByName("par_idx"), (double) parIdx); | ||
77 | + feature2value.put(getAttributeByName("par_token_count"), paragraph.getSentences().stream().map(s -> s.getTokens().size()).mapToDouble(s -> s).sum()); | ||
78 | + feature2value.put(getAttributeByName("par_sent_count"), (double) paragraph.getSentences().size()); | ||
79 | + | ||
80 | + feature2value.put(getAttributeByName("text_char_count"), tokens.stream().mapToDouble(t -> t.getOrth().length()).sum()); | ||
81 | + feature2value.put(getAttributeByName("text_token_count"), (double) tokens.size()); | ||
82 | + feature2value.put(getAttributeByName("text_sent_count"), (double) sents.size()); | ||
83 | + feature2value.put(getAttributeByName("text_par_count"), (double) pars.size()); | ||
84 | + feature2value.put(getAttributeByName("text_mention_count"), (double) helper.getMentions().size()); | ||
85 | + feature2value.put(getAttributeByName("text_cluster_count"), (double) helper.getClusters().size()); | ||
86 | + | ||
87 | + feature2value.put(getAttributeByName("score"), weka.core.Utils.missingValue()); | ||
88 | + | ||
90 | + assert (feature2value.size() == getAttributesList().size()); | ||
91 | + | ||
92 | + sentence2features.put(sentence, feature2value); | ||
93 | + | ||
94 | + sentenceIdx++; | ||
95 | + sentenceIdxInPar++; | ||
96 | + } | ||
97 | + parIdx++; | ||
98 | + } | ||
99 | + addNormalizedAttributeValues(sentence2features); | ||
100 | + | ||
101 | + return sentence2features; | ||
102 | + } | ||
103 | +} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceScorer.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceScorer.java | ||
1 | +package pl.waw.ipipan.zil.summ.nicolas.sentence; | ||
2 | + | ||
3 | +import com.google.common.collect.HashMultiset; | ||
4 | +import com.google.common.collect.Maps; | ||
5 | +import com.google.common.collect.Multiset; | ||
6 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; | ||
7 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | ||
8 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
9 | +import pl.waw.ipipan.zil.summ.nicolas.Utils; | ||
10 | + | ||
11 | +import java.util.List; | ||
12 | +import java.util.Map; | ||
13 | + | ||
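 | +/** | ||
 | + * Scores each source sentence as the fraction of its tokens that also occur | ||
 | + * in the gold (optimal) summary. | ||
 | + */ | ||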
14 | +public class SentenceScorer { | ||
15 | + public Map<TSentence, Double> calculateSentenceScores(String optimalSummary, TText preprocessedText) { | ||
16 | + Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase())); | ||
17 | + | ||
18 | + Map<TSentence, Double> sentence2score = Maps.newHashMap(); | ||
19 | + for (TParagraph paragraph : preprocessedText.getParagraphs()) { | ||
20 | + for (TSentence sentence : paragraph.getSentences()) { | ||
21 | + double score = 0.0; | ||
22 | + | ||
23 | + String orth = Utils.loadSentence2Orth(sentence); | ||
24 | + List<String> tokens = Utils.tokenize(orth); | ||
25 | + for (String token : tokens) { | ||
26 | + score += tokenCounts.contains(token.toLowerCase()) ? 1.0 : 0.0; | ||
27 | + } | ||
28 | + sentence2score.put(sentence, score / tokens.size()); | ||
29 | + } | ||
 | + } | ||
30 | + return sentence2score; | ||
31 | + } | ||
32 | +} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/TrainModel.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/TrainModel.java | ||
1 | +package pl.waw.ipipan.zil.summ.nicolas.sentence; | ||
2 | + | ||
3 | +import org.apache.commons.lang3.time.StopWatch; | ||
4 | +import org.slf4j.Logger; | ||
5 | +import org.slf4j.LoggerFactory; | ||
6 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | ||
7 | +import weka.classifiers.Classifier; | ||
8 | +import weka.core.Instances; | ||
9 | +import weka.core.converters.ArffLoader; | ||
10 | + | ||
11 | +import java.io.File; | ||
12 | +import java.io.FileOutputStream; | ||
13 | +import java.io.ObjectOutputStream; | ||
14 | + | ||
15 | + | ||
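 | +/** | ||
 | + * Trains the sentence classifier on the sentence ARFF dataset and serializes | ||
 | + * the resulting model to disk. | ||
 | + */ | ||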
16 | +public class TrainModel { | ||
17 | + private static final Logger LOG = LoggerFactory.getLogger(TrainModel.class); | ||
18 | + | ||
19 | + public static void main(String[] args) throws Exception { | ||
20 | + | ||
21 | + ArffLoader loader = new ArffLoader(); | ||
22 | + loader.setFile(new File(Constants.SENTENCES_DATASET_PATH)); | ||
23 | + Instances instances = loader.getDataSet(); | ||
24 | + instances.setClassIndex(0); | ||
25 | + LOG.info(instances.size() + " instances loaded."); | ||
26 | + LOG.info(instances.numAttributes() + " attributes for each instance."); | ||
27 | + | ||
28 | + StopWatch watch = new StopWatch(); | ||
29 | + watch.start(); | ||
30 | + | ||
31 | + Classifier classifier = Constants.getSentencesClassifier(); | ||
32 | + | ||
33 | + LOG.info("Building classifier..."); | ||
34 | + classifier.buildClassifier(instances); | ||
35 | + LOG.info("...done."); | ||
36 | + | ||
37 | + try (ObjectOutputStream oos = new ObjectOutputStream( | ||
38 | + new FileOutputStream(Constants.SENTENCES_MODEL_PATH))) { | ||
39 | + oos.writeObject(classifier); | ||
40 | + } | ||
41 | + | ||
42 | + watch.stop(); | ||
43 | + LOG.info("Elapsed time: " + watch); | ||
44 | + | ||
45 | + LOG.info(classifier.toString()); | ||
46 | + } | ||
47 | +} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/test/Crossvalidate.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/test/Crossvalidate.java | ||
1 | +package pl.waw.ipipan.zil.summ.nicolas.sentence.test; | ||
2 | + | ||
3 | +import org.apache.commons.lang3.time.StopWatch; | ||
4 | +import org.slf4j.Logger; | ||
5 | +import org.slf4j.LoggerFactory; | ||
6 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | ||
7 | +import weka.classifiers.Classifier; | ||
8 | +import weka.classifiers.evaluation.Evaluation; | ||
9 | +import weka.core.Instances; | ||
10 | +import weka.core.converters.ArffLoader; | ||
11 | + | ||
12 | +import java.io.File; | ||
13 | +import java.util.Random; | ||
14 | + | ||
15 | + | ||
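 | +/** | ||
 | + * Runs 10-fold cross-validation of the sentence classifier on the sentence training set. | ||
 | + */ | ||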
16 | +public class Crossvalidate { | ||
17 | + | ||
18 | + private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class); | ||
19 | + | ||
20 | + public static void main(String[] args) throws Exception { | ||
21 | + | ||
22 | + ArffLoader loader = new ArffLoader(); | ||
23 | + loader.setFile(new File(Constants.SENTENCES_DATASET_PATH)); | ||
24 | + Instances instances = loader.getDataSet(); | ||
25 | + instances.setClassIndex(0); | ||
26 | + LOG.info(instances.size() + " instances loaded."); | ||
27 | + LOG.info(instances.numAttributes() + " attributes for each instance."); | ||
28 | + | ||
29 | + StopWatch watch = new StopWatch(); | ||
30 | + watch.start(); | ||
31 | + | ||
32 | + Classifier tree = Constants.getSentencesClassifier(); | ||
33 | + | ||
34 | + Evaluation eval = new Evaluation(instances); | ||
35 | + eval.crossValidateModel(tree, instances, 10, new Random(1)); | ||
36 | + LOG.info(eval.toSummaryString()); | ||
37 | + | ||
38 | + watch.stop(); | ||
39 | + LOG.info("Elapsed time: " + watch); | ||
40 | + } | ||
41 | +} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/Zero.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/Zero.java | ||
1 | +package pl.waw.ipipan.zil.summ.nicolas.zero; | ||
2 | + | ||
3 | +import com.google.common.collect.Lists; | ||
4 | +import com.google.common.collect.Maps; | ||
5 | +import com.google.common.collect.Sets; | ||
6 | +import org.apache.commons.csv.CSVFormat; | ||
7 | +import org.apache.commons.csv.CSVPrinter; | ||
8 | +import org.apache.commons.csv.QuoteMode; | ||
9 | +import org.apache.commons.io.IOUtils; | ||
10 | +import pl.waw.ipipan.zil.multiservice.thrift.types.*; | ||
11 | +import pl.waw.ipipan.zil.summ.nicolas.Utils; | ||
12 | + | ||
13 | +import java.io.File; | ||
14 | +import java.io.FileReader; | ||
15 | +import java.io.FileWriter; | ||
16 | +import java.io.IOException; | ||
17 | +import java.util.Arrays; | ||
18 | +import java.util.List; | ||
19 | +import java.util.Map; | ||
20 | +import java.util.Set; | ||
21 | + | ||
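 | +/** | ||
 | + * Collects candidate zero-anaphora contexts: nominative mentions in summary sentences | ||
 | + * whose coreference cluster also surfaced as a nominative mention in the preceding | ||
 | + * summary sentence. Matches are printed to the console and dumped to zeros.tsv. | ||
 | + */ | ||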
25 | +public class Zero { | ||
26 | + | ||
27 | + private static final String IDS_PATH = "summaries_dev"; | ||
28 | + private static final String THRIFTED_PATH = "src/main/resources/preprocessed_full_texts/dev/"; | ||
29 | + | ||
30 | + public static void main(String[] args) throws IOException { | ||
31 | + | ||
32 | + Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(THRIFTED_PATH); | ||
33 | + Map<String, List<String>> id2sentIds = loadSentenceIds(IDS_PATH); | ||
34 | + | ||
35 | + int mentionCount = 0; | ||
36 | + int mentionInNom = 0; | ||
37 | + int mentionInNomSequential = 0; | ||
38 | + | ||
39 | + List<List<Object>> rows = Lists.newArrayList(); | ||
40 | + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | ||
41 | + String textId = entry.getKey(); | ||
43 | + | ||
44 | + TText text = entry.getValue(); | ||
45 | + List<String> sentenceIds = id2sentIds.get(textId); | ||
 | + if (sentenceIds == null) | ||
 | + continue; | ||
47 | + | ||
48 | + Map<String, Set<String>> mentionId2Cluster = Maps.newHashMap(); | ||
49 | + for (TCoreference coreference : text.getCoreferences()) { | ||
50 | + for (String mentionId : coreference.getMentionIds()) { | ||
51 | + mentionId2Cluster.put(mentionId, Sets.newHashSet(coreference.getMentionIds())); | ||
52 | + } | ||
53 | + } | ||
54 | + | ||
55 | + Set<String> prevSentenceNominativeMentionIds = Sets.newHashSet(); | ||
56 | + TSentence prevSentence = null; | ||
57 | + for (TParagraph p : text.getParagraphs()) { | ||
58 | + Map<TMention, String> tMentionStringMap = Utils.loadMention2Orth(p.getSentences()); | ||
59 | + | ||
60 | + for (TSentence sentence : p.getSentences()) { | ||
61 | + if (!sentenceIds.contains(sentence.getId())) | ||
62 | + continue; | ||
63 | + Set<String> currentSentenceNominativeMentionIds = Sets.newHashSet(); | ||
64 | + | ||
65 | + Map<String, TToken> tokenId2Token = Maps.newHashMap(); | ||
66 | + for (TToken t : sentence.getTokens()) | ||
67 | + tokenId2Token.put(t.getId(), t); | ||
68 | + | ||
69 | + for (TMention mention : sentence.getMentions()) { | ||
70 | + mentionCount++; | ||
71 | + | ||
72 | + for (String tokenId : mention.getHeadIds()) { | ||
73 | + TInterpretation interp = tokenId2Token.get(tokenId).getChosenInterpretation(); | ||
74 | + if (isInNominative(interp)) { | ||
75 | + mentionInNom++; | ||
76 | + | ||
77 | + currentSentenceNominativeMentionIds.add(mention.getId()); | ||
78 | + if (mentionId2Cluster.getOrDefault(mention.getId(), Sets.newHashSet()).stream().anyMatch(prevSentenceNominativeMentionIds::contains)) { | ||
79 | + mentionInNomSequential++; | ||
80 | + System.out.println(tMentionStringMap.get(mention) | ||
81 | + + "\n\t" + Utils.loadSentence2Orth(prevSentence) | ||
82 | + + "\n\t" + Utils.loadSentence2Orth(sentence)); | ||
83 | + | ||
84 | + List<Object> row = Lists.newArrayList(); | ||
85 | + row.add("C"); | ||
86 | + row.add(textId); | ||
87 | + row.add(tMentionStringMap.get(mention)); | ||
88 | + row.add(Utils.loadSentence2Orth(prevSentence)); | ||
89 | + row.add(Utils.loadSentence2Orth(sentence)); | ||
90 | + rows.add(row); | ||
91 | + } | ||
92 | + break; | ||
93 | + } | ||
94 | + } | ||
95 | + } | ||
96 | + | ||
97 | + prevSentence = sentence; | ||
98 | + prevSentenceNominativeMentionIds = currentSentenceNominativeMentionIds; | ||
99 | + } | ||
100 | + } | ||
101 | + } | ||
102 | + | ||
103 | + System.out.println(mentionCount + " mentions"); | ||
104 | + System.out.println(mentionInNom + " mention in nom"); | ||
105 | + System.out.println(mentionInNomSequential + " mention in nom with previous in nom"); | ||
106 | + | ||
107 | + try (CSVPrinter csvPrinter = new CSVPrinter(new FileWriter("zeros.tsv"), CSVFormat.DEFAULT.withDelimiter('\t').withEscape('\\').withQuoteMode(QuoteMode.NONE).withQuote('"'))) { | ||
108 | + for (List<Object> row : rows) { | ||
109 | + csvPrinter.printRecord(row); | ||
110 | + } | ||
111 | + } | ||
112 | + | ||
113 | + } | ||
114 | + | ||
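 | + /** True if the interpretation is a noun ("subst") with nominative case among its msd tags. */ | ||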
115 | + private static boolean isInNominative(TInterpretation interp) { | ||
116 | + return interp.getCtag().equals("subst") && Arrays.stream(interp.getMsd().split(":")).anyMatch(t -> t.equals("nom")); | ||
117 | + } | ||
118 | + | ||
119 | + private static Map<String, List<String>> loadSentenceIds(String idsPath) throws IOException { | ||
120 | + Map<String, List<String>> result = Maps.newHashMap(); | ||
121 | + for (File f : new File(idsPath).listFiles()) { | ||
122 | + String id = f.getName().split("_")[0]; | ||
123 | + try (FileReader reader = new FileReader(f)) { | ||
 | + result.put(id, IOUtils.readLines(reader)); | ||
 | + } | ||
125 | + } | ||
126 | + return result; | ||
127 | + } | ||
128 | +} |
nicolas-model/pom.xml
0 → 100644
1 | +++ a/nicolas-model/pom.xml | ||
1 | +<?xml version="1.0" encoding="UTF-8"?> | ||
2 | +<project xmlns="http://maven.apache.org/POM/4.0.0" | ||
3 | + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
4 | + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
5 | + <modelVersion>4.0.0</modelVersion> | ||
6 | + <parent> | ||
7 | + <artifactId>nicolas-container</artifactId> | ||
8 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
9 | + <version>1.0-SNAPSHOT</version> | ||
10 | + </parent> | ||
11 | + | ||
12 | + <artifactId>nicolas-model</artifactId> | ||
13 | + | ||
14 | +</project> | ||
0 | \ No newline at end of file | 15 | \ No newline at end of file |
nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/frequent_bases.txt
0 → 100644
1 | +++ a/nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/frequent_bases.txt | ||
1 | +on | ||
2 | +to | ||
3 | +co | ||
4 | +rok | ||
5 | +być | ||
6 | +wszystko | ||
7 | +polska | ||
8 | +człowiek | ||
9 | +sobie | ||
10 | +raz | ||
11 | +my | ||
12 | +mieć | ||
13 | +czas | ||
14 | +państwo | ||
15 | +praca | ||
16 | +osoba | ||
17 | +sprawa | ||
18 | +ja | ||
19 | +kraj | ||
20 | +pieniądz | ||
21 | +nikt | ||
22 | +kto | ||
23 | +przykład | ||
24 | +nic | ||
25 | +koniec | ||
26 | +rząd | ||
27 | +prawo | ||
28 | +życie | ||
29 | +miejsce | ||
30 | +móc | ||
31 | +fot | ||
32 | +problem | ||
33 | +władza | ||
34 | +miesiąc | ||
35 | +rzecz | ||
36 | +stan | ||
37 | +świat | ||
38 | +wszyscy | ||
39 | +mówić | ||
40 | +rozmowa | ||
41 | +coś | ||
42 | +sytuacja | ||
43 | +powód | ||
44 | +początek | ||
45 | +wiedzieć | ||
46 | +dzień | ||
47 | +uwaga | ||
48 | +strona | ||
49 | +udział | ||
50 | +in | ||
51 | +musieć | ||
52 | +polityk | ||
53 | +ktoś | ||
54 | +ogół | ||
55 | +polityka | ||
56 | +chcieć | ||
57 | +walka | ||
58 | +zmiana | ||
59 | +decyzja | ||
60 | +ciąg | ||
61 | +m . | ||
62 | +pan | ||
63 | +szansa | ||
64 | +polak | ||
65 | +przypadek | ||
66 | +większość | ||
67 | +pytanie | ||
68 | +wzgląd | ||
69 | +warszawa | ||
70 | +proca | ||
71 | +pomoc | ||
72 | +prezydent | ||
73 | +społeczeństwo | ||
74 | +wynik | ||
75 | +dziecko | ||
76 | +prawda | ||
77 | +związek | ||
78 | +gospodarka | ||
79 | +część | ||
80 | +wojna | ||
81 | +tydzień | ||
82 | +granica | ||
83 | +głos | ||
84 | +przyszłość | ||
85 | +autor | ||
86 | +wybory | ||
87 | +rynek | ||
88 | +cel | ||
89 | +ustawa | ||
90 | +uważać | ||
91 | +ten rok | ||
92 | +droga | ||
93 | +dom | ||
94 | +rys | ||
95 | +myśleć | ||
96 | +firma | ||
97 | +zasada | ||
98 | +fakt | ||
99 | +kolej | ||
100 | +nadzieja | ||
101 | +dolar | ||
102 | +wraz | ||
103 | +miasto | ||
104 | +rozwój | ||
105 | +ten sposób | ||
106 | +europa | ||
107 | +temat | ||
108 | +siła | ||
109 | +rodzina | ||
110 | +minister | ||
111 | +historia | ||
112 | +wpływ | ||
113 | +współpraca | ||
114 | +środek | ||
115 | +informacja | ||
116 | +procent | ||
117 | +wniosek | ||
118 | +unia europejski | ||
119 | +niemcy | ||
120 | +podstawa | ||
121 | +reforma | ||
122 | +partia | ||
123 | +interes | ||
124 | +ten sprawa | ||
125 | +kandydat | ||
126 | +sukces | ||
127 | +sposób | ||
128 | +wątpliwość | ||
129 | +złoty | ||
130 | +sld | ||
131 | +pracownik | ||
132 | +stanowisko | ||
133 | +dyskusja | ||
134 | +telewizja | ||
135 | +pewność | ||
136 | +odpowiedź | ||
137 | +rzeczywistość | ||
138 | +program | ||
139 | +cena | ||
140 | +działanie | ||
141 | +system | ||
142 | +unia | ||
143 | +ręka | ||
144 | +odpowiedzialność | ||
145 | +środowisko | ||
146 | +solidarność | ||
147 | +demokracja | ||
148 | +maić | ||
149 | +ramy | ||
150 | +badanie | ||
151 | +media | ||
152 | +wartość | ||
153 | +wybór | ||
154 | +głowa | ||
155 | +zostać | ||
156 | +usa | ||
157 | +pracować | ||
158 | +porozumienie | ||
159 | +widzieć | ||
160 | +zdanie | ||
161 | +akcja | ||
162 | +wolność | ||
163 | +spotkanie | ||
164 | +przeszłość | ||
165 | +stosunek | ||
166 | +okazja | ||
167 | +prowadzić | ||
168 | +zachód | ||
169 | +kobieta | ||
170 | +obywatel | ||
171 | +sąd | ||
172 | +ubiegły rok | ||
173 | +dziennikarz | ||
174 | +kultura | ||
175 | +grupa | ||
176 | +opinia publiczny | ||
177 | +obrona | ||
178 | +bezpieczeństwo | ||
179 | +opinia | ||
180 | +rzeczpospolita | ||
181 | +dokument | ||
182 | +racja | ||
183 | +szkoła | ||
184 | +góra | ||
185 | +warunek | ||
186 | +organizacja | ||
187 | +oko | ||
188 | +godzina | ||
189 | +tysiąc | ||
190 | +ten czas | ||
191 | +możliwość | ||
192 | +błąd | ||
193 | +ziemia | ||
194 | +parlament | ||
195 | +ten pora | ||
196 | +chwila | ||
197 | +naród | ||
198 | +konflikt | ||
199 | +działalność | ||
200 | +sejm | ||
201 | +powrót | ||
202 | +premier | ||
203 | +działać | ||
204 | +rada | ||
205 | +zdrowie | ||
206 | +wiek | ||
207 | +dodatek | ||
208 | +poziom | ||
209 | +widzenie | ||
210 | +żyć | ||
211 | +powiedzieć | ||
212 | +inwestycja | ||
213 | +rosja | ||
214 | +niemiec | ||
215 | +samochód | ||
216 | +skutek | ||
217 | +punkt | ||
218 | +rola | ||
219 | +mieszkaniec | ||
220 | +wyborca | ||
221 | +koszt | ||
222 | +budżet | ||
223 | +szef | ||
224 | +styczeń | ||
225 | +instytucja | ||
226 | +pełnia | ||
227 | +ulica | ||
228 | +aws | ||
229 | +ochrona | ||
230 | +dostęp | ||
231 | +zagrożenie | ||
232 | +zgoda | ||
233 | +ue | ||
234 | +" rzeczpospolita " | ||
235 | +liczba | ||
236 | +wieś | ||
237 | +połowa | ||
0 | \ No newline at end of file | 238 | \ No newline at end of file |
nicolas-train/pom.xml
0 → 100644
1 | +++ a/nicolas-train/pom.xml | ||
1 | +<?xml version="1.0" encoding="UTF-8"?> | ||
2 | +<project xmlns="http://maven.apache.org/POM/4.0.0" | ||
3 | + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
4 | + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
5 | + <modelVersion>4.0.0</modelVersion> | ||
6 | + <parent> | ||
7 | + <artifactId>nicolas-container</artifactId> | ||
8 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
9 | + <version>1.0-SNAPSHOT</version> | ||
10 | + </parent> | ||
11 | + | ||
12 | + <artifactId>nicolas-train</artifactId> | ||
13 | + | ||
14 | +</project> | ||
0 | \ No newline at end of file | 15 | \ No newline at end of file |
nicolas-zero/pom.xml
0 → 100644
1 | +++ a/nicolas-zero/pom.xml | ||
1 | +<?xml version="1.0" encoding="UTF-8"?> | ||
2 | +<project xmlns="http://maven.apache.org/POM/4.0.0" | ||
3 | + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
4 | + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
5 | + <modelVersion>4.0.0</modelVersion> | ||
6 | + <parent> | ||
7 | + <artifactId>nicolas-container</artifactId> | ||
8 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
9 | + <version>1.0-SNAPSHOT</version> | ||
10 | + </parent> | ||
11 | + | ||
12 | + <artifactId>nicolas-zero</artifactId> | ||
13 | + | ||
14 | +</project> | ||
0 | \ No newline at end of file | 15 | \ No newline at end of file |
pom.xml
0 → 100644
1 | +++ a/pom.xml | ||
1 | +<?xml version="1.0" encoding="UTF-8"?> | ||
2 | +<project xmlns="http://maven.apache.org/POM/4.0.0" | ||
3 | + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
4 | + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
5 | + <modelVersion>4.0.0</modelVersion> | ||
6 | + | ||
7 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
8 | + <artifactId>nicolas-container</artifactId> | ||
9 | + <packaging>pom</packaging> | ||
10 | + <version>1.0-SNAPSHOT</version> | ||
11 | + | ||
12 | + <modules> | ||
13 | + <module>nicolas-core</module> | ||
14 | + <module>nicolas-cli</module> | ||
15 | + <module>nicolas-model</module> | ||
16 | + <module>nicolas-train</module> | ||
17 | + <module>nicolas-zero</module> | ||
18 | + </modules> | ||
19 | + | ||
20 | + <properties> | ||
21 | + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> | ||
22 | + <java.version.build>1.8</java.version.build> | ||
23 | + </properties> | ||
24 | + | ||
25 | + <prerequisites> | ||
26 | + <maven>3.0.5</maven> | ||
27 | + </prerequisites> | ||
28 | + | ||
29 | + <developers> | ||
30 | + <developer> | ||
31 | + <name>Mateusz Kopeć</name> | ||
32 | + <organization>ICS PAS</organization> | ||
33 | + <email>m.kopec@ipipan.waw.pl</email> | ||
34 | + </developer> | ||
35 | + </developers> | ||
36 | + | ||
37 | + <dependencies> | ||
38 | + <dependency> | ||
39 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
40 | + <artifactId>pscapi</artifactId> | ||
41 | + <version>1.0-SNAPSHOT</version> | ||
42 | + </dependency> | ||
43 | + <dependency> | ||
44 | + <groupId>pl.waw.ipipan.zil.multiservice</groupId> | ||
45 | + <artifactId>utils</artifactId> | ||
46 | + <version>1.0-SNAPSHOT</version> | ||
47 | + </dependency> | ||
48 | + | ||
49 | + <dependency> | ||
50 | + <groupId>org.apache.commons</groupId> | ||
51 | + <artifactId>commons-csv</artifactId> | ||
52 | + <version>1.3</version> | ||
53 | + </dependency> | ||
54 | + <dependency> | ||
55 | + <groupId>com.google.guava</groupId> | ||
56 | + <artifactId>guava</artifactId> | ||
57 | + <version>19.0</version> | ||
58 | + </dependency> | ||
59 | + <dependency> | ||
60 | + <groupId>nz.ac.waikato.cms.weka</groupId> | ||
61 | + <artifactId>weka-dev</artifactId> | ||
62 | + <version>3.9.0</version> | ||
63 | + </dependency> | ||
64 | + <dependency> | ||
65 | + <groupId>org.apache.commons</groupId> | ||
66 | + <artifactId>commons-lang3</artifactId> | ||
67 | + <version>3.4</version> | ||
68 | + </dependency> | ||
69 | + <dependency> | ||
70 | + <groupId>commons-io</groupId> | ||
71 | + <artifactId>commons-io</artifactId> | ||
72 | + <version>2.5</version> | ||
73 | + </dependency> | ||
74 | + </dependencies> | ||
75 | + | ||
76 | + | ||
77 | + <build> | ||
78 | + <plugins> | ||
79 | + <plugin> | ||
80 | + <groupId>org.apache.maven.plugins</groupId> | ||
81 | + <artifactId>maven-compiler-plugin</artifactId> | ||
82 | + <version>3.1</version> | ||
83 | + <configuration> | ||
84 | + <source>${java.version.build}</source> | ||
85 | + <target>${java.version.build}</target> | ||
86 | + </configuration> | ||
87 | + </plugin> | ||
88 | + </plugins> | ||
89 | + </build> | ||
90 | + | ||
91 | + <distributionManagement> | ||
92 | + <repository> | ||
93 | + <id>deployment</id> | ||
94 | + <url>http://maven.nlp.ipipan.waw.pl/content/repositories/releases/</url> | ||
95 | + </repository> | ||
96 | + <snapshotRepository> | ||
97 | + <id>deployment</id> | ||
98 | + <url>http://maven.nlp.ipipan.waw.pl/content/repositories/snapshots/</url> | ||
99 | + </snapshotRepository> | ||
100 | + </distributionManagement> | ||
101 | +</project> | ||
0 | \ No newline at end of file | 102 | \ No newline at end of file |