Commit e1126cdba70bd5287871ebbe89e9ae6635bb5a01
0 parents
rough draft
Showing 28 changed files with 2105 additions and 0 deletions
.gitignore
0 → 100644
1 | +++ a/.gitignore | |
1 | +# Created by .ignore support plugin (hsz.mobi) | |
2 | +### Java template | |
3 | +*.class | |
4 | +target/ | |
5 | + | |
6 | +# Mobile Tools for Java (J2ME) | |
7 | +.mtj.tmp/ | |
8 | + | |
9 | +# Package Files # | |
10 | +*.jar | |
11 | +*.war | |
12 | +*.ear | |
13 | + | |
14 | +# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml | |
15 | +hs_err_pid* | |
16 | + | |
17 | +.idea | |
18 | +*.iml | |
0 | 19 | \ No newline at end of file |
... | ... |
nicolas-cli/pom.xml
0 → 100644
1 | +++ a/nicolas-cli/pom.xml | |
1 | +<?xml version="1.0" encoding="UTF-8"?> | |
2 | +<project xmlns="http://maven.apache.org/POM/4.0.0" | |
3 | + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | |
4 | + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | |
5 | + <modelVersion>4.0.0</modelVersion> | |
6 | + <parent> | |
7 | + <artifactId>nicolas-container</artifactId> | |
8 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
9 | + <version>1.0-SNAPSHOT</version> | |
10 | + </parent> | |
11 | + | |
12 | + <artifactId>nicolas-cli</artifactId> | |
13 | + | |
14 | +</project> | |
0 | 15 | \ No newline at end of file |
... | ... |
nicolas-core/pom.xml
0 → 100644
1 | +++ a/nicolas-core/pom.xml | |
1 | +<?xml version="1.0" encoding="UTF-8"?> | |
2 | +<project xmlns="http://maven.apache.org/POM/4.0.0" | |
3 | + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | |
4 | + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | |
5 | + <modelVersion>4.0.0</modelVersion> | |
6 | + <parent> | |
7 | + <artifactId>nicolas-container</artifactId> | |
8 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
9 | + <version>1.0-SNAPSHOT</version> | |
10 | + </parent> | |
11 | + | |
12 | + <artifactId>nicolas</artifactId> | |
13 | + | |
14 | + <dependencies> | |
15 | + <dependency> | |
16 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
17 | + <artifactId>nicolas-model</artifactId> | |
18 | + <version>${project.version}</version> | |
19 | + <scope>runtime</scope> | |
20 | + </dependency> | |
21 | + </dependencies> | |
22 | +</project> | |
0 | 23 | \ No newline at end of file |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Constants.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Constants.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas; | |
2 | + | |
3 | +import weka.classifiers.Classifier; | |
4 | +import weka.classifiers.trees.RandomForest; | |
5 | + | |
6 | + | |
7 | +public class Constants { | |
8 | + | |
9 | + public static final String MENTIONS_MODEL_PATH = "mentions_model.bin"; | |
10 | + public static final String SENTENCES_MODEL_PATH = "sentences_model.bin"; | |
11 | + public static final String MENTIONS_DATASET_PATH = "mentions_train.arff"; | |
12 | + public static final String SENTENCES_DATASET_PATH = "sentences_train.arff"; | |
13 | + | |
14 | + private Constants() { | |
15 | + } | |
16 | + | |
17 | + public static Classifier getClassifier() { | |
18 | + RandomForest classifier = new RandomForest(); | |
19 | + classifier.setNumIterations(250); | |
20 | + classifier.setSeed(0); | |
21 | + classifier.setNumExecutionSlots(8); | |
22 | + return classifier; | |
23 | + } | |
24 | + | |
25 | + | |
26 | + public static Classifier getSentencesClassifier() { | |
27 | + RandomForest classifier = new RandomForest(); | |
28 | + classifier.setNumIterations(250); | |
29 | + classifier.setSeed(0); | |
30 | + classifier.setNumExecutionSlots(8); | |
31 | + return classifier; | |
32 | + } | |
33 | +} | |
... | ... |
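Both factory methods above return an identically configured RandomForest (250 trees, fixed seed, 8 execution slots). A minimal sketch of training and querying that classifier on a hand-built two-attribute dataset; it assumes Weka 3.8 on the classpath, and the attribute names here are illustrative, not taken from the commit:

```java
import weka.classifiers.Classifier;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;

import java.util.ArrayList;
import java.util.Arrays;

public class ConstantsDemo {
    public static void main(String[] args) throws Exception {
        // Class attribute first, mirroring Utils.createNewInstances, which sets class index 0.
        ArrayList<Attribute> attrs = new ArrayList<>(Arrays.asList(
                new Attribute("score", Arrays.asList("bad", "good")),
                new Attribute("feature")));
        Instances data = new Instances("Demo", attrs, 0);
        data.setClassIndex(0);
        for (int i = 0; i < 10; i++) {
            DenseInstance inst = new DenseInstance(2);
            inst.setDataset(data);
            inst.setValue(attrs.get(1), i);
            inst.setClassValue(i < 5 ? "bad" : "good");
            data.add(inst);
        }
        Classifier classifier = Constants.getClassifier();
        classifier.buildClassifier(data);
        DenseInstance query = new DenseInstance(2);
        query.setDataset(data);
        query.setValue(attrs.get(1), 7);
        // classifyInstance returns the index of the predicted nominal class value.
        System.out.println(data.classAttribute().value((int) classifier.classifyInstance(query)));
    }
}
```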
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas; | |
2 | + | |
3 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
4 | + | |
5 | +public class Nicolas { | |
6 | + | |
7 | + public String summarizeThrift(TText text, int targetTokenCount) { | |
8 | + return "test nicolas"; | |
9 | + } | |
10 | + | |
11 | +} | |
... | ... |
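Nicolas is the intended public entry point, currently stubbed out. A hedged usage sketch; the TText would in practice come from the multiservice preprocessing pipeline, and the no-arg TText constructor is the standard Thrift-generated one:

```java
import pl.waw.ipipan.zil.multiservice.thrift.types.TText;

public class NicolasDemo {
    public static void main(String[] args) {
        TText preprocessed = new TText(); // in practice: a fully annotated text from the pipeline
        String summary = new Nicolas().summarizeThrift(preprocessed, 200); // target of 200 tokens
        System.out.println(summary); // prints the "test nicolas" stub for now
    }
}
```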
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Utils.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Utils.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas; | |
2 | + | |
3 | +import com.google.common.base.Charsets; | |
4 | +import com.google.common.collect.Lists; | |
5 | +import com.google.common.collect.Maps; | |
6 | +import com.google.common.collect.Sets; | |
7 | +import com.google.common.io.Files; | |
8 | +import org.slf4j.Logger; | |
9 | +import org.slf4j.LoggerFactory; | |
10 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | |
11 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | |
12 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
13 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; | |
14 | +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | |
15 | +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionScorer; | |
16 | +import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; | |
17 | +import weka.classifiers.Classifier; | |
18 | +import weka.core.Attribute; | |
19 | +import weka.core.DenseInstance; | |
20 | +import weka.core.Instance; | |
21 | +import weka.core.Instances; | |
22 | + | |
23 | +import java.io.File; | |
24 | +import java.io.FileInputStream; | |
25 | +import java.io.IOException; | |
26 | +import java.io.ObjectInputStream; | |
27 | +import java.util.*; | |
28 | +import java.util.function.Function; | |
29 | +import java.util.stream.Collectors; | |
30 | + | |
31 | +import static java.util.stream.Collectors.toList; | |
32 | + | |
33 | +public class Utils { | |
34 | + | |
35 | + private static final Logger LOG = LoggerFactory.getLogger(Utils.class); | |
36 | + | |
37 | + private static final String DATASET_NAME = "Dataset"; | |
38 | + | |
39 | + public static Map<TMention, Instance> extractInstancesFromMentions(TText preprocessedText, MentionFeatureExtractor featureExtractor) { | |
40 | + List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | |
41 | + Map<TMention, Map<Attribute, Double>> mention2features = featureExtractor.calculateFeatures(preprocessedText); | |
42 | + | |
43 | + LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each mention."); | |
44 | + Map<TMention, Instance> mention2instance = Maps.newHashMap(); | |
45 | + for (TMention tMention : sentences.stream().flatMap(s -> s.getMentions().stream()).collect(toList())) { | |
46 | + Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); | |
47 | + Map<Attribute, Double> mentionFeatures = mention2features.get(tMention); | |
48 | + for (Attribute attribute : featureExtractor.getAttributesList()) { | |
49 | + instance.setValue(attribute, mentionFeatures.get(attribute)); | |
50 | + } | |
51 | + mention2instance.put(tMention, instance); | |
52 | + } | |
53 | + return mention2instance; | |
54 | + } | |
55 | + | |
56 | + public static Map<TSentence, Instance> extractInstancesFromSentences(TText preprocessedText, SentenceFeatureExtractor featureExtractor, Set<TMention> goodMentions) { | |
57 | + List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | |
58 | + Map<TSentence, Map<Attribute, Double>> sentence2features = featureExtractor.calculateFeatures(preprocessedText, goodMentions); | |
59 | + | |
60 | + LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each sentence."); | |
61 | + Map<TSentence, Instance> sentence2instance = Maps.newHashMap(); | |
62 | + for (TSentence sentence : sentences) { | |
63 | + Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); | |
64 | + Map<Attribute, Double> sentenceFeatures = sentence2features.get(sentence); | |
65 | + for (Attribute attribute : featureExtractor.getAttributesList()) { | |
66 | + instance.setValue(attribute, sentenceFeatures.get(attribute)); | |
67 | + } | |
68 | + sentence2instance.put(sentence, instance); | |
69 | + } | |
70 | + return sentence2instance; | |
71 | + } | |
72 | + | |
73 | + public static Instances createNewInstances(ArrayList<Attribute> attributesList) { | |
74 | + Instances instances = new Instances(DATASET_NAME, attributesList, 0); | |
75 | + instances.setClassIndex(0); | |
76 | + return instances; | |
77 | + } | |
78 | + | |
79 | + public static Classifier loadClassifier(String path) throws IOException, ClassNotFoundException { | |
80 | + LOG.info("Loading classifier..."); | |
81 | + try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(path))) { | |
82 | + Classifier classifier = (Classifier) ois.readObject(); | |
83 | + LOG.info("Done. " + classifier.toString()); | |
84 | + return classifier; | |
85 | + } | |
86 | + } | |
87 | + | |
88 | + public static Map<String, TText> loadPreprocessedTexts(String path) { | |
89 | + Map<String, TText> id2text = Maps.newHashMap(); | |
90 | + for (File processedFullTextFile : new File(path).listFiles()) { | |
91 | + TText processedFullText = loadThrifted(processedFullTextFile); | |
92 | + id2text.put(processedFullTextFile.getName().split("\\.")[0], processedFullText); | |
93 | + } | |
94 | + LOG.info(id2text.size() + " preprocessed texts found."); | |
95 | + return id2text; | |
96 | + } | |
97 | + | |
98 | + | |
99 | + public static TText loadThrifted(File originalFile) { | |
100 | + try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(originalFile))) { | |
101 | + return (TText) ois.readObject(); | |
102 | + } catch (ClassNotFoundException | IOException e) { | |
103 | + LOG.error("Error reading serialized file: " + e); | |
104 | + return null; | |
105 | + } | |
106 | + } | |
107 | + | |
108 | + public static List<String> tokenize(String text) { | |
109 | + return Arrays.asList(text.split("[^\\p{L}0-9]+")); | |
110 | + } | |
111 | + | |
112 | + public static List<String> tokenizeOnWhitespace(String text) { | |
113 | + return Arrays.asList(text.split(" +")); | |
114 | + } | |
115 | + | |
116 | + public static Map<TMention, String> loadMention2HeadOrth(List<TSentence> sents) { | |
117 | + Map<TMention, String> mention2orth = Maps.newHashMap(); | |
118 | + for (TSentence s : sents) { | |
119 | + Map<String, String> tokId2orth = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, TToken::getOrth)); | |
120 | + Map<String, Boolean> tokId2nps = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, TToken::isNoPrecedingSpace)); | |
121 | + | |
122 | + for (TMention m : s.getMentions()) { | |
123 | + StringBuilder mentionOrth = new StringBuilder(); | |
124 | + for (String tokId : m.getHeadIds()) { | |
125 | + if (!tokId2nps.get(tokId)) | |
126 | + mentionOrth.append(" "); | |
127 | + mentionOrth.append(tokId2orth.get(tokId)); | |
128 | + } | |
129 | + mention2orth.put(m, mentionOrth.toString().trim()); | |
130 | + } | |
131 | + } | |
132 | + return mention2orth; | |
133 | + } | |
134 | + | |
135 | + private static final Collection<String> STOPWORDS = Sets.newHashSet(); | |
136 | + | |
137 | + static { | |
138 | + STOPWORDS.addAll(Lists.newArrayList("i", "się", "to", "co")); | |
139 | + } | |
140 | + | |
141 | + public static Map<TMention, String> loadMention2Orth(List<TSentence> sents) { | |
142 | + Map<TMention, String> mention2orth = Maps.newHashMap(); | |
143 | + for (TSentence s : sents) { | |
144 | + Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity())); | |
145 | + | |
146 | + for (TMention m : s.getMentions()) { | |
147 | + StringBuilder mentionOrth = new StringBuilder(); | |
148 | + for (String tokId : m.getChildIds()) { | |
149 | + TToken token = tokId2tok.get(tokId); | |
150 | + if (STOPWORDS.contains(token.getChosenInterpretation().getBase().toLowerCase())) { | |
151 | + continue; | |
152 | + } | |
153 | + | |
154 | + if (!token.isNoPrecedingSpace()) | |
155 | + mentionOrth.append(" "); | |
156 | + mentionOrth.append(token.getOrth()); | |
157 | + } | |
158 | + mention2orth.put(m, mentionOrth.toString().trim()); | |
159 | + } | |
160 | + } | |
161 | + return mention2orth; | |
162 | + } | |
163 | + | |
164 | + public static Map<TMention, String> loadMention2Base(List<TSentence> sents) { | |
165 | + Map<TMention, String> mention2base = Maps.newHashMap(); | |
166 | + for (TSentence s : sents) { | |
167 | + Map<String, String> tokId2base = s.getTokens().stream().collect(Collectors.toMap(tok -> tok.getId(), tok -> tok.getChosenInterpretation().getBase())); | |
168 | + | |
169 | + for (TMention m : s.getMentions()) { | |
170 | + StringBuilder mentionBase = new StringBuilder(); | |
171 | + for (String tokId : m.getChildIds()) { | |
172 | + mentionBase.append(" "); | |
173 | + mentionBase.append(tokId2base.get(tokId)); | |
174 | + } | |
175 | + mention2base.put(m, mentionBase.toString().toLowerCase().trim()); | |
176 | + } | |
177 | + } | |
178 | + return mention2base; | |
179 | + } | |
180 | + | |
181 | + public static String loadSentence2Orth(TSentence sentence) { | |
182 | + StringBuilder sb = new StringBuilder(); | |
183 | + for (TToken token : sentence.getTokens()) { | |
184 | + if (!token.isNoPrecedingSpace()) | |
185 | + sb.append(" "); | |
186 | + sb.append(token.getOrth()); | |
187 | + } | |
188 | + return sb.toString().trim(); | |
189 | + } | |
190 | + | |
191 | + public static Set<TMention> loadGoldGoodMentions(String id, TText text, boolean dev) throws IOException { | |
192 | + String optimalSummary = Files.toString(new File("src/main/resources/optimal_summaries/" + (dev ? "dev" : "test") + "/" + id + "_theoretic_ub_rouge_1.txt"), Charsets.UTF_8); | |
193 | + | |
194 | + MentionScorer scorer = new MentionScorer(); | |
195 | + Map<TMention, Double> mention2score = scorer.calculateMentionScores(optimalSummary, text); | |
196 | + | |
197 | + mention2score.keySet().removeIf(tMention -> mention2score.get(tMention) != 1.0); | |
198 | + return mention2score.keySet(); | |
199 | + } | |
200 | +} | |
0 | 201 | \ No newline at end of file |
... | ... |
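The two tokenizers above differ in how they treat punctuation: tokenize splits on any run of non-letter, non-digit characters, while tokenizeOnWhitespace only splits on spaces. A self-contained sketch using the same regexes, with the expected output in comments:

```java
import java.util.Arrays;

public class TokenizeDemo {
    public static void main(String[] args) {
        String text = "Ala ma kota, psa i 2 rybki.";
        // Utils.tokenize: split on any run of characters that is neither a letter nor a digit.
        System.out.println(Arrays.asList(text.split("[^\\p{L}0-9]+")));
        // -> [Ala, ma, kota, psa, i, 2, rybki]
        // Utils.tokenizeOnWhitespace: split on runs of spaces only; punctuation stays attached.
        System.out.println(Arrays.asList(text.split(" +")));
        // -> [Ala, ma, kota,, psa, i, 2, rybki.]
    }
}
```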
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel2.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel2.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.apply; | |
2 | + | |
3 | +import com.google.common.collect.Lists; | |
4 | +import com.google.common.collect.Maps; | |
5 | +import com.google.common.collect.Sets; | |
6 | +import org.slf4j.Logger; | |
7 | +import org.slf4j.LoggerFactory; | |
8 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | |
9 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | |
10 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
11 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
12 | +import pl.waw.ipipan.zil.summ.nicolas.Utils; | |
13 | +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | |
14 | +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; | |
15 | +import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; | |
16 | +import weka.classifiers.Classifier; | |
17 | +import weka.core.Instance; | |
18 | +import weka.core.Instances; | |
19 | + | |
20 | +import java.io.BufferedWriter; | |
21 | +import java.io.File; | |
22 | +import java.io.FileWriter; | |
23 | +import java.util.*; | |
24 | + | |
25 | +import static java.util.stream.Collectors.toList; | |
26 | + | |
27 | +public class ApplyModel2 { | |
28 | + | |
29 | + private static final Logger LOG = LoggerFactory.getLogger(ApplyModel2.class); | |
30 | + | |
31 | + private static final String TEST_PREPROCESSED_DATA_PATH = "src/main/resources/preprocessed_full_texts/test"; | |
32 | + private static final String TARGET_DIR = "summaries"; | |
33 | + | |
34 | + public static void main(String[] args) throws Exception { | |
35 | + Classifier mentionClassifier = Utils.loadClassifier(Constants.MENTIONS_MODEL_PATH); | |
36 | + MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor(); | |
37 | + | |
38 | + Classifier sentenceClassifier = Utils.loadClassifier(Constants.SENTENCES_MODEL_PATH); | |
39 | + SentenceFeatureExtractor sentenceFeatureExtractor = new SentenceFeatureExtractor(); | |
40 | + | |
41 | + Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(TEST_PREPROCESSED_DATA_PATH); | |
42 | + int i = 1; | |
43 | + double avgSize = 0; | |
44 | + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | |
45 | + TText text = entry.getValue(); | |
46 | + | |
47 | + Set<TMention> goodMentions | |
48 | + = MentionModel.detectGoodMentions(mentionClassifier, featureExtractor, text); | |
49 | + | |
50 | + int targetSize = calculateTargetSize(text); | |
51 | + String summary = calculateSummary(text, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor); | |
52 | + int size = Utils.tokenize(summary).size(); | |
53 | + avgSize += size; | |
54 | + try (BufferedWriter bw = new BufferedWriter(new FileWriter(new File(TARGET_DIR, entry.getKey() + "_emily3.txt")))) { | |
55 | + bw.append(summary); | |
56 | + } | |
57 | + | |
58 | + LOG.info(i++ + "/" + id2preprocessedText.size() + " id: " + entry.getKey()); | |
59 | + } | |
60 | + | |
61 | + LOG.info("Avg size: " + avgSize / id2preprocessedText.size()); | |
62 | + } | |
63 | + | |
64 | + private static int calculateTargetSize(TText text) { | |
65 | + List<TSentence> sents = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | |
66 | + StringBuilder body = new StringBuilder(); | |
67 | + for (TSentence sent : sents) | |
68 | + body.append(Utils.loadSentence2Orth(sent)).append(' '); | |
69 | + int tokenCount = Utils.tokenizeOnWhitespace(body.toString().trim()).size(); | |
70 | + return (int) (0.2 * tokenCount); | |
71 | + } | |
72 | + | |
73 | + private static String calculateSummary(TText thrifted, Set<TMention> goodMentions, int targetSize, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception { | |
74 | + List<TSentence> selectedSentences = selectSummarySentences(thrifted, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor); | |
75 | + | |
76 | + StringBuilder sb = new StringBuilder(); | |
77 | + for (TSentence sent : selectedSentences) { | |
78 | + sb.append(' ').append(Utils.loadSentence2Orth(sent)); | |
79 | + } | |
80 | + return sb.toString().trim(); | |
81 | + } | |
82 | + | |
83 | + private static List<TSentence> selectSummarySentences(TText thrifted, Set<TMention> goodMentions, int targetSize, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception { | |
84 | + | |
85 | + List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | |
86 | + | |
87 | + Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); | |
88 | + Map<TSentence, Instance> sentence2instance = Utils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); | |
89 | + | |
90 | + Map<TSentence, Double> sentence2score = Maps.newHashMap(); | |
91 | + for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { | |
92 | + Instance instance = entry.getValue(); | |
93 | + instance.setDataset(instances); | |
94 | + double score = sentenceClassifier.classifyInstance(instance); | |
95 | + sentence2score.put(entry.getKey(), score); | |
96 | + } | |
97 | + | |
98 | + List<TSentence> sortedSents = Lists.newArrayList(sents); | |
99 | + Collections.sort(sortedSents, Comparator.comparing(sentence2score::get).reversed()); | |
100 | + | |
101 | + int size = 0; | |
102 | + Random r = new Random(1); | |
103 | + Set<TSentence> summary = Sets.newHashSet(); | |
104 | + for (TSentence sent : sortedSents) { | |
105 | + size += Utils.tokenizeOnWhitespace(Utils.loadSentence2Orth(sent)).size(); | |
106 | + if (r.nextDouble() > 0.4 && size > targetSize) | |
107 | + break; | |
108 | + summary.add(sent); | |
109 | + if (size > targetSize) | |
110 | + break; | |
111 | + } | |
112 | + List<TSentence> selectedSentences = Lists.newArrayList(); | |
113 | + for (TSentence sent : sents) { | |
114 | + if (summary.contains(sent)) | |
115 | + selectedSentences.add(sent); | |
116 | + } | |
117 | + return selectedSentences; | |
118 | + } | |
119 | + | |
120 | +} | |
... | ... |
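selectSummarySentences ranks sentences by the classifier's score, greedily takes the best ones (with a small random chance of stopping just before the budget is exceeded), and then re-emits the chosen sentences in document order. A simplified, self-contained sketch of the same greedy loop, without the randomness; the sentences and scores are illustrative:

```java
import java.util.ArrayList;
import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class GreedySelectionDemo {
    public static void main(String[] args) {
        // Sentences in document order, with illustrative scores.
        Map<String, Double> sentence2score = new LinkedHashMap<>();
        sentence2score.put("First sentence of the text.", 0.2);
        sentence2score.put("The key finding is reported here.", 0.9);
        sentence2score.put("Some filler follows.", 0.1);
        sentence2score.put("A second important point.", 0.7);

        int targetTokens = 10;
        List<String> byScore = new ArrayList<>(sentence2score.keySet());
        byScore.sort(Comparator.comparing(sentence2score::get).reversed());

        List<String> selected = new ArrayList<>();
        int size = 0;
        for (String sent : byScore) {
            selected.add(sent);
            size += sent.split(" +").length;
            if (size > targetTokens) break; // token budget reached
        }
        // Restore document order, as selectSummarySentences does.
        List<String> summary = new ArrayList<>();
        for (String sent : sentence2score.keySet())
            if (selected.contains(sent)) summary.add(sent);
        System.out.println(String.join(" ", summary));
    }
}
```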
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureExtractor.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureExtractor.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.features; | |
2 | + | |
3 | +import com.google.common.collect.*; | |
4 | +import org.slf4j.Logger; | |
5 | +import org.slf4j.LoggerFactory; | |
6 | +import weka.core.Attribute; | |
7 | + | |
8 | +import java.util.*; | |
9 | + | |
10 | +public class FeatureExtractor { | |
11 | + | |
12 | + protected static final Logger LOG = LoggerFactory.getLogger(FeatureExtractor.class); | |
13 | + | |
14 | + private final List<Attribute> sortedAttributes = Lists.newArrayList(); | |
15 | + | |
16 | + private final BiMap<String, Attribute> name2attribute = HashBiMap.create(); | |
17 | + | |
18 | + private final Set<String> normalizedAttributes = Sets.newHashSet(); | |
19 | + | |
20 | + public ArrayList<Attribute> getAttributesList() { | |
21 | + return Lists.newArrayList(sortedAttributes); | |
22 | + } | |
23 | + | |
24 | + protected Attribute getAttributeByName(String name) { | |
25 | + return name2attribute.get(name); | |
26 | + } | |
27 | + | |
28 | + protected void addNumericAttribute(String attributeName) { | |
29 | + name2attribute.put(attributeName, new Attribute(attributeName)); | |
30 | + } | |
31 | + | |
32 | + protected void addBinaryAttribute(String attributeName) { | |
33 | + name2attribute.put(attributeName, new Attribute(attributeName, Lists.newArrayList("f", "t"))); | |
34 | + } | |
35 | + | |
36 | + protected void addNominalAttribute(String attributeName, List<String> values) { | |
37 | + name2attribute.put(attributeName, new Attribute(attributeName, values)); | |
38 | + } | |
39 | + | |
40 | + protected void addNumericAttributeNormalized(String attributeName) { | |
41 | + addNumericAttribute(attributeName); | |
42 | + addNumericAttribute(attributeName + "_normalized"); | |
43 | + normalizedAttributes.add(attributeName); | |
44 | + } | |
45 | + | |
46 | + protected void fillSortedAttributes(String scoreAttName) { | |
47 | + sortedAttributes.addAll(name2attribute.values()); | |
48 | + sortedAttributes.remove(getAttributeByName(scoreAttName)); | |
49 | + Collections.sort(sortedAttributes, (o1, o2) -> name2attribute.inverse().get(o1).compareTo(name2attribute.inverse().get(o2))); | |
50 | + sortedAttributes.add(0, getAttributeByName(scoreAttName)); | |
51 | + } | |
52 | + | |
53 | + protected <T> void addNormalizedAttributeValues(Map<T, Map<Attribute, Double>> entity2attributes) { | |
54 | + Map<Attribute, Double> attribute2max = Maps.newHashMap(); | |
55 | + Map<Attribute, Double> attribute2min = Maps.newHashMap(); | |
56 | + for (T entity : entity2attributes.keySet()) { | |
57 | + Map<Attribute, Double> entityAttributes = entity2attributes.get(entity); | |
58 | + for (String attributeName : normalizedAttributes) { | |
59 | + Attribute attribute = getAttributeByName(attributeName); | |
60 | + Double value = entityAttributes.get(attribute); | |
61 | + | |
62 | + attribute2max.putIfAbsent(attribute, Double.NEGATIVE_INFINITY); | |
63 | + attribute2max.compute(attribute, (k, v) -> Math.max(v, value)); | |
64 | + | |
65 | + attribute2min.putIfAbsent(attribute, Double.MAX_VALUE); | |
66 | + attribute2min.compute(attribute, (k, v) -> Math.min(v, value)); | |
67 | + } | |
68 | + } | |
69 | + for (T mention : entity2attributes.keySet()) { | |
70 | + Map<Attribute, Double> entityAttributes = entity2attributes.get(mention); | |
71 | + for (Attribute attribute : attribute2max.keySet()) { | |
72 | + Attribute normalizedAttribute = getAttributeByName(name2attribute.inverse().get(attribute) + "_normalized"); | |
73 | + entityAttributes.put(normalizedAttribute, | |
74 | + (entityAttributes.get(attribute) - attribute2min.get(attribute)) | |
75 | + / (attribute2max.get(attribute) - attribute2min.get(attribute))); | |
76 | + } | |
77 | + } | |
78 | + } | |
79 | + | |
80 | + protected double toBinary(boolean bool) { | |
81 | + return bool ? 1.0 : 0.0; | |
82 | + } | |
83 | +} | |
... | ... |
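addNormalizedAttributeValues is a per-text min-max rescaling: each normalized attribute value becomes (x - min) / (max - min) over all entities in the text. A tiny sketch of the arithmetic; note that, like the method above, it yields NaN when all values are equal (max == min):

```java
import java.util.Arrays;

public class MinMaxDemo {
    public static void main(String[] args) {
        double[] values = {3.0, 7.0, 5.0};
        double min = Arrays.stream(values).min().getAsDouble();
        double max = Arrays.stream(values).max().getAsDouble();
        for (double v : values) {
            // Rescale each value into [0, 1] relative to the observed range.
            System.out.printf("%.1f -> %.2f%n", v, (v - min) / (max - min));
        }
        // 3.0 -> 0.00, 7.0 -> 1.00, 5.0 -> 0.50
    }
}
```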
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.features; | |
2 | + | |
3 | +import com.google.common.collect.Maps; | |
4 | +import com.google.common.collect.Sets; | |
5 | +import pl.waw.ipipan.zil.multiservice.thrift.types.*; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.Utils; | |
7 | + | |
8 | +import java.util.List; | |
9 | +import java.util.Map; | |
10 | +import java.util.Set; | |
11 | +import java.util.function.Function; | |
12 | +import java.util.stream.Collectors; | |
13 | + | |
14 | +import static java.util.stream.Collectors.toList; | |
15 | +import static java.util.stream.Collectors.toMap; | |
16 | + | |
17 | +/** | |
18 | + * Helper exposing mention, sentence and paragraph indices and lookups used during feature extraction. | |
19 | + */ | |
20 | +public class FeatureHelper { | |
21 | + | |
22 | + private final List<TMention> mentions; | |
23 | + private final Map<String, TMention> mentionId2mention; | |
24 | + private final Map<TCoreference, List<TMention>> coref2mentions = Maps.newHashMap(); | |
25 | + private final Map<TMention, TCoreference> mention2coref = Maps.newHashMap(); | |
26 | + private final Map<TMention, TSentence> mention2sent = Maps.newHashMap(); | |
27 | + private final Map<TMention, TParagraph> mention2par = Maps.newHashMap(); | |
28 | + private final Map<TMention, String> mention2Orth = Maps.newHashMap(); | |
29 | + private final Map<TMention, String> mention2Base = Maps.newHashMap(); | |
30 | + private final Map<TMention, TToken> mention2head = Maps.newHashMap(); | |
31 | + private final Set<TMention> mentionsInNamedEntities = Sets.newHashSet(); | |
32 | + | |
33 | + private final Map<TMention, Integer> mention2Index = Maps.newHashMap(); | |
34 | + private final Map<TSentence, Integer> sent2Index = Maps.newHashMap(); | |
35 | + private final Map<TParagraph, Integer> par2Index = Maps.newHashMap(); | |
36 | + private final Map<TSentence, Integer> sent2IndexInPar = Maps.newHashMap(); | |
37 | + private final Map<TMention, Integer> mention2indexInPar = Maps.newHashMap(); | |
38 | + private final Map<TMention, Integer> mention2indexInSent = Maps.newHashMap(); | |
39 | + | |
40 | + | |
41 | + public FeatureHelper(TText preprocessedText) { | |
42 | + mentions = preprocessedText.getParagraphs().stream() | |
43 | + .flatMap(p -> p.getSentences().stream()) | |
44 | + .flatMap(s -> s.getMentions().stream()).collect(Collectors.toList()); | |
45 | + | |
46 | + mentionId2mention = mentions.stream().collect(Collectors.toMap(TMention::getId, Function.identity())); | |
47 | + | |
48 | + for (TCoreference coref : preprocessedText.getCoreferences()) { | |
49 | + List<TMention> ments = coref.getMentionIds().stream().map(mentionId2mention::get).collect(toList()); | |
50 | + for (TMention m : ments) { | |
51 | + mention2coref.put(m, coref); | |
52 | + } | |
53 | + coref2mentions.put(coref, ments); | |
54 | + } | |
55 | + | |
56 | + int parIdx = 0; | |
57 | + int sentIdx = 0; | |
58 | + int mentionIdx = 0; | |
59 | + for (TParagraph par : preprocessedText.getParagraphs()) { | |
60 | + Map<TMention, String> m2o = Utils.loadMention2Orth(par.getSentences()); | |
61 | + mention2Orth.putAll(m2o); | |
62 | + Map<TMention, String> m2b = Utils.loadMention2Base(par.getSentences()); | |
63 | + mention2Base.putAll(m2b); | |
64 | + | |
65 | + int sentIdxInPar = 0; | |
66 | + int mentionIdxInPar = 0; | |
67 | + for (TSentence sent : par.getSentences()) { | |
68 | + | |
69 | + Map<String, TToken> tokenId2token = sent.getTokens().stream().collect(toMap(TToken::getId, Function.identity())); | |
70 | + | |
71 | + Map<String, Set<TNamedEntity>> tokenId2namedEntities = Maps.newHashMap(); | |
72 | + for (TNamedEntity namedEntity : sent.getNames()) { | |
73 | + for (String childId : namedEntity.getChildIds()) { | |
74 | + tokenId2namedEntities.putIfAbsent(childId, Sets.newHashSet()); | |
75 | + tokenId2namedEntities.get(childId).add(namedEntity); | |
76 | + } | |
77 | + } | |
78 | + | |
79 | + int mentionIdxInSent = 0; | |
80 | + for (TMention mention : sent.getMentions()) { | |
81 | + mention2sent.put(mention, sent); | |
82 | + mention2par.put(mention, par); | |
83 | + mention2Index.put(mention, mentionIdx++); | |
84 | + mention2indexInSent.put(mention, mentionIdxInSent++); | |
85 | + mention2indexInPar.put(mention, mentionIdxInPar++); | |
86 | + | |
87 | + String firstHeadTokenId = mention.getHeadIds().iterator().next(); | |
88 | + mention2head.put(mention, tokenId2token.get(firstHeadTokenId)); | |
89 | + if (tokenId2namedEntities.containsKey(firstHeadTokenId)) | |
90 | + mentionsInNamedEntities.add(mention); | |
91 | + } | |
92 | + sent2Index.put(sent, sentIdx++); | |
93 | + sent2IndexInPar.put(sent, sentIdxInPar++); | |
94 | + } | |
95 | + | |
96 | + par2Index.put(par, parIdx++); | |
97 | + } | |
98 | + } | |
99 | + | |
100 | + public List<TMention> getMentions() { | |
101 | + return mentions; | |
102 | + } | |
103 | + | |
104 | + public int getMentionIndexInChain(TMention mention) { | |
105 | + return coref2mentions.get(mention2coref.get(mention)).indexOf(mention); | |
106 | + } | |
107 | + | |
108 | + public int getChainLength(TMention mention) { | |
109 | + return coref2mentions.get(mention2coref.get(mention)).size(); | |
110 | + } | |
111 | + | |
112 | + public String getSentenceLastTokenOrth(TSentence sent) { | |
113 | + return sent.getTokens().get(sent.getTokensSize() - 1).getOrth(); | |
114 | + } | |
115 | + | |
116 | + public String getMentionOrth(TMention mention) { | |
117 | + return mention2Orth.get(mention); | |
118 | + } | |
119 | + | |
120 | + public String getMentionBase(TMention mention) { | |
121 | + return mention2Base.get(mention); | |
122 | + } | |
123 | + | |
124 | + public int getMentionIndex(TMention mention) { | |
125 | + return mention2Index.get(mention); | |
126 | + } | |
127 | + | |
128 | + public int getMentionIndexInSent(TMention mention) { | |
129 | + return mention2indexInSent.get(mention); | |
130 | + } | |
131 | + | |
132 | + public int getMentionIndexInPar(TMention mention) { | |
133 | + return mention2indexInPar.get(mention); | |
134 | + } | |
135 | + | |
136 | + public int getParIndex(TParagraph paragraph) { | |
137 | + return par2Index.get(paragraph); | |
138 | + } | |
139 | + | |
140 | + public int getSentIndex(TSentence sent) { | |
141 | + return sent2Index.get(sent); | |
142 | + } | |
143 | + | |
144 | + public int getSentIndexInPar(TSentence sent) { | |
145 | + return sent2IndexInPar.get(sent); | |
146 | + } | |
147 | + | |
148 | + public TParagraph getMentionParagraph(TMention mention) { | |
149 | + return mention2par.get(mention); | |
150 | + } | |
151 | + | |
152 | + public TSentence getMentionSentence(TMention mention) { | |
153 | + return mention2sent.get(mention); | |
154 | + } | |
155 | + | |
156 | + public TMention getFirstChainMention(TMention mention) { | |
157 | + return mentionId2mention.get(mention2coref.get(mention).getMentionIdsIterator().next()); | |
158 | + } | |
159 | + | |
160 | + public TToken getMentionHeadToken(TMention mention) { | |
161 | + return mention2head.get(mention); | |
162 | + } | |
163 | + | |
164 | + public boolean isMentionNamedEntity(TMention mention) { | |
165 | + return mentionsInNamedEntities.contains(mention); | |
166 | + } | |
167 | + | |
168 | + public boolean isNested(TMention mention) { | |
169 | + return mentions.stream().anyMatch(m -> m != mention && m.getChildIds().containsAll(mention.getChildIds())); | |
170 | + } | |
171 | + | |
172 | + public boolean isNesting(TMention mention) { | |
173 | + return mentions.stream().anyMatch(m -> m != mention && mention.getChildIds().containsAll(m.getChildIds())); | |
174 | + } | |
175 | + | |
176 | + public Set<TCoreference> getClusters() { | |
177 | + return coref2mentions.keySet(); | |
178 | + } | |
179 | + | |
180 | + public Set<TMention> getCoreferentMentions(TMention tMention) { | |
181 | + return getMentionCluster(tMention).getMentionIds().stream().map(this.mentionId2mention::get).collect(Collectors.toSet()); | |
182 | + } | |
183 | + | |
184 | + public TCoreference getMentionCluster(TMention tMention) { | |
185 | + return this.mention2coref.get(tMention); | |
186 | + } | |
187 | +} | |
... | ... |
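isNested and isNesting reduce to subset tests over token-ID sets: a mention is nested if some other mention's child IDs contain all of its own. A self-contained sketch of that test with illustrative token IDs (the self-comparison is excluded, as in the methods above):

```java
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class NestingDemo {
    public static void main(String[] args) {
        // Illustrative mentions as sets of token IDs.
        Set<String> outer = new HashSet<>(Arrays.asList("t1", "t2", "t3"));
        Set<String> inner = new HashSet<>(Arrays.asList("t3"));
        List<Set<String>> mentions = Arrays.asList(outer, inner);
        for (Set<String> mention : mentions) {
            boolean nested = mentions.stream()
                    .anyMatch(m -> m != mention && m.containsAll(mention));
            System.out.println(mention + " nested: " + nested);
        }
        // [t1, t2, t3] nested: false; [t3] nested: true
    }
}
```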
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/Interpretation.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/Interpretation.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.features; | |
2 | + | |
3 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TInterpretation; | |
4 | + | |
5 | + | |
6 | +public class Interpretation { | |
7 | + private String ctag = "null"; | |
8 | + private String casee = "null"; | |
9 | + private String gender = "null"; | |
10 | + private String number = "null"; | |
11 | + private String person = "null"; | |
12 | + | |
13 | + public Interpretation(TInterpretation chosenInterpretation) { | |
14 | + ctag = chosenInterpretation.getCtag(); | |
15 | + String[] split = chosenInterpretation.getMsd().split(":"); | |
16 | + switch (ctag) { | |
17 | + case "ger": | |
18 | + case "subst": | |
19 | + case "pact": | |
20 | + case "ppas": | |
21 | + case "num": | |
22 | + case "numcol": | |
23 | + case "adj": | |
24 | + number = split[0]; | |
25 | + casee = split[1]; | |
26 | + gender = split[2]; | |
27 | + break; | |
28 | + case "ppron12": | |
29 | + case "ppron3": | |
30 | + number = split[0]; | |
31 | + casee = split[1]; | |
32 | + gender = split[2]; | |
33 | + person = split[3]; | |
34 | + break; | |
35 | + case "siebie": | |
36 | + casee = split[0]; | |
37 | + break; | |
38 | + case "fin": | |
39 | + case "bedzie": | |
40 | + case "aglt": | |
41 | + case "impt": | |
42 | + number = split[0]; | |
43 | + person = split[1]; | |
44 | + break; | |
45 | + case "praet": | |
46 | + case "winien": | |
47 | + number = split[0]; | |
48 | + gender = split[1]; | |
49 | + break; | |
50 | + case "prep": | |
51 | + casee = split[0]; | |
52 | + break; | |
53 | + default: | |
54 | + break; | |
55 | + } | |
56 | + } | |
57 | + | |
58 | + public String getCase() { | |
59 | + return casee; | |
60 | + } | |
61 | + | |
62 | + public String getGender() { | |
63 | + return gender; | |
64 | + } | |
65 | + | |
66 | + public String getNumber() { | |
67 | + return number; | |
68 | + } | |
69 | + | |
70 | + public String getPerson() { | |
71 | + return person; | |
72 | + } | |
73 | + | |
74 | + public String getCtag() { | |
75 | + return ctag; | |
76 | + } | |
77 | +} | |
... | ... |
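The constructor dispatches on the ctag to decide which colon-separated positions of the MSD string carry number, case, gender and person. A sketch of the same positional parse for the noun branch; the tag values are illustrative NKJP-style morphology, not taken from the commit:

```java
public class MsdParseDemo {
    public static void main(String[] args) {
        String ctag = "subst";
        String msd = "sg:nom:m1"; // for nouns the positions are number:case:gender
        String[] split = msd.split(":");
        // Mirrors the "subst" branch of the switch above.
        System.out.println(ctag + ": number=" + split[0] + " case=" + split[1] + " gender=" + split[2]);
        // -> subst: number=sg case=nom gender=m1
    }
}
```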
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.mention; | |
2 | + | |
3 | +import com.google.common.collect.*; | |
4 | +import pl.waw.ipipan.zil.multiservice.thrift.types.*; | |
5 | +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; | |
7 | +import pl.waw.ipipan.zil.summ.nicolas.features.Interpretation; | |
8 | +import weka.core.Attribute; | |
9 | + | |
10 | +import java.io.File; | |
11 | +import java.io.IOException; | |
12 | +import java.nio.file.Files; | |
13 | +import java.util.*; | |
14 | +import java.util.stream.Collectors; | |
15 | +import java.util.stream.Stream; | |
16 | + | |
17 | + | |
18 | +public class MentionFeatureExtractor extends FeatureExtractor { | |
19 | + | |
20 | + private final List<String> frequentBases = Lists.newArrayList(); | |
21 | + | |
22 | + public MentionFeatureExtractor() { | |
23 | + | |
24 | + //coref | |
25 | + addNumericAttributeNormalized("chain_length"); | |
26 | + | |
27 | + // text characteristics | |
28 | + addNumericAttribute("text_char_count"); addNumericAttribute("text_token_count"); | |
29 | + addNumericAttribute("text_sent_count"); | |
30 | + addNumericAttribute("text_par_count"); | |
31 | + addNumericAttribute("text_mention_count"); | |
32 | + addNumericAttribute("text_cluster_count"); | |
33 | + | |
34 | + //mention characteristics | |
35 | + for (String prefix : Lists.newArrayList("mention", "chain_first_mention")) { | |
36 | + // mention characteristics | |
37 | + addNumericAttributeNormalized(prefix + "_index"); | |
38 | + addNumericAttributeNormalized(prefix + "_index_in_sent"); | |
39 | + addNumericAttributeNormalized(prefix + "_index_in_par"); | |
40 | + addNumericAttributeNormalized(prefix + "_index_in_chain"); | |
41 | + addBinaryAttribute(prefix + "_capitalized"); | |
42 | + addBinaryAttribute(prefix + "_all_caps"); | |
43 | + addNumericAttributeNormalized(prefix + "_char_count"); | |
44 | + addNumericAttributeNormalized(prefix + "_token_count"); | |
45 | + addBinaryAttribute(prefix + "_is_zero"); | |
46 | + addBinaryAttribute(prefix + "_is_named"); | |
47 | + addBinaryAttribute(prefix + "_is_pronoun"); | |
48 | + addNominalAttribute(prefix + "_ctag", Lists.newArrayList("other", "null", "impt", "subst", "aglt", "ppron3", "ger", "praet", "fin", "num", "interp", "siebie", "brev", "interj", "ppron12", "adj", "burk", "pcon", "bedzie", "adv", "prep", "depr", "xxx", "winien", "conj", "qub", "adja", "ppas", "comp", "pact")); | |
49 | + addNominalAttribute(prefix + "_person", Lists.newArrayList("other", "null", "pri", "sec", "ter")); | |
50 | + addNominalAttribute(prefix + "_case", Lists.newArrayList("other", "null", "nom", "acc", "dat", "gen", "loc", "inst", "voc")); | |
51 | + addNominalAttribute(prefix + "_number", Lists.newArrayList("other", "null", "sg", "pl")); | |
52 | + addNominalAttribute(prefix + "_gender", Lists.newArrayList("other", "null", "f", "m1", "m2", "m3", "n")); | |
53 | + | |
54 | + // relation to other | |
55 | + addBinaryAttribute(prefix + "_is_nested"); | |
56 | + addBinaryAttribute(prefix + "_is_nesting"); | |
57 | + | |
58 | + // par characteristics | |
59 | + addNumericAttributeNormalized(prefix + "_par_idx"); | |
60 | + addNumericAttributeNormalized(prefix + "_par_token_count"); | |
61 | + addNumericAttributeNormalized(prefix + "_par_sent_count"); | |
62 | + | |
63 | + // sent characteristics | |
64 | + addNumericAttributeNormalized(prefix + "_sent_token_count"); | |
65 | + addNumericAttributeNormalized(prefix + "_sent_mention_count"); | |
66 | + addNumericAttributeNormalized(prefix + "_sent_idx"); | |
67 | + addNumericAttributeNormalized(prefix + "_sent_idx_in_par"); | |
68 | + addBinaryAttribute(prefix + "_sent_ends_with_dot"); | |
69 | + addBinaryAttribute(prefix + "_sent_ends_with_questionmark"); | |
70 | + | |
71 | + // frequent bases | |
72 | + if (frequentBases.isEmpty()) loadFrequentBases(); // load once, not once per prefix | |
73 | + for (String base : frequentBases) { | |
74 | + addBinaryAttribute(prefix + "_" + encodeBase(base)); | |
75 | + } | |
76 | + } | |
77 | + | |
78 | + addNominalAttribute("score", Lists.newArrayList("bad", "good")); | |
79 | + fillSortedAttributes("score"); | |
80 | + } | |
81 | + | |
82 | + private String encodeBase(String base) { | |
83 | + return "base_equal_" + base.replaceAll(" ", "_").replaceAll("\"", "Q"); | |
84 | + } | |
85 | + | |
86 | + private void loadFrequentBases() { | |
87 | + try (Stream<String> lines = Files.lines(new File("frequent_bases.txt").toPath())) { | |
88 | + // try-with-resources closes the underlying file handle | |
89 | + this.frequentBases.addAll(lines.map(String::trim).collect(Collectors.toList())); | |
90 | + } catch (IOException e) { | |
91 | + LOG.error("Error reading frequent bases file.", e); | |
92 | + } | |
93 | + } | |
94 | + | |
95 | + public Map<TMention, Map<Attribute, Double>> calculateFeatures(TText preprocessedText) { | |
96 | + Map<TMention, Map<Attribute, Double>> result = Maps.newHashMap(); | |
97 | + | |
98 | + FeatureHelper helper = new FeatureHelper(preprocessedText); | |
99 | + | |
100 | + addScoreFeature(result, helper.getMentions()); | |
101 | + | |
102 | + for (TMention mention : helper.getMentions()) { | |
103 | + Map<Attribute, Double> attribute2value = result.get(mention); | |
104 | + | |
105 | + //mention | |
106 | + addMentionAttributes(helper, mention, attribute2value, "mention"); | |
107 | + | |
108 | + //first chain mention | |
109 | + TMention firstChainMention = helper.getFirstChainMention(mention); | |
110 | + addMentionAttributes(helper, firstChainMention, attribute2value, "chain_first_mention"); | |
111 | + | |
112 | + //coref | |
113 | + attribute2value.put(getAttributeByName("chain_length"), (double) helper.getChainLength(mention)); | |
114 | + | |
115 | + //text | |
116 | + List<TParagraph> pars = preprocessedText.getParagraphs(); | |
117 | + List<TSentence> sents = pars.stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList()); | |
118 | + List<TToken> tokens = sents.stream().flatMap(s -> s.getTokens().stream()).collect(Collectors.toList()); | |
119 | + attribute2value.put(getAttributeByName("text_char_count"), tokens.stream().mapToDouble(t -> t.getOrth().length()).sum()); | |
120 | + attribute2value.put(getAttributeByName("text_token_count"), (double) tokens.size()); | |
121 | + attribute2value.put(getAttributeByName("text_sent_count"), (double) sents.size()); | |
122 | + attribute2value.put(getAttributeByName("text_par_count"), (double) pars.size()); | |
123 | + attribute2value.put(getAttributeByName("text_mention_count"), (double) helper.getMentions().size()); | |
124 | + attribute2value.put(getAttributeByName("text_cluster_count"), (double) helper.getClusters().size()); | |
125 | + | |
126 | + assert attribute2value.size() <= getAttributesList().size(); // normalized values are filled in below | |
127 | + } | |
128 | + addNormalizedAttributeValues(result); | |
129 | + | |
130 | + return result; | |
131 | + } | |
132 | + | |
133 | + private void addMentionAttributes(FeatureHelper helper, TMention mention, Map<Attribute, Double> attribute2value, String attributePrefix) { | |
134 | + // mention characteristics | |
135 | + attribute2value.put(getAttributeByName(attributePrefix + "_index"), (double) helper.getMentionIndex(mention)); | |
136 | + attribute2value.put(getAttributeByName(attributePrefix + "_index_in_sent"), (double) helper.getMentionIndexInSent(mention)); | |
137 | + attribute2value.put(getAttributeByName(attributePrefix + "_index_in_par"), (double) helper.getMentionIndexInPar(mention)); | |
138 | + attribute2value.put(getAttributeByName(attributePrefix + "_index_in_chain"), (double) helper.getMentionIndexInChain(mention)); | |
139 | + attribute2value.put(getAttributeByName(attributePrefix + "_token_count"), (double) mention.getChildIdsSize()); | |
140 | + attribute2value.put(getAttributeByName(attributePrefix + "_is_zero"), toBinary(mention.isZeroSubject())); | |
141 | + attribute2value.put(getAttributeByName(attributePrefix + "_is_pronoun"), toBinary(helper.getMentionHeadToken(mention).getChosenInterpretation().getCtag().matches("ppron.*"))); | |
142 | + attribute2value.put(getAttributeByName(attributePrefix + "_is_named"), toBinary(helper.isMentionNamedEntity(mention))); | |
143 | + | |
144 | + Interpretation interp = new Interpretation(helper.getMentionHeadToken(mention).getChosenInterpretation()); | |
145 | + addNominalAttributeValue(interp.getCtag(), attribute2value, attributePrefix + "_ctag"); | |
146 | + addNominalAttributeValue(interp.getPerson(), attribute2value, attributePrefix + "_person"); | |
147 | + addNominalAttributeValue(interp.getNumber(), attribute2value, attributePrefix + "_number"); | |
148 | + addNominalAttributeValue(interp.getGender(), attribute2value, attributePrefix + "_gender"); | |
149 | + addNominalAttributeValue(interp.getCase(), attribute2value, attributePrefix + "_case"); | |
150 | + | |
151 | + // relation to other mentions | |
152 | + attribute2value.put(getAttributeByName(attributePrefix + "_is_nested"), toBinary(helper.isNested(mention))); | |
153 | + attribute2value.put(getAttributeByName(attributePrefix + "_is_nesting"), toBinary(helper.isNesting(mention))); | |
154 | + | |
155 | + String orth = helper.getMentionOrth(mention); | |
156 | + attribute2value.put(getAttributeByName(attributePrefix + "_capitalized"), toBinary(orth.length() != 0 && orth.substring(0, 1).toUpperCase().equals(orth.substring(0, 1)))); | |
157 | + attribute2value.put(getAttributeByName(attributePrefix + "_all_caps"), toBinary(orth.toUpperCase().equals(orth))); | |
158 | + attribute2value.put(getAttributeByName(attributePrefix + "_char_count"), (double) orth.length()); | |
159 | + | |
160 | + // par characteristics | |
161 | + TParagraph mentionParagraph = helper.getMentionParagraph(mention); | |
162 | + attribute2value.put(getAttributeByName(attributePrefix + "_par_idx"), (double) helper.getParIndex(mentionParagraph)); | |
163 | + attribute2value.put(getAttributeByName(attributePrefix + "_par_token_count"), mentionParagraph.getSentences().stream().map(s -> s.getTokens().size()).mapToDouble(s -> s).sum()); | |
164 | + attribute2value.put(getAttributeByName(attributePrefix + "_par_sent_count"), (double) mentionParagraph.getSentences().size()); | |
165 | + | |
166 | + // sent characteristics | |
167 | + TSentence mentionSentence = helper.getMentionSentence(mention); | |
168 | + attribute2value.put(getAttributeByName(attributePrefix + "_sent_token_count"), (double) mentionSentence.getTokensSize()); | |
169 | + attribute2value.put(getAttributeByName(attributePrefix + "_sent_mention_count"), (double) mentionSentence.getMentions().size()); | |
170 | + attribute2value.put(getAttributeByName(attributePrefix + "_sent_idx"), (double) helper.getSentIndex(mentionSentence)); | |
171 | + attribute2value.put(getAttributeByName(attributePrefix + "_sent_idx_in_par"), (double) helper.getSentIndexInPar(mentionSentence)); | |
172 | + attribute2value.put(getAttributeByName(attributePrefix + "_sent_ends_with_dot"), toBinary(helper.getSentenceLastTokenOrth(mentionSentence).equals("."))); | |
173 | + attribute2value.put(getAttributeByName(attributePrefix + "_sent_ends_with_questionmark"), toBinary(helper.getSentenceLastTokenOrth(mentionSentence).equals("?"))); | |
174 | + | |
175 | + // frequent bases | |
176 | + String mentionBase = helper.getMentionBase(mention); | |
177 | + for (String base : frequentBases) { | |
178 | + attribute2value.put(getAttributeByName(attributePrefix + "_" + encodeBase(base)), toBinary(mentionBase.equals(base))); | |
179 | + } | |
180 | + } | |
181 | + | |
182 | + private void addNominalAttributeValue(String value, Map<Attribute, Double> attribute2value, String attributeName) { | |
183 | + Attribute att = getAttributeByName(attributeName); | |
184 | + int index = att.indexOfValue(value); | |
185 | + if (index == -1) | |
186 | + LOG.warn(value + " not found for attribute " + attributeName); | |
187 | + attribute2value.put(att, (double) (index == -1 ? att.indexOfValue("other") : index)); | |
188 | + } | |
189 | + | |
190 | + | |
191 | + private void addScoreFeature(Map<TMention, Map<Attribute, Double>> result, List<TMention> mentions) { | |
192 | + for (TMention m : mentions) { | |
193 | + Map<Attribute, Double> map = Maps.newHashMap(); | |
194 | + map.put(getAttributeByName("score"), weka.core.Utils.missingValue()); | |
195 | + result.put(m, map); | |
196 | + } | |
197 | + } | |
198 | + | |
199 | +} | |
... | ... |
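addNominalAttributeValue encodes a nominal value as the double index of that value within the Weka Attribute, mapping unseen values to the index of "other". A minimal sketch of that encoding (assuming Weka on the classpath):

```java
import weka.core.Attribute;

import java.util.Arrays;

public class NominalEncodingDemo {
    public static void main(String[] args) {
        Attribute caseAtt = new Attribute("mention_case",
                Arrays.asList("other", "null", "nom", "acc", "dat", "gen", "loc", "inst", "voc"));
        System.out.println(caseAtt.indexOfValue("gen")); // 5
        // An unseen value returns -1 and would be mapped to indexOfValue("other") == 0.
        System.out.println(caseAtt.indexOfValue("abl")); // -1
    }
}
```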
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.mention; | |
2 | + | |
3 | +import com.google.common.collect.Sets; | |
4 | +import org.slf4j.Logger; | |
5 | +import org.slf4j.LoggerFactory; | |
6 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | |
7 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
8 | +import pl.waw.ipipan.zil.summ.nicolas.Utils; | |
9 | +import weka.classifiers.Classifier; | |
10 | +import weka.core.Instance; | |
11 | +import weka.core.Instances; | |
12 | + | |
13 | +import java.util.Map; | |
14 | +import java.util.Set; | |
15 | + | |
16 | +public class MentionModel { | |
17 | + | |
18 | + private static final Logger LOG = LoggerFactory.getLogger(MentionModel.class); | |
19 | + | |
20 | + public static Set<TMention> detectGoodMentions(Classifier classifier, MentionFeatureExtractor featureExtractor, TText text) throws Exception { | |
21 | + Set<TMention> goodMentions = Sets.newHashSet(); | |
22 | + | |
23 | + Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | |
24 | + Map<TMention, Instance> mention2instance = Utils.extractInstancesFromMentions(text, featureExtractor); | |
25 | + for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) { | |
26 | + Instance instance = entry.getValue(); | |
27 | + instance.setDataset(instances); | |
28 | + instance.setClassMissing(); | |
29 | + boolean good = classifier.classifyInstance(instance) > 0.5; | |
30 | + if (good) | |
31 | + goodMentions.add(entry.getKey()); | |
32 | + } | |
33 | + LOG.info("Good mentions: " + goodMentions.size() + " of " + mention2instance.size()); | |
34 | + return goodMentions; | |
35 | + } | |
36 | + | |
37 | +} | |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionScorer.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionScorer.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.mention; | |
2 | + | |
3 | +import com.google.common.collect.HashMultiset; | |
4 | +import com.google.common.collect.Maps; | |
5 | +import com.google.common.collect.Multiset; | |
6 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | |
7 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | |
8 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
9 | +import pl.waw.ipipan.zil.summ.nicolas.Utils; | |
10 | + | |
11 | +import java.util.Collection; | |
12 | +import java.util.List; | |
13 | +import java.util.Map; | |
14 | +import java.util.stream.Collectors; | |
15 | + | |
16 | +public class MentionScorer { | |
17 | + | |
18 | + | |
19 | + public Map<TMention, Double> calculateMentionScores(String optimalSummary, TText text) { | |
20 | + Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase())); | |
21 | + | |
22 | + List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList()); | |
23 | + Map<TMention, String> mention2Orth = Utils.loadMention2Orth(sentences); | |
24 | + | |
25 | + return booleanTokenIntersection(mention2Orth, tokenCounts); | |
26 | + } | |
27 | + | |
28 | + private static Map<TMention, Double> booleanTokenIntersection(Map<TMention, String> mention2Orth, Multiset<String> tokenCounts) { | |
29 | + Map<TMention, Double> mention2score = Maps.newHashMap(); | |
30 | + for (Map.Entry<TMention, String> entry : mention2Orth.entrySet()) { | |
31 | + TMention mention = entry.getKey(); | |
32 | + String mentionOrth = mention2Orth.get(mention); | |
33 | + for (String token : Utils.tokenize(mentionOrth)) { | |
34 | + if (tokenCounts.contains(token.toLowerCase())) { | |
35 | + mention2score.put(mention, 1.0); | |
36 | + break; | |
37 | + } | |
38 | + } | |
39 | + mention2score.putIfAbsent(mention, 0.0); | |
40 | + } | |
41 | + return mention2score; | |
42 | + } | |
43 | + | |
44 | + private static Map<TMention, Double> booleanTokenInclusion(Map<TMention, String> mention2Orth, Multiset<String> tokenCounts) { | |
45 | + Map<TMention, Double> mention2score = Maps.newHashMap(); | |
46 | + for (Map.Entry<TMention, String> entry : mention2Orth.entrySet()) { | |
47 | + TMention mention = entry.getKey(); | |
48 | + String mentionOrth = mention2Orth.get(mention); | |
49 | + int present = 0; | |
50 | + for (String token : Utils.tokenize(mentionOrth)) { | |
51 | + if (tokenCounts.contains(token.toLowerCase())) { | |
52 | + present++; | |
53 | + } | |
54 | + } | |
55 | + mention2score.putIfAbsent(mention, ((present * 2) >= Utils.tokenize(mentionOrth).size()) ? 1.0 : 0.0); | |
56 | + } | |
57 | + return mention2score; | |
58 | + } | |
59 | +} | |
... | ... |
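booleanTokenIntersection assigns a mention the score 1.0 as soon as any one of its tokens appears in the optimal summary, and 0.0 otherwise. A self-contained sketch with Guava's HashMultiset, as in the class above; the Polish tokens are illustrative:

```java
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;

import java.util.Arrays;

public class IntersectionDemo {
    public static void main(String[] args) {
        // Lowercased tokens of a (hypothetical) optimal summary.
        Multiset<String> summaryTokens = HashMultiset.create(
                Arrays.asList("rada", "miasta", "przyjęła", "budżet"));
        String mentionOrth = "budżet gminy";
        boolean hit = Arrays.stream(mentionOrth.split("[^\\p{L}0-9]+"))
                .anyMatch(t -> summaryTokens.contains(t.toLowerCase()));
        System.out.println(hit ? 1.0 : 0.0); // 1.0 -- "budżet" occurs in the summary
    }
}
```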
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/PrepareTrainingData.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/PrepareTrainingData.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.mention; | |
2 | + | |
3 | +import com.google.common.base.Charsets; | |
4 | +import com.google.common.collect.Maps; | |
5 | +import com.google.common.io.Files; | |
6 | +import org.slf4j.Logger; | |
7 | +import org.slf4j.LoggerFactory; | |
8 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | |
9 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
10 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
11 | +import pl.waw.ipipan.zil.summ.nicolas.Utils; | |
12 | +import weka.core.Instance; | |
13 | +import weka.core.Instances; | |
14 | +import weka.core.converters.ArffSaver; | |
15 | + | |
16 | +import java.io.File; | |
17 | +import java.io.IOException; | |
18 | +import java.util.Map; | |
19 | + | |
20 | + | |
21 | +public class PrepareTrainingData { | |
22 | + | |
23 | + private static final Logger LOG = LoggerFactory.getLogger(PrepareTrainingData.class); | |
24 | + | |
25 | + public static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev"; | |
26 | + public static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev"; | |
27 | + | |
28 | + public static void main(String[] args) throws IOException { | |
29 | + | |
30 | + Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(PREPROCESSED_FULL_TEXTS_DIR_PATH); | |
31 | + Map<String, String> id2optimalSummary = loadOptimalSummaries(); | |
32 | + | |
33 | + MentionScorer mentionScorer = new MentionScorer(); | |
34 | + MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor(); | |
35 | + | |
36 | + Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | |
37 | + | |
38 | + int i = 1; | |
39 | + for (String textId : id2preprocessedText.keySet()) { | |
40 | + LOG.info(i++ + "/" + id2preprocessedText.size()); | |
41 | + | |
42 | + TText preprocessedText = id2preprocessedText.get(textId); | |
43 | + String optimalSummary = id2optimalSummary.get(textId); | |
44 | + if (optimalSummary == null) | |
45 | + continue; | |
46 | + Map<TMention, Double> mention2score = mentionScorer.calculateMentionScores(optimalSummary, preprocessedText); | |
47 | + | |
48 | + Map<TMention, Instance> mention2instance = Utils.extractInstancesFromMentions(preprocessedText, featureExtractor); | |
49 | + for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) { | |
50 | + TMention mention = entry.getKey(); | |
51 | + Instance instance = entry.getValue(); | |
52 | + instance.setDataset(instances); | |
53 | + instance.setClassValue(mention2score.get(mention)); | |
54 | + instances.add(instance); | |
55 | + } | |
56 | + } | |
57 | + saveInstancesToFile(instances); | |
58 | + } | |
59 | + | |
60 | + private static void saveInstancesToFile(Instances instances) throws IOException { | |
61 | + ArffSaver saver = new ArffSaver(); | |
62 | + saver.setInstances(instances); | |
63 | + saver.setFile(new File(Constants.MENTIONS_DATASET_PATH)); | |
64 | + saver.writeBatch(); | |
65 | + } | |
66 | + | |
67 | + private static Map<String, String> loadOptimalSummaries() throws IOException { | |
68 | + Map<String, String> id2optimalSummary = Maps.newHashMap(); | |
69 | + for (File optimalSummaryFile : new File(OPTIMAL_SUMMARIES_DIR_PATH).listFiles()) { | |
70 | + String optimalSummary = Files.toString(optimalSummaryFile, Charsets.UTF_8); | |
71 | + id2optimalSummary.put(optimalSummaryFile.getName().split("_")[0], optimalSummary); | |
72 | + } | |
73 | + LOG.info(id2optimalSummary.size() + " optimal summaries found."); | |
74 | + return id2optimalSummary; | |
75 | + } | |
76 | + | |
77 | + | |
78 | +} | |
... | ... |
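A subtlety worth noting: the class attribute is nominal ("bad", "good"), and instance.setClassValue(mention2score.get(mention)) works because Weka interprets a double class value on a nominal attribute as a value index, so 0.0 selects "bad" and 1.0 selects "good". A minimal sketch (assuming Weka on the classpath):

```java
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;

import java.util.ArrayList;
import java.util.Arrays;

public class ClassValueDemo {
    public static void main(String[] args) {
        ArrayList<Attribute> attrs = new ArrayList<>(Arrays.asList(
                new Attribute("score", Arrays.asList("bad", "good")),
                new Attribute("feature")));
        Instances data = new Instances("Demo", attrs, 0);
        data.setClassIndex(0);
        DenseInstance inst = new DenseInstance(2);
        inst.setDataset(data);
        inst.setClassValue(1.0); // double value 1.0 is interpreted as index 1 -> "good"
        System.out.println(inst.stringValue(data.classAttribute())); // good
    }
}
```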
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/TrainModel.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/TrainModel.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.mention; | |
2 | + | |
3 | +import org.apache.commons.lang3.time.StopWatch; | |
4 | +import org.slf4j.Logger; | |
5 | +import org.slf4j.LoggerFactory; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
7 | +import weka.classifiers.Classifier; | |
8 | +import weka.core.Instances; | |
9 | +import weka.core.converters.ArffLoader; | |
10 | + | |
11 | +import java.io.File; | |
12 | +import java.io.FileOutputStream; | |
13 | +import java.io.ObjectOutputStream; | |
14 | + | |
15 | + | |
16 | +public class TrainModel { | |
17 | + private static final Logger LOG = LoggerFactory.getLogger(TrainModel.class); | |
18 | + | |
19 | + public static void main(String[] args) throws Exception { | |
20 | + | |
21 | + ArffLoader loader = new ArffLoader(); | |
22 | + loader.setFile(new File(Constants.MENTIONS_DATASET_PATH)); | |
23 | + Instances instances = loader.getDataSet(); | |
24 | + instances.setClassIndex(0); | |
25 | + LOG.info(instances.size() + " instances loaded."); | |
26 | + LOG.info(instances.numAttributes() + " attributes for each instance."); | |
27 | + | |
28 | + StopWatch watch = new StopWatch(); | |
29 | + watch.start(); | |
30 | + | |
31 | + Classifier classifier = Constants.getClassifier(); | |
32 | + | |
33 | + LOG.info("Building classifier..."); | |
34 | + classifier.buildClassifier(instances); | |
35 | + LOG.info("...done."); | |
36 | + | |
37 | + try (ObjectOutputStream oos = new ObjectOutputStream( | |
38 | + new FileOutputStream(Constants.MENTIONS_MODEL_PATH))) { | |
39 | + oos.writeObject(classifier); | |
40 | + } | |
41 | + | |
42 | + watch.stop(); | |
43 | + LOG.info("Elapsed time: " + watch); | |
44 | + | |
45 | + LOG.info(classifier.toString()); | |
46 | + } | |
47 | +} | |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Crossvalidate.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Crossvalidate.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.mention.test; | |
2 | + | |
3 | +import org.apache.commons.lang3.time.StopWatch; | |
4 | +import org.slf4j.Logger; | |
5 | +import org.slf4j.LoggerFactory; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
7 | +import weka.classifiers.Classifier; | |
8 | +import weka.classifiers.evaluation.Evaluation; | |
9 | +import weka.core.Instances; | |
10 | +import weka.core.converters.ArffLoader; | |
11 | + | |
12 | +import java.io.File; | |
13 | +import java.util.Random; | |
14 | + | |
15 | + | |
16 | +public class Crossvalidate { | |
17 | + | |
18 | + private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class); | |
19 | + | |
20 | + public static void main(String[] args) throws Exception { | |
21 | + | |
22 | + ArffLoader loader = new ArffLoader(); | |
23 | + loader.setFile(new File(Constants.MENTIONS_DATASET_PATH)); | |
24 | + Instances instances = loader.getDataSet(); | |
25 | + instances.setClassIndex(0); | |
26 | + LOG.info(instances.size() + " instances loaded."); | |
27 | + LOG.info(instances.numAttributes() + " attributes for each instance."); | |
28 | + | |
32 | + StopWatch watch = new StopWatch(); | |
33 | + watch.start(); | |
34 | + | |
35 | + Classifier classifier = Constants.getClassifier(); | |
36 | + | |
37 | + Evaluation eval = new Evaluation(instances); | |
38 | + eval.crossValidateModel(classifier, instances, 10, new Random(1)); | |
39 | + LOG.info(eval.toSummaryString()); | |
40 | + | |
41 | + watch.stop(); | |
42 | + LOG.info("Elapsed time: " + watch); | |
43 | + } | |
44 | +} | |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Validate.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Validate.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.mention.test; | |
2 | + | |
3 | +import org.apache.commons.lang3.time.StopWatch; | |
4 | +import org.slf4j.Logger; | |
5 | +import org.slf4j.LoggerFactory; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
7 | +import weka.classifiers.Classifier; | |
8 | +import weka.classifiers.evaluation.Evaluation; | |
9 | +import weka.core.Instances; | |
10 | +import weka.core.converters.ArffLoader; | |
11 | + | |
12 | +import java.io.File; | |
13 | +import java.io.FileInputStream; | |
14 | +import java.io.IOException; | |
15 | +import java.io.ObjectInputStream; | |
16 | + | |
17 | +/** | |
18 | + * Evaluates the trained mention model on the mention training dataset. | |
19 | + */ | |
20 | +public class Validate { | |
21 | + private static final Logger LOG = LoggerFactory.getLogger(Validate.class); | |
22 | + | |
23 | + public static void main(String[] args) throws Exception { | |
24 | + | |
25 | + ArffLoader loader = new ArffLoader(); | |
26 | + loader.setFile(new File(Constants.MENTIONS_DATASET_PATH)); | |
27 | + Instances instances = loader.getDataSet(); | |
28 | + instances.setClassIndex(0); | |
29 | + LOG.info(instances.size() + " instances loaded."); | |
30 | + LOG.info(instances.numAttributes() + " attributes for each instance."); | |
31 | + | |
32 | + Classifier classifier = loadClassifier(); | |
33 | + | |
34 | + StopWatch watch = new StopWatch(); | |
35 | + watch.start(); | |
36 | + | |
37 | + Evaluation eval = new Evaluation(instances); | |
38 | + eval.evaluateModel(classifier, instances); // note: evaluates on the training data itself, so results are optimistic | |
39 | + | |
40 | + LOG.info(eval.toSummaryString()); | |
41 | + | |
42 | + watch.stop(); | |
43 | + LOG.info("Elapsed time: " + watch); | |
44 | + } | |
45 | + | |
46 | + private static Classifier loadClassifier() throws IOException, ClassNotFoundException { | |
47 | + LOG.info("Loading classifier..."); | |
48 | + try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(Constants.MENTIONS_MODEL_PATH))) { | |
49 | + Classifier classifier = (Classifier) ois.readObject(); | |
50 | + LOG.info("Done. " + classifier.toString()); | |
51 | + return classifier; | |
52 | + } | |
53 | + } | |
54 | +} | |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/PrepareTrainingData.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/PrepareTrainingData.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.sentence; | |
2 | + | |
3 | +import com.google.common.base.Charsets; | |
4 | +import com.google.common.collect.Maps; | |
5 | +import com.google.common.io.Files; | |
6 | +import org.apache.logging.log4j.LogManager; | |
7 | +import org.apache.logging.log4j.Logger; | |
8 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | |
9 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | |
10 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
11 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
12 | +import pl.waw.ipipan.zil.summ.nicolas.Utils; | |
13 | +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | |
14 | +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; | |
15 | +import weka.classifiers.Classifier; | |
16 | +import weka.core.Instance; | |
17 | +import weka.core.Instances; | |
18 | +import weka.core.converters.ArffSaver; | |
19 | + | |
20 | +import java.io.File; | |
21 | +import java.io.IOException; | |
22 | +import java.util.Map; | |
23 | +import java.util.Set; | |
24 | + | |
25 | + | |
26 | +public class PrepareTrainingData { | |
27 | + | |
28 | + private static final Logger LOG = LogManager.getLogger(PrepareTrainingData.class); | |
29 | + | |
30 | + private static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev"; | |
31 | + private static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev"; | |
32 | + | |
33 | + public static void main(String[] args) throws Exception { | |
34 | + | |
35 | + Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(PREPROCESSED_FULL_TEXTS_DIR_PATH); | |
36 | + Map<String, String> id2optimalSummary = loadOptimalSummaries(); | |
37 | + | |
38 | + SentenceScorer sentenceScorer = new SentenceScorer(); | |
39 | + SentenceFeatureExtractor featureExtractor = new SentenceFeatureExtractor(); | |
40 | + | |
41 | + Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | |
42 | + | |
43 | + Classifier classifier = Utils.loadClassifier(Constants.MENTIONS_MODEL_PATH); | |
44 | + MentionFeatureExtractor mentionFeatureExtractor = new MentionFeatureExtractor(); | |
45 | + | |
46 | + int i = 1; | |
47 | + for (String textId : id2preprocessedText.keySet()) { | |
48 | + LOG.info(i++ + "/" + id2preprocessedText.size()); | |
49 | + | |
50 | + TText preprocessedText = id2preprocessedText.get(textId); | |
51 | + String optimalSummary = id2optimalSummary.get(textId); | |
52 | + if (optimalSummary == null) | |
53 | + continue; | |
54 | + Map<TSentence, Double> sentence2score = sentenceScorer.calculateSentenceScores(optimalSummary, preprocessedText); | |
55 | + | |
56 | + Set<TMention> goodMentions | |
57 | + = MentionModel.detectGoodMentions(classifier, mentionFeatureExtractor, preprocessedText); | |
58 | + // alternatively, train on gold-standard mentions instead of model predictions: | |
59 | + // Set<TMention> goodMentions = Utils.loadGoldGoodMentions(textId, preprocessedText, true); | |
60 | + | |
61 | + Map<TSentence, Instance> sentence2instance = Utils.extractInstancesFromSentences(preprocessedText, featureExtractor, goodMentions); | |
62 | + for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { | |
63 | + TSentence sentence = entry.getKey(); | |
64 | + Instance instance = entry.getValue(); | |
65 | + instance.setDataset(instances); | |
66 | + instance.setClassValue(sentence2score.get(sentence)); | |
67 | + instances.add(instance); | |
68 | + } | |
69 | + } | |
70 | + saveInstancesToFile(instances); | |
71 | + } | |
72 | + | |
73 | + private static void saveInstancesToFile(Instances instances) throws IOException { | |
74 | + ArffSaver saver = new ArffSaver(); | |
75 | + saver.setInstances(instances); | |
76 | + saver.setFile(new File(Constants.SENTENCES_DATASET_PATH)); | |
77 | + saver.writeBatch(); | |
78 | + } | |
79 | + | |
80 | + private static Map<String, String> loadOptimalSummaries() throws IOException { | |
81 | + Map<String, String> id2optimalSummary = Maps.newHashMap(); | |
82 | + for (File optimalSummaryFile : new File(OPTIMAL_SUMMARIES_DIR_PATH).listFiles()) { | |
83 | + String optimalSummary = Files.toString(optimalSummaryFile, Charsets.UTF_8); | |
84 | + id2optimalSummary.put(optimalSummaryFile.getName().split("_")[0], optimalSummary); | |
85 | + } | |
86 | + LOG.info(id2optimalSummary.size() + " optimal summaries found."); | |
87 | + return id2optimalSummary; | |
88 | + } | |
89 | + | |
90 | + | |
91 | +} | |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.sentence; | |
2 | + | |
3 | +import com.google.common.collect.Maps; | |
4 | +import pl.waw.ipipan.zil.multiservice.thrift.types.*; | |
5 | +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; | |
7 | +import weka.core.Attribute; | |
8 | + | |
9 | +import java.util.List; | |
10 | +import java.util.Map; | |
11 | +import java.util.Set; | |
12 | +import java.util.stream.Collectors; | |
13 | + | |
14 | +public class SentenceFeatureExtractor extends FeatureExtractor { | |
15 | + | |
16 | + public SentenceFeatureExtractor() { | |
17 | + | |
18 | + addNumericAttributeNormalized("sent_mention_cluster_count"); | |
19 | + addNumericAttributeNormalized("sent_good_mention_cluster_count"); | |
20 | + addNumericAttributeNormalized("sent_good_mention_cluster_good_count"); | |
21 | + addNumericAttributeNormalized("sent_cluster_count"); | |
22 | + addNumericAttributeNormalized("sent_good_cluster_count"); | |
23 | + addNumericAttributeNormalized("sent_mention_count"); | |
24 | + addNumericAttributeNormalized("sent_good_mention_count"); | |
25 | + | |
26 | + addNumericAttributeNormalized("sent_token_length"); | |
27 | + addNumericAttributeNormalized("sent_idx"); | |
28 | + addNumericAttributeNormalized("sent_idx_in_par"); | |
29 | + addBinaryAttribute("sent_ends_with_dot"); | |
30 | + addBinaryAttribute("sent_ends_with_questionmark"); | |
31 | + | |
32 | + addNumericAttributeNormalized("par_idx"); | |
33 | + addNumericAttributeNormalized("par_token_count"); | |
34 | + addNumericAttributeNormalized("par_sent_count"); | |
35 | + | |
36 | + addNumericAttribute("text_token_count"); | |
37 | + addNumericAttribute("text_sent_count"); | |
38 | + addNumericAttribute("text_par_count"); | |
39 | + addNumericAttribute("text_mention_count"); | |
40 | + addNumericAttribute("text_cluster_count"); | |
41 | + | |
42 | + addNumericAttribute("score"); | |
43 | + fillSortedAttributes("score"); | |
44 | + } | |
45 | + | |
46 | + public Map<TSentence, Map<Attribute, Double>> calculateFeatures(TText preprocessedText, Set<TMention> goodMentions) { | |
47 | + | |
48 | + int sentenceIdx = 0; | |
49 | + int parIdx = 0; | |
50 | + | |
51 | + FeatureHelper helper = new FeatureHelper(preprocessedText); | |
52 | + List<TParagraph> pars = preprocessedText.getParagraphs(); | |
53 | + List<TSentence> sents = pars.stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList()); | |
54 | + List<TToken> tokens = sents.stream().flatMap(s -> s.getTokens().stream()).collect(Collectors.toList()); | |
55 | + | |
56 | + Map<TSentence, Map<Attribute, Double>> sentence2features = Maps.newLinkedHashMap(); | |
57 | + for (TParagraph paragraph : preprocessedText.getParagraphs()) { | |
58 | + int sentenceIdxInPar = 0; | |
59 | + for (TSentence sentence : paragraph.getSentences()) { | |
60 | + Map<Attribute, Double> feature2value = Maps.newHashMap(); | |
61 | + | |
62 | + feature2value.put(getAttributeByName("sent_mention_cluster_count"), sentence.getMentions().stream().mapToDouble(helper::getChainLength).sum()); | |
63 | + feature2value.put(getAttributeByName("sent_good_mention_cluster_count"), sentence.getMentions().stream().filter(goodMentions::contains).mapToDouble(helper::getChainLength).sum()); | |
64 | + feature2value.put(getAttributeByName("sent_good_mention_cluster_good_count"), (double) sentence.getMentions().stream().filter(goodMentions::contains).flatMap(m -> helper.getCoreferentMentions(m).stream()).filter(goodMentions::contains).count()); | |
65 | + feature2value.put(getAttributeByName("sent_cluster_count"), (double) sentence.getMentions().stream().map(helper::getMentionCluster).collect(Collectors.toSet()).size()); | |
66 | + feature2value.put(getAttributeByName("sent_good_cluster_count"), (double) sentence.getMentions().stream().filter(goodMentions::contains).map(helper::getMentionCluster).collect(Collectors.toSet()).size()); | |
67 | + feature2value.put(getAttributeByName("sent_mention_count"), (double) sentence.getMentions().size()); | |
68 | + feature2value.put(getAttributeByName("sent_good_mention_count"), (double) sentence.getMentions().stream().filter(goodMentions::contains).count()); | |
69 | + | |
70 | + feature2value.put(getAttributeByName("sent_token_length"), (double) sentence.getTokens().size()); | |
71 | + feature2value.put(getAttributeByName("sent_idx_in_par"), (double) sentenceIdxInPar); | |
72 | + feature2value.put(getAttributeByName("sent_idx"), (double) sentenceIdx); | |
73 | + feature2value.put(getAttributeByName("sent_ends_with_dot"), toBinary(helper.getSentenceLastTokenOrth(sentence).equals("."))); | |
74 | + feature2value.put(getAttributeByName("sent_ends_with_questionmark"), toBinary(helper.getSentenceLastTokenOrth(sentence).equals("?"))); | |
75 | + | |
76 | + feature2value.put(getAttributeByName("par_idx"), (double) parIdx); | |
77 | + feature2value.put(getAttributeByName("par_token_count"), paragraph.getSentences().stream().map(s -> s.getTokens().size()).mapToDouble(s -> s).sum()); | |
78 | + feature2value.put(getAttributeByName("par_sent_count"), (double) paragraph.getSentences().size()); | |
79 | + | |
80 | + feature2value.put(getAttributeByName("text_char_count"), tokens.stream().mapToDouble(t -> t.getOrth().length()).sum()); | |
81 | + feature2value.put(getAttributeByName("text_token_count"), (double) tokens.size()); | |
82 | + feature2value.put(getAttributeByName("text_sent_count"), (double) sents.size()); | |
83 | + feature2value.put(getAttributeByName("text_par_count"), (double) pars.size()); | |
84 | + feature2value.put(getAttributeByName("text_mention_count"), (double) helper.getMentions().size()); | |
85 | + feature2value.put(getAttributeByName("text_cluster_count"), (double) helper.getClusters().size()); | |
86 | + | |
87 | + feature2value.put(getAttributeByName("score"), weka.core.Utils.missingValue()); | |
88 | + | |
90 | + assert (feature2value.size() == getAttributesList().size()); | |
91 | + | |
92 | + sentence2features.put(sentence, feature2value); | |
93 | + | |
94 | + sentenceIdx++; | |
95 | + sentenceIdxInPar++; | |
96 | + } | |
97 | + parIdx++; | |
98 | + } | |
99 | + addNormalizedAttributeValues(sentence2features); | |
100 | + | |
101 | + return sentence2features; | |
102 | + } | |
103 | +} | |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceScorer.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceScorer.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.sentence; | |
2 | + | |
3 | +import com.google.common.collect.HashMultiset; | |
4 | +import com.google.common.collect.Maps; | |
5 | +import com.google.common.collect.Multiset; | |
6 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; | |
7 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | |
8 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
9 | +import pl.waw.ipipan.zil.summ.nicolas.Utils; | |
10 | + | |
11 | +import java.util.List; | |
12 | +import java.util.Map; | |
13 | + | |
14 | +public class SentenceScorer { | |
15 | + public Map<TSentence, Double> calculateSentenceScores(String optimalSummary, TText preprocessedText) { | |
16 | + Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase())); | |
17 | + | |
18 | + Map<TSentence, Double> sentence2score = Maps.newHashMap(); | |
19 | + for (TParagraph paragraph : preprocessedText.getParagraphs()) { | |
20 | + for (TSentence sentence : paragraph.getSentences()) { | |
21 | + double score = 0.0; | |
22 | + | |
23 | + String orth = Utils.loadSentence2Orth(sentence); | |
24 | + List<String> tokens = Utils.tokenize(orth); | |
25 | + for (String token : tokens) { | |
26 | + score += tokenCounts.contains(token.toLowerCase()) ? 1.0 : 0.0; | |
27 | + } | |
28 | + sentence2score.put(sentence, tokens.isEmpty() ? 0.0 : score / tokens.size()); | |
29 | + } | |
30 | + } | |
30 | + return sentence2score; | |
31 | + } | |
32 | +} | |
... | ... |
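Editor's note: the scorer above gives each sentence the fraction of its tokens that also occur in the optimal summary (case-insensitive token overlap). A minimal sketch of the expected behaviour, using hypothetical tokens and only the Guava classes the scorer itself imports (this example is not part of the commit):

    import com.google.common.collect.HashMultiset;
    import com.google.common.collect.Multiset;

    import java.util.Arrays;
    import java.util.List;

    public class SentenceScorerExample {
        public static void main(String[] args) {
            // tokens of a hypothetical optimal summary, lower-cased
            Multiset<String> summaryTokens = HashMultiset.create(Arrays.asList("rząd", "przyjął", "ustawę"));
            // tokens of one candidate sentence
            List<String> sentenceTokens = Arrays.asList("Rząd", "odrzucił", "ustawę");
            long hits = sentenceTokens.stream()
                    .filter(t -> summaryTokens.contains(t.toLowerCase()))
                    .count();
            System.out.println((double) hits / sentenceTokens.size()); // 2 of 3 tokens overlap: ~0.67
        }
    }

Note that the multiset counts are not consulted by contains(), so repeating a token in the summary does not raise a sentence's score.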
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/TrainModel.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/TrainModel.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.sentence; | |
2 | + | |
3 | +import org.apache.commons.lang3.time.StopWatch; | |
4 | +import org.slf4j.Logger; | |
5 | +import org.slf4j.LoggerFactory; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
7 | +import weka.classifiers.Classifier; | |
8 | +import weka.core.Instances; | |
9 | +import weka.core.converters.ArffLoader; | |
10 | + | |
11 | +import java.io.File; | |
12 | +import java.io.FileOutputStream; | |
13 | +import java.io.ObjectOutputStream; | |
14 | + | |
15 | + | |
16 | +public class TrainModel { | |
17 | + private static final Logger LOG = LoggerFactory.getLogger(TrainModel.class); | |
18 | + | |
19 | + public static void main(String[] args) throws Exception { | |
20 | + | |
21 | + ArffLoader loader = new ArffLoader(); | |
22 | + loader.setFile(new File(Constants.SENTENCES_DATASET_PATH)); | |
23 | + Instances instances = loader.getDataSet(); | |
24 | + instances.setClassIndex(0); | |
25 | + LOG.info(instances.size() + " instances loaded."); | |
26 | + LOG.info(instances.numAttributes() + " attributes for each instance."); | |
27 | + | |
28 | + StopWatch watch = new StopWatch(); | |
29 | + watch.start(); | |
30 | + | |
31 | + Classifier classifier = Constants.getSentencesClassifier(); | |
32 | + | |
33 | + LOG.info("Building classifier..."); | |
34 | + classifier.buildClassifier(instances); | |
35 | + LOG.info("...done."); | |
36 | + | |
37 | + try (ObjectOutputStream oos = new ObjectOutputStream( | |
38 | + new FileOutputStream(Constants.SENTENCES_MODEL_PATH))) { | |
39 | + oos.writeObject(classifier); | |
40 | + } | |
41 | + | |
42 | + watch.stop(); | |
43 | + LOG.info("Elapsed time: " + watch); | |
44 | + | |
45 | + LOG.info(classifier.toString()); | |
46 | + } | |
47 | +} | |
... | ... |
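Editor's note: there is an ordering dependency between the two training stages — sentence/PrepareTrainingData loads the serialized mention model from Constants.MENTIONS_MODEL_PATH, so the mention stage must run to completion first. A hedged sketch of the overall pipeline; the TrainAll wrapper below is illustrative only and not part of this commit:

    // Illustrative wrapper; the commit itself runs each main() separately.
    public final class TrainAll {
        public static void main(String[] args) throws Exception {
            // 1. score mentions against optimal summaries -> mentions_train.arff
            pl.waw.ipipan.zil.summ.nicolas.mention.PrepareTrainingData.main(args);
            // 2. train the mention classifier -> mentions_model.bin
            pl.waw.ipipan.zil.summ.nicolas.mention.TrainModel.main(args);
            // 3. score sentences, using the mention model trained above -> sentences_train.arff
            pl.waw.ipipan.zil.summ.nicolas.sentence.PrepareTrainingData.main(args);
            // 4. train the sentence classifier -> sentences_model.bin
            pl.waw.ipipan.zil.summ.nicolas.sentence.TrainModel.main(args);
        }
    }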
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/test/Crossvalidate.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/test/Crossvalidate.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.sentence.test; | |
2 | + | |
3 | +import org.apache.commons.lang3.time.StopWatch; | |
4 | +import org.slf4j.Logger; | |
5 | +import org.slf4j.LoggerFactory; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
7 | +import weka.classifiers.Classifier; | |
8 | +import weka.classifiers.evaluation.Evaluation; | |
9 | +import weka.core.Instances; | |
10 | +import weka.core.converters.ArffLoader; | |
11 | + | |
12 | +import java.io.File; | |
13 | +import java.util.Random; | |
14 | + | |
15 | + | |
16 | +public class Crossvalidate { | |
17 | + | |
18 | + private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class); | |
19 | + | |
20 | + public static void main(String[] args) throws Exception { | |
21 | + | |
22 | + ArffLoader loader = new ArffLoader(); | |
23 | + loader.setFile(new File(Constants.SENTENCES_DATASET_PATH)); | |
24 | + Instances instances = loader.getDataSet(); | |
25 | + instances.setClassIndex(0); | |
26 | + LOG.info(instances.size() + " instances loaded."); | |
27 | + LOG.info(instances.numAttributes() + " attributes for each instance."); | |
28 | + | |
29 | + StopWatch watch = new StopWatch(); | |
30 | + watch.start(); | |
31 | + | |
32 | + Classifier classifier = Constants.getSentencesClassifier(); | |
33 | + | |
34 | + Evaluation eval = new Evaluation(instances); | |
35 | + eval.crossValidateModel(classifier, instances, 10, new Random(1)); | |
36 | + LOG.info(eval.toSummaryString()); | |
37 | + | |
38 | + watch.stop(); | |
39 | + LOG.info("Elapsed time: " + watch); | |
40 | + } | |
41 | +} | |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/Zero.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/Zero.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.zero; | |
2 | + | |
3 | +import com.google.common.base.Charsets; | |
4 | +import com.google.common.collect.Lists; | |
5 | +import com.google.common.collect.Maps; | |
6 | +import com.google.common.collect.Sets; | |
7 | +import com.google.common.io.Files; | |
8 | +import org.apache.commons.csv.CSVFormat; | |
9 | +import org.apache.commons.csv.CSVPrinter; | |
10 | +import org.apache.commons.csv.QuoteMode; | |
11 | +import pl.waw.ipipan.zil.multiservice.thrift.types.*; | |
12 | +import pl.waw.ipipan.zil.summ.nicolas.Utils; | |
13 | + | |
14 | +import java.io.File; | |
15 | +import java.io.FileWriter; | |
16 | +import java.io.IOException; | |
17 | +import java.util.Arrays; | |
18 | +import java.util.List; | |
19 | +import java.util.Map; | |
20 | +import java.util.Set; | |
21 | + | |
22 | +/** | |
23 | + * Finds nominative mentions whose coreference cluster also contains a nominative mention in the preceding sentence; such candidate pairs are printed and saved to zeros.tsv. | |
24 | + */ | |
25 | +public class Zero { | |
26 | + | |
27 | + private static final String IDS_PATH = "summaries_dev"; | |
28 | + private static final String THRIFTED_PATH = "src/main/resources/preprocessed_full_texts/dev/"; | |
29 | + | |
30 | + public static void main(String[] args) throws IOException { | |
31 | + | |
32 | + Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(THRIFTED_PATH); | |
33 | + Map<String, List<String>> id2sentIds = loadSentenceIds(IDS_PATH); | |
34 | + | |
35 | + int mentionCount = 0; | |
36 | + int mentionInNom = 0; | |
37 | + int mentionInNomSequential = 0; | |
38 | + | |
39 | + List<List<Object>> rows = Lists.newArrayList(); | |
40 | + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | |
41 | + String textId = entry.getKey(); | |
43 | + | |
44 | + TText text = entry.getValue(); | |
45 | + List<String> sentenceIds = id2sentIds.get(textId); | |
46 | + if (sentenceIds == null) | |
47 | + continue; | |
47 | + | |
48 | + Map<String, Set<String>> mentionId2Cluster = Maps.newHashMap(); | |
49 | + for (TCoreference coreference : text.getCoreferences()) { | |
50 | + for (String mentionId : coreference.getMentionIds()) { | |
51 | + mentionId2Cluster.put(mentionId, Sets.newHashSet(coreference.getMentionIds())); | |
52 | + } | |
53 | + } | |
54 | + | |
55 | + Set<String> prevSentenceNominativeMentionIds = Sets.newHashSet(); | |
56 | + TSentence prevSentence = null; | |
57 | + for (TParagraph p : text.getParagraphs()) { | |
58 | + Map<TMention, String> tMentionStringMap = Utils.loadMention2Orth(p.getSentences()); | |
59 | + | |
60 | + for (TSentence sentence : p.getSentences()) { | |
61 | + if (!sentenceIds.contains(sentence.getId())) | |
62 | + continue; | |
63 | + Set<String> currentSentenceNominativeMentionIds = Sets.newHashSet(); | |
64 | + | |
65 | + Map<String, TToken> tokenId2Token = Maps.newHashMap(); | |
66 | + for (TToken t : sentence.getTokens()) | |
67 | + tokenId2Token.put(t.getId(), t); | |
68 | + | |
69 | + for (TMention mention : sentence.getMentions()) { | |
70 | + mentionCount++; | |
71 | + | |
72 | + for (String tokenId : mention.getHeadIds()) { | |
73 | + TInterpretation interp = tokenId2Token.get(tokenId).getChosenInterpretation(); | |
74 | + if (isInNominative(interp)) { | |
75 | + mentionInNom++; | |
76 | + | |
77 | + currentSentenceNominativeMentionIds.add(mention.getId()); | |
78 | + if (mentionId2Cluster.get(mention.getId()).stream().anyMatch(prevSentenceNominativeMentionIds::contains)) { | |
79 | + mentionInNomSequential++; | |
80 | + System.out.println(tMentionStringMap.get(mention) | |
81 | + + "\n\t" + Utils.loadSentence2Orth(prevSentence) | |
82 | + + "\n\t" + Utils.loadSentence2Orth(sentence)); | |
83 | + | |
84 | + List<Object> row = Lists.newArrayList(); | |
85 | + row.add("C"); | |
86 | + row.add(textId); | |
87 | + row.add(tMentionStringMap.get(mention)); | |
88 | + row.add(Utils.loadSentence2Orth(prevSentence)); | |
89 | + row.add(Utils.loadSentence2Orth(sentence)); | |
90 | + rows.add(row); | |
91 | + } | |
92 | + break; | |
93 | + } | |
94 | + } | |
95 | + } | |
96 | + | |
97 | + prevSentence = sentence; | |
98 | + prevSentenceNominativeMentionIds = currentSentenceNominativeMentionIds; | |
99 | + } | |
100 | + } | |
101 | + } | |
102 | + | |
103 | + System.out.println(mentionCount + " mentions"); | |
104 | + System.out.println(mentionInNom + " mention in nom"); | |
105 | + System.out.println(mentionInNomSequential + " mention in nom with previous in nom"); | |
106 | + | |
107 | + try (CSVPrinter csvPrinter = new CSVPrinter(new FileWriter("zeros.tsv"), CSVFormat.DEFAULT.withDelimiter('\t').withEscape('\\').withQuoteMode(QuoteMode.NONE).withQuote('"'))) { | |
108 | + for (List<Object> row : rows) { | |
109 | + csvPrinter.printRecord(row); | |
110 | + } | |
111 | + } | |
112 | + | |
113 | + } | |
114 | + | |
115 | + private static boolean isInNominative(TInterpretation interp) { | |
116 | + return interp.getCtag().equals("subst") && Arrays.stream(interp.getMsd().split(":")).anyMatch(t -> t.equals("nom")); | |
117 | + } | |
118 | + | |
119 | + private static Map<String, List<String>> loadSentenceIds(String idsPath) throws IOException { | |
120 | + Map<String, List<String>> result = Maps.newHashMap(); | |
121 | + for (File f : new File(idsPath).listFiles()) { | |
122 | + String id = f.getName().split("_")[0]; | |
123 | + List<String> sentenceIds = Files.readLines(f, Charsets.UTF_8); | |
124 | + result.put(id, sentenceIds); | |
125 | + } | |
126 | + return result; | |
127 | + } | |
128 | +} | |
... | ... |
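Editor's note: the isInNominative check above assumes the NKJP-style tagset convention — a noun's chosen interpretation has ctag "subst" and a colon-separated msd in which one segment is "nom". A small sketch of the same logic in isolation, with made-up sample tags (not part of the commit):

    import java.util.Arrays;

    public class NominativeCheckExample {
        // same test as Zero.isInNominative, inlined on raw tag strings
        static boolean isNominative(String ctag, String msd) {
            return ctag.equals("subst") && Arrays.stream(msd.split(":")).anyMatch("nom"::equals);
        }

        public static void main(String[] args) {
            System.out.println(isNominative("subst", "sg:nom:f"));   // true
            System.out.println(isNominative("subst", "sg:gen:f"));   // false: genitive
            System.out.println(isNominative("adj", "sg:nom:f:pos")); // false: not a noun
        }
    }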
nicolas-model/pom.xml
0 → 100644
1 | +++ a/nicolas-model/pom.xml | |
1 | +<?xml version="1.0" encoding="UTF-8"?> | |
2 | +<project xmlns="http://maven.apache.org/POM/4.0.0" | |
3 | + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | |
4 | + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | |
5 | + <modelVersion>4.0.0</modelVersion> | |
6 | + <parent> | |
7 | + <artifactId>nicolas-container</artifactId> | |
8 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
9 | + <version>1.0-SNAPSHOT</version> | |
10 | + </parent> | |
11 | + | |
12 | + <artifactId>nicolas-model</artifactId> | |
13 | + | |
14 | +</project> | |
0 | 15 | \ No newline at end of file |
... | ... |
nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/frequent_bases.txt
0 → 100644
1 | +++ a/nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/frequent_bases.txt | |
1 | +on | |
2 | +to | |
3 | +co | |
4 | +rok | |
5 | +być | |
6 | +wszystko | |
7 | +polska | |
8 | +człowiek | |
9 | +sobie | |
10 | +raz | |
11 | +my | |
12 | +mieć | |
13 | +czas | |
14 | +państwo | |
15 | +praca | |
16 | +osoba | |
17 | +sprawa | |
18 | +ja | |
19 | +kraj | |
20 | +pieniądz | |
21 | +nikt | |
22 | +kto | |
23 | +przykład | |
24 | +nic | |
25 | +koniec | |
26 | +rząd | |
27 | +prawo | |
28 | +życie | |
29 | +miejsce | |
30 | +móc | |
31 | +fot | |
32 | +problem | |
33 | +władza | |
34 | +miesiąc | |
35 | +rzecz | |
36 | +stan | |
37 | +świat | |
38 | +wszyscy | |
39 | +mówić | |
40 | +rozmowa | |
41 | +coś | |
42 | +sytuacja | |
43 | +powód | |
44 | +początek | |
45 | +wiedzieć | |
46 | +dzień | |
47 | +uwaga | |
48 | +strona | |
49 | +udział | |
50 | +in | |
51 | +musieć | |
52 | +polityk | |
53 | +ktoś | |
54 | +ogół | |
55 | +polityka | |
56 | +chcieć | |
57 | +walka | |
58 | +zmiana | |
59 | +decyzja | |
60 | +ciąg | |
61 | +m . | |
62 | +pan | |
63 | +szansa | |
64 | +polak | |
65 | +przypadek | |
66 | +większość | |
67 | +pytanie | |
68 | +wzgląd | |
69 | +warszawa | |
70 | +proca | |
71 | +pomoc | |
72 | +prezydent | |
73 | +społeczeństwo | |
74 | +wynik | |
75 | +dziecko | |
76 | +prawda | |
77 | +związek | |
78 | +gospodarka | |
79 | +część | |
80 | +wojna | |
81 | +tydzień | |
82 | +granica | |
83 | +głos | |
84 | +przyszłość | |
85 | +autor | |
86 | +wybory | |
87 | +rynek | |
88 | +cel | |
89 | +ustawa | |
90 | +uważać | |
91 | +ten rok | |
92 | +droga | |
93 | +dom | |
94 | +rys | |
95 | +myśleć | |
96 | +firma | |
97 | +zasada | |
98 | +fakt | |
99 | +kolej | |
100 | +nadzieja | |
101 | +dolar | |
102 | +wraz | |
103 | +miasto | |
104 | +rozwój | |
105 | +ten sposób | |
106 | +europa | |
107 | +temat | |
108 | +siła | |
109 | +rodzina | |
110 | +minister | |
111 | +historia | |
112 | +wpływ | |
113 | +współpraca | |
114 | +środek | |
115 | +informacja | |
116 | +procent | |
117 | +wniosek | |
118 | +unia europejski | |
119 | +niemcy | |
120 | +podstawa | |
121 | +reforma | |
122 | +partia | |
123 | +interes | |
124 | +ten sprawa | |
125 | +kandydat | |
126 | +sukces | |
127 | +sposób | |
128 | +wątpliwość | |
129 | +złoty | |
130 | +sld | |
131 | +pracownik | |
132 | +stanowisko | |
133 | +dyskusja | |
134 | +telewizja | |
135 | +pewność | |
136 | +odpowiedź | |
137 | +rzeczywistość | |
138 | +program | |
139 | +cena | |
140 | +działanie | |
141 | +system | |
142 | +unia | |
143 | +ręka | |
144 | +odpowiedzialność | |
145 | +środowisko | |
146 | +solidarność | |
147 | +demokracja | |
148 | +maić | |
149 | +ramy | |
150 | +badanie | |
151 | +media | |
152 | +wartość | |
153 | +wybór | |
154 | +głowa | |
155 | +zostać | |
156 | +usa | |
157 | +pracować | |
158 | +porozumienie | |
159 | +widzieć | |
160 | +zdanie | |
161 | +akcja | |
162 | +wolność | |
163 | +spotkanie | |
164 | +przeszłość | |
165 | +stosunek | |
166 | +okazja | |
167 | +prowadzić | |
168 | +zachód | |
169 | +kobieta | |
170 | +obywatel | |
171 | +sąd | |
172 | +ubiegły rok | |
173 | +dziennikarz | |
174 | +kultura | |
175 | +grupa | |
176 | +opinia publiczny | |
177 | +obrona | |
178 | +bezpieczeństwo | |
179 | +opinia | |
180 | +rzeczpospolita | |
181 | +dokument | |
182 | +racja | |
183 | +szkoła | |
184 | +góra | |
185 | +warunek | |
186 | +organizacja | |
187 | +oko | |
188 | +godzina | |
189 | +tysiąc | |
190 | +ten czas | |
191 | +możliwość | |
192 | +błąd | |
193 | +ziemia | |
194 | +parlament | |
195 | +ten pora | |
196 | +chwila | |
197 | +naród | |
198 | +konflikt | |
199 | +działalność | |
200 | +sejm | |
201 | +powrót | |
202 | +premier | |
203 | +działać | |
204 | +rada | |
205 | +zdrowie | |
206 | +wiek | |
207 | +dodatek | |
208 | +poziom | |
209 | +widzenie | |
210 | +żyć | |
211 | +powiedzieć | |
212 | +inwestycja | |
213 | +rosja | |
214 | +niemiec | |
215 | +samochód | |
216 | +skutek | |
217 | +punkt | |
218 | +rola | |
219 | +mieszkaniec | |
220 | +wyborca | |
221 | +koszt | |
222 | +budżet | |
223 | +szef | |
224 | +styczeń | |
225 | +instytucja | |
226 | +pełnia | |
227 | +ulica | |
228 | +aws | |
229 | +ochrona | |
230 | +dostęp | |
231 | +zagrożenie | |
232 | +zgoda | |
233 | +ue | |
234 | +" rzeczpospolita " | |
235 | +liczba | |
236 | +wieś | |
237 | +połowa | |
0 | 238 | \ No newline at end of file |
... | ... |
nicolas-train/pom.xml
0 → 100644
1 | +++ a/nicolas-train/pom.xml | |
1 | +<?xml version="1.0" encoding="UTF-8"?> | |
2 | +<project xmlns="http://maven.apache.org/POM/4.0.0" | |
3 | + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | |
4 | + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | |
5 | + <modelVersion>4.0.0</modelVersion> | |
6 | + <parent> | |
7 | + <artifactId>nicolas-container</artifactId> | |
8 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
9 | + <version>1.0-SNAPSHOT</version> | |
10 | + </parent> | |
11 | + | |
12 | + <artifactId>nicolas-train</artifactId> | |
13 | + | |
14 | +</project> | |
0 | 15 | \ No newline at end of file |
... | ... |
nicolas-zero/pom.xml
0 → 100644
1 | +++ a/nicolas-zero/pom.xml | |
1 | +<?xml version="1.0" encoding="UTF-8"?> | |
2 | +<project xmlns="http://maven.apache.org/POM/4.0.0" | |
3 | + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | |
4 | + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | |
5 | + <modelVersion>4.0.0</modelVersion> | |
6 | + <parent> | |
7 | + <artifactId>nicolas-container</artifactId> | |
8 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
9 | + <version>1.0-SNAPSHOT</version> | |
10 | + </parent> | |
11 | + | |
12 | + <artifactId>nicolas-zero</artifactId> | |
13 | + | |
14 | +</project> | |
0 | 15 | \ No newline at end of file |
... | ... |
pom.xml
0 → 100644
1 | +++ a/pom.xml | |
1 | +<?xml version="1.0" encoding="UTF-8"?> | |
2 | +<project xmlns="http://maven.apache.org/POM/4.0.0" | |
3 | + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | |
4 | + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | |
5 | + <modelVersion>4.0.0</modelVersion> | |
6 | + | |
7 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
8 | + <artifactId>nicolas-container</artifactId> | |
9 | + <packaging>pom</packaging> | |
10 | + <version>1.0-SNAPSHOT</version> | |
11 | + | |
12 | + <modules> | |
13 | + <module>nicolas-core</module> | |
14 | + <module>nicolas-cli</module> | |
15 | + <module>nicolas-model</module> | |
16 | + <module>nicolas-train</module> | |
17 | + <module>nicolas-zero</module> | |
18 | + </modules> | |
19 | + | |
20 | + <properties> | |
21 | + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> | |
22 | + <java.version.build>1.8</java.version.build> | |
23 | + </properties> | |
24 | + | |
25 | + <prerequisites> | |
26 | + <maven>3.0.5</maven> | |
27 | + </prerequisites> | |
28 | + | |
29 | + <developers> | |
30 | + <developer> | |
31 | + <name>Mateusz Kopeć</name> | |
32 | + <organization>ICS PAS</organization> | |
33 | + <email>m.kopec@ipipan.waw.pl</email> | |
34 | + </developer> | |
35 | + </developers> | |
36 | + | |
37 | + <dependencies> | |
38 | + <dependency> | |
39 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
40 | + <artifactId>pscapi</artifactId> | |
41 | + <version>1.0-SNAPSHOT</version> | |
42 | + </dependency> | |
43 | + <dependency> | |
44 | + <groupId>pl.waw.ipipan.zil.multiservice</groupId> | |
45 | + <artifactId>utils</artifactId> | |
46 | + <version>1.0-SNAPSHOT</version> | |
47 | + </dependency> | |
48 | + | |
49 | + <dependency> | |
50 | + <groupId>org.apache.commons</groupId> | |
51 | + <artifactId>commons-csv</artifactId> | |
52 | + <version>1.3</version> | |
53 | + </dependency> | |
54 | + <dependency> | |
55 | + <groupId>com.google.guava</groupId> | |
56 | + <artifactId>guava</artifactId> | |
57 | + <version>19.0</version> | |
58 | + </dependency> | |
59 | + <dependency> | |
60 | + <groupId>nz.ac.waikato.cms.weka</groupId> | |
61 | + <artifactId>weka-dev</artifactId> | |
62 | + <version>3.9.0</version> | |
63 | + </dependency> | |
64 | + <dependency> | |
65 | + <groupId>org.apache.commons</groupId> | |
66 | + <artifactId>commons-lang3</artifactId> | |
67 | + <version>3.4</version> | |
68 | + </dependency> | |
69 | + <dependency> | |
70 | + <groupId>commons-io</groupId> | |
71 | + <artifactId>commons-io</artifactId> | |
72 | + <version>2.5</version> | |
73 | + </dependency> | |
74 | + </dependencies> | |
75 | + | |
76 | + | |
77 | + <build> | |
78 | + <plugins> | |
79 | + <plugin> | |
80 | + <groupId>org.apache.maven.plugins</groupId> | |
81 | + <artifactId>maven-compiler-plugin</artifactId> | |
82 | + <version>3.1</version> | |
83 | + <configuration> | |
84 | + <source>${java.version.build}</source> | |
85 | + <target>${java.version.build}</target> | |
86 | + </configuration> | |
87 | + </plugin> | |
88 | + </plugins> | |
89 | + </build> | |
90 | + | |
91 | + <distributionManagement> | |
92 | + <repository> | |
93 | + <id>deployment</id> | |
94 | + <url>http://maven.nlp.ipipan.waw.pl/content/repositories/releases/</url> | |
95 | + </repository> | |
96 | + <snapshotRepository> | |
97 | + <id>deployment</id> | |
98 | + <url>http://maven.nlp.ipipan.waw.pl/content/repositories/snapshots/</url> | |
99 | + </snapshotRepository> | |
100 | + </distributionManagement> | |
101 | +</project> | |
0 | 102 | \ No newline at end of file |
... | ... |