Initial commit (dfbfe3fd) | Commits | core / md

Browse Code »

Commit dfbfe3fdf1b83e7b7fe76d1421ab9d9488227f62

Authored by Mateusz Kopeć 10 years ago

0 parents

master ...

Initial commit

Inline Side-by-side

Showing 82 changed files with 3236 additions and 0 deletions

.gitignore 0 → 100644

View file @dfbfe3f

	1	+++ a/.gitignore
	1	+/target/
	2	+.classpath
	3	+.project
	4	+.settings
...	...

pom.xml 0 → 100644

View file @dfbfe3f

	1	+++ a/pom.xml
	1	+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	2	+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	3	+ <modelVersion>4.0.0</modelVersion>
	4	+ <groupId>pl.waw.ipipan.zil.core</groupId>
	5	+ <artifactId>md</artifactId>
	6	+ <version>1.2-SNAPSHOT</version>
	7	+ <build>
	8	+ <plugins>
	9	+ <plugin>
	10	+ <artifactId>maven-compiler-plugin</artifactId>
	11	+ <version>2.3.2</version>
	12	+ <configuration>
	13	+ <source>1.7</source>
	14	+ <target>1.7</target>
	15	+ </configuration>
	16	+ </plugin>
	17	+ <plugin>
	18	+ <groupId>org.dstovall</groupId>
	19	+ <artifactId>onejar-maven-plugin</artifactId>
	20	+ <version>1.4.4</version>
	21	+ <executions>
	22	+ <execution>
	23	+ <configuration>
	24	+ <mainClass>pl.waw.ipipan.zil.core.md.Main</mainClass>
	25	+ </configuration>
	26	+ <goals>
	27	+ <goal>one-jar</goal>
	28	+ </goals>
	29	+ </execution>
	30	+ </executions>
	31	+ </plugin>
	32	+ </plugins>
	33	+ </build>
	34	+ <dependencies>
	35	+ <dependency>
	36	+ <groupId>log4j</groupId>
	37	+ <artifactId>log4j</artifactId>
	38	+ <version>1.2.17</version>
	39	+ </dependency>
	40	+ <dependency>
	41	+ <groupId>ipipan.multiservice</groupId>
	42	+ <artifactId>MultiserviceUtils</artifactId>
	43	+ <version>1.0-SNAPSHOT</version>
	44	+ </dependency>
	45	+ <dependency>
	46	+ <groupId>ipipan</groupId>
	47	+ <artifactId>teiapi</artifactId>
	48	+ <version>1.0-SNAPSHOT</version>
	49	+ </dependency>
	50	+ <dependency>
	51	+ <groupId>junit</groupId>
	52	+ <artifactId>junit</artifactId>
	53	+ <version>4.11</version>
	54	+ </dependency>
	55	+ <dependency>
	56	+ <groupId>nz.ac.waikato.cms.weka</groupId>
	57	+ <artifactId>weka-stable</artifactId>
	58	+ <version>3.6.10</version>
	59	+ </dependency>
	60	+ </dependencies>
	61	+ <repositories>
	62	+ <repository>
	63	+ <id>zil-maven-repo</id>
	64	+ <name>ZIL maven repository</name>
	65	+ <url>http://maven.nlp.ipipan.waw.pl/content/repositories/snapshots</url>
	66	+ </repository>
	67	+ </repositories>
	68	+ <pluginRepositories>
	69	+ <pluginRepository>
	70	+ <id>onejar-maven-plugin.googlecode.com</id>
	71	+ <url>http://onejar-maven-plugin.googlecode.com/svn/mavenrepo</url>
	72	+ </pluginRepository>
	73	+ </pluginRepositories>
	74	+</project>
...	...

src/main/java/pl/waw/ipipan/zil/core/md/Main.java 0 → 100644

View file @dfbfe3f

	1	+++ a/src/main/java/pl/waw/ipipan/zil/core/md/Main.java
	1	+package pl.waw.ipipan.zil.core.md;
	2	+
	3	+import ipipan.clarin.tei.api.entities.TEICorpusText;
	4	+import ipipan.clarin.tei.api.exceptions.TEIException;
	5	+import ipipan.clarin.tei.api.io.IOUtils;
	6	+
	7	+import java.io.File;
	8	+import java.io.FileInputStream;
	9	+import java.io.IOException;
	10	+import java.io.InputStream;
	11	+
	12	+import org.apache.log4j.Logger;
	13	+
	14	+import pl.waw.ipipan.multiservice.thrift.types.MultiserviceException;
	15	+import pl.waw.ipipan.multiservice.thrift.types.TText;
	16	+import pl.waw.ipipan.zil.core.md.detection.Detector;
	17	+import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector;
	18	+import pl.waw.ipipan.zil.core.md.entities.Text;
	19	+import pl.waw.ipipan.zil.core.md.io.tei.TeiLoader;
	20	+import pl.waw.ipipan.zil.core.md.io.tei.TeiSaver;
	21	+import pl.waw.ipipan.zil.core.md.io.thrift.ThriftLoader;
	22	+import pl.waw.ipipan.zil.core.md.io.thrift.ThriftSaver;
	23	+
	24	+/**
	25	+ * @author Mateusz Kopeć
	26	+ *
	27	+ */
	28	+public class Main {
	29	+
	30	+ private final static Logger logger = Logger.getLogger(Main.class);
	31	+ private final static boolean GZIP_OUTPUT = true;
	32	+
	33	+ private final static String DEFAULT_ZERO_SUBJECT_MODEL = "/zero_subject_model.bin";
	34	+
	35	+ private static ZeroSubjectDetector zeroSubjectModel;
	36	+ static {
	37	+ InputStream zeroSubjectDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_ZERO_SUBJECT_MODEL);
	38	+ zeroSubjectModel = new ZeroSubjectDetector(zeroSubjectDetectionModelStream);
	39	+ }
	40	+
	41	+ /**
	42	+ * Main method for detecting mentions in corpus encoded in Tei format.
	43	+ *
	44	+ * @param args
	45	+ * @throws TEIException
	46	+ */
	47	+ public static void main(String[] args) {
	48	+
	49	+ if (args.length != 2 && args.length != 3) {
	50	+ logger.error("Wrong usage! should be: " + Main.class.getSimpleName()
	51	+ + " input_dir result_dir [zero_subject_model]");
	52	+ return;
	53	+ }
	54	+
	55	+ File inputDir = new File(args[0]);
	56	+ File outputDir = new File(args[1]);
	57	+
	58	+ if (!inputDir.isDirectory()) {
	59	+ logger.error(inputDir + " is not a directory!");
	60	+ return;
	61	+ }
	62	+ if (!outputDir.isDirectory()) {
	63	+ logger.error(outputDir + " is not a directory!");
	64	+ return;
	65	+ }
	66	+ if (args.length == 3) {
	67	+ try {
	68	+ InputStream zeroSubjectDetectionModelStream;
	69	+ zeroSubjectDetectionModelStream = new FileInputStream(new File(args[2]));
	70	+ zeroSubjectModel = new ZeroSubjectDetector(zeroSubjectDetectionModelStream);
	71	+ if (zeroSubjectModel == null)
	72	+ throw new IOException();
	73	+ } catch (IOException e) {
	74	+ logger.error("Unable to load model from file: " + args[2] + ": " + e);
	75	+ return;
	76	+ }
	77	+ }
	78	+
	79	+ int all = 0;
	80	+ int errors = 0;
	81	+ for (File teiDir : IOUtils.getNKJPDirs(inputDir)) {
	82	+ all++;
	83	+ try {
	84	+ File targetDir = createTargetTextDir(inputDir, outputDir, teiDir);
	85	+ TEICorpusText teiText = TeiLoader.readTeiText(teiDir);
	86	+ annotateTeiText(teiText);
	87	+ TeiSaver.saveTeiText(teiText, targetDir, GZIP_OUTPUT);
	88	+ } catch (IOException e) {
	89	+ logger.error("Error processing text in dir:" + teiDir + " Error details: " + e.getLocalizedMessage());
	90	+ errors++;
	91	+ }
	92	+ }
	93	+
	94	+ logger.info(all + " texts processed succesfully.");
	95	+ if (errors > 0)
	96	+ logger.info(errors + " texts not processed.");
	97	+ logger.info(ZeroSubjectDetector.verbsWithoutSubject + " verbs with zero subject detected.");
	98	+ logger.info(ZeroSubjectDetector.verbsWithSubject + " verbs with explicit subject detected.");
	99	+ }
	100	+
	101	+ /**
	102	+ * Find relative path of text directory in the corpus directory and create
	103	+ * similar directory structure in the output corpus directory.
	104	+ *
	105	+ * @param inputCorpusDir
	106	+ * @param outputCorpusDir
	107	+ * @param textDir
	108	+ * @return
	109	+ * @throws IOException
	110	+ */
	111	+ private static File createTargetTextDir(File inputCorpusDir, File outputCorpusDir, File textDir) throws IOException {
	112	+ String relativeDirPath = textDir.toString().substring(inputCorpusDir.toString().length());
	113	+ File targetDir = new File(outputCorpusDir, relativeDirPath);
	114	+ targetDir.mkdirs();
	115	+ if (!targetDir.exists() \|\| !targetDir.isDirectory())
	116	+ throw new IOException("Failed to create output directory at: " + targetDir);
	117	+ return targetDir;
	118	+ }
	119	+
	120	+ /**
	121	+ * Find mentions in Thrift text and update this Thrift text with mention
	122	+ * annotation.
	123	+ *
	124	+ * @param thriftText
	125	+ * @throws MultiserviceException
	126	+ */
	127	+ public static void annotateThriftText(TText thriftText) throws MultiserviceException {
	128	+ Text responseText = ThriftLoader.loadTextFromThrift(thriftText);
	129	+ Detector.findMentionsInText(responseText, zeroSubjectModel);
	130	+ ThriftSaver.updateThriftText(responseText, thriftText);
	131	+ }
	132	+
	133	+ /**
	134	+ * Find mentions in Tei text and update this Tei text with mention
	135	+ * annotation. This method does not save this Tei text on disk.
	136	+ *
	137	+ * @param teiText
	138	+ * @param zeroSubjectModel
	139	+ * @throws TEIException
	140	+ */
	141	+ public static void annotateTeiText(TEICorpusText teiText) throws TEIException {
	142	+ Text responseText = TeiLoader.loadTextFromTei(teiText);
	143	+ Detector.findMentionsInText(responseText, zeroSubjectModel);
	144	+ TeiSaver.updateTeiText(responseText, teiText);
	145	+ }
	146	+
	147	+}
...	...

src/main/java/pl/waw/ipipan/zil/core/md/detection/Cleaner.java 0 → 100644

View file @dfbfe3f

	1	+++ a/src/main/java/pl/waw/ipipan/zil/core/md/detection/Cleaner.java
	1	+package pl.waw.ipipan.zil.core.md.detection;
	2	+
	3	+import java.util.Collection;
	4	+import java.util.HashSet;
	5	+import java.util.List;
	6	+import java.util.Set;
	7	+
	8	+import pl.waw.ipipan.zil.core.md.entities.Mention;
	9	+import pl.waw.ipipan.zil.core.md.entities.Sentence;
	10	+import pl.waw.ipipan.zil.core.md.entities.Token;
	11	+
	12	+public class Cleaner {
	13	+ public static void cleanUnnecessarySentenceMentions(Sentence sentence) {
	14	+ List<Mention> mentions = sentence.getMentions();
	15	+ Collection<Mention> unnecessaryMentions = new HashSet<Mention>();
	16	+
	17	+ for (int i = 0; i < mentions.size(); i++) {
	18	+ Mention m1 = mentions.get(i);
	19	+ for (int j = i + 1; j < mentions.size(); j++) {
	20	+ Mention m2 = mentions.get(j);
	21	+
	22	+ Mention lessImportantMention = getLessImportantMention(m1, m2);
	23	+ Mention moreImportantMention = m1 == lessImportantMention ? m2
	24	+ : m1;
	25	+
	26	+ // same mention borders
	27	+ if (m1.getSegments().equals(m2.getSegments())) {
	28	+ unnecessaryMentions.add(lessImportantMention);
	29	+ // System.out.println("Same borders: "+ m1 +", "+
	30	+ // m2+": "+getLessImportantMention(m1, m2)+" removed");
	31	+ continue;
	32	+ }
	33	+ // same mention heads
	34	+ if (!m1.getHeadSegments().isEmpty()
	35	+ && !m2.getHeadSegments().isEmpty()) {
	36	+ if (m1.getHeadSegments().equals(m2.getHeadSegments())) {
	37	+
	38	+ List<Token> segments = moreImportantMention
	39	+ .getSegments();
	40	+
	41	+ boolean isConj = false;
	42	+ for (Token seg : segments) {
	43	+ if (seg.getChosenInterpretation().getCtag()
	44	+ .equals("conj")) {
	45	+ isConj = true;
	46	+ break;
	47	+ }
	48	+ }
	49	+
	50	+ if (!isConj) {
	51	+ unnecessaryMentions.add(lessImportantMention);
	52	+ // System.out.println("Same heads: " + m1 + ", " +
	53	+ // m2 + ": " + lessImportantMention
	54	+ // + " removed");
	55	+
	56	+ continue;
	57	+ }
	58	+ }
	59	+ }
	60	+
	61	+ // mention head equals whole other mention
	62	+ if (m1.getHeadSegments().isEmpty()
	63	+ && !m2.getHeadSegments().isEmpty()) {
	64	+ if (m2.getHeadSegments().equals(m1.getSegments())) {
	65	+ unnecessaryMentions.add(lessImportantMention);
	66	+ continue;
	67	+ // System.out.println("head is other mention: " + m1 +
	68	+ // ", " + m2 + ": "
	69	+ // + getLessImportantMention(m1, m2) + " removed");
	70	+ }
	71	+ }
	72	+
	73	+ // the same, but other way round
	74	+ if (m2.getHeadSegments().isEmpty()
	75	+ && !m1.getHeadSegments().isEmpty()) {
	76	+
	77	+ if (m1.getHeadSegments().equals(m2.getSegments())) {
	78	+ unnecessaryMentions.add(lessImportantMention);
	79	+ continue;
	80	+ // System.out.println("head is other mention: " + m1 +
	81	+ // ", " + m2 + ": "
	82	+ // + getLessImportantMention(m1, m2) + " removed");
	83	+ }
	84	+ }
	85	+
	86	+ // nie zawieraja sie w sobie, lecz maja czesc wspolna
	87	+ boolean intersect = false;
	88	+
	89	+ Set<Token> notInM1 = new HashSet<Token>(m2.getSegments());
	90	+ notInM1.removeAll(m1.getSegments());
	91	+ if (notInM1.size() < m2.getSegments().size())
	92	+ intersect = true;
	93	+
	94	+ Set<Token> notInM2 = new HashSet<Token>(m1.getSegments());
	95	+ notInM2.removeAll(m2.getSegments());
	96	+ if (notInM2.size() < m1.getSegments().size())
	97	+ intersect = true;
	98	+
	99	+ // if (intersect)
	100	+ // System.out.println(m1+","+m2);
	101	+
	102	+ if (intersect && !notInM1.isEmpty() && !notInM2.isEmpty()) {
	103	+ unnecessaryMentions.add(lessImportantMention);
	104	+ continue;
	105	+ // System.out.println("intersection!" + m1 + ", " + m2 +
	106	+ // ": "
	107	+ // + getLessImportantMention(m1, m2) + " removed");
	108	+ }
	109	+
	110	+ }
	111	+ }
	112	+
	113	+ for (Mention m : unnecessaryMentions)
	114	+ sentence.removeMention(m);
	115	+
	116	+ // heurystyka dla usuwania rzeczy w stylu: [[Ernest][Kwiecien]]
	117	+ unnecessaryMentions.clear();
	118	+
	119	+ OUTER: for (Mention m : sentence.getMentions()) {
	120	+ for (Token seg : m.getSegments())
	121	+ if (seg.getOrth().toLowerCase().equals(seg.getOrth()))
	122	+ continue OUTER;
	123	+
	124	+ //only for children of fully capitalized mentions
	125	+ Set<Mention> allMentions = new HashSet<Mention>();
	126	+ for (Token seg : m.getSegments())
	127	+ for (Mention m2 : seg.getMentions())
	128	+ if (m.getSegments().containsAll(m2.getSegments()))
	129	+ allMentions.add(m2);
	130	+
	131	+ allMentions.remove(m);
	132	+
	133	+ unnecessaryMentions.addAll(allMentions);
	134	+ }
	135	+ for (Mention m : unnecessaryMentions)
	136	+ sentence.removeMention(m);
	137	+ }
	138	+
	139	+ private static Mention getLessImportantMention(Mention m1, Mention m2) {
	140	+ if (m1.getSegments().size() > m2.getSegments().size())
	141	+ return m2;
	142	+ else
	143	+ return m1;
	144	+ }
	145	+}
...	...

src/main/java/pl/waw/ipipan/zil/core/md/detection/Constants.java 0 → 100644

View file @dfbfe3f

	1	+++ a/src/main/java/pl/waw/ipipan/zil/core/md/detection/Constants.java
	1	+package pl.waw.ipipan.zil.core.md.detection;
	2	+
	3	+public class Constants {
	4	+ public static final String MORPHO_NOUN_CTAGS = "subst\|depr\|ger";
	5	+ public static final String MORPHO_VERB_CTAGS = "fin\|bedzie\|aglt\|impt";
	6	+ public static final String MORPHO_PRONOUN_CTAGS = "ppron3\|ppron12";
	7	+ public static final String MORPHO_CTAGS = MORPHO_NOUN_CTAGS + "\|"
	8	+ + MORPHO_PRONOUN_CTAGS;
	9	+ public static final String WORDS_CTAGS = "Noun\|Ppron.*";
	10	+}
...	...

src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java 0 → 100644

View file @dfbfe3f

	1	+++ a/src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java
	1	+package pl.waw.ipipan.zil.core.md.detection;
	2	+
	3	+import java.util.ArrayList;
	4	+import java.util.HashSet;
	5	+import java.util.List;
	6	+import java.util.Set;
	7	+
	8	+import org.apache.log4j.Logger;
	9	+
	10	+import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector;
	11	+import pl.waw.ipipan.zil.core.md.entities.Mention;
	12	+import pl.waw.ipipan.zil.core.md.entities.NamedEntity;
	13	+import pl.waw.ipipan.zil.core.md.entities.Paragraph;
	14	+import pl.waw.ipipan.zil.core.md.entities.Sentence;
	15	+import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup;
	16	+import pl.waw.ipipan.zil.core.md.entities.SyntacticWord;
	17	+import pl.waw.ipipan.zil.core.md.entities.Text;
	18	+import pl.waw.ipipan.zil.core.md.entities.Token;
	19	+
	20	+public class Detector {
	21	+ private static Logger logger = Logger.getLogger(Detector.class);
	22	+
	23	+ public static void findMentionsInText(Text text,
	24	+ ZeroSubjectDetector zeroSubjectModel) {
	25	+ text.clearMentions();
	26	+ logger.debug("Detecting mentions in text " + text.getId());
	27	+ for (Paragraph p : text)
	28	+ for (Sentence s : p)
	29	+ detectMentionsInSentence(s, zeroSubjectModel);
	30	+ }
	31	+
	32	+ private static void detectMentionsInSentence(Sentence sentence,
	33	+ ZeroSubjectDetector zeroSubjectModel) {
	34	+ // adding mentions
	35	+ addMentionsByTokenCtag(sentence);
	36	+ addMentionsBySyntacticWordsCtag(sentence);
	37	+ addMentionsByNamedEntities(sentence);
	38	+ addMentionsByGroups(sentence);
	39	+ addSpeakerMentionsInSpoken(sentence);
	40	+
	41	+ // zero subject detection
	42	+ zeroSubjectModel.addZeroSubjectMentions(sentence);
	43	+
	44	+ // removing mentions
	45	+ removeTo(sentence);
	46	+ Cleaner.cleanUnnecessarySentenceMentions(sentence);
	47	+
	48	+ // updating mention heads
	49	+ updateMentionHeads(sentence);
	50	+ }
	51	+
	52	+ /**
	53	+ * heurystyka ustawiajaca jako glowe pierwszy segment gdy glowy brak
	54	+ *
	55	+ * @param sentence
	56	+ */
	57	+ private static void updateMentionHeads(Sentence sentence) {
	58	+ for (Mention m : sentence.getMentions())
	59	+ if (m.getHeadSegments().isEmpty())
	60	+ m.addHeadSegment(m.getFirstSegment());
	61	+ }
	62	+
	63	+ /**
	64	+ * heurystyka dla "to" w zdaniu z ""jeśli"/"jeżeli"/"skoro""
	65	+ *
	66	+ * @param sentence
	67	+ */
	68	+ private static void removeTo(Sentence sentence) {
	69	+ Set<String> orths = new HashSet<String>();
	70	+ for (Token morph : sentence)
	71	+ orths.add(morph.getOrth());
	72	+
	73	+ if (orths.contains("jeśli") \|\| orths.contains("jeżeli")
	74	+ \|\| orths.contains("skoro")) {
	75	+ for (Mention mention : sentence.getMentions()) {
	76	+ List<Token> mentSegs = mention.getSegments();
	77	+ if (mentSegs.size() == 1
	78	+ && mentSegs.get(0).getBase().equals("to")) {
	79	+ sentence.removeMention(mention);
	80	+ }
	81	+ }
	82	+ }
	83	+ }
	84	+
	85	+ private static void addSpeakerMentionsInSpoken(Sentence sentence) {
	86	+ // heurystyka dla sp1:, sp2:, MarszałekJAkistam:
	87	+ if (sentence.size() > 2) {
	88	+ Token first = sentence.get(0);
	89	+ Token second = sentence.get(1);
	90	+ if (second.getOrth().equals(":")) {
	91	+ sentence.addMention(new Mention(first));
	92	+ }
	93	+ }
	94	+ }
	95	+
	96	+ /**
	97	+ * Wyszukuję i oznaczam wszystkie NG*
	98	+ *
	99	+ * @param sentence
	100	+ */
	101	+ private static void addMentionsByGroups(Sentence sentence) {
	102	+ for (SyntacticGroup group : sentence.getGroups()) {
	103	+ if (group.getType().startsWith("NG")) {
	104	+ List<Token> segments = group.getTokens();
	105	+ List<Token> heads = group.getSemanticHeadTokens();
	106	+
	107	+ sentence.addMention(new Mention(segments, heads));
	108	+ }
	109	+ }
	110	+ }
	111	+
	112	+ /**
	113	+ * Wyszukuję i oznaczam wszystkie NER
	114	+ *
	115	+ * @param sentence
	116	+ */
	117	+ private static void addMentionsByNamedEntities(Sentence sentence) {
	118	+ for (NamedEntity ne : sentence.getNamedEntities()) {
	119	+
	120	+ List<Token> headTokens = new ArrayList<Token>();
	121	+ List<Token> tokens = ne.getTokens();
	122	+
	123	+ boolean containsNoun = false;
	124	+ for (Token seg : tokens) {
	125	+ if (seg.getCtag().matches(Constants.MORPHO_NOUN_CTAGS)) {
	126	+ containsNoun = true;
	127	+ break;
	128	+ }
	129	+ }
	130	+ if (!containsNoun)
	131	+ continue;
	132	+
	133	+ sentence.addMention(new Mention(tokens, headTokens));
	134	+ }
	135	+ }
	136	+
	137	+ /**
	138	+ * @param sentence
	139	+ */
	140	+ private static void addMentionsBySyntacticWordsCtag(Sentence sentence) {
	141	+ for (SyntacticWord w : sentence.getSyntacticWords())
	142	+ if (w.getCtag().matches(Constants.WORDS_CTAGS)) {
	143	+ List<Token> tokens = w.getTokens();
	144	+ if (tokens.size() == 1) {
	145	+ sentence.addMention(new Mention(tokens.get(0)));
	146	+ } else {
	147	+ List<Token> heads = new ArrayList<Token>();
	148	+ sentence.addMention(new Mention(tokens, heads));
	149	+ }
	150	+ }
	151	+ }
	152	+
	153	+ /**
	154	+ * Wyszukuję wszystkie interesujace czesci mowy jesli jest poziom slow
	155	+ * skladniowych, to korzystam z niego zamiast morfoskladni
	156	+ *
	157	+ * @param sentence
	158	+ */
	159	+ private static void addMentionsByTokenCtag(Sentence sentence) {
	160	+ for (Token token : sentence)
	161	+ if (token.getCtag().matches(Constants.MORPHO_CTAGS))
	162	+ sentence.addMention(new Mention(token));
	163	+ }
	164	+}
...	...

src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Constants.java 0 → 100644

View file @dfbfe3f

	1	+++ a/src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Constants.java
	1	+package pl.waw.ipipan.zil.core.md.detection.zero;
	2	+
	3	+import java.util.Arrays;
	4	+import java.util.HashSet;
	5	+import java.util.Set;
	6	+
	7	+public class Constants {
	8	+ final public static Set<String> VERB_TAGS = new HashSet<>(
	9	+ Arrays.asList(new String[] { "fin", "bedzie", "aglt", "praet",
	10	+ "winien" }));
	11	+}
...	...

src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/FeatureGeneration.java 0 → 100644

View file @dfbfe3f

	1	+++ a/src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/FeatureGeneration.java
	1	+package pl.waw.ipipan.zil.core.md.detection.zero;
	2	+
	3	+import ipipan.clarin.tei.api.entities.TEIMention;
	4	+import ipipan.clarin.tei.api.entities.TEIMorph;
	5	+
	6	+import java.util.ArrayList;
	7	+import java.util.Arrays;
	8	+import java.util.HashMap;
	9	+import java.util.HashSet;
	10	+import java.util.Iterator;
	11	+import java.util.LinkedList;
	12	+import java.util.List;
	13	+import java.util.Map;
	14	+import java.util.Set;
	15	+
	16	+import pl.waw.ipipan.zil.core.md.entities.Mention;
	17	+import pl.waw.ipipan.zil.core.md.entities.Sentence;
	18	+import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup;
	19	+import pl.waw.ipipan.zil.core.md.entities.SyntacticWord;
	20	+import pl.waw.ipipan.zil.core.md.entities.Token;
	21	+
	22	+public class FeatureGeneration {
	23	+ final private static Set<String> CLAUSE_SPLIT_LEMMAS = new HashSet<>(Arrays.asList(new String[] { "i", "albo",
	24	+ "lub", "oraz", "bądź", "ani", "czy", "niż", "tudzież", ",", ";", "-", "–", ":" }));
	25	+
	26	+ final private static Set<String> CLAUSE_SPLIT_LEMMAS2 = new HashSet<>(Arrays.asList(new String[] { "a", "ale",
	27	+ "lecz", "jednak", "jednakże", "zaś", "wszakże", "owszem", "natomiast", "tylko", "dlatego", "jedynie",
	28	+ "przecież", "tymczasem", "ponieważ", "więc", "dlatego", "toteż", "zatem" }));
	29	+
	30	+ final private static Set<String> CLAUSE_SPLIT_LEMMAS_STRICT = new HashSet<>(
	31	+ Arrays.asList(new String[] { "?", "!" }));
	32	+
	33	+ final private static Map<String, String> CLAUSE_SPLIT_LEMMAS_PAIRWISE = new HashMap<>();
	34	+ static {
	35	+ CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("(", ")");
	36	+ CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("\"", "\"");
	37	+ CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("'", "'");
	38	+ }
	39	+
	40	+ final private static Set<String> NOUN_TAGS = new HashSet<>(Arrays.asList(new String[] { "subst", "depr", "ppron12",
	41	+ "ppron3", "ger", "num", "numcol" }));
	42	+
	43	+ final private static Set<String> PRONOUN_TAGS = new HashSet<>(Arrays.asList(new String[] { "ppron12", "ppron3" }));
	44	+
	45	+ final private static Set<String> VERB_TAGS = new HashSet<>(Arrays.asList(new String[] { "fin", "bedzie", "aglt",
	46	+ "praet", "winien" }));
	47	+
	48	+ final private static Set<String> ZAIMKI_WZGLEDNE_LEMMAS = new HashSet<>(Arrays.asList(new String[] { "jaki",
	49	+ "który" }));
	50	+
	51	+ public static void generateFeatures(Map<String, Object> features, Token m, Sentence s, Set<String> quasiVerbs) {
	52	+
	53	+ features.put("verbCtag", m.getChosenInterpretation().getCtag());
	54	+ features.put("verbNumber", m.getChosenInterpretation().getNumber());
	55	+ features.put("verbGender", m.getChosenInterpretation().getGender());
	56	+ features.put("verbPerson", m.getChosenInterpretation().getPerson());
	57	+
	58	+ features.put("quasi", quasiVerbs.contains(m.getChosenInterpretation().getBase()));
	59	+
	60	+ features.put("nextCtag", getNeighbouringTag(s, m, 1));
	61	+ features.put("prevCtag", getNeighbouringTag(s, m, -1));
	62	+
	63	+ features.put("isPrevPraet", isPrevPraet(m, s));
	64	+ features.put("isPrevComma", isPrevComma(m, s));
	65	+ features.put("isPrev2Pred", isPrev2Pred(m, s));
	66	+ features.put("isNextInf", isNextInf(m, s));
	67	+
	68	+ List<Token> clause = getClause(s, m);
	69	+ features.put("sentLength", s.size());
	70	+ features.put("clauseLength", clause.size());
	71	+
	72	+ addFeatures(features, clause, "clause", m);
	73	+ addFeatures(features, s, "sent", m);
	74	+ for (int i = 1; i < 6; i++)
	75	+ addFeatures(features, getWindow(s, m, i, 0), "window_" + i + "_" + 0, m);
	76	+ for (int i = 1; i < 6; i++)
	77	+ addFeatures(features, getWindow(s, m, 0, i), "window_" + 0 + "_" + i, m);
	78	+ for (int i = 1; i < 6; i++)
	79	+ addFeatures(features, getWindow(s, m, i, i), "window_" + i + "_" + i, m);
	80	+ }
	81	+
	82	+ private static boolean isNextInf(Token m, Sentence s) {
	83	+ boolean now = false;
	84	+ for (Token morph : s) {
	85	+ if (now)
	86	+ return morph.getChosenInterpretation().getCtag().equals("inf");
	87	+ if (m.equals(morph))
	88	+ now = true;
	89	+ }
	90	+ return false;
	91	+ }
	92	+
	93	+ private static boolean isPrev2Pred(Token m, Sentence s) {
	94	+ Token prev = null;
	95	+ Token prev2 = null;
	96	+ for (Token morph : s) {
	97	+ if (m.equals(morph))
	98	+ break;
	99	+ prev2 = prev;
	100	+ prev = morph;
	101	+ }
	102	+ return (prev != null && prev.getChosenInterpretation().getCtag().equals("pred"))
	103	+ \|\| (prev2 != null && prev2.getChosenInterpretation().getCtag().equals("pred"));
	104	+ }
	105	+
	106	+ private static Object isPrevComma(Token m, Sentence s) {
	107	+ Token prev = null;
	108	+ for (Token morph : s) {
	109	+ if (m.equals(morph))
	110	+ break;
	111	+ prev = morph;
	112	+ }
	113	+ return prev != null && prev.getChosenInterpretation().getBase().equals(",");
	114	+ }
	115	+
	116	+ private static String getNeighbouringTag(Sentence s, Token m, int i) {
	117	+ int idx = s.indexOf(m) + i;
	118	+ if (idx >= s.size() \|\| idx < 0)
	119	+ return "None";
	120	+ return s.get(idx).getChosenInterpretation().getCtag();
	121	+ }
	122	+
	123	+ private static void addFeatures(Map<String, Object> features, List<Token> clause, String prefix, Token m) {
	124	+
	125	+ boolean hasNom = false; // 1
	126	+ boolean hasNum = false; // 2
	127	+ boolean hasPOG = false; // 3
	128	+
	129	+ boolean hasNomNum = false;
	130	+ boolean hasNumPOG = false;
	131	+ boolean hasNomPOG = false;
	132	+ boolean hasNomNumPOG = false;
	133	+
	134	+ boolean has2Nom = false;
	135	+ boolean has2NomPOG = false;
	136	+ boolean has2POG = false;
	137	+
	138	+ Token prev = null;
	139	+ for (Token candidate : clause) {
	140	+
	141	+ if (!isNoun(candidate) \|\| isJakJako(prev)) {
	142	+ prev = candidate;
	143	+ continue;
	144	+ }
	145	+
	146	+ // nom, nom2
	147	+ if (isNom(candidate)) {
	148	+ if (hasNom)
	149	+ has2Nom = true;
	150	+ hasNom = true;
	151	+ }
	152	+ // num
	153	+ if (agreedNum(candidate, m)) {
	154	+ hasNum = true;
	155	+ }
	156	+ // pog, pog2
	157	+ if (agreedGenderOrPerson(candidate, m)) {
	158	+ if (hasPOG)
	159	+ has2POG = true;
	160	+ hasPOG = true;
	161	+ }
	162	+
	163	+ // nom num, nom num pog
	164	+ if (isNom(candidate) && agreedNum(candidate, m)) {
	165	+ if (agreedGenderOrPerson(candidate, m))
	166	+ hasNomNumPOG = true;
	167	+ hasNomNum = true;
	168	+ }
	169	+
	170	+ // nom pog, num pog
	171	+ if (agreedGenderOrPerson(candidate, m))
	172	+ if (isNom(candidate)) {
	173	+ if (hasNomPOG)
	174	+ has2NomPOG = true;
	175	+ hasNomPOG = true;
	176	+ } else if (agreedNum(candidate, m))
	177	+ hasNumPOG = true;
	178	+
	179	+ prev = candidate;
	180	+ }
	181	+
	182	+ // features.put("conj_" + prefix, hasConj);
	183	+ features.put("cand_2_nom_" + prefix, has2Nom);
	184	+ features.put("cand_2_POG_" + prefix, has2POG);
	185	+ features.put("cand_2_nom+POG_" + prefix, has2NomPOG);
	186	+
	187	+ features.put("cand_nom_" + prefix, hasNom);
	188	+ features.put("cand_num_" + prefix, hasNum);
	189	+ features.put("cand_POG_" + prefix, hasPOG);
	190	+
	191	+ features.put("cand_nom+num_" + prefix, hasNomNum);
	192	+ features.put("cand_nom+num+POG_" + prefix, hasNomNumPOG);
	193	+ features.put("cand_nom+POG_" + prefix, hasNomPOG);
	194	+ features.put("cand_num+POG_" + prefix, hasNumPOG);
	195	+ }
	196	+
	197	+ private static List<Token> getWindow(Sentence s, Token m, int pre, int post) {
	198	+
	199	+ int idx = s.indexOf(m);
	200	+ int from = Math.max(0, idx - pre);
	201	+ int to = Math.min(s.size(), idx + post + 1);
	202	+
	203	+ return new ArrayList<>(s.subList(from, to));
	204	+ }
	205	+
	206	+ private static boolean isPrevPraet(Token m, Sentence s) {
	207	+ Token prev = null;
	208	+ for (Token morph : s) {
	209	+ if (m.equals(morph))
	210	+ break;
	211	+ prev = morph;
	212	+ }
	213	+ return prev != null && prev.getChosenInterpretation().getCtag().equals("praet");
	214	+ }
	215	+
	216	+ /**
	217	+ * - podział na klauzule: przecinki oraz spójniki bezprzecinkowe: i, albo,
	218	+ * lub (jak przy streszczeniach: w środku musi być czasownik w formie
	219	+ * osobowej),
	220	+ */
	221	+ public static List<Token> getClause(Sentence s, Token m2) {
	222	+
	223	+ List<List<Token>> sublists = getClauses(s);
	224	+
	225	+ for (List<Token> sub : sublists)
	226	+ for (Token m : sub)
	227	+ if (m.equals(m2))
	228	+ return sub;
	229	+
	230	+ return null;
	231	+ }
	232	+
	233	+ public static List<List<Token>> getClauses(Sentence s) {
	234	+
	235	+ Set<Token> noSplitMorphs = new HashSet<>();
	236	+ for (SyntacticGroup g : s.getGroups()) {
	237	+ for (Token m : g.getTokens().subList(0, g.getTokens().size() - 1)) {
	238	+ noSplitMorphs.add(m);
	239	+ }
	240	+ }
	241	+ for (SyntacticWord g : s.getSyntacticWords()) {
	242	+ for (Token m : g.getTokens().subList(0, g.getTokens().size() - 1)) {
	243	+ noSplitMorphs.add(m);
	244	+ }
	245	+ }
	246	+
	247	+ LinkedList<List<Token>> sublists = new LinkedList<>();
	248	+ List<Token> currentSublist = new ArrayList<>();
	249	+ boolean clauseHasVerb = false;
	250	+ for (Token m : s) {
	251	+ String base = m.getChosenInterpretation().getBase();
	252	+ if (!noSplitMorphs.contains(m)
	253	+ && (CLAUSE_SPLIT_LEMMAS_STRICT.contains(base) \|\| ((CLAUSE_SPLIT_LEMMAS.contains(base) \|\| CLAUSE_SPLIT_LEMMAS2
	254	+ .contains(base)) && clauseHasVerb))) {
	255	+ sublists.add(currentSublist);
	256	+ currentSublist = new ArrayList<>();
	257	+ clauseHasVerb = false;
	258	+ } else {
	259	+ if (isVerb(m))
	260	+ clauseHasVerb = true;
	261	+ }
	262	+ currentSublist.add(m);
	263	+ }
	264	+ if (currentSublist.size() > 0) {
	265	+ if (clauseHasVerb)
	266	+ sublists.add(currentSublist);
	267	+ else
	268	+ sublists.getLast().addAll(currentSublist);
	269	+ }
	270	+
	271	+ // merge clause beginning with zaimek wzgl. etc to previous clause
	272	+ List<Token> prev = null;
	273	+ Iterator<List<Token>> it = sublists.iterator();
	274	+ while (it.hasNext()) {
	275	+ List<Token> sublist = it.next();
	276	+ boolean containsRelPron = false;
	277	+ int i = 1;
	278	+ for (Token m : sublist) {
	279	+ if (i > 2)
	280	+ break;
	281	+ if (ZAIMKI_WZGLEDNE_LEMMAS.contains(m.getChosenInterpretation().getBase())) {
	282	+ containsRelPron = true;
	283	+ break;
	284	+ }
	285	+ i++;
	286	+ }
	287	+ if (prev != null && containsRelPron) {
	288	+ prev.addAll(sublist);
	289	+ it.remove();
	290	+ } else
	291	+ prev = sublist;
	292	+ }
	293	+
	294	+ return sublists;
	295	+ }
	296	+
	297	+ private static boolean agreedNum(Token candidate, Token keyword) {
	298	+ String keywordNum = keyword.getNumber();
	299	+ String wordNum = candidate.getNumber();
	300	+ return keywordNum.equals(wordNum);
	301	+ }
	302	+
	303	+ private static boolean agreedGenderOrPerson(Token candidate, Token keyword) {
	304	+ if (isPraet(keyword)) {
	305	+ // praet has number:gender
	306	+ String keywordGender = keyword.getGender();
	307	+ String wordGender = candidate.getGender();
	308	+ return keywordGender.equals(wordGender);
	309	+ } else {
	310	+ // other verbs have number:person
	311	+ String keywordPerson = keyword.getPerson();
	312	+ String wordPerson = "ter"; // default
	313	+ if (PRONOUN_TAGS.contains(candidate))
	314	+ wordPerson = candidate.getPerson();
	315	+ return wordPerson.equals(keywordPerson);
	316	+ }
	317	+ }
	318	+
	319	+ private static boolean isJakJako(Token prev) {
	320	+ String base = prev == null ? null : prev.getBase();
	321	+ return prev != null && (base.equals("jak") \|\| base.equals("jako"));
	322	+ }
	323	+
	324	+ private static boolean isPraet(Token keyword) {
	325	+ return keyword.getCtag().equals("praet");
	326	+ }
	327	+
	328	+ private static boolean isNom(Token candidate) {
	329	+ return "nom".equals(candidate.getCase()); // dziala dla rzeczownikow
	330	+ // tylko!
	331	+ }
	332	+
	333	+ private static boolean isNoun(Token m) {
	334	+ return NOUN_TAGS.contains(m.getCtag());
	335	+ }
	336	+
	337	+ public static boolean isVerb(Token morph) {
	338	+ return VERB_TAGS.contains(morph.getCtag());
	339	+ }
	340	+
	341	+ public static boolean isVerb(Mention m) {
	342	+ boolean hasOnlyVerbs = true;
	343	+ for (Token morph : m.getSegments())
	344	+ if (!isVerb(morph)) {
	345	+ hasOnlyVerbs = false;
	346	+ break;
	347	+ }
	348	+ return hasOnlyVerbs;
	349	+ }
	350	+
	351	+ public static boolean isVerb(TEIMention m) {
	352	+ boolean hasOnlyVerbs = true;
	353	+ for (TEIMorph morph : m.getMorphs())
	354	+ if (!isVerb(morph)) {
	355	+ hasOnlyVerbs = false;
	356	+ break;
	357	+ }
	358	+ return hasOnlyVerbs;
	359	+ }
	360	+
	361	+ private static boolean isVerb(TEIMorph morph) {
	362	+ return VERB_TAGS.contains(morph.getChosenInterpretation().getCtag());
	363	+ }
	364	+}
...	...

src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/InstanceCreator.java 0 → 100644

View file @dfbfe3f

	1	+++ a/src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/InstanceCreator.java
	1	+package pl.waw.ipipan.zil.core.md.detection.zero;
	2	+
	3	+import ipipan.clarin.tei.api.entities.TEICorpusText;
	4	+import ipipan.clarin.tei.api.io.IOUtils;
	5	+import ipipan.clarin.tei.api.io.TEI_IO;
	6	+
	7	+import java.io.File;
	8	+import java.util.ArrayList;
	9	+import java.util.HashSet;
	10	+import java.util.List;
	11	+import java.util.Map.Entry;
	12	+import java.util.Set;
	13	+import java.util.TreeMap;
	14	+import java.util.TreeSet;
	15	+
	16	+import org.apache.log4j.Logger;
	17	+
	18	+import pl.waw.ipipan.zil.core.md.entities.Mention;
	19	+import pl.waw.ipipan.zil.core.md.entities.Paragraph;
	20	+import pl.waw.ipipan.zil.core.md.entities.Sentence;
	21	+import pl.waw.ipipan.zil.core.md.entities.Text;
	22	+import pl.waw.ipipan.zil.core.md.entities.Token;
	23	+import pl.waw.ipipan.zil.core.md.io.tei.TeiLoader;
	24	+import weka.core.Attribute;
	25	+import weka.core.FastVector;
	26	+import weka.core.Instance;
	27	+import weka.core.Instances;
	28	+
	29	+public class InstanceCreator {
	30	+
	31	+ final private static Logger logger = Logger.getLogger(InstanceCreator.class);
	32	+ final private static TEI_IO teiIO = TEI_IO.getInstance();
	33	+
	34	+ public static List<TreeMap<String, Object>> loadExamples(File dataDir, Set<String> quasiVerbs) {
	35	+ int allTexts = 0;
	36	+ int exceptions = 0;
	37	+ int allSentences = 0;
	38	+
	39	+ List<TreeMap<String, Object>> examples = new ArrayList<>();
	40	+ for (File textDir : IOUtils.getNKJPDirs(dataDir)) {
	41	+ try {
	42	+ allTexts++;
	43	+ logger.info("Processing text " + textDir);
	44	+ TEICorpusText ct = teiIO.readFromNKJPDirectory(textDir);
	45	+ Text text = TeiLoader.loadTextFromTei(ct);
	46	+
	47	+ for (Paragraph p : text)
	48	+ for (Sentence s : p) {
	49	+ allSentences++;
	50	+ loadExamplesFromSentence(quasiVerbs, examples, s);
	51	+ }
	52	+
	53	+ } catch (Exception e) {
	54	+ logger.error(e.getLocalizedMessage());
	55	+ exceptions++;
	56	+ }
	57	+ }
	58	+
	59	+ logger.info(allTexts + " texts found.");
	60	+ if (exceptions != 0)
	61	+ logger.error(exceptions + " texts with exceptions.");
	62	+ logger.info(allSentences + " sentences found.");
	63	+
	64	+ return examples;
	65	+ }
	66	+
	67	+ public static void loadExamplesFromSentence(Set<String> quasiVerbs, List<TreeMap<String, Object>> examples,
	68	+ Sentence s) {
	69	+
	70	+ // collect positive examples
	71	+ Set<Token> positive = new HashSet<>();
	72	+ for (Mention m : s.getMentions()) {
	73	+ if (FeatureGeneration.isVerb(m)) {
	74	+ positive.addAll(m.getSegments());
	75	+ }
	76	+ }
	77	+
	78	+ for (Token m : s) {
	79	+ if (!FeatureGeneration.isVerb(m))
	80	+ continue;
	81	+
	82	+ TreeMap<String, Object> features = new TreeMap<>();
	83	+ if (positive.contains(m)) {
	84	+ features.put("class", Boolean.valueOf(true));
	85	+ } else {
	86	+ features.put("class", Boolean.valueOf(false));
	87	+ }
	88	+
	89	+ FeatureGeneration.generateFeatures(features, m, s, quasiVerbs);
	90	+ examples.add(features);
	91	+ }
	92	+ }
	93	+
	94	+ public static Instances createInstances(List<TreeMap<String, Object>> examples, String classFeatureName) {
	95	+
	96	+ TreeSet<String> booleanAttsOccurred = new TreeSet<>();
	97	+ TreeSet<String> doubleAttsOccurred = new TreeSet<>();
	98	+ TreeMap<String, Set<String>> att2values = new TreeMap<>();
	99	+ for (TreeMap<String, Object> example : examples) {
	100	+ for (Entry<String, Object> e : example.entrySet()) {
	101	+ String key = e.getKey();
	102	+ Object val = e.getValue();
	103	+ if (val instanceof Integer \|\| val instanceof Double) {
	104	+ doubleAttsOccurred.add(key);
	105	+ continue;
	106	+ }
	107	+ if (val instanceof Boolean) {
	108	+ booleanAttsOccurred.add(key);
	109	+ continue;
	110	+ }
	111	+ if (!att2values.containsKey(key))
	112	+ att2values.put(key, new HashSet<String>());
	113	+ att2values.get(key).add(val.toString());
	114	+ }
	115	+ }
	116	+
	117	+ List<Attribute> atts = new ArrayList<>();
	118	+
	119	+ // double attributes
	120	+ for (String attName : doubleAttsOccurred) {
	121	+ Attribute att = new Attribute(attName);
	122	+ atts.add(att);
	123	+ }
	124	+
	125	+ // boolean attributes (treated as nominal)
	126	+ FastVector values = new FastVector(2);
	127	+ values.addElement("false");
	128	+ values.addElement("true");
	129	+ for (String attName : booleanAttsOccurred) {
	130	+ Attribute att = new Attribute(attName, values);
	131	+ atts.add(att);
	132	+ }
	133	+
	134	+ // nominal attributes
	135	+ for (Entry<String, Set<String>> attVals : att2values.entrySet()) {
	136	+ FastVector vals = new FastVector(attVals.getValue().size());
	137	+ for (String val : attVals.getValue())
	138	+ vals.addElement(val);
	139	+ Attribute att = new Attribute(attVals.getKey(), vals);
	140	+ atts.add(att);
	141	+ }
	142	+
	143	+ FastVector fvWekaAttributes = new FastVector(atts.size());
	144	+ for (Attribute attr : atts) {
	145	+ fvWekaAttributes.addElement(attr);
	146	+ }
	147	+
	148	+ Instances data = new Instances("Zero", fvWekaAttributes, 10);
	149	+ data.setClass(data.attribute(classFeatureName));
	150	+ return data;
	151	+ }
	152	+
	153	+ public static void fillInstances(List<TreeMap<String, Object>> examples, Instances instances) {
	154	+ for (TreeMap<String, Object> example : examples) {
	155	+ Instance instance = new Instance(instances.numAttributes());
	156	+
	157	+ for (Entry<String, Object> e : example.entrySet()) {
	158	+ Object val = e.getValue();
	159	+ String name = e.getKey();
	160	+ if (val instanceof Integer) {
	161	+ instance.setValue(instances.attribute(name), (int) val);
	162	+ } else if (val instanceof Boolean) {
	163	+ instance.setValue(instances.attribute(name), ((Boolean) val) ? "true" : "false");
	164	+ } else {
	165	+ int indexOfValue = instances.attribute(name).indexOfValue(val.toString());
	166	+ if (indexOfValue == -1) {
	167	+ logger.debug("Unkown value: " + val.toString() + " of feature: " + name
	168	+ + ". Marking as missing value.");
	169	+ instance.setMissing(instances.attribute(name));
	170	+ } else
	171	+ instance.setValue(instances.attribute(name), indexOfValue);
	172	+ }
	173	+ }
	174	+
	175	+ instance.setDataset(instances);
	176	+ instances.add(instance);
	177	+ }
	178	+ }
	179	+}
...	...

src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Model.java 0 → 100644

View file @dfbfe3f

	1	+++ a/src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Model.java
	1	+package pl.waw.ipipan.zil.core.md.detection.zero;
	2	+
	3	+import java.io.Serializable;
	4	+import java.util.List;
	5	+import java.util.Set;
	6	+import java.util.TreeMap;
	7	+
	8	+import org.apache.log4j.Logger;
	9	+
	10	+import pl.waw.ipipan.zil.core.md.entities.Sentence;
	11	+import weka.classifiers.Classifier;
	12	+import weka.core.Instance;
	13	+import weka.core.Instances;
	14	+
	15	+public class Model implements Serializable {
	16	+
	17	+ private static final long serialVersionUID = 3351727361273283076L;
	18	+ private static final Logger logger = Logger.getLogger(Model.class);
	19	+
	20	+ private Classifier classifier;
	21	+ private Set<String> quasiVerbs;
	22	+ private Instances instances;
	23	+
	24	+ public Model(Classifier classifier, Instances instances, Set<String> quasiVerbs) {
	25	+ this.classifier = classifier;
	26	+ this.instances = instances;
	27	+ this.quasiVerbs = quasiVerbs;
	28	+ }
	29	+
	30	+ public boolean isZeroSubject(Instance instance, Sentence sentence) {
	31	+ try {
	32	+ double response = this.classifier.classifyInstance(instance);
	33	+ return response > 0;
	34	+ } catch (Exception e) {
	35	+ logger.error("Error classyfing verb in sentence: " + sentence);
	36	+ return false;
	37	+ }
	38	+ }
	39	+
	40	+ public Instances getInstances(List<TreeMap<String, Object>> examples) {
	41	+ Instances instances = new Instances(this.instances);
	42	+ InstanceCreator.fillInstances(examples, instances);
	43	+ return instances;
	44	+ }
	45	+
	46	+ public Set<String> getQuasiVerbs() {
	47	+ return quasiVerbs;
	48	+ }
	49	+}
...	...

src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Serializer.java 0 → 100644

View file @dfbfe3f

	1	+++ a/src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Serializer.java
	1	+package pl.waw.ipipan.zil.core.md.detection.zero;
	2	+
	3	+import java.io.InputStream;
	4	+
	5	+import weka.core.SerializationHelper;
	6	+
	7	+public class Serializer {
	8	+
	9	+ public static void saveModel(Model m, String targetModelFilePath) throws Exception {
	10	+ SerializationHelper.write(targetModelFilePath, m);
	11	+ }
	12	+
	13	+ public static Model loadModel(String path) throws Exception {
	14	+ Model m = (Model) SerializationHelper.read(path);
	15	+ return m;
	16	+ }
	17	+
	18	+ public static Model loadModelFromStream(InputStream stream) throws Exception {
	19	+ Model m = (Model) SerializationHelper.read(stream);
	20	+ return m;
	21	+ }
	22	+}
...	...

src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Trainer.java 0 → 100644

View file @dfbfe3f

	1	+++ a/src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Trainer.java
	1	+package pl.waw.ipipan.zil.core.md.detection.zero;
	2	+
	3	+import java.io.BufferedReader;
	4	+import java.io.File;
	5	+import java.io.IOException;
	6	+import java.io.InputStream;
	7	+import java.io.InputStreamReader;
	8	+import java.util.HashSet;
	9	+import java.util.List;
	10	+import java.util.Random;
	11	+import java.util.Set;
	12	+import java.util.TreeMap;
	13	+
	14	+import org.apache.log4j.Logger;
	15	+
	16	+import weka.classifiers.Evaluation;
	17	+import weka.classifiers.rules.JRip;
	18	+import weka.classifiers.rules.JRip.RipperRule;
	19	+import weka.core.Attribute;
	20	+import weka.core.Instance;
	21	+import weka.core.Instances;
	22	+
	23	+public class Trainer {
	24	+
	25	+ final private static Logger logger = Logger.getLogger(Trainer.class);
	26	+
	27	+ private static final boolean DO_CV = false;
	28	+ private static final String QUASI_LIST_PATH = "/quasi_verbs.txt";
	29	+
	30	+ public static void main(String[] args) {
	31	+
	32	+ if (args.length != 2) {
	33	+ logger.error("Wrong number of arguments! Should be: " + Trainer.class.getSimpleName()
	34	+ + " trainDir targetModelFile");
	35	+ return;
	36	+ }
	37	+
	38	+ File dataDir = new File(args[0]);
	39	+ String targetModelFilePath = args[1];
	40	+
	41	+ if (!dataDir.isDirectory()) {
	42	+ logger.error(dataDir + " is not a directory!");
	43	+ return;
	44	+ }
	45	+
	46	+ Set<String> quasiVerbs = loadQuasiVerbs();
	47	+
	48	+ List<TreeMap<String, Object>> examples = InstanceCreator.loadExamples(dataDir, quasiVerbs);
	49	+ Instances instances = InstanceCreator.createInstances(examples, "class");
	50	+ InstanceCreator.fillInstances(examples, instances);
	51	+
	52	+ printStats(instances);
	53	+
	54	+ try {
	55	+ JRip model = new JRip();
	56	+
	57	+ if (DO_CV) {
	58	+ logger.info("Crossvalidation...");
	59	+ Evaluation eval = new Evaluation(instances);
	60	+ eval.crossValidateModel(model, instances, 10, new Random(1));
	61	+ logger.info(eval.toSummaryString());
	62	+ logger.info(eval.toMatrixString());
	63	+ logger.info(eval.toClassDetailsString());
	64	+ }
	65	+
	66	+ logger.info("Building final classifier...");
	67	+ model = new JRip();
	68	+ model.buildClassifier(instances);
	69	+ logger.info(model.getRuleset().size() + " rules generated.");
	70	+ for (int i = 0; i < model.getRuleset().size(); i++) {
	71	+ RipperRule v = (RipperRule) model.getRuleset().elementAt(i);
	72	+ logger.info("\t" + v.toString(instances.classAttribute()));
	73	+ }
	74	+
	75	+ instances.delete();
	76	+ logger.info("Features stats:");
	77	+ for (int i = 0; i < instances.numAttributes(); i++) {
	78	+ Attribute att = instances.attribute(i);
	79	+ logger.info(i + ".\t" + att.toString());
	80	+ }
	81	+
	82	+ logger.info("Saving classifier...");
	83	+ Model m = new Model(model, instances, quasiVerbs);
	84	+ Serializer.saveModel(m, targetModelFilePath);
	85	+ logger.info("Done.");
	86	+
	87	+ } catch (Exception e) {
	88	+ logger.error("Error: " + e);
	89	+ }
	90	+ }
	91	+
	92	+ private static Set<String> loadQuasiVerbs() {
	93	+ Set<String> quasiVerbs = new HashSet<>();
	94	+ InputStream stream = Trainer.class.getResourceAsStream(QUASI_LIST_PATH);
	95	+ try (BufferedReader br = new BufferedReader(new InputStreamReader(stream))) {
	96	+ String line = null;
	97	+ while ((line = br.readLine()) != null) {
	98	+ quasiVerbs.add(line.trim());
	99	+ }
	100	+ } catch (IOException e) {
	101	+ logger.error(e.getLocalizedMessage());
	102	+ }
	103	+ return quasiVerbs;
	104	+ }
	105	+
	106	+ private static void printStats(Instances instances) {
	107	+ int positive = 0;
	108	+ int negative = 0;
	109	+ for (int i = 0; i < instances.numInstances(); i++) {
	110	+ Instance inst = instances.instance(i);
	111	+ if (inst.classValue() > 0)
	112	+ negative++;
	113	+ else
	114	+ positive++;
	115	+ }
	116	+ logger.info(positive + " positive examples");
	117	+ logger.info(negative + " negative examples");
	118	+ logger.info((positive + negative) + " examples total");
	119	+ logger.info((instances.numAttributes() - 1) + " attributes");
	120	+ logger.info(instances.toSummaryString());
	121	+ }
	122	+
	123	+}
...	...

src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/ZeroSubjectDetector.java 0 → 100644

View file @dfbfe3f

	1	+++ a/src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/ZeroSubjectDetector.java
	1	+package pl.waw.ipipan.zil.core.md.detection.zero;
	2	+
	3	+import java.io.File;
	4	+import java.io.InputStream;
	5	+import java.util.ArrayList;
	6	+import java.util.HashSet;
	7	+import java.util.List;
	8	+import java.util.Set;
	9	+import java.util.TreeMap;
	10	+
	11	+import org.apache.log4j.Logger;
	12	+
	13	+import pl.waw.ipipan.zil.core.md.entities.Mention;
	14	+import pl.waw.ipipan.zil.core.md.entities.Sentence;
	15	+import pl.waw.ipipan.zil.core.md.entities.Token;
	16	+import weka.core.Instances;
	17	+
	18	+public class ZeroSubjectDetector {
	19	+ final private static Logger logger = Logger.getLogger(ZeroSubjectDetector.class);
	20	+
	21	+ private Model model;
	22	+ private Set<String> quasiVerbs = new HashSet<>();
	23	+
	24	+ public static int verbsWithoutSubject = 0;
	25	+ public static int verbsWithSubject = 0;
	26	+
	27	+ public void addZeroSubjectMentions(Sentence sentence) {
	28	+ List<TreeMap<String, Object>> examples = new ArrayList<>();
	29	+ InstanceCreator.loadExamplesFromSentence(quasiVerbs, examples, sentence);
	30	+ if (examples.isEmpty())
	31	+ return;
	32	+
	33	+ Instances instances = model.getInstances(examples);
	34	+
	35	+ // label instances
	36	+ List<Boolean> areZeros = new ArrayList<>();
	37	+ for (int i = 0; i < instances.numInstances(); i++) {
	38	+ boolean isZero = model.isZeroSubject(instances.instance(i), sentence);
	39	+ areZeros.add(isZero);
	40	+ if (isZero)
	41	+ verbsWithoutSubject++;
	42	+ else
	43	+ verbsWithSubject++;
	44	+ }
	45	+
	46	+ int i = 0;
	47	+ for (Token m : sentence) {
	48	+ if (!FeatureGeneration.isVerb(m))
	49	+ continue;
	50	+ if (areZeros.get(i))
	51	+ sentence.addMention(new Mention(m, true));
	52	+ i++;
	53	+ }
	54	+ }
	55	+
	56	+ public ZeroSubjectDetector(File zeroSubjectDetectionModel) {
	57	+ try {
	58	+ this.model = Serializer.loadModel(zeroSubjectDetectionModel.getAbsolutePath());
	59	+ this.quasiVerbs = this.model.getQuasiVerbs();
	60	+ } catch (Exception e) {
	61	+ logger.error("Error loading model:" + e);
	62	+ }
	63	+ }
	64	+
	65	+ public ZeroSubjectDetector(InputStream zeroSubjectDetectionModelStream) {
	66	+ try {
	67	+ this.model = Serializer.loadModelFromStream(zeroSubjectDetectionModelStream);
	68	+ this.quasiVerbs = this.model.getQuasiVerbs();
	69	+ } catch (Exception e) {
	70	+ logger.error("Error loading model:" + e);
	71	+ }
	72	+ }
	73	+}
...	...

src/main/java/pl/waw/ipipan/zil/core/md/entities/Interpretation.java 0 → 100644

View file @dfbfe3f

	1	+++ a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Interpretation.java
	1	+package pl.waw.ipipan.zil.core.md.entities;
	2	+
	3	+import pl.waw.ipipan.zil.core.md.detection.zero.Constants;
	4	+
	5	+public class Interpretation {
	6	+ private String ctag = "null";
	7	+ private String base = "null";
	8	+
	9	+ private String number = "null";
	10	+ private String casee = "null";
	11	+ private String gender = "null";
	12	+ private String person = "null";
	13	+
	14	+ public Interpretation(String ctag2, String morph, String base) {
	15	+ this.ctag = ctag2;
	16	+ this.base = base;
	17	+
	18	+ String[] spl = morph.split(":");
	19	+ if (ctag.equalsIgnoreCase("subst") \|\| ctag.equalsIgnoreCase("depr") \|\| ctag.equalsIgnoreCase("ger")) {
	20	+ this.number = spl[0];
	21	+ this.casee = spl[1];
	22	+ this.gender = spl[2];
	23	+ } else if (ctag.equalsIgnoreCase("ppron12") \|\| ctag.equalsIgnoreCase("ppron3")) {
	24	+ this.number = spl[0];
	25	+ this.casee = spl[1];
	26	+ this.gender = spl[2];
	27	+ this.person = spl[3];
	28	+ } else if (ctag.equalsIgnoreCase("siebie")) {
	29	+ this.casee = spl[0];
	30	+ } else if (Constants.VERB_TAGS.contains(ctag)) {
	31	+ this.number = spl[0];
	32	+ if (ctag.matches("winien\|praet"))
	33	+ this.gender = spl[1];
	34	+ else
	35	+ this.person = spl[1];
	36	+ }
	37	+ }
	38	+
	39	+ public String getCtag() {
	40	+ return this.ctag;
	41	+ }
	42	+
	43	+ public String getNumber() {
	44	+ return this.number;
	45	+ }
	46	+
	47	+ public String getGender() {
	48	+ return this.gender;
	49	+ }
	50	+
	51	+ public String getCase() {
	52	+ return this.casee;
	53	+ }
	54	+
	55	+ public String getBase() {
	56	+ return this.base;
	57	+ }
	58	+
	59	+ public String getPerson() {
	60	+ return this.person;
	61	+ }
	62	+}
...	...

src/main/java/pl/waw/ipipan/zil/core/md/entities/Mention.java 0 → 100644

View file @dfbfe3f

	1	+++ a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Mention.java
	1	+package pl.waw.ipipan.zil.core.md.entities;
	2	+
	3	+import java.util.ArrayList;
	4	+import java.util.List;
	5	+
	6	+/**
	7	+ * @author Mateusz Kopec
	8	+ *
	9	+ */
	10	+public class Mention implements Comparable<Mention> {
	11	+
	12	+ private MentionGroup mentionGroup = null;
	13	+
	14	+ private List<Token> segments = new ArrayList<Token>();
	15	+ private List<Token> headSegments = new ArrayList<Token>();
	16	+
	17	+ private boolean isZeroSubject = false;
	18	+
	19	+ // empty if no head info gathered for multi-segment mention
	20	+ // if single-segment mention, then this segment is head
	21	+
	22	+ public Mention(Token segment) {
	23	+ this(segment, false);
	24	+ }
	25	+
	26	+ public Mention(List<Token> segments, List<Token> heads, boolean isZero) {
	27	+ for (Token s : segments) {
	28	+ s.addMention(this);
	29	+ this.segments.add(s);
	30	+ }
	31	+ this.headSegments.addAll(heads);
	32	+ this.isZeroSubject = isZero;
	33	+ }
	34	+
	35	+ public Mention(List<Token> segments, List<Token> heads) {
	36	+ this(segments, heads, false);
	37	+ }
	38	+
	39	+ public Mention(Token token, boolean isZero) {
	40	+ this.isZeroSubject = isZero;
	41	+ token.addMention(this);
	42	+ this.segments.add(token);
	43	+ this.headSegments.add(token);
	44	+ }
	45	+
	46	+ public void addSegment(Token s) {
	47	+ s.addMention(this);
	48	+ this.segments.add(s);
	49	+ }
	50	+
	51	+ public void addHeadSegment(Token s) {
	52	+ this.headSegments.add(s);
	53	+ }
	54	+
	55	+ public List<Token> getSegments() {
	56	+ return segments;
	57	+ }
	58	+
	59	+ public Token getFirstSegment() {
	60	+ return segments.get(0);
	61	+ }
	62	+
	63	+ public Token getLastSegment() {
	64	+ return segments.get(segments.size() - 1);
	65	+ }
	66	+
	67	+ private Token getLastHeadSegment() {
	68	+ List<Token> hs = this.getHeadSegments();
	69	+ if (hs.size() != 0)
	70	+ return hs.get(hs.size() - 1);
	71	+ return null;
	72	+ }
	73	+
	74	+ public String toString() {
	75	+ StringBuffer sb = new StringBuffer();
	76	+ sb.append("[");
	77	+ for (Token seg : segments) {
	78	+ sb.append(seg.toString() + " ");
	79	+ }
	80	+ sb.append("]");
	81	+ return sb.toString();
	82	+ }
	83	+
	84	+ public MentionGroup getMentionGroup() {
	85	+ return mentionGroup;
	86	+ }
	87	+
	88	+ public void setMentionGroup(MentionGroup mentionGroup) {
	89	+ this.mentionGroup = mentionGroup;
	90	+ }
	91	+
	92	+ public List<Token> getHeadSegments() {
	93	+ return headSegments;
	94	+ }
	95	+
	96	+ public int getNoOfParentMentions() {
	97	+ int result = -1; // because we don't want to count this mention
	98	+
	99	+ // each parenting mention must contain all the segments of this one
	100	+ for (Mention m : getFirstSegment().getMentions()) {
	101	+ if (m.getSegments().containsAll(getSegments()))
	102	+ result++;
	103	+ }
	104	+ return result;
	105	+ }
	106	+
	107	+ public boolean isPronoun() {
	108	+ return this.segments.get(0).getChosenInterpretation().getCtag().matches("ppron.*");
	109	+ }
	110	+
	111	+ @Override
	112	+ public int hashCode() {
	113	+ final int prime = 31;
	114	+ int result = 1;
	115	+ result = prime * result + ((headSegments == null) ? 0 : headSegments.hashCode());
	116	+ result = prime * result + ((segments == null) ? 0 : segments.hashCode());
	117	+ return result;
	118	+ }
	119	+
	120	+ @Override
	121	+ public boolean equals(Object obj) {
	122	+ if (this == obj)
	123	+ return true;
	124	+ if (obj == null)
	125	+ return false;
	126	+ if (getClass() != obj.getClass())
	127	+ return false;
	128	+ Mention other = (Mention) obj;
	129	+ if (headSegments == null) {
	130	+ if (other.headSegments != null)
	131	+ return false;
	132	+ } else if (!headSegments.equals(other.headSegments))
	133	+ return false;
	134	+ if (segments == null) {
	135	+ if (other.segments != null)
	136	+ return false;
	137	+ } else if (!segments.equals(other.segments))
	138	+ return false;
	139	+ return true;
	140	+ }
	141	+
	142	+ @Override
	143	+ public int compareTo(Mention other) {
	144	+ Token thisLastSegment = getLastSegment();
	145	+ Token anotherLastSegment = other.getLastSegment();
	146	+
	147	+ Sentence thisSentence = thisLastSegment.getSentence();
	148	+ Sentence anotherSentence = anotherLastSegment.getSentence();
	149	+
	150	+ Paragraph thisParagraph = thisSentence == null ? null : thisSentence.getParagraph();
	151	+ Paragraph anotherParagraph = anotherSentence == null ? null : anotherSentence.getParagraph();
	152	+
	153	+ String thisTextId = thisParagraph == null ? null : thisParagraph.getText().getId();
	154	+ String anotherTextId = anotherParagraph == null ? null : anotherParagraph.getText().getId();
	155	+
	156	+ int compare;
	157	+ // first, compare by ids of texts
	158	+ if (thisTextId != null && anotherTextId != null) {
	159	+ compare = thisTextId.compareTo(anotherTextId);
	160	+ if (compare != 0)
	161	+ return compare;
	162	+ }
	163	+
	164	+ // second, compare by paragraph position
	165	+ if (thisParagraph != null && anotherParagraph != null) {
	166	+ compare = thisParagraph.getTextPosition().compareTo(anotherParagraph.getTextPosition());
	167	+ if (compare != 0)
	168	+ return compare;
	169	+
	170	+ // third, compare by sentence position
	171	+ compare = thisSentence.getParagraphPosition().compareTo(anotherSentence.getParagraphPosition());
	172	+ if (compare != 0)
	173	+ return compare;
	174	+ }
	175	+
	176	+ // fourth, compare by last segments
	177	+ compare = thisLastSegment.getSentencePosition().compareTo(anotherLastSegment.getSentencePosition());
	178	+ if (compare != 0)
	179	+ return compare;
	180	+
	181	+ // fifth, compare by size
	182	+ Integer thisSize = getSegments().size();
	183	+ Integer anotherSize = other.getSegments().size();
	184	+ compare = thisSize.compareTo(anotherSize);
	185	+ if (compare != 0)
	186	+ return compare;
	187	+
	188	+ // sixth, compare by last head segments
	189	+ Token thisLastHeadSegment = getLastHeadSegment();
	190	+ Token anotherLastHeadSegment = other.getLastHeadSegment();
	191	+ if (thisLastHeadSegment != null && anotherLastHeadSegment != null) {
	192	+ compare = thisLastHeadSegment.getSentencePosition().compareTo(anotherLastHeadSegment.getSentencePosition());
	193	+ }
	194	+
	195	+ // seventh, compare by head segments size
	196	+ thisSize = getHeadSegments().size();
	197	+ anotherSize = other.getHeadSegments().size();
	198	+ compare = thisSize.compareTo(anotherSize);
	199	+
	200	+ return compare;
	201	+ }
	202	+
	203	+ public boolean isZeroSubject() {
	204	+ return isZeroSubject;
	205	+ }
	206	+}
...	...

src/main/java/pl/waw/ipipan/zil/core/md/entities/MentionGroup.java 0 → 100644

View file @dfbfe3f

	1	+++ a/src/main/java/pl/waw/ipipan/zil/core/md/entities/MentionGroup.java
	1	+package pl.waw.ipipan.zil.core.md.entities;
	2	+
	3	+import java.util.ArrayList;
	4	+import java.util.Comparator;
	5	+
	6	+public class MentionGroup extends ArrayList<Mention> {
	7	+
	8	+ private static final long serialVersionUID = 7051256137623728016L;
	9	+ private String dominant;
	10	+
	11	+ public MentionGroup() {
	12	+ }
	13	+
	14	+ public MentionGroup(Mention currentMention) {
	15	+ add(currentMention);
	16	+ }
	17	+
	18	+ public boolean add(Mention m) {
	19	+ m.setMentionGroup(this);
	20	+ return super.add(m);
	21	+ }
	22	+
	23	+ public Mention getLastAddedMention() {
	24	+ return this.get(this.size() - 1);
	25	+ }
	26	+
	27	+ public final static Comparator<MentionGroup> getMentionGroupComparator() {
	28	+ return mentionGroupComparator;
	29	+ }
	30	+
	31	+ private final static Comparator<MentionGroup> mentionGroupComparator = new Comparator<MentionGroup>() {
	32	+
	33	+ public int compare(MentionGroup mg1, MentionGroup mg2) {
	34	+ Mention m1 = mg1.getLastAddedMention();
	35	+ Mention m2 = mg2.getLastAddedMention();
	36	+ return m1.compareTo(m2);
	37	+ }
	38	+
	39	+ };
	40	+
	41	+ public void setDominant(String string) {
	42	+ this.dominant = string;
	43	+ }
	44	+
	45	+ public String getDominant() {
	46	+ return this.dominant;
	47	+ }
	48	+}
...	...

src/main/java/pl/waw/ipipan/zil/core/md/entities/NamedEntity.java 0 → 100644

View file @dfbfe3f

	1	+++ a/src/main/java/pl/waw/ipipan/zil/core/md/entities/NamedEntity.java
	1	+package pl.waw.ipipan.zil.core.md.entities;
	2	+
	3	+import java.util.Iterator;
	4	+import java.util.List;
	5	+
	6	+public class NamedEntity implements Comparable<NamedEntity> {
	7	+
	8	+ private List<Token> tokens;
	9	+
	10	+ public NamedEntity(List<Token> tokens) {
	11	+ this.tokens = tokens;
	12	+ }
	13	+
	14	+ public List<Token> getTokens() {
	15	+ return this.tokens;
	16	+ }
	17	+
	18	+ @Override
	19	+ public int compareTo(NamedEntity o) {
	20	+ Iterator<Token> it1 = getTokens().iterator();
	21	+ Iterator<Token> it2 = o.getTokens().iterator();
	22	+ while (it1.hasNext() && it2.hasNext()) {
	23	+ Token t1 = it1.next();
	24	+ Token t2 = it2.next();
	25	+ if (t1.compareTo(t2) != 0)
	26	+ return t1.compareTo(t2);
	27	+ }
	28	+ if (it1.hasNext())
	29	+ return 1;
	30	+ if (it2.hasNext())
	31	+ return -1;
	32	+
	33	+ return 0;
	34	+ }
	35	+
	36	+}
...	...

src/main/java/pl/waw/ipipan/zil/core/md/entities/Paragraph.java 0 → 100644

View file @dfbfe3f

	1	+++ a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Paragraph.java
	1	+package pl.waw.ipipan.zil.core.md.entities;
	2	+
	3	+import java.util.ArrayList;
	4	+
	5	+public class Paragraph extends ArrayList<Sentence>{
	6	+
	7	+ private static final long serialVersionUID = 4871431562737902082L;
	8	+
	9	+ private Text text;
	10	+ private int textPosition;
	11	+
	12	+ public boolean add(Sentence s) {
	13	+ s.setParagraphPosition(this.size());
	14	+ s.setParagraph(this);
	15	+ return super.add(s);
	16	+ }
	17	+
	18	+ public String toString() {
	19	+ StringBuffer sb = new StringBuffer();
	20	+ for (Sentence sentence : this)
	21	+ sb.append(sentence.toString()+"\n");
	22	+ return sb.toString();
	23	+ }
	24	+
	25	+ public Text getText() {
	26	+ return this.text;
	27	+ }
	28	+
	29	+ public void setText(Text text) {
	30	+ this.text = text;
	31	+ }
	32	+
	33	+ public Integer getTextPosition() {
	34	+ return this.textPosition;
	35	+ }
	36	+
	37	+ public void setTextPosition(int textPos) {
	38	+ this.textPosition = textPos;
	39	+ }
	40	+}
...	...

src/main/java/pl/waw/ipipan/zil/core/md/entities/Sentence.java 0 → 100644

View file @dfbfe3f

	1	+++ a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Sentence.java
	1	+package pl.waw.ipipan.zil.core.md.entities;
	2	+
	3	+import java.util.ArrayList;
	4	+import java.util.List;
	5	+import java.util.Set;
	6	+import java.util.TreeSet;
	7	+
	8	+public class Sentence extends ArrayList<Token> {
	9	+
	10	+ private static final long serialVersionUID = -7300822552646737716L;
	11	+
	12	+ private Paragraph paragraph;
	13	+ private int paragraphPosition;
	14	+
	15	+ private Set<Mention> mentions = new TreeSet<>();
	16	+ private Set<SyntacticWord> syntacticWords = new TreeSet<>();
	17	+ private Set<SyntacticGroup> syntacticGroups = new TreeSet<>();
	18	+ private Set<NamedEntity> namedEntities = new TreeSet<>();
	19	+
	20	+ public boolean add(Token s) {
	21	+ s.setSentencePosition(this.size());
	22	+ s.setSentence(this);
	23	+ return super.add(s);
	24	+ }
	25	+
	26	+ public void setParagraphPosition(int paragraphPosition) {
	27	+ this.paragraphPosition = paragraphPosition;
	28	+ }
	29	+
	30	+ public Integer getParagraphPosition() {
	31	+ return this.paragraphPosition;
	32	+ }
	33	+
	34	+ public void setParagraph(Paragraph paragraph) {
	35	+ this.paragraph = paragraph;
	36	+ }
	37	+
	38	+ public Paragraph getParagraph() {
	39	+ return this.paragraph;
	40	+ }
	41	+
	42	+ public void removeMention(Mention mention) {
	43	+ mentions.remove(mention);
	44	+ for (Token s : mention.getSegments())
	45	+ s.removeMention(mention);
	46	+ }
	47	+
	48	+ public void clearMentions() {
	49	+ for (Mention mention : mentions)
	50	+ for (Token s : mention.getSegments())
	51	+ s.removeMention(mention);
	52	+ mentions.clear();
	53	+ }
	54	+
	55	+ public String toStringWithoutMentions() {
	56	+ StringBuffer sb = new StringBuffer();
	57	+ for (Token seg : this) {
	58	+ if (!seg.toString().matches("\\[.*\\]")) {
	59	+ sb.append(seg.toString());
	60	+ sb.append(" ");
	61	+ }
	62	+ }
	63	+ return sb.toString();
	64	+ }
	65	+
	66	+ public String toString() {
	67	+ StringBuffer sb = new StringBuffer();
	68	+ for (Token seg : this) {
	69	+ for (@SuppressWarnings("unused")
	70	+ Mention m : seg.getMentionsStartingBeforeSegment())
	71	+ sb.append("[");
	72	+ sb.append(seg.toString());
	73	+ for (@SuppressWarnings("unused")
	74	+ Mention m : seg.getMentionsEndingAfterSegment())
	75	+ sb.append("]");
	76	+ sb.append(" ");
	77	+ }
	78	+ return sb.toString();
	79	+ }
	80	+
	81	+ public List<Mention> getMentions() {
	82	+ return new ArrayList<Mention>(mentions);
	83	+ }
	84	+
	85	+ public List<SyntacticWord> getSyntacticWords() {
	86	+ return new ArrayList<>(syntacticWords);
	87	+ }
	88	+
	89	+ public List<NamedEntity> getNamedEntities() {
	90	+ return new ArrayList<>(namedEntities);
	91	+ }
	92	+
	93	+ public List<SyntacticGroup> getGroups() {
	94	+ return new ArrayList<>(syntacticGroups);
	95	+ }
	96	+
	97	+ public void addMention(Mention mention) {
	98	+ mentions.add(mention);
	99	+ }
	100	+
	101	+ public void addSyntacticWord(SyntacticWord syntacticWord) {
	102	+ syntacticWords.add(syntacticWord);
	103	+ }
	104	+
	105	+ public void addSyntacticGroup(SyntacticGroup syntacticGroup) {
	106	+ syntacticGroups.add(syntacticGroup);
	107	+ }
	108	+
	109	+ public void addNamedEntity(NamedEntity namedEntity) {
	110	+ namedEntities.add(namedEntity);
	111	+ }
	112	+}
...	...

src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticGroup.java 0 → 100644

View file @dfbfe3f

	1	+++ a/src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticGroup.java
	1	+package pl.waw.ipipan.zil.core.md.entities;
	2	+
	3	+import java.util.Iterator;
	4	+import java.util.List;
	5	+
	6	+public class SyntacticGroup implements Comparable<SyntacticGroup> {
	7	+
	8	+ private String type;
	9	+ private List<Token> tokens;
	10	+ private List<Token> headTokens;
	11	+
	12	+ public SyntacticGroup(String type, List<Token> tokens,
	13	+ List<Token> headTokens) {
	14	+ this.type = type;
	15	+ this.tokens = tokens;
	16	+ this.headTokens = headTokens;
	17	+ }
	18	+
	19	+ public String getType() {
	20	+ return type;
	21	+ }
	22	+
	23	+ public List<Token> getTokens() {
	24	+ return tokens;
	25	+ }
	26	+
	27	+ public List<Token> getSemanticHeadTokens() {
	28	+ return headTokens;
	29	+ }
	30	+
	31	+ @Override
	32	+ public int compareTo(SyntacticGroup o) {
	33	+ Iterator<Token> it1 = getTokens().iterator();
	34	+ Iterator<Token> it2 = o.getTokens().iterator();
	35	+ while (it1.hasNext() && it2.hasNext()) {
	36	+ Token t1 = it1.next();
	37	+ Token t2 = it2.next();
	38	+ if (t1.compareTo(t2) != 0)
	39	+ return t1.compareTo(t2);
	40	+ }
	41	+ it1 = getSemanticHeadTokens().iterator();
	42	+ it2 = o.getSemanticHeadTokens().iterator();
	43	+ while (it1.hasNext() && it2.hasNext()) {
	44	+ Token t1 = it1.next();
	45	+ Token t2 = it2.next();
	46	+ if (t1.compareTo(t2) != 0)
	47	+ return t1.compareTo(t2);
	48	+ }
	49	+ if (it1.hasNext())
	50	+ return 1;
	51	+ if (it2.hasNext())
	52	+ return -1;
	53	+
	54	+ return getType().compareTo(o.getType());
	55	+ }
	56	+}
...	...

src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticWord.java 0 → 100644

View file @dfbfe3f

	1	+++ a/src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticWord.java
	1	+package pl.waw.ipipan.zil.core.md.entities;
	2	+
	3	+import java.util.ArrayList;
	4	+import java.util.Iterator;
	5	+import java.util.List;
	6	+
	7	+public class SyntacticWord implements Comparable<SyntacticWord> {
	8	+
	9	+ private String ctag;
	10	+ private List<Token> tokens = new ArrayList<>();
	11	+
	12	+ public SyntacticWord(String ctag, List<Token> tokens) {
	13	+ this.ctag = ctag;
	14	+ this.tokens = tokens;
	15	+ }
	16	+
	17	+ public String getCtag() {
	18	+ return ctag;
	19	+ }
	20	+
	21	+ public List<Token> getTokens() {
	22	+ return tokens;
	23	+ }
	24	+
	25	+ @Override
	26	+ public int compareTo(SyntacticWord o) {
	27	+ Iterator<Token> it1 = getTokens().iterator();
	28	+ Iterator<Token> it2 = o.getTokens().iterator();
	29	+ while (it1.hasNext() && it2.hasNext()) {
	30	+ Token t1 = it1.next();
	31	+ Token t2 = it2.next();
	32	+ if (t1.compareTo(t2) != 0)
	33	+ return t1.compareTo(t2);
	34	+ }
	35	+ if (it1.hasNext())
	36	+ return 1;
	37	+ if (it2.hasNext())
	38	+ return -1;
	39	+
	40	+ return getCtag().compareTo(o.getCtag());
	41	+ }
	42	+
	43	+}
...	...

src/main/java/pl/waw/ipipan/zil/core/md/entities/Text.java 0 → 100644

View file @dfbfe3f

	1	+++ a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Text.java
	1	+package pl.waw.ipipan.zil.core.md.entities;
	2	+
	3	+import java.util.ArrayList;
	4	+
	5	+public class Text extends ArrayList<Paragraph> implements Comparable<Text> {
	6	+
	7	+ private static final long serialVersionUID = 3433069117444647544L;
	8	+
	9	+ private String id;
	10	+
	11	+ public boolean add(Paragraph p) {
	12	+ p.setTextPosition(this.size());
	13	+ p.setText(this);
	14	+ return super.add(p);
	15	+ }
	16	+
	17	+ public String getId() {
	18	+ return id;
	19	+ }
	20	+
	21	+ public void setId(String id) {
	22	+ this.id = id;
	23	+ }
	24	+
	25	+ public Text(String id) {
	26	+ setId(id);
	27	+ }
	28	+
	29	+ public String toString() {
	30	+ StringBuffer sb = new StringBuffer();
	31	+ for (Paragraph par : this)
	32	+ sb.append(par.toString() + "\n\n");
	33	+ return sb.toString();
	34	+ }
	35	+
	36	+ public int compareTo(Text o) {
	37	+ return getId().compareTo(o.getId());
	38	+ }
	39	+
	40	+ public void clearMentions() {
	41	+ for (Paragraph p : this)
	42	+ for (Sentence sent : p)
	43	+ sent.clearMentions();
	44	+ }
	45	+}
...	...

src/main/java/pl/waw/ipipan/zil/core/md/entities/Token.java 0 → 100644

View file @dfbfe3f

	1	+++ a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Token.java
	1	+package pl.waw.ipipan.zil.core.md.entities;
	2	+
	3	+import java.util.ArrayList;
	4	+import java.util.Collection;
	5	+import java.util.Collections;
	6	+import java.util.HashSet;
	7	+import java.util.List;
	8	+import java.util.Set;
	9	+
	10	+public class Token implements Comparable<Token> {
	11	+ private Sentence sentence;
	12	+ private int sentencePosition;
	13	+
	14	+ private Set<Mention> mentions = null;
	15	+
	16	+ private String orth;
	17	+ private Interpretation chosenInterpretation;
	18	+ private Collection<Interpretation> allInterpretations = new HashSet<Interpretation>();
	19	+
	20	+ public Integer getSentencePosition() {
	21	+ return sentencePosition;
	22	+ }
	23	+
	24	+ public void setSentencePosition(int sentencePosition) {
	25	+ this.sentencePosition = sentencePosition;
	26	+ }
	27	+
	28	+ public Sentence getSentence() {
	29	+ return sentence;
	30	+ }
	31	+
	32	+ public void setSentence(Sentence sentence) {
	33	+ this.sentence = sentence;
	34	+ }
	35	+
	36	+ public void setOrth(String orth2) {
	37	+ this.orth = orth2;
	38	+ }
	39	+
	40	+ public String getOrth() {
	41	+ return this.orth;
	42	+ }
	43	+
	44	+ public void addChosenInterpretation(Interpretation chosenIterpretation) {
	45	+ setChosenInterpretation(chosenIterpretation);
	46	+ addInterpretation(chosenIterpretation);
	47	+ }
	48	+
	49	+ public void setChosenInterpretation(Interpretation chosenIterpretation) {
	50	+ this.chosenInterpretation = chosenIterpretation;
	51	+ }
	52	+
	53	+ public Interpretation getChosenInterpretation() {
	54	+ return this.chosenInterpretation;
	55	+ }
	56	+
	57	+ public String getBase() {
	58	+ return this.getChosenInterpretation().getBase();
	59	+ }
	60	+
	61	+ public String getNumber() {
	62	+ return this.getChosenInterpretation().getNumber();
	63	+ }
	64	+
	65	+ public String getGender() {
	66	+ return this.getChosenInterpretation().getGender();
	67	+ }
	68	+
	69	+ public String getCase() {
	70	+ return this.getChosenInterpretation().getCase();
	71	+ }
	72	+
	73	+ public String getPerson() {
	74	+ return this.getChosenInterpretation().getPerson();
	75	+ }
	76	+
	77	+ public void addInterpretation(Interpretation inter) {
	78	+ this.allInterpretations.add(inter);
	79	+ }
	80	+
	81	+ public String toString() {
	82	+ return orth;
	83	+ }
	84	+
	85	+ public void addMention(Mention mention) {
	86	+ if (this.mentions == null)
	87	+ this.mentions = new HashSet<Mention>();
	88	+
	89	+ this.mentions.add(mention);
	90	+ }
	91	+
	92	+ public void removeMention(Mention mention) {
	93	+ this.mentions.remove(mention);
	94	+ }
	95	+
	96	+ public Set<Mention> getMentions() {
	97	+ if (this.mentions == null)
	98	+ return new HashSet<Mention>();
	99	+ return this.mentions;
	100	+ }
	101	+
	102	+ public List<Mention> getMentionsStartingBeforeSegment() {
	103	+ List<Mention> result = new ArrayList<Mention>();
	104	+ for (Mention m : getMentions())
	105	+ if (m.getFirstSegment().equals(this))
	106	+ result.add(m);
	107	+
	108	+ Collections.sort(result);
	109	+ Collections.reverse(result);
	110	+ return result;
	111	+ }
	112	+
	113	+ public List<Mention> getMentionsEndingAfterSegment() {
	114	+ List<Mention> result = new ArrayList<Mention>();
	115	+ for (Mention m : getMentions())
	116	+ if (m.getLastSegment().equals(this))
	117	+ result.add(m);
	118	+
	119	+ Collections.sort(result);
	120	+ Collections.reverse(result);
	121	+ return result;
	122	+ }
	123	+
	124	+ public String getCtag() {
	125	+ return getChosenInterpretation().getCtag();
	126	+ }
	127	+
	128	+ @Override
	129	+ public int compareTo(Token o) {
	130	+ return getSentencePosition().compareTo(o.getSentencePosition());
	131	+ }
	132	+
	133	+}
...	...

src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java 0 → 100644

View file @dfbfe3f

	1	+++ a/src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java
	1	+package pl.waw.ipipan.zil.core.md.io.tei;
	2	+
	3	+import ipipan.clarin.tei.api.entities.TEICorpusText;
	4	+import ipipan.clarin.tei.api.entities.TEIGroup;
	5	+import ipipan.clarin.tei.api.entities.TEIInterpretation;
	6	+import ipipan.clarin.tei.api.entities.TEIMention;
	7	+import ipipan.clarin.tei.api.entities.TEIMorph;
	8	+import ipipan.clarin.tei.api.entities.TEINamedEntity;
	9	+import ipipan.clarin.tei.api.entities.TEIParagraph;
	10	+import ipipan.clarin.tei.api.entities.TEISentence;
	11	+import ipipan.clarin.tei.api.entities.TEISyntacticEntity;
	12	+import ipipan.clarin.tei.api.entities.TEIWord;
	13	+import ipipan.clarin.tei.api.exceptions.TEIException;
	14	+import ipipan.clarin.tei.api.io.TEI_IO;
	15	+
	16	+import java.io.File;
	17	+import java.util.ArrayList;
	18	+import java.util.HashMap;
	19	+import java.util.List;
	20	+import java.util.Map;
	21	+
	22	+import org.apache.log4j.Logger;
	23	+
	24	+import pl.waw.ipipan.zil.core.md.entities.Interpretation;
	25	+import pl.waw.ipipan.zil.core.md.entities.Mention;
	26	+import pl.waw.ipipan.zil.core.md.entities.NamedEntity;
	27	+import pl.waw.ipipan.zil.core.md.entities.Paragraph;
	28	+import pl.waw.ipipan.zil.core.md.entities.Sentence;
	29	+import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup;
	30	+import pl.waw.ipipan.zil.core.md.entities.SyntacticWord;
	31	+import pl.waw.ipipan.zil.core.md.entities.Text;
	32	+import pl.waw.ipipan.zil.core.md.entities.Token;
	33	+
	34	+public class TeiLoader {
	35	+
	36	+ private static Logger logger = Logger.getLogger(TeiLoader.class);
	37	+ private static TEI_IO teiAPI = TEI_IO.getInstance();
	38	+
	39	+ public static TEICorpusText readTeiText(File teiDir) throws TEIException {
	40	+ return teiAPI.readFromNKJPDirectory(teiDir);
	41	+ }
	42	+
	43	+ public static Text loadTextFromTei(TEICorpusText teiText) {
	44	+ Text text = new Text(teiText.getCorpusHeader().getId());
	45	+
	46	+ logger.debug("Loading tei text " + text.getId() + "...");
	47	+ for (TEIParagraph teiP : teiText.getParagraphs())
	48	+ loadParagraph(text, teiP);
	49	+ logger.debug("Tei text loaded.");
	50	+
	51	+ return text;
	52	+ }
	53	+
	54	+ private static void loadParagraph(Text text, TEIParagraph teiP) {
	55	+ Paragraph p = new Paragraph();
	56	+ text.add(p);
	57	+ for (TEISentence teiS : teiP.getSentences())
	58	+ loadSentence(p, teiS);
	59	+ }
	60	+
	61	+ private static void loadSentence(Paragraph p, TEISentence teiS) {
	62	+ Sentence s = new Sentence();
	63	+ p.add(s);
	64	+ Map<TEIMorph, Token> teiMorph2Segment = new HashMap<>();
	65	+ for (TEIMorph teiM : teiS.getMorphs()) {
	66	+ Token token = loadToken(s, teiM);
	67	+ teiMorph2Segment.put(teiM, token);
	68	+ }
	69	+ for (TEINamedEntity ne : teiS.getAllNamedEntities())
	70	+ loadNE(s, ne, teiMorph2Segment);
	71	+ for (TEIWord w : teiS.getAllWords())
	72	+ loadSyntacticWord(s, w, teiMorph2Segment);
	73	+ for (TEIGroup g : teiS.getAllGroups())
	74	+ loadSyntacticGroup(s, g, teiMorph2Segment);
	75	+ for (TEIMention m : teiS.getAllMentions())
	76	+ loadMentions(s, m, teiMorph2Segment);
	77	+ }
	78	+
	79	+ private static void loadMentions(Sentence s, TEIMention m,
	80	+ Map<TEIMorph, Token> teiMorph2Segment) {
	81	+ List<Token> tokens = new ArrayList<>();
	82	+ for (TEIMorph mo : m.getMorphs())
	83	+ tokens.add(teiMorph2Segment.get(mo));
	84	+ List<Token> headTokens = new ArrayList<>();
	85	+ for (TEIMorph mo : m.getHeadMorphs())
	86	+ headTokens.add(teiMorph2Segment.get(mo));
	87	+ s.addMention(new Mention(tokens, headTokens, m.isZeroSubject()));
	88	+ }
	89	+
	90	+ private static void loadSyntacticGroup(Sentence s, TEIGroup g,
	91	+ Map<TEIMorph, Token> teiMorph2Segment) {
	92	+ String type = g.getType();
	93	+
	94	+ List<Token> tokens = new ArrayList<>();
	95	+ for (TEIMorph m : g.getLeaves())
	96	+ tokens.add(teiMorph2Segment.get(m));
	97	+
	98	+ List<Token> headTokens = new ArrayList<>();
	99	+ TEISyntacticEntity semanticHead = g;
	100	+ while (semanticHead.isGroup()
	101	+ && semanticHead.asGroup().getSemanticHead() != null)
	102	+ semanticHead = semanticHead.asGroup().getSemanticHead();
	103	+ for (TEIMorph m : semanticHead.getLeaves())
	104	+ headTokens.add(teiMorph2Segment.get(m));
	105	+
	106	+ s.addSyntacticGroup(new SyntacticGroup(type, tokens, headTokens));
	107	+ }
	108	+
	109	+ private static void loadSyntacticWord(Sentence s, TEIWord w,
	110	+ Map<TEIMorph, Token> teiMorph2Segment) {
	111	+ String ctag = w.getInterpretation().getCtag();
	112	+ List<Token> tokens = new ArrayList<>();
	113	+ for (TEIMorph m : w.getAllMorphs())
	114	+ tokens.add(teiMorph2Segment.get(m));
	115	+ s.addSyntacticWord(new SyntacticWord(ctag, tokens));
	116	+ }
	117	+
	118	+ private static void loadNE(Sentence s, TEINamedEntity ne,
	119	+ Map<TEIMorph, Token> teiMorph2Segment) {
	120	+ List<Token> tokens = new ArrayList<>();
	121	+ for (TEIMorph m : ne.getLeaves())
	122	+ tokens.add(teiMorph2Segment.get(m));
	123	+ s.addNamedEntity(new NamedEntity(tokens));
	124	+ }
	125	+
	126	+ private static Token loadToken(Sentence s, TEIMorph teiM) {
	127	+ Token seg = new Token();
	128	+ s.add(seg);
	129	+
	130	+ seg.setOrth(teiM.getOrth());
	131	+ TEIInterpretation interp = teiM.getChosenInterpretation();
	132	+ Interpretation chosenIterpretation = new Interpretation(
	133	+ interp.getCtag(), interp.getMorph(), interp.getBase());
	134	+ seg.addChosenInterpretation(chosenIterpretation);
	135	+
	136	+ for (TEIInterpretation interp2 : teiM.getAllInterpretations()) {
	137	+ Interpretation inter = new Interpretation(interp2.getCtag(),
	138	+ interp2.getMorph(), interp.getBase());
	139	+ seg.addInterpretation(inter);
	140	+ }
	141	+
	142	+ return seg;
	143	+ }
	144	+
	145	+}
...	...

src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiSaver.java 0 → 100644

View file @dfbfe3f

	1	+++ a/src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiSaver.java
	1	+package pl.waw.ipipan.zil.core.md.io.tei;
	2	+
	3	+import ipipan.clarin.tei.api.entities.AnnotationLayer;
	4	+import ipipan.clarin.tei.api.entities.EntitiesFactory;
	5	+import ipipan.clarin.tei.api.entities.TEICoreference;
	6	+import ipipan.clarin.tei.api.entities.TEICorpusText;
	7	+import ipipan.clarin.tei.api.entities.TEIMention;
	8	+import ipipan.clarin.tei.api.entities.TEIMorph;
	9	+import ipipan.clarin.tei.api.entities.TEIParagraph;
	10	+import ipipan.clarin.tei.api.entities.TEISentence;
	11	+import ipipan.clarin.tei.api.exceptions.TEIException;
	12	+import ipipan.clarin.tei.api.io.TEI_IO;
	13	+import ipipan.clarin.tei.api.io.TEI_IO.CompressionMethod;
	14	+
	15	+import java.io.File;
	16	+import java.util.ArrayList;
	17	+import java.util.HashMap;
	18	+import java.util.Iterator;
	19	+import java.util.List;
	20	+import java.util.Map;
	21	+
	22	+import org.apache.log4j.Logger;
	23	+
	24	+import pl.waw.ipipan.zil.core.md.entities.Mention;
	25	+import pl.waw.ipipan.zil.core.md.entities.Paragraph;
	26	+import pl.waw.ipipan.zil.core.md.entities.Sentence;
	27	+import pl.waw.ipipan.zil.core.md.entities.Text;
	28	+import pl.waw.ipipan.zil.core.md.entities.Token;
	29	+
	30	+public class TeiSaver {
	31	+
	32	+ private static Logger logger = Logger.getLogger(TeiSaver.class);
	33	+ private static TEI_IO teiAPI = TEI_IO.getInstance();
	34	+ final private static EntitiesFactory ef = EntitiesFactory.getInstance();
	35	+
	36	+ public static void saveTeiText(TEICorpusText teiText, File targetDir, boolean gzip) throws TEIException {
	37	+ logger.debug("Saving text in " + targetDir);
	38	+ CompressionMethod cm = gzip ? CompressionMethod.GZIP : CompressionMethod.NONE;
	39	+ teiAPI.writeToNKJPDirectory(teiText, targetDir, cm);
	40	+ }
	41	+
	42	+ public static void updateTeiText(Text t, TEICorpusText teiText) throws TEIException {
	43	+ Map<Mention, TEIMention> mention2mention = new HashMap<Mention, TEIMention>();
	44	+
	45	+ Iterator<Paragraph> pIt = t.iterator();
	46	+ Iterator<TEIParagraph> pItTei = teiText.getParagraphs().iterator();
	47	+ int mentionId = 0;
	48	+ while (pIt.hasNext() && pItTei.hasNext()) {
	49	+ Paragraph p = pIt.next();
	50	+ TEIParagraph pTei = pItTei.next();
	51	+
	52	+ mentionId = updateTeiParagraph(mention2mention, mentionId, p, pTei);
	53	+ }
	54	+ checkIterators(pIt, pItTei, "paragraph");
	55	+
	56	+ teiText.addAnnotationLayer(AnnotationLayer.MENTIONS,
	57	+ EntitiesFactory.getInstance().createHeader(AnnotationLayer.MENTIONS));
	58	+
	59	+ // clear coreference as we have new mentions it became invalid
	60	+ teiText.getAnnotationLayers().remove(AnnotationLayer.COREFERENCE);
	61	+ teiText.setCoreferences(new ArrayList<TEICoreference>());
	62	+
	63	+ logger.debug(mentionId + " mentions added");
	64	+ }
	65	+
	66	+ private static int updateTeiParagraph(Map<Mention, TEIMention> mention2mention, int mentionId, Paragraph p,
	67	+ TEIParagraph pTei) throws TEIException {
	68	+ Iterator<Sentence> sIt = p.iterator();
	69	+ Iterator<TEISentence> sItTei = pTei.getSentences().iterator();
	70	+
	71	+ while (sIt.hasNext() && sItTei.hasNext()) {
	72	+ Sentence s = sIt.next();
	73	+ TEISentence sTei = sItTei.next();
	74	+ mentionId = updateTeiSentence(mention2mention, mentionId, s, sTei);
	75	+ }
	76	+ checkIterators(sIt, sItTei, "sentence");
	77	+ return mentionId;
	78	+ }
	79	+
	80	+ private static int updateTeiSentence(Map<Mention, TEIMention> mention2mention, int mentionId, Sentence s,
	81	+ TEISentence sTei) throws TEIException {
	82	+ sTei.getAllMentions().clear();
	83	+
	84	+ Map<Token, TEIMorph> seg2morph = new HashMap<Token, TEIMorph>();
	85	+
	86	+ Iterator<Token> segIt = s.iterator();
	87	+ Iterator<TEIMorph> segItTei = sTei.getMorphs().iterator();
	88	+
	89	+ while (segIt.hasNext() && segItTei.hasNext()) {
	90	+ seg2morph.put(segIt.next(), segItTei.next());
	91	+ }
	92	+ checkIterators(segIt, segItTei, "token");
	93	+
	94	+ List<TEIMention> mentions = new ArrayList<TEIMention>();
	95	+
	96	+ for (Mention m : s.getMentions()) {
	97	+ List<TEIMorph> morphs = new ArrayList<TEIMorph>();
	98	+ List<TEIMorph> heads = new ArrayList<TEIMorph>();
	99	+
	100	+ for (Token seg : m.getSegments())
	101	+ morphs.add(seg2morph.get(seg));
	102	+
	103	+ for (Token seg : m.getHeadSegments())
	104	+ heads.add(seg2morph.get(seg));
	105	+
	106	+ TEIMention mention = ef.createMention("mention_" + mentionId++, morphs, heads, m.isZeroSubject());
	107	+ mentions.add(mention);
	108	+ mention2mention.put(m, mention);
	109	+ }
	110	+ sTei.setMentions(mentions);
	111	+ return mentionId;
	112	+ }
	113	+
	114	+ private static void checkIterators(Iterator<? extends Object> one, Iterator<? extends Object> other, String level)
	115	+ throws TEIException {
	116	+ if (one.hasNext() \|\| other.hasNext())
	117	+ throw new TEIException("Problem mapping tei to thrift for level " + level);
	118	+ }
	119	+
	120	+}
...	...

src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftLoader.java 0 → 100644

View file @dfbfe3f

	1	+++ a/src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftLoader.java
	1	+package pl.waw.ipipan.zil.core.md.io.thrift;
	2	+
	3	+import java.util.ArrayList;
	4	+import java.util.HashMap;
	5	+import java.util.List;
	6	+import java.util.Map;
	7	+
	8	+import org.apache.log4j.Logger;
	9	+
	10	+import pl.waw.ipipan.multiservice.thrift.types.MultiserviceException;
	11	+import pl.waw.ipipan.multiservice.thrift.types.TInterpretation;
	12	+import pl.waw.ipipan.multiservice.thrift.types.TNamedEntity;
	13	+import pl.waw.ipipan.multiservice.thrift.types.TParagraph;
	14	+import pl.waw.ipipan.multiservice.thrift.types.TSentence;
	15	+import pl.waw.ipipan.multiservice.thrift.types.TSyntacticGroup;
	16	+import pl.waw.ipipan.multiservice.thrift.types.TSyntacticWord;
	17	+import pl.waw.ipipan.multiservice.thrift.types.TText;
	18	+import pl.waw.ipipan.multiservice.thrift.types.TToken;
	19	+import pl.waw.ipipan.zil.core.md.entities.Interpretation;
	20	+import pl.waw.ipipan.zil.core.md.entities.NamedEntity;
	21	+import pl.waw.ipipan.zil.core.md.entities.Paragraph;
	22	+import pl.waw.ipipan.zil.core.md.entities.Sentence;
	23	+import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup;
	24	+import pl.waw.ipipan.zil.core.md.entities.SyntacticWord;
	25	+import pl.waw.ipipan.zil.core.md.entities.Text;
	26	+import pl.waw.ipipan.zil.core.md.entities.Token;
	27	+
	28	+public class ThriftLoader {
	29	+
	30	+ private static Logger logger = Logger.getLogger(ThriftLoader.class);
	31	+
	32	+ public static Text loadTextFromThrift(TText thriftText)
	33	+ throws MultiserviceException {
	34	+ Text text = new Text(thriftText.getTextHeader() == null ? "null"
	35	+ : thriftText.getTextHeader().getId());
	36	+
	37	+ logger.debug("Loading text " + text.getId() + " from thrift format...");
	38	+ for (TParagraph teiP : thriftText.getParagraphs())
	39	+ loadParagraph(text, teiP);
	40	+ logger.debug("Thrift text loaded.");
	41	+
	42	+ return text;
	43	+ }
	44	+
	45	+ private static void loadParagraph(Text text, TParagraph teiP)
	46	+ throws MultiserviceException {
	47	+ Paragraph p = new Paragraph();
	48	+ text.add(p);
	49	+
	50	+ for (TSentence teiS : teiP.getSentences())
	51	+ loadSentence(p, teiS);
	52	+ }
	53	+
	54	+ private static void loadSentence(Paragraph p, TSentence thriftSent)
	55	+ throws MultiserviceException {
	56	+ Sentence s = new Sentence();
	57	+ p.add(s);
	58	+
	59	+ Map<String, Object> thirftId2Entity = getThriftId2EntityMap(thriftSent);
	60	+
	61	+ Map<String, Token> thiftTokenId2Token = new HashMap<>();
	62	+ for (TToken teiM : thriftSent.getTokens()) {
	63	+ Token token = loadToken(s, teiM);
	64	+ thiftTokenId2Token.put(teiM.getId(), token);
	65	+ }
	66	+ if (thriftSent.isSetNames())
	67	+ for (TNamedEntity ne : thriftSent.getNames())
	68	+ loadNE(s, ne, thirftId2Entity, thiftTokenId2Token);
	69	+ if (thriftSent.isSetWords())
	70	+ for (TSyntacticWord w : thriftSent.getWords())
	71	+ loadSyntacticWord(s, w, thirftId2Entity, thiftTokenId2Token);
	72	+ if (thriftSent.isSetGroups())
	73	+ for (TSyntacticGroup g : thriftSent.getGroups())
	74	+ loadSyntacticGroup(s, g, thirftId2Entity, thiftTokenId2Token);
	75	+ }
	76	+
	77	+ private static void loadSyntacticGroup(Sentence s, TSyntacticGroup g,
	78	+ Map<String, Object> thirftId2Entity,
	79	+ Map<String, Token> thiftTokenId2Token) {
	80	+ String type = g.getType();
	81	+ List<Token> tokens = getUnderlyingSegments(g, thirftId2Entity,
	82	+ thiftTokenId2Token, false);
	83	+ List<Token> headTokens = getUnderlyingSegments(g, thirftId2Entity,
	84	+ thiftTokenId2Token, true);
	85	+ s.addSyntacticGroup(new SyntacticGroup(type, tokens, headTokens));
	86	+ }
	87	+
	88	+ private static void loadSyntacticWord(Sentence s, TSyntacticWord w,
	89	+ Map<String, Object> thirftId2Entity,
	90	+ Map<String, Token> thiftTokenId2Token) {
	91	+ String ctag = w.getChosenInterpretation().getCtag();
	92	+ List<Token> tokens = getUnderlyingSegments(w, thirftId2Entity,
	93	+ thiftTokenId2Token, false);
	94	+ s.addSyntacticWord(new SyntacticWord(ctag, tokens));
	95	+ }
	96	+
	97	+ private static void loadNE(Sentence s, TNamedEntity ne,
	98	+ Map<String, Object> thirftId2Entity,
	99	+ Map<String, Token> thiftTokenId2Token) {
	100	+ List<Token> tokens = getUnderlyingSegments(ne, thirftId2Entity,
	101	+ thiftTokenId2Token, false);
	102	+ s.addNamedEntity(new NamedEntity(tokens));
	103	+ }
	104	+
	105	+ private static Map<String, Object> getThriftId2EntityMap(
	106	+ TSentence thriftSent) {
	107	+ Map<String, Object> idToEntity = new HashMap<>();
	108	+ for (TToken tok : thriftSent.getTokens())
	109	+ idToEntity.put(tok.getId(), tok);
	110	+ if (thriftSent.isSetWords())
	111	+ for (TSyntacticWord w : thriftSent.getWords())
	112	+ idToEntity.put(w.getId(), w);
	113	+ if (thriftSent.isSetNames())
	114	+ for (TNamedEntity ne : thriftSent.getNames())
	115	+ idToEntity.put(ne.getId(), ne);
	116	+ if (thriftSent.isSetGroups())
	117	+ for (TSyntacticGroup group : thriftSent.getGroups())
	118	+ idToEntity.put(group.getId(), group);
	119	+ return idToEntity;
	120	+ }
	121	+
	122	+ private static Token loadToken(Sentence s, TToken teiM)
	123	+ throws MultiserviceException {
	124	+ Token seg = new Token();
	125	+ s.add(seg);
	126	+
	127	+ seg.setOrth(teiM.getOrth());
	128	+ TInterpretation interp = getTokenChosenInt(teiM);
	129	+ Interpretation chosenIterpretation = new Interpretation(
	130	+ interp.getCtag(), interp.getMsd(), interp.getBase());
	131	+ seg.addChosenInterpretation(chosenIterpretation);
	132	+
	133	+ for (TInterpretation interp2 : teiM.getInterpretations()) {
	134	+ Interpretation inter = new Interpretation(interp2.getCtag(),
	135	+ interp2.getMsd(), interp.getBase());
	136	+ seg.addInterpretation(inter);
	137	+ }
	138	+ return seg;
	139	+ }
	140	+
	141	+ private static TInterpretation getTokenChosenInt(TToken token)
	142	+ throws MultiserviceException {
	143	+ TInterpretation interp = token.getChosenInterpretation();
	144	+ if (interp == null \|\| interp.getBase() == null
	145	+ \|\| interp.getBase().equals("")) {
	146	+ if (token.getCandidateInterpretations() == null
	147	+ \|\| token.getCandidateInterpretations().size() == 0
	148	+ \|\| token.getCandidateInterpretations().get(0).getBase() == null
	149	+ \|\| token.getCandidateInterpretations().get(0).getBase()
	150	+ .equals(""))
	151	+ throw new MultiserviceException(
	152	+ "No proper chosen or candidate interpretation for segment: "
	153	+ + token.id);
	154	+ interp = token.getCandidateInterpretations().get(0);
	155	+ }
	156	+ return interp;
	157	+ }
	158	+
	159	+ private static List<Token> getUnderlyingSegments(Object entity,
	160	+ Map<String, Object> idToEntity, Map<String, Token> tokenId2Segment,
	161	+ boolean headsOnly) {
	162	+ List<Token> result = new ArrayList<>();
	163	+
	164	+ if (entity instanceof TToken) {
	165	+ result.add(tokenId2Segment.get(((TToken) entity).getId()));
	166	+ return result;
	167	+ }
	168	+
	169	+ List<String> childIds = new ArrayList<>();
	170	+ if (entity instanceof TSyntacticWord)
	171	+ childIds = ((TSyntacticWord) entity).getChildIds();
	172	+ else if (entity instanceof TNamedEntity)
	173	+ childIds = ((TNamedEntity) entity).getChildIds();
	174	+ else if (entity instanceof TSyntacticGroup)
	175	+ if (headsOnly) {
	176	+ childIds = new ArrayList<String>();
	177	+ childIds.add(((TSyntacticGroup) entity).getSemanticHeadId());
	178	+ } else
	179	+ childIds = ((TSyntacticGroup) entity).getChildIds();
	180	+
	181	+ for (String id : childIds)
	182	+ result.addAll(getUnderlyingSegments(idToEntity.get(id), idToEntity,
	183	+ tokenId2Segment, headsOnly));
	184	+
	185	+ return result;
	186	+ }
	187	+}
...	...

src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftSaver.java 0 → 100644

View file @dfbfe3f

	1	+++ a/src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftSaver.java
	1	+package pl.waw.ipipan.zil.core.md.io.thrift;
	2	+
	3	+import java.util.ArrayList;
	4	+import java.util.HashMap;
	5	+import java.util.Iterator;
	6	+import java.util.List;
	7	+import java.util.Map;
	8	+
	9	+import org.apache.log4j.Logger;
	10	+
	11	+import pl.waw.ipipan.multiservice.thrift.types.MultiserviceException;
	12	+import pl.waw.ipipan.multiservice.thrift.types.TMention;
	13	+import pl.waw.ipipan.multiservice.thrift.types.TParagraph;
	14	+import pl.waw.ipipan.multiservice.thrift.types.TSentence;
	15	+import pl.waw.ipipan.multiservice.thrift.types.TText;
	16	+import pl.waw.ipipan.multiservice.thrift.types.TToken;
	17	+import pl.waw.ipipan.zil.core.md.entities.Mention;
	18	+import pl.waw.ipipan.zil.core.md.entities.Paragraph;
	19	+import pl.waw.ipipan.zil.core.md.entities.Sentence;
	20	+import pl.waw.ipipan.zil.core.md.entities.Text;
	21	+import pl.waw.ipipan.zil.core.md.entities.Token;
	22	+
	23	+public class ThriftSaver {
	24	+
	25	+ private static Logger logger = Logger.getLogger(ThriftSaver.class);
	26	+
	27	+ public static void updateThriftText(Text responseText, TText text)
	28	+ throws MultiserviceException {
	29	+
	30	+ logger.debug("Updating thrift text...");
	31	+ Map<Mention, TMention> teiMention2ThriftMention = new HashMap<>();
	32	+
	33	+ Iterator<TParagraph> thrPI = text.getParagraphsIterator();
	34	+ Iterator<Paragraph> teiPI = responseText.iterator();
	35	+ int freeMentionId = 0;
	36	+ while (thrPI.hasNext() && teiPI.hasNext()) {
	37	+ TParagraph thrP = thrPI.next();
	38	+ Paragraph teiP = teiPI.next();
	39	+
	40	+ freeMentionId = updateThriftParagraph(teiMention2ThriftMention,
	41	+ freeMentionId, thrP, teiP);
	42	+ }
	43	+ checkIterators(thrPI, teiPI, "paragraph");
	44	+ }
	45	+
	46	+ private static int updateThriftParagraph(
	47	+ Map<Mention, TMention> teiMention2ThriftMention, int freeMentionId,
	48	+ TParagraph thrP, Paragraph teiP) throws MultiserviceException {
	49	+ Iterator<TSentence> thrSI = thrP.getSentencesIterator();
	50	+ Iterator<Sentence> teiSI = teiP.iterator();
	51	+ while (thrSI.hasNext() && teiSI.hasNext()) {
	52	+ TSentence thrS = thrSI.next();
	53	+ Sentence teiS = teiSI.next();
	54	+ freeMentionId = updateThriftSentence(teiMention2ThriftMention,
	55	+ freeMentionId, thrS, teiS);
	56	+ }
	57	+ checkIterators(thrSI, teiSI, "sentence");
	58	+ return freeMentionId;
	59	+ }
	60	+
	61	+ private static int updateThriftSentence(
	62	+ Map<Mention, TMention> teiMention2ThriftMention, int id,
	63	+ TSentence thrS, Sentence teiS) throws MultiserviceException {
	64	+ thrS.unsetMentions();
	65	+ thrS.setMentions(new ArrayList<TMention>());
	66	+
	67	+ Map<Token, TToken> teiMorph2ThriftToken = new HashMap<>();
	68	+ Iterator<TToken> thrMI = thrS.getTokensIterator();
	69	+ Iterator<Token> teiMI = teiS.iterator();
	70	+ while (thrMI.hasNext() && teiMI.hasNext()) {
	71	+ teiMorph2ThriftToken.put(teiMI.next(), thrMI.next());
	72	+ }
	73	+ checkIterators(thrMI, teiMI, "morph");
	74	+
	75	+ for (Mention m : teiS.getMentions()) {
	76	+ List<String> childIds = new ArrayList<>();
	77	+ List<String> headIds = new ArrayList<>();
	78	+ for (Token ch : m.getSegments())
	79	+ childIds.add(teiMorph2ThriftToken.get(ch).getId());
	80	+ for (Token h : m.getHeadSegments())
	81	+ headIds.add(teiMorph2ThriftToken.get(h).getId());
	82	+
	83	+ TMention tm = new TMention("m-" + (id++), headIds, childIds,
	84	+ m.isZeroSubject());
	85	+ teiMention2ThriftMention.put(m, tm);
	86	+ thrS.addToMentions(tm);
	87	+ }
	88	+ return id;
	89	+ }
	90	+
	91	+ private static void checkIterators(Iterator<? extends Object> one,
	92	+ Iterator<? extends Object> other, String level)
	93	+ throws MultiserviceException {
	94	+ if (one.hasNext() \|\| other.hasNext())
	95	+ throw new MultiserviceException(
	96	+ "Problem mapping interal text representation to thrift for level "
	97	+ + level);
	98	+ }
	99	+
	100	+}
...	...

src/main/resources/log4j.properties 0 → 100644

View file @dfbfe3f

	1	+++ a/src/main/resources/log4j.properties
	1	+log4j.appender.stderr=org.apache.log4j.ConsoleAppender
	2	+log4j.appender.stderr.layout=org.apache.log4j.PatternLayout
	3	+log4j.appender.stderr.layout.ConversionPattern=[%p] [%C{1}] %m%n
	4	+
	5	+log4j.logger.ipipan=INFO, stderr
	6	+log4j.logger.pl.waw.ipipan=INFO, stderr
	7	+log4j.logger.org.apache.thrift=INFO, stderr
0	8	\ No newline at end of file
...	...

src/main/resources/quasi_verbs.txt 0 → 100644

View file @dfbfe3f

	1	+++ a/src/main/resources/quasi_verbs.txt
	1	+bawić
	2	+brać
	3	+brak
	4	+brakować
	5	+być
	6	+bywać
	7	+chcieć
	8	+chodzić
	9	+ciągnąć
	10	+ciec
	11	+czas
	12	+czuć
	13	+dobiec
	14	+dobiegać
	15	+dochodzić
	16	+docierać
	17	+dojść
	18	+dotrzeć
	19	+dusić
	20	+godzić
	21	+gotować
	22	+gryźć
	23	+grzmieć
	24	+iść
	25	+jechać
	26	+kłuć
	27	+kończyć
	28	+kręcić
	29	+kropić
	30	+lać
	31	+łamać
	32	+lecieć
	33	+mieć
	34	+mieszać
	35	+móc
	36	+można
	37	+musieć
	38	+należeć
	39	+nieść
	40	+nosić
	41	+nudzić
	42	+nudzić
	43	+obejść
	44	+odbijać
	45	+odchodzić
	46	+odejmować
	47	+odejść
	48	+odrzucać
	49	+odrzucić
	50	+okazać
	51	+okazywać
	52	+opłacać
	53	+opłacić
	54	+oznaczać
	55	+pachnieć
	56	+padać
	57	+palić
	58	+palić
	59	+paść
	60	+piec
	61	+podobać
	62	+pogorszyć
	63	+pójść
	64	+ponieść
	65	+poprawiać
	66	+pora
	67	+potwierdzać
	68	+potwierdzić
	69	+powinno
	70	+pozostać
	71	+pozostawać
	72	+prosić
	73	+przechodzić
	74	+przestać
	75	+przybyć
	76	+przybywać
	77	+przyjąć
	78	+przyjmować
	79	+przypominać
	80	+przypomnieć
	81	+robić
	82	+rozerwać
	83	+rozumieć
	84	+składać
	85	+skończyć
	86	+skręcać
	87	+skręcić
	88	+słychać
	89	+śnić
	90	+śpieszyć
	91	+stać
	92	+stać
	93	+stanąć
	94	+strzelić
	95	+swędzić
	96	+świecić
	97	+szkoda
	98	+trafiać
	99	+trafić
	100	+trząść
	101	+trzeba
	102	+ucieszyć
	103	+uczynić
	104	+udać
	105	+udawać
	106	+uderzać
	107	+uderzyć
	108	+układać
	109	+ułożyć
	110	+warto
	111	+wiadomo
	112	+widać
	113	+wieść
	114	+wolno
	115	+wstyd
	116	+wychodzić
	117	+wydać
	118	+wydawać
	119	+wyjaśniać
	120	+wyjaśnić
	121	+wyjść
	122	+wypadać
	123	+wypaść
	124	+wypogadzać
	125	+wyrzucić
	126	+wystarczyć
	127	+wziąć
	128	+zabraknąć
	129	+zacząć
	130	+zaczynać
	131	+zagotować
	132	+zainteresować
	133	+zakręcić
	134	+żal
	135	+zależeć
	136	+zanieść
	137	+zanieść
	138	+zanosić
	139	+zanosić
	140	+zapowiadać
	141	+zarzucać
	142	+zastanowić
	143	+zbierać
	144	+zdarzać
	145	+zdziwić
	146	+zebrać
	147	+zemrzeć
	148	+złożyć
	149	+znać
	150	+zrobić
...	...

src/main/resources/zero_subject_model.bin 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/java/pl/waw/ipipan/zil/core/md/MentionDetectorTest.java 0 → 100644

View file @dfbfe3f

	1	+++ a/src/test/java/pl/waw/ipipan/zil/core/md/MentionDetectorTest.java
	1	+package pl.waw.ipipan.zil.core.md;
	2	+
import java.io.File;
import java.io.IOException;

import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
	8	+
	9	+public class MentionDetectorTest {
	10	+
	11	+ @Rule
	12	+ public TemporaryFolder results = new TemporaryFolder();
	13	+
	14	+ @Test
	15	+ public final void test() throws IOException {
	16	+ String[] args = {
	17	+ MentionDetectorTest.class.getResource("/example_test_tei/")
	18	+ .getFile(),
	19	+ results.newFolder().getAbsolutePath() };
	20	+ Main.main(args);
	21	+ }
	22	+}
...	...

src/test/java/pl/waw/ipipan/zil/core/md/detection/zero/TrainerTest.java 0 → 100644

View file @dfbfe3f

	1	+++ a/src/test/java/pl/waw/ipipan/zil/core/md/detection/zero/TrainerTest.java
	1	+package pl.waw.ipipan.zil.core.md.detection.zero;
	2	+
	3	+import java.io.File;
	4	+import java.io.IOException;
	5	+
	6	+import org.junit.Rule;
	7	+import org.junit.Test;
	8	+import org.junit.rules.TemporaryFolder;
	9	+
	10	+public class TrainerTest {
	11	+ @Rule
	12	+ public TemporaryFolder results = new TemporaryFolder();
	13	+
	14	+ @Test
	15	+ public final void test() throws IOException {
	16	+ String[] args = {
	17	+ TrainerTest.class.getResource("/example_train_tei/").getFile(),
	18	+ new File(results.newFolder(), "model.bin").getAbsolutePath(),
	19	+ TrainerTest.class.getResource("/example_model/quasi_verbs.txt")
	20	+ .getFile() };
	21	+ Trainer.main(args);
	22	+ }
	23	+}
...	...

src/test/resources/example_model/model.bin 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_model/quasi_verbs.txt 0 → 100644

View file @dfbfe3f

	1	+++ a/src/test/resources/example_model/quasi_verbs.txt
	1	+bawić
	2	+brać
	3	+brak
	4	+brakować
	5	+być
	6	+bywać
	7	+chcieć
	8	+chodzić
	9	+ciągnąć
	10	+ciec
	11	+czas
	12	+czuć
	13	+dobiec
	14	+dobiegać
	15	+dochodzić
	16	+docierać
	17	+dojść
	18	+dotrzeć
	19	+dusić
	20	+godzić
	21	+gotować
	22	+gryźć
	23	+grzmieć
	24	+iść
	25	+jechać
	26	+kłuć
	27	+kończyć
	28	+kręcić
	29	+kropić
	30	+lać
	31	+łamać
	32	+lecieć
	33	+mieć
	34	+mieszać
	35	+móc
	36	+można
	37	+musieć
	38	+należeć
	39	+nieść
	40	+nosić
	41	+nudzić
	42	+nudzić
	43	+obejść
	44	+odbijać
	45	+odchodzić
	46	+odejmować
	47	+odejść
	48	+odrzucać
	49	+odrzucić
	50	+okazać
	51	+okazywać
	52	+opłacać
	53	+opłacić
	54	+oznaczać
	55	+pachnieć
	56	+padać
	57	+palić
	58	+palić
	59	+paść
	60	+piec
	61	+podobać
	62	+pogorszyć
	63	+pójść
	64	+ponieść
	65	+poprawiać
	66	+pora
	67	+potwierdzać
	68	+potwierdzić
	69	+powinno
	70	+pozostać
	71	+pozostawać
	72	+prosić
	73	+przechodzić
	74	+przestać
	75	+przybyć
	76	+przybywać
	77	+przyjąć
	78	+przyjmować
	79	+przypominać
	80	+przypomnieć
	81	+robić
	82	+rozerwać
	83	+rozumieć
	84	+składać
	85	+skończyć
	86	+skręcać
	87	+skręcić
	88	+słychać
	89	+śnić
	90	+śpieszyć
	91	+stać
	92	+stać
	93	+stanąć
	94	+strzelić
	95	+swędzić
	96	+świecić
	97	+szkoda
	98	+trafiać
	99	+trafić
	100	+trząść
	101	+trzeba
	102	+ucieszyć
	103	+uczynić
	104	+udać
	105	+udawać
	106	+uderzać
	107	+uderzyć
	108	+układać
	109	+ułożyć
	110	+warto
	111	+wiadomo
	112	+widać
	113	+wieść
	114	+wolno
	115	+wstyd
	116	+wychodzić
	117	+wydać
	118	+wydawać
	119	+wyjaśniać
	120	+wyjaśnić
	121	+wyjść
	122	+wypadać
	123	+wypaść
	124	+wypogadzać
	125	+wyrzucić
	126	+wystarczyć
	127	+wziąć
	128	+zabraknąć
	129	+zacząć
	130	+zaczynać
	131	+zagotować
	132	+zainteresować
	133	+zakręcić
	134	+żal
	135	+zależeć
	136	+zanieść
	137	+zanieść
	138	+zanosić
	139	+zanosić
	140	+zapowiadać
	141	+zarzucać
	142	+zastanowić
	143	+zbierać
	144	+zdarzać
	145	+zdziwić
	146	+zebrać
	147	+zemrzeć
	148	+złożyć
	149	+znać
	150	+zrobić
...	...

src/test/resources/example_test_tei/1/ann_groups.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_test_tei/1/ann_morphosyntax.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_test_tei/1/ann_named.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_test_tei/1/ann_segmentation.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_test_tei/1/ann_words.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_test_tei/1/header.xml 0 → 100644

View file @dfbfe3f

	1	+++ a/src/test/resources/example_test_tei/1/header.xml
	1	+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
	2	+<teiHeader xmlns="http://www.tei-c.org/ns/1.0" xmlns:nkjp="http://www.nkjp.pl/ns/1.0" xml:lang="en">
	3	+ <fileDesc>
	4	+ <titleStmt>
	5	+ <title>Paragraphs: p-279,p-280,p-281,p-282,p-283,p-284,p-285,p-286,p-287 from NKJP text with id: IJPPAN_PolPr_TS00264</title>
	6	+ </titleStmt>
	7	+ </fileDesc>
	8	+ <profileDesc>
	9	+ <textClass>
	10	+ <catRef scheme="#taxonomy-CORE" target="Dzienniki"/>
	11	+ </textClass>
	12	+ </profileDesc>
	13	+ <revisionDesc/>
	14	+</teiHeader>
...	...

src/test/resources/example_test_tei/1/text.xml 0 → 100644

View file @dfbfe3f

	1	+++ a/src/test/resources/example_test_tei/1/text.xml
	1	+<?xml version="1.0" ?>
	2	+<teiCorpus xmlns="http://www.tei-c.org/ns/1.0" xmlns:xi="http://www.w3.org/2001/XInclude" xmlns:nkjp="http://www.nkjp.pl/ns/1.0">
	3	+ <xi:include href="PCC_header.xml"/>
	4	+ <TEI>
	5	+ <xi:include href="header.xml"/>
	6	+ <text>
	7	+ <body>
	8	+ <p xml:id="p-1">– Sensownym rozwiązaniem będzie zmiana istniejącego oświetlenia na typ uliczny, czyli na wysokie słupy. W tym roku nie mamy jednak na to pieniędzy – mówi Anita Tyszkiewicz-Zimałka, rzecznik Urzędu Miasta w Raciborzu.</p>
	9	+ <p xml:id="p-2">Przyjęto więc salomonowe rozwiązanie ograniczenia nakładów do minimum. Na odcinku od kładki dla pieszych do restauracji „Zamkowa” co druga latarnia będzie zdemontowana – A elementy z nich będą służyły do naprawiania pozostałych – wyjaśnia rzecznik.</p>
	10	+ <p xml:id="p-3">Jacek Bombor</p>
	11	+ <p xml:id="p-4">W ekstraklasie Francji prowadzący w tabeli zespół Jacka Bąka RC Lens wygrał wyjazdowe spotkanie z Montpellier. Sukces gości jest tym cenniejszy, że od 33 minuty grali oni w osłabieniu, bez Ferdinanda Coly, który ukarany został czerwoną kartką.</p>
	12	+ <p xml:id="p-5">Montpellier – RC Lens 1:2 (0:1). Fugier (88) – Diouf (43), Pedron (65). Czerwona kartka: Coly (Lens)</p>
	13	+ <p xml:id="p-6">Paris St Germain – Sedan 3:0 (1:0). Arteta (23, karny), Alex (82), Cisse (90). Czerwona kartka: Elzeard (Sedan).</p>
	14	+ <p xml:id="p-7">AJ Auxerre – Nantes 2:1 (1:1). Cisse (19), Gonzales (78) – Moldovan (26). Czerwona kartka: Cetto (Nantes).</p>
	15	+ <p xml:id="p-8">Lorient – Troyes 1:0 (0:0). Feindouno (60).</p>
	16	+ <p xml:id="p-9">Metz – Girondins Bordeaux 1:2 (1:0). Desire Job (36) – Pauleta (71), Vikash Dhorasoo (83).</p>
	17	+ </body>
	18	+ </text>
	19	+ </TEI>
	20	+</teiCorpus>
0	21	\ No newline at end of file
...	...

src/test/resources/example_test_tei/2/ann_groups.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_test_tei/2/ann_morphosyntax.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_test_tei/2/ann_named.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_test_tei/2/ann_segmentation.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_test_tei/2/ann_words.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_test_tei/2/header.xml 0 → 100644

View file @dfbfe3f

	1	+++ a/src/test/resources/example_test_tei/2/header.xml
	1	+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
	2	+<teiHeader xmlns="http://www.tei-c.org/ns/1.0" xmlns:nkjp="http://www.nkjp.pl/ns/1.0" xml:lang="en">
	3	+ <fileDesc>
	4	+ <titleStmt>
	5	+ <title>Paragraphs: p-328,p-329 from NKJP text with id: PWN_3102000000066</title>
	6	+ </titleStmt>
	7	+ </fileDesc>
	8	+ <profileDesc>
	9	+ <textClass>
	10	+ <catRef scheme="#taxonomy-CORE" target="Literatura faktu"/>
	11	+ </textClass>
	12	+ </profileDesc>
	13	+ <revisionDesc/>
	14	+</teiHeader>
...	...

src/test/resources/example_test_tei/2/text.xml 0 → 100644

View file @dfbfe3f

	1	+++ a/src/test/resources/example_test_tei/2/text.xml
	1	+<?xml version="1.0" ?>
	2	+<teiCorpus xmlns="http://www.tei-c.org/ns/1.0" xmlns:xi="http://www.w3.org/2001/XInclude" xmlns:nkjp="http://www.nkjp.pl/ns/1.0">
	3	+ <xi:include href="PCC_header.xml"/>
	4	+ <TEI>
	5	+ <xi:include href="header.xml"/>
	6	+ <text>
	7	+ <body>
	8	+ <p xml:id="p-1">To, że mściciele mieli prawo bezkarnie zabić nie tylko mordercę, ale i jego synów, zostało zapisane czarno na białym. Ale nie koniec na tym. Sens formułki et ille ac filii eius soli sint faidosi polega na zawężeniu kręgu osób, które mścicielom wolno zabić. Wiąże się to z poprzednią częścią zdania: ośmiokrotność zwykłego wergeldu morderca ma zapłacić sam, bez udziału dalszych krewnych. Wolno stąd wnosić, że gdyby nie zapłacono "zwykłego" wergeldu in simplo, którego trzecią część musieli pokryć boczni krewni zbrodniarza, byliby oni razem z mordercą i jego domownikami wystawieni na wróżdę strony poszkodowanej.</p>
	9	+ <p xml:id="p-2">Tytuł XVIII Prawa Sasów poświęcony jest odpowiedzialności karnej pana za zabójstwo popełnione przez lita, a właściwie temu, jak można się od tej odpowiedzialności uwolnić: „Jeżeli lit z rozkazu lub z poduszczenia swojego pana zabije jakiegoś człowieka, na przykład nobila, to pan płaci główszczyznę lub podlega wróżdzie; jeżeli zaś [lit] popełni ten czyn bez wiedzy pana, to ma być przez pana wyzwolony, i [wtedy] krewni ofiary mają się mścić na nim samym [to jest sprawcy] i na pozostałych siedmiu jego krewnych, a pan lita musi przysiąc z jedenastoma współprzysiężnikami, że nie był wtajemniczony w zbrodnię" (Litus si per iuissum vel consilium domini sui hominem occiderit, ut puta nobilem, dominus conpositionem persolvat vel faidam portet; si autem absque conscientia domini hoc fecerit, dimittatur a domino, et vindicetur in illo et aliis VII consanguineis eius a propinquis occisi, et dominus liti se in hoc conscium non esse cum XI iuret).</p>
	10	+ </body>
	11	+ </text>
	12	+ </TEI>
	13	+</teiCorpus>
0	14	\ No newline at end of file
...	...

src/test/resources/example_test_tei/3/ann_groups.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_test_tei/3/ann_morphosyntax.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_test_tei/3/ann_named.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_test_tei/3/ann_segmentation.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_test_tei/3/ann_words.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_test_tei/3/header.xml 0 → 100644

View file @dfbfe3f

	1	+++ a/src/test/resources/example_test_tei/3/header.xml
	1	+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
	2	+<teiHeader xmlns="http://www.tei-c.org/ns/1.0" xmlns:nkjp="http://www.nkjp.pl/ns/1.0" xml:lang="en">
	3	+ <fileDesc>
	4	+ <titleStmt>
	5	+ <title>Paragraphs: p-38,p-39,p-40,p-41,p-42,p-43,p-44,p-45 from NKJP text with id: IJPPAN_p00111b00010a</title>
	6	+ </titleStmt>
	7	+ </fileDesc>
	8	+ <profileDesc>
	9	+ <textClass>
	10	+ <catRef scheme="#taxonomy-CORE" target="Literatura piękna"/>
	11	+ </textClass>
	12	+ </profileDesc>
	13	+ <revisionDesc/>
	14	+</teiHeader>
...	...

src/test/resources/example_test_tei/3/text.xml 0 → 100644

View file @dfbfe3f

	1	+++ a/src/test/resources/example_test_tei/3/text.xml
	1	+<?xml version="1.0" ?>
	2	+<teiCorpus xmlns="http://www.tei-c.org/ns/1.0" xmlns:xi="http://www.w3.org/2001/XInclude" xmlns:nkjp="http://www.nkjp.pl/ns/1.0">
	3	+ <xi:include href="PCC_header.xml"/>
	4	+ <TEI>
	5	+ <xi:include href="header.xml"/>
	6	+ <text>
	7	+ <body>
	8	+ <p xml:id="p-1">Wrócił niedługo potem i szepnął coś do Margaret.</p>
	9	+ <p xml:id="p-2">Oboje odwrócili się w kierunku majora Kovalsky’ego.</p>
	10	+ <p xml:id="p-3">Działo się coś złego, i to bardzo.</p>
	11	+ <p xml:id="p-4">Zdążył nawet wyciągnąć pistolet i postrzelić Smitha, ale Margaret była szybsza. Ciosem dłoni powaliła go na ziemię. Tracąc przytomność pomyślał, że nie spodziewał się tyle siły w tak wątłym ciele.</p>
	12	+ <p xml:id="p-5">VII</p>
	13	+ <p xml:id="p-6">Gdy się obudził nie miał lewej ręki. Z rany sączyła się krew. Obok leżał, dysząc ciężko, John Smith. Również krwawił, tyle, że na niebiesko. "Wszystko na opak w tym pojebanym miejscu" – pomyślał Kovalsky i znów zemdlał.</p>
	14	+ <p xml:id="p-7">Gdy ocknął się drugi raz, Smith wyglądał trochę lepiej, a całą twarz miał we krwi. Czerwonej. Ręka majora obficie krwawiła do jakiegoś naczynia. Opodal uwijała się Margaret, która sprawiedliwie rozdzielała krwawy posiłek między siebie i Johna Smitha. Oboje sprawiali wrażenie bardzo szczęśliwych.</p>
	15	+ <p xml:id="p-8">Margaret podeszła do Kovalsky’ego i pogłaskała po policzku. – Kochany, to był cudowny pomysł z tym wyścigiem. Naprawdę świetny. Nawet nie przypuszczałam... Nie przypuszczaliśmy... Jeden z nich się przewrócił i rozciął dłoń. Zaczął ssać i krwawienie ustało. A potem drugi, ale skaleczył się w nogę. Nie mógł sobie pomóc, więc myśmy to zrobili. Och... – Margaret jęknęła zmysłowo, a Kovalsky’ego znów ogarnęła ciemność.</p>
	16	+ </body>
	17	+ </text>
	18	+ </TEI>
	19	+</teiCorpus>
0	20	\ No newline at end of file
...	...

src/test/resources/example_train_tei/1/ann_coreference.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_train_tei/1/ann_groups.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_train_tei/1/ann_mentions.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_train_tei/1/ann_morphosyntax.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_train_tei/1/ann_named.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_train_tei/1/ann_segmentation.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_train_tei/1/ann_words.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_train_tei/1/header.xml 0 → 100644

View file @dfbfe3f

	1	+++ a/src/test/resources/example_train_tei/1/header.xml
	1	+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
	2	+<teiHeader xmlns="http://www.tei-c.org/ns/1.0" xmlns:nkjp="http://www.nkjp.pl/ns/1.0" xml:lang="en">
	3	+ <fileDesc>
	4	+ <titleStmt>
	5	+ <title>Paragraphs: p-57,p-58,p-59,p-60 from NKJP text with id: IPIPAN_1301919980826</title>
	6	+ </titleStmt>
	7	+ </fileDesc>
	8	+ <profileDesc>
	9	+ <textClass>
	10	+ <catRef scheme="#taxonomy-CORE" target="Dzienniki"/>
	11	+ </textClass>
	12	+ </profileDesc>
	13	+ <revisionDesc/>
	14	+</teiHeader>
...	...

src/test/resources/example_train_tei/1/text.xml 0 → 100644

View file @dfbfe3f

	1	+++ a/src/test/resources/example_train_tei/1/text.xml
	1	+<?xml version="1.0" ?>
	2	+<teiCorpus xmlns="http://www.tei-c.org/ns/1.0" xmlns:xi="http://www.w3.org/2001/XInclude" xmlns:nkjp="http://www.nkjp.pl/ns/1.0">
	3	+ <xi:include href="PCC_header.xml"/>
	4	+ <TEI>
	5	+ <xi:include href="header.xml"/>
	6	+ <text>
	7	+ <body>
	8	+ <p xml:id="p-1">W spotkaniu weźmie udział blisko 7 tysięcy braci z całej Europy, ale tylko 206 z nich będzie ubiegało się o tytuł Europejskiego Króla Kurkowego. - Wezmę udział w strzelaniu, choć moje szanse są marne. Wynika to przede wszystkim z moich obowiązków gospodarza spotkań; w tym nawale pracy ciężko mi będzie się skupić na strzelaniu - przewiduje Zdzisław Maj, prezes krakowskiego Bractwa Kurkowego, panujący Król Kurkowy.</p>
	9	+ <p xml:id="p-2">Strzelanie o tytuł Europejskiego Króla Kurkowego będzie się odbywało w kilku etapach. Do finału zostanie dopuszczonych 27 braci - jeden z nich otrzyma tytuł Europejskiego Króla Kurkowego odbierając go obecnie panującemu Wilfriedowi Stammermannowi. - Król nie otrzymuje żadnych nagród finansowych, ale taki tytuł jest ogromnym zaszczytem; król jest np. zapraszany na posiedzenia Parlamentu Europejskiego - mówi Zdzisław Maj.</p>
	10	+ <p xml:id="p-3">Największą atrakcją 12. Europejskich Spotkań Bractw Strzeleckich będzie wielka parada, która rozpocznie się w niedzielę o godz. 13. Kilkuset braci w historycznych strojach przejdzie z Błoń na Rynek ulicami: Piłsudskiego, Straszewskiego, Franciszkańską i Grodzką.</p>
	11	+ <p xml:id="p-4">Początki istnienia Bractwa Kurkowego w Krakowie sięgają XIII wieku. Skupiało ono znamienitych obywateli, kupców i rzemieślników pragnących wspomóc obronność miasta. Wielkim świętem bractwa był turniej, który odbywał się na strzelnicy zwanej Celestatem. Zawody trwały zwykle trzy dni. Strzelano do drewnianego kura umocowanego na wysokiej żerdzi. Brat, który zdołał celnym strzałem strącić ostatni jego fragment zdobywał miano Króla Kurkowego. Z tym tytułem wiązały się nie tylko honory, ale także przywileje: Rada Miejska zwalniała jego posiadacza m.in. z obowiązku płacenia podatków (ten zwyczaj utrzymał się do dziś).</p>
	12	+ </body>
	13	+ </text>
	14	+ </TEI>
	15	+</teiCorpus>
0	16	\ No newline at end of file
...	...

src/test/resources/example_train_tei/2/ann_coreference.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_train_tei/2/ann_groups.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_train_tei/2/ann_mentions.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_train_tei/2/ann_morphosyntax.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_train_tei/2/ann_named.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_train_tei/2/ann_segmentation.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_train_tei/2/ann_words.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_train_tei/2/header.xml 0 → 100644

View file @dfbfe3f

	1	+++ a/src/test/resources/example_train_tei/2/header.xml
	1	+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
	2	+<teiHeader xmlns="http://www.tei-c.org/ns/1.0" xmlns:nkjp="http://www.nkjp.pl/ns/1.0" xml:lang="en">
	3	+ <fileDesc>
	4	+ <titleStmt>
	5	+ <title>Paragraphs: p-437,p-438,p-439,p-440,p-441,p-442,p-443,p-444,p-445 from NKJP text with id: IJPPAN_PolPr_SlP00841</title>
	6	+ </titleStmt>
	7	+ </fileDesc>
	8	+ <profileDesc>
	9	+ <textClass>
	10	+ <catRef scheme="#taxonomy-CORE" target="Dzienniki"/>
	11	+ </textClass>
	12	+ </profileDesc>
	13	+ <revisionDesc/>
	14	+</teiHeader>
...	...

src/test/resources/example_train_tei/2/text.xml 0 → 100644

View file @dfbfe3f

	1	+++ a/src/test/resources/example_train_tei/2/text.xml
	1	+<?xml version="1.0" ?>
	2	+<teiCorpus xmlns="http://www.tei-c.org/ns/1.0" xmlns:xi="http://www.w3.org/2001/XInclude" xmlns:nkjp="http://www.nkjp.pl/ns/1.0">
	3	+ <xi:include href="PCC_header.xml"/>
	4	+ <TEI>
	5	+ <xi:include href="header.xml"/>
	6	+ <text>
	7	+ <body>
	8	+ <p xml:id="p-1">Ernest i Agnieszka nie planowali, że będą mieli wielką, babską rodzinę. Ale tak wyszło. – I całe szczęście. Lepiej się dogaduję z dziewczętami – cieszy się Ernest Kwiecień.</p>
	9	+ <p xml:id="p-2">W Wigilię do jego obowiązków, poza dostarczeniem choinki, należeć będzie zmywanie naczyń. Agnieszka zrobi pierogi, ugotuje barszcz z uszkami, usmaży karpia. Córki upieką ciasta. Potem przyjdzie czas na prezenty. Może to nawet będą empetrójki, o których marzą starsze dziewczyny.</p>
	10	+ <p xml:id="p-3">Jodełek sadzimy mniej</p>
	11	+ <p xml:id="p-4">Leśniczy, od którego pan Ernest przywozi choinkę, mieszka kilka kilometrów od domu Kwietniów. On także nie wyobraża sobie świąt bez prawdziwego świerku. – I musi być kiczowaty – uśmiecha się Gabriel Grobelny, nadleśniczy wałbrzyski.</p>
	12	+ <p xml:id="p-5">To znaczy, że powinny na nim wisieć ozdoby zrobione przez dzieci, przechowywane latami, wyciągane na tę jedyną okazję.</p>
	13	+ <p xml:id="p-6">Pan Gabriel ma dwóch synów i trzy córki. W domu została najmłodsza, 12-letnia, ale na święta zjadą wszyscy. I ubiorą choinkę. – Żona rozwiesi anielskie włosy, ja podłączę lampki – w domu nadleśniczego podział świątecznych ról jest określony.</p>
	14	+ <p xml:id="p-7">W dolnośląskich lasach najwięcej jest świerków. Na plantacjach sadzą także coraz popularniejsze jodły z miękkimi igłami.</p>
	15	+ <p xml:id="p-8">– Ale i tych jodełek sadzimy już mniej. To nie lata dziewięćdziesiąte, gdy sprzedawaliśmy prawie wszystkie wyhodowane drzewka – wspomina nadleśniczy.</p>
	16	+ <p xml:id="p-9">U Grobelnego choinkę można sobie wybrać. – Mamy rodziny, w których co roku ojciec przyjeżdża z synem, by samemu ściąć drzewko. Taką mają tradycję – dodaje pan Gabriel.</p>
	17	+ </body>
	18	+ </text>
	19	+ </TEI>
	20	+</teiCorpus>
0	21	\ No newline at end of file
...	...

src/test/resources/example_train_tei/3/ann_coreference.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_train_tei/3/ann_groups.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_train_tei/3/ann_mentions.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_train_tei/3/ann_morphosyntax.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_train_tei/3/ann_named.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_train_tei/3/ann_segmentation.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_train_tei/3/ann_words.xml.gz 0 → 100644

View file @dfbfe3f

No preview for this file type

src/test/resources/example_train_tei/3/header.xml 0 → 100644

View file @dfbfe3f

	1	+++ a/src/test/resources/example_train_tei/3/header.xml
	1	+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
	2	+<teiHeader xmlns="http://www.tei-c.org/ns/1.0" xmlns:nkjp="http://www.nkjp.pl/ns/1.0" xml:lang="en">
	3	+ <fileDesc>
	4	+ <titleStmt>
	5	+ <title>Paragraphs: p-6,p-7,p-8,p-9 from NKJP text with id: PELCRA_1303919960926</title>
	6	+ </titleStmt>
	7	+ </fileDesc>
	8	+ <profileDesc>
	9	+ <textClass>
	10	+ <catRef scheme="#taxonomy-CORE" target="Dzienniki"/>
	11	+ </textClass>
	12	+ </profileDesc>
	13	+ <revisionDesc/>
	14	+</teiHeader>
...	...

src/test/resources/example_train_tei/3/text.xml 0 → 100644

View file @dfbfe3f

	1	+++ a/src/test/resources/example_train_tei/3/text.xml
	1	+<?xml version="1.0" ?>
	2	+<teiCorpus xmlns="http://www.tei-c.org/ns/1.0" xmlns:xi="http://www.w3.org/2001/XInclude" xmlns:nkjp="http://www.nkjp.pl/ns/1.0">
	3	+ <xi:include href="PCC_header.xml"/>
	4	+ <TEI>
	5	+ <xi:include href="header.xml"/>
	6	+ <text>
	7	+ <body>
	8	+ <p xml:id="p-1">Cena życia</p>
	9	+ <p xml:id="p-2">Z tego pogromu ocalało kilkudziesięciu Żydów, a wśród nich rodzina Mosze Sonensona. Przed wojną była to w skali miasteczka rodzina bogata. Sonensonowie mieli garbarnię. Nie udało mi się dociec, u kogo mianowicie przechowywali się Sonensonowie oraz pozostali Żydzi w czasie okupacji niemieckiej. Faktem pozostaje natomiast, że okupację tę przeżyli. Faktem oczywistym pozostaje i to, że liczne rodziny polskie - w Ejszyszkach i w pobliskich okolicach - przechowywały Żydów. Parę kilometrów od Ejszyszek, w Korkucianach (w folwarku Lebiedniki), żołnierz AK Kazimierz Korkuć w czasie wojny w swoim domu przechowywał 28 Żydów. Od studni do piwnic domu był przekopany tunel, dzięki czemu mieli wodę. Natomiast w skali siatki AK Kazimierz Korkuć przechowywał około 70 Żydów. Rodzina Świeczków również przechowywała Żydów. W tamtych stronach liczne rodziny polskie postępowały podobnie.</p>
	10	+ <p xml:id="p-3">Prawdą jest również i to, że Żydzi za swe przechowanie płacili. Płacili za utrzymanie i chyba jeszcze - za ryzyko. O tym dzisiaj raczej tu się nie mówi, ale prawdopodobnie różnie z tym było: jedni za pieniądze, inni - z odruchu serca. Ryzykowali i Polacy, i Żydzi. Te rachunki mogły wyglądać bardzo różnie.</p>
	11	+ <p xml:id="p-4">Mieszkam w jednej z podwileńskich wsi. Otóż w tej mojej wsi pewien gospodarz - Polak - przechowywał w czasie wojny młodą Żydówkę. Spodobała mu się, z czego wynikł dramat. Zdenerwowana żona doniosła na policję. Aresztowano Żydówkę razem z gospodarzem, przerażona kobieta próbowała ocalić męża. Zanim uzbierała potrzebną sumę na łapówkę, było już za późno - rozstrzelano nie tylko Żydówkę, ale i gospodarza. Czy żonę tego straceńca można nazwać antysemitką?</p>
	12	+ </body>
	13	+ </text>
	14	+ </TEI>
	15	+</teiCorpus>
0	16	\ No newline at end of file
...	...