MtasTokenizer.java.html
16.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"><html xmlns="http://www.w3.org/1999/xhtml" lang="en"><head><meta http-equiv="Content-Type" content="text/html;charset=UTF-8"/><link rel="stylesheet" href="../jacoco-resources/report.css" type="text/css"/><link rel="shortcut icon" href="../jacoco-resources/report.gif" type="image/gif"/><title>MtasTokenizer.java</title><link rel="stylesheet" href="../jacoco-resources/prettify.css" type="text/css"/><script type="text/javascript" src="../jacoco-resources/prettify.js"></script></head><body onload="window['PR_TAB_WIDTH']=4;prettyPrint()"><div class="breadcrumb" id="breadcrumb"><span class="info"><a href="../jacoco-sessions.html" class="el_session">Sessions</a></span><a href="../index.html" class="el_report">MTAS</a> > <a href="index.source.html" class="el_package">mtas.analysis</a> > <span class="el_source">MtasTokenizer.java</span></div><h1>MtasTokenizer.java</h1><pre class="source lang-java linenums">package mtas.analysis;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.util.HashMap;
import java.util.Iterator;
import mtas.analysis.parser.MtasParser;
import mtas.analysis.token.MtasToken;
import mtas.analysis.token.MtasTokenCollection;
import mtas.analysis.util.MtasConfigException;
import mtas.analysis.util.MtasConfiguration;
import mtas.analysis.util.MtasParserException;
import mtas.codec.payload.MtasPayloadEncoder;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeFactory;
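/**
 * Lucene {@link Tokenizer} that hands its input to a configurable
 * {@link MtasParser} and emits the resulting {@link MtasToken}s, setting the
 * term, offset, position increment and payload attribute for each token.
 */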
public final class MtasTokenizer extends Tokenizer {
/** Commons-logging logger shared by all instances of this class. */
<span class="fc" id="L35"> private static final Log log = LogFactory.getLog(MtasTokenizer.class);</span>
/**
 * The Constant CONFIGURATION_MTAS. Not referenced inside this class;
 * presumably the name of the root configuration element (TODO confirm).
 */
public static final String CONFIGURATION_MTAS = "mtas";
/** Name of the configuration child whose own children switch encoding flags on or off. */
public static final String CONFIGURATION_MTAS_INDEX = "index";
/** Attribute read from each child of the index element; holds the on/off value. */
public static final String CONFIGURATION_MTAS_INDEX_ATTRIBUTE = "index";
/** Name of the configuration child that describes the parser. */
public static final String CONFIGURATION_MTAS_PARSER = "parser";
/** Attribute of the parser element holding the class name passed to Class.forName. */
public static final String CONFIGURATION_MTAS_PARSER_ATTRIBUTE = "name";
/** Attribute values that enable an encoding flag: "true" or "1". */
private static final String VALUE_TRUE = "true";
/** Attribute values that disable an encoding flag: "false" or "0". */
private static final String VALUE_FALSE = "false";
private static final String VALUE_0 = "0";
private static final String VALUE_1 = "1";
/** Start position of the most recently emitted token; reset() sets it to -1. */
<span class="pc" id="L52"> private int currentPosition = 0;</span>
/** Flags handed to every MtasPayloadEncoder; adjusted by processConfiguration. */
<span class="pc" id="L55"> private int encodingFlags = MtasPayloadEncoder.ENCODE_DEFAULT;</span>
/** Class name of the parser to instantiate; stays null until a parser element is processed. */
<span class="pc" id="L58"> private String parserName = null;</span>
/** Configuration element passed to the parser's constructor. */
<span class="pc" id="L61"> private MtasConfiguration parserConfiguration = null;</span>
/** Tokens produced by the parser during the last reset(). */
private MtasTokenCollection tokenCollection;
/** Receives the value of each emitted token. */
<span class="pc" id="L67"> private final CharTermAttribute termAtt = addAttribute(</span>
CharTermAttribute.class);
/** Receives the start and end offset of each emitted token. */
<span class="pc" id="L71"> private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);</span>
/** Receives the payload produced by MtasPayloadEncoder for each emitted token. */
<span class="pc" id="L74"> private final PayloadAttribute payloadAtt = addAttribute(</span>
PayloadAttribute.class);
/** Receives the distance between the start positions of consecutive tokens. */
<span class="pc" id="L78"> private final PositionIncrementAttribute positionIncrementAtt = addAttribute(</span>
PositionIncrementAttribute.class);
/** Iterator over tokenCollection; set to null by reset() when parsing fails. */
private Iterator<MtasToken> tokenCollectionIterator;
/**
 * Creates a tokenizer without processing any configuration, so no parser
 * name or parser configuration is set by this constructor.
 */
public MtasTokenizer() {
  super();
}
/**
 * Creates a tokenizer and configures it from a configuration file.
 *
 * @param configFileName path of the configuration file, handed to
 *        {@code readConfigurationFile}
 */
public MtasTokenizer(final String configFileName) {
  super();
  readConfigurationFile(configFileName);
}
/**
 * Creates a tokenizer from an already loaded configuration.
 *
 * @param config the configuration to process
 * @throws IOException if {@code processConfiguration} rejects the
 *         configuration
 */
public MtasTokenizer(final MtasConfiguration config) throws IOException {
  super();
  processConfiguration(config);
}
/**
* Instantiates a new mtas tokenizer.
*
* @param reader the reader
* @throws IOException Signals that an I/O exception has occurred.
*/
<span class="nc" id="L115"> public MtasTokenizer(final InputStream reader) throws IOException {</span>
<span class="nc" id="L116"> processConfiguration(MtasConfiguration.readConfiguration(reader));</span>
<span class="nc" id="L117"> }</span>
/**
 * Creates a tokenizer that uses the given attribute factory and an already
 * loaded configuration.
 *
 * @param factory the attribute factory passed on to the Lucene superclass
 * @param config the configuration to process
 * @throws IOException if {@code processConfiguration} rejects the
 *         configuration
 */
public MtasTokenizer(final AttributeFactory factory,
    final MtasConfiguration config) throws IOException {
  super(factory);
  processConfiguration(config);
}
/*
* (non-Javadoc)
*
* @see org.apache.lucene.analysis.TokenStream#incrementToken()
*/
@Override
public boolean incrementToken() throws IOException {
<span class="fc" id="L139"> clearAttributes();</span>
MtasToken token;
Integer positionIncrement;
MtasPayloadEncoder payloadEncoder;
<span class="pc bpc" id="L143" title="1 of 2 branches missed."> if (tokenCollectionIterator == null) {</span>
<span class="nc" id="L144"> return false;</span>
<span class="fc bfc" id="L145" title="All 2 branches covered."> } else if (tokenCollectionIterator.hasNext()) {</span>
<span class="fc" id="L146"> token = tokenCollectionIterator.next();</span>
// compute info
<span class="fc" id="L148"> positionIncrement = token.getPositionStart() - currentPosition;</span>
<span class="fc" id="L149"> currentPosition = token.getPositionStart();</span>
<span class="fc" id="L150"> payloadEncoder = new MtasPayloadEncoder(token, encodingFlags);</span>
// set info
<span class="fc" id="L152"> termAtt.append(token.getValue());</span>
<span class="fc" id="L153"> positionIncrementAtt.setPositionIncrement(positionIncrement);</span>
<span class="fc" id="L154"> offsetAtt.setOffset(token.getOffsetStart(), token.getOffsetEnd());</span>
<span class="fc" id="L155"> payloadAtt.setPayload(payloadEncoder.getPayload());</span>
<span class="fc" id="L156"> return true;</span>
}
<span class="fc" id="L158"> return false;</span>
}
/*
* (non-Javadoc)
*
* @see org.apache.lucene.analysis.Tokenizer#reset()
*/
@Override
public void reset() throws IOException {
<span class="fc" id="L168"> super.reset();</span>
<span class="fc" id="L169"> currentPosition = -1;</span>
try {
<span class="fc" id="L171"> constructTokenCollection(input);</span>
<span class="fc" id="L172"> tokenCollectionIterator = tokenCollection.iterator();</span>
<span class="nc" id="L173"> } catch (MtasConfigException | MtasParserException e) { </span>
<span class="nc" id="L174"> tokenCollectionIterator = null;</span>
<span class="nc" id="L175"> throw new IOException(e);</span>
<span class="fc" id="L176"> }</span>
<span class="fc" id="L177"> }</span>
/**
* Prints the.
*
* @param r the r
* @throws MtasParserException the mtas parser exception
*/
public void print(final Reader r) throws MtasParserException {
try {
<span class="nc" id="L187"> setReader(r);</span>
<span class="nc" id="L188"> reset();</span>
<span class="nc bnc" id="L189" title="All 2 branches missed."> if (tokenCollection != null) {</span>
<span class="nc" id="L190"> tokenCollection.print();</span>
}
<span class="nc" id="L192"> end();</span>
<span class="nc" id="L193"> close();</span>
<span class="nc" id="L194"> } catch (IOException e) {</span>
<span class="nc" id="L195"> log.error(e);</span>
<span class="nc" id="L196"> throw new MtasParserException(e.getClass() + " : " + e.getMessage());</span>
<span class="nc" id="L197"> }</span>
<span class="nc" id="L198"> }</span>
/**
* Gets the list.
*
* @param r the r
* @return the list
* @throws IOException Signals that an I/O exception has occurred.
*/
public String[][] getList(final Reader r) throws IOException {
try {
<span class="nc" id="L209"> setReader(r);</span>
<span class="nc" id="L210"> reset();</span>
<span class="nc" id="L211"> String[][] result = tokenCollection.getList();</span>
<span class="nc" id="L212"> end();</span>
<span class="nc" id="L213"> close();</span>
<span class="nc" id="L214"> return result;</span>
<span class="nc" id="L215"> } catch (MtasParserException e) {</span>
<span class="nc" id="L216"> log.info(e);</span>
<span class="nc" id="L217"> throw new IOException("can't produce list");</span>
}
}
/**
* Construct token collection.
*
* @param reader the reader
* @throws MtasConfigException the mtas config exception
* @throws MtasParserException the mtas parser exception
*/
private void constructTokenCollection(final Reader reader)
throws MtasConfigException, MtasParserException {
<span class="fc" id="L230"> tokenCollection = null;</span>
try {
<span class="fc" id="L232"> Constructor<?> c = Class.forName(parserName)</span>
<span class="fc" id="L233"> .getDeclaredConstructor(MtasConfiguration.class);</span>
// try {
<span class="fc" id="L235"> Object p = c.newInstance(parserConfiguration);</span>
<span class="pc bpc" id="L236" title="1 of 2 branches missed."> if (p instanceof MtasParser) {</span>
<span class="fc" id="L237"> MtasParser parser = (MtasParser) p;</span>
<span class="fc" id="L238"> tokenCollection = parser.createTokenCollection(reader);</span>
<span class="fc" id="L239"> return;</span>
} else {
<span class="nc" id="L241"> throw new MtasConfigException("no instance of MtasParser");</span>
}
<span class="nc" id="L243"> } catch (MtasParserException e) {</span>
<span class="nc" id="L244"> log.debug(e);</span>
<span class="nc" id="L245"> tokenCollection = new MtasTokenCollection();</span>
<span class="nc" id="L246"> throw new MtasParserException(e.getMessage());</span>
<span class="nc" id="L247"> } catch (NoSuchMethodException | InvocationTargetException</span>
| IllegalAccessException | ClassNotFoundException
| InstantiationException e) {
<span class="nc" id="L250"> log.debug(e);</span>
<span class="nc" id="L251"> throw new MtasConfigException(</span>
<span class="nc" id="L252"> e.getClass().getName() + " : '" + e.getMessage() + "'");</span>
}
}
/**
* Read configuration file.
*
* @param configFile the config file
*/
private void readConfigurationFile(final String configFile) {
InputStream is;
try {
<span class="nc" id="L265"> is = new FileInputStream(configFile);</span>
<span class="nc" id="L266"> processConfiguration(MtasConfiguration.readConfiguration(is));</span>
<span class="nc" id="L267"> is.close();</span>
<span class="nc" id="L268"> } catch (FileNotFoundException e) {</span>
<span class="nc" id="L269"> log.error("Couldn't find " + configFile, e);</span>
<span class="nc" id="L270"> } catch (IOException e) {</span>
<span class="nc" id="L271"> log.error("Couldn't read " + configFile, e);</span>
<span class="nc" id="L272"> }</span>
<span class="nc" id="L273"> }</span>
/**
* Process configuration.
*
* @param config the config
* @throws IOException Signals that an I/O exception has occurred.
*/
private void processConfiguration(final MtasConfiguration config)
throws IOException {
<span class="fc" id="L283"> HashMap<String, Integer> indexEncodingMapper = new HashMap<>();</span>
<span class="fc" id="L284"> indexEncodingMapper.put("payload", MtasPayloadEncoder.ENCODE_PAYLOAD);</span>
<span class="fc" id="L285"> indexEncodingMapper.put("offset", MtasPayloadEncoder.ENCODE_OFFSET);</span>
<span class="fc" id="L286"> indexEncodingMapper.put("realoffset", MtasPayloadEncoder.ENCODE_REALOFFSET);</span>
<span class="fc" id="L287"> indexEncodingMapper.put("parent", MtasPayloadEncoder.ENCODE_PARENT);</span>
// process
<span class="pc bpc" id="L289" title="1 of 2 branches missed."> if (config != null) {</span>
<span class="fc bfc" id="L290" title="All 2 branches covered."> for (int i = 0; i < config.children.size(); i++) {</span>
<span class="fc bfc" id="L291" title="All 2 branches covered."> if (config.children.get(i).name.equals(CONFIGURATION_MTAS_INDEX)) {</span>
<span class="fc" id="L292"> MtasConfiguration index = config.children.get(i);</span>
<span class="fc bfc" id="L293" title="All 2 branches covered."> for (int j = 0; j < index.children.size(); j++) {</span>
<span class="pc bpc" id="L294" title="1 of 2 branches missed."> if (indexEncodingMapper.containsKey(index.children.get(j).name)) {</span>
<span class="fc" id="L295"> String value = index.children.get(j).attributes.get(CONFIGURATION_MTAS_INDEX_ATTRIBUTE);</span>
<span class="pc bpc" id="L296" title="1 of 4 branches missed."> if ((value.equals(VALUE_TRUE)) || (value.equals(VALUE_1))) {</span>
<span class="fc" id="L297"> encodingFlags |= indexEncodingMapper</span>
<span class="fc" id="L298"> .get(index.children.get(j).name);</span>
<span class="pc bpc" id="L299" title="3 of 4 branches missed."> } else if ((value.equals(VALUE_FALSE)) || (value.equals(VALUE_0))) {</span>
<span class="fc" id="L300"> encodingFlags &= ~indexEncodingMapper</span>
<span class="fc" id="L301"> .get(index.children.get(j).name);</span>
}
}
}
<span class="pc bpc" id="L305" title="1 of 2 branches missed."> } else if (config.children.get(i).name.equals(CONFIGURATION_MTAS_PARSER)) {</span>
<span class="pc bpc" id="L306" title="1 of 2 branches missed."> if (config.children.get(i).attributes.containsKey(CONFIGURATION_MTAS_PARSER_ATTRIBUTE)) {</span>
<span class="fc" id="L307"> parserName = config.children.get(i).attributes.get(CONFIGURATION_MTAS_PARSER_ATTRIBUTE);</span>
<span class="fc" id="L308"> parserConfiguration = config.children.get(i);</span>
} else {
<span class="nc" id="L310"> throw new IOException("no parser configuration");</span>
}
}
}
} else {
<span class="nc" id="L315"> throw new IOException("no (valid) configuration");</span>
}
<span class="fc" id="L317"> }</span>
/*
* (non-Javadoc)
*
* @see org.apache.lucene.util.AttributeSource#equals(java.lang.Object)
*/
@Override
public boolean equals(Object obj) {
<span class="nc bnc" id="L326" title="All 2 branches missed."> if (this == obj)</span>
<span class="nc" id="L327"> return true;</span>
<span class="nc bnc" id="L328" title="All 2 branches missed."> if (obj == null)</span>
<span class="nc" id="L329"> return false;</span>
<span class="nc bnc" id="L330" title="All 2 branches missed."> if (getClass() != obj.getClass())</span>
<span class="nc" id="L331"> return false;</span>
<span class="nc" id="L332"> final MtasTokenizer that = (MtasTokenizer) obj;</span>
<span class="nc" id="L333"> return super.equals(that);</span>
}
/*
* (non-Javadoc)
*
* @see org.apache.lucene.util.AttributeSource#hashCode()
*/
@Override
public int hashCode() {
<span class="nc" id="L343"> return super.hashCode();</span>
}
}
</pre><div class="footer"><span class="right">Created with <a href="http://www.jacoco.org/jacoco">JaCoCo</a> 0.7.9.201702052155</span></div></body></html>