MtasTokenizer.java.html 16.7 KB
<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"><html xmlns="http://www.w3.org/1999/xhtml" lang="en"><head><meta http-equiv="Content-Type" content="text/html;charset=UTF-8"/><link rel="stylesheet" href="../jacoco-resources/report.css" type="text/css"/><link rel="shortcut icon" href="../jacoco-resources/report.gif" type="image/gif"/><title>MtasTokenizer.java</title><link rel="stylesheet" href="../jacoco-resources/prettify.css" type="text/css"/><script type="text/javascript" src="../jacoco-resources/prettify.js"></script></head><body onload="window['PR_TAB_WIDTH']=4;prettyPrint()"><div class="breadcrumb" id="breadcrumb"><span class="info"><a href="../jacoco-sessions.html" class="el_session">Sessions</a></span><a href="../index.html" class="el_report">MTAS</a> &gt; <a href="index.source.html" class="el_package">mtas.analysis</a> &gt; <span class="el_source">MtasTokenizer.java</span></div><h1>MtasTokenizer.java</h1><pre class="source lang-java linenums">package mtas.analysis;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.util.HashMap;
import java.util.Iterator;
import mtas.analysis.parser.MtasParser;
import mtas.analysis.token.MtasToken;
import mtas.analysis.token.MtasTokenCollection;
import mtas.analysis.util.MtasConfigException;
import mtas.analysis.util.MtasConfiguration;
import mtas.analysis.util.MtasParserException;
import mtas.codec.payload.MtasPayloadEncoder;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeFactory;

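/**
 * Lucene {@link Tokenizer} that instantiates a configured {@link MtasParser}
 * by class name, lets it build an {@link MtasTokenCollection} from the input
 * reader, and emits every {@link MtasToken} of that collection with its term,
 * position increment, offsets and encoded payload.
 */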

public final class MtasTokenizer extends Tokenizer {

  /** Logger shared by all instances of this class. */
<span class="fc" id="L35">  private static final Log log = LogFactory.getLog(MtasTokenizer.class);</span>

  /** The configuration name {@code mtas}; not referenced elsewhere in this class. */
  public static final String CONFIGURATION_MTAS = &quot;mtas&quot;;

  /**
   * Name of the configuration child whose own children switch individual
   * payload encoding flags on or off (see processConfiguration).
   */
  public static final String CONFIGURATION_MTAS_INDEX = &quot;index&quot;;
  /** Attribute read from each child of the index element to obtain its on/off value. */
  public static final String CONFIGURATION_MTAS_INDEX_ATTRIBUTE = &quot;index&quot;;

  /** Name of the configuration child that describes the parser. */
  public static final String CONFIGURATION_MTAS_PARSER = &quot;parser&quot;;
  /** Attribute of the parser element holding the parser class name. */
  public static final String CONFIGURATION_MTAS_PARSER_ATTRIBUTE = &quot;name&quot;;

  // Attribute values accepted by processConfiguration: VALUE_TRUE and VALUE_1
  // set an encoding flag, VALUE_FALSE and VALUE_0 clear it.
  private static final String VALUE_TRUE = &quot;true&quot;;
  private static final String VALUE_FALSE = &quot;false&quot;;
  private static final String VALUE_0 = &quot;0&quot;;
  private static final String VALUE_1 = &quot;1&quot;;
  
  /**
   * Start position of the most recently emitted token; reset() sets it to -1
   * and incrementToken() uses it to compute the position increment.
   */
<span class="pc" id="L52">  private int currentPosition = 0;</span>

  /**
   * Flags handed to every MtasPayloadEncoder; starts at ENCODE_DEFAULT and is
   * adjusted by processConfiguration.
   */
<span class="pc" id="L55">  private int encodingFlags = MtasPayloadEncoder.ENCODE_DEFAULT;</span>

  /**
   * Class name of the parser, loaded with Class.forName in
   * constructTokenCollection; null until a configuration supplies it.
   */
<span class="pc" id="L58">  private String parserName = null;</span>

  /** Configuration element passed to the parser constructor; null until configured. */
<span class="pc" id="L61">  private MtasConfiguration parserConfiguration = null;</span>

  /** Token collection produced by the parser during the last reset(). */
  private MtasTokenCollection tokenCollection;

  /** Term attribute; receives the token value. */
<span class="pc" id="L67">  private final CharTermAttribute termAtt = addAttribute(</span>
      CharTermAttribute.class);

  /** Offset attribute; receives the token start and end offsets. */
<span class="pc" id="L71">  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);</span>

  /** Payload attribute; receives the payload built by MtasPayloadEncoder. */
<span class="pc" id="L74">  private final PayloadAttribute payloadAtt = addAttribute(</span>
      PayloadAttribute.class);

  /** Position increment attribute; receives the distance to the previous token start. */
<span class="pc" id="L78">  private final PositionIncrementAttribute positionIncrementAtt = addAttribute(</span>
      PositionIncrementAttribute.class);

  /**
   * Iterator over tokenCollection, created in reset(); null before the first
   * reset() and after a failed one.
   */
  private Iterator&lt;MtasToken&gt; tokenCollectionIterator;

  /**
   * Instantiates a new mtas tokenizer without any configuration.
   *
   * NOTE(review): parserName and parserConfiguration stay null here, and
   * constructTokenCollection passes parserName to Class.forName, so reset()
   * presumably fails on an instance built this way - confirm intended use.
   */
<span class="nc" id="L87">  public MtasTokenizer() {</span>
<span class="nc" id="L88">  }</span>

  /**
   * Instantiates a new mtas tokenizer configured from a file.
   *
   * Delegates to readConfigurationFile, which only logs a missing or
   * unreadable file; in that case the constructor still returns normally and
   * the tokenizer is left without parser settings.
   *
   * @param configFileName path of the configuration file to read
   */
<span class="nc" id="L95">  public MtasTokenizer(final String configFileName) {</span>
<span class="nc" id="L96">    readConfigurationFile(configFileName);</span>
<span class="nc" id="L97">  }</span>

  /**
   * Instantiates a new mtas tokenizer from an already parsed configuration.
   *
   * @param config the configuration holding the index and parser elements
   * @throws IOException if config is null or its parser element has no name
   *         attribute (both raised by processConfiguration)
   */
<span class="nc" id="L105">  public MtasTokenizer(final MtasConfiguration config) throws IOException {</span>
<span class="nc" id="L106">    processConfiguration(config);</span>
<span class="nc" id="L107">  }</span>

  /**
   * Instantiates a new mtas tokenizer from a configuration stream.
   *
   * The stream is handed to MtasConfiguration.readConfiguration and the result
   * to processConfiguration. This constructor does not close the stream itself.
   *
   * @param reader input stream containing the configuration (despite the
   *        parameter name this is an InputStream, not a Reader)
   * @throws IOException if processConfiguration rejects the configuration;
   *         presumably also when reading the stream fails - confirm against
   *         MtasConfiguration.readConfiguration
   */
<span class="nc" id="L115">  public MtasTokenizer(final InputStream reader) throws IOException {</span>
<span class="nc" id="L116">    processConfiguration(MtasConfiguration.readConfiguration(reader));</span>
<span class="nc" id="L117">  }</span>

  /**
   * Instantiates a new mtas tokenizer with an explicit attribute factory.
   *
   * @param factory the attribute factory passed to the Tokenizer superclass
   * @param config the configuration holding the index and parser elements
   * @throws IOException if config is null or its parser element has no name
   *         attribute (both raised by processConfiguration)
   */
  public MtasTokenizer(final AttributeFactory factory,
      final MtasConfiguration config) throws IOException {
<span class="fc" id="L128">    super(factory);</span>
<span class="fc" id="L129">    processConfiguration(config);</span>
<span class="fc" id="L130">  }</span>

  /**
   * Emits the next token of the collection built by reset().
   *
   * All attributes are cleared first. If a token is available, its value,
   * position increment, offsets and encoded payload are written to the
   * corresponding attributes.
   *
   * @return true if a token was emitted; false if reset() has not produced an
   *         iterator or the iterator is exhausted
   * @see org.apache.lucene.analysis.TokenStream#incrementToken()
   */
  @Override
  public boolean incrementToken() throws IOException {
<span class="fc" id="L139">    clearAttributes();</span>
    MtasToken token;
    Integer positionIncrement;
    MtasPayloadEncoder payloadEncoder;
    // No iterator: reset() was never called or failed.
<span class="pc bpc" id="L143" title="1 of 2 branches missed.">    if (tokenCollectionIterator == null) {</span>
<span class="nc" id="L144">      return false;</span>
<span class="fc bfc" id="L145" title="All 2 branches covered.">    } else if (tokenCollectionIterator.hasNext()) {</span>
<span class="fc" id="L146">      token = tokenCollectionIterator.next();</span>
      // Position increment is the distance between this token's start
      // position and the start position of the previously emitted token.
<span class="fc" id="L148">      positionIncrement = token.getPositionStart() - currentPosition;</span>
<span class="fc" id="L149">      currentPosition = token.getPositionStart();</span>
<span class="fc" id="L150">      payloadEncoder = new MtasPayloadEncoder(token, encodingFlags);</span>
      // Publish the computed values through the Lucene attributes.
<span class="fc" id="L152">      termAtt.append(token.getValue());</span>
<span class="fc" id="L153">      positionIncrementAtt.setPositionIncrement(positionIncrement);</span>
<span class="fc" id="L154">      offsetAtt.setOffset(token.getOffsetStart(), token.getOffsetEnd());</span>
<span class="fc" id="L155">      payloadAtt.setPayload(payloadEncoder.getPayload());</span>
<span class="fc" id="L156">      return true;</span>
    }
    // Iterator exhausted.
<span class="fc" id="L158">    return false;</span>
  }

  /**
   * Resets the tokenizer and parses the current input into a fresh token
   * collection.
   *
   * currentPosition is set to -1, so a first token starting at position 0
   * gets a position increment of 1 in incrementToken().
   *
   * @throws IOException wrapping any MtasConfigException or
   *         MtasParserException raised while building the collection; the
   *         iterator is set to null in that case
   * @see org.apache.lucene.analysis.Tokenizer#reset()
   */
  @Override
  public void reset() throws IOException {
<span class="fc" id="L168">    super.reset();</span>
<span class="fc" id="L169">    currentPosition = -1;</span>
    try {
<span class="fc" id="L171">      constructTokenCollection(input);</span>
<span class="fc" id="L172">      tokenCollectionIterator = tokenCollection.iterator();</span>
<span class="nc" id="L173">    } catch (MtasConfigException | MtasParserException e) {      </span>
      // Make sure incrementToken() reports no tokens after a failed reset.
<span class="nc" id="L174">      tokenCollectionIterator = null;</span>
<span class="nc" id="L175">      throw new IOException(e);</span>
<span class="fc" id="L176">    }</span>
<span class="fc" id="L177">  }</span>

  /**
   * Parses the given reader and prints the resulting token collection.
   *
   * Runs setReader, reset, tokenCollection.print() (when a collection exists),
   * end and close in that order.
   *
   * NOTE(review): on an IOException the new MtasParserException carries only
   * the class and message text, not the original exception as cause, and
   * end()/close() are not reached when reset() throws - confirm this is
   * acceptable.
   *
   * @param r the reader supplying the document to parse
   * @throws MtasParserException if any step throws an IOException
   */
  public void print(final Reader r) throws MtasParserException {
    try {
<span class="nc" id="L187">      setReader(r);</span>
<span class="nc" id="L188">      reset();</span>
<span class="nc bnc" id="L189" title="All 2 branches missed.">      if (tokenCollection != null) {</span>
<span class="nc" id="L190">        tokenCollection.print();</span>
      }
<span class="nc" id="L192">      end();</span>
<span class="nc" id="L193">      close();</span>
<span class="nc" id="L194">    } catch (IOException e) {</span>
<span class="nc" id="L195">      log.error(e);</span>
<span class="nc" id="L196">      throw new MtasParserException(e.getClass() + &quot; : &quot; + e.getMessage());</span>
<span class="nc" id="L197">    }</span>
<span class="nc" id="L198">  }</span>

  /**
   * Parses the given reader and returns the token collection as a table of
   * strings.
   *
   * Runs setReader, reset, tokenCollection.getList(), end and close in that
   * order; the layout of the returned array is defined by
   * MtasTokenCollection.getList and is not visible here.
   *
   * NOTE(review): unlike print(), tokenCollection is not checked for null
   * before use. A caught MtasParserException is logged at info level and
   * replaced by an IOException that does not keep it as cause - confirm.
   *
   * @param r the reader supplying the document to parse
   * @return the list produced by tokenCollection.getList()
   * @throws IOException if reset, end or close fail, or if a
   *         MtasParserException is caught
   */
  public String[][] getList(final Reader r) throws IOException {
    try {
<span class="nc" id="L209">      setReader(r);</span>
<span class="nc" id="L210">      reset();</span>
<span class="nc" id="L211">      String[][] result = tokenCollection.getList();</span>
<span class="nc" id="L212">      end();</span>
<span class="nc" id="L213">      close();</span>
<span class="nc" id="L214">      return result;</span>
<span class="nc" id="L215">    } catch (MtasParserException e) {</span>
<span class="nc" id="L216">      log.info(e);</span>
<span class="nc" id="L217">      throw new IOException(&quot;can't produce list&quot;);</span>
    }
  }

  /**
   * Builds tokenCollection by reflectively instantiating the configured parser.
   *
   * The class named by parserName is loaded, its constructor taking a single
   * MtasConfiguration is invoked with parserConfiguration, and the resulting
   * MtasParser creates the token collection from the reader.
   *
   * NOTE(review): both catch blocks rethrow a new exception built from the
   * message text only, so the original stack trace is available solely through
   * the debug log - confirm this is intended.
   *
   * @param reader the reader supplying the document to parse
   * @throws MtasConfigException if the parser class cannot be loaded or
   *         instantiated, or the instance is not an MtasParser
   * @throws MtasParserException if the parser fails; tokenCollection is then
   *         left as an empty MtasTokenCollection
   */
  private void constructTokenCollection(final Reader reader)
      throws MtasConfigException, MtasParserException {
    // Drop any collection from a previous run before parsing.
<span class="fc" id="L230">    tokenCollection = null;</span>
    try {
<span class="fc" id="L232">      Constructor&lt;?&gt; c = Class.forName(parserName)</span>
<span class="fc" id="L233">          .getDeclaredConstructor(MtasConfiguration.class);</span>
      // Instantiate the parser with its own configuration element.
<span class="fc" id="L235">      Object p = c.newInstance(parserConfiguration);</span>
<span class="pc bpc" id="L236" title="1 of 2 branches missed.">      if (p instanceof MtasParser) {</span>
<span class="fc" id="L237">        MtasParser parser = (MtasParser) p;</span>
<span class="fc" id="L238">        tokenCollection = parser.createTokenCollection(reader);</span>
<span class="fc" id="L239">        return;</span>
      } else {
<span class="nc" id="L241">        throw new MtasConfigException(&quot;no instance of MtasParser&quot;);</span>
      }
<span class="nc" id="L243">    } catch (MtasParserException e) {</span>
<span class="nc" id="L244">      log.debug(e);</span>
      // Leave an empty collection behind rather than null before rethrowing.
<span class="nc" id="L245">      tokenCollection = new MtasTokenCollection();</span>
<span class="nc" id="L246">      throw new MtasParserException(e.getMessage());</span>
<span class="nc" id="L247">    } catch (NoSuchMethodException | InvocationTargetException</span>
        | IllegalAccessException | ClassNotFoundException
        | InstantiationException e) {
      // Any reflection failure is reported as a configuration problem.
<span class="nc" id="L250">      log.debug(e);</span>
<span class="nc" id="L251">      throw new MtasConfigException(</span>
<span class="nc" id="L252">          e.getClass().getName() + &quot; : '&quot; + e.getMessage() + &quot;'&quot;);</span>
    }

  }

  /**
   * Reads the configuration from a file and applies it with
   * processConfiguration.
   *
   * A missing file or any other IOException is logged at error level and not
   * propagated, so the caller cannot tell whether configuration succeeded.
   *
   * NOTE(review): is.close() is only reached on the success path; if
   * readConfiguration or processConfiguration throws, the FileInputStream is
   * left open. try-with-resources would fix this.
   *
   * @param configFile path of the configuration file to read
   */
  private void readConfigurationFile(final String configFile) {
    InputStream is;
    try {
<span class="nc" id="L265">      is = new FileInputStream(configFile);</span>
<span class="nc" id="L266">      processConfiguration(MtasConfiguration.readConfiguration(is));</span>
<span class="nc" id="L267">      is.close();</span>
<span class="nc" id="L268">    } catch (FileNotFoundException e) {</span>
<span class="nc" id="L269">      log.error(&quot;Couldn't find &quot; + configFile, e);</span>
<span class="nc" id="L270">    } catch (IOException e) {</span>
<span class="nc" id="L271">      log.error(&quot;Couldn't read &quot; + configFile, e);</span>
<span class="nc" id="L272">    }</span>
<span class="nc" id="L273">  }</span>

  /**
   * Applies a configuration to this tokenizer.
   *
   * Walks the direct children of config. For the {@code index} child, each of
   * its children named payload, offset, realoffset or parent sets or clears
   * the matching MtasPayloadEncoder flag in encodingFlags, depending on the
   * value of its {@code index} attribute. For the {@code parser} child, the
   * {@code name} attribute becomes parserName and the child itself becomes
   * parserConfiguration. Other children are ignored.
   *
   * NOTE(review): value is dereferenced without a null check; if
   * attributes.get returns null for a child lacking the index attribute this
   * presumably throws a NullPointerException - confirm. A configuration
   * without any parser child is accepted and leaves parserName unchanged.
   *
   * @param config the configuration to apply
   * @throws IOException if config is null, or a parser child has no name
   *         attribute
   */
  private void processConfiguration(final MtasConfiguration config)
      throws IOException {
    // Maps the child names allowed under the index element to encoder flags.
<span class="fc" id="L283">    HashMap&lt;String, Integer&gt; indexEncodingMapper = new HashMap&lt;&gt;();</span>
<span class="fc" id="L284">    indexEncodingMapper.put(&quot;payload&quot;, MtasPayloadEncoder.ENCODE_PAYLOAD);</span>
<span class="fc" id="L285">    indexEncodingMapper.put(&quot;offset&quot;, MtasPayloadEncoder.ENCODE_OFFSET);</span>
<span class="fc" id="L286">    indexEncodingMapper.put(&quot;realoffset&quot;, MtasPayloadEncoder.ENCODE_REALOFFSET);</span>
<span class="fc" id="L287">    indexEncodingMapper.put(&quot;parent&quot;, MtasPayloadEncoder.ENCODE_PARENT);</span>
    // process
<span class="pc bpc" id="L289" title="1 of 2 branches missed.">    if (config != null) {</span>
<span class="fc bfc" id="L290" title="All 2 branches covered.">      for (int i = 0; i &lt; config.children.size(); i++) {</span>
<span class="fc bfc" id="L291" title="All 2 branches covered.">        if (config.children.get(i).name.equals(CONFIGURATION_MTAS_INDEX)) {</span>
<span class="fc" id="L292">          MtasConfiguration index = config.children.get(i);</span>
<span class="fc bfc" id="L293" title="All 2 branches covered.">          for (int j = 0; j &lt; index.children.size(); j++) {</span>
            // Children with names not in the mapper are skipped.
<span class="pc bpc" id="L294" title="1 of 2 branches missed.">            if (indexEncodingMapper.containsKey(index.children.get(j).name)) {</span>
<span class="fc" id="L295">              String value = index.children.get(j).attributes.get(CONFIGURATION_MTAS_INDEX_ATTRIBUTE);</span>
              // true or 1 switches the flag on.
<span class="pc bpc" id="L296" title="1 of 4 branches missed.">              if ((value.equals(VALUE_TRUE)) || (value.equals(VALUE_1))) {</span>
<span class="fc" id="L297">                encodingFlags |= indexEncodingMapper</span>
<span class="fc" id="L298">                    .get(index.children.get(j).name);</span>
              // false or 0 switches it off; any other value leaves it as is.
<span class="pc bpc" id="L299" title="3 of 4 branches missed.">              } else if ((value.equals(VALUE_FALSE)) || (value.equals(VALUE_0))) {</span>
<span class="fc" id="L300">                encodingFlags &amp;= ~indexEncodingMapper</span>
<span class="fc" id="L301">                    .get(index.children.get(j).name);</span>
              }
            }
          }
<span class="pc bpc" id="L305" title="1 of 2 branches missed.">        } else if (config.children.get(i).name.equals(CONFIGURATION_MTAS_PARSER)) {</span>
<span class="pc bpc" id="L306" title="1 of 2 branches missed.">          if (config.children.get(i).attributes.containsKey(CONFIGURATION_MTAS_PARSER_ATTRIBUTE)) {</span>
            // Remember the parser class name and keep the whole element as
            // the configuration later passed to the parser constructor.
<span class="fc" id="L307">            parserName = config.children.get(i).attributes.get(CONFIGURATION_MTAS_PARSER_ATTRIBUTE);</span>
<span class="fc" id="L308">            parserConfiguration = config.children.get(i);</span>
          } else {
<span class="nc" id="L310">            throw new IOException(&quot;no parser configuration&quot;);</span>
          }
        }
      }
    } else {
<span class="nc" id="L315">      throw new IOException(&quot;no (valid) configuration&quot;);</span>
    }
<span class="fc" id="L317">  }</span>

  /**
   * Compares this tokenizer with another object.
   *
   * Returns true for the same reference, false for null or an object of a
   * different class, and otherwise the result of the superclass comparison.
   * No field declared in this class takes part in the comparison.
   *
   * @param obj the object to compare with
   * @return true if the objects are considered equal
   * @see org.apache.lucene.util.AttributeSource#equals(java.lang.Object)
   */
  @Override
  public boolean equals(Object obj) {
<span class="nc bnc" id="L326" title="All 2 branches missed.">    if (this == obj)</span>
<span class="nc" id="L327">      return true;</span>
<span class="nc bnc" id="L328" title="All 2 branches missed.">    if (obj == null)</span>
<span class="nc" id="L329">      return false;</span>
<span class="nc bnc" id="L330" title="All 2 branches missed.">    if (getClass() != obj.getClass())</span>
<span class="nc" id="L331">      return false;</span>
<span class="nc" id="L332">    final MtasTokenizer that = (MtasTokenizer) obj;</span>
<span class="nc" id="L333">    return super.equals(that);</span>
  }

  /**
   * Returns the hash code computed by the superclass, matching equals(), which
   * also defers to the superclass.
   *
   * @return the superclass hash code
   * @see org.apache.lucene.util.AttributeSource#hashCode()
   */
  @Override
  public int hashCode() {
<span class="nc" id="L343">    return super.hashCode();</span>
  }

}
</pre><div class="footer"><span class="right">Created with <a href="http://www.jacoco.org/jacoco">JaCoCo</a> 0.7.9.201702052155</span></div></body></html>