View Javadoc

1   /* Copyright (c) 2008 Sascha Kohlmann
2    *
3    * This program is free software: you can redistribute it and/or modify
4    * it under the terms of the GNU Affero General Public License as published by
5    * the Free Software Foundation, either version 3 of the License, or
6    * (at your option) any later version.
7    *
8    * This program is distributed in the hope that it will be useful,
9    * but WITHOUT ANY WARRANTY; without even the implied warranty of
10   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11   * GNU Affero General Public License for more details.
12   *
13   * You should have received a copy of the GNU Affero General Public License
14   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
15   */
16  package net.sf.eos.analyzer;
17  
18  
19  import net.sf.eos.analyzer.lucene.LuceneTokenizerWrapper;
20  import net.sf.eos.config.Configured;
21  
22  import org.apache.commons.io.input.CharSequenceReader;
23  
24  import java.io.IOException;
25  import java.io.Reader;
26  
27  /**
28   * Tokenized a sequence of chars at whitespaces. Wrapper around Lucenes
29   * {@code WhitespaceTokenizer}.
30   * @author Sascha Kohlmann
31   */
32  public final class WhitespaceTokenizer extends TokenFilter
33                                         implements ResettableTokenizer{
34  
35      private final static Tokenizer NULL = new NullTokenizer();
36  
37      private LuceneTokenizerWrapper tokenizer;
38      private org.apache.lucene.analysis.WhitespaceTokenizer wrapped;
39  
40      /**
41       * Creates a new instance.
42       * @param source a source filter
43       */
44      public WhitespaceTokenizer(final Tokenizer source) {
45          super(source);
46          final Reader reader = new CharSequenceReader("");
47          this.wrapped = 
48              new org.apache.lucene.analysis.WhitespaceTokenizer(reader);
49          this.tokenizer = new LuceneTokenizerWrapper(this.wrapped);
50      }
51  
52      /** * Creates a new instance. */
53      public WhitespaceTokenizer() {
54          this("");
55      }
56  
57      /**
58       * Creates a new instance for a char sequence.
59       * @param text the sequence to tokenize
60       */
61      public WhitespaceTokenizer(final CharSequence text) {
62          super(NULL);
63          final Reader reader = new CharSequenceReader(text);
64          this.wrapped = 
65              new org.apache.lucene.analysis.WhitespaceTokenizer(reader);
66          this.tokenizer = new LuceneTokenizerWrapper(this.wrapped);
67      }
68  
69      /*
70       * @see net.sf.eos.analyzer.Tokenizer#next()
71       */
72      @Override
73      public Token next() throws TokenizerException {
74          Token retval = this.tokenizer.next();
75          if (retval == null) {
76              final Tokenizer source = getSource();
77              if (source.getClass() != NULL.getClass()) {
78                  final Token sourceToken = source.next();
79                  if (sourceToken != null) {
80                      final CharSequence seq = sourceToken.getTokenText();
81                      final Reader reader = new CharSequenceReader(seq);
82                      try {
83                          this.wrapped.reset(reader);
84                      } catch (final IOException e) {
85                          throw new TokenizerException(e);
86                      }
87                      retval = this.tokenizer.next();
88                  }
89              }
90          }
91          return retval;
92      }
93  
94      /*
95       * @see net.sf.eos.analyzer.ResettableTokenizer#reset(java.lang.CharSequence)
96       */
97      public void reset(final CharSequence input) throws TokenizerException {
98          assert this.wrapped != null;
99          final Tokenizer source = getSource();
100         if (source.getClass() != NULL.getClass() 
101                 && source instanceof ResettableTokenizer) {
102             final ResettableTokenizer resettable = (ResettableTokenizer) source;
103             resettable.reset(input);
104         } else {
105             final CharSequenceReader reader = new CharSequenceReader(input);
106             try {
107                 this.wrapped.reset(reader);
108             } catch (final IOException e) {
109                 throw new TokenizerException(e);
110             }
111         }
112     }
113 
114     /**
115      * Return value may be {@code null}.
116      * @return may be {@code null}
117      */
118     @Override
119     protected Tokenizer getSource() {
120         return super.getSource();
121     }
122 
123     private final static class NullTokenizer extends Configured
124                                              implements Tokenizer {
125         public Token next() throws TokenizerException {
126             return null;
127         }
128     }
129 }