View Javadoc

1   /* Copyright (c) 2008 Sascha Kohlmann
2    *
3    * This program is free software: you can redistribute it and/or modify
4    * it under the terms of the GNU Affero General Public License as published by
5    * the Free Software Foundation, either version 3 of the License, or
6    * (at your option) any later version.
7    *
8    * This program is distributed in the hope that it will be useful,
9    * but WITHOUT ANY WARRANTY; without even the implied warranty of
10   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11   * GNU Affero General Public License for more details.
12   *
13   * You should have received a copy of the GNU Affero General Public License
14   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
15   */
16  package net.sf.eos.medline;
17  
18  import net.sf.eos.analyzer.CaseTokenFilter;
19  import net.sf.eos.analyzer.ResettableTokenFilter;
20  import net.sf.eos.analyzer.ResettableTokenizer;
21  import net.sf.eos.analyzer.StopTokenFilter;
22  import net.sf.eos.analyzer.SurroundingTokenFilter;
23  import net.sf.eos.analyzer.Tokenizer;
24  import net.sf.eos.analyzer.TokenizerSupplier;
25  import net.sf.eos.analyzer.WhitespaceTokenizer;
26  
27  import static net.sf.eos.medline.MedlineAbstractStructureWords.STRUCTURE_WORDS_UPPER;
28  
29  import java.util.Arrays;
30  import java.util.HashSet;
31  import java.util.Set;
32  
33  public class MedlineTokenizerSupplier extends TokenizerSupplier {
34  
35      @Override
36      public ResettableTokenizer get() {
37  
38          final ResettableTokenizer whitespace = new WhitespaceTokenizer();
39          final Set<CharSequence> stopWords =
40              new HashSet<CharSequence>(Arrays.asList(STRUCTURE_WORDS_UPPER));
41          final Tokenizer stop = new StopTokenFilter(whitespace, stopWords);
42          final Tokenizer lower = new CaseTokenFilter(stop);
43          final Tokenizer surround = new SurroundingTokenFilter(lower);
44          final ResettableTokenizer retval =
45              new ResettableTokenFilter(surround, whitespace);
46  
47          return retval;
48      }
49  }