1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16 package net.sf.eos.medline;
17
18 import net.sf.eos.analyzer.CaseTokenFilter;
19 import net.sf.eos.analyzer.ResettableTokenFilter;
20 import net.sf.eos.analyzer.ResettableTokenizer;
21 import net.sf.eos.analyzer.StopTokenFilter;
22 import net.sf.eos.analyzer.SurroundingTokenFilter;
23 import net.sf.eos.analyzer.Tokenizer;
24 import net.sf.eos.analyzer.TokenizerSupplier;
25 import net.sf.eos.analyzer.WhitespaceTokenizer;
26
27 import static net.sf.eos.medline.MedlineAbstractStructureWords.STRUCTURE_WORDS_UPPER;
28
29 import java.util.Arrays;
30 import java.util.HashSet;
31 import java.util.Set;
32
33 public class MedlineTokenizerSupplier extends TokenizerSupplier {
34
35 @Override
36 public ResettableTokenizer get() {
37
38 final ResettableTokenizer whitespace = new WhitespaceTokenizer();
39 final Set<CharSequence> stopWords =
40 new HashSet<CharSequence>(Arrays.asList(STRUCTURE_WORDS_UPPER));
41 final Tokenizer stop = new StopTokenFilter(whitespace, stopWords);
42 final Tokenizer lower = new CaseTokenFilter(stop);
43 final Tokenizer surround = new SurroundingTokenFilter(lower);
44 final ResettableTokenizer retval =
45 new ResettableTokenFilter(surround, whitespace);
46
47 return retval;
48 }
49 }