1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16 package net.sf.eos.analyzer;
17
18
19 import net.sf.eos.analyzer.lucene.LuceneTokenizerWrapper;
20 import net.sf.eos.config.Configured;
21
22 import org.apache.commons.io.input.CharSequenceReader;
23
24 import java.io.IOException;
25 import java.io.Reader;
26
27
28
29
30
31
32 public final class WhitespaceTokenizer extends TokenFilter
33 implements ResettableTokenizer{
34
35 private final static Tokenizer NULL = new NullTokenizer();
36
37 private LuceneTokenizerWrapper tokenizer;
38 private org.apache.lucene.analysis.WhitespaceTokenizer wrapped;
39
40
41
42
43
44 public WhitespaceTokenizer(final Tokenizer source) {
45 super(source);
46 final Reader reader = new CharSequenceReader("");
47 this.wrapped =
48 new org.apache.lucene.analysis.WhitespaceTokenizer(reader);
49 this.tokenizer = new LuceneTokenizerWrapper(this.wrapped);
50 }
51
52
53 public WhitespaceTokenizer() {
54 this("");
55 }
56
57
58
59
60
61 public WhitespaceTokenizer(final CharSequence text) {
62 super(NULL);
63 final Reader reader = new CharSequenceReader(text);
64 this.wrapped =
65 new org.apache.lucene.analysis.WhitespaceTokenizer(reader);
66 this.tokenizer = new LuceneTokenizerWrapper(this.wrapped);
67 }
68
69
70
71
72 @Override
73 public Token next() throws TokenizerException {
74 Token retval = this.tokenizer.next();
75 if (retval == null) {
76 final Tokenizer source = getSource();
77 if (source.getClass() != NULL.getClass()) {
78 final Token sourceToken = source.next();
79 if (sourceToken != null) {
80 final CharSequence seq = sourceToken.getTokenText();
81 final Reader reader = new CharSequenceReader(seq);
82 try {
83 this.wrapped.reset(reader);
84 } catch (final IOException e) {
85 throw new TokenizerException(e);
86 }
87 retval = this.tokenizer.next();
88 }
89 }
90 }
91 return retval;
92 }
93
94
95
96
97 public void reset(final CharSequence input) throws TokenizerException {
98 assert this.wrapped != null;
99 final Tokenizer source = getSource();
100 if (source.getClass() != NULL.getClass()
101 && source instanceof ResettableTokenizer) {
102 final ResettableTokenizer resettable = (ResettableTokenizer) source;
103 resettable.reset(input);
104 } else {
105 final CharSequenceReader reader = new CharSequenceReader(input);
106 try {
107 this.wrapped.reset(reader);
108 } catch (final IOException e) {
109 throw new TokenizerException(e);
110 }
111 }
112 }
113
114
115
116
117
118 @Override
119 protected Tokenizer getSource() {
120 return super.getSource();
121 }
122
123 private final static class NullTokenizer extends Configured
124 implements Tokenizer {
125 public Token next() throws TokenizerException {
126 return null;
127 }
128 }
129 }