1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16 package net.sf.eos.analyzer;
17
18 import java.text.BreakIterator;
19 import java.util.Locale;
20
21 import static net.sf.eos.util.Conditions.checkArgumentNotNull;
22
23
24
25
26
27
28 public class SentenceTokenizer
29 implements ResettableTokenizer {
30
31 public final static String SENTENCE_TYPE = "sentence";
32
33 private BreakIterator itr;
34
35 private int start;
36 private String text;
37
38 public SentenceTokenizer() {
39 this("");
40 }
41
42
43
44
45
46 public SentenceTokenizer(
47 @SuppressWarnings("hiding") final CharSequence text) {
48 this(text, Locale.getDefault());
49 }
50
51
52
53
54
55
56 public SentenceTokenizer(
57 @SuppressWarnings("hiding") final CharSequence text,
58 @SuppressWarnings("hiding") final Locale locale) {
59 checkArgumentNotNull(text, "text is null");
60 checkArgumentNotNull(locale, "locale is null");
61
62 this.itr = BreakIterator.getSentenceInstance(locale);
63 final String toTokenize = text.toString();
64 this.itr.setText(toTokenize);
65
66 this.start = this.itr.first();
67 this.text = toTokenize;
68 }
69
70
71
72
73 public Token next() throws TokenizerException {
74 final CharSequence sentence = nextSentence();
75 if (sentence != null) {
76 return new SentenceToken(sentence, SENTENCE_TYPE);
77 }
78 return null;
79 }
80
81
82
83
84 public void reset(final CharSequence input) throws TokenizerException {
85 final String t = input.toString();
86 this.itr.setText(t);
87 this.start = this.itr.first();
88 this.text = t;
89 }
90
91
92
93
94
95
96 protected CharSequence nextSentence() throws TokenizerException {
97 final int end = this.itr.next();
98 if (end == BreakIterator.DONE) {
99 return null;
100 }
101 final String sentence = this.text.substring(this.start, end).trim();
102 this.start = end;
103
104 return sentence;
105 }
106
107
108 private final static class SentenceToken extends AbstractToken {
109
110
111 public SentenceToken(final CharSequence value, final String type) {
112 super(value, type);
113 }
114 }
115 }