1 /* Copyright (c) 2008 Sascha Kohlmann
2 *
3 * This program is free software: you can redistribute it and/or modify
4 * it under the terms of the GNU Affero General Public License as published by
5 * the Free Software Foundation, either version 3 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU Affero General Public License for more details.
12 *
13 * You should have received a copy of the GNU Affero General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16 package net.sf.eos.analyzer;
17
18 import java.text.BreakIterator;
19 import java.util.Locale;
20
21 import static net.sf.eos.util.Conditions.checkArgumentNotNull;
22
23 /**
24 * Tokenized a text into sentences.
25 * <p>Based on {@link BreakIterator#getLineInstance(Locale)}.</p>
26 * @author Sascha Kohlmann
27 */
28 public class SentenceTokenizer /*extends Configured*/
29 implements ResettableTokenizer {
30
31 public final static String SENTENCE_TYPE = "sentence";
32
33 private BreakIterator itr;
34 // private Locale locale;
35 private int start;
36 private String text;
37
38 public SentenceTokenizer() {
39 this("");
40 }
41
42 /**
43 * Creates a new tokenizer. Uses {@link Locale#getDefault() default Locale}.
44 * @param text the text to tokenize into sentences.
45 */
46 public SentenceTokenizer(
47 @SuppressWarnings("hiding") final CharSequence text) {
48 this(text, Locale.getDefault());
49 }
50
51 /**
52 * Creates a new tokenizer.
53 * @param text the text to tokenize into sentences.
54 * @param locale
55 */
56 public SentenceTokenizer(
57 @SuppressWarnings("hiding") final CharSequence text,
58 @SuppressWarnings("hiding") final Locale locale) {
59 checkArgumentNotNull(text, "text is null");
60 checkArgumentNotNull(locale, "locale is null");
61
62 this.itr = BreakIterator.getSentenceInstance(locale);
63 final String toTokenize = text.toString();
64 this.itr.setText(toTokenize);
65 // this.locale = locale;
66 this.start = this.itr.first();
67 this.text = toTokenize;
68 }
69
70 /*
71 * @see net.sf.eos.analyzer.Tokenizer#next()
72 */
73 public Token next() throws TokenizerException {
74 final CharSequence sentence = nextSentence();
75 if (sentence != null) {
76 return new SentenceToken(sentence, SENTENCE_TYPE);
77 }
78 return null;
79 }
80
81 /*
82 * @see net.sf.eos.analyzer.Tokenizer#reset(java.lang.CharSequence)
83 */
84 public void reset(final CharSequence input) throws TokenizerException {
85 final String t = input.toString();
86 this.itr.setText(t);
87 this.start = this.itr.first();
88 this.text = t;
89 }
90
91 /**
92 * Override this method to implement a different sentence tokenizer.
93 * @return a sentence or {@code null} if no next sentence available.
94 * @throws TokenizerException if an error occurs
95 */
96 protected CharSequence nextSentence() throws TokenizerException {
97 final int end = this.itr.next();
98 if (end == BreakIterator.DONE) {
99 return null;
100 }
101 final String sentence = this.text.substring(this.start, end).trim();
102 this.start = end;
103
104 return sentence;
105 }
106
107 /** Token represents sentence as token. */
108 private final static class SentenceToken extends AbstractToken {
109 /** Creates a new token representing a sentence.
110 * @param value a sentence */
111 public SentenceToken(final CharSequence value, final String type) {
112 super(value, type);
113 }
114 }
115 }