View Javadoc

1   /* Copyright (c) 2008 Sascha Kohlmann
2    *
3    * This program is free software: you can redistribute it and/or modify
4    * it under the terms of the GNU Affero General Public License as published by
5    * the Free Software Foundation, either version 3 of the License, or
6    * (at your option) any later version.
7    *
8    * This program is distributed in the hope that it will be useful,
9    * but WITHOUT ANY WARRANTY; without even the implied warranty of
10   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11   * GNU Affero General Public License for more details.
12   *
13   * You should have received a copy of the GNU Affero General Public License
14   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
15   */
16  package net.sf.eos.analyzer;
17  
18  import java.text.BreakIterator;
19  import java.util.Locale;
20  
21  import static net.sf.eos.util.Conditions.checkArgumentNotNull;
22  
23  /**
24   * Tokenized a text into sentences.
25   * <p>Based on {@link BreakIterator#getLineInstance(Locale)}.</p>
26   * @author Sascha Kohlmann
27   */
28  public class SentenceTokenizer /*extends Configured*/
29                                 implements ResettableTokenizer {
30  
31      public final static String SENTENCE_TYPE = "sentence";
32  
33      private BreakIterator itr;
34  //    private Locale locale;
35      private int start;
36      private String text;
37  
38      public SentenceTokenizer() {
39          this("");
40      }
41  
42      /**
43       * Creates a new tokenizer. Uses {@link Locale#getDefault() default Locale}.
44       * @param text the text to tokenize into sentences.
45       */
46      public SentenceTokenizer(
47              @SuppressWarnings("hiding") final CharSequence text) {
48          this(text, Locale.getDefault());
49      }
50  
51      /**
52       * Creates a new tokenizer.
53       * @param text the text to tokenize into sentences.
54       * @param locale
55       */
56      public SentenceTokenizer(
57              @SuppressWarnings("hiding") final CharSequence text,
58              @SuppressWarnings("hiding") final Locale locale) {
59          checkArgumentNotNull(text, "text is null");
60          checkArgumentNotNull(locale, "locale is null");
61  
62          this.itr = BreakIterator.getSentenceInstance(locale);
63          final String toTokenize = text.toString();
64          this.itr.setText(toTokenize);
65  //        this.locale = locale;
66          this.start = this.itr.first();
67          this.text = toTokenize;
68      }
69  
70      /*
71       * @see net.sf.eos.analyzer.Tokenizer#next()
72       */
73      public Token next() throws TokenizerException {
74          final CharSequence sentence = nextSentence();
75          if (sentence != null) {
76              return new SentenceToken(sentence, SENTENCE_TYPE);
77          }
78          return null;
79      }
80  
81      /*
82       * @see net.sf.eos.analyzer.Tokenizer#reset(java.lang.CharSequence)
83       */
84      public void reset(final CharSequence input) throws TokenizerException {
85          final String t = input.toString();
86          this.itr.setText(t);
87          this.start = this.itr.first();
88          this.text = t;
89      }
90  
91      /**
92       * Override this method to implement a different sentence tokenizer.
93       * @return a sentence or {@code null} if no next sentence available.
94       * @throws TokenizerException if an error occurs
95       */
96      protected CharSequence nextSentence() throws TokenizerException {
97          final int end = this.itr.next();
98          if (end == BreakIterator.DONE) {
99              return null;
100         }
101         final String sentence = this.text.substring(this.start, end).trim();
102         this.start = end;
103 
104         return sentence;
105     }
106 
107     /** Token represents sentence as token. */
108     private final static class SentenceToken extends AbstractToken {
109         /** Creates a new token representing a sentence.
110          * @param value a sentence */
111         public SentenceToken(final CharSequence value, final String type) {
112             super(value, type);
113         }
114     }
115 }