View Javadoc

1   /* Copyright (c) 2008 Sascha Kohlmann
2    *
3    * This program is free software: you can redistribute it and/or modify
4    * it under the terms of the GNU Affero General Public License as published by
5    * the Free Software Foundation, either version 3 of the License, or
6    * (at your option) any later version.
7    *
8    * This program is distributed in the hope that it will be useful,
9    * but WITHOUT ANY WARRANTY; without even the implied warranty of
10   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11   * GNU Affero General Public License for more details.
12   *
13   * You should have received a copy of the GNU Affero General Public License
14   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
15   */
16  package net.sf.eos.sentence;
17  
18  
19  import net.sf.eos.EosException;
20  import net.sf.eos.analyzer.ResettableTokenizer;
21  import net.sf.eos.analyzer.SentenceTokenizer;
22  import net.sf.eos.analyzer.TextBuilder;
23  import net.sf.eos.analyzer.Token;
24  import net.sf.eos.analyzer.TokenizerException;
25  import net.sf.eos.document.EosDocument;
26  
27  import org.apache.commons.codec.binary.Hex;
28  import org.apache.commons.logging.Log;
29  import org.apache.commons.logging.LogFactory;
30  
31  import java.io.UnsupportedEncodingException;
32  import java.security.MessageDigest;
33  import java.util.ArrayList;
34  import java.util.HashMap;
35  import java.util.List;
36  import java.util.Map;
37  
38  /**
39   * Simple default implementation.
40   * @author Sascha Kohlmann
41   */
42  public class DefaultSentencer extends Sentencer {
43  
44      /** For logging. */
45      private static final Log LOG =
46          LogFactory.getLog(DefaultSentencer.class.getName());
47  
48      /** Creates a new instance. */
49      public DefaultSentencer() {
50          super();
51      }
52  
53      @SuppressWarnings("nls")
54      @Override
55      public Map<String, EosDocument> 
56              toSentenceDocuments(final EosDocument doc,
57                                  final SentenceTokenizer sentencer,
58                                  final ResettableTokenizer tokenizer,
59                                  final TextBuilder builder)
60              throws EosException {
61  
62          if (LOG.isDebugEnabled()) {
63              LOG.debug("SentenceTokenizer instance: " + sentencer.getClass());
64              LOG.debug("ResettableTokenizer instance: " + tokenizer.getClass());
65              LOG.debug("TextBuilder instance: " + builder.getClass());
66          }
67          final Map<String, EosDocument> retval =
68              new HashMap<String, EosDocument>();
69          final MessageDigest md = createDigester();
70  
71          final Map<String, List<String>> meta = doc.getMeta();
72  
73          final CharSequence newTitle = extractTitle(doc, tokenizer, builder);
74          final List<CharSequence> sentences =
75              extractSentences(doc, sentencer, tokenizer, builder);
76  
77          for (final CharSequence newText : sentences) {
78              final EosDocument newDoc = new EosDocument();
79              newDoc.setText(newText);
80              newDoc.setTitle(newTitle);
81              final Map<String, List<String>> newMeta = newDoc.getMeta();
82              newMeta.putAll(meta);
83  
84              try {
85                  final byte[] bytes = ("" + newText).getBytes("UTF-8");
86                  md.reset();
87                  final byte[] key = md.digest(bytes);
88                  final char[] asChar = Hex.encodeHex(key);
89                  final String asString = new String(asChar);
90                  retval.put(asString, newDoc);
91              } catch (final UnsupportedEncodingException e) {
92                  throw new TokenizerException(e);
93              }
94          }
95          return retval;
96      }
97  
98      final List<CharSequence> extractSentences(
99              final EosDocument doc,
100             final SentenceTokenizer sentencer,
101             final ResettableTokenizer tokenizer,
102             final TextBuilder builder)
103                 throws EosException {
104         final List<CharSequence> sentences = new ArrayList<CharSequence>();
105 
106         final CharSequence text = doc.getText();
107         if (text != null) {
108             sentencer.reset(text);
109             Token sentence = null;
110             while ((sentence = sentencer.next()) != null) {
111                 final CharSequence seq = sentence.getTokenText();
112                 tokenizer.reset(seq);
113                 final List<Token> textTokens = new ArrayList<Token>();
114                 Token textToken = null;
115 
116                 while ((textToken = tokenizer.next()) != null) {
117                     textTokens.add(textToken);
118                 }
119 
120                 final CharSequence newText = builder.buildText(textTokens);
121                 sentences.add(newText);
122             }
123         }
124 
125         return sentences;
126     }
127 
128     final CharSequence extractTitle(final EosDocument doc,
129                                     final ResettableTokenizer tokenizer,
130                                     final TextBuilder builder)
131             throws EosException {
132 
133         final CharSequence title = doc.getTitle();
134         final List<Token> titleTokens = new ArrayList<Token>();
135         tokenizer.reset(title);
136         Token titleToken = null;
137         while ((titleToken = tokenizer.next()) != null) {
138             titleTokens.add(titleToken);
139         }
140         final CharSequence newTitle = builder.buildText(titleTokens);
141         return newTitle;
142     }
143 }