1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16 package net.sf.eos.sentence;
17
18
19 import net.sf.eos.EosException;
20 import net.sf.eos.analyzer.ResettableTokenizer;
21 import net.sf.eos.analyzer.SentenceTokenizer;
22 import net.sf.eos.analyzer.TextBuilder;
23 import net.sf.eos.analyzer.Token;
24 import net.sf.eos.analyzer.TokenizerException;
25 import net.sf.eos.document.EosDocument;
26
27 import org.apache.commons.codec.binary.Hex;
28 import org.apache.commons.logging.Log;
29 import org.apache.commons.logging.LogFactory;
30
31 import java.io.UnsupportedEncodingException;
32 import java.security.MessageDigest;
33 import java.util.ArrayList;
34 import java.util.HashMap;
35 import java.util.List;
36 import java.util.Map;
37
38
39
40
41
42 public class DefaultSentencer extends Sentencer {
43
44
45 private static final Log LOG =
46 LogFactory.getLog(DefaultSentencer.class.getName());
47
48
49 public DefaultSentencer() {
50 super();
51 }
52
53 @SuppressWarnings("nls")
54 @Override
55 public Map<String, EosDocument>
56 toSentenceDocuments(final EosDocument doc,
57 final SentenceTokenizer sentencer,
58 final ResettableTokenizer tokenizer,
59 final TextBuilder builder)
60 throws EosException {
61
62 if (LOG.isDebugEnabled()) {
63 LOG.debug("SentenceTokenizer instance: " + sentencer.getClass());
64 LOG.debug("ResettableTokenizer instance: " + tokenizer.getClass());
65 LOG.debug("TextBuilder instance: " + builder.getClass());
66 }
67 final Map<String, EosDocument> retval =
68 new HashMap<String, EosDocument>();
69 final MessageDigest md = createDigester();
70
71 final Map<String, List<String>> meta = doc.getMeta();
72
73 final CharSequence newTitle = extractTitle(doc, tokenizer, builder);
74 final List<CharSequence> sentences =
75 extractSentences(doc, sentencer, tokenizer, builder);
76
77 for (final CharSequence newText : sentences) {
78 final EosDocument newDoc = new EosDocument();
79 newDoc.setText(newText);
80 newDoc.setTitle(newTitle);
81 final Map<String, List<String>> newMeta = newDoc.getMeta();
82 newMeta.putAll(meta);
83
84 try {
85 final byte[] bytes = ("" + newText).getBytes("UTF-8");
86 md.reset();
87 final byte[] key = md.digest(bytes);
88 final char[] asChar = Hex.encodeHex(key);
89 final String asString = new String(asChar);
90 retval.put(asString, newDoc);
91 } catch (final UnsupportedEncodingException e) {
92 throw new TokenizerException(e);
93 }
94 }
95 return retval;
96 }
97
98 final List<CharSequence> extractSentences(
99 final EosDocument doc,
100 final SentenceTokenizer sentencer,
101 final ResettableTokenizer tokenizer,
102 final TextBuilder builder)
103 throws EosException {
104 final List<CharSequence> sentences = new ArrayList<CharSequence>();
105
106 final CharSequence text = doc.getText();
107 if (text != null) {
108 sentencer.reset(text);
109 Token sentence = null;
110 while ((sentence = sentencer.next()) != null) {
111 final CharSequence seq = sentence.getTokenText();
112 tokenizer.reset(seq);
113 final List<Token> textTokens = new ArrayList<Token>();
114 Token textToken = null;
115
116 while ((textToken = tokenizer.next()) != null) {
117 textTokens.add(textToken);
118 }
119
120 final CharSequence newText = builder.buildText(textTokens);
121 sentences.add(newText);
122 }
123 }
124
125 return sentences;
126 }
127
128 final CharSequence extractTitle(final EosDocument doc,
129 final ResettableTokenizer tokenizer,
130 final TextBuilder builder)
131 throws EosException {
132
133 final CharSequence title = doc.getTitle();
134 final List<Token> titleTokens = new ArrayList<Token>();
135 tokenizer.reset(title);
136 Token titleToken = null;
137 while ((titleToken = tokenizer.next()) != null) {
138 titleTokens.add(titleToken);
139 }
140 final CharSequence newTitle = builder.buildText(titleTokens);
141 return newTitle;
142 }
143 }