1 /* Copyright (c) 2008 Sascha Kohlmann
2 *
3 * This program is free software: you can redistribute it and/or modify
4 * it under the terms of the GNU Affero General Public License as published by
5 * the Free Software Foundation, either version 3 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU Affero General Public License for more details.
12 *
13 * You should have received a copy of the GNU Affero General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16 package net.sf.eos.sentence;
17
18 import static net.sf.eos.config.ConfigurationKey.Type.CLASSNAME;
19
20 import org.apache.commons.logging.Log;
21 import org.apache.commons.logging.LogFactory;
22
23 import net.sf.eos.EosException;
24 import net.sf.eos.analyzer.ResettableTokenizer;
25 import net.sf.eos.analyzer.SentenceTokenizer;
26 import net.sf.eos.analyzer.TextBuilder;
27 import net.sf.eos.analyzer.TokenizerException;
28 import net.sf.eos.config.Configuration;
29 import net.sf.eos.config.ConfigurationKey;
30 import net.sf.eos.config.Configured;
31 import net.sf.eos.config.FactoryMethod;
32 import net.sf.eos.document.EosDocument;
33
34 import java.security.MessageDigest;
35 import java.security.NoSuchAlgorithmException;
36 import java.util.Map;
37
38 /**
39 * The implementation fragmented {@link EosDocument} with more then one sentence
40 * in a lot of sentences with maybe only one sentence. Each sentence is also
41 * represented by a hashcode. The hashcode is able to support removing double
42 * sentences from a corpus.
43 * @author Sascha Kohlmann
44 */
45 public abstract class Sentencer extends Configured {
46
47 /** For logging. */
48 private static final Log LOG =
49 LogFactory.getLog(Sentencer.class.getName());
50
51 /** The default message digest algorithm. */
52 @SuppressWarnings("nls")
53 public static final String DEFAULT_MESSAGE_DIGEST = "md5";
54
55 /** The name of the algorithm of the message digest. */
56 @SuppressWarnings("nls")
57 @ConfigurationKey(type=CLASSNAME,
58 defaultValue=DEFAULT_MESSAGE_DIGEST,
59 description="The message digest.")
60 public static final String MESSAGE_DIGEST_CONFIG_NAME =
61 "net.sf.eos.sentence.Sentencer.messageDigest";
62
63 /** The configuration key name for the classname of the implementation.
64 * @see #newInstance(Configuration) */
65 @SuppressWarnings("nls")
66 @ConfigurationKey(type=CLASSNAME,
67 description="Configuration key of the sentencer.")
68 public final static String SENTENCER_IMPL_CONFIG_NAME =
69 "net.sf.eos.sentence.Sentencer.impl";
70
71 /**
72 * Creates a new instance of a of the implementation. If the
73 * {@code Configuration} contains a key
74 * {@link #SENTENCER_IMPL_CONFIG_NAME} a new instance of the
75 * classname in the value will instantiate. The
76 * {@link DefaultSentencer} will instantiate if there is no
77 * value setted.
78 * @param config the configuration
79 * @return a new instance
80 * @throws EosException if it is not possible to instantiate an instance
81 */
82 @SuppressWarnings("nls")
83 @FactoryMethod(key=SENTENCER_IMPL_CONFIG_NAME,
84 implementation=DefaultSentencer.class)
85 public final static Sentencer newInstance(final Configuration config)
86 throws EosException {
87
88 final Thread t = Thread.currentThread();
89 ClassLoader classLoader = t.getContextClassLoader();
90 if (classLoader == null) {
91 classLoader = Sentencer.class.getClassLoader();
92 }
93
94 final String clazzName = config.get(SENTENCER_IMPL_CONFIG_NAME,
95 DefaultSentencer.class.getName());
96
97 try {
98 final Class<? extends Sentencer> clazz =
99 (Class<? extends Sentencer>)
100 Class.forName(clazzName, true, classLoader);
101 try {
102
103 final Sentencer sentencer = clazz.newInstance();
104 sentencer.configure(config);
105 if (LOG.isDebugEnabled()) {
106 LOG.debug("Sentencer instance: "
107 + sentencer.getClass().getName());
108 }
109 return sentencer;
110
111 } catch (final InstantiationException e) {
112 throw new TokenizerException(e);
113 } catch (final IllegalAccessException e) {
114 throw new TokenizerException(e);
115 }
116 } catch (final ClassNotFoundException e) {
117 throw new TokenizerException(e);
118 }
119 }
120
121 /**
122 * Creates a new instance.
123 */
124 protected Sentencer() {
125 super();
126 }
127
128 /**
129 * Returns the message digest implementation. If the
130 * {@link #configure(Configuration) configuration} contains no value
131 * for the key {@link #MESSAGE_DIGEST_CONFIG_NAME} the
132 * <em>{@linkplain #DEFAULT_MESSAGE_DIGEST default}</em> digest will be
133 * used.
134 * @return the message digest
135 * @throws EosException if it is not possible to create the message digest
136 */
137 protected MessageDigest createDigester() throws EosException {
138 try {
139 final Configuration config = getConfiguration();
140 String algorithm = DEFAULT_MESSAGE_DIGEST;
141 if (config != null) {
142 algorithm = config.get(MESSAGE_DIGEST_CONFIG_NAME,
143 DEFAULT_MESSAGE_DIGEST);
144 }
145 MessageDigest md;
146 md = MessageDigest.getInstance(algorithm);
147 return md;
148 } catch (final NoSuchAlgorithmException e) {
149 throw new EosException(e);
150 }
151 }
152
153 /**
154 * Fragments a document into documents of sentences. The return value is
155 * a map of {@linkplain #createDigester() message digests} and sentenced
156 * document. The documents of the return value has all metada data of the
157 * original document and maybe additional metadata.
158 *
159 * @param doc the document to fragment
160 * @param sentencer a sentencer instance
161 * @param tokenizer a tokenizer instance to tokenize the result of the
162 * <em>sentencer</em>
163 * @param builder the builder supports the rebuilding of the
164 * <em>tokenizer</em>
165 * @return a map of message digest <tt>-></tt> document relations
166 * @throws EosException if an error occurs
167 */
168 public abstract Map<String, EosDocument>
169 toSentenceDocuments(final EosDocument doc,
170 final SentenceTokenizer sentencer,
171 final ResettableTokenizer tokenizer,
172 final TextBuilder builder)
173 throws EosException;
174 }