View Javadoc

1   /* Copyright (c) 2008 Sascha Kohlmann
2    *
3    * This program is free software: you can redistribute it and/or modify
4    * it under the terms of the GNU Affero General Public License as published by
5    * the Free Software Foundation, either version 3 of the License, or
6    * (at your option) any later version.
7    *
8    * This program is distributed in the hope that it will be useful,
9    * but WITHOUT ANY WARRANTY; without even the implied warranty of
10   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11   * GNU Affero General Public License for more details.
12   *
13   * You should have received a copy of the GNU Affero General Public License
14   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
15   */
16  package net.sf.eos.sentence;
17  
18  import static net.sf.eos.config.ConfigurationKey.Type.CLASSNAME;
19  
20  import org.apache.commons.logging.Log;
21  import org.apache.commons.logging.LogFactory;
22  
23  import net.sf.eos.EosException;
24  import net.sf.eos.analyzer.ResettableTokenizer;
25  import net.sf.eos.analyzer.SentenceTokenizer;
26  import net.sf.eos.analyzer.TextBuilder;
27  import net.sf.eos.analyzer.TokenizerException;
28  import net.sf.eos.config.Configuration;
29  import net.sf.eos.config.ConfigurationKey;
30  import net.sf.eos.config.Configured;
31  import net.sf.eos.config.FactoryMethod;
32  import net.sf.eos.document.EosDocument;
33  
34  import java.security.MessageDigest;
35  import java.security.NoSuchAlgorithmException;
36  import java.util.Map;
37  
38  /**
39   * The implementation fragmented {@link EosDocument} with more then one sentence
40   * in a lot of sentences with maybe only one sentence. Each sentence is also
41   * represented by a hashcode. The hashcode is able to support removing double
42   * sentences from a corpus.
43   * @author Sascha Kohlmann
44   */
45  public abstract class Sentencer extends Configured {
46  
47      /** For logging. */
48      private static final Log LOG =
49          LogFactory.getLog(Sentencer.class.getName());
50  
51      /** The default message digest algorithm. */
52      @SuppressWarnings("nls")
53      public static final String DEFAULT_MESSAGE_DIGEST = "md5";
54  
55      /** The name of the algorithm of the message digest. */
56      @SuppressWarnings("nls")
57      @ConfigurationKey(type=CLASSNAME,
58                              defaultValue=DEFAULT_MESSAGE_DIGEST,
59                              description="The message digest.")
60      public static final String MESSAGE_DIGEST_CONFIG_NAME =
61          "net.sf.eos.sentence.Sentencer.messageDigest";
62  
63      /** The configuration key name for the classname of the implementation.
64       * @see #newInstance(Configuration) */
65      @SuppressWarnings("nls")
66      @ConfigurationKey(type=CLASSNAME,
67                              description="Configuration key of the sentencer.")
68      public final static String SENTENCER_IMPL_CONFIG_NAME =
69          "net.sf.eos.sentence.Sentencer.impl";
70  
71      /**
72       * Creates a new instance of a of the implementation. If the
73       * {@code Configuration} contains a key
74       * {@link #SENTENCER_IMPL_CONFIG_NAME} a new instance of the
75       * classname in the value will instantiate. The 
76       * {@link DefaultSentencer} will instantiate if there is no
77       * value setted.
78       * @param config the configuration
79       * @return a new instance
80       * @throws EosException if it is not possible to instantiate an instance
81       */
82      @SuppressWarnings("nls")
83      @FactoryMethod(key=SENTENCER_IMPL_CONFIG_NAME,
84                     implementation=DefaultSentencer.class)
85      public final static Sentencer newInstance(final Configuration config)
86              throws EosException {
87  
88          final Thread t = Thread.currentThread();
89          ClassLoader classLoader = t.getContextClassLoader();
90          if (classLoader == null) {
91              classLoader = Sentencer.class.getClassLoader();
92          }
93  
94          final String clazzName = config.get(SENTENCER_IMPL_CONFIG_NAME,
95                                              DefaultSentencer.class.getName());
96  
97          try {
98              final Class<? extends Sentencer> clazz = 
99                  (Class<? extends Sentencer>) 
100                     Class.forName(clazzName, true, classLoader);
101             try {
102 
103                 final Sentencer sentencer = clazz.newInstance();
104                 sentencer.configure(config);
105                 if (LOG.isDebugEnabled()) {
106                     LOG.debug("Sentencer instance: "
107                               + sentencer.getClass().getName());
108                 }
109                 return sentencer;
110 
111             } catch (final InstantiationException e) {
112                 throw new TokenizerException(e);
113             } catch (final IllegalAccessException e) {
114                 throw new TokenizerException(e);
115             }
116         } catch (final ClassNotFoundException e) {
117             throw new TokenizerException(e);
118         }
119     }
120 
121     /**
122      * Creates a new instance.
123      */
124     protected Sentencer() {
125         super();
126     }
127 
128     /**
129      * Returns the message digest implementation. If the
130      * {@link #configure(Configuration) configuration} contains no value
131      * for the key {@link #MESSAGE_DIGEST_CONFIG_NAME} the
132      * <em>{@linkplain #DEFAULT_MESSAGE_DIGEST default}</em> digest will be
133      * used.
134      * @return the message digest
135      * @throws EosException if it is not possible to create the message digest
136      */
137     protected MessageDigest createDigester() throws EosException {
138         try {
139             final Configuration config = getConfiguration();
140             String algorithm = DEFAULT_MESSAGE_DIGEST;
141             if (config != null) {
142                 algorithm = config.get(MESSAGE_DIGEST_CONFIG_NAME,
143                                        DEFAULT_MESSAGE_DIGEST);
144             }
145             MessageDigest md;
146             md = MessageDigest.getInstance(algorithm);
147             return md;
148         } catch (final NoSuchAlgorithmException e) {
149             throw new EosException(e);
150         }
151     }
152 
153     /**
154      * Fragments a document into documents of sentences. The return value is
155      * a map of {@linkplain #createDigester() message digests} and sentenced
156      * document. The documents of the return value has all metada data of the
157      * original document and maybe additional metadata.
158      * 
159      * @param doc the document to fragment
160      * @param sentencer a sentencer instance
161      * @param tokenizer a tokenizer instance to tokenize the result of the
162      *                  <em>sentencer</em>
163      * @param builder the builder supports the rebuilding of the
164      *                <em>tokenizer</em>
165      * @return a map of message digest <tt>-></tt> document relations
166      * @throws EosException if an error occurs
167      */
168     public abstract Map<String, EosDocument> 
169         toSentenceDocuments(final EosDocument doc,
170                             final SentenceTokenizer sentencer,
171                             final ResettableTokenizer tokenizer,
172                             final TextBuilder builder)
173             throws EosException;
174 }