View Javadoc

1   /* Copyright (c) 2008 Sascha Kohlmann
2    *
3    * This program is free software: you can redistribute it and/or modify
4    * it under the terms of the GNU Affero General Public License as published by
5    * the Free Software Foundation, either version 3 of the License, or
6    * (at your option) any later version.
7    *
8    * This program is distributed in the hope that it will be useful,
9    * but WITHOUT ANY WARRANTY; without even the implied warranty of
10   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11   * GNU Affero General Public License for more details.
12   *
13   * You should have received a copy of the GNU Affero General Public License
14   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
15   */
16  package net.sf.eos.entity;
17  
18  import static net.sf.eos.config.ConfigurationKey.Type.INTEGER;
19  import static net.sf.eos.config.ConfigurationKey.Type.CLASSNAME;
20  
21  import org.apache.commons.logging.Log;
22  import org.apache.commons.logging.LogFactory;
23  
24  import net.sf.eos.EosException;
25  import net.sf.eos.analyzer.TextBuilder;
26  import net.sf.eos.analyzer.TokenFilter;
27  import net.sf.eos.analyzer.Tokenizer;
28  import net.sf.eos.analyzer.TokenizerException;
29  import net.sf.eos.config.Configurable;
30  import net.sf.eos.config.Configuration;
31  import net.sf.eos.config.ConfigurationKey;
32  import net.sf.eos.config.FactoryMethod;
33   
34  import java.lang.reflect.Constructor;
35  import java.lang.reflect.InvocationTargetException;
36  import java.util.Map;
37  import java.util.Set;
38  
39  /**
40   * An implementation of a @code EntityRecognizer} identifies entities
41   * in a text. An entity may represented by an ID. The ID is a bracket around
42   * a collection of literal entity terms or phrases. The ID is represented by the
43   * <em>value</em> of a {@link Map} entry. The entity literal is the value of
44   * the key in the entry.
45   * @author Sascha Kohlmann
46   */
47  public abstract class AbstractDictionaryBasedEntityRecognizer
48           extends TokenFilter
49           implements EntityRecognizer,
50                      Configurable,
51                      DictionaryBasedEntityRecognizer {
52  
53      /** For logging. */
54      private static final Log LOG =
55          LogFactory.getLog(AbstractDictionaryBasedEntityRecognizer.class.getName());
56  
57      /** The configuration key name for the classname of the factory.
58       * @see #newInstance(Tokenizer, Configuration)
59       * @see #newInstance(Tokenizer) */
60      @SuppressWarnings("nls")
61      @ConfigurationKey(type=CLASSNAME,
62                              description="Implementations of a EntityRecognizer "
63                                          + "to identify entities in a text.")
64      public final static String
65          ABSTRACT_DICTIONARY_BASED_ENTITY_RECOGNIZER_IMPL_CONFIG_NAME =
66              "net.sf.eos.entity.AbstractDictionaryBasedEntityRecognizer.impl";
67  
68      /** Default maximum token count. */
69      @SuppressWarnings("nls")
70      private final static String DEFAULT_MAX_TOKEN = "5";
71  
72      /** Key for the maximum token count. */
73      @SuppressWarnings("nls")
74      @ConfigurationKey(type=INTEGER,
75                              defaultValue=DEFAULT_MAX_TOKEN,
76                              description="The maximum token count for indentifying.")
77      public final static String
78          MAX_TOKEN_CONFIG_NAME =
79            "net.sf.eos.entity.AbstractDictionaryBasedEntityRecognizer.maxToken";
80  
81      private Configuration config;
82  
83      private TextBuilder textBuilder;
84      private int maxToken = Integer.parseInt(DEFAULT_MAX_TOKEN);
85      private Map<CharSequence, Set<CharSequence>> entities = null;
86  
87      public AbstractDictionaryBasedEntityRecognizer(
88                  @SuppressWarnings("hiding") final Tokenizer source) {
89          super(source);
90      };
91  
92      /*
93       * @see net.sf.eos.entity.DictionaryBasedEntityRecognizer#setEntityMap(java.util.Map)
94       */
95      public void setEntityMap(final Map<CharSequence, Set<CharSequence>> entities) 
96      {
97          this.entities = entities;
98      }
99  
100     /*
101      * @see net.sf.eos.entity.DictionaryBasedEntityRecognizer#getEntityMap()
102      */
103     public Map<CharSequence, Set<CharSequence>> getEntityMap() {
104         return this.entities;
105     }
106 
107     /*
108      * @see net.sf.eos.entity.DictionaryBasedEntityRecognizer#setTextBuilder(net.sf.eos.analyzer.TextBuilder)
109      */
110     public void setTextBuilder(final TextBuilder builder) {
111         this.textBuilder = builder;
112     }
113 
114     /*
115      * @see net.sf.eos.entity.DictionaryBasedEntityRecognizer#getTextBuilder()
116      */
117     public TextBuilder getTextBuilder() {
118         return this.textBuilder;
119     }
120 
121     /*
122      * @see net.sf.eos.entity.DictionaryBasedEntityRecognizer#getMaxToken()
123      */
124     public int getMaxToken() {
125         return this.maxToken;
126     }
127 
128     /*
129      * @see net.sf.eos.entity.DictionaryBasedEntityRecognizer#setMaxToken(int)
130      */
131     public void setMaxToken(@SuppressWarnings("hiding") final int maxToken) {
132         if (maxToken < 1) {
133             throw new IllegalArgumentException("maxToken < 1");
134         }
135         this.maxToken = maxToken;
136     }
137 
138     /*
139      * @see net.sf.eos.entity.DictionaryBasedEntityRecognizer#configure(net.sf.eos.config.Configuration)
140      */
141     public void configure(
142             @SuppressWarnings("hiding") final Configuration config) {
143         this.config = new Configuration(config);
144         final String lMaxToken =
145             config.get(MAX_TOKEN_CONFIG_NAME, DEFAULT_MAX_TOKEN);
146         this.maxToken = Integer.parseInt(lMaxToken);
147     }
148 
149     /**
150      * Returns the configuration.
151      * @return the configuration holder or {@code null}
152      */
153     protected final Configuration getConfiguration() {
154         return this.config;
155     }
156 
157     /**
158      * Creates a new instance of a of the recognizer. Instantiate the 
159      * {@link SimpleLongestMatchDictionaryBasedEntityRecognizer}.
160      * @param source a source tokenizer
161      * @return a new instance
162      * @throws EosException if it is not possible to instantiate an instance
163      */
164     @FactoryMethod(key=ABSTRACT_DICTIONARY_BASED_ENTITY_RECOGNIZER_IMPL_CONFIG_NAME,
165                    implementation=SimpleLongestMatchDictionaryBasedEntityRecognizer.class)
166     public final static DictionaryBasedEntityRecognizer 
167             newInstance(final Tokenizer source) throws EosException {
168         final Configuration config = new Configuration();
169         return newInstance(source, config);
170     }
171 
172     /**
173      * Creates a new instance of a of the recognizer. If the
174      * {@code Configuration} contains a key
175      * {@link #ABSTRACT_DICTIONARY_BASED_ENTITY_RECOGNIZER_IMPL_CONFIG_NAME}
176      * a new instance of the classname in the value will instantiate. The 
177      * {@link SimpleLongestMatchDictionaryBasedEntityRecognizer} will
178      * instantiate if there is no value setted.
179      * @param source a source tokenizer
180      * @param config the configuration
181      * @return a new instance
182      * @throws EosException if it is not possible to instantiate an instance
183      */
184     @FactoryMethod(key=ABSTRACT_DICTIONARY_BASED_ENTITY_RECOGNIZER_IMPL_CONFIG_NAME,
185                    implementation=SimpleLongestMatchDictionaryBasedEntityRecognizer.class)
186     public final static DictionaryBasedEntityRecognizer 
187             newInstance(final Tokenizer source, final Configuration config)
188                 throws EosException {
189 
190         final Thread t = Thread.currentThread();
191         ClassLoader classLoader = t.getContextClassLoader();
192         if (classLoader == null) {
193             classLoader =
194                 AbstractDictionaryBasedEntityRecognizer.class.getClassLoader();
195         }
196 
197         final String clazzName =
198             config.get(ABSTRACT_DICTIONARY_BASED_ENTITY_RECOGNIZER_IMPL_CONFIG_NAME,
199                     SimpleLongestMatchDictionaryBasedEntityRecognizer.class.getName());
200 
201         try {
202             final Class<? extends AbstractDictionaryBasedEntityRecognizer> clazz
203                 = (Class<? extends AbstractDictionaryBasedEntityRecognizer>) 
204                     Class.forName(clazzName, true, classLoader);
205             try {
206 
207                 final Constructor<? extends AbstractDictionaryBasedEntityRecognizer> 
208                     constructor = clazz.getConstructor(Tokenizer.class);
209 
210                 final AbstractDictionaryBasedEntityRecognizer recognizer
211                      = constructor.newInstance(source);
212                 recognizer.configure(config);
213                 if (LOG.isDebugEnabled()) {
214                     LOG.debug("AbstractDictionaryBasedEntityRecognizer instance: "
215                               + recognizer.getClass().getName());
216                 }
217                 return recognizer;
218 
219             } catch (final InstantiationException e) {
220                 throw new TokenizerException(e);
221             } catch (final IllegalAccessException e) {
222                 throw new TokenizerException(e);
223             } catch (final SecurityException e) {
224                 throw new TokenizerException(e);
225             } catch (final NoSuchMethodException e) {
226                 throw new TokenizerException(e);
227             } catch (final IllegalArgumentException e) {
228                 throw new TokenizerException(e);
229             } catch (final InvocationTargetException e) {
230                 throw new TokenizerException(e);
231             }
232         } catch (final ClassNotFoundException e) {
233             throw new TokenizerException(e);
234         }
235     }
236 }