1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16 package net.sf.eos.entity;
17
18 import static net.sf.eos.config.ConfigurationKey.Type.INTEGER;
19 import static net.sf.eos.config.ConfigurationKey.Type.CLASSNAME;
20
21 import org.apache.commons.logging.Log;
22 import org.apache.commons.logging.LogFactory;
23
24 import net.sf.eos.EosException;
25 import net.sf.eos.analyzer.TextBuilder;
26 import net.sf.eos.analyzer.TokenFilter;
27 import net.sf.eos.analyzer.Tokenizer;
28 import net.sf.eos.analyzer.TokenizerException;
29 import net.sf.eos.config.Configurable;
30 import net.sf.eos.config.Configuration;
31 import net.sf.eos.config.ConfigurationKey;
32 import net.sf.eos.config.FactoryMethod;
33
34 import java.lang.reflect.Constructor;
35 import java.lang.reflect.InvocationTargetException;
36 import java.util.Map;
37 import java.util.Set;
38
39
40
41
42
43
44
45
46
47 public abstract class AbstractDictionaryBasedEntityRecognizer
48 extends TokenFilter
49 implements EntityRecognizer,
50 Configurable,
51 DictionaryBasedEntityRecognizer {
52
53
54 private static final Log LOG =
55 LogFactory.getLog(AbstractDictionaryBasedEntityRecognizer.class.getName());
56
57
58
59
60 @SuppressWarnings("nls")
61 @ConfigurationKey(type=CLASSNAME,
62 description="Implementations of a EntityRecognizer "
63 + "to identify entities in a text.")
64 public final static String
65 ABSTRACT_DICTIONARY_BASED_ENTITY_RECOGNIZER_IMPL_CONFIG_NAME =
66 "net.sf.eos.entity.AbstractDictionaryBasedEntityRecognizer.impl";
67
68
69 @SuppressWarnings("nls")
70 private final static String DEFAULT_MAX_TOKEN = "5";
71
72
73 @SuppressWarnings("nls")
74 @ConfigurationKey(type=INTEGER,
75 defaultValue=DEFAULT_MAX_TOKEN,
76 description="The maximum token count for indentifying.")
77 public final static String
78 MAX_TOKEN_CONFIG_NAME =
79 "net.sf.eos.entity.AbstractDictionaryBasedEntityRecognizer.maxToken";
80
81 private Configuration config;
82
83 private TextBuilder textBuilder;
84 private int maxToken = Integer.parseInt(DEFAULT_MAX_TOKEN);
85 private Map<CharSequence, Set<CharSequence>> entities = null;
86
87 public AbstractDictionaryBasedEntityRecognizer(
88 @SuppressWarnings("hiding") final Tokenizer source) {
89 super(source);
90 };
91
92
93
94
95 public void setEntityMap(final Map<CharSequence, Set<CharSequence>> entities)
96 {
97 this.entities = entities;
98 }
99
100
101
102
103 public Map<CharSequence, Set<CharSequence>> getEntityMap() {
104 return this.entities;
105 }
106
107
108
109
110 public void setTextBuilder(final TextBuilder builder) {
111 this.textBuilder = builder;
112 }
113
114
115
116
117 public TextBuilder getTextBuilder() {
118 return this.textBuilder;
119 }
120
121
122
123
124 public int getMaxToken() {
125 return this.maxToken;
126 }
127
128
129
130
131 public void setMaxToken(@SuppressWarnings("hiding") final int maxToken) {
132 if (maxToken < 1) {
133 throw new IllegalArgumentException("maxToken < 1");
134 }
135 this.maxToken = maxToken;
136 }
137
138
139
140
141 public void configure(
142 @SuppressWarnings("hiding") final Configuration config) {
143 this.config = new Configuration(config);
144 final String lMaxToken =
145 config.get(MAX_TOKEN_CONFIG_NAME, DEFAULT_MAX_TOKEN);
146 this.maxToken = Integer.parseInt(lMaxToken);
147 }
148
149
150
151
152
153 protected final Configuration getConfiguration() {
154 return this.config;
155 }
156
157
158
159
160
161
162
163
164 @FactoryMethod(key=ABSTRACT_DICTIONARY_BASED_ENTITY_RECOGNIZER_IMPL_CONFIG_NAME,
165 implementation=SimpleLongestMatchDictionaryBasedEntityRecognizer.class)
166 public final static DictionaryBasedEntityRecognizer
167 newInstance(final Tokenizer source) throws EosException {
168 final Configuration config = new Configuration();
169 return newInstance(source, config);
170 }
171
172
173
174
175
176
177
178
179
180
181
182
183
184 @FactoryMethod(key=ABSTRACT_DICTIONARY_BASED_ENTITY_RECOGNIZER_IMPL_CONFIG_NAME,
185 implementation=SimpleLongestMatchDictionaryBasedEntityRecognizer.class)
186 public final static DictionaryBasedEntityRecognizer
187 newInstance(final Tokenizer source, final Configuration config)
188 throws EosException {
189
190 final Thread t = Thread.currentThread();
191 ClassLoader classLoader = t.getContextClassLoader();
192 if (classLoader == null) {
193 classLoader =
194 AbstractDictionaryBasedEntityRecognizer.class.getClassLoader();
195 }
196
197 final String clazzName =
198 config.get(ABSTRACT_DICTIONARY_BASED_ENTITY_RECOGNIZER_IMPL_CONFIG_NAME,
199 SimpleLongestMatchDictionaryBasedEntityRecognizer.class.getName());
200
201 try {
202 final Class<? extends AbstractDictionaryBasedEntityRecognizer> clazz
203 = (Class<? extends AbstractDictionaryBasedEntityRecognizer>)
204 Class.forName(clazzName, true, classLoader);
205 try {
206
207 final Constructor<? extends AbstractDictionaryBasedEntityRecognizer>
208 constructor = clazz.getConstructor(Tokenizer.class);
209
210 final AbstractDictionaryBasedEntityRecognizer recognizer
211 = constructor.newInstance(source);
212 recognizer.configure(config);
213 if (LOG.isDebugEnabled()) {
214 LOG.debug("AbstractDictionaryBasedEntityRecognizer instance: "
215 + recognizer.getClass().getName());
216 }
217 return recognizer;
218
219 } catch (final InstantiationException e) {
220 throw new TokenizerException(e);
221 } catch (final IllegalAccessException e) {
222 throw new TokenizerException(e);
223 } catch (final SecurityException e) {
224 throw new TokenizerException(e);
225 } catch (final NoSuchMethodException e) {
226 throw new TokenizerException(e);
227 } catch (final IllegalArgumentException e) {
228 throw new TokenizerException(e);
229 } catch (final InvocationTargetException e) {
230 throw new TokenizerException(e);
231 }
232 } catch (final ClassNotFoundException e) {
233 throw new TokenizerException(e);
234 }
235 }
236 }