1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16 package net.sf.eos.entity;
17
18 import org.apache.commons.logging.Log;
19 import org.apache.commons.logging.LogFactory;
20
21 import net.sf.eos.analyzer.AbstractToken;
22 import net.sf.eos.analyzer.TextBuilder;
23 import net.sf.eos.analyzer.Token;
24 import net.sf.eos.analyzer.Tokenizer;
25 import net.sf.eos.analyzer.TokenizerException;
26
27 import static net.sf.eos.util.Conditions.checkArgument;
28 import static net.sf.eos.util.Conditions.checkState;
29
30 import java.util.ArrayList;
31 import java.util.LinkedList;
32 import java.util.List;
33 import java.util.Map;
34 import java.util.Queue;
35 import java.util.Set;
36
37
38
39
40
41
42
43
44
45
46 public class SimpleLongestMatchDictionaryBasedEntityRecognizer
47 extends AbstractDictionaryBasedEntityRecognizer {
48
49 static final Log LOG =
50 LogFactory.getLog(SimpleLongestMatchDictionaryBasedEntityRecognizer.class.getName());
51
52 private Map<CharSequence, Set<CharSequence>> entities;
53 private Queue<Token> retvalBuffer = new LinkedList<Token>();
54 private FixedSizeQueue<Token> longestMatchQueue = null;
55
56
57
58
59
60 public SimpleLongestMatchDictionaryBasedEntityRecognizer(
61 @SuppressWarnings("hiding") final Tokenizer source) {
62 super(source);
63 }
64
65
66
67
68
69
70
71
72 @Override
73 public Token next() throws TokenizerException {
74 assert this.retvalBuffer != null;
75 if (! this.retvalBuffer.isEmpty()) {
76 return this.retvalBuffer.poll();
77 }
78 final int max = getMaxToken();
79 assert max >= 1;
80 if (this.longestMatchQueue == null) {
81 this.longestMatchQueue = new FixedSizeQueue<Token>(max);
82 }
83
84 final Tokenizer source = getSource();
85 assert source != null;
86 final Map<CharSequence, Set<CharSequence>> entityMap = getEntityMap();
87 checkState(entityMap != null, "entitymap is null");
88 {
89 Token t = null;
90 while (this.longestMatchQueue.size() != max
91 && (t = source.next()) != null) {
92 if (this.longestMatchQueue.offerFix(t) != null) {
93 throw new TokenizerException("internal error in size"
94 + " handling");
95 }
96 }
97 }
98 if (this.longestMatchQueue.size() != 0) {
99 if (LOG.isDebugEnabled()) {
100 LOG.debug("END REACHED: "
101 + this.longestMatchQueue.toString());
102 }
103 final Match match =
104 checkForLongestMatchInTrie(this.longestMatchQueue, entityMap);
105
106 if (match != null) {
107 final Token token =
108 new SimpleLongestMatch(match.key, ENTITY_TYPE);
109 final Map<String, List<String>> meta = token.getMeta();
110 final List<String> ids = new ArrayList<String>();
111 for (final CharSequence cs : match.value) {
112 ids.add("" + cs);
113 }
114 meta.put(ENTITY_ID_KEY, ids);
115
116 return token;
117 }
118 final Token old = this.longestMatchQueue.poll();
119 return old;
120 }
121
122 return null;
123 }
124
125 final Match checkForLongestMatchInTrie(
126 final LinkedList<Token> tokens,
127 final Map<CharSequence, Set<CharSequence>> trie) {
128 final int size = tokens.size();
129 final Token[] ts = tokens.toArray(new Token[size]);
130 for (int i = size; i != 0; i--) {
131 Token[] t;
132 if (size != i) {
133 t = new Token[i];
134 System.arraycopy(ts, 0, t, 0, i);
135 } else {
136 t = ts;
137 }
138 final CharSequence seq = toCharSequence(t);
139 if (LOG.isDebugEnabled()) {
140 LOG.debug("seq: '" + seq + "'");
141 }
142 final Set<CharSequence> value = trie.get(seq);
143 if (LOG.isDebugEnabled()) {
144 LOG.debug("from trie: " + value);
145 }
146 if (value != null) {
147 for (int j = 0; j < i; j++) {
148 tokens.poll();
149 }
150 final Match match = new Match(seq, value);
151 if (LOG.isDebugEnabled()) {
152 LOG.debug("match: " + match);
153 }
154 return match;
155 }
156 }
157 return null;
158 }
159
160 final CharSequence toCharSequence(final Token[] tokens) {
161 for (int i = 0; i < tokens.length; i++) {
162 if (LOG.isTraceEnabled()) {
163 LOG.trace("token: " + tokens[i]);
164 }
165 }
166 final TextBuilder builder = getTextBuilder();
167 if (builder == null) {
168 return TextBuilder.SPACE_BUILDER.buildText(tokens);
169 }
170 return builder.buildText(tokens);
171 }
172
173 final static class Match {
174
175 final CharSequence key;
176 final Set<CharSequence> value;
177
178 public Match(final CharSequence match, final Set<CharSequence> value) {
179 this.key = match;
180 this.value = value;
181 }
182
183
184
185
186 @Override
187 public String toString() {
188 final StringBuilder sb =
189 new StringBuilder(this.getClass().getSimpleName());
190
191 sb.append("[[key:");
192 sb.append(this.key);
193 sb.append("][value:");
194 sb.append(this.value);
195 sb.append("]]");
196
197 return sb.toString();
198 }
199 }
200
201
202
203
204
205
206 final static class FixedSizeQueue<E> extends LinkedList<E> {
207
208 private static final long serialVersionUID = -2462616526335905273L;
209 private int maxSize = 0;
210
211 public FixedSizeQueue(final int size) {
212 checkArgument(size >= 0, "size < 0");
213 this.maxSize = size;
214 }
215
216 @Override
217 public boolean offer(final E e) {
218 throw new UnsupportedOperationException("use offerFix(E) instead");
219 }
220
221 public E offerFix(final E e) {
222 if (super.offer(e)) {
223 if (size() > maxSize) {
224 return poll();
225 }
226 }
227 return null;
228 }
229
230 @Override
231 public String toString() {
232 final StringBuilder sb = new StringBuilder("FixedSizeQueue[");
233
234 for (final E e : this) {
235 sb.append("[");
236 sb.append(e);
237 sb.append("]");
238 }
239 sb.append("]");
240
241 return sb.toString();
242 }
243 }
244
245
246 private final static class SimpleLongestMatch extends AbstractToken {
247
248
249
250 public SimpleLongestMatch(final CharSequence value, final String type) {
251 super(value, type);
252 }
253 }
254 }