View Javadoc

1   /* Copyright (c) 2008 Sascha Kohlmann
2    *
3    * This program is free software: you can redistribute it and/or modify
4    * it under the terms of the GNU Affero General Public License as published by
5    * the Free Software Foundation, either version 3 of the License, or
6    * (at your option) any later version.
7    *
8    * This program is distributed in the hope that it will be useful,
9    * but WITHOUT ANY WARRANTY; without even the implied warranty of
10   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11   * GNU Affero General Public License for more details.
12   *
13   * You should have received a copy of the GNU Affero General Public License
14   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
15   */
16  package net.sf.eos.entity;
17  
18  import org.apache.commons.logging.Log;
19  import org.apache.commons.logging.LogFactory;
20  
21  import net.sf.eos.analyzer.AbstractToken;
22  import net.sf.eos.analyzer.TextBuilder;
23  import net.sf.eos.analyzer.Token;
24  import net.sf.eos.analyzer.Tokenizer;
25  import net.sf.eos.analyzer.TokenizerException;
26  
27  import static net.sf.eos.util.Conditions.checkArgument;
28  import static net.sf.eos.util.Conditions.checkState;
29  
30  import java.util.ArrayList;
31  import java.util.LinkedList;
32  import java.util.List;
33  import java.util.Map;
34  import java.util.Queue;
35  import java.util.Set;
36  
37  /**
38   * A simple matcher for named entities. The implementation slices
39   * {@link Token Tokens} of a defined maximum length thru the recognizer.
40   * If a token combination matches a key in the
41   * {@link AbstractDictionaryBasedEntityRecognizer#getEntityMap() entity map},
42   * a new {@code Token} of type {@link EntityRecognizer#ENTITY_TYPE} 
43   * is created and return by {@link #next()}.
44   * @author Sascha Kohlmann
45   */
46  public class SimpleLongestMatchDictionaryBasedEntityRecognizer
47          extends AbstractDictionaryBasedEntityRecognizer {
48  
49      static final Log LOG =
50          LogFactory.getLog(SimpleLongestMatchDictionaryBasedEntityRecognizer.class.getName());
51  
52      private Map<CharSequence, Set<CharSequence>> entities;
53      private Queue<Token> retvalBuffer = new LinkedList<Token>();
54      private FixedSizeQueue<Token> longestMatchQueue = null;
55  
56      /**
57       * Creates a new instance.
58       * @param source the source tokenizer
59       */
60      public SimpleLongestMatchDictionaryBasedEntityRecognizer(
61                  @SuppressWarnings("hiding") final Tokenizer source) {
62          super(source);
63      }
64  
65      /**
66       * Returned <code>Token</code> may be of type
67       * {@link EntityRecognizer#ENTITY_TYPE} or any different type.
68       * @throws IllegalStateException if {@link #getEntityMap()} returns
69       *                               <code>null</code>
70       * @see net.sf.eos.analyzer.Tokenizer#next()
71       */
72      @Override
73      public Token next() throws TokenizerException {
74          assert this.retvalBuffer != null;
75          if (! this.retvalBuffer.isEmpty()) {
76              return this.retvalBuffer.poll();
77          }
78          final int max = getMaxToken();
79          assert max >= 1;
80          if (this.longestMatchQueue == null) {
81              this.longestMatchQueue = new FixedSizeQueue<Token>(max);
82          }
83  
84          final Tokenizer source = getSource();
85          assert source != null;
86          final Map<CharSequence, Set<CharSequence>> entityMap = getEntityMap();
87          checkState(entityMap != null, "entitymap is null");
88          {
89              Token t = null;
90              while (this.longestMatchQueue.size() != max
91                      && (t = source.next()) != null) {
92                  if (this.longestMatchQueue.offerFix(t) != null) {
93                      throw new TokenizerException("internal error in size"
94                                                   + " handling");
95                  }
96              }
97          }
98          if (this.longestMatchQueue.size() != 0) {
99              if (LOG.isDebugEnabled()) {
100                 LOG.debug("END REACHED: "
101                           + this.longestMatchQueue.toString());
102             }
103             final Match match =
104                 checkForLongestMatchInTrie(this.longestMatchQueue, entityMap);
105 
106             if (match != null) {
107                 final Token token =
108                     new SimpleLongestMatch(match.key, ENTITY_TYPE);
109                 final Map<String, List<String>> meta = token.getMeta();
110                 final List<String> ids = new ArrayList<String>();
111                 for (final CharSequence cs : match.value) {
112                     ids.add("" + cs);
113                 }
114                 meta.put(ENTITY_ID_KEY, ids);
115 
116                 return token;
117             }
118             final Token old = this.longestMatchQueue.poll();
119             return old;
120         }
121 
122         return null;
123     }
124 
125     final Match checkForLongestMatchInTrie(
126             final LinkedList<Token> tokens,
127             final Map<CharSequence, Set<CharSequence>> trie) {
128         final int size = tokens.size();
129         final Token[] ts = tokens.toArray(new Token[size]);
130         for (int i = size; i != 0; i--) {
131             Token[] t;
132             if (size != i) {
133                 t = new Token[i];
134                 System.arraycopy(ts, 0, t, 0, i);
135             } else {
136                 t = ts;
137             }
138             final CharSequence seq = toCharSequence(t);
139             if (LOG.isDebugEnabled()) {
140                 LOG.debug("seq: '" + seq + "'");
141             }
142             final Set<CharSequence> value = trie.get(seq);
143             if (LOG.isDebugEnabled()) {
144                 LOG.debug("from trie: " + value);
145             }
146             if (value != null) {
147                 for (int j = 0; j < i; j++) {
148                     tokens.poll();
149                 }
150                 final Match match = new Match(seq, value);
151                 if (LOG.isDebugEnabled()) {
152                     LOG.debug("match: " + match);
153                 }
154                 return match;
155             }
156         }
157         return null;
158     }
159 
160     final CharSequence toCharSequence(final Token[] tokens) {
161         for (int i = 0; i < tokens.length; i++) {
162             if (LOG.isTraceEnabled()) {
163                 LOG.trace("token: " + tokens[i]);
164             }
165         }
166         final TextBuilder builder = getTextBuilder();
167         if (builder == null) {
168             return TextBuilder.SPACE_BUILDER.buildText(tokens);
169         }
170         return builder.buildText(tokens); 
171     }
172 
173     final static class Match {
174 
175         final CharSequence key;
176         final Set<CharSequence> value;
177 
178         public Match(final CharSequence match, final Set<CharSequence> value) {
179             this.key = match;
180             this.value = value;
181         }
182 
183         /*
184          * @see java.lang.Object#toString()
185          */
186         @Override
187         public String toString() {
188             final StringBuilder sb =
189                 new StringBuilder(this.getClass().getSimpleName());
190 
191             sb.append("[[key:");
192             sb.append(this.key);
193             sb.append("][value:");
194             sb.append(this.value);
195             sb.append("]]");
196 
197             return sb.toString();
198         }
199     }
200 
201     /**
202      * Only for internal use!
203      * @author Sascha Kohlmann
204      * @param <E> the type of the queue
205      */
206     final static class FixedSizeQueue<E> extends LinkedList<E> {
207 
208         private static final long serialVersionUID = -2462616526335905273L;
209         private int maxSize = 0;
210 
211         public FixedSizeQueue(final int size) {
212             checkArgument(size >= 0, "size < 0");
213             this.maxSize = size;
214         }
215 
216         @Override
217         public boolean offer(final E e) {
218             throw new UnsupportedOperationException("use offerFix(E) instead");
219         }
220 
221         public E offerFix(final E e) {
222             if (super.offer(e)) {
223                 if (size() > maxSize) {
224                     return poll();
225                 }
226             }
227             return null;
228         }
229 
230         @Override
231         public String toString() {
232             final StringBuilder sb = new StringBuilder("FixedSizeQueue[");
233 
234             for (final E e : this) {
235                 sb.append("[");
236                 sb.append(e);
237                 sb.append("]");
238             }
239             sb.append("]");
240 
241             return sb.toString();
242         }
243     }
244 
245     /** Token represents a longest match. */
246     private final static class SimpleLongestMatch extends AbstractToken {
247         /** Creates a new token.
248          * @param value the longest match value
249          * @param type the type */
250         public SimpleLongestMatch(final CharSequence value, final String type) {
251             super(value, type);
252         }
253     }
254 }