View Javadoc

1   /* Copyright (c) 2008 Sascha Kohlmann
2    *
3    * This program is free software: you can redistribute it and/or modify
4    * it under the terms of the GNU Affero General Public License as published by
5    * the Free Software Foundation, either version 3 of the License, or
6    * (at your option) any later version.
7    *
8    * This program is distributed in the hope that it will be useful,
9    * but WITHOUT ANY WARRANTY; without even the implied warranty of
10   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11   * GNU Affero General Public License for more details.
12   *
13   * You should have received a copy of the GNU Affero General Public License
14   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
15   */
16  package net.sf.eos.hadoop.mapred.cooccurrence;
17  
18  import static net.sf.eos.entity.DictionaryBasedEntityRecognizer.ENTITY_ID_KEY;
19  import static net.sf.eos.entity.EntityRecognizer.ENTITY_TYPE;
20  import net.sf.eos.EosException;
21  import net.sf.eos.analyzer.ResettableTokenizer;
22  import net.sf.eos.analyzer.TextBuilder;
23  import net.sf.eos.analyzer.Token;
24  import net.sf.eos.analyzer.TokenizerSupplier;
25  import net.sf.eos.analyzer.TokenizerException;
26  import net.sf.eos.analyzer.TextBuilder.SpaceBuilder;
27  import net.sf.eos.config.Configuration;
28  import net.sf.eos.config.Configured;
29  import net.sf.eos.config.Service;
30  import net.sf.eos.config.Services;
31  import net.sf.eos.document.EosDocument;
32  import net.sf.eos.entity.AbstractDictionaryBasedEntityRecognizer;
33  import net.sf.eos.entity.DictionaryBasedEntityRecognizer;
34  import net.sf.eos.entity.SimpleLongestMatchDictionaryBasedEntityRecognizer;
35  import net.sf.eos.hadoop.mapred.AbstractKeyGenerator;
36  import net.sf.eos.hadoop.mapred.KeyGenerator;
37  import net.sf.eos.trie.Trie;
38  
39  import org.apache.commons.logging.Log;
40  import org.apache.commons.logging.LogFactory;
41  import org.apache.hadoop.io.Text;
42  
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;
49  
50  @Services(
51      services={
52          @Service(
53              factory=TokenizerSupplier.class,
54              description="Tokenizer for coocurence analyzing."
55          ),
56          @Service(
57              factory=AbstractKeyGenerator.class,
58              implementation=IdMetadataKeyGenerator.class,
59              description="Create ID for map task."
60          ),
61          @Service(
62              factory=AbstractDictionaryBasedEntityRecognizer.class,
63              implementation=SimpleLongestMatchDictionaryBasedEntityRecognizer.class
64          ),
65          @Service(
66              factory=TextBuilder.class,
67              implementation=SpaceBuilder.class
68          )
69      }
70  )
71  public class DictionaryBasedEntityIdKeyGenerator extends Configured
72  /*        implements KeyGenerator<Text> */ {
73  
74      /** For logging. */
75      private static final Log LOG =
76          LogFactory.getLog(DictionaryBasedEntityIdKeyGenerator.class.getName());
77  
78      private Trie<CharSequence, Set<CharSequence>> trie;
79  
80      public Map<Text, EosDocument> createKeysForDocument(final EosDocument doc)
81              throws EosException {
82  
83          final CharSequence text = doc.getText();
84          final DictionaryBasedEntityRecognizer dber =
85              getDictionaryBasedEntityRecognizerForText(text);
86  
87          List<Token> tokens = identifiyToken(dber);
88  
89          final Map<String, List<Token>> mapToTokenList =
90              new HashMap<String, List<Token>>();
91  
92          // Map for entity ID to all tokens with addition meta information.
93          for (final Token token : tokens) {
94              assert token != null;
95              final String type = token.getType();
96              if (ENTITY_TYPE.equals(type)) {
97                  final Map<String, List<String>> meta = token.getMeta();
98                  final List<String> ids = meta.get(ENTITY_ID_KEY);
99                  for (final String id :ids) {
100                     if (! mapToTokenList.containsKey(id)) {
101                         mapToTokenList.put(id, tokens);
102                     }
103                 }
104             }
105         }
106 
107         final Map<Text, EosDocument> mapToDocument =
108             new HashMap<Text, EosDocument>();
109 
110         final Configuration lconf = getConfiguration();
111         if (lconf.get(AbstractKeyGenerator.ABSTRACT_KEY_GENERATOR_IMPL_CONFIG_NAME)
112                 == null) {
113             lconf.set(AbstractKeyGenerator.ABSTRACT_KEY_GENERATOR_IMPL_CONFIG_NAME,
114                       IdMetadataKeyGenerator.class.getName());
115         }
116 
117         final KeyGenerator<Text> generator =
118             (KeyGenerator<Text>) AbstractKeyGenerator.newInstance(lconf);
119 
120         // Create new document for each entity ID. Remove entity ID from
121         // document and replace character sequence of the entity common- or
122         // other name by the entity ID.
123         for (final Entry<String, List<Token>> entry : mapToTokenList.entrySet()) {
124             final String key = entry.getKey();
125             final Text keyAsText = new Text(key);
126             if (! mapToDocument.containsKey(keyAsText)) {
127                 final TextBuilder builder = TextBuilder.newInstance(lconf);
128 
129                 final List<CharSequence> l = new ArrayList<CharSequence>();
130                 final List<Token> value = entry.getValue();
131 
132                 for (final Token token : value) {
133                     final String type = token.getType();
134 
135                     if (ENTITY_TYPE.equals(type)) {
136 
137                         final Map<String, List<String>> meta = token.getMeta();
138                         final List<String> ids = meta.get(ENTITY_ID_KEY);
139 
140                         final List<CharSequence> idList = new ArrayList<CharSequence>();
141                         for (final String id :ids) {
142                             if (! key.equals(id)) {
143                                 idList.add(id);
144                             }
145                         }
146                         final int size = idList.size();
147                         CharSequence[] css = new CharSequence[size];
148                         css = idList.toArray(css);
149 
150                         final CharSequence in = builder.buildText(css);
151 
152                         l.add(in);
153                     } else {
154                         final CharSequence in = token.getTokenText();
155                         l.add(in);
156                     }
157                 }
158 
159                 final int size = l.size();
160                 CharSequence[] css = new CharSequence[size];
161                 css = l.toArray(css);
162                 final CharSequence newText = builder.buildText(css);
163 
164                 final EosDocument newDoc = new EosDocument();
165                 newDoc.setText(newText);
166                 final CharSequence title = doc.getTitle();
167                 newDoc.setTitle(title);
168 
169                 final Map<String, List<String>> oldMeta = doc.getMeta();
170                 final Map<String, List<String>> newMap =
171                     new HashMap<String, List<String>>(oldMeta);
172                 final List<String> newIdList = new ArrayList<String>();
173                 newIdList.add(key);
174                 newMap.put(EosDocument.ID_META_KEY, newIdList);
175                 newDoc.setMeta(newMap);
176 
177                 final Map<Text, EosDocument> toStore =
178                     generator.createKeysForDocument(newDoc);
179                 for (final Entry<Text, EosDocument> toStoreEntry
180                         : toStore.entrySet()) {
181                     final Text keyAsLocalText = toStoreEntry.getKey();
182                     final EosDocument toStoreDoc = toStoreEntry.getValue();
183                     mapToDocument.put(keyAsLocalText, toStoreDoc);
184                 }
185             }
186         }
187 
188         return mapToDocument;
189     }
190 
191     final List<Token> identifiyToken(final DictionaryBasedEntityRecognizer dber)
192             throws TokenizerException {
193 
194         final List<Token> tokens = new ArrayList<Token>();
195 
196         Token t = null;
197         while ((t = dber.next()) != null) {
198             tokens.add(t);
199         }
200 
201         return tokens;
202     }
203 
204     /**
205      * Creates a new <code>DictionaryBasedEntityRecognizer</code> for the
206      * given text. Uses the factory method of
207      * {@link AbstractDictionaryBasedEntityRecognizer#newInstance(net.sf.eos.analyzer.Tokenizer, Configuration)}
208      * to create a new instance. Use {@link #getTokenizer()} for the
209      * <em>source</em>.
210      * @param text the text to tokenize
211      * @return a new instance
212      */
213     protected DictionaryBasedEntityRecognizer
214             getDictionaryBasedEntityRecognizerForText(final CharSequence text) {
215         try {
216             LOG.debug("Initialize DictionaryBasedEntityRecognizer");
217 
218             final Configuration lconf = getConfiguration();
219             final ResettableTokenizer tokenizer = getTokenizer();
220             tokenizer.reset(text);
221 
222             final DictionaryBasedEntityRecognizer regconizer =
223                 AbstractDictionaryBasedEntityRecognizer.newInstance(tokenizer,
224                                                                     lconf);
225             final Trie<CharSequence, Set<CharSequence>> ltrie = getTrie();
226             regconizer.setEntityMap(ltrie);
227             final TextBuilder textBuilder = TextBuilder.newInstance(lconf);
228             regconizer.setTextBuilder(textBuilder);
229 
230             return regconizer;
231         } catch (final EosException e) {
232             throw new RuntimeException(e);
233         }
234     }
235 
236     /**
237      * Returns a {@code Tokenizer} as <em>source</em> for the
238      * recognizer.
239      * @return the <em>source</em> for the recognizer
240      * @throws TokenizerException if an error occurs
241      */
242     protected ResettableTokenizer getTokenizer() throws TokenizerException {
243 
244         try {
245             final Configuration conf = getConfiguration();
246 
247             final TokenizerSupplier tokenBuilder =
248                 TokenizerSupplier.newInstance(conf);
249             final ResettableTokenizer tokenizer =
250                 tokenBuilder.get();
251 
252             return tokenizer;
253         } catch (final Exception e) {
254             throw new TokenizerException(e);
255         }
256     }
257 
258     public Trie<CharSequence, Set<CharSequence>> getTrie() {
259         return this.trie;
260     }
261 
262     public void setTrie(final Trie<CharSequence, Set<CharSequence>> trie) {
263         this.trie = trie;
264     }
265 }