1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16 package net.sf.eos.hadoop.mapred.cooccurrence;
17
18 import static net.sf.eos.entity.DictionaryBasedEntityRecognizer.ENTITY_ID_KEY;
19 import static net.sf.eos.entity.EntityRecognizer.ENTITY_TYPE;
20 import net.sf.eos.EosException;
21 import net.sf.eos.analyzer.ResettableTokenizer;
22 import net.sf.eos.analyzer.TextBuilder;
23 import net.sf.eos.analyzer.Token;
24 import net.sf.eos.analyzer.TokenizerSupplier;
25 import net.sf.eos.analyzer.TokenizerException;
26 import net.sf.eos.analyzer.TextBuilder.SpaceBuilder;
27 import net.sf.eos.config.Configuration;
28 import net.sf.eos.config.Configured;
29 import net.sf.eos.config.Service;
30 import net.sf.eos.config.Services;
31 import net.sf.eos.document.EosDocument;
32 import net.sf.eos.entity.AbstractDictionaryBasedEntityRecognizer;
33 import net.sf.eos.entity.DictionaryBasedEntityRecognizer;
34 import net.sf.eos.entity.SimpleLongestMatchDictionaryBasedEntityRecognizer;
35 import net.sf.eos.hadoop.mapred.AbstractKeyGenerator;
36 import net.sf.eos.hadoop.mapred.KeyGenerator;
37 import net.sf.eos.trie.Trie;
38
39 import org.apache.commons.logging.Log;
40 import org.apache.commons.logging.LogFactory;
41 import org.apache.hadoop.io.Text;
42
43 import java.util.ArrayList;
44 import java.util.HashMap;
45 import java.util.List;
46 import java.util.Map;
47 import java.util.Set;
48 import java.util.Map.Entry;
49
50 @Services(
51 services={
52 @Service(
53 factory=TokenizerSupplier.class,
54 description="Tokenizer for coocurence analyzing."
55 ),
56 @Service(
57 factory=AbstractKeyGenerator.class,
58 implementation=IdMetadataKeyGenerator.class,
59 description="Create ID for map task."
60 ),
61 @Service(
62 factory=AbstractDictionaryBasedEntityRecognizer.class,
63 implementation=SimpleLongestMatchDictionaryBasedEntityRecognizer.class
64 ),
65 @Service(
66 factory=TextBuilder.class,
67 implementation=SpaceBuilder.class
68 )
69 }
70 )
71 public class DictionaryBasedEntityIdKeyGenerator extends Configured
72
73
74
75 private static final Log LOG =
76 LogFactory.getLog(DictionaryBasedEntityIdKeyGenerator.class.getName());
77
78 private Trie<CharSequence, Set<CharSequence>> trie;
79
80 public Map<Text, EosDocument> createKeysForDocument(final EosDocument doc)
81 throws EosException {
82
83 final CharSequence text = doc.getText();
84 final DictionaryBasedEntityRecognizer dber =
85 getDictionaryBasedEntityRecognizerForText(text);
86
87 List<Token> tokens = identifiyToken(dber);
88
89 final Map<String, List<Token>> mapToTokenList =
90 new HashMap<String, List<Token>>();
91
92
93 for (final Token token : tokens) {
94 assert token != null;
95 final String type = token.getType();
96 if (ENTITY_TYPE.equals(type)) {
97 final Map<String, List<String>> meta = token.getMeta();
98 final List<String> ids = meta.get(ENTITY_ID_KEY);
99 for (final String id :ids) {
100 if (! mapToTokenList.containsKey(id)) {
101 mapToTokenList.put(id, tokens);
102 }
103 }
104 }
105 }
106
107 final Map<Text, EosDocument> mapToDocument =
108 new HashMap<Text, EosDocument>();
109
110 final Configuration lconf = getConfiguration();
111 if (lconf.get(AbstractKeyGenerator.ABSTRACT_KEY_GENERATOR_IMPL_CONFIG_NAME)
112 == null) {
113 lconf.set(AbstractKeyGenerator.ABSTRACT_KEY_GENERATOR_IMPL_CONFIG_NAME,
114 IdMetadataKeyGenerator.class.getName());
115 }
116
117 final KeyGenerator<Text> generator =
118 (KeyGenerator<Text>) AbstractKeyGenerator.newInstance(lconf);
119
120
121
122
123 for (final Entry<String, List<Token>> entry : mapToTokenList.entrySet()) {
124 final String key = entry.getKey();
125 final Text keyAsText = new Text(key);
126 if (! mapToDocument.containsKey(keyAsText)) {
127 final TextBuilder builder = TextBuilder.newInstance(lconf);
128
129 final List<CharSequence> l = new ArrayList<CharSequence>();
130 final List<Token> value = entry.getValue();
131
132 for (final Token token : value) {
133 final String type = token.getType();
134
135 if (ENTITY_TYPE.equals(type)) {
136
137 final Map<String, List<String>> meta = token.getMeta();
138 final List<String> ids = meta.get(ENTITY_ID_KEY);
139
140 final List<CharSequence> idList = new ArrayList<CharSequence>();
141 for (final String id :ids) {
142 if (! key.equals(id)) {
143 idList.add(id);
144 }
145 }
146 final int size = idList.size();
147 CharSequence[] css = new CharSequence[size];
148 css = idList.toArray(css);
149
150 final CharSequence in = builder.buildText(css);
151
152 l.add(in);
153 } else {
154 final CharSequence in = token.getTokenText();
155 l.add(in);
156 }
157 }
158
159 final int size = l.size();
160 CharSequence[] css = new CharSequence[size];
161 css = l.toArray(css);
162 final CharSequence newText = builder.buildText(css);
163
164 final EosDocument newDoc = new EosDocument();
165 newDoc.setText(newText);
166 final CharSequence title = doc.getTitle();
167 newDoc.setTitle(title);
168
169 final Map<String, List<String>> oldMeta = doc.getMeta();
170 final Map<String, List<String>> newMap =
171 new HashMap<String, List<String>>(oldMeta);
172 final List<String> newIdList = new ArrayList<String>();
173 newIdList.add(key);
174 newMap.put(EosDocument.ID_META_KEY, newIdList);
175 newDoc.setMeta(newMap);
176
177 final Map<Text, EosDocument> toStore =
178 generator.createKeysForDocument(newDoc);
179 for (final Entry<Text, EosDocument> toStoreEntry
180 : toStore.entrySet()) {
181 final Text keyAsLocalText = toStoreEntry.getKey();
182 final EosDocument toStoreDoc = toStoreEntry.getValue();
183 mapToDocument.put(keyAsLocalText, toStoreDoc);
184 }
185 }
186 }
187
188 return mapToDocument;
189 }
190
191 final List<Token> identifiyToken(final DictionaryBasedEntityRecognizer dber)
192 throws TokenizerException {
193
194 final List<Token> tokens = new ArrayList<Token>();
195
196 Token t = null;
197 while ((t = dber.next()) != null) {
198 tokens.add(t);
199 }
200
201 return tokens;
202 }
203
204
205
206
207
208
209
210
211
212
213 protected DictionaryBasedEntityRecognizer
214 getDictionaryBasedEntityRecognizerForText(final CharSequence text) {
215 try {
216 LOG.debug("Initialize DictionaryBasedEntityRecognizer");
217
218 final Configuration lconf = getConfiguration();
219 final ResettableTokenizer tokenizer = getTokenizer();
220 tokenizer.reset(text);
221
222 final DictionaryBasedEntityRecognizer regconizer =
223 AbstractDictionaryBasedEntityRecognizer.newInstance(tokenizer,
224 lconf);
225 final Trie<CharSequence, Set<CharSequence>> ltrie = getTrie();
226 regconizer.setEntityMap(ltrie);
227 final TextBuilder textBuilder = TextBuilder.newInstance(lconf);
228 regconizer.setTextBuilder(textBuilder);
229
230 return regconizer;
231 } catch (final EosException e) {
232 throw new RuntimeException(e);
233 }
234 }
235
236
237
238
239
240
241
242 protected ResettableTokenizer getTokenizer() throws TokenizerException {
243
244 try {
245 final Configuration conf = getConfiguration();
246
247 final TokenizerSupplier tokenBuilder =
248 TokenizerSupplier.newInstance(conf);
249 final ResettableTokenizer tokenizer =
250 tokenBuilder.get();
251
252 return tokenizer;
253 } catch (final Exception e) {
254 throw new TokenizerException(e);
255 }
256 }
257
258 public Trie<CharSequence, Set<CharSequence>> getTrie() {
259 return this.trie;
260 }
261
262 public void setTrie(final Trie<CharSequence, Set<CharSequence>> trie) {
263 this.trie = trie;
264 }
265 }