1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16 package net.sf.eos.hadoop.mapred.cooccurrence;
17
18
19 import org.apache.commons.logging.Log;
20 import org.apache.commons.logging.LogFactory;
21 import org.apache.hadoop.io.Text;
22
23 import java.util.ArrayList;
24 import java.util.Collections;
25 import java.util.HashMap;
26 import java.util.List;
27 import java.util.Map;
28 import java.util.StringTokenizer;
29
30 import net.sf.eos.EosException;
31 import net.sf.eos.config.Configuration;
32 import net.sf.eos.config.ConfigurationKey;
33 import net.sf.eos.document.EosDocument;
34 import net.sf.eos.hadoop.mapred.AbstractKeyGenerator;
35
36
37
38
39
40
41
42 public class IdMetadataKeyGenerator extends AbstractKeyGenerator<Text> {
43
44
45 private static final Log LOG =
46 LogFactory.getLog(IdMetadataKeyGenerator.class.getName());
47
48 private final static String META_DELIMITER = "+";
49
50
51
52 @SuppressWarnings("nls")
53 @ConfigurationKey(description="Metakeys, comma delimited, for separation.")
54 public static final String META_FIELD_FOR_SEPARATION_CONFIG_NAME =
55 "net.sf.eos.hadoop.mapred.cooccurrence.IdMetadataKeyGenerator.metaKeys";
56
57
58
59
60
61 public Map<Text, EosDocument> createKeysForDocument(final EosDocument doc)
62 throws EosException {
63 final Map<String, List<String>> meta = doc.getMeta();
64 if (meta == null) {
65 throw new EosException("document contains no metadata value.");
66 }
67 final List<String> ids = meta.get(EosDocument.ID_META_KEY);
68 if (ids == null || ids.isEmpty()) {
69 throw new EosException("document contains no ID metadata value.");
70 }
71
72 final List<String> metaKeys = getMetaKeys();
73 final Map<Text, EosDocument> retval = new HashMap<Text, EosDocument>();
74 for (final String id : ids) {
75 final StringBuilder sb = new StringBuilder(id);
76 sb.append(META_DELIMITER);
77
78
79 for (final String metaKey : metaKeys) {
80 final List<String> metaValues = meta.get(metaKey);
81 if (metaValues != null) {
82 final List<String> sorted = new ArrayList<String>(metaValues);
83 Collections.sort(sorted);
84 for (final String value : sorted) {
85 sb.append(value);
86 sb.append(META_DELIMITER);
87 }
88 } else {
89 LOG.debug("document contains no metavalue for key '"
90 + metaKey + "'");
91 }
92 }
93
94
95 final String key = sb.toString();
96 final Text newTextKey = new Text(key);
97 retval.put(newTextKey, doc);
98 }
99
100 return retval;
101 }
102
103 final List<String> getMetaKeys() {
104 final Configuration lconf = getConfiguration();
105 final List<String> keys = new ArrayList<String>();
106 final String value = lconf.get(META_FIELD_FOR_SEPARATION_CONFIG_NAME,
107 EosDocument.YEAR_META_KEY);
108 for (final StringTokenizer st = new StringTokenizer(value, ", ");
109 st.hasMoreTokens(); ) {
110 final String key = st.nextToken();
111
112 if (! EosDocument.ID_META_KEY.equals(key)) {
113 keys.add(key);
114 } else {
115 LOG.debug("value of key " + META_FIELD_FOR_SEPARATION_CONFIG_NAME
116 + " contains " + EosDocument.ID_META_KEY
117 + " - ignore it.");
118 }
119 }
120 if (keys.size() == 0) {
121 keys.add(EosDocument.ID_META_KEY);
122 }
123 return keys;
124 }
125 }