View Javadoc

1   /* Copyright (c) 2008 Sascha Kohlmann
2    *
3    * This program is free software: you can redistribute it and/or modify
4    * it under the terms of the GNU Affero General Public License as published by
5    * the Free Software Foundation, either version 3 of the License, or
6    * (at your option) any later version.
7    *
8    * This program is distributed in the hope that it will be useful,
9    * but WITHOUT ANY WARRANTY; without even the implied warranty of
10   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11   * GNU Affero General Public License for more details.
12   *
13   * You should have received a copy of the GNU Affero General Public License
14   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
15   */
16  package net.sf.eos.hadoop.mapred.cooccurrence;
17  
18  
19  import org.apache.commons.logging.Log;
20  import org.apache.commons.logging.LogFactory;
21  import org.apache.hadoop.io.Text;
22  
23  import java.util.ArrayList;
24  import java.util.Collections;
25  import java.util.HashMap;
26  import java.util.List;
27  import java.util.Map;
28  import java.util.StringTokenizer;
29  
30  import net.sf.eos.EosException;
31  import net.sf.eos.config.Configuration;
32  import net.sf.eos.config.ConfigurationKey;
33  import net.sf.eos.document.EosDocument;
34  import net.sf.eos.hadoop.mapred.AbstractKeyGenerator;
35  
36  /**
37   * Creates a new map based on the document metadata value of
38   * {@link EosDocument#ID_META_KEY} and optionally other metadata information.
39   * Will sort optional metadata values.
40   * @author Sascha Kohlmann
41   */
42  public class IdMetadataKeyGenerator extends AbstractKeyGenerator<Text> {
43  
44      /** For logging. */
45      private static final Log LOG =
46          LogFactory.getLog(IdMetadataKeyGenerator.class.getName());
47  
48      private final static String META_DELIMITER = "+";
49  
50      /** The meta field for separation.
51       * Default value is {@link EosDocument#YEAR_META_KEY}. */
52      @SuppressWarnings("nls")
53      @ConfigurationKey(description="Metakeys, comma delimited, for separation.")
54      public static final String META_FIELD_FOR_SEPARATION_CONFIG_NAME =
55          "net.sf.eos.hadoop.mapred.cooccurrence.IdMetadataKeyGenerator.metaKeys";
56  
57      /**
58       * @throws EosException if <em>doc</em> doesn't contains a metadata value
59       *                      for {@link EosDocument#ID_META_KEY}.
60       */
61      public Map<Text, EosDocument> createKeysForDocument(final EosDocument doc)
62              throws EosException {
63          final Map<String, List<String>> meta = doc.getMeta();
64          if (meta == null) {
65              throw new EosException("document contains no metadata value.");
66          }
67          final List<String> ids = meta.get(EosDocument.ID_META_KEY);
68          if (ids == null || ids.isEmpty()) {
69              throw new EosException("document contains no ID metadata value.");
70          }
71  
72          final List<String> metaKeys = getMetaKeys();
73          final Map<Text, EosDocument> retval = new HashMap<Text, EosDocument>();
74          for (final String id : ids) {
75              final StringBuilder sb = new StringBuilder(id);
76              sb.append(META_DELIMITER);
77  
78              // Append metadata values in the defined sequence.
79              for (final String metaKey : metaKeys) {
80                  final List<String> metaValues = meta.get(metaKey);
81                  if (metaValues != null) {
82                      final List<String> sorted = new ArrayList<String>(metaValues);
83                      Collections.sort(sorted);
84                      for (final String value : sorted) {
85                          sb.append(value);
86                          sb.append(META_DELIMITER);
87                      }
88                  } else {
89                      LOG.debug("document contains no metavalue for key '"
90                                + metaKey + "'");
91                  }
92              }
93  
94              // Create new entry
95              final String key = sb.toString();
96              final Text newTextKey = new Text(key);
97              retval.put(newTextKey, doc);
98          }
99  
100         return retval;
101     }
102 
103     final List<String> getMetaKeys() {
104         final Configuration lconf = getConfiguration();
105         final List<String> keys = new ArrayList<String>();
106         final String value = lconf.get(META_FIELD_FOR_SEPARATION_CONFIG_NAME,
107                                        EosDocument.YEAR_META_KEY);
108         for (final StringTokenizer st = new StringTokenizer(value, ", ");
109                 st.hasMoreTokens(); ) {
110             final String key = st.nextToken();
111             // Filter out ID
112             if (! EosDocument.ID_META_KEY.equals(key)) {
113                 keys.add(key);
114             } else {
115                 LOG.debug("value of key " + META_FIELD_FOR_SEPARATION_CONFIG_NAME
116                           + " contains " + EosDocument.ID_META_KEY
117                           + " - ignore it.");
118             }
119         }
120         if (keys.size() == 0) {
121             keys.add(EosDocument.ID_META_KEY);
122         }
123         return keys;
124     }
125 }