View Javadoc

1   /* Copyright (c) 2008 Sascha Kohlmann
2    *
3    * This program is free software: you can redistribute it and/or modify
4    * it under the terms of the GNU Affero General Public License as published by
5    * the Free Software Foundation, either version 3 of the License, or
6    * (at your option) any later version.
7    *
8    * This program is distributed in the hope that it will be useful,
9    * but WITHOUT ANY WARRANTY; without even the implied warranty of
10   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11   * GNU Affero General Public License for more details.
12   *
13   * You should have received a copy of the GNU Affero General Public License
14   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
15   */
16  package net.sf.eos.hadoop.mapred.cooccurrence;
17  
18  import net.sf.eos.hadoop.DistributedCacheStrategy;
19  import net.sf.eos.hadoop.FullyDistributedCacheStrategy;
20  import net.sf.eos.hadoop.mapred.AbstractEosDriver;
21  import net.sf.eos.trie.TrieLoader;
22  
23  import org.apache.commons.cli.CommandLine;
24  import org.apache.commons.cli.GnuParser;
25  import org.apache.commons.cli.Option;
26  import org.apache.commons.cli.Options;
27  import org.apache.commons.cli.Parser;
28  import org.apache.commons.logging.Log;
29  import org.apache.commons.logging.LogFactory;
30  import org.apache.hadoop.conf.Configuration;
31  import org.apache.hadoop.filecache.DistributedCache;
32  import org.apache.hadoop.fs.Path;
33  import org.apache.hadoop.io.Text;
34  import org.apache.hadoop.mapred.JobConf;
35  import org.apache.hadoop.util.GenericOptionsParser;
36  import org.apache.hadoop.util.ToolRunner;
37  
38  
39  /**
40   * The driver supports the base arguments. To run the driver set the path
41   * to the {@linkplain #TRIE_LONG_CMD_ARG trie}.
42   * @author Sascha Kohlmann
43   * @see DictionaryBasedEntityRecognizerReducer
44   * @see DictionaryBasedEntityRecognizerMapper
45   */
46  public class DictionaryBasedEntityRecognizerMapReduceDriver
47          extends AbstractEosDriver {
48  
49      /** Short commandline parameter name for the <em>path</em> to
50       * the trie data. The parameter is required.
51       * @see TrieLoader
52       * @see DistributedCacheStrategy */ 
53      @SuppressWarnings("nls")
54      public static final String TRIE_SHORT_CMD_ARG = "t";
55  
56      /** Long commandline parameter name for the <em>path</em> to
57       * the trie data. The parameter is required. 
58       * @see TrieLoader
59       * @see DistributedCacheStrategy */ 
60      @SuppressWarnings("nls")
61      public static final String TRIE_LONG_CMD_ARG = "trie";
62  
63      /** For logging. */
64      private static final Log LOG =
65          LogFactory.getLog(DictionaryBasedEntityRecognizerMapReduceDriver.class.getName());
66  
67      /**
68       * The parameter "<tt>-t</tt>" or "<tt>--trie</tt>" must be set use the
69       * trie data.
70       * @param args the command line arguments
71       * @see #TRIE_LONG_CMD_ARG
72       * @see AbstractEosDriver#DESTINATION_LONG_CMD_ARG
73       * @see AbstractEosDriver#SOURCE_LONG_CMD_ARG
74       * @see GenericOptionsParser
75       */
76      public static void main(final String[] args) throws Exception {
77          final int res = ToolRunner.run(
78                  new Configuration(),
79                  new DictionaryBasedEntityRecognizerMapReduceDriver(),
80                  args);
81          System.exit(res);
82      }
83  
84      /** Starts the job. */
85      @Override
86      public int run(final String[] args) throws Exception {
87          super.run(args);
88          final JobConf conf = getJobConf();
89  
90          final Parser parser = new GnuParser();
91          final Options options = createOptions();
92          final CommandLine cmdLine = parser.parse(options, args);
93  
94          final String triePath = cmdLine.getOptionValue(TRIE_LONG_CMD_ARG);
95          if (triePath == null || triePath.length() == 0) {
96              LOG.fatal("No Trie data path given - exiting");
97              return 1;
98          }
99          LOG.info("Trie path: " + triePath);
100         DistributedCache.addCacheFile(new Path(triePath).toUri(), conf);
101 
102         if (conf.get(DistributedCacheStrategy.STRATEGY_IMPL_CONFIG_NAME) == null) {
103             conf.set(DistributedCacheStrategy.STRATEGY_IMPL_CONFIG_NAME,
104                     FullyDistributedCacheStrategy.class.getName());
105             if (LOG.isDebugEnabled()) {
106                 LOG.debug("No CacheStrategy given. Use '"
107                           + FullyDistributedCacheStrategy.class.getName()
108                           + "'");
109             }
110         }
111 
112         conf.setJobName("\u03b5\u00b7\u03bf\u00b7s\u00b7\u00b7\u00b7 Entity");
113 
114         conf.setOutputKeyClass(Text.class);
115         conf.setOutputValueClass(Text.class);
116         conf.setMapOutputKeyClass(Text.class);
117         conf.setMapOutputValueClass(Text.class);
118 
119         conf.setMapperClass(DictionaryBasedEntityRecognizerMapper.class);
120         conf.setReducerClass(DictionaryBasedEntityRecognizerReducer.class);
121 
122         return doJob(conf);
123     }
124 
125     @Override
126     protected Options createOptions() {
127         final Options options = super.createOptions();
128         final Option option =
129             new Option(TRIE_SHORT_CMD_ARG,
130                        TRIE_LONG_CMD_ARG,
131                        true,
132                        "Path to trie data");
133         option.setRequired(true);
134 
135         return options.addOption(option);
136     }
137 }