View Javadoc

1   /* Copyright (c) 2008 Sascha Kohlmann
2    *
3    * This program is free software: you can redistribute it and/or modify
4    * it under the terms of the GNU Affero General Public License as published by
5    * the Free Software Foundation, either version 3 of the License, or
6    * (at your option) any later version.
7    *
8    * This program is distributed in the hope that it will be useful,
9    * but WITHOUT ANY WARRANTY; without even the implied warranty of
10   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11   * GNU Affero General Public License for more details.
12   *
13   * You should have received a copy of the GNU Affero General Public License
14   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
15   */
16  package net.sf.eos.trie;
17  
18  
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import net.sf.eos.analyzer.ResettableTokenizer;
import net.sf.eos.analyzer.TextBuilder;
import net.sf.eos.analyzer.Token;
import net.sf.eos.analyzer.TokenizerException;
import net.sf.eos.trie.TrieSource.TrieEntry;
import net.sf.eos.trie.TrieSource.TrieEntryEvent;
import net.sf.eos.trie.TrieSource.TrieEntryListener;

import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.lang.management.ManagementFactory;
import java.lang.management.MemoryMXBean;
import java.lang.management.MemoryPoolMXBean;
import java.lang.management.MemoryUsage;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.SAXException;
43  
44  /**
45   * <p>The builder creates a trie from a simple XML file. The file must be
46   * like the following <acronym title='Document Type Definition'>DTD</acronym>:
47   * </p>
48   * <pre>
49   * &lt;!ELEMENT trie  (entry*)     >
50   * &lt;!ELEMENT entry (key, value) >
51   * &lt;!ELEMENT key   (#PCDATA)    >
52   * &lt;!ELEMENT value (#PCDATA)    ></pre>
53   * <p>The builder doesn't validate validates the XML structure.</p>
54   * <p>If a key is twice in the XML structure the builder adds it to the
55   * value <code>Collection</code>.
56   * @author Sascha Kohlmann
57   */
58  @SuppressWarnings("nls")
59  public class XmlTrieLoader 
60          extends AbstractTrieLoader<CharSequence, Set<CharSequence>> {
61  
62      static final Log LOG = LogFactory.getLog(XmlTrieLoader.class.getName());
63  
64      /** A Tokenizer to tokenize the value of the trie before storing. */
65      private ResettableTokenizer tokenizer;
66      private TextBuilder textBuilder = TextBuilder.SPACE_BUILDER;
67  
68      /**
69       * Creates a <code>Trie</code> from the <code>InputStream</code>.
70       */
71      @Override
72      public void loadTrie(final InputStream trieData,
73                           final Trie<CharSequence, Set<CharSequence>> trie)
74              throws Exception {
75  
76          final SAXParserFactory factory = SAXParserFactory.newInstance();
77          final SAXParser parser = factory.newSAXParser();
78  
79          final TrieHandler source = new TrieHandler();
80          source.addTrieEntryListener(new TrieEntryListener() {
81              public void onEntry(final TrieEntryEvent event) {
82                  final TrieEntry entry = (TrieEntry) event.getSource();
83                  handleNewTrieEntryForCharSequenceTrie(entry, trie);
84              }
85          });
86  
87          final long start = System.currentTimeMillis();
88          LOG.trace("start time since epoche: " + start + "ms");
89          final MemoryMXBean memBean = ManagementFactory.getMemoryMXBean();
90          final StringBuilder lsb =
91              new StringBuilder("Heap Memory statistics:\n");
92          lsb.append("  heap: ");
93          final MemoryUsage heap = memBean.getHeapMemoryUsage();
94          lsb.append(heap);
95          lsb.append("\n  nonheap: ");
96          final MemoryUsage nonheap = memBean.getNonHeapMemoryUsage();
97          lsb.append(nonheap);
98          LOG.debug(lsb.toString());
99          logStatistic();
100 
101         parser.parse(trieData, source);
102 
103         logStatistic();
104         LOG.debug("Build time: " 
105                   + (System.currentTimeMillis() - start) + "ms"
106                   + " for " + trie.size() + " entries");
107     }
108 
109     final void handleNewTrieEntryForCharSequenceTrie(
110                 final TrieEntry entry,
111                 final Trie<CharSequence, Set<CharSequence>> trie) {
112         final String key = entry.getKey();
113 //        final byte[] keyArray = toUtf8ByteArray(key);
114         final String value = entry.getValue();
115         final CharSequence rebuildedValue = rebuildValue(value);
116 //        final byte[] valueArray = toUtf8ByteArray(rebuildedValue);
117 
118 //        Set<byte[]> values = trie.get(keyArray);
119 //        if (values == null) {
120 //            values = new HashSet<byte[]>();
121 //            trie.put(keyArray, values);
122 //        }
123 //        values.add(valueArray);
124         Set<CharSequence> values = trie.get(key);
125         if (values == null) {
126             values = new HashSet<CharSequence>();
127             trie.put(key, values);
128         }
129         values.add(rebuildedValue);
130     }
131 
132     final void logStatistic() {
133         final List<MemoryPoolMXBean> pool =
134             ManagementFactory.getMemoryPoolMXBeans();
135         final StringBuilder lsb =
136             new StringBuilder("Memory statistics:\n");
137         for (final MemoryPoolMXBean bean : pool) {
138             lsb.append("  name: ");
139             lsb.append(bean.getName());
140             final MemoryUsage peakUsage = bean.getPeakUsage();
141             lsb.append(" | peak: ");
142             lsb.append(peakUsage);
143             final MemoryUsage usage = bean.getUsage();
144             lsb.append(" | usage: ");
145             lsb.append(usage);
146             lsb.append("\n");
147         }
148 
149         LOG.debug(lsb.toString());
150     }
151 
152     /**
153      * @return the tokenizer
154      */
155     public ResettableTokenizer getTokenizer() {
156         return this.tokenizer;
157     }
158 
159     /**
160      * @param tokenizer the tokenizer to set
161      */
162     public void setTokenizer(
163             @SuppressWarnings("hiding") final ResettableTokenizer tokenizer) {
164         this.tokenizer = tokenizer;
165     }
166 
167     /**
168      * Sets a builder. The implementation has default builder of instance
169      * {@link TextBuilder#SPACE_BUILDER} setted at construction time.
170      * @param builder a builder to set or {@code null}.
171      */
172     public void setTextBuilder(final TextBuilder builder) {
173         this.textBuilder = builder;
174     }
175 
176     /**
177      * Returns a setted builder.
178      * @return a setted builder or {@code null}.
179      */ 
180     public TextBuilder getTextBuilder() {
181         return this.textBuilder;
182     }
183 
184     /**
185      * Rebuilds a sequence of chars if the loader has a setted 
186      * {@link #setTokenizer(ResettableTokenizer)} and a setted
187      * {@link #setTextBuilder(TextBuilder)}.
188      * @param value the value to rebuild.
189      * @return a rebuilded character sequence.
190      */
191     protected CharSequence rebuildValue(final CharSequence value) {
192         if (value == null) {
193             return value;
194         }
195 
196         final ResettableTokenizer t = getTokenizer();
197         if (t == null) {
198             return value;
199         }
200         final TextBuilder b = getTextBuilder();
201         if (b == null) {
202             return value;
203         }
204 
205         try {
206             t.reset(value);
207             final List<Token> tokens = new ArrayList<Token>();
208             Token to = null;
209             while ((to = t.next()) != null) {
210                 tokens.add(to);
211             }
212             final CharSequence newText = b.buildText(tokens);
213 
214             return newText;
215 
216         } catch (final TokenizerException e) {
217             throw new RuntimeException(e);
218         }
219     }
220 
221     final byte[] toUtf8ByteArray(final CharSequence s) {
222         try {
223             return s.toString().getBytes("UTF-8");
224         } catch (final UnsupportedEncodingException e) {
225             e.printStackTrace();
226             throw new InternalError("UFT-8 not supported");
227         }
228     }
229 }