1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16 package net.sf.eos.trie;
17
18
19 import org.apache.commons.logging.Log;
20 import org.apache.commons.logging.LogFactory;
21
22 import net.sf.eos.analyzer.ResettableTokenizer;
23 import net.sf.eos.analyzer.TextBuilder;
24 import net.sf.eos.analyzer.Token;
25 import net.sf.eos.analyzer.TokenizerException;
26 import net.sf.eos.trie.TrieSource.TrieEntry;
27 import net.sf.eos.trie.TrieSource.TrieEntryEvent;
28 import net.sf.eos.trie.TrieSource.TrieEntryListener;
29
30 import java.io.InputStream;
31 import java.io.UnsupportedEncodingException;
32 import java.lang.management.ManagementFactory;
33 import java.lang.management.MemoryMXBean;
34 import java.lang.management.MemoryPoolMXBean;
35 import java.lang.management.MemoryUsage;
36 import java.util.ArrayList;
37 import java.util.HashSet;
38 import java.util.List;
39 import java.util.Set;
40
41 import javax.xml.parsers.SAXParser;
42 import javax.xml.parsers.SAXParserFactory;
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58 @SuppressWarnings("nls")
59 public class XmlTrieLoader
60 extends AbstractTrieLoader<CharSequence, Set<CharSequence>> {
61
62 static final Log LOG = LogFactory.getLog(XmlTrieLoader.class.getName());
63
64
65 private ResettableTokenizer tokenizer;
66 private TextBuilder textBuilder = TextBuilder.SPACE_BUILDER;
67
68
69
70
71 @Override
72 public void loadTrie(final InputStream trieData,
73 final Trie<CharSequence, Set<CharSequence>> trie)
74 throws Exception {
75
76 final SAXParserFactory factory = SAXParserFactory.newInstance();
77 final SAXParser parser = factory.newSAXParser();
78
79 final TrieHandler source = new TrieHandler();
80 source.addTrieEntryListener(new TrieEntryListener() {
81 public void onEntry(final TrieEntryEvent event) {
82 final TrieEntry entry = (TrieEntry) event.getSource();
83 handleNewTrieEntryForCharSequenceTrie(entry, trie);
84 }
85 });
86
87 final long start = System.currentTimeMillis();
88 LOG.trace("start time since epoche: " + start + "ms");
89 final MemoryMXBean memBean = ManagementFactory.getMemoryMXBean();
90 final StringBuilder lsb =
91 new StringBuilder("Heap Memory statistics:\n");
92 lsb.append(" heap: ");
93 final MemoryUsage heap = memBean.getHeapMemoryUsage();
94 lsb.append(heap);
95 lsb.append("\n nonheap: ");
96 final MemoryUsage nonheap = memBean.getNonHeapMemoryUsage();
97 lsb.append(nonheap);
98 LOG.debug(lsb.toString());
99 logStatistic();
100
101 parser.parse(trieData, source);
102
103 logStatistic();
104 LOG.debug("Build time: "
105 + (System.currentTimeMillis() - start) + "ms"
106 + " for " + trie.size() + " entries");
107 }
108
109 final void handleNewTrieEntryForCharSequenceTrie(
110 final TrieEntry entry,
111 final Trie<CharSequence, Set<CharSequence>> trie) {
112 final String key = entry.getKey();
113
114 final String value = entry.getValue();
115 final CharSequence rebuildedValue = rebuildValue(value);
116
117
118
119
120
121
122
123
124 Set<CharSequence> values = trie.get(key);
125 if (values == null) {
126 values = new HashSet<CharSequence>();
127 trie.put(key, values);
128 }
129 values.add(rebuildedValue);
130 }
131
132 final void logStatistic() {
133 final List<MemoryPoolMXBean> pool =
134 ManagementFactory.getMemoryPoolMXBeans();
135 final StringBuilder lsb =
136 new StringBuilder("Memory statistics:\n");
137 for (final MemoryPoolMXBean bean : pool) {
138 lsb.append(" name: ");
139 lsb.append(bean.getName());
140 final MemoryUsage peakUsage = bean.getPeakUsage();
141 lsb.append(" | peak: ");
142 lsb.append(peakUsage);
143 final MemoryUsage usage = bean.getUsage();
144 lsb.append(" | usage: ");
145 lsb.append(usage);
146 lsb.append("\n");
147 }
148
149 LOG.debug(lsb.toString());
150 }
151
152
153
154
155 public ResettableTokenizer getTokenizer() {
156 return this.tokenizer;
157 }
158
159
160
161
162 public void setTokenizer(
163 @SuppressWarnings("hiding") final ResettableTokenizer tokenizer) {
164 this.tokenizer = tokenizer;
165 }
166
167
168
169
170
171
172 public void setTextBuilder(final TextBuilder builder) {
173 this.textBuilder = builder;
174 }
175
176
177
178
179
180 public TextBuilder getTextBuilder() {
181 return this.textBuilder;
182 }
183
184
185
186
187
188
189
190
191 protected CharSequence rebuildValue(final CharSequence value) {
192 if (value == null) {
193 return value;
194 }
195
196 final ResettableTokenizer t = getTokenizer();
197 if (t == null) {
198 return value;
199 }
200 final TextBuilder b = getTextBuilder();
201 if (b == null) {
202 return value;
203 }
204
205 try {
206 t.reset(value);
207 final List<Token> tokens = new ArrayList<Token>();
208 Token to = null;
209 while ((to = t.next()) != null) {
210 tokens.add(to);
211 }
212 final CharSequence newText = b.buildText(tokens);
213
214 return newText;
215
216 } catch (final TokenizerException e) {
217 throw new RuntimeException(e);
218 }
219 }
220
221 final byte[] toUtf8ByteArray(final CharSequence s) {
222 try {
223 return s.toString().getBytes("UTF-8");
224 } catch (final UnsupportedEncodingException e) {
225 e.printStackTrace();
226 throw new InternalError("UFT-8 not supported");
227 }
228 }
229 }