1 /* Copyright (c) 2008 Sascha Kohlmann
2 *
3 * This program is free software: you can redistribute it and/or modify
4 * it under the terms of the GNU Affero General Public License as published by
5 * the Free Software Foundation, either version 3 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU Affero General Public License for more details.
12 *
13 * You should have received a copy of the GNU Affero General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16 package net.sf.eos.search;
17
18 import java.util.ArrayList;
19 import java.util.List;
20 import java.util.StringTokenizer;
21
22 import net.sf.eos.Function;
23
/**
 * Simple splitter for user search queries. User search queries must be very
 * simple to understand (for most users, not only for power users).
 *
 * <p>A search query is a sequence of terms and phrases. A term is a literal
 * without any whitespace character; whitespace characters delimit the terms
 * of a search query. A phrase is a sequence of terms enclosed by two
 * ASCII {@literal 0x22} (") characters. The parser returns a list of terms
 * and phrases in the order of their position in the query.</p>
 *
 * <p>The parser removes all ASCII {@literal 0x22} characters. Line feed,
 * carriage return and form feed characters are deleted from the query before
 * it is split (the text on both sides is joined, not separated). Inside a
 * phrase, runs of space and tab characters are collapsed to one
 * ASCII {@literal 0x20} character and leading and trailing whitespace is
 * dropped; a phrase that is empty afterwards is not reported. If the number
 * of ASCII {@literal 0x22} characters is odd, the text following the last one
 * is treated as a phrase that extends to the end of the query.</p>
 *
 * @author Sascha Kohlmann
 */
public class UserQueryParser implements Function<String, List<String> > {

    /** The ASCII {@literal 0x22} character that opens and closes a phrase. */
    private static final String QUOTE = "\"";

    /** Line feed, carriage return and form feed: deleted before splitting. */
    private static final String LINE_BREAKS = "\n\r\f";

    /**
     * Characters separating terms outside of a phrase: space, tab, the
     * separators U+001C to U+001F and the no-break spaces U+00A0, U+2007
     * and U+202F.
     */
    private static final String TERM_DELIMITERS =
        " \t\u001c\u001d\u001e\u001f\u00a0\u2007\u202f";

    /**
     * Splits the user query into terms and phrases.
     *
     * <p>The query is tokenized at every quote character, and each quote
     * character toggles between "outside a phrase" and "inside a phrase".
     * Quote characters are reported by the tokenizer individually, so
     * directly adjacent quotes (an empty phrase, or one phrase immediately
     * following another) keep the state in step with the query.</p>
     *
     * @param query the users search query, may be {@code null}
     * @return a new, modifiable list with the terms and phrases in query
     *         order; empty if {@code query} is {@code null} or contains
     *         neither terms nor non-empty phrases
     */
    final List<String> parseUserQuery(final String query) {

        if (query == null) {
            return new ArrayList<String>(0);
        }

        final List<String> retval = new ArrayList<String>();
        boolean inQuote = false;
        // returnDelims == true: every single quote character arrives as its
        // own token instead of being merged with neighbouring quotes.
        final StringTokenizer st =
            new StringTokenizer(removeNewLines(query), QUOTE, true);
        while (st.hasMoreTokens()) {
            final String part = st.nextToken();
            if (QUOTE.equals(part)) {
                inQuote = ! inQuote;
            } else if (inQuote) {
                final String phrase = adjustWhiteSpace(part);
                // A phrase consisting only of whitespace carries no
                // search information.
                if (phrase.length() != 0) {
                    retval.add(phrase);
                }
            } else {
                retval.addAll(splitSimpleTerms(part));
            }
        }

        return retval;
    }

    /**
     * Collapses whitespace inside a phrase.
     *
     * <p>The text is split at the default {@code StringTokenizer}
     * delimiters (space, tab, line feed, carriage return, form feed) and the
     * pieces are joined again with exactly one space. Note that this set
     * differs from the delimiters used by {@link #splitSimpleTerms(String)}:
     * no-break spaces are kept as they are.</p>
     *
     * @param s the text to normalize, must not be {@code null}
     * @return the normalized text without leading or trailing whitespace
     */
    @SuppressWarnings("nls")
    static final String adjustWhiteSpace(final String s) {
        final StringBuilder sb = new StringBuilder();
        for (final StringTokenizer st = new StringTokenizer(s);
                st.hasMoreTokens(); ) {
            sb.append(st.nextToken());
            sb.append(" ");
        }
        // Removes the separator appended after the last piece.
        return sb.toString().trim();
    }

    /**
     * Deletes all line feed, carriage return and form feed characters. The
     * text before and after such a character is joined without a separator.
     *
     * @param s the text to clean, must not be {@code null}
     * @return the text without the removed characters
     */
    static final String removeNewLines(final String s) {
        assert s != null;
        final StringBuilder sb = new StringBuilder();
        for (final StringTokenizer st = new StringTokenizer(s, LINE_BREAKS);
                st.hasMoreTokens(); ) {
            sb.append(st.nextToken());
        }
        return sb.toString();
    }

    /**
     * Splits unquoted text into its terms. Runs of delimiter characters
     * count as one separator, so no empty terms are produced.
     *
     * @param terms the text to split, may be {@code null}
     * @return a new, modifiable list with the terms in order of appearance;
     *         empty if {@code terms} is {@code null} or contains only
     *         delimiter characters
     */
    static final List<String> splitSimpleTerms(final String terms) {
        final List<String> retval = new ArrayList<String>();
        if (terms == null) {
            return retval;
        }
        for (final StringTokenizer st =
                    new StringTokenizer(terms, TERM_DELIMITERS);
                st.hasMoreTokens(); ) {
            retval.add(st.nextToken());
        }
        return retval;
    }

    /**
     * Splits the user query into terms and phrases. Delegates to
     * {@link #parseUserQuery(String)}.
     *
     * @param query the users search query, may be {@code null}
     * @return a list with the terms and phrases in query order, never
     *         {@code null}
     */
    public List<String> apply(final String query) {
        return parseUserQuery(query);
    }
}