View Javadoc

1   /* Copyright (c) 2008 Sascha Kohlmann
2    *
3    * This program is free software: you can redistribute it and/or modify
4    * it under the terms of the GNU Affero General Public License as published by
5    * the Free Software Foundation, either version 3 of the License, or
6    * (at your option) any later version.
7    *
8    * This program is distributed in the hope that it will be useful,
9    * but WITHOUT ANY WARRANTY; without even the implied warranty of
10   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11   * GNU Affero General Public License for more details.
12   *
13   * You should have received a copy of the GNU Affero General Public License
14   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
15   */
16  package net.sf.eos.search;
17  
18  import java.util.ArrayList;
19  import java.util.List;
20  import java.util.StringTokenizer;
21  
22  import net.sf.eos.Function;
23  
/**
 * Simple splitter for user search queries. User search queries must be very
 * simple to understand (for most users, not for power users).
 *
 * <p>A search query is a sequence of terms and phrases. A term is a literal
 * without any whitespace character; whitespace characters delimit the terms
 * of a search query. A phrase is a sequence of terms enclosed by two
 * ASCII&nbsp;{@literal 0x22} (") characters. The parser returns a list of
 * terms and phrases in the order of their position in the query. The
 * ASCII&nbsp;{@literal 0x22} characters are removed, and runs of space and
 * tab characters inside a phrase are normalized to a single
 * ASCII&nbsp;{@literal 0x20} character. Line feed, carriage return and form
 * feed characters are removed, without a replacement, before the query is
 * split. An empty phrase contributes nothing to the result. If the number
 * of ASCII&nbsp;{@literal 0x22} characters is odd, the text following the
 * last ASCII&nbsp;{@literal 0x22} up to the end of the query is treated as
 * a phrase.</p>
 *
 * @author Sascha Kohlmann
 */
public class UserQueryParser implements Function<String, List<String> > {

    /**
     * Splits the user query into terms and phrases.
     *
     * @param query the users search query, may be {@code null}
     * @return list of terms and phrases in query order; an empty list if
     *         {@code query} is {@code null}. Never {@code null}.
     */
    @SuppressWarnings("nls")
    final List<String> parseUserQuery(final String query) {
        final List<String> retval = new ArrayList<String>();
        if (query == null) {
            return retval;
        }

        final String quote = "\"";
        final String adjusted = removeNewLines(query);

        // The quote characters are returned as tokens of their own. Toggling
        // the state on every quote keeps the state correct even for
        // directly adjacent quotes (an empty phrase), which a tokenizer
        // that swallows its delimiters would collapse into a single one.
        boolean inQuote = false;
        for (final StringTokenizer st =
                    new StringTokenizer(adjusted, quote, true);
                st.hasMoreTokens(); ) {
            final String part = st.nextToken();
            if (quote.equals(part)) {
                inQuote = ! inQuote;
            } else if (inQuote) {
                final String phrase = adjustWhiteSpace(part);
                if (phrase.length() != 0) {
                    retval.add(phrase);
                }
            } else {
                retval.addAll(splitSimpleTerms(part));
            }
        }

        return retval;
    }

    /**
     * Collapses whitespace in the given text. The text is split at the
     * default {@link StringTokenizer} delimiters, the parts are joined with
     * a single ASCII&nbsp;{@literal 0x20} character and the result is
     * trimmed.
     *
     * @param s the text to normalize, must not be {@code null}
     * @return the normalized text, may be empty
     */
    @SuppressWarnings("nls")
    static final String adjustWhiteSpace(final String s) {
        final StringBuilder sb = new StringBuilder();
        for (final StringTokenizer st = new StringTokenizer(s);
                st.hasMoreTokens(); ) {
            sb.append(st.nextToken());
            sb.append(" ");
        }
        // trim() also drops the separator appended after the last part.
        return sb.toString().trim();
    }

    /**
     * Removes all line feed, carriage return and form feed characters.
     * The characters are dropped without a replacement, so the text on
     * both sides of a removed character is joined.
     *
     * @param s the text to clean, must not be {@code null}
     * @return the text without the removed characters
     */
    @SuppressWarnings("nls")
    static final String removeNewLines(final String s) {
        assert s != null;
        final StringBuilder sb = new StringBuilder();
        for (final StringTokenizer st = new StringTokenizer(s, "\n\r\f");
                st.hasMoreTokens(); ) {
            sb.append(st.nextToken());
        }
        return sb.toString();
    }

    /**
     * Splits unquoted text into single terms. Delimiters are the space and
     * tab characters, the control characters {@literal U+001C} to
     * {@literal U+001F} and the characters {@literal U+00A0},
     * {@literal U+2007} and {@literal U+202F}.
     *
     * @param terms the text to split, may be {@code null}
     * @return the terms in order of appearance; an empty list if
     *         {@code terms} is {@code null} or contains no term
     */
    @SuppressWarnings("nls")
    static final List<String> splitSimpleTerms(final String terms) {
        final List<String> retval = new ArrayList<String>();
        if (terms == null) {
            return retval;
        }
        for (final StringTokenizer st =
                    new StringTokenizer(terms,
                               " \t\u001c\u001d\u001e\u001f\u00a0\u2007\u202f");
                st.hasMoreTokens(); ) {
            retval.add(st.nextToken());
        }
        return retval;
    }

    /**
     * Splits the user query into terms and phrases. Delegates to
     * {@link #parseUserQuery(String)}.
     *
     * @param query the users search query, may be {@code null}
     * @return list of terms and phrases in query order, never {@code null}
     */
    public List<String> apply(final String query) {
        return parseUserQuery(query);
    }
}