Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
100.00% |
216 / 216 |
|
100.00% |
24 / 24 |
CRAP | |
100.00% |
1 / 1 |
LuceneSyntaxHelper | |
100.00% |
216 / 216 |
|
100.00% |
24 / 24 |
84 | |
100.00% |
1 / 1 |
__construct | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
containsBooleans | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
containsRanges | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
containsAdvancedLuceneSyntax | |
100.00% |
16 / 16 |
|
100.00% |
1 / 1 |
11 | |||
normalizeSearchString | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
capitalizeCaseInsensitiveBooleans | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
capitalizeBooleans | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
4 | |||
capitalizeRanges | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
extractSearchTerms | |
100.00% |
27 / 27 |
|
100.00% |
1 / 1 |
10 | |||
hasCaseSensitiveBooleans | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
hasCaseSensitiveRanges | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
normalizeFancyQuotes | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
1 | |||
normalizeWildcards | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
3 | |||
normalizeParens | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
normalizeBoosts | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
normalizeBracesAndBrackets | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
3 | |||
normalizeUnquotedText | |
100.00% |
22 / 22 |
|
100.00% |
1 / 1 |
2 | |||
normalizeColons | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
prepareForLuceneSyntax | |
100.00% |
21 / 21 |
|
100.00% |
1 / 1 |
6 | |||
getBoolsToCap | |
100.00% |
16 / 16 |
|
100.00% |
1 / 1 |
7 | |||
capitalizeRangesCallback | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
5 | |||
countNonQuoted | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
4 | |||
removeNonQuoted | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
4 | |||
processQueryString | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
6 |
1 | <?php |
2 | |
3 | /** |
4 | * Lucene query syntax helper class. |
5 | * |
6 | * PHP version 8 |
7 | * |
8 | * Copyright (C) Villanova University 2010. |
9 | * Copyright (C) The National Library of Finland 2016. |
10 | * |
11 | * This program is free software; you can redistribute it and/or modify |
12 | * it under the terms of the GNU General Public License version 2, |
13 | * as published by the Free Software Foundation. |
14 | * |
15 | * This program is distributed in the hope that it will be useful, |
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
18 | * GNU General Public License for more details. |
19 | * |
20 | * You should have received a copy of the GNU General Public License |
21 | * along with this program; if not, write to the Free Software |
22 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
23 | * |
24 | * @category VuFind |
25 | * @package Search |
26 | * @author Andrew S. Nagy <vufind-tech@lists.sourceforge.net> |
27 | * @author David Maus <maus@hab.de> |
28 | * @author Demian Katz <demian.katz@villanova.edu> |
29 | * @author Ere Maijala <ere.maijala@helsinki.fi> |
30 | * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License |
31 | * @link https://vufind.org |
32 | */ |
33 | |
34 | namespace VuFindSearch\Backend\Solr; |
35 | |
36 | use function count; |
37 | use function in_array; |
38 | |
/**
 * Lucene query syntax helper class.
 *
 * Detects advanced Lucene/Solr syntax (Booleans, ranges, field specifiers,
 * wildcards, boosts...) in a search string and normalizes user input so that
 * malformed syntax is removed or escaped before the string is used in a query.
 *
 * @category VuFind
 * @package  Search
 * @author   Andrew S. Nagy <vufind-tech@lists.sourceforge.net>
 * @author   David Maus <maus@hab.de>
 * @author   Demian Katz <demian.katz@villanova.edu>
 * @author   Ere Maijala <ere.maijala@helsinki.fi>
 * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
 * @link     https://vufind.org
 */
class LuceneSyntaxHelper
{
    /**
     * Regular expression matching a SOLR range ([x TO y] or {x TO y}).
     *
     * @var string
     */
    public const SOLR_RANGE_RE = '/(\[.+\s+TO\s+.+\])|(\{.+\s+TO\s+.+\})/';

    /**
     * Lookahead used to tell quoted text from unquoted text. It succeeds only
     * when the remainder of the string contains an even number of double quotes,
     * i.e. at positions that are NOT inside a quoted phrase; appending it to a
     * pattern therefore restricts that pattern to unquoted text.
     *
     * @var string
     */
    protected static $insideQuotes = '(?=(?:[^\"]*+\"[^\"]*+\")*+[^\"]*+$)';

    /**
     * Are ranges case-sensitive? When false, the TO keyword is matched
     * case-insensitively and ranges are rewritten by capitalizeRanges().
     *
     * @var bool
     */
    protected $caseSensitiveRanges;

    /**
     * Force boolean operators to uppercase? Set to true to make all Booleans
     * case-sensitive; false to make no Booleans case-sensitive; comma-separated
     * string to make only certain operators case sensitive.
     *
     * @var bool|string
     */
    protected $caseSensitiveBooleans;

    /**
     * All boolean operators supported by the class.
     *
     * @var array
     */
    protected $allBools = ['AND', 'OR', 'NOT'];

    /**
     * Constructor.
     *
     * @param bool|string $csBools  Case sensitive Booleans setting
     * @param bool        $csRanges Case sensitive ranges setting
     */
    public function __construct($csBools = true, $csRanges = true)
    {
        $this->caseSensitiveBooleans = $csBools;
        $this->caseSensitiveRanges = $csRanges;
    }

    /// Public API

    /**
     * Return true if the search string contains boolean operators.
     *
     * Operators configured as case-insensitive are capitalized first, so they
     * are detected regardless of the case the user typed.
     *
     * @param string $searchString Search string
     *
     * @return bool
     */
    public function containsBooleans($searchString)
    {
        // AND/OR/NOT surrounded by whitespace, or NOT leading the query and
        // followed by whitespace -- in both cases outside of quoted phrases.
        $boolReg = '/((\s+(AND|OR|NOT)\s+)|^NOT\s+)' . self::$insideQuotes . '/';
        $checkString = $this->capitalizeCaseInsensitiveBooleans($searchString);
        return (bool)preg_match($boolReg, $checkString);
    }

    /**
     * Return true if the search string contains ranges.
     *
     * @param string $searchString Search string
     *
     * @return bool
     */
    public function containsRanges($searchString)
    {
        // Match "TO" case-insensitively unless ranges are case-sensitive:
        $modifier = $this->caseSensitiveRanges ? '' : 'i';
        return (bool)preg_match(self::SOLR_RANGE_RE . $modifier, $searchString);
    }

    /**
     * Return true if the search string contains advanced Lucene syntax.
     *
     * @param string $searchString Search string
     *
     * @return bool
     */
    public function containsAdvancedLuceneSyntax($searchString)
    {
        // The "all records" query is always advanced syntax:
        if ('*:*' === $searchString) {
            return true;
        }

        // None of the remaining checks apply to text inside quoted strings, so
        // replace every quoted phrase with a dummy keyword. Using a keyword
        // (rather than removing the phrase) keeps the field specifier check
        // below working for input such as field:"phrase".
        $searchString = preg_replace('/"[^"]*"/', 'quoted', $searchString);

        // Field specifiers: an unescaped colon with non-whitespace on both sides.
        if (preg_match("/[^\s\\\]\:[^\s]/", $searchString)) {
            return true;
        }

        // Unescaped parentheses (both an opening and a closing one):
        $stripped = str_replace(['\(', '\)'], '', $searchString);
        if (str_contains($stripped, '(') && str_contains($stripped, ')')) {
            return true;
        }

        // Ranges, booleans, wildcards, fuzzy matches and boosts:
        return $this->containsRanges($searchString)
            || $this->containsBooleans($searchString)
            || str_contains($searchString, '*')
            || str_contains($searchString, '?')
            || str_contains($searchString, '~')
            || (bool)preg_match('/[\^][0-9]+/', $searchString);
    }

    /**
     * Return normalized input string.
     *
     * @param string $searchString Input search string
     *
     * @return string
     */
    public function normalizeSearchString($searchString)
    {
        $normalized = $this->prepareForLuceneSyntax($searchString);

        // Force boolean operators to uppercase if we are in a
        // case-insensitive mode:
        $normalized = $this->capitalizeCaseInsensitiveBooleans($normalized);

        // Adjust range operators if we are in a case-insensitive mode:
        return $this->caseSensitiveRanges
            ? $normalized
            : $this->capitalizeRanges($normalized);
    }

    /**
     * Wrapper around capitalizeBooleans that accounts for the caseSensitiveBooleans
     * property of this class.
     *
     * @param string $string Search string
     *
     * @return string
     */
    public function capitalizeCaseInsensitiveBooleans($string)
    {
        return $this->capitalizeBooleans($string, $this->getBoolsToCap());
    }

    /**
     * Capitalize boolean operators found outside of quoted phrases.
     *
     * The result is trimmed, except when $bools is empty, in which case the
     * input is returned untouched.
     *
     * @param string $string Search string
     * @param array  $bools  Which booleans to capitalize (default = all)
     *
     * @return string
     */
    public function capitalizeBooleans($string, $bools = ['AND', 'OR', 'NOT'])
    {
        // Short-circuit if no Booleans were selected:
        if (empty($bools)) {
            return $string;
        }

        // The lookahead prevents switching case of Boolean reserved words inside
        // quotes, since that can cause problems in case-sensitive fields when
        // the reserved words are actually used as search terms.
        $lookahead = self::$insideQuotes;

        // Standard conversions: operator surrounded by whitespace.
        $regs = $replace = [];
        foreach ($bools as $bool) {
            $regs[] = "/\s+{$bool}\s+{$lookahead}/i";
            $replace[] = ' ' . $bool . ' ';
        }

        // Special extra case for NOT directly after an opening parenthesis:
        if (in_array('NOT', $bools, true)) {
            $regs[] = "/\(NOT\s+{$lookahead}/i";
            $replace[] = '(NOT ';
        }

        return trim(preg_replace($regs, $replace, $string));
    }

    /**
     * Capitalize range operator.
     *
     * @param string $string Search string
     *
     * @return string
     */
    public function capitalizeRanges($string)
    {
        // The lookahead prevents switching case of ranges inside quotes, since
        // that can cause problems in case-sensitive fields when the reserved
        // words are actually used as search terms.
        $lookahead = self::$insideQuotes;
        $regs = [
            "/(\[)([^\]]+)\s+TO\s+([^\]]+)(\]){$lookahead}/i",
            "/(\{)([^}]+)\s+TO\s+([^}]+)(\}){$lookahead}/i",
        ];
        // Wrap the protected callback so overriding subclasses are still honored:
        $rewrite = fn ($match) => $this->capitalizeRangesCallback($match);
        return trim(preg_replace_callback($regs, $rewrite, $string));
    }

    /**
     * Extract search terms from a query string for spell checking.
     *
     * This will only handle the most often used simple cases.
     *
     * @param string $query Query string
     *
     * @return string Space-separated search terms
     */
    public function extractSearchTerms($query)
    {
        // Discard, in this order: local parameters ({!...}), fuzziness and
        // proximity indicators (~...) and boosts (^...).
        $query = preg_replace(
            ['/\{!.+?\}/', '/\~[^\s]*/', '/\^[^\s]*/'],
            '',
            $query
        );

        $terms = [];
        $current = '';
        $skippedParens = 0;
        $collector = function (
            string $ch,
            bool $quoted,
            bool $esc
        ) use (
            &$terms,
            &$current,
            &$skippedParens
        ) {
            if (!$quoted) {
                // Discard closing parenthesis for previously discarded opening
                // ones to keep balance
                if (!$esc && ')' === $ch && $skippedParens > 0) {
                    --$skippedParens;
                    return;
                }
                // Flush to result array on word break
                if (' ' === $ch && '' !== $current) {
                    $terms[] = $current;
                    $current = '';
                    return;
                }
                // An unescaped ':' means the preceding string was a field
                // name -- discard it, remembering any opening parentheses that
                // are thrown away with it
                if (!$esc && ':' === $ch) {
                    $skippedParens += $this->countNonQuoted('(', $current);
                    $current = '';
                    return;
                }
            }
            $current .= $ch;
        };
        $this->processQueryString($collector, $query);

        // Flush final collected string
        if ('' !== $current) {
            $terms[] = $current;
        }

        // Discard any preceding pluses or minuses
        $terms = array_map(static fn ($term) => ltrim($term, '+-'), $terms);
        return implode(' ', $terms);
    }

    /**
     * Are there any case-sensitive Boolean operators configured?
     *
     * @return bool
     */
    public function hasCaseSensitiveBooleans()
    {
        // If there are some Boolean operators that are not in the list
        // of operators that need to be auto-capitalized, then some of
        // the operators will exhibit case-sensitive behavior.
        return count($this->allBools) > count($this->getBoolsToCap());
    }

    /**
     * Are case-sensitive ranges configured?
     *
     * @return bool
     */
    public function hasCaseSensitiveRanges()
    {
        // The constructor stores the setting as given (it may be a non-boolean
        // value such as '1' or 0), so cast to honor the documented return type.
        return (bool)$this->caseSensitiveRanges;
    }

    /// Internal API

    /**
     * Normalize fancy quotes in a query.
     *
     * Replaces typographic quotation marks (given as UTF-8 byte sequences) with
     * plain ASCII double or single quotes.
     *
     * @param string $input String to normalize
     *
     * @return string
     */
    protected function normalizeFancyQuotes($input)
    {
        $quotes = [
            "\xC2\xAB"     => '"', // « (U+00AB) in UTF-8
            "\xC2\xBB"     => '"', // » (U+00BB) in UTF-8
            "\xE2\x80\x98" => "'", // ‘ (U+2018) in UTF-8
            "\xE2\x80\x99" => "'", // ’ (U+2019) in UTF-8
            "\xE2\x80\x9A" => "'", // ‚ (U+201A) in UTF-8
            "\xE2\x80\x9B" => "'", // ‛ (U+201B) in UTF-8
            "\xE2\x80\x9C" => '"', // “ (U+201C) in UTF-8
            "\xE2\x80\x9D" => '"', // ” (U+201D) in UTF-8
            "\xE2\x80\x9E" => '"', // „ (U+201E) in UTF-8
            "\xE2\x80\x9F" => '"', // ‟ (U+201F) in UTF-8
            "\xE2\x80\xB9" => "'", // ‹ (U+2039) in UTF-8
            "\xE2\x80\xBA" => "'", // › (U+203A) in UTF-8
        ];
        return strtr($input, $quotes);
    }

    /**
     * Normalize wildcards in a query.
     *
     * @param string $input String to normalize
     *
     * @return string
     */
    protected function normalizeWildcards($input)
    {
        // Ensure wildcards are not at beginning of input. Every leading '*' or
        // '?' is removed; stripping just one would leave e.g. '**foo' still
        // starting with a wildcard.
        return ltrim($input, '*?');
    }

    /**
     * Normalize parentheses in a query.
     *
     * Removes all non-quoted parentheses if they're not balanced.
     *
     * @param string $input String to normalize
     *
     * @return string
     */
    protected function normalizeParens($input)
    {
        $opening = $this->countNonQuoted('(', $input);
        $closing = $this->countNonQuoted(')', $input);
        if ($opening === $closing) {
            return $input;
        }
        return $this->removeNonQuoted(['(', ')'], $input);
    }

    /**
     * Normalize boosts in a query.
     *
     * Removes every caret from the input unless each caret is part of a
     * "something^digit" sequence.
     *
     * @param string $input String to normalize
     *
     * @return string
     */
    protected function normalizeBoosts($input)
    {
        // Ensure ^ is used properly
        // Better: Remove all ^ if not followed by digits
        // -- dmaus, 2012-11-11
        $carets = substr_count($input, '^');
        $validBoosts = preg_match_all('/[^^]+\^[0-9]/', $input);
        if ($carets && $carets !== $validBoosts) {
            return str_replace('^', '', $input);
        }
        return $input;
    }

    /**
     * Normalize braces/brackets in a query.
     *
     * IMPORTANT: This should only be called on a string that has already been
     * cleaned up by normalizeBoosts().
     *
     * @param string $input String to normalize
     *
     * @return string
     */
    protected function normalizeBracesAndBrackets($input)
    {
        // Remove unwanted brackets/braces that are not part of range queries.
        // This is a bit of a shell game -- first we replace valid brackets and
        // braces with tokens that cannot possibly already be in the query (due
        // to the work of normalizeBoosts()). Next, we escape all remaining
        // invalid brackets/braces, and transform our tokens back into valid ones.
        // The order of the entries below is critically important!
        $flags = $this->caseSensitiveRanges ? '' : 'i';
        $steps = [
            // STEP 1 -- rename valid brackets/braces
            '/\[([^\[\]\s]+\s+TO\s+[^\[\]\s]+)\]/' . $flags
                => '^^lbrack^^$1^^rbrack^^',
            '/\{([^\{\}\s]+\s+TO\s+[^\{\}\s]+)\}/' . $flags
                => '^^lbrace^^$1^^rbrace^^',
            // STEP 2 -- escape remaining unescaped brackets/braces
            // (use a negative lookbehind (?<!\\) to skip escaped characters)
            '/(?<!\\\\)([\[\]\{\}])/' => '\\\\$1',
            // STEP 3 -- restore valid brackets/braces
            '/\^\^lbrack\^\^/' => '[',
            '/\^\^rbrack\^\^/' => ']',
            '/\^\^lbrace\^\^/' => '{',
            '/\^\^rbrace\^\^/' => '}',
        ];
        return preg_replace(array_keys($steps), array_values($steps), $input);
    }

    /**
     * Normalize various problems found in unquoted text within the query.
     *
     * @param string $input String to normalize
     *
     * @return string
     */
    protected function normalizeUnquotedText($input)
    {
        // Freestanding hyphens, pluses and slashes can cause problems; the
        // lookahead limits most of the fixes below to text outside quotes.
        $lookahead = self::$insideQuotes;

        // remove freestanding hyphens and pluses
        $input = preg_replace(
            '/(\s+[+-]+$|\s+[+-]+\s+|^[+-]+\s+)' . $lookahead . '/',
            ' ',
            $input
        );
        // wrap quotes on standalone slashes
        $input = preg_replace(
            '/(\s+[\/]+\s+)' . $lookahead . '/',
            ' "/" ',
            $input
        );
        // remove trailing and leading slashes
        $input = preg_replace(
            '/(\s+[\/]+$|^[\/]+\s+)' . $lookahead . '/',
            ' ',
            $input
        );

        // A proximity of 1 is illegal and meaningless -- remove it:
        $input = preg_replace('/~1(\.0*)?$/', '', $input);
        $input = preg_replace('/~1(\.0*)?\s+' . $lookahead . '/', ' ', $input);

        // Remove empty parentheses outside of quotation marks -- these will
        // cause a fatal Solr error and should be ignored. Repeat until nothing
        // is removed, since deleting "()" from "(())" exposes a new empty pair.
        $emptyParens = '/\(\s*\)' . $lookahead . '/';
        do {
            $input = preg_replace($emptyParens, '', $input, -1, $removed);
        } while ($removed > 0);

        return $input;
    }

    /**
     * Normalize field specifications within the query.
     *
     * @param string $input String to normalize
     *
     * @return string
     */
    protected function normalizeColons($input)
    {
        // Collapse runs of colons into a single one (anywhere in the string):
        $collapsed = preg_replace('/:+/', ':', $input);
        // Outside quotes, replace colons adjacent to whitespace with a space:
        $cleaned = preg_replace(
            '/(\:[:\s]+|[:\s]+:)' . self::$insideQuotes . '/',
            ' ',
            $collapsed
        );
        // Finally drop leading/trailing colons:
        return trim($cleaned, ':');
    }

    /**
     * Prepare input to be used in a SOLR query.
     *
     * Handles certain cases where the input might conflict with Lucene
     * syntax rules.
     *
     * @param string $input Input string
     *
     * @return string
     *
     * @todo Check if it is safe to assume $input to be an UTF-8 encoded string.
     */
    protected function prepareForLuceneSyntax($input)
    {
        $input = $this->normalizeFancyQuotes($input);
        $trimmed = trim($input);

        // If the user has entered a lone BOOLEAN operator, convert it to lowercase
        // so it is treated as a word (otherwise it will trigger a fatal error):
        $loneOperators = ['OR' => 'or', 'AND' => 'and', 'NOT' => 'not'];
        if (isset($loneOperators[$trimmed])) {
            return $loneOperators[$trimmed];
        }

        // If the string consists only of control characters and/or BOOLEANs with no
        // other input, wipe it out entirely to prevent weird errors:
        $operators = ['AND', 'OR', 'NOT', '+', '-', '"', '&&', '||'];
        if ('' === trim(str_replace($operators, '', $input))) {
            return '';
        }

        // Translate "all records" search into a blank string
        if ('*:*' === $trimmed) {
            return '';
        }

        // Standard normalization actions (order is significant):
        $input = $this->normalizeWildcards($input);
        $input = $this->normalizeParens($input);
        $input = $this->normalizeBoosts($input);
        $input = $this->normalizeBracesAndBrackets($input);
        $input = $this->normalizeUnquotedText($input);
        $input = $this->normalizeColons($input);

        // Remove surrounding slashes and whitespace -- these serve no purpose
        // and can cause problems.
        return trim($input, '/ ');
    }

    /**
     * Convert the caseSensitiveBooleans property into an array for use with the
     * capitalizeBooleans function.
     *
     * @return array Operators that are NOT case-sensitive (i.e. need capitalizing)
     */
    protected function getBoolsToCap()
    {
        $setting = $this->caseSensitiveBooleans;

        // Nothing is case-sensitive: every operator must be capitalized.
        if (in_array($setting, [false, 0, '0'], true)) {
            return $this->allBools;
        }
        // Everything is case-sensitive: nothing may be capitalized.
        if (in_array($setting, [true, 1, '1'], true)) {
            return [];
        }

        // Otherwise the setting is a comma-separated list of case-sensitive
        // operators; clean it up and return all other supported operators.
        $caseSensitive = array_map(
            static fn ($item) => strtoupper(trim($item)),
            explode(',', $setting)
        );
        return array_values(array_diff($this->allBools, $caseSensitive));
    }

    /**
     * Callback helper function.
     *
     * Rebuilds a matched range with an uppercase TO operator. If either end
     * point contains letters, the range is expanded to match both the lowercase
     * and the uppercase form of the end points (timestamps are only uppercased).
     *
     * @param array $match Matches as of preg_replace_callback()
     *
     * @return string
     *
     * @see \VuFindSearch\Backend\Solr\LuceneSyntaxHelper::capitalizeRanges()
     *
     * @todo Check possible problem with umlauts/non-ASCII word characters
     */
    protected function capitalizeRangesCallback($match)
    {
        // Groups: 1 = opening symbol, 2 = start of range, 3 = end of range,
        // 4 = closing symbol.
        [, $open, $start, $end, $close] = $match;

        // Is this a case-sensitive range? NOTE: the loose comparison is kept
        // as-is; numeric strings (e.g. "1e3") compare numerically and are thus
        // treated as case-insensitive.
        $caseMatters = strtoupper($start) != strtolower($start)
            || strtoupper($end) != strtolower($end);
        if (!$caseMatters) {
            // Simpler case -- case insensitive (probably numeric) range:
            return $open . trim($start) . ' TO ' . trim($end) . $close;
        }

        // Build an uppercase version of the range:
        $upper = $open . trim(strtoupper($start)) . ' TO '
            . trim(strtoupper($end)) . $close;

        // Special case: don't create illegal timestamps!
        $timestamp = '/[0-9]{4}-[0-9]{2}-[0-9]{2}t[0-9]{2}:[0-9]{2}:[0-9]{2}z/i';
        if (preg_match($timestamp, $start) || preg_match($timestamp, $end)) {
            return $upper;
        }

        // Build a lowercase version of the range:
        $lower = $open . trim(strtolower($start)) . ' TO '
            . trim(strtolower($end)) . $close;

        // Accept results matching either range:
        return '(' . $lower . ' OR ' . $upper . ')';
    }

    /**
     * Count occurrences of a character in non-quoted parts of the string
     *
     * @param string $needle   Character to look for (non-escaped)
     * @param string $haystack String to process
     *
     * @return int
     */
    protected function countNonQuoted(string $needle, string $haystack): int
    {
        $total = 0;
        $counter = function (string $ch, bool $quoted, bool $esc) use ($needle, &$total) {
            if ($quoted || $esc) {
                return;
            }
            if ($ch === $needle) {
                ++$total;
            }
        };
        $this->processQueryString($counter, $haystack);
        return $total;
    }

    /**
     * Remove occurrences of given characters in non-quoted parts of the string
     *
     * @param array  $needles  Characters to remove (non-escaped)
     * @param string $haystack String to process
     *
     * @return string
     */
    protected function removeNonQuoted(array $needles, string $haystack): string
    {
        $kept = '';
        $filter = function (string $ch, bool $quoted, bool $esc) use ($needles, &$kept) {
            $remove = !$quoted && !$esc && in_array($ch, $needles, true);
            if (!$remove) {
                $kept .= $ch;
            }
        };
        $this->processQueryString($filter, $haystack);
        return $kept;
    }

    /**
     * Process a Lucene query string with a callback
     *
     * The string is processed byte by byte. The callback receives the current
     * byte, whether it lies within double quotes (the quote characters
     * themselves are reported as: opening = quoted, closing = not quoted), and
     * an escape flag. The escape flag is true for a backslash that starts an
     * escape sequence and for the byte following it; a backslash that is itself
     * escaped is reported with the flag set to false.
     *
     * @param callable $callback Callback that gets called for each character
     * @param string   $str      String to process
     *
     * @return void
     */
    protected function processQueryString(callable $callback, string $str): void
    {
        $inQuotes = false;
        $escaped = false;
        // Index-based loop: unlike str_split(), this never invokes the callback
        // for an empty string, regardless of PHP version.
        $length = strlen($str);
        for ($i = 0; $i < $length; ++$i) {
            $ch = $str[$i];
            $isBackslash = '\\' === $ch;
            if ($isBackslash) {
                $escaped = !$escaped;
            }
            // Only an unescaped quote toggles the quoted state:
            if (!$escaped && '"' === $ch) {
                $inQuotes = !$inQuotes;
            }
            $callback($ch, $inQuotes, $escaped);
            // The escape only applies to the byte right after the backslash:
            if (!$isBackslash) {
                $escaped = false;
            }
        }
    }
}