Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
84.75% |
150 / 177 |
|
73.91% |
17 / 23 |
CRAP | |
0.00% |
0 / 1 |
SearchHandler | |
84.75% |
150 / 177 |
|
73.91% |
17 / 23 |
87.39 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
createAdvancedQueryString | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
createSimpleQueryString | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
preprocessQueryString | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
createBoostQueryString | |
63.64% |
14 / 22 |
|
0.00% |
0 / 1 |
12.89 | |||
hasDismax | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getDismaxHandler | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
hasExtendedDismax | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
getAllFields | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
getDismaxFields | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getDismaxParams | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getFilterQuery | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
hasFilterQuery | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
toArray | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
setDefaultMustMatch | |
90.00% |
9 / 10 |
|
0.00% |
0 / 1 |
7.05 | |||
dismaxSubquery | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
2 | |||
mungeValues | |
96.15% |
25 / 26 |
|
0.00% |
0 / 1 |
7 | |||
dismaxMunge | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
customMunge | |
81.48% |
22 / 27 |
|
0.00% |
0 / 1 |
8.41 | |||
createQueryString | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
6 | |||
mungeRules | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
munge | |
52.63% |
10 / 19 |
|
0.00% |
0 / 1 |
9.83 | |||
tokenize | |
83.33% |
15 / 18 |
|
0.00% |
0 / 1 |
4.07 |
1 | <?php |
2 | |
3 | /** |
4 | * VuFind SearchHandler. |
5 | * |
6 | * PHP version 8 |
7 | * |
8 | * Copyright (C) Villanova University 2010. |
9 | * |
10 | * This program is free software; you can redistribute it and/or modify |
11 | * it under the terms of the GNU General Public License version 2, |
12 | * as published by the Free Software Foundation. |
13 | * |
14 | * This program is distributed in the hope that it will be useful, |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | * GNU General Public License for more details. |
18 | * |
19 | * You should have received a copy of the GNU General Public License |
20 | * along with this program; if not, write to the Free Software |
21 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
22 | * |
23 | * @category VuFind |
24 | * @package Search |
25 | * @author Andrew S. Nagy <vufind-tech@lists.sourceforge.net> |
26 | * @author David Maus <maus@hab.de> |
27 | * @author Demian Katz <demian.katz@villanova.edu> |
28 | * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License |
29 | * @link https://vufind.org |
30 | */ |
31 | |
32 | namespace VuFindSearch\Backend\Solr; |
33 | |
34 | use function chr; |
35 | use function in_array; |
36 | use function intval; |
37 | use function sprintf; |
38 | |
39 | /** |
40 | * VuFind SearchHandler. |
41 | * |
42 | * The SearchHandler implements the rule-based translation of a user search |
43 | * query to a SOLR query string. |
44 | * |
45 | * @category VuFind |
46 | * @package Search |
47 | * @author Andrew S. Nagy <vufind-tech@lists.sourceforge.net> |
48 | * @author David Maus <maus@hab.de> |
49 | * @author Demian Katz <demian.katz@villanova.edu> |
50 | * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License |
51 | * @link https://vufind.org |
52 | */ |
class SearchHandler
{
    /**
     * Known configuration keys.
     *
     * @var array
     */
    protected static $configKeys = [
        'CustomMunge', 'DismaxFields', 'DismaxHandler', 'QueryFields',
        'DismaxParams', 'FilterQuery', 'DismaxMunge',
    ];

    /**
     * Known boolean operators.
     *
     * @var array
     */
    protected static $booleanOperators = ['AND', 'OR', 'NOT'];

    /**
     * Search handler specification (one entry per known configuration key).
     *
     * @var array
     */
    protected $specs = [];

    /**
     * Constructor.
     *
     * @param array  $spec                 Search handler specification
     * @param string $defaultDismaxHandler Default dismax handler (if no
     * DismaxHandler set in specs).
     *
     * @return void
     */
    public function __construct(array $spec, $defaultDismaxHandler = 'dismax')
    {
        // Copy only the known keys; anything missing (or null) becomes an
        // empty array so later lookups never hit an undefined index.
        foreach (self::$configKeys as $key) {
            $this->specs[$key] = $spec[$key] ?? [];
        }
        // Set dismax handler to default if not specified:
        if (empty($this->specs['DismaxHandler'])) {
            $this->specs['DismaxHandler'] = $defaultDismaxHandler;
        }
        // Set default mm handler if necessary:
        $this->setDefaultMustMatch();
    }

    /// Public API

    /**
     * Return an advanced query string.
     *
     * An advanced query string is a query string based on a search string w/
     * lucene syntax features.
     *
     * @param string $search Search string
     *
     * @return string
     *
     * @see \VuFind\Service\Solr\LuceneSyntaxHelper::containsAdvancedLuceneSyntax()
     */
    public function createAdvancedQueryString($search)
    {
        return $this->createQueryString($search, true);
    }

    /**
     * Return a simple query string.
     *
     * @param string $search Search string
     *
     * @return string
     *
     * @see self::createAdvancedQueryString()
     */
    public function createSimpleQueryString($search)
    {
        return $this->createQueryString($search, false);
    }

    /**
     * Apply standard pre-processing to the query string.
     *
     * Only Dismax handlers are pre-processed (via the DismaxMunge rules); for
     * any other handler the search string is returned unchanged.
     *
     * @param string $search Search string
     *
     * @return string
     */
    public function preprocessQueryString($search)
    {
        return $this->hasDismax() ? $this->dismaxMunge($search) : $search;
    }

    /**
     * Wrap a search string so that the handler's Dismax boosts (bq / bf
     * parameters) are applied to it as ordinary query clauses.
     *
     * If the handler is not a Dismax handler, or defines no boosts, the search
     * string is returned unchanged.
     *
     * @param string $search Search string
     *
     * @return string
     */
    public function createBoostQueryString($search)
    {
        $boostQuery = [];
        if ($this->hasDismax()) {
            foreach ($this->getDismaxParams() as $param) {
                [$name, $value] = $param;
                if ($name === 'bq') {
                    $boostQuery[] = $value;
                } elseif ($name === 'bf') {
                    // BF parameter may contain multiple whitespace-separated
                    // functions with individual boosts. We need to parse this
                    // into _val_ query components. Split on any run of
                    // whitespace (not just a single space) so that tabs or line
                    // breaks coming from the configuration do not glue two
                    // functions together; empty pieces are discarded.
                    $functions = preg_split(
                        '/\s+/',
                        (string)$value,
                        -1,
                        PREG_SPLIT_NO_EMPTY
                    ) ?: [];
                    foreach ($functions as $boostFunction) {
                        $parts = explode('^', $boostFunction, 2);
                        $boostQuery[] = sprintf(
                            '_val_:"%s"%s',
                            addcslashes($parts[0], '"'),
                            isset($parts[1]) ? "^{$parts[1]}" : ''
                        );
                    }
                }
            }
        }
        if ($boostQuery === []) {
            return $search;
        }
        // "*:* OR ..." keeps the boost clauses optional: they can only add to
        // the score, never restrict the result set.
        return sprintf(
            '(%s) AND (*:* OR %s)',
            $search,
            implode(' OR ', $boostQuery)
        );
    }

    /**
     * Return true if the handler defines Dismax fields.
     *
     * @return bool
     */
    public function hasDismax()
    {
        return !empty($this->specs['DismaxFields']);
    }

    /**
     * Get the name of the Dismax handler to be used with this search.
     *
     * @return string
     */
    public function getDismaxHandler()
    {
        return $this->specs['DismaxHandler'];
    }

    /**
     * Return true if the handler supports Extended Dismax.
     *
     * @return bool
     */
    public function hasExtendedDismax()
    {
        return $this->hasDismax() && 'edismax' === $this->getDismaxHandler();
    }

    /**
     * Get a list of all Solr fields searched by this handler.
     *
     * Covers both the non-Dismax rules (including fields that only appear
     * inside nested rule groups) and the Dismax fields (boosts stripped).
     *
     * @return array Sequentially indexed list of unique field names
     */
    public function getAllFields()
    {
        // Field names used by the non-Dismax rules.
        $queryFields = $this->collectQueryFieldNames($this->mungeRules());

        // If we have Dismax fields, we need to strip off boost values.
        $dismaxFields = array_map(
            static fn ($f) => current(explode('^', $f)),
            $this->getDismaxFields()
        );

        // array_unique() preserves keys, so re-index to return a real list.
        return array_values(
            array_unique(array_merge($queryFields, $dismaxFields))
        );
    }

    /**
     * Return defined dismax fields.
     *
     * @return array
     */
    public function getDismaxFields()
    {
        return $this->specs['DismaxFields'];
    }

    /**
     * Return defined dismax parameters.
     *
     * @return array
     */
    public function getDismaxParams()
    {
        return $this->specs['DismaxParams'];
    }

    /**
     * Return the filter query.
     *
     * @return string|null Filter query, or null if none is configured
     */
    public function getFilterQuery()
    {
        return empty($this->specs['FilterQuery'])
            ? null : $this->specs['FilterQuery'];
    }

    /**
     * Return true if handler defines a filter query.
     *
     * @return bool
     */
    public function hasFilterQuery()
    {
        return (bool)$this->specs['FilterQuery'];
    }

    /**
     * Serialize handler specs as array.
     *
     * @return array
     */
    public function toArray()
    {
        return $this->specs;
    }

    /// Internal API

    /**
     * Support method for getAllFields(): collect the field names referenced by
     * a set of munge rules.
     *
     * The rule layout mirrors what munge() consumes: a non-numeric key is a
     * field name, while a numeric key holds a nested group whose first element
     * is the [join, weight] pair and whose remaining elements are rules again.
     *
     * @param array $rules Munge rules
     *
     * @return array Field names, in order of appearance (may contain duplicates)
     */
    protected function collectQueryFieldNames(array $rules)
    {
        $fields = [];
        foreach ($rules as $field => $clauses) {
            if (!is_numeric($field)) {
                $fields[] = $field;
                continue;
            }
            if (is_array($clauses)) {
                // Drop the [join, weight] pair, then descend into the group.
                array_shift($clauses);
                $fields = array_merge(
                    $fields,
                    $this->collectQueryFieldNames($clauses)
                );
            }
        }
        return $fields;
    }

    /**
     * Support method for constructor: if no mm is provided, set a reasonable
     * default based on the selected Dismax handler.
     *
     * @return void
     */
    protected function setDefaultMustMatch()
    {
        // Initialize parameter array if absent:
        $this->specs['DismaxParams'] ??= [];

        // mm is only meaningful for Dismax handlers:
        if (!$this->hasDismax()) {
            return;
        }

        // If the configuration has an explicit mm value, leave it alone:
        foreach ($this->specs['DismaxParams'] as $current) {
            if ($current[0] === 'mm') {
                return;
            }
        }

        // Our default mm depends on whether we're using dismax or edismax;
        // for dismax, we want 100% matches, because we always want to
        // simulate "AND" behavior by default (any "OR" searches will get
        // rerouted to Lucene queries). For edismax, boolean operators are
        // accounted for, and with an mm of 100%, OR searches will always
        // fail. We can use 0% here, because the default q.op of AND will
        // make AND searches work correctly even without a high mm value.
        $default = $this->hasExtendedDismax() ? '0%' : '100%';
        $this->specs['DismaxParams'][] = ['mm', $default];
    }

    /**
     * Return a Dismax subquery for specified search string.
     *
     * The result has the form _query_:"{!handler qf=\"...\" name='value'}search"
     * with the embedded query escaped via addslashes().
     *
     * @param string $search Search string
     *
     * @return string
     */
    protected function dismaxSubquery($search)
    {
        $dismaxParams = [];
        foreach ($this->specs['DismaxParams'] as $param) {
            $dismaxParams[] = sprintf(
                "%s='%s'",
                $param[0],
                addcslashes($param[1], "'")
            );
        }
        $dismaxQuery = sprintf(
            '{!%s qf="%s" %s}%s',
            $this->getDismaxHandler(),
            implode(' ', $this->specs['DismaxFields']),
            implode(' ', $dismaxParams),
            $search
        );
        return sprintf('_query_:"%s"', addslashes($dismaxQuery));
    }

    /**
     * Return the munge values for specified search string.
     *
     * The result maps each munge name ('onephrase', 'and', 'or', 'identity' and
     * every configured CustomMunge name) to the correspondingly transformed
     * search string.
     *
     * If optional argument $tokenize is true tokenize the search string.
     *
     * @param string $search   Search string
     * @param bool   $tokenize Tokenize the search string?
     *
     * @return array
     */
    protected function mungeValues($search, $tokenize = true)
    {
        if ($tokenize) {
            $tokens = $this->tokenize($search);
            $mungeValues = [
                'onephrase' => sprintf(
                    '"%s"',
                    str_replace('"', '', implode(' ', $tokens))
                ),
                'and' => implode(' AND ', $tokens),
                'or' => implode(' OR ', $tokens),
            ];
        } else {
            // If we're skipping tokenization, we just want to pass $search
            // through unmodified (it's probably an advanced search that won't
            // benefit from tokenization). We'll just set all possible values to
            // the same thing, except that we'll try to do the "one phrase" in
            // quotes if possible.
            // IMPORTANT: If we detect a boolean NOT, we MUST omit the quotes. We
            // also omit quotes if the phrase is already quoted or if there is no
            // whitespace (in which case phrase searching is pointless and might
            // interfere with wildcard behavior):
            $quotable = !str_contains($search, '"')
                && !str_contains($search, ' NOT ')
                && preg_match('/\s/', $search);
            $mungeValues = [
                'and' => $search,
                'or' => $search,
                'onephrase' => $quotable ? sprintf('"%s"', $search) : $search,
            ];
        }

        $mungeValues['identity'] = $search;

        // Each custom munge starts from the raw search string and applies its
        // operations in the configured order.
        foreach ($this->specs['CustomMunge'] as $mungeName => $mungeOps) {
            $mungeValues[$mungeName] = $search;
            foreach ($mungeOps as $operation) {
                $mungeValues[$mungeName]
                    = $this->customMunge($mungeValues[$mungeName], $operation);
            }
        }
        return $mungeValues;
    }

    /**
     * Apply custom search string munging to a Dismax query.
     *
     * @param string $search searchstring
     *
     * @return string
     */
    protected function dismaxMunge($search)
    {
        foreach ($this->specs['DismaxMunge'] as $operation) {
            $search = $this->customMunge($search, $operation);
        }
        return $search;
    }

    /**
     * Apply a munge operation to a search string.
     *
     * @param string $string    string to munge
     * @param array  $operation munge operation: the operation name followed by
     * its arguments, if any
     *
     * @return string
     *
     * @throws \InvalidArgumentException If the operation name is not recognized
     */
    protected function customMunge($string, $operation)
    {
        switch ($operation[0]) {
            case 'append':
                $string .= $operation[1];
                break;
            case 'lowercase':
                $string = strtolower($string);
                break;
            case 'preg_replace':
                $string = preg_replace(
                    $operation[1],
                    $operation[2],
                    $string
                );
                break;
            case 'prepend':
                $string = $operation[1] . $string;
                break;
            case 'ucfirst':
                $string = ucfirst($string);
                break;
            case 'uppercase':
                $string = strtoupper($string);
                break;
            default:
                throw new \InvalidArgumentException(
                    sprintf('Unknown munge operation: %s', $operation[0])
                );
        }
        return $string;
    }

    /**
     * Return query string for specified search string.
     *
     * If optional argument $advanced is true the search string contains
     * advanced lucene query syntax.
     *
     * @param string $search   Search string
     * @param bool   $advanced Is the search an advanced search string?
     *
     * @return string
     */
    protected function createQueryString($search, $advanced = false)
    {
        // If this is a basic query and we have Dismax settings (or if we have
        // Extended Dismax available), let's build a Dismax subquery to avoid
        // some of the ugly side effects of our Lucene query generation logic.
        if (($this->hasExtendedDismax() || !$advanced) && $this->hasDismax()) {
            $query = $this->dismaxSubquery($this->dismaxMunge($search));
        } else {
            $mungeRules = $this->mungeRules();
            // Do not munge w/o rules
            $query = $mungeRules
                ? $this->munge(
                    $mungeRules,
                    $this->mungeValues($search, !$advanced)
                )
                : $search;
        }
        if ($this->hasFilterQuery()) {
            $query = sprintf('(%s) AND (%s)', $query, $this->getFilterQuery());
        }
        return "($query)";
    }

    /**
     * Return array of munge rules.
     *
     * @todo Maybe rename?
     *
     * @return array
     */
    protected function mungeRules()
    {
        return $this->specs['QueryFields'];
    }

    /**
     * Return modified search string after applying the transformation rules.
     *
     * @param array  $mungeRules  Munge rules
     * @param array  $mungeValues Munge values
     * @param string $joiner      Joiner of subqueries
     *
     * @return string
     */
    protected function munge(array $mungeRules, array $mungeValues, $joiner = 'OR')
    {
        $clauses = [];
        foreach ($mungeRules as $field => $clausearray) {
            if (is_numeric($field)) {
                // shift off the join string and weight
                $sw = array_shift($clausearray);
                // NOTE: the joiner is padded here and again by the final
                // implode(), so nested groups are joined with doubled spaces;
                // kept as-is so generated query strings stay unchanged.
                $internalJoin = ' ' . $sw[0] . ' ';
                // Build it up recursively
                $sstring = '(' .
                    $this->munge($clausearray, $mungeValues, $internalJoin) .
                    ')';
                // ...and add a weight if we have one
                $weight = intval($sw[1] ?? 0);
                if ($weight > 0) {
                    $sstring .= '^' . $weight;
                }
                // push it onto the stack of clauses
                $clauses[] = $sstring;
            } else {
                // Otherwise, we've got a (list of) [munge, weight] pairs to deal
                // with
                foreach ($clausearray as $spec) {
                    // build a string like title:("one two")
                    $sstring = $field . ':(' . $mungeValues[$spec[0]] . ')';
                    // Add the weight if we have one.
                    $weight = intval($spec[1] ?? 0);
                    if ($weight > 0) {
                        $sstring .= '^' . $weight;
                    }
                    // ..and push it on the stack of clauses
                    $clauses[] = $sstring;
                }
            }
        }

        // Join it all together
        return implode(' ' . $joiner . ' ', $clauses);
    }

    /**
     * Tokenize the search string.
     *
     * Splits on whitespace while keeping quoted phrases intact; a boolean
     * operator is kept in the same token as the phrases around it (e.g.
     * 'a AND b c' yields ['a AND b', 'c']).
     *
     * @param string $string Search string
     *
     * @return array
     */
    protected function tokenize($string)
    {
        // First replace escaped quotes with a non-printable character that will
        // never be found in user input (ASCII 26, "substitute"). Next use a regex
        // to split on whitespace and quoted phrases. Finally, swap the "substitute"
        // characters back to escaped quotes. This allows for a simpler regex.
        $string = str_replace('\\"', chr(26), $string);
        preg_match_all('/[^\s"]+|"([^"]*)"/', $string, $phrases);
        $phrases = array_map(
            static fn ($str) => str_replace(chr(26), '\\"', $str),
            $phrases[0]
        );

        $tokens = [];
        $token = [];

        reset($phrases);
        while (current($phrases) !== false) {
            $token[] = current($phrases);
            $next = next($phrases);
            // Strict check: $next is false once the end of the list is reached.
            if (in_array($next, self::$booleanOperators, true)) {
                // Keep the operator with the current token; the phrase after
                // it (if any) is appended on the next iteration.
                $token[] = $next;
                if (next($phrases) === false) {
                    // Trailing operator: flush what we have.
                    $tokens[] = implode(' ', $token);
                }
            } else {
                $tokens[] = implode(' ', $token);
                $token = [];
            }
        }

        return $tokens;
    }
}