Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
83.62% |
148 / 177 |
|
73.91% |
17 / 23 |
CRAP | |
0.00% |
0 / 1 |
SearchHandler | |
83.62% |
148 / 177 |
|
73.91% |
17 / 23 |
91.55 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
createAdvancedQueryString | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
createSimpleQueryString | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
preprocessQueryString | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
createBoostQueryString | |
63.64% |
14 / 22 |
|
0.00% |
0 / 1 |
12.89 | |||
hasDismax | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getDismaxHandler | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
hasExtendedDismax | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
getAllFields | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
getDismaxFields | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getDismaxParams | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getFilterQuery | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
hasFilterQuery | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
toArray | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
setDefaultMustMatch | |
90.00% |
9 / 10 |
|
0.00% |
0 / 1 |
7.05 | |||
dismaxSubquery | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
2 | |||
mungeValues | |
96.15% |
25 / 26 |
|
0.00% |
0 / 1 |
7 | |||
dismaxMunge | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
customMunge | |
74.07% |
20 / 27 |
|
0.00% |
0 / 1 |
9.12 | |||
createQueryString | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
6 | |||
mungeRules | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
munge | |
52.63% |
10 / 19 |
|
0.00% |
0 / 1 |
9.83 | |||
tokenize | |
83.33% |
15 / 18 |
|
0.00% |
0 / 1 |
4.07 |
1 | <?php |
2 | |
3 | /** |
4 | * VuFind SearchHandler. |
5 | * |
6 | * PHP version 8 |
7 | * |
8 | * Copyright (C) Villanova University 2010. |
9 | * |
10 | * This program is free software; you can redistribute it and/or modify |
11 | * it under the terms of the GNU General Public License version 2, |
12 | * as published by the Free Software Foundation. |
13 | * |
14 | * This program is distributed in the hope that it will be useful, |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | * GNU General Public License for more details. |
18 | * |
19 | * You should have received a copy of the GNU General Public License |
20 | * along with this program; if not, write to the Free Software |
21 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
22 | * |
23 | * @category VuFind |
24 | * @package Search |
25 | * @author Andrew S. Nagy <vufind-tech@lists.sourceforge.net> |
26 | * @author David Maus <maus@hab.de> |
27 | * @author Demian Katz <demian.katz@villanova.edu> |
28 | * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License |
29 | * @link https://vufind.org |
30 | */ |
31 | |
32 | namespace VuFindSearch\Backend\Solr; |
33 | |
34 | use function chr; |
35 | use function in_array; |
36 | use function intval; |
37 | |
38 | /** |
39 | * VuFind SearchHandler. |
40 | * |
41 | * The SearchHandler implements the rule-based translation of a user search |
42 | * query to a SOLR query string. |
43 | * |
44 | * @category VuFind |
45 | * @package Search |
46 | * @author Andrew S. Nagy <vufind-tech@lists.sourceforge.net> |
47 | * @author David Maus <maus@hab.de> |
48 | * @author Demian Katz <demian.katz@villanova.edu> |
49 | * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License |
50 | * @link https://vufind.org |
51 | */ |
class SearchHandler
{
    /**
     * Known configuration keys.
     *
     * Every key listed here is copied from the specification passed to the
     * constructor; keys missing from the specification default to an empty
     * array.
     *
     * @var array
     */
    protected static $configKeys = [
        'CustomMunge', 'DismaxFields', 'DismaxHandler', 'QueryFields',
        'DismaxParams', 'FilterQuery', 'DismaxMunge',
    ];

    /**
     * Known boolean operators.
     *
     * The tokenizer keeps these (case-sensitive) words attached to the
     * neighbouring terms instead of treating them as search terms.
     *
     * @var array
     */
    protected static $booleanOperators = ['AND', 'OR', 'NOT'];

    /**
     * Search handler specification, indexed by the keys in $configKeys.
     *
     * @var array
     */
    protected $specs = [];

    /**
     * Constructor.
     *
     * @param array  $spec                 Search handler specification
     * @param string $defaultDismaxHandler Default dismax handler (if no
     * DismaxHandler set in specs).
     *
     * @return void
     */
    public function __construct(array $spec, $defaultDismaxHandler = 'dismax')
    {
        foreach (self::$configKeys as $key) {
            $this->specs[$key] = $spec[$key] ?? [];
        }
        // Set dismax handler to default if not specified:
        if (empty($this->specs['DismaxHandler'])) {
            $this->specs['DismaxHandler'] = $defaultDismaxHandler;
        }
        // Set default mm handler if necessary:
        $this->setDefaultMustMatch();
    }

    /// Public API

    /**
     * Return an advanced query string.
     *
     * An advanced query string is a query string based on a search string w/
     * lucene syntax features.
     *
     * @param string $search Search string
     *
     * @return string
     *
     * @see \VuFind\Service\Solr\LuceneSyntaxHelper::containsAdvancedLuceneSyntax()
     */
    public function createAdvancedQueryString($search)
    {
        return $this->createQueryString($search, true);
    }

    /**
     * Return a simple query string.
     *
     * @param string $search Search string
     *
     * @return string
     *
     * @see \VuFind\Service\Solr\SearchHandler::createAdvancedQueryString()
     */
    public function createSimpleQueryString($search)
    {
        return $this->createQueryString($search, false);
    }

    /**
     * Apply standard pre-processing to the query string.
     *
     * When the handler has Dismax fields, the DismaxMunge operations are
     * applied; otherwise the search string is returned untouched.
     *
     * @param string $search Search string
     *
     * @return string
     */
    public function preprocessQueryString($search)
    {
        return $this->hasDismax() ? $this->dismaxMunge($search) : $search;
    }

    /**
     * Wrap a query string with the boosts configured in the Dismax parameters.
     *
     * Every "bq" parameter is used verbatim; every "bf" parameter is split
     * into _val_ function queries. If the handler has no Dismax fields or no
     * boosts are configured, the search string is returned unchanged.
     *
     * @param string $search Search string
     *
     * @return string
     */
    public function createBoostQueryString($search)
    {
        $boostQuery = [];
        if ($this->hasDismax()) {
            foreach ($this->getDismaxParams() as $param) {
                [$name, $value] = $param;
                if ($name === 'bq') {
                    $boostQuery[] = $value;
                } elseif ($name === 'bf') {
                    // BF parameter may contain multiple space-separated functions
                    // with individual boosts. We need to parse this into _val_
                    // query components:
                    foreach (explode(' ', $value) as $boostFunction) {
                        // Skip the empty fragments produced by repeated spaces:
                        if (!$boostFunction) {
                            continue;
                        }
                        $parts = explode('^', $boostFunction, 2);
                        $boostQuery[] = sprintf(
                            '_val_:"%s"%s',
                            addcslashes($parts[0], '"'),
                            isset($parts[1]) ? "^{$parts[1]}" : ''
                        );
                    }
                }
            }
        }
        if (!$boostQuery) {
            return $search;
        }
        // The *:* alternative keeps documents that match none of the boost
        // clauses in the result set.
        return sprintf(
            '(%s) AND (*:* OR %s)',
            $search,
            implode(' OR ', $boostQuery)
        );
    }

    /**
     * Return true if the handler defines Dismax fields.
     *
     * @return bool
     */
    public function hasDismax()
    {
        return !empty($this->specs['DismaxFields']);
    }

    /**
     * Get the name of the Dismax handler to be used with this search.
     *
     * @return string
     */
    public function getDismaxHandler()
    {
        return $this->specs['DismaxHandler'];
    }

    /**
     * Return true if the handler supports Extended Dismax.
     *
     * @return bool
     */
    public function hasExtendedDismax()
    {
        return $this->hasDismax() && 'edismax' === $this->getDismaxHandler();
    }

    /**
     * Get a list of all Solr fields searched by this handler.
     *
     * @return array
     */
    public function getAllFields()
    {
        // If we have non-Dismax rules, the keys are the field names.
        $queryFields = array_keys($this->mungeRules());

        // If we have Dismax fields, we need to strip off boost values.
        $dismaxFields = array_map(
            static fn ($field) => current(explode('^', $field)),
            $this->getDismaxFields()
        );

        return array_unique(array_merge($queryFields, $dismaxFields));
    }

    /**
     * Return defined dismax fields.
     *
     * @return array
     */
    public function getDismaxFields()
    {
        return $this->specs['DismaxFields'];
    }

    /**
     * Return defined dismax parameters.
     *
     * @return array
     */
    public function getDismaxParams()
    {
        return $this->specs['DismaxParams'];
    }

    /**
     * Return the filter query.
     *
     * @return string|null Configured filter query, or null when the
     * configured value is empty.
     */
    public function getFilterQuery()
    {
        return empty($this->specs['FilterQuery'])
            ? null : $this->specs['FilterQuery'];
    }

    /**
     * Return true if handler defines a filter query.
     *
     * @return bool
     */
    public function hasFilterQuery()
    {
        return (bool)$this->specs['FilterQuery'];
    }

    /**
     * Serialize handler specs as array.
     *
     * @return array
     */
    public function toArray()
    {
        return $this->specs;
    }

    /// Internal API

    /**
     * Support method for constructor: if no mm is provided, set a reasonable
     * default based on the selected Dismax handler.
     *
     * @return void
     */
    protected function setDefaultMustMatch()
    {
        // Initialize parameter array if absent:
        $this->specs['DismaxParams'] ??= [];

        // mm only makes sense when Dismax is in use:
        if (!$this->hasDismax()) {
            return;
        }

        // An explicitly configured mm always wins:
        foreach ($this->specs['DismaxParams'] as $current) {
            if ($current[0] === 'mm') {
                return;
            }
        }

        // Our default mm depends on whether we're using dismax or edismax;
        // for dismax, we want 100% matches, because we always want to
        // simulate "AND" behavior by default (any "OR" searches will get
        // rerouted to Lucene queries). For edismax, boolean operators are
        // accounted for, and with an mm of 100%, OR searches will always
        // fail. We can use 0% here, because the default q.op of AND will
        // make AND searches work correctly even without a high mm value.
        $default = $this->hasExtendedDismax() ? '0%' : '100%';
        $this->specs['DismaxParams'][] = ['mm', $default];
    }

    /**
     * Return a Dismax subquery for specified search string.
     *
     * The result is a _query_:"..." clause embedding a local-params query
     * built from the Dismax handler name, fields and parameters.
     *
     * @param string $search Search string
     *
     * @return string
     */
    protected function dismaxSubquery($search)
    {
        $dismaxParams = [];
        foreach ($this->specs['DismaxParams'] as $param) {
            $dismaxParams[] = sprintf(
                "%s='%s'",
                $param[0],
                addcslashes($param[1], "'")
            );
        }
        $dismaxQuery = sprintf(
            '{!%s qf="%s" %s}%s',
            $this->getDismaxHandler(),
            implode(' ', $this->specs['DismaxFields']),
            implode(' ', $dismaxParams),
            $search
        );
        // The whole local-params query sits inside double quotes, so quotes
        // and backslashes in it have to be escaped:
        return sprintf('_query_:"%s"', addslashes($dismaxQuery));
    }

    /**
     * Return the munge values for specified search string.
     *
     * If optional argument $tokenize is true tokenize the search string.
     *
     * @param string $search   Search string
     * @param bool   $tokenize Tokenize the search string?
     *
     * @return array Map of munge name ('onephrase', 'and', 'or', 'identity'
     * plus every configured CustomMunge name) to the munged search string.
     */
    protected function mungeValues($search, $tokenize = true)
    {
        if ($tokenize) {
            $tokens = $this->tokenize($search);
            $mungeValues = [
                'onephrase' => sprintf(
                    '"%s"',
                    str_replace('"', '', implode(' ', $tokens))
                ),
                'and' => implode(' AND ', $tokens),
                'or' => implode(' OR ', $tokens),
            ];
        } else {
            // If we're skipping tokenization, we just want to pass $search
            // through unmodified (it's probably an advanced search that won't
            // benefit from tokenization). We'll just set all possible values to
            // the same thing, except that we'll try to do the "one phrase" in
            // quotes if possible.
            // IMPORTANT: If we detect a boolean NOT, we MUST omit the quotes. We
            // also omit quotes if the phrase is already quoted or if there is no
            // whitespace (in which case phrase searching is pointless and might
            // interfere with wildcard behavior):
            $omitQuotes = str_contains($search, '"')
                || str_contains($search, ' NOT ')
                || !preg_match('/\s/', $search);
            $mungeValues = [
                'and' => $search,
                'or' => $search,
                'onephrase' => $omitQuotes ? $search : sprintf('"%s"', $search),
            ];
        }

        $mungeValues['identity'] = $search;

        // Custom munges always start from the raw search string and apply
        // their operations in the configured order:
        foreach ($this->specs['CustomMunge'] as $mungeName => $mungeOps) {
            $mungeValues[$mungeName] = $search;
            foreach ($mungeOps as $operation) {
                $mungeValues[$mungeName]
                    = $this->customMunge($mungeValues[$mungeName], $operation);
            }
        }
        return $mungeValues;
    }

    /**
     * Apply custom search string munging to a Dismax query.
     *
     * @param string $search searchstring
     *
     * @return string
     */
    protected function dismaxMunge($search)
    {
        foreach ($this->specs['DismaxMunge'] as $operation) {
            $search = $this->customMunge($search, $operation);
        }
        return $search;
    }

    /**
     * Apply a munge operation to a search string.
     *
     * @param string $string    string to munge
     * @param array  $operation munge operation: the operation name at index 0
     * followed by its arguments (text for append/prepend; pattern and
     * replacement for preg_replace)
     *
     * @return string
     *
     * @throws \InvalidArgumentException If the operation name is unknown.
     */
    protected function customMunge($string, $operation)
    {
        return match ($operation[0]) {
            'append' => $string . $operation[1],
            'lowercase' => strtolower($string),
            // preg_replace() returns null when PCRE fails at runtime (e.g.
            // invalid UTF-8 subject with a /u pattern). Fall back to the
            // unmodified string so that later operations and the generated
            // query still receive a string.
            'preg_replace' => preg_replace(
                $operation[1],
                $operation[2],
                $string
            ) ?? $string,
            'prepend' => $operation[1] . $string,
            'ucfirst' => ucfirst($string),
            'uppercase' => strtoupper($string),
            default => throw new \InvalidArgumentException(
                sprintf('Unknown munge operation: %s', $operation[0])
            ),
        };
    }

    /**
     * Return query string for specified search string.
     *
     * If optional argument $advanced is true the search string contains
     * advanced lucene query syntax.
     *
     * @param string $search   Search string
     * @param bool   $advanced Is the search an advanced search string?
     *
     * @return string
     */
    protected function createQueryString($search, $advanced = false)
    {
        // If this is a basic query and we have Dismax settings (or if we have
        // Extended Dismax available), let's build a Dismax subquery to avoid
        // some of the ugly side effects of our Lucene query generation logic.
        if (($this->hasExtendedDismax() || !$advanced) && $this->hasDismax()) {
            $query = $this->dismaxSubquery(
                $this->dismaxMunge($search)
            );
        } else {
            $mungeRules = $this->mungeRules();
            // Do not munge w/o rules
            $query = $mungeRules
                ? $this->munge(
                    $mungeRules,
                    $this->mungeValues($search, !$advanced)
                )
                : $search;
        }
        if ($this->hasFilterQuery()) {
            $query = sprintf('(%s) AND (%s)', $query, $this->getFilterQuery());
        }
        return "($query)";
    }

    /**
     * Return array of munge rules.
     *
     * @todo Maybe rename?
     *
     * @return array
     */
    protected function mungeRules()
    {
        return $this->specs['QueryFields'];
    }

    /**
     * Return modified search string after applying the transformation rules.
     *
     * Rules with a numeric key describe a nested group: their first element
     * is a [join operator, weight] pair and the remaining elements are rules
     * that are processed recursively. All other rules map a field name to a
     * list of [munge name, weight] pairs.
     *
     * @param array  $mungeRules  Munge rules
     * @param array  $mungeValues Munge values
     * @param string $joiner      Joiner of subqueries
     *
     * @return string
     */
    protected function munge(array $mungeRules, array $mungeValues, $joiner = 'OR')
    {
        // Append "^weight" to a clause when a positive weight is configured.
        $withWeight = static function ($clause, $rawWeight) {
            $weight = intval($rawWeight ?? 0);
            return $weight > 0 ? $clause . '^' . $weight : $clause;
        };

        $clauses = [];
        foreach ($mungeRules as $field => $clausearray) {
            if (is_numeric($field)) {
                // shift off the join string and weight
                $sw = array_shift($clausearray);
                $internalJoin = ' ' . $sw[0] . ' ';
                // Build it up recursively
                $group = '('
                    . $this->munge($clausearray, $mungeValues, $internalJoin)
                    . ')';
                // ...and add a weight if we have one
                $clauses[] = $withWeight($group, $sw[1] ?? null);
            } else {
                // Otherwise, we've got a (list of) [munge, weight] pairs to deal
                // with
                foreach ($clausearray as $spec) {
                    // build a string like title:("one two")
                    $clause = $field . ':(' . $mungeValues[$spec[0]] . ')';
                    $clauses[] = $withWeight($clause, $spec[1] ?? null);
                }
            }
        }

        // Join it all together
        return implode(' ' . $joiner . ' ', $clauses);
    }

    /**
     * Tokenize the search string.
     *
     * The string is split on whitespace, keeping double-quoted phrases
     * together. A boolean operator that follows a term is glued, together
     * with the term after it, onto the same token.
     *
     * @param string $string Search string
     *
     * @return array
     */
    protected function tokenize($string)
    {
        // First replace escaped quotes with a non-printable character that will
        // never be found in user input (ASCII 26, "substitute"). Next use a regex
        // to split on whitespace and quoted phrases. Finally, swap the "substitute"
        // characters back to escaped quotes. This allows for a simpler regex.
        $string = str_replace('\\"', chr(26), $string);
        preg_match_all('/[^\s"]+|"([^"]*)"/', $string, $phrases);
        $phrases = array_map(
            static fn ($str) => str_replace(chr(26), '\\"', $str),
            $phrases[0]
        );

        $tokens = [];
        $token = [];

        reset($phrases);
        while (current($phrases) !== false) {
            $token[] = current($phrases);
            $next = next($phrases);
            if (in_array($next, self::$booleanOperators, true)) {
                // Keep the operator attached to the current token; the loop
                // then continues with the phrase after the operator.
                $token[] = $next;
                if (next($phrases) === false) {
                    // Trailing operator: flush what we have collected.
                    $tokens[] = implode(' ', $token);
                }
            } else {
                $tokens[] = implode(' ', $token);
                $token = [];
            }
        }

        return $tokens;
    }
}