Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
100.00% |
95 / 95 |
|
100.00% |
9 / 9 |
CRAP | |
100.00% |
1 / 1 |
SpellingProcessor | |
100.00% |
95 / 95 |
|
100.00% |
9 / 9 |
38 | |
100.00% |
1 / 1 |
__construct | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
shouldSkipNumericSpelling | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getSpellingLimit | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
tokenize | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
9 | |||
getSuggestions | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
6 | |||
formatAndFilterSuggestions | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
6 | |||
shouldSkipTerm | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
3 | |||
processSuggestions | |
100.00% |
27 / 27 |
|
100.00% |
1 / 1 |
5 | |||
doSingleReplace | |
100.00% |
19 / 19 |
|
100.00% |
1 / 1 |
6 |
1 | <?php |
2 | |
3 | /** |
4 | * Solr spelling processor. |
5 | * |
6 | * PHP version 8 |
7 | * |
8 | * Copyright (C) Villanova University 2011. |
9 | * |
10 | * This program is free software; you can redistribute it and/or modify |
11 | * it under the terms of the GNU General Public License version 2, |
12 | * as published by the Free Software Foundation. |
13 | * |
14 | * This program is distributed in the hope that it will be useful, |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | * GNU General Public License for more details. |
18 | * |
19 | * You should have received a copy of the GNU General Public License |
20 | * along with this program; if not, write to the Free Software |
21 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
22 | * |
23 | * @category VuFind |
24 | * @package Search_Solr |
25 | * @author Demian Katz <demian.katz@villanova.edu> |
26 | * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License |
27 | * @link https://vufind.org Main Page |
28 | */ |
29 | |
30 | namespace VuFind\Search\Solr; |
31 | |
32 | use Laminas\Config\Config; |
33 | use VuFindSearch\Backend\Solr\Response\Json\Spellcheck; |
34 | use VuFindSearch\Query\AbstractQuery; |
35 | |
36 | use function count; |
37 | use function in_array; |
38 | use function is_array; |
39 | use function strlen; |
40 | |
/**
 * Solr spelling processor.
 *
 * Filters the raw spellcheck data returned by Solr and converts it into
 * replacement (and optionally expansion) suggestions for a spelling query.
 *
 * @category VuFind
 * @package  Search_Solr
 * @author   Demian Katz <demian.katz@villanova.edu>
 * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
 * @link     https://vufind.org Main Page
 */
class SpellingProcessor
{
    /**
     * Spelling limit (maximum number of suggestions kept per term)
     *
     * @var int
     */
    protected $spellingLimit;

    /**
     * Skip spell checking of numeric terms?
     *
     * @var bool
     */
    protected $spellSkipNumeric;

    /**
     * Offer expansions on terms as well as basic replacements?
     *
     * @var bool
     */
    protected $expand;

    /**
     * Show the full modified search phrase on screen rather than just the
     * suggested word?
     *
     * @var bool
     */
    protected $phrase;

    /**
     * Callback for normalizing text.
     *
     * @var callable
     */
    protected $normalizer;

    /**
     * Constructor
     *
     * Any setting missing from the configuration falls back to a default:
     * limit = 3, skip_numeric = true, expand = true, phrase = false.
     *
     * @param Config   $config     Spelling configuration (optional)
     * @param callable $normalizer Callback for normalization of text (optional).
     */
    public function __construct($config = null, $normalizer = null)
    {
        $this->spellingLimit = $config->limit ?? 3;
        $this->spellSkipNumeric = $config->skip_numeric ?? true;
        $this->expand = $config->expand ?? true;
        $this->phrase = $config->phrase ?? false;
        $this->normalizer = $normalizer;
    }

    /**
     * Are we skipping numeric words?
     *
     * @return bool
     */
    public function shouldSkipNumericSpelling()
    {
        return $this->spellSkipNumeric;
    }

    /**
     * Get the spelling limit.
     *
     * @return int
     */
    public function getSpellingLimit()
    {
        return $this->spellingLimit;
    }

    /**
     * Input Tokenizer - Specifically for spelling purposes
     *
     * Because of its focus on spelling, these tokens are unsuitable
     * for actual searching. They are stripping important search data
     * such as joins and groups, simply because they don't need to be
     * spellchecked.
     *
     * @param string $input Query to tokenize
     *
     * @return array Tokenized array
     */
    public function tokenize($input)
    {
        // Exclusion list of useless tokens:
        $joins = ['AND', 'OR', 'NOT'];

        // Strip out parentheses -- irrelevant for tokenization:
        $paren = ['(' => ' ', ')' => ' '];
        $input = trim(strtr($input, $paren));

        // Base of this algorithm comes straight from PHP doc example by
        // benighted at gmail dot com: http://php.net/manual/en/function.strtok.php
        $tokens = [];
        $token = strtok($input, " \t");
        while ($token !== false) {
            // A token that opens a double-quoted phrase swallows everything up
            // to the next double quote:
            if (str_starts_with($token, '"') && !str_ends_with($token, '"')) {
                $rest = strtok('"');
                // strtok() returns false when nothing follows the opening
                // word; appending it blindly would add a stray space.
                if ($rest !== false) {
                    $token .= ' ' . $rest;
                }
                $token .= '"';
            }
            // skip boolean operators
            if (!in_array($token, $joins, true)) {
                $tokens[] = $token;
            }
            $token = strtok(" \t");
        }

        // If the last token ends in a double quote but the input string does not,
        // the tokenization process added the quote, which will break spelling
        // replacements. We need to strip it back off again:
        $lastIndex = count($tokens) - 1;
        $last = $lastIndex >= 0 ? $tokens[$lastIndex] : null;
        if ($last && str_ends_with($last, '"') && !str_ends_with($input, '"')) {
            $tokens[$lastIndex] = substr($last, 0, -1);
        }
        return $tokens;
    }

    /**
     * Get raw spelling suggestions for a query.
     *
     * Returns an array keyed by misspelled term; each entry holds the term's
     * original frequency ('freq') and its filtered 'suggestions'. If the
     * primary spellcheck data yields nothing, the secondary data (when
     * available) is processed instead.
     *
     * @param Spellcheck    $spellcheck Complete spellcheck information
     * @param AbstractQuery $query      Query for which info should be retrieved
     *
     * @return array
     * @throws \Exception
     */
    public function getSuggestions(Spellcheck $spellcheck, AbstractQuery $query)
    {
        $allSuggestions = [];
        foreach ($spellcheck as $term => $info) {
            if (
                !$this->shouldSkipTerm($query, $term, false)
                && ($suggestions = $this->formatAndFilterSuggestions($query, $info))
            ) {
                $allSuggestions[$term] = [
                    'freq' => $info['origFreq'],
                    'suggestions' => $suggestions,
                ];
            }
        }
        // Fail over to secondary suggestions if primary failed:
        if (empty($allSuggestions) && ($secondary = $spellcheck->getSecondary())) {
            return $this->getSuggestions($secondary, $query);
        }
        return $allSuggestions;
    }

    /**
     * Support method for getSuggestions()
     *
     * Builds a word => frequency map containing at most getSpellingLimit()
     * entries, leaving out words rejected by shouldSkipTerm().
     *
     * @param AbstractQuery $query Query for which info should be retrieved
     * @param array         $info  Spelling suggestion information
     *
     * @return array
     * @throws \Exception
     */
    protected function formatAndFilterSuggestions($query, $info)
    {
        // Validate response format
        if (isset($info['suggestion'][0]) && !is_array($info['suggestion'][0])) {
            throw new \Exception(
                'Unexpected suggestion format; spellcheck.extendedResults'
                . ' must be set to true.'
            );
        }
        $limit = $this->getSpellingLimit();
        $suggestions = [];
        // Treat a missing suggestion list as empty instead of iterating null:
        foreach ($info['suggestion'] ?? [] as $suggestion) {
            if (count($suggestions) >= $limit) {
                break;
            }
            $word = $suggestion['word'];
            if (!$this->shouldSkipTerm($query, $word, true)) {
                $suggestions[$word] = $suggestion['freq'];
            }
        }
        return $suggestions;
    }

    /**
     * Should we skip the specified term?
     *
     * @param AbstractQuery $query         Query for which info should be retrieved
     * @param string        $term          Term to check
     * @param bool          $queryContains Should we skip the term if it is found
     * in the query (true), or should we skip the term if it is NOT found in the
     * query (false)?
     *
     * @return bool
     */
    protected function shouldSkipTerm($query, $term, $queryContains)
    {
        // If term is numeric and we're in "skip numeric" mode, we should skip it:
        if ($this->shouldSkipNumericSpelling() && is_numeric($term)) {
            return true;
        }
        // Otherwise skip exactly when the term's presence in the query matches
        // the requested mode:
        return $queryContains == $query->containsTerm($term, $this->normalizer);
    }

    /**
     * Process spelling suggestions.
     *
     * @param array  $suggestions Raw suggestions from getSuggestions()
     * @param string $query       Spelling query
     * @param Params $params      Params helper object
     *
     * @return array
     */
    public function processSuggestions($suggestions, $query, Params $params)
    {
        // The token list depends only on the query, so build it once rather
        // than once per suggested term:
        $tokens = $this->tokenize($query);

        $returnArray = [];
        foreach ($suggestions as $term => $details) {
            // Find out if our suggestion is part of a token
            $inToken = false;
            foreach ($tokens as $token) {
                // Is the term part of the current token?
                if (str_contains($token, (string)$term)) {
                    $inToken = true;
                    // We need to replace the whole token
                    $returnArray = $this->doSingleReplace(
                        $term,
                        $token,
                        $inToken,
                        $details,
                        $returnArray,
                        $params
                    );
                }
            }
            // If no tokens were found, just look for the suggestion 'as is'
            if (!$inToken) {
                $returnArray = $this->doSingleReplace(
                    $term,
                    $term,
                    $inToken,
                    $details,
                    $returnArray,
                    $params
                );
            }
        }
        return $returnArray;
    }

    /**
     * Process one instance of a spelling replacement and modify the return
     * data structure with the details of what was done.
     *
     * @param string $term        The actual term we're replacing
     * @param string $targetTerm  The term above, or the token it is inside
     * @param bool   $inToken     Flag for whether the token or term is used
     * @param array  $details     The spelling suggestions
     * @param array  $returnArray Return data structure so far
     * @param Params $params      Params helper object
     *
     * @return array              $returnArray modified
     */
    protected function doSingleReplace(
        $term,
        $targetTerm,
        $inToken,
        $details,
        $returnArray,
        Params $params
    ) {
        $returnArray[$targetTerm]['freq'] = $details['freq'];
        foreach ($details['suggestions'] as $word => $freq) {
            // If the suggested word is part of a token, we need to make sure we
            // replace the whole token:
            $replacement = $inToken ? str_replace($term, $word, $targetTerm) : $word;

            // Do we need to show the whole, modified query?
            $label = $this->phrase
                ? $params->getDisplayQueryWithReplacedTerm(
                    $targetTerm,
                    $replacement
                ) : $replacement;

            // Basic spelling suggestion data
            $returnArray[$targetTerm]['suggestions'][$label] = [
                'freq' => $freq,
                'new_term' => $replacement,
            ];

            // Only generate expansions if enabled in config
            if ($this->expand) {
                // Parentheses differ for shingles (multi-word targets)
                $expansion = str_contains($targetTerm, ' ')
                    ? "(($targetTerm) OR ($replacement))"
                    : "($targetTerm OR $replacement)";
                $returnArray[$targetTerm]['suggestions'][$label]['expand_term']
                    = $expansion;
            }
        }

        return $returnArray;
    }
}