Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
100.00% covered (success)
100.00%
95 / 95
100.00% covered (success)
100.00%
9 / 9
CRAP
100.00% covered (success)
100.00%
1 / 1
SpellingProcessor
100.00% covered (success)
100.00%
95 / 95
100.00% covered (success)
100.00%
9 / 9
38
100.00% covered (success)
100.00%
1 / 1
 __construct
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
1
 shouldSkipNumericSpelling
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getSpellingLimit
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 tokenize
100.00% covered (success)
100.00%
14 / 14
100.00% covered (success)
100.00%
1 / 1
9
 getSuggestions
100.00% covered (success)
100.00%
11 / 11
100.00% covered (success)
100.00%
1 / 1
6
 formatAndFilterSuggestions
100.00% covered (success)
100.00%
14 / 14
100.00% covered (success)
100.00%
1 / 1
6
 shouldSkipTerm
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
3
 processSuggestions
100.00% covered (success)
100.00%
27 / 27
100.00% covered (success)
100.00%
1 / 1
5
 doSingleReplace
100.00% covered (success)
100.00%
19 / 19
100.00% covered (success)
100.00%
1 / 1
6
1<?php
2
3/**
4 * Solr spelling processor.
5 *
6 * PHP version 8
7 *
8 * Copyright (C) Villanova University 2011.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2,
12 * as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
22 *
23 * @category VuFind
24 * @package  Search_Solr
25 * @author   Demian Katz <demian.katz@villanova.edu>
26 * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
27 * @link     https://vufind.org Main Page
28 */
29
30namespace VuFind\Search\Solr;
31
32use Laminas\Config\Config;
33use VuFindSearch\Backend\Solr\Response\Json\Spellcheck;
34use VuFindSearch\Query\AbstractQuery;
35
36use function count;
37use function in_array;
38use function is_array;
39use function strlen;
40
41/**
42 * Solr spelling processor.
43 *
44 * @category VuFind
45 * @package  Search_Solr
46 * @author   Demian Katz <demian.katz@villanova.edu>
47 * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
48 * @link     https://vufind.org Main Page
49 */
class SpellingProcessor
{
    /**
     * Maximum number of suggestions to keep per misspelled term.
     *
     * @var int
     */
    protected $spellingLimit;

    /**
     * Skip spell checking of numeric terms?
     *
     * @var bool
     */
    protected $spellSkipNumeric;

    /**
     * Offer expansions on terms as well as basic replacements?
     *
     * @var bool
     */
    protected $expand;

    /**
     * Show the full modified search phrase on screen rather than just the
     * suggested word?
     *
     * @var bool
     */
    protected $phrase;

    /**
     * Callback for normalizing text (handed to the query when checking whether
     * it contains a term); may be null.
     *
     * @var ?callable
     */
    protected $normalizer;

    /**
     * Constructor
     *
     * Any setting missing from the configuration (or all of them, when no
     * configuration is given) falls back to a default: limit 3, skip numeric
     * terms, expansions enabled, phrase display disabled.
     *
     * @param ?Config   $config     Spelling configuration (optional)
     * @param ?callable $normalizer Callback for normalization of text (optional).
     */
    public function __construct($config = null, $normalizer = null)
    {
        $this->spellingLimit = $config->limit ?? 3;
        $this->spellSkipNumeric = $config->skip_numeric ?? true;
        $this->expand = $config->expand ?? true;
        $this->phrase = $config->phrase ?? false;
        $this->normalizer = $normalizer;
    }

    /**
     * Are we skipping numeric words?
     *
     * @return bool
     */
    public function shouldSkipNumericSpelling()
    {
        return $this->spellSkipNumeric;
    }

    /**
     * Get the spelling limit.
     *
     * @return int
     */
    public function getSpellingLimit()
    {
        return $this->spellingLimit;
    }

    /**
     * Input Tokenizer - Specifically for spelling purposes
     *
     * Because of its focus on spelling, these tokens are unsuitable
     * for actual searching. They are stripping important search data
     * such as joins and groups, simply because they don't need to be
     * spellchecked.
     *
     * @param string $input Query to tokenize
     *
     * @return array        Tokenized array
     */
    public function tokenize($input)
    {
        // Boolean operators carry nothing to spell check, so they are dropped:
        $joins = ['AND', 'OR', 'NOT'];

        // Strip out parentheses -- irrelevant for tokenization:
        $input = trim(strtr($input, ['(' => ' ', ')' => ' ']));

        // Base of this algorithm comes straight from PHP doc example by
        // benighted at gmail dot com: http://php.net/manual/en/function.strtok.php
        $tokens = [];
        $token = strtok($input, " \t");
        while ($token !== false) {
            // A token that opens a double quote without closing it starts a
            // phrase: pull in everything up to the next quote and re-add the
            // quote itself (strtok() swallows the delimiter).
            if (str_starts_with($token, '"') && !str_ends_with($token, '"')) {
                $rest = strtok('"');
                // strtok() returns false when nothing follows the opening word.
                $token .= ' ' . ($rest === false ? '' : $rest) . '"';
            }
            // skip boolean operators
            if (!in_array($token, $joins, true)) {
                $tokens[] = $token;
            }
            $token = strtok(" \t");
        }

        // If the last token ends in a double quote but the input string does not,
        // the tokenization process added the quote, which will break spelling
        // replacements. We need to strip it back off again:
        $lastIndex = count($tokens) - 1;
        $last = $tokens[$lastIndex] ?? null;
        if ($last && str_ends_with($last, '"') && !str_ends_with($input, '"')) {
            // Also trim the separator space that was appended when no text
            // followed the opening word (e.g. input `"word`); otherwise the
            // token would carry a trailing space that is not in the query.
            $tokens[$lastIndex] = rtrim(substr($last, 0, strlen($last) - 1));
        }
        return $tokens;
    }

    /**
     * Get raw spelling suggestions for a query.
     *
     * Each returned entry is keyed by the original term and holds the term's
     * own frequency ('freq') plus a map of suggested word => frequency
     * ('suggestions'). If the primary spellcheck data yields nothing, the
     * secondary data (when available) is tried instead.
     *
     * @param Spellcheck    $spellcheck Complete spellcheck information
     * @param AbstractQuery $query      Query for which info should be retrieved
     *
     * @return array
     * @throws \Exception
     */
    public function getSuggestions(Spellcheck $spellcheck, AbstractQuery $query)
    {
        $allSuggestions = [];
        foreach ($spellcheck as $term => $info) {
            // Ignore terms that are numeric (when configured) or that are not
            // actually part of the query:
            if ($this->shouldSkipTerm($query, $term, false)) {
                continue;
            }
            $suggestions = $this->formatAndFilterSuggestions($query, $info);
            if ($suggestions) {
                $allSuggestions[$term] = [
                    'freq' => $info['origFreq'],
                    'suggestions' => $suggestions,
                ];
            }
        }
        // Fail over to secondary suggestions if primary failed:
        if (empty($allSuggestions) && ($secondary = $spellcheck->getSecondary())) {
            return $this->getSuggestions($secondary, $query);
        }
        return $allSuggestions;
    }

    /**
     * Support method for getSuggestions()
     *
     * Builds a map of suggested word => frequency, capped at the spelling
     * limit and excluding words that should be skipped.
     *
     * @param AbstractQuery $query Query for which info should be retrieved
     * @param array         $info  Spelling suggestion information
     *
     * @return array
     * @throws \Exception If suggestions are not in the extended results format
     */
    protected function formatAndFilterSuggestions($query, $info)
    {
        // A term without any suggestion data simply produces no suggestions:
        $rawSuggestions = $info['suggestion'] ?? [];

        // Validate response format
        if (isset($rawSuggestions[0]) && !is_array($rawSuggestions[0])) {
            throw new \Exception(
                'Unexpected suggestion format; spellcheck.extendedResults'
                . ' must be set to true.'
            );
        }
        $limit = $this->getSpellingLimit();
        $suggestions = [];
        foreach ($rawSuggestions as $suggestion) {
            if (count($suggestions) >= $limit) {
                break;
            }
            $word = $suggestion['word'];
            if (!$this->shouldSkipTerm($query, $word, true)) {
                $suggestions[$word] = $suggestion['freq'];
            }
        }
        return $suggestions;
    }

    /**
     * Should we skip the specified term?
     *
     * @param AbstractQuery $query         Query for which info should be retrieved
     * @param string        $term          Term to check
     * @param bool          $queryContains Should we skip the term if it is found
     * in the query (true), or should we skip the term if it is NOT found in the
     * query (false)?
     *
     * @return bool
     */
    protected function shouldSkipTerm($query, $term, $queryContains)
    {
        // If term is numeric and we're in "skip numeric" mode, we should skip it:
        if ($this->shouldSkipNumericSpelling() && is_numeric($term)) {
            return true;
        }
        // Otherwise skip when the term's presence in the query matches the
        // condition the caller asked about:
        return $queryContains == $query->containsTerm($term, $this->normalizer);
    }

    /**
     * Process spelling suggestions.
     *
     * For every misspelled term, each query token containing the term is
     * replaced as a whole; if no token contains it, the term itself is used as
     * the replacement target.
     *
     * @param array  $suggestions Raw suggestions from getSuggestions()
     * @param string $query       Spelling query
     * @param Params $params      Params helper object
     *
     * @return array
     */
    public function processSuggestions($suggestions, $query, Params $params)
    {
        // The query does not change between suggestions, so tokenize it once:
        $tokens = $this->tokenize($query);

        $returnArray = [];
        foreach ($suggestions as $term => $details) {
            // Find out if our suggestion is part of a token
            $foundInToken = false;
            foreach ($tokens as $token) {
                // Is the term part of the current token?
                if (str_contains($token, (string)$term)) {
                    $foundInToken = true;
                    // We need to replace the whole token
                    $returnArray = $this->doSingleReplace(
                        $term,
                        $token,
                        true,
                        $details,
                        $returnArray,
                        $params
                    );
                }
            }
            // If no tokens were found, just look for the suggestion 'as is'
            if (!$foundInToken) {
                $returnArray = $this->doSingleReplace(
                    $term,
                    $term,
                    false,
                    $details,
                    $returnArray,
                    $params
                );
            }
        }
        return $returnArray;
    }

    /**
     * Process one instance of a spelling replacement and modify the return
     *   data structure with the details of what was done.
     *
     * @param string $term        The actual term we're replacing
     * @param string $targetTerm  The term above, or the token it is inside
     * @param bool   $inToken     Flag for whether the token or term is used
     * @param array  $details     The spelling suggestions
     * @param array  $returnArray Return data structure so far
     * @param Params $params      Params helper object
     *
     * @return array              $returnArray modified
     */
    protected function doSingleReplace(
        $term,
        $targetTerm,
        $inToken,
        $details,
        $returnArray,
        Params $params
    ) {
        $returnArray[$targetTerm]['freq'] = $details['freq'];

        // Multi-word targets (shingles) need their own parentheses when
        // expanded; this does not depend on the individual suggestion.
        $isShingle = str_contains((string)$targetTerm, ' ');

        foreach ($details['suggestions'] as $word => $freq) {
            // If the suggested word is part of a token, we need to make sure we
            // replace the whole token:
            $replacement = $inToken ? str_replace($term, $word, $targetTerm) : $word;

            //  Do we need to show the whole, modified query?
            $label = $this->phrase
                ? $params->getDisplayQueryWithReplacedTerm(
                    $targetTerm,
                    $replacement
                ) : $replacement;

            // Basic spelling suggestion data
            $entry = [
                'freq' => $freq,
                'new_term' => $replacement,
            ];

            // Only generate expansions if enabled in config
            if ($this->expand) {
                // Parentheses differ for shingles
                $entry['expand_term'] = $isShingle
                    ? "(($targetTerm) OR ($replacement))"
                    : "($targetTerm OR $replacement)";
            }

            $returnArray[$targetTerm]['suggestions'][$label] = $entry;
        }

        return $returnArray;
    }
}