Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
100.00% |
59 / 59 |
|
100.00% |
3 / 3 |
CRAP | |
100.00% |
1 / 1 |
VuFindWorkKeys | |
100.00% |
59 / 59 |
|
100.00% |
3 / 3 |
19 | |
100.00% |
1 / 1 |
getWorkKeys | |
100.00% |
49 / 49 |
|
100.00% |
1 / 1 |
14 | |||
deDom | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
normalize | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
4 |
1 | <?php |
2 | |
3 | /** |
4 | * XSLT importer support methods for work key generation. |
5 | * |
6 | * PHP version 8 |
7 | * |
8 | * Copyright (c) Demian Katz 2020. |
9 | * |
10 | * This program is free software; you can redistribute it and/or modify |
11 | * it under the terms of the GNU General Public License version 2, |
12 | * as published by the Free Software Foundation. |
13 | * |
14 | * This program is distributed in the hope that it will be useful, |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | * GNU General Public License for more details. |
18 | * |
19 | * You should have received a copy of the GNU General Public License |
20 | * along with this program; if not, write to the Free Software |
21 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
22 | * |
23 | * @category VuFind |
24 | * @package Import_Tools |
25 | * @author Demian Katz <demian.katz@villanova.edu> |
26 | * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License |
27 | * @link https://vufind.org/wiki/indexing Wiki |
28 | */ |
29 | |
30 | namespace VuFind\XSLT\Import; |
31 | |
32 | use DOMDocument; |
33 | use Normalizer; |
34 | |
35 | use function in_array; |
36 | |
37 | /** |
38 | * XSLT importer support methods for work key generation. |
39 | * |
40 | * @category VuFind |
41 | * @package Import_Tools |
42 | * @author Demian Katz <demian.katz@villanova.edu> |
43 | * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License |
44 | * @link https://vufind.org/wiki/indexing Wiki |
45 | */ |
class VuFindWorkKeys
{
    /**
     * Get all work identification keys for the record.
     *
     * Two kinds of keys are produced, each as a <workKey> element appended to
     * the returned document:
     *  - "UT <title>" for every non-empty normalized uniform title;
     *  - "AT <author> <title>" for every combination of a non-empty normalized
     *    author and a distinct, non-empty normalized title (titles and trimmed
     *    titles are pooled and de-duplicated after normalization).
     *
     * Each of the first four arguments may be an iterable or a single value.
     *
     * @param iterable $uniformTitles       Uniform title(s) for the work
     * @param iterable $titles              Other title(s) for the work
     * @param iterable $trimmedTitles       Title(s) with leading articles, etc.,
     * removed
     * @param iterable $authors             Author(s) for the work
     * @param string   $includeRegEx        Regular expression defining characters to
     * keep
     * @param string   $excludeRegEx        Regular expression defining characters to
     * remove
     * @param string   $transliteratorRules Optional ICU transliteration rules to be
     * applied before the include and exclude regexes. See
     * https://unicode-org.github.io/icu/userguide/transforms/general/
     * #icu-transliterators for more information on the transliteration rules.
     *
     * @return DOMDocument
     */
    public static function getWorkKeys(
        $uniformTitles,
        $titles,
        $trimmedTitles,
        $authors,
        $includeRegEx = '',
        $excludeRegEx = '',
        $transliteratorRules = ''
    ) {
        $transliterator = $transliteratorRules
            ? \Transliterator::createFromRules(
                $transliteratorRules,
                \Transliterator::FORWARD
            ) : null;

        $dom = new DOMDocument('1.0', 'utf-8');

        $uniformTitles = is_iterable($uniformTitles)
            ? $uniformTitles : (array)$uniformTitles;
        foreach ($uniformTitles as $uniformTitle) {
            $normalizedTitle = self::normalize(
                $uniformTitle,
                $includeRegEx,
                $excludeRegEx,
                $transliterator
            );
            // Compare against '' rather than using empty(), which would also
            // reject the legitimate key "0".
            if ($normalizedTitle !== '') {
                self::appendWorkKey($dom, 'UT ' . $normalizedTitle);
            }
        }

        // Normalize every author exactly once, up front. This keeps the work
        // out of the title loop and means the authors iterable is traversed
        // only a single time. Authors that normalize to nothing are dropped so
        // that no "AT  <title>" key with a missing author is ever produced.
        $authors = is_iterable($authors) ? $authors : (array)$authors;
        $normalizedAuthors = [];
        foreach ($authors as $author) {
            $normalizedAuthor = self::normalize(
                $author,
                $includeRegEx,
                $excludeRegEx,
                $transliterator
            );
            if ($normalizedAuthor !== '') {
                $normalizedAuthors[] = $normalizedAuthor;
            }
        }

        // Exit early if there are no usable authors, since we can't make
        // author/title keys:
        if ($normalizedAuthors === []) {
            return $dom;
        }

        // Flatten both title inputs to plain lists; discarding the original
        // keys ensures array_merge() cannot overwrite one title with another.
        $titles = $titles instanceof \Traversable
            ? iterator_to_array($titles, false) : array_values((array)$titles);
        $trimmedTitles = $trimmedTitles instanceof \Traversable
            ? iterator_to_array($trimmedTitles, false)
            : array_values((array)$trimmedTitles);

        $normalizedTitles = [];
        foreach (array_merge($titles, $trimmedTitles) as $title) {
            $normalizedTitle = self::normalize(
                $title,
                $includeRegEx,
                $excludeRegEx,
                $transliterator
            );
            if (
                $normalizedTitle === '' // skip empties
                // avoid dupes; strict so that numeric-looking titles such as
                // "1000" and "1e3" are not considered equal
                || in_array($normalizedTitle, $normalizedTitles, true)
            ) {
                continue;
            }
            $normalizedTitles[] = $normalizedTitle;
            foreach ($normalizedAuthors as $normalizedAuthor) {
                self::appendWorkKey(
                    $dom,
                    'AT ' . $normalizedAuthor . ' ' . $normalizedTitle
                );
            }
        }

        return $dom;
    }

    /**
     * Append a <workKey> element containing the given key to the document.
     *
     * The key is added as a text node because the value argument of
     * DOMDocument::createElement() is not escaped, so keys containing "&"
     * would otherwise raise a warning and be truncated.
     *
     * @param DOMDocument $dom Document receiving the key
     * @param string      $key Work key text
     *
     * @return void
     */
    private static function appendWorkKey(DOMDocument $dom, string $key): void
    {
        $element = $dom->createElement('workKey');
        $element->appendChild($dom->createTextNode($key));
        $dom->appendChild($element);
    }

    /**
     * Force a value to a string, even if it's a DOMElement.
     *
     * @param string|DOMElement $string String to normalize
     *
     * @return string
     */
    protected static function deDom($string): string
    {
        // ?? suppresses the property lookup on non-objects, so plain strings
        // fall through to the cast.
        return $string->textContent ?? (string)$string;
    }

    /**
     * Create a key string.
     *
     * The input is transliterated (when a transliterator is supplied) or
     * otherwise Unicode-normalized to form KC, filtered through the include
     * and exclude expressions, lowercased and truncated to 255 characters.
     *
     * @param string|DOMElement $rawString      String to normalize
     * @param string            $includeRegEx   Regular expression defining
     * characters to keep
     * @param string            $excludeRegEx   Regular expression defining
     * characters to remove
     * @param \Transliterator   $transliterator Transliterator (or null for none)
     *
     * @return string
     */
    protected static function normalize(
        $rawString,
        $includeRegEx,
        $excludeRegEx,
        $transliterator
    ) {
        // Handle strings and/or DOM elements:
        $string = self::deDom($rawString);
        $normalized = $transliterator
            ? $transliterator->transliterate($string)
            : Normalizer::normalize($string, Normalizer::FORM_KC);
        // Both intl calls return false on failure (e.g. malformed UTF-8);
        // treat that as an empty key rather than passing false onwards.
        if ($normalized === false) {
            $normalized = '';
        }
        if (!empty($includeRegEx)) {
            preg_match_all($includeRegEx, $normalized, $matches);
            $normalized = implode('', $matches[0] ?? []);
        }
        if (!empty($excludeRegEx)) {
            // preg_replace() returns null on a PCRE error.
            $normalized = preg_replace($excludeRegEx, '', $normalized) ?? '';
        }
        return mb_substr(mb_strtolower($normalized, 'UTF-8'), 0, 255, 'UTF-8');
    }
}