Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
62.43% |
118 / 189 |
|
55.17% |
16 / 29 |
CRAP | |
0.00% |
0 / 1 |
VuFind | |
62.43% |
118 / 189 |
|
55.17% |
16 / 29 |
468.03 | |
0.00% |
0 / 1 |
setServiceLocator | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getChangeTracker | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
getConfig | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
getFirstIndexed | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
getLastIndexed | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
harvestTextFile | |
66.67% |
4 / 6 |
|
0.00% |
0 / 1 |
3.33 | |||
getParser | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
20 | |||
harvestWithParser | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
20 | |||
getApertureCommand | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
12 | |||
stripBadChars | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
harvestWithAperture | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
20 | |||
getTikaCommand | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
6 | |||
harvestWithTika | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
6 | |||
mapString | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
4 | |||
stripArticles | |
87.50% |
7 / 8 |
|
0.00% |
0 / 1 |
4.03 | |||
stripAccents | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
stripPunctuation | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
removeOuterBrackets | |
76.92% |
10 / 13 |
|
0.00% |
0 / 1 |
11.23 | |||
solrMarcStyleCleanData | |
78.57% |
11 / 14 |
|
0.00% |
0 / 1 |
8.63 | |||
titleSortLower | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
1 | |||
xmlAsText | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
removeTagAndReturnXMLasText | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
explode | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 | |||
implode | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
extractBestDateOrRange | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
extractEarliestYear | |
100.00% |
16 / 16 |
|
100.00% |
1 / 1 |
9 | |||
isInvertedName | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
3 | |||
invertName | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
invertNames | |
88.89% |
8 / 9 |
|
0.00% |
0 / 1 |
3.01 |
1 | <?php |
2 | |
3 | /** |
4 | * XSLT importer support methods. |
5 | * |
6 | * PHP version 8 |
7 | * |
8 | * Copyright (c) Demian Katz 2010. |
9 | * |
10 | * This program is free software; you can redistribute it and/or modify |
11 | * it under the terms of the GNU General Public License version 2, |
12 | * as published by the Free Software Foundation. |
13 | * |
14 | * This program is distributed in the hope that it will be useful, |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | * GNU General Public License for more details. |
18 | * |
19 | * You should have received a copy of the GNU General Public License |
20 | * along with this program; if not, write to the Free Software |
21 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
22 | * |
23 | * @category VuFind |
24 | * @package Import_Tools |
25 | * @author Demian Katz <demian.katz@villanova.edu> |
26 | * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License |
27 | * @link https://vufind.org/wiki/indexing Wiki |
28 | */ |
29 | |
30 | namespace VuFind\XSLT\Import; |
31 | |
32 | use DOMDocument; |
33 | |
34 | use function count; |
35 | use function in_array; |
36 | use function is_callable; |
37 | use function strlen; |
38 | |
39 | /** |
40 | * XSLT support class -- all methods of this class must be public and static; |
41 | * they will be automatically made available to your XSL stylesheet for use |
42 | * with the php:function() function. |
43 | * |
44 | * @category VuFind |
45 | * @package Import_Tools |
46 | * @author Demian Katz <demian.katz@villanova.edu> |
47 | * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License |
48 | * @link https://vufind.org/wiki/indexing Wiki |
49 | */ |
50 | class VuFind |
51 | { |
52 | /** |
53 | * ISO8601 date format string |
54 | * |
55 | * @var string |
56 | */ |
57 | protected const ISO8601_FORMAT = 'Y-m-d\TH:i:s\Z'; |
58 | |
59 | /** |
60 | * Service locator |
61 | * |
62 | * @var ServiceLocatorInterface |
63 | */ |
64 | protected static $serviceLocator; |
65 | |
/**
 * Register the service locator that the other static helpers read from.
 *
 * @param ServiceLocatorInterface $serviceLocator Locator to store
 *
 * @return void
 */
public static function setServiceLocator($serviceLocator)
{
    // Stored statically because every helper in this class is static.
    static::$serviceLocator = $serviceLocator;
}
77 | |
/**
 * Fetch the change tracker service through the database service plugin
 * manager registered in the service locator.
 *
 * @return \VuFind\Db\Service\ChangeTrackerServiceInterface
 */
public static function getChangeTracker()
{
    $dbServices = static::$serviceLocator
        ->get(\VuFind\Db\Service\PluginManager::class);
    return $dbServices->get(
        \VuFind\Db\Service\ChangeTrackerServiceInterface::class
    );
}
88 | |
/**
 * Look up a named configuration through the config plugin manager.
 *
 * @param string $config Configuration name (defaults to 'config')
 *
 * @return \Laminas\Config\Config
 */
public static function getConfig($config = 'config')
{
    $configManager = static::$serviceLocator
        ->get(\VuFind\Config\PluginManager::class);
    return $configManager->get($config);
}
101 | |
/**
 * Report when this record was first indexed.
 *
 * The modification date is parsed with strtotime() and handed to the change
 * tracker's index() method; the first-indexed value of the returned row is
 * formatted using ISO8601_FORMAT.
 *
 * @param string $core Solr core holding this record.
 * @param string $id   Record ID within specified core.
 * @param string $date Date record was last modified.
 *
 * @return string First index date/time.
 */
public static function getFirstIndexed($core, $id, $date)
{
    $timestamp = strtotime($date);
    $tracker = static::getChangeTracker();
    return $tracker->index($core, $id, $timestamp)
        ->getFirstIndexed()
        ->format(self::ISO8601_FORMAT);
}
117 | |
/**
 * Report when this record was most recently indexed.
 *
 * The modification date is parsed with strtotime() and handed to the change
 * tracker's index() method; the last-indexed value of the returned row is
 * formatted using ISO8601_FORMAT.
 *
 * @param string $core Solr core holding this record.
 * @param string $id   Record ID within specified core.
 * @param string $date Date record was last modified.
 *
 * @return string Latest index date/time.
 */
public static function getLastIndexed($core, $id, $date)
{
    $timestamp = strtotime($date);
    $tracker = static::getChangeTracker();
    return $tracker->index($core, $id, $timestamp)
        ->getLastIndexed()
        ->format(self::ISO8601_FORMAT);
}
133 | |
/**
 * Read a text file (by URL or path) so its contents can be included in the
 * output.
 *
 * @param string $url URL of file to retrieve.
 *
 * @return string File contents, or an empty string when $url is empty.
 *
 * @throws \Exception if the file cannot be read.
 */
public static function harvestTextFile($url)
{
    if (!empty($url)) {
        $contents = file_get_contents($url);
        if (false === $contents) {
            throw new \Exception("Unable to access {$url}.");
        }
        return $contents;
    }

    // Nothing to fetch for a blank URL.
    return '';
}
154 | |
/**
 * Decide which full-text parser to use, based on fulltext.ini.
 *
 * Precedence: an explicit [General] parser setting, then Aperture (when a
 * webcrawler is configured), then Tika (when a path is configured).
 *
 * @return string Name of parser to use (i.e. Aperture or Tika), or 'None'
 * when nothing is configured.
 */
public static function getParser()
{
    $fulltext = static::getConfig('fulltext');

    // Arms are evaluated top to bottom; the first one that is set wins.
    return match (true) {
        isset($fulltext->General->parser) => $fulltext->General->parser,
        isset($fulltext->Aperture->webcrawler) => 'Aperture',
        isset($fulltext->Tika->path) => 'Tika',
        default => 'None',
    };
}
182 | |
/**
 * Harvest a URL with whichever parser getParser() selects.
 *
 * @param string $url URL to harvest
 *
 * @return string Text contents of URL (empty string for an unrecognized
 * parser name)
 */
public static function harvestWithParser($url)
{
    // Parser names are matched case-insensitively.
    return match (strtolower(static::getParser())) {
        'aperture' => static::harvestWithAperture($url),
        'tika' => static::harvestWithTika($url),
        default => '',
    };
}
203 | |
/**
 * Generic method for building Aperture Command
 *
 * The input and output values are passed through escapeshellarg() because
 * the resulting string is meant to be executed by a shell, and the input is
 * typically a URL taken from harvested record data.
 *
 * @param string $input  name of input file | url
 * @param string $output name of output file
 * @param string $method webcrawler | filecrawler
 *
 * @return string command to be executed (empty string when Aperture is not
 * configured in fulltext.ini)
 */
public static function getApertureCommand(
    $input,
    $output,
    $method = 'webcrawler'
) {
    // get the path to our sh/bat from the config
    $settings = static::getConfig('fulltext');
    if (!isset($settings->Aperture->webcrawler)) {
        return '';
    }
    $cmd = $settings->Aperture->webcrawler;

    // if we're using another method - substitute that into the path
    if ($method != 'webcrawler') {
        $cmd = str_replace('webcrawler', $method, $cmd);
    }

    // Quote the variable parts so shell metacharacters in them cannot alter
    // the command line.
    $safeOutput = escapeshellarg((string)$output);
    $safeInput = escapeshellarg((string)$input);

    // return the full command
    return "{$cmd} -o {$safeOutput} -x {$safeInput}";
}
232 | |
/**
 * Strip illegal XML characters from a string.
 *
 * Every character outside the XML 1.0 "Char" production (#x9, #xA, #xD,
 * #x20-#xD7FF, #xE000-#xFFFD, #x10000-#x10FFFF) is replaced by a space.
 *
 * @param string $in String to process
 *
 * @return string
 */
public static function stripBadChars($in)
{
    // Code points must be written as \x{...} and matched in UTF-8 mode;
    // a bare \xHHHH is read by PCRE as \xHH followed by two literal
    // characters, which silently matches the wrong set.
    $badChars = '/[^\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}'
        . '\x{10000}-\x{10FFFF}]/u';
    $clean = preg_replace($badChars, ' ', $in);
    if ($clean === null) {
        // preg_replace() yields null when the subject is not valid UTF-8;
        // fall back to a byte-wise removal of the illegal control characters
        // so the caller still gets usable text.
        $clean = preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F]/', ' ', $in);
    }
    return $clean;
}
245 | |
/**
 * Harvest the contents of a document file (PDF, Word, etc.) using Aperture.
 * This method will only work if Aperture is properly configured in the
 * fulltext.ini file. Without proper configuration, this will simply return an
 * empty string.
 *
 * The temporary XML file is always removed before returning, including on
 * the early-exit paths.
 *
 * @param string $url    URL of file to retrieve.
 * @param string $method webcrawler | filecrawler
 *
 * @return string text contents of file.
 */
public static function harvestWithAperture($url, $method = 'webcrawler')
{
    // Build a filename for temporary XML storage (tempnam creates the file):
    $xmlFile = tempnam(sys_get_temp_dir(), 'apt');
    if ($xmlFile === false) {
        return '';
    }

    try {
        // Determine the base Aperture command (or fail if not configured):
        $aptCmd = static::getApertureCommand($url, $xmlFile, $method);
        if (empty($aptCmd)) {
            return '';
        }

        // Call Aperture:
        exec($aptCmd);

        // If we failed to process the file, give up now:
        if (!file_exists($xmlFile)) {
            return '';
        }
        $xml = file_get_contents($xmlFile);
    } finally {
        // Clean up on every exit path so temp files do not accumulate.
        if (file_exists($xmlFile)) {
            unlink($xmlFile);
        }
    }
    if ($xml === false) {
        return '';
    }

    // Extract and decode the full text from the XML:
    preg_match('/<plainTextContent[^>]*>([^<]*)</ms', $xml, $matches);
    $final = isset($matches[1])
        ? html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8') : '';

    // Send back what we extracted, stripping out any illegal characters that
    // will prevent XML from generating correctly:
    return static::stripBadChars($final);
}
287 | |
/**
 * Generic method for building Tika command
 *
 * The input is passed through escapeshellarg() because the command string
 * is executed by a shell and the input is typically a URL taken from
 * harvested record data. $arg is inserted unchanged so that callers can
 * supply more than one Tika option.
 *
 * @param string $input  url | fileresource
 * @param string $output name of output file
 * @param string $arg    optional Tika arguments
 *
 * @return array|string Parameters for proc_open command (command string,
 * descriptor spec, pipes array), or an empty string when no Tika path is
 * configured in fulltext.ini
 */
public static function getTikaCommand($input, $output, $arg)
{
    $settings = static::getConfig('fulltext');
    if (!isset($settings->Tika->path)) {
        return '';
    }
    $tika = $settings->Tika->path;
    $safeInput = escapeshellarg((string)$input);

    // We need to use this method to get the output from STDOUT into the file
    $descriptorspec = [
        0 => ['pipe', 'r'],
        1 => ['file', $output, 'w'],
        2 => ['pipe', 'w'],
    ];
    return [
        "java -jar $tika $arg -eUTF8 $safeInput", $descriptorspec, [],
    ];
}
315 | |
/**
 * Harvest the contents of a document file (PDF, Word, etc.) using Tika.
 * This method will only work if Tika is properly configured in the
 * fulltext.ini file. Without proper configuration, this will simply return an
 * empty string.
 *
 * The temporary output file is always removed before returning.
 *
 * @param string $url URL of file to retrieve.
 * @param string $arg optional argument(s) for Tika
 *
 * @return string text contents of file.
 */
public static function harvestWithTika($url, $arg = '--text')
{
    // Build a filename for temporary output storage (tempnam creates it):
    $outputFile = tempnam(sys_get_temp_dir(), 'tika');
    if ($outputFile === false) {
        return '';
    }

    try {
        // getTikaCommand() returns an empty value when Tika is unconfigured;
        // honor the documented contract instead of trying to run nothing.
        $tikaCommand = static::getTikaCommand($url, $outputFile, $arg);
        if (empty($tikaCommand)) {
            return '';
        }
        [$command, $descriptors] = $tikaCommand;

        $pipes = [];
        $process = proc_open($command, $descriptors, $pipes);
        if (!is_resource($process)) {
            return '';
        }
        // Close stdin, and read stderr to the end before waiting on the
        // process so a full pipe buffer cannot stall the child.
        foreach ($pipes as $index => $pipe) {
            if ($index === 2) {
                stream_get_contents($pipe);
            }
            fclose($pipe);
        }
        proc_close($process);

        // If we failed to process the file, give up now:
        $txt = file_exists($outputFile)
            ? file_get_contents($outputFile) : '';
    } finally {
        if (file_exists($outputFile)) {
            unlink($outputFile);
        }
    }

    return $txt === false ? '' : $txt;
}
347 | |
/**
 * Translate a string through a map file from the translation_maps folder.
 *
 * The file is parsed by hand (one "key = value" pair per line, split on the
 * first "=") because PHP's parse_ini_file() is not compatible with
 * SolrMarc's style of properties map.
 *
 * @param string $in       string to map.
 * @param string $filename filename of map file
 *
 * @return string mapped text, or $in unchanged when the map has no entry.
 */
public static function mapString($in, $filename)
{
    $pathResolver = static::$serviceLocator
        ->get(\VuFind\Config\PathResolver::class);
    $path = $pathResolver->getConfigPath($filename, 'import/translation_maps');
    $rows = $path ? file($path) : [];

    $lookup = [];
    foreach ($rows as $row) {
        $pair = explode('=', $row, 2);
        if (!isset($pair[1])) {
            // No "=" on this line, so it defines no mapping.
            continue;
        }
        $lookup[trim($pair[0])] = trim($pair[1]);
    }
    return $lookup[$in] ?? $in;
}
373 | |
/**
 * Lowercase a title and drop one leading English article ("a", "an" or
 * "the") so the result can be used for sorting.
 *
 * @param string $in title to process.
 *
 * @return string article-stripped text.
 */
public static function stripArticles($in)
{
    $normalized = trim($in);
    $normalized = is_callable('mb_strtolower')
        ? mb_strtolower($normalized, 'UTF-8')
        : strtolower($normalized);

    foreach (['a', 'an', 'the'] as $article) {
        // The trailing space makes sure only a whole word is removed.
        $prefix = $article . ' ';
        if (str_starts_with($normalized, $prefix)) {
            return substr($normalized, strlen($prefix));
        }
    }

    return $normalized;
}
398 | |
/**
 * Remove accents from a string using the ICU "Latin-ASCII" transliterator.
 *
 * @param string $str String to process.
 *
 * @return string Processed string.
 */
public static function stripAccents(string $str): string
{
    return \Transliterator::create('Latin-ASCII;')->transliterate($str);
}
411 | |
/**
 * Replace every run of punctuation and/or whitespace with one space, for
 * consistency with SolrMarc behavior.
 *
 * @param string $str String to process.
 *
 * @return string Processed string.
 */
public static function stripPunctuation(string $str): string
{
    $runOfPunctuationOrSpace = '/[[:punct:]\s]+/';
    return preg_replace($runOfPunctuationOrSpace, ' ', $str);
}
425 | |
/**
 * Trim the string, then strip a square bracket from its start and/or end
 * when those brackets (matched or unmatched) are the only square brackets
 * in the string.
 *
 * Ported from SolrMarc's DataUtil class.
 *
 * @param string $str Text string with possible enclosing brackets
 *
 * @return string Processed string with the brackets removed.
 */
public static function removeOuterBrackets(string $str): string
{
    $trimmed = trim($str);
    if ($trimmed === '') {
        return $trimmed;
    }

    $startsOpen = $trimmed[0] === '[';
    $endsClosed = $trimmed[-1] === ']';
    $opens = substr_count($trimmed, '[');
    $closes = substr_count($trimmed, ']');

    // Exactly one matched pair wrapping the whole string.
    if ($startsOpen && $endsClosed && $opens === 1 && $closes === 1) {
        return substr($trimmed, 1, -1);
    }
    // Leading '[' with no ']' anywhere.
    if ($startsOpen && $closes === 0) {
        return substr($trimmed, 1);
    }
    // Trailing ']' with no '[' anywhere.
    if ($endsClosed && $opens === 0) {
        return substr($trimmed, 0, -1);
    }
    return $trimmed;
}
458 | |
/**
 * Port of logic from SolrMarc's DataUtil::cleanData method.
 *
 * Repeatedly trims the string, drops one trailing ",", "/", ";" or ":"
 * (with preceding whitespace), drops a trailing period when the preceding
 * text calls for it, and removes outer square brackets, until a pass makes
 * no further change or the string becomes empty.
 *
 * @param string $str String to process.
 *
 * @return string Processed string.
 */
public static function solrMarcStyleCleanData(string $str): string
{
    $needsPeriodStripping = function ($strToCheck) {
        $noStrippingRegex = [
            '/.*[JS]r\.$/', // don't strip period off of Jr. or Sr.
        ];
        $strippingRegex = [
            '/.*\w\w\.$/',
            // The "u" modifier is required here: without it \p{L} is tested
            // against single bytes, so multi-byte UTF-8 letters are misread.
            '/.*\p{L}\p{L}\.$/u',
            // The following regex is unsupported by PHP but retained for reference:
            //'/.*\w\p{InCombiningDiacriticalMarks}?\w\p{InCombiningDiacriticalMarks}?\.$/u',
            '/.*\p{P}\.$/u',
        ];
        foreach ($noStrippingRegex as $regex) {
            if (preg_match($regex, $strToCheck)) {
                return false;
            }
        }
        foreach ($strippingRegex as $regex) {
            if (preg_match($regex, $strToCheck)) {
                return true;
            }
        }
        return false;
    };

    $current = $str;
    do {
        $previous = $current;
        $current = trim($current);
        $current = preg_replace('|\s*([,/;:])$|', '', $current);
        if (str_ends_with($current, '.') && $needsPeriodStripping($current)) {
            // Multi-byte safe removal of the final character (the period).
            $current = mb_substr($current, 0, mb_strlen($current, 'UTF-8') - 1, 'UTF-8');
        }
        $current = static::removeOuterBrackets($current);
        if (strlen($current) === 0) {
            return $current;
        }
    } while ($current !== $previous);
    return $current;
}
509 | |
/**
 * Build a sort key roughly equivalent to SolrMarc's titleSortLower feature,
 * to allow consistent indexing into the title_sort field.
 *
 * Steps, in order: strip accents, collapse punctuation/whitespace, apply
 * SolrMarc-style cleanup, lowercase.
 *
 * @param string $str String to process.
 *
 * @return string Processed string.
 */
public static function titleSortLower(string $str): string
{
    $unaccented = static::stripAccents($str);
    $unpunctuated = static::stripPunctuation($unaccented);
    $cleaned = static::solrMarcStyleCleanData($unpunctuated);
    return mb_strtolower($cleaned, 'UTF-8');
}
529 | |
/**
 * Serialize the provided nodes to XML and return the concatenated text.
 * This is useful for populating the fullrecord field with the raw input
 * XML.
 *
 * @param array $in array of DOMElement objects.
 *
 * @return string XML as string
 */
public static function xmlAsText($in)
{
    $chunks = [];
    foreach ((array)$in as $node) {
        // SimpleXML provides the per-node serialization.
        $chunks[] = simplexml_import_dom($node)->asXML();
    }

    // The pieces are joined as-is, with no separator.
    return implode('', $chunks);
}
555 | |
/**
 * Remove a given tag from the provided nodes, then convert
 * into XML and return as text. This is useful for
 * populating the fullrecord field with the raw input XML but
 * allow for removal of certain elements (eg: full text field).
 *
 * All matching descendants are removed, at any depth. Note that the
 * provided nodes themselves are modified.
 *
 * @param array  $in  array of DOMElement objects.
 * @param string $tag name of tag to remove
 *
 * @return string XML as string
 */
public static function removeTagAndReturnXMLasText($in, $tag)
{
    foreach ((array)$in as $current) {
        // getElementsByTagName() returns a live list; snapshot it first so
        // that removing nodes does not cause later matches to be skipped.
        $matches = iterator_to_array(
            $current->getElementsByTagName($tag),
            false
        );
        foreach ($matches as $match) {
            // Matches may be nested, so detach each one from its own parent
            // rather than assuming it is a direct child of $current.
            $match->parentNode?->removeChild($match);
        }
    }

    return static::xmlAsText($in);
}
578 | |
/**
 * Proxy the explode PHP function for use in XSL transformation.
 *
 * Each piece becomes the text of a "part" element appended to a new
 * document.
 *
 * @param string $delimiter Delimiter for splitting $string
 * @param string $string    String to split
 *
 * @return DOMDocument
 */
public static function explode($delimiter, $string)
{
    $parts = explode($delimiter, $string);
    $dom = new DOMDocument('1.0', 'utf-8');
    foreach ($parts as $part) {
        // The value argument of createElement() is not escaped, so an "&" in
        // the data would be treated as the start of an entity reference.
        // A text node stores the piece literally.
        $element = $dom->createElement('part');
        $element->appendChild($dom->createTextNode($part));
        $dom->appendChild($element);
    }
    return $dom;
}
597 | |
/**
 * Proxy the implode PHP function for use in XSL transformation.
 *
 * The trimmed text content of each element is joined with the glue string.
 *
 * @param string $glue   Glue string
 * @param array  $pieces DOM elements to join together.
 *
 * @return string
 */
public static function implode($glue, $pieces)
{
    $texts = array_map(
        fn ($node) => trim($node->textContent),
        $pieces
    );
    return implode($glue, $texts);
}
613 | |
/**
 * Try to find the best single year or date range in a set of DOM elements.
 * Best is defined as the first value to consist of only YYYY or YYYY-ZZZZ,
 * with no other text. If no "best" match is found, the first value is used.
 * An empty input set yields an empty string.
 *
 * @param array $input DOM elements to search.
 *
 * @return string
 */
public static function extractBestDateOrRange($input)
{
    // Remember the first value while scanning so the fallback does not
    // depend on reset(), which returns false for an empty array.
    $first = null;
    foreach ($input as $current) {
        $text = $current->textContent;
        $first ??= $text;
        if (preg_match('/^\d{4}(-\d{4})?$/', $text)) {
            return $text;
        }
    }
    return $first ?? '';
}
632 | |
/**
 * Try to find a four-digit year in a set of DOM elements.
 *
 * Preferred result: the smallest four-digit number starting with 1 or 2
 * found in any element. Fallback: the first-encountered longest run of up
 * to four digits.
 *
 * @param array $input DOM elements to search.
 *
 * @return string
 */
public static function extractEarliestYear($input)
{
    $best = '';
    $fallback = '';
    foreach ($input as $node) {
        $text = $node->textContent;

        // Preferred candidates: keep the lowest one seen so far.
        if (preg_match_all('/[12]\d{3}/', $text, $years)) {
            foreach ($years[0] as $year) {
                if (empty($best) || $best > $year) {
                    $best = $year;
                }
            }
        }

        // Fallback candidates: try longer digit runs first, and only
        // replace the current fallback with a strictly longer one.
        for ($digits = 4; $digits > 0; --$digits) {
            preg_match_all('/\d{' . $digits . '}/', $text, $found);
            foreach ($found[0] as $candidate) {
                if (strlen($candidate) > strlen($fallback)) {
                    $fallback = $candidate;
                }
            }
        }
    }
    return empty($best) ? $fallback : $best;
}
667 | |
/**
 * Is the provided name inverted ("Last, First") or not ("First Last")?
 *
 * A name without commas is not inverted. When the text after the last comma
 * is a recognized title/suffix (e.g. "Jr."), the name only counts as
 * inverted if another comma precedes it.
 *
 * @param string $name Name to check
 *
 * @return bool
 */
public static function isInvertedName(string $name): bool
{
    $segments = explode(',', $name);
    if (count($segments) < 2) {
        return false;
    }

    // Normalize the final segment: drop surrounding spaces/periods, lowercase.
    $tail = strtolower(trim(array_pop($segments), ' .'));
    $isTitle = in_array(
        $tail,
        ['jr', 'sr', 'dr', 'mrs', 'ii', 'iii', 'iv'],
        true
    );

    // With a title at the end, at least two segments must remain.
    return !$isTitle || count($segments) > 1;
}
691 | |
/**
 * Invert "Firstname Lastname" authors into "Lastname, Firstname."
 *
 * The name is split at its last run of whitespace; everything before it is
 * treated as the forenames. A name that cannot be split is returned as-is.
 *
 * @param string $rawName Raw name
 *
 * @return string
 */
public static function invertName(string $rawName): string
{
    // The lookahead anchors the split to the final whitespace-delimited word.
    $pieces = preg_split('/\s+(?=[^\s]+$)/', $rawName, 2);
    return count($pieces) === 2
        ? "{$pieces[1]}, {$pieces[0]}"
        : $rawName;
}
709 | |
/**
 * Produce a DOMDocument holding one "name" element per input element, with
 * every name in inverted form. Names that are already inverted are copied
 * unchanged; the rest go through invertName().
 *
 * @param array $input DOM elements to adjust
 *
 * @return DOMDocument
 */
public static function invertNames($input): DOMDocument
{
    $result = new DOMDocument('1.0', 'utf-8');
    foreach ($input as $node) {
        $original = $node->textContent;
        $inverted = $original;
        if (!self::isInvertedName($original)) {
            $inverted = self::invertName($original);
        }
        $entry = $result->createElement('name');
        $entry->nodeValue = htmlspecialchars($inverted);
        $result->appendChild($entry);
    }
    return $result;
}
731 | } |