Code Coverage

|                         | Lines            | Functions and Methods | CRAP   | Classes and Traits |
|-------------------------|------------------|-----------------------|--------|--------------------|
| Total                   | 76.00% (114/150) | 30.77% (4/13)         |        | 0.00% (0/1)        |
| Wikipedia               | 76.00% (114/150) | 30.77% (4/13)         | 104.50 | 0.00% (0/1)        |
| __construct             | 100.00% (1/1)    | 100.00% (1/1)         | 1      |                    |
| setLanguage             | 0.00% (0/1)      | 0.00% (0/1)           | 2      |                    |
| get                     | 77.78% (7/9)     | 0.00% (0/1)           | 3.10   |                    |
| alreadyRetrieved        | 75.00% (3/4)     | 0.00% (0/1)           | 2.06   |                    |
| extractImageFromInfoBox | 100.00% (27/27)  | 100.00% (1/1)         | 15     |                    |
| extractInfoBox          | 83.33% (5/6)     | 0.00% (0/1)           | 4.07   |                    |
| extractImageFromBody    | 0.00% (0/16)     | 0.00% (0/1)           | 12     |                    |
| stripImageAndFileLinks  | 100.00% (9/9)    | 100.00% (1/1)         | 6      |                    |
| sanitizeWikipediaBody   | 100.00% (15/15)  | 100.00% (1/1)         | 1      |                    |
| checkForRedirect        | 83.33% (10/12)   | 0.00% (0/1)           | 5.12   |                    |
| extractBodyText         | 80.00% (4/5)     | 0.00% (0/1)           | 2.03   |                    |
| parseWikipedia          | 80.00% (20/25)   | 0.00% (0/1)           | 7.39   |                    |
| getWikipediaImageURL    | 65.00% (13/20)   | 0.00% (0/1)           | 10.74  |                    |
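For reference, the per-method CRAP ("Change Risk Anti-Patterns") scores above follow the standard formula, where comp(m) is the method's cyclomatic complexity and cov(m) its line coverage expressed as a fraction:

$$\mathrm{CRAP}(m) = \mathrm{comp}(m)^2 \cdot \bigl(1 - \mathrm{cov}(m)\bigr)^3 + \mathrm{comp}(m)$$

A fully covered method scores exactly its complexity (e.g., extractImageFromInfoBox: 15), while untested paths are penalized heavily: getWikipediaImageURL's 10.74 corresponds to a complexity of 8 at 65.00% coverage, since 8^2 · 0.35^3 + 8 ≈ 10.74.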
<?php

/**
 * Wikipedia connection class
 *
 * PHP version 8
 *
 * Copyright (C) Villanova University 2010.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * @category VuFind
 * @package  Connection
 * @author   Chris Hallberg <challber@villanova.edu>
 * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
 * @link     https://vufind.org/wiki/development Wiki
 */

namespace VuFind\Connection;

use VuFind\I18n\Translator\TranslatorAwareInterface;

use function count;
use function is_array;
use function strlen;

/**
 * Wikipedia connection class
 *
 * @category VuFind
 * @package  Connection
 * @author   Chris Hallberg <challber@villanova.edu>
 * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
 * @link     https://vufind.org/wiki/development Wiki
 */
class Wikipedia implements TranslatorAwareInterface
{
    use \VuFind\I18n\Translator\TranslatorAwareTrait;

    /**
     * HTTP client
     *
     * @var \Laminas\Http\Client
     */
    protected $client;

    /**
     * Selected language
     *
     * @var string
     */
    protected $lang = 'en';

    /**
     * Log of Wikipedia pages already retrieved
     *
     * @var array
     */
    protected $pagesRetrieved = [];

    /**
     * Constructor
     *
     * @param \Laminas\Http\Client $client HTTP client
     */
    public function __construct(\Laminas\Http\Client $client)
    {
        $this->client = $client;
    }

    /**
     * Set language
     *
     * @param string $lang Language
     *
     * @return void
     */
    public function setLanguage($lang)
    {
        $this->lang = substr($lang, 0, 2); // strip off regional suffixes
    }
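    // For illustration (hypothetical call): a regional locale such as 'de-DE'
    // or 'pt-BR' is reduced to its two-letter prefix, so later requests go to
    // https://de.wikipedia.org or https://pt.wikipedia.org respectively:
    //
    //     $wikipedia->setLanguage('de-DE'); // $this->lang === 'de'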

    /**
     * This method is responsible for connecting to Wikipedia via the MediaWiki
     * API and pulling the content for the relevant author.
     *
     * @param string $author The author name to search for
     *
     * @return ?array
     */
    public function get($author)
    {
        // Don't retrieve the same page multiple times; this indicates a loop
        // that needs to be broken!
        if ($this->alreadyRetrieved($author)) {
            return [];
        }

        // Get information from Wikipedia API
        $uri = 'https://' . $this->lang . '.wikipedia.org/w/api.php' .
            '?action=query&prop=revisions&rvprop=content&format=php' .
            '&list=allpages&titles=' . urlencode($author);

        $response = $this->client->setUri($uri)->setMethod('GET')->send();
        if ($response->isSuccess()) {
            return $this->parseWikipedia(unserialize($response->getBody()));
        }
        return null;
    }
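    // For illustration, a lookup for "Johann Sebastian Bach" in the default
    // language issues a request like (line breaks added for readability):
    //
    //     https://en.wikipedia.org/w/api.php?action=query&prop=revisions
    //         &rvprop=content&format=php&list=allpages
    //         &titles=Johann+Sebastian+Bach
    //
    // format=php makes the API return a PHP-serialized payload, which is why
    // get() runs the response body through unserialize() before handing it to
    // parseWikipedia().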

    /**
     * Check if a page has already been retrieved; if it hasn't, flag it as
     * retrieved for future reference.
     *
     * @param string $author Author being retrieved
     *
     * @return bool
     */
    protected function alreadyRetrieved($author)
    {
        if (isset($this->pagesRetrieved[$author])) {
            return true;
        }
        $this->pagesRetrieved[$author] = true;
        return false;
    }

    /**
     * Extract image information from an infobox
     *
     * @param string $infoboxStr Infobox text
     *
     * @return array Array with two values: image name and image caption
     */
    protected function extractImageFromInfoBox($infoboxStr)
    {
        $imageName = $imageCaption = null;

        // Get rid of the last pair of braces and split
        $infobox = explode(
            "\n|",
            preg_replace('/^\s+|/m', '', substr($infoboxStr, 2, -2))
        );

        // Look through every row of the infobox
        foreach ($infobox as $row) {
            $data = explode('=', $row);
            $key = trim(array_shift($data));
            $value = trim(implode('=', $data));

            // At the moment we only want stuff related to the image.
            switch (strtolower($key)) {
                case 'img':
                case 'image':
                case 'image:':
                case 'image_name':
                case 'imagem':
                case 'imagen':
                case 'immagine':
                    $imageName = str_replace(' ', '_', $value);
                    break;
                case 'caption':
                case 'img_capt':
                case 'image_caption':
                case 'legenda':
                case 'textoimagen':
                    $imageCaption = $value;
                    break;
                default:
                    /* Nothing else... yet */
                    break;
            }
        }

        return [$imageName, $imageCaption];
    }
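    // A sketch of the expected input/output, using hypothetical wikitext:
    //
    //     {{Infobox person
    //     | name    = Johann Sebastian Bach
    //     | image   = Bach portrait.jpg
    //     | caption = Bach in 1748
    //     }}
    //
    // yields ['Bach_portrait.jpg', 'Bach in 1748']; spaces in the image name
    // are converted to underscores to match Wikipedia file-name conventions.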

    /**
     * Support method for parseWikipedia - extract infobox details
     *
     * @param array $body The Wikipedia response to parse
     *
     * @return ?string
     */
    protected function extractInfoBox($body)
    {
        // We are looking for the infobox inside "{{...}}"
        // It may contain nested blocks too, thus the recursion
        preg_match_all('/\{([^{}]++|(?R))*\}/s', $body['*'], $matches);

        foreach ($matches[1] as $m) {
            // Check if this is the Infobox; name may vary by language
            $infoboxTags = [
                'Bio', 'Ficha de escritor', 'Infobox', 'Info/Biografia',
            ];
            foreach ($infoboxTags as $tag) {
                if (str_starts_with($m, '{' . $tag)) {
                    // We found an infobox!!
                    return '{' . $m . '}';
                }
            }
        }

        return null;
    }
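    // Note on the pattern above: (?R) recurses into the whole expression, so a
    // template containing nested templates, e.g. {{Infobox ... {{birth date|1685}} ...}},
    // is matched as one block. Because a repeated capturing group keeps only its
    // last repetition, $matches[1] holds the inner "{Infobox ...}" text, which
    // is why one pair of braces is stripped by the match and re-added on return.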

    /**
     * Support method for parseWikipedia - extract first image from body
     *
     * @param array $body The Wikipedia response to parse
     *
     * @return array
     */
    protected function extractImageFromBody($body)
    {
        $imageName = $imageCaption = null;
        // The tag marking image files will vary depending on API language:
        $tags = [
            'Archivo', 'Bestand', 'Datei', 'Ficheiro', 'Fichier', 'File', 'Image',
        ];
        $pattern = '/(\x5b\x5b)('
            . implode('|', $tags)
            . '):([^\x5d]*\.jpg[^\x5d]*)(\x5d\x5d)/U';
        preg_match_all($pattern, $body['*'], $matches);
        if (isset($matches[3][0])) {
            $parts = explode('|', $matches[3][0]);
            $imageName = str_replace(' ', '_', $parts[0]);
            if (count($parts) > 1) {
                $imageCaption = strip_tags(
                    preg_replace('/({{).*(}})/U', '', $parts[count($parts) - 1])
                );
            }
        }
        return [$imageName, $imageCaption];
    }
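    // Here \x5b and \x5d are the escaped "[" and "]" characters, so the
    // pattern matches image links such as (hypothetical):
    //
    //     [[File:Example portrait.jpg|thumb|A caption]]
    //
    // producing imageName "Example_portrait.jpg" and imageCaption "A caption"
    // (the last pipe-delimited segment, with any {{...}} templates stripped).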

    /**
     * Support method for sanitizeWikipediaBody -- strip image/file links.
     *
     * @param string $body The Wikipedia response to sanitize
     *
     * @return string
     */
    protected function stripImageAndFileLinks($body)
    {
        // Remove unwanted image/file links
        // Nested brackets make this annoying: We can't add 'File' or 'Image' as
        // mandatory because the recursion fails, or as optional because then
        // normal links get hit.
        // ... unless there's a better pattern? TODO
        // e.g. [[File:Johann Sebastian Bach.jpg|thumb|Bach in a 1748 portrait by
        // [[Elias Gottlob Haussmann|Haussmann]]]]
        $open = '\\[';
        $close = '\\]';
        $content = '(?>[^\\[\\]]+)'; // Anything but [ or ]
        // We can either find content or recursive brackets:
        $recursive_match = "($content|(?R))*";
        preg_match_all("/{$open}{$recursive_match}{$close}/Us", $body, $new_matches);
        // Loop through every match (link) we found
        if (is_array($new_matches)) {
            foreach ($new_matches as $nm) {
                foreach ((array)$nm as $n) {
                    // If it's a file link get rid of it
                    if (
                        str_starts_with(strtolower($n), '[[file:')
                        || str_starts_with(strtolower($n), '[[image:')
                    ) {
                        $body = str_replace($n, '', $body);
                    }
                }
            }
        }
        return $body;
    }
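    // Applied to the example in the comment above, the entire nested link
    //
    //     [[File:Johann Sebastian Bach.jpg|thumb|Bach in a 1748 portrait by
    //     [[Elias Gottlob Haussmann|Haussmann]]]]
    //
    // is removed in one piece, while an ordinary link such as
    // [[Elias Gottlob Haussmann|Haussmann]] elsewhere in the text survives.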

    /**
     * Support method for parseWikipedia - fix up details in the body
     *
     * @param string $body The Wikipedia response to sanitize
     *
     * @return string
     */
    protected function sanitizeWikipediaBody($body)
    {
        // Cull our content back to everything before the first heading
        $body = trim(substr($body, 0, strpos($body, '==')));

        // Strip out links
        $body = $this->stripImageAndFileLinks($body);

        // Initialize arrays of processing instructions
        $pattern = [];
        $replacement = [];

        // Convert wikipedia links
        $pattern[] = '/(\x5b\x5b)([^\x5d|]*)(\x5d\x5d)/Us';
        $replacement[]
            = '<a href="___baseurl___?lookfor=%22$2%22&type=AllFields">$2</a>';
        $pattern[] = '/(\x5b\x5b)([^\x5d]*)\x7c([^\x5d]*)(\x5d\x5d)/Us';
        $replacement[]
            = '<a href="___baseurl___?lookfor=%22$2%22&type=AllFields">$3</a>';

        // Fix pronunciation guides
        $pattern[] = '/({{)pron-en\|([^}]*)(}})/Us';
        $replacement[] = $this->translate('pronounced') . ' /$2/';

        // Fix dashes
        $pattern[] = '/{{ndash}}/';
        $replacement[] = ' - ';

        // Removes citations
        $pattern[] = '/({{)[^}]*(}})/Us';
        $replacement[] = '';
        // <ref ... > ... </ref> OR <ref> ... </ref>
        $pattern[] = '/<ref[^\/]*>.*<\/ref>/Us';
        $replacement[] = '';
        // <ref ... />
        $pattern[] = '/<ref.*\/>/Us';
        $replacement[] = '';

        // Removes comments followed by carriage returns to avoid excess whitespace
        $pattern[] = '/<!--.*-->\n*/Us';
        $replacement[] = '';

        // Formatting
        $pattern[] = "/'''([^']*)'''/Us";
        $replacement[] = '<strong>$1</strong>';

        // Trim leading newlines (which can result from leftovers after stripping
        // other items above). We want this to be greedy.
        $pattern[] = '/^\n*/s';
        $replacement[] = '';

        // Convert multiple newlines into two breaks
        // We DO want this to be greedy
        $pattern[] = "/\n{2,}/s";
        $replacement[] = '<br><br>';

        return preg_replace($pattern, $replacement, $body);
    }
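    // A small before/after sketch, using hypothetical wikitext:
    //
    //     '''Bach''' was a composer.<ref>Grove</ref> See [[Leipzig|the city]].
    //
    //     ==Life==
    //     ...
    //
    // becomes (wrapped here for readability):
    //
    //     <strong>Bach</strong> was a composer. See
    //     <a href="___baseurl___?lookfor=%22Leipzig%22&type=AllFields">the city</a>.
    //
    // Note that the initial cull assumes at least one "==" heading exists;
    // strpos() returning false would otherwise empty the body.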

    /**
     * Check for redirection in the Wikipedia response
     *
     * @param array $body Response body
     *
     * @return array
     */
    protected function checkForRedirect($body)
    {
        $name = $redirectTo = $page = null;

        // Loop through the pages and find the first that isn't a redirect:
        foreach ($body['query']['pages'] as $page) {
            $name = $page['title'];

            // Get the latest revision
            $page = array_shift($page['revisions']);
            // Check for redirection
            $as_lines = explode("\n", $page['*']);
            $redirectTo = false;
            $redirectTokens = ['#REDIRECT', '#WEITERLEITUNG', '#OMDIRIGERING'];
            foreach ($redirectTokens as $redirectToken) {
                if (stristr($as_lines[0], $redirectToken)) {
                    preg_match('/\[\[(.*)\]\]/', $as_lines[0], $matches);
                    $redirectTo = $matches[1];
                    break;
                }
            }
            if (!$redirectTo) {
                break;
            }
        }

        return [$name, $redirectTo, $page];
    }
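    // A redirect page's first line looks like (hypothetical):
    //
    //     #REDIRECT [[Johann Sebastian Bach]]
    //
    // in which case $redirectTo becomes "Johann Sebastian Bach" and the caller
    // re-queries under that title. #WEITERLEITUNG and #OMDIRIGERING cover
    // German and Scandinavian wikis; stristr() matches all tokens
    // case-insensitively.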

    /**
     * Extract body text
     *
     * @param array  $body       Body details
     * @param string $infoboxStr Infobox found within body (if any)
     *
     * @return string
     */
    protected function extractBodyText($body, $infoboxStr)
    {
        if ($infoboxStr) {
            // Start of the infobox
            $start = strpos($body['*'], $infoboxStr);
            // + the length of the infobox
            $offset = strlen($infoboxStr);
            // Everything after the infobox
            return substr($body['*'], $start + $offset);
        }
        // No infobox -- use whole thing:
        return $body['*'];
    }

    /**
     * This method is responsible for parsing the output from the Wikipedia
     * API.
     *
     * @param array $rawBody The Wikipedia response to parse
     *
     * @return ?array
     * @author Rushikesh Katikar <rushikesh.katikar@gmail.com>
     */
    protected function parseWikipedia($rawBody)
    {
        $imageName = null;
        $imageCaption = null;
        // Check if data exists or not
        if (isset($rawBody['query']['pages']['-1'])) {
            return null;
        }

        // Check for redirects; get some basic information:
        [$name, $redirectTo, $bodyArr] = $this->checkForRedirect($rawBody);

        // Recurse if we only found redirects:
        if ($redirectTo) {
            return $this->get($redirectTo);
        }

        /* Infobox */
        $infoboxStr = $this->extractInfoBox($bodyArr);

        /* Body */
        $bodyStr = $this->extractBodyText($bodyArr, $infoboxStr);
        $info = [
            'name' => $name,
            'description' => $this->sanitizeWikipediaBody($bodyStr),
            'wiki_lang' => $this->lang,
        ];

        /* Image */

        // Try to find an image in either the infobox or the body:
        if ($infoboxStr) {
            [$imageName, $imageCaption]
                = $this->extractImageFromInfoBox($infoboxStr);
        }
        if (!isset($imageName)) {
            [$imageName, $imageCaption] = $this->extractImageFromBody($bodyArr);
        }

        // Given an image name found above, look up the associated URL and add it to
        // our return array:
        if (isset($imageName)) {
            $imageUrl = $this->getWikipediaImageURL($imageName);
            if ($imageUrl != false) {
                $info['image'] = $imageUrl;
                $info['altimage'] = $imageCaption ?? $name;
            }
        }

        return $info;
    }
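    // On success the method returns an array shaped like (values hypothetical):
    //
    //     [
    //         'name'        => 'Johann Sebastian Bach',
    //         'description' => '<strong>Johann Sebastian Bach</strong> was ...',
    //         'wiki_lang'   => 'en',
    //         'image'       => 'https://upload.wikimedia.org/...', // optional
    //         'altimage'    => 'Bach in a 1748 portrait',          // optional
    //     ]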

    /**
     * This method is responsible for obtaining an image URL based on a name.
     *
     * @param string $imageName The image name to look up
     *
     * @return mixed URL on success, false on failure
     */
    protected function getWikipediaImageURL($imageName)
    {
        $imageUrl = null;
        $url = "https://{$this->lang}.wikipedia.org/w/api.php" .
            '?prop=imageinfo&action=query&iiprop=url&iiurlwidth=150&format=php' .
            '&titles=Image:' . urlencode($imageName);

        try {
            $result = $this->client->setUri($url)->setMethod('GET')->send();
        } catch (\Exception $e) {
            return false;
        }
        if (!$result->isSuccess()) {
            return false;
        }

        if ($response = $result->getBody()) {
            if ($imageinfo = unserialize($response)) {
                if (
                    isset($imageinfo['query']['pages']['-1']['imageinfo'][0]['url'])
                ) {
                    $imageUrl
                        = $imageinfo['query']['pages']['-1']['imageinfo'][0]['url'];
                }

                // Hack for the Wikipedia API: in case we couldn't find the URL
                // above, look for an http(s) URL anywhere inside the response.
                if (!isset($imageUrl)) {
                    preg_match('/\"https?:\/\/(.*)\"/', $response, $matches);
                    if (isset($matches[1])) {
                        $imageUrl = 'https://' .
                            substr($matches[1], 0, strpos($matches[1], '"'));
                    }
                }
            }
        }

        return $imageUrl ?? false;
    }
}
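
// Minimal usage sketch (assumes a configured Laminas HTTP client; in VuFind
// itself the class is built and translator-injected by the service container):
//
//     $wikipedia = new \VuFind\Connection\Wikipedia(new \Laminas\Http\Client());
//     $wikipedia->setLanguage('en');
//     $info = $wikipedia->get('Johann Sebastian Bach');
//     if (!empty($info)) {
//         echo $info['name'], ': ', $info['description'];
//     }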