Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
84.67% |
116 / 137 |
|
36.36% |
4 / 11 |
CRAP | |
0.00% |
0 / 1 |
WebCrawlCommand | |
84.67% |
116 / 137 |
|
36.36% |
4 / 11 |
61.74 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
configure | |
100.00% |
20 / 20 |
|
100.00% |
1 / 1 |
1 | |||
downloadFile | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
removeTempFile | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getTransformCachePath | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
updateLastIndexed | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
readFromTransformCache | |
60.00% |
9 / 15 |
|
0.00% |
0 / 1 |
14.18 | |||
indexFromTransformCache | |
87.50% |
7 / 8 |
|
0.00% |
0 / 1 |
3.02 | |||
updateTransformCache | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
harvestSitemap | |
87.50% |
42 / 48 |
|
0.00% |
0 / 1 |
18.63 | |||
execute | |
96.55% |
28 / 29 |
|
0.00% |
0 / 1 |
13 |
1 | <?php |
2 | |
3 | /** |
4 | * Console command: web crawler |
5 | * |
6 | * PHP version 8 |
7 | * |
8 | * Copyright (C) Villanova University 2020. |
9 | * |
10 | * This program is free software; you can redistribute it and/or modify |
11 | * it under the terms of the GNU General Public License version 2, |
12 | * as published by the Free Software Foundation. |
13 | * |
14 | * This program is distributed in the hope that it will be useful, |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | * GNU General Public License for more details. |
18 | * |
19 | * You should have received a copy of the GNU General Public License |
20 | * along with this program; if not, write to the Free Software |
21 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
22 | * |
23 | * @category VuFind |
24 | * @package Console |
25 | * @author Demian Katz <demian.katz@villanova.edu> |
26 | * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License |
27 | * @link https://vufind.org/wiki/development Wiki |
28 | */ |
29 | |
30 | namespace VuFindConsole\Command\Import; |
31 | |
32 | use Laminas\Config\Config; |
33 | use Symfony\Component\Console\Attribute\AsCommand; |
34 | use Symfony\Component\Console\Command\Command; |
35 | use Symfony\Component\Console\Input\InputInterface; |
36 | use Symfony\Component\Console\Input\InputOption; |
37 | use Symfony\Component\Console\Output\OutputInterface; |
38 | use VuFind\Solr\Writer; |
39 | use VuFind\XSLT\Importer; |
40 | use VuFindSearch\Backend\Solr\Document\RawXMLDocument; |
41 | |
42 | use function is_string; |
43 | |
44 | /** |
45 | * Console command: web crawler |
46 | * |
47 | * @category VuFind |
48 | * @package Console |
49 | * @author Demian Katz <demian.katz@villanova.edu> |
50 | * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License |
51 | * @link https://vufind.org/wiki/development Wiki |
52 | */ |
#[AsCommand(
    name: 'import/webcrawl',
    description: 'Web crawler'
)]
class WebCrawlCommand extends Command
{
    /**
     * Should we bypass cache expiration?
     *
     * Populated from the --use-expired-cache option in execute(). When true,
     * cached transform results are used even if the sitemap reports a newer
     * last modification date.
     *
     * @var bool
     */
    protected bool $bypassCacheExpiration = false;

    /**
     * Constructor
     *
     * @param Importer    $importer XSLT importer
     * @param Writer      $solr     Solr writer
     * @param Config      $config   Configuration from webcrawl.ini
     * @param string|null $name     The name of the command; passing null means it
     * must be set in configure()
     */
    public function __construct(
        protected Importer $importer,
        protected Writer $solr,
        protected Config $config,
        $name = null
    ) {
        parent::__construct($name);
    }

    /**
     * Configure the command.
     *
     * Registers the help text and the --test-only, --use-expired-cache and
     * --index options.
     *
     * @return void
     */
    protected function configure()
    {
        $this
            ->setHelp('Crawls websites to populate VuFind\'s web index.')
            ->addOption(
                'test-only',
                null,
                InputOption::VALUE_NONE,
                'activates test mode, which displays output without updating Solr'
            )->addOption(
                'use-expired-cache',
                null,
                InputOption::VALUE_NONE,
                'use cached data, even if expired; useful when the index needs to be quickly rebuilt, '
                . 'e.g. after a Solr upgrade'
            )->addOption(
                'index',
                null,
                InputOption::VALUE_OPTIONAL,
                'name of search backend to index content into',
                'SolrWeb'
            );
    }

    /**
     * Download a URL to a temporary file.
     *
     * If the download itself fails, the (empty) temporary file is still
     * returned; the caller detects the problem when the file fails to parse.
     *
     * @param string $url URL to download
     *
     * @return string Filename of downloaded content
     *
     * @throws \RuntimeException If no temporary file can be created.
     */
    protected function downloadFile($url)
    {
        // Use the platform's temporary directory rather than assuming /tmp.
        $file = tempnam(sys_get_temp_dir(), 'sitemap');
        if ($file === false) {
            throw new \RuntimeException("Unable to create temporary file for $url");
        }
        $contents = file_get_contents($url);
        // tempnam() has already created the file, so a failed download simply
        // leaves it empty instead of passing `false` on to file_put_contents().
        if ($contents !== false) {
            file_put_contents($file, $contents);
        }
        return $file;
    }

    /**
     * Remove a temporary file.
     *
     * @param string $file Name of file to delete
     *
     * @return void
     */
    protected function removeTempFile($file)
    {
        // Guard against a warning if the file has already disappeared.
        if (is_file($file)) {
            unlink($file);
        }
    }

    /**
     * Given a URL, get the transform cache path (or null if the cache
     * is disabled).
     *
     * The cache file name is the MD5 hash of the URL inside the directory
     * configured as Cache -> transform_cache_dir.
     *
     * @param string $url URL to cache
     *
     * @return ?string
     */
    protected function getTransformCachePath(string $url): ?string
    {
        $dir = $this->config->Cache->transform_cache_dir ?? null;
        return $dir ? $dir . '/' . md5($url) : null;
    }

    /**
     * Update the last_indexed dates in a cached XML document to the current
     * time so reindexing cached documents works correctly.
     *
     * The timestamp format matches the one execute() uses for its cleanup
     * cut-off, so refreshed documents survive the "delete old records" step.
     *
     * @param string $xml XML to update
     *
     * @return string
     */
    protected function updateLastIndexed(string $xml): string
    {
        $newDate = date('Y-m-d\TH:i:s\Z');
        return preg_replace(
            '|<field name="last_indexed">([^<]+)</field>|',
            '<field name="last_indexed">' . $newDate . '</field>',
            $xml
        );
    }

    /**
     * Fetch transform cache data for the specified URL; return null if the cache is disabled,
     * the data is expired, or something goes wrong.
     *
     * @param OutputInterface $output  Output object
     * @param string          $url     URL of sitemap to read.
     * @param string          $lastMod Last modification date of URL.
     * @param bool            $verbose Are we in verbose mode?
     *
     * @return ?string
     */
    protected function readFromTransformCache(
        OutputInterface $output,
        string $url,
        string $lastMod,
        bool $verbose
    ): ?string {
        // If cache is write-only, don't retrieve data!
        if ($this->config->Cache->transform_cache_write_only ?? false) {
            return null;
        }
        // If we can't find the data in the cache, we can't proceed.
        $path = $this->getTransformCachePath($url);
        if (!$path || !file_exists($path)) {
            return null;
        }
        // The cache entry is stale when the sitemap was modified after the
        // cache file was written -- unless the user asked to ignore expiration.
        if (!$this->bypassCacheExpiration && strtotime($lastMod) > filemtime($path)) {
            if ($verbose) {
                $output->writeln("Cached data for $url ($path) has expired.");
            }
            return null;
        }
        $rawXml = file_get_contents($path);
        if (!is_string($rawXml)) {
            $output->writeln("WARNING: Problem reading cached data for $url ($path)");
            return null;
        }
        if ($verbose) {
            $output->writeln("Found $url in cache: $path");
        }
        return $rawXml;
    }

    /**
     * Check the cache and configuration to see if the provided URL can
     * be loaded from cache, and load it to Solr if possible.
     *
     * In test mode the refreshed XML is written to the output instead of
     * being saved to Solr.
     *
     * @param OutputInterface $output   Output object
     * @param string          $url      URL of sitemap to read.
     * @param string          $lastMod  Last modification date of URL.
     * @param bool            $verbose  Are we in verbose mode?
     * @param string          $index    Solr index to update
     * @param bool            $testMode Are we in test mode?
     *
     * @return bool True if loaded from cache, false if not.
     */
    protected function indexFromTransformCache(
        OutputInterface $output,
        string $url,
        string $lastMod,
        bool $verbose = false,
        string $index = 'SolrWeb',
        bool $testMode = false
    ): bool {
        $rawXml = $this->readFromTransformCache($output, $url, $lastMod, $verbose);
        if ($rawXml === null) {
            return false;
        }
        $xml = $this->updateLastIndexed($rawXml);
        if ($testMode) {
            $output->writeln($xml);
        } else {
            $this->solr->save($index, new RawXMLDocument($xml));
        }
        return true;
    }

    /**
     * Update the transform cache (if activated). Returns true if the cache was updated,
     * false otherwise.
     *
     * @param string $url    URL to use for cache key
     * @param string $result Result of transforming the URL
     *
     * @return bool
     */
    protected function updateTransformCache(string $url, string $result): bool
    {
        $transformCachePath = $this->getTransformCachePath($url);
        if (!$transformCachePath) {
            return false;
        }
        return false !== file_put_contents($transformCachePath, $result);
    }

    /**
     * Process a sitemap URL, either harvesting its contents directly or recursively
     * reading in child sitemaps.
     *
     * A sitemap that cannot be loaded or parsed is reported as an error, and
     * the temporary download is always removed, even if processing throws.
     *
     * @param OutputInterface $output   Output object
     * @param string          $url      URL of sitemap to read.
     * @param bool            $verbose  Are we in verbose mode?
     * @param string          $index    Solr index to update
     * @param bool            $testMode Are we in test mode?
     *
     * @return bool True on success, false on error.
     */
    protected function harvestSitemap(
        OutputInterface $output,
        $url,
        $verbose = false,
        $index = 'SolrWeb',
        $testMode = false
    ) {
        // Date to use as a default "last modification" date in scenarios where we
        // don't care about cache invalidation.
        $pastDate = '1980-01-01';

        // If we're not concerned about cache expiration, we can potentially
        // short-circuit the process with the cache up front. Otherwise, we'll
        // need to wait until we can get last modification dates to know whether
        // it's safe to rely on cached data.
        if (
            $this->bypassCacheExpiration
            && $this->indexFromTransformCache($output, $url, $pastDate, $verbose, $index, $testMode)
        ) {
            return true;
        }

        if ($verbose) {
            $output->writeln("Harvesting $url...");
        }

        $retVal = true;

        $file = $this->downloadFile($url);
        try {
            $xml = simplexml_load_file($file);
            // A strict check is required here: an empty (but valid) document
            // evaluates as falsy and must not be mistaken for a load failure.
            if ($xml === false) {
                if ($verbose) {
                    $output->writeln("Unable to load sitemap from $url");
                }
                return false;
            }

            // Are there any child sitemaps? If so, pull them in:
            foreach ($xml->sitemap ?? [] as $current) {
                if (!isset($current->loc)) {
                    continue;
                }
                $childUrl = (string)$current->loc;
                // If there's a last modification date (or we're forcing a
                // reindex from the cache) and we can retrieve data from the
                // cache, we can bypass the harvest.
                $cacheEligible = isset($current->lastmod) || $this->bypassCacheExpiration;
                $indexedFromCache = $cacheEligible && $this->indexFromTransformCache(
                    $output,
                    $childUrl,
                    (string)($current->lastmod ?? $pastDate),
                    $verbose,
                    $index,
                    $testMode
                );
                // Keep going after a failed child so one bad sitemap does not
                // prevent its siblings from being harvested.
                if (
                    !$indexedFromCache
                    && !$this->harvestSitemap($output, $childUrl, $verbose, $index, $testMode)
                ) {
                    $retVal = false;
                }
            }

            // Only import the current sitemap if it contains URLs!
            if (isset($xml->url)) {
                try {
                    $result = $this->importer->save(
                        $file,
                        'sitemap.properties',
                        $index,
                        $testMode
                    );
                    if ($result && $this->updateTransformCache($url, $result) && $verbose) {
                        $output->writeln('Wrote results to transform cache.');
                    }
                    if ($testMode) {
                        $output->writeln($result);
                    }
                } catch (\Exception $e) {
                    if ($verbose) {
                        $output->writeln($e::class . ': ' . $e->getMessage());
                    }
                    $retVal = false;
                }
            }
            return $retVal;
        } finally {
            // Runs on every exit path, so the temporary file never leaks.
            $this->removeTempFile($file);
        }
    }

    /**
     * Run the command.
     *
     * Harvests every configured sitemap, then (outside of test mode) deletes
     * records older than the start of the run, commits and optimizes.
     *
     * @param InputInterface  $input  Input object
     * @param OutputInterface $output Output object
     *
     * @return int 0 for success
     */
    protected function execute(InputInterface $input, OutputInterface $output)
    {
        // Get command line parameters:
        $testMode = (bool)$input->getOption('test-only');
        $this->bypassCacheExpiration = (bool)$input->getOption('use-expired-cache');
        // --index takes an optional value; passing the bare flag yields null,
        // so fall back to the default backend in that case.
        $index = $input->getOption('index') ?? 'SolrWeb';

        // Get the time we started indexing -- we'll delete records older than this
        // date after everything is finished. Note that we subtract a few seconds
        // for safety.
        $startTime = date('Y-m-d\TH:i:s\Z', time() - 5);

        // Are we in verbose mode?
        $verbose = ($this->config->General->verbose ?? false)
            || ($input->hasOption('verbose') && $input->getOption('verbose'));

        // Loop through sitemap URLs in the config file (tolerating a missing
        // [Sitemaps] section).
        $error = false;
        foreach ($this->config->Sitemaps->url ?? [] as $current) {
            // Every sitemap must be attempted even after an earlier failure:
            // records of sitemaps that are skipped would otherwise be removed
            // by the cleanup below. Hence no `$error || ...` short-circuit.
            if (!$this->harvestSitemap($output, $current, $verbose, $index, $testMode)) {
                $error = true;
            }
        }
        if ($error) {
            $output->writeln('Error encountered during harvest.');
        }

        // Skip Solr operations if we're in test mode.
        if (!$testMode) {
            if ($verbose) {
                $output->writeln("Deleting old records (prior to $startTime)...");
            }
            // Perform the delete of outdated records:
            $this->solr
                ->deleteByQuery($index, 'last_indexed:[* TO ' . $startTime . ']');
            if ($verbose) {
                $output->writeln('Committing...');
            }
            $this->solr->commit($index);
            if ($verbose) {
                $output->writeln('Optimizing...');
            }
            $this->solr->optimize($index);
        }
        return $error ? 1 : 0;
    }
}