Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
81.93% covered (warning)
81.93%
68 / 83
33.33% covered (danger)
33.33%
2 / 6
CRAP
0.00% covered (danger)
0.00%
0 / 1
WebCrawlCommand
81.93% covered (warning)
81.93%
68 / 83
33.33% covered (danger)
33.33%
2 / 6
29.99
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
1
 configure
100.00% covered (success)
100.00%
14 / 14
100.00% covered (success)
100.00%
1 / 1
1
 downloadFile
0.00% covered (danger)
0.00%
0 / 3
0.00% covered (danger)
0.00%
0 / 1
2
 removeTempFile
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 harvestSitemap
78.79% covered (warning)
78.79%
26 / 33
0.00% covered (danger)
0.00%
0 / 1
10.95
 execute
85.71% covered (warning)
85.71%
24 / 28
0.00% covered (danger)
0.00%
0 / 1
12.42
1<?php
2
3/**
4 * Console command: web crawler
5 *
6 * PHP version 8
7 *
8 * Copyright (C) Villanova University 2020.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2,
12 * as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
22 *
23 * @category VuFind
24 * @package  Console
25 * @author   Demian Katz <demian.katz@villanova.edu>
26 * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
27 * @link     https://vufind.org/wiki/development Wiki
28 */
29
30namespace VuFindConsole\Command\Import;
31
32use Laminas\Config\Config;
33use Symfony\Component\Console\Attribute\AsCommand;
34use Symfony\Component\Console\Command\Command;
35use Symfony\Component\Console\Input\InputInterface;
36use Symfony\Component\Console\Input\InputOption;
37use Symfony\Component\Console\Output\OutputInterface;
38use VuFind\Solr\Writer;
39use VuFind\XSLT\Importer;
40
/**
 * Console command: web crawler
 *
 * Walks the sitemap URLs listed in the webcrawl configuration, hands every
 * sitemap that contains URLs to the XSLT importer and, outside of test mode,
 * deletes records whose last_indexed value predates the start of the run before
 * committing and optimizing the target index.
 *
 * @category VuFind
 * @package  Console
 * @author   Demian Katz <demian.katz@villanova.edu>
 * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
 * @link     https://vufind.org/wiki/development Wiki
 */
#[AsCommand(
    name: 'import/webcrawl',
    description: 'Web crawler'
)]
class WebCrawlCommand extends Command
{
    /**
     * XSLT importer
     *
     * @var Importer
     */
    protected $importer;

    /**
     * Solr writer
     *
     * @var Writer
     */
    protected $solr;

    /**
     * Configuration from webcrawl.ini
     *
     * @var Config
     */
    protected $config;

    /**
     * Constructor
     *
     * @param Importer    $importer XSLT importer
     * @param Writer      $solr     Solr writer
     * @param Config      $config   Configuration from webcrawl.ini
     * @param string|null $name     The name of the command; passing null means it
     * must be set in configure()
     */
    public function __construct(
        Importer $importer,
        Writer $solr,
        Config $config,
        $name = null
    ) {
        $this->importer = $importer;
        $this->solr = $solr;
        $this->config = $config;
        parent::__construct($name);
    }

    /**
     * Configure the command.
     *
     * Registers the help text plus the --test-only flag and the --index option
     * (which defaults to "SolrWeb").
     *
     * @return void
     */
    protected function configure()
    {
        $this
            ->setHelp('Crawls websites to populate VuFind\'s web index.')
            ->addOption(
                'test-only',
                null,
                InputOption::VALUE_NONE,
                'activates test mode, which displays output without updating Solr'
            )->addOption(
                'index',
                null,
                InputOption::VALUE_OPTIONAL,
                'name of search backend to index content into',
                'SolrWeb'
            );
    }

    /**
     * Download a URL to a temporary file.
     *
     * The file is created in the system temporary directory. If the download
     * fails, the temporary file is still created but left empty, so the caller
     * will detect the problem when it tries to parse the file.
     *
     * @param string $url URL to download
     *
     * @return string     Filename of downloaded content
     */
    protected function downloadFile($url)
    {
        // Ask PHP for the temp directory instead of assuming a Unix-style /tmp.
        $file = tempnam(sys_get_temp_dir(), 'sitemap');
        $contents = file_get_contents($url);
        // A failed download yields false; write an empty file in that case.
        file_put_contents($file, $contents === false ? '' : $contents);
        return $file;
    }

    /**
     * Remove a temporary file.
     *
     * @param string $file Name of file to delete
     *
     * @return void
     */
    protected function removeTempFile($file)
    {
        unlink($file);
    }

    /**
     * Process a sitemap URL, either harvesting its contents directly or recursively
     * reading in child sitemaps.
     *
     * A sitemap that cannot be loaded or parsed counts as an error. A failure in
     * one child sitemap does not stop the remaining children from being processed.
     * The temporary download is always removed, even if processing throws.
     *
     * @param OutputInterface $output   Output object
     * @param string          $url      URL of sitemap to read.
     * @param bool            $verbose  Are we in verbose mode?
     * @param string          $index    Solr index to update
     * @param bool            $testMode Are we in test mode?
     *
     * @return bool           True on success, false on error.
     */
    protected function harvestSitemap(
        OutputInterface $output,
        $url,
        $verbose = false,
        $index = 'SolrWeb',
        $testMode = false
    ) {
        if ($verbose) {
            $output->writeln("Harvesting $url...");
        }

        $retVal = true;

        $file = $this->downloadFile($url);
        try {
            $xml = simplexml_load_file($file);
            // Compare against false explicitly: an empty but well-formed document
            // produces a falsy SimpleXMLElement, and that is not an error.
            if ($xml === false) {
                if ($verbose) {
                    $output->writeln("Unable to parse sitemap: $url");
                }
                return false;
            }

            // Are there any child sitemaps?  If so, pull them in:
            $results = $xml->sitemap ?? [];
            foreach ($results as $current) {
                if (!isset($current->loc)) {
                    continue;
                }
                $success = $this->harvestSitemap(
                    $output,
                    (string)$current->loc,
                    $verbose,
                    $index,
                    $testMode
                );
                if (!$success) {
                    $retVal = false;
                }
            }

            // Only import the current sitemap if it contains URLs!
            if (isset($xml->url)) {
                try {
                    $result = $this->importer->save(
                        $file,
                        'sitemap.properties',
                        $index,
                        $testMode
                    );
                    if ($testMode) {
                        $output->writeln($result);
                    }
                } catch (\Exception $e) {
                    if ($verbose) {
                        $output->writeln($e::class . ': ' . $e->getMessage());
                    }
                    $retVal = false;
                }
            }
        } finally {
            // Clean up on every exit path, including early returns and throws.
            $this->removeTempFile($file);
        }
        return $retVal;
    }

    /**
     * Run the command.
     *
     * Every configured sitemap is harvested even if an earlier one failed.
     * Outside of test mode, outdated records are then deleted and the index is
     * committed and optimized; this happens regardless of harvest errors.
     *
     * @param InputInterface  $input  Input object
     * @param OutputInterface $output Output object
     *
     * @return int 0 for success, 1 if any sitemap could not be harvested
     */
    protected function execute(InputInterface $input, OutputInterface $output)
    {
        // Get command line parameters:
        $testMode = (bool)$input->getOption('test-only');
        $index = $input->getOption('index');

        // Get the time we started indexing -- we'll delete records older than this
        // date after everything is finished. Note that we subtract a few seconds
        // for safety.
        // NOTE(review): date() formats in the default timezone while the literal
        // "Z" suffix denotes UTC. Left unchanged because the convention used when
        // last_indexed is written is not visible here -- confirm before changing.
        $startTime = date('Y-m-d\TH:i:s\Z', time() - 5);

        // Are we in verbose mode?
        $verbose = ($this->config->General->verbose ?? false)
            || ($input->hasOption('verbose') && $input->getOption('verbose'));

        // Loop through sitemap URLs in the config file. The harvest call must not
        // sit on the right-hand side of "||": short-circuit evaluation would skip
        // every sitemap after the first failure, and the cleanup below would then
        // delete the skipped sitemaps' records as outdated.
        $error = false;
        foreach ($this->config->Sitemaps->url ?? [] as $current) {
            $success = $this->harvestSitemap(
                $output,
                $current,
                $verbose,
                $index,
                $testMode
            );
            if (!$success) {
                $error = true;
            }
        }
        if ($error) {
            $output->writeln('Error encountered during harvest.');
        }

        // Skip Solr operations if we're in test mode.
        if (!$testMode) {
            if ($verbose) {
                $output->writeln("Deleting old records (prior to $startTime)...");
            }
            // Perform the delete of outdated records:
            $this->solr
                ->deleteByQuery($index, 'last_indexed:[* TO ' . $startTime . ']');
            if ($verbose) {
                $output->writeln('Committing...');
            }
            $this->solr->commit($index);
            if ($verbose) {
                $output->writeln('Optimizing...');
            }
            $this->solr->optimize($index);
        }
        return $error ? self::FAILURE : self::SUCCESS;
    }
}