Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
81.93% |
68 / 83 |
|
33.33% |
2 / 6 |
CRAP | |
0.00% |
0 / 1 |
WebCrawlCommand | |
81.93% |
68 / 83 |
|
33.33% |
2 / 6 |
29.99 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
configure | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
1 | |||
downloadFile | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
removeTempFile | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
harvestSitemap | |
78.79% |
26 / 33 |
|
0.00% |
0 / 1 |
10.95 | |||
execute | |
85.71% |
24 / 28 |
|
0.00% |
0 / 1 |
12.42 |
1 | <?php |
2 | |
3 | /** |
4 | * Console command: web crawler |
5 | * |
6 | * PHP version 8 |
7 | * |
8 | * Copyright (C) Villanova University 2020. |
9 | * |
10 | * This program is free software; you can redistribute it and/or modify |
11 | * it under the terms of the GNU General Public License version 2, |
12 | * as published by the Free Software Foundation. |
13 | * |
14 | * This program is distributed in the hope that it will be useful, |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | * GNU General Public License for more details. |
18 | * |
19 | * You should have received a copy of the GNU General Public License |
20 | * along with this program; if not, write to the Free Software |
21 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
22 | * |
23 | * @category VuFind |
24 | * @package Console |
25 | * @author Demian Katz <demian.katz@villanova.edu> |
26 | * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License |
27 | * @link https://vufind.org/wiki/development Wiki |
28 | */ |
29 | |
30 | namespace VuFindConsole\Command\Import; |
31 | |
32 | use Laminas\Config\Config; |
33 | use Symfony\Component\Console\Attribute\AsCommand; |
34 | use Symfony\Component\Console\Command\Command; |
35 | use Symfony\Component\Console\Input\InputInterface; |
36 | use Symfony\Component\Console\Input\InputOption; |
37 | use Symfony\Component\Console\Output\OutputInterface; |
38 | use VuFind\Solr\Writer; |
39 | use VuFind\XSLT\Importer; |
40 | |
/**
 * Console command: web crawler
 *
 * Walks the sitemap URLs listed in the webcrawl.ini configuration, feeds every
 * sitemap that contains page URLs to the XSLT importer, and (outside of test
 * mode) deletes records from the target index whose last_indexed value predates
 * the start of the run.
 *
 * @category VuFind
 * @package  Console
 * @author   Demian Katz <demian.katz@villanova.edu>
 * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
 * @link     https://vufind.org/wiki/development Wiki
 */
#[AsCommand(
    name: 'import/webcrawl',
    description: 'Web crawler'
)]
class WebCrawlCommand extends Command
{
    /**
     * XSLT importer
     *
     * @var Importer
     */
    protected $importer;

    /**
     * Solr writer
     *
     * @var Writer
     */
    protected $solr;

    /**
     * Configuration from webcrawl.ini
     *
     * @var Config
     */
    protected $config;

    /**
     * Constructor
     *
     * @param Importer    $importer XSLT importer
     * @param Writer      $solr     Solr writer
     * @param Config      $config   Configuration from webcrawl.ini
     * @param string|null $name     The name of the command; passing null means it
     * must be set in configure()
     */
    public function __construct(
        Importer $importer,
        Writer $solr,
        Config $config,
        $name = null
    ) {
        $this->importer = $importer;
        $this->solr = $solr;
        $this->config = $config;
        parent::__construct($name);
    }

    /**
     * Configure the command.
     *
     * Registers the help text plus two options: the --test-only flag and the
     * --index option (which defaults to "SolrWeb").
     *
     * @return void
     */
    protected function configure()
    {
        $this
            ->setHelp('Crawls websites to populate VuFind\'s web index.')
            ->addOption(
                'test-only',
                null,
                InputOption::VALUE_NONE,
                'activates test mode, which displays output without updating Solr'
            )->addOption(
                'index',
                null,
                InputOption::VALUE_OPTIONAL,
                'name of search backend to index content into',
                'SolrWeb'
            );
    }

    /**
     * Download a URL to a temporary file.
     *
     * If the download fails, the temporary file is still created but left
     * empty; the caller detects that case when the file fails to parse.
     *
     * @param string $url URL to download
     *
     * @return string Filename of downloaded content
     */
    protected function downloadFile($url)
    {
        // Use the platform's temporary directory rather than assuming /tmp.
        $file = tempnam(sys_get_temp_dir(), 'sitemap');
        $contents = file_get_contents($url);
        // file_get_contents() returns false on failure; write an empty file in
        // that case instead of relying on implicit bool-to-string coercion.
        file_put_contents($file, $contents === false ? '' : $contents);
        return $file;
    }

    /**
     * Remove a temporary file.
     *
     * @param string $file Name of file to delete
     *
     * @return void
     */
    protected function removeTempFile($file)
    {
        unlink($file);
    }

    /**
     * Process a sitemap URL, either harvesting its contents directly or recursively
     * reading in child sitemaps.
     *
     * The downloaded copy of the sitemap is always removed before this method
     * returns, even if an unexpected error interrupts processing.
     *
     * @param OutputInterface $output   Output object
     * @param string          $url      URL of sitemap to read.
     * @param bool            $verbose  Are we in verbose mode?
     * @param string          $index    Solr index to update
     * @param bool            $testMode Are we in test mode?
     *
     * @return bool True on success, false on error.
     */
    protected function harvestSitemap(
        OutputInterface $output,
        $url,
        $verbose = false,
        $index = 'SolrWeb',
        $testMode = false
    ) {
        if ($verbose) {
            $output->writeln("Harvesting $url...");
        }

        $retVal = true;

        $file = $this->downloadFile($url);
        try {
            $xml = simplexml_load_file($file);
            if ($xml === false) {
                // An unreadable or malformed sitemap is an error, not a
                // successful no-op.
                if ($verbose) {
                    $output->writeln("Unable to parse sitemap XML from $url");
                }
                return false;
            }

            // Are there any child sitemaps? If so, pull them in:
            foreach ($xml->sitemap ?? [] as $current) {
                if (!isset($current->loc)) {
                    continue;
                }
                $success = $this->harvestSitemap(
                    $output,
                    (string)$current->loc,
                    $verbose,
                    $index,
                    $testMode
                );
                if (!$success) {
                    // Remember the failure but keep processing siblings.
                    $retVal = false;
                }
            }

            // Only import the current sitemap if it contains URLs!
            if (isset($xml->url)) {
                try {
                    $result = $this->importer->save(
                        $file,
                        'sitemap.properties',
                        $index,
                        $testMode
                    );
                    if ($testMode) {
                        $output->writeln($result);
                    }
                } catch (\Exception $e) {
                    if ($verbose) {
                        $output->writeln($e::class . ': ' . $e->getMessage());
                    }
                    $retVal = false;
                }
            }
        } finally {
            // Clean up the download on every exit path so temp files do not
            // accumulate when something goes wrong.
            $this->removeTempFile($file);
        }
        return $retVal;
    }

    /**
     * Run the command.
     *
     * Every configured sitemap is attempted, even after an earlier one has
     * failed. Unless test mode is active, records older than the start of the
     * run are then deleted and the index is committed and optimized.
     *
     * @param InputInterface  $input  Input object
     * @param OutputInterface $output Output object
     *
     * @return int 0 for success, 1 if any sitemap could not be harvested
     */
    protected function execute(InputInterface $input, OutputInterface $output)
    {
        // Get command line parameters:
        $testMode = (bool)$input->getOption('test-only');
        $index = $input->getOption('index');

        // Get the time we started indexing -- we'll delete records older than this
        // date after everything is finished. Note that we subtract a few seconds
        // for safety. The trailing "Z" marks the value as UTC, so it must be
        // formatted with gmdate() rather than the server's local timezone.
        $startTime = gmdate('Y-m-d\TH:i:s\Z', time() - 5);

        // Are we in verbose mode?
        $verbose = ($this->config->General->verbose ?? false)
            || ($input->hasOption('verbose') && $input->getOption('verbose'));

        // Loop through sitemap URLs in the config file (tolerating a missing
        // [Sitemaps] section).
        $error = false;
        foreach ($this->config->Sitemaps->url ?? [] as $current) {
            // Call the harvester unconditionally: combining the call with
            // "$error ||" would skip every sitemap after the first failure,
            // and the cleanup below would then delete their records.
            $success = $this->harvestSitemap(
                $output,
                $current,
                $verbose,
                $index,
                $testMode
            );
            if (!$success) {
                $error = true;
            }
        }
        if ($error) {
            $output->writeln('Error encountered during harvest.');
        }

        // Skip Solr operations if we're in test mode.
        if (!$testMode) {
            if ($verbose) {
                $output->writeln("Deleting old records (prior to $startTime)...");
            }
            // Perform the delete of outdated records:
            $this->solr
                ->deleteByQuery($index, 'last_indexed:[* TO ' . $startTime . ']');
            if ($verbose) {
                $output->writeln('Committing...');
            }
            $this->solr->commit($index);
            if ($verbose) {
                $output->writeln('Optimizing...');
            }
            $this->solr->optimize($index);
        }
        return $error ? 1 : 0;
    }
}