Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
95.20% |
238 / 250 |
|
77.78% |
7 / 9 |
CRAP | |
0.00% |
0 / 1 |
HarvesterCommand | |
95.20% |
238 / 250 |
|
77.78% |
7 / 9 |
35 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
4 | |||
configure | |
100.00% |
160 / 160 |
|
100.00% |
1 / 1 |
1 | |||
updateSettingsWithConsoleOptions | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
6 | |||
execute | |
74.42% |
32 / 43 |
|
0.00% |
0 / 1 |
15.83 | |||
getHarvestRoot | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getHttpClient | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getSettingsFromIni | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
4 | |||
getSettings | |
90.91% |
10 / 11 |
|
0.00% |
0 / 1 |
4.01 | |||
harvestSingleRepository | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | |
3 | /** |
4 | * OAI-PMH Harvest Tool (Symfony Console Command) |
5 | * |
6 | * PHP version 7 |
7 | * |
8 | * Copyright (c) Demian Katz 2016. |
9 | * |
10 | * This program is free software; you can redistribute it and/or modify |
11 | * it under the terms of the GNU General Public License version 2, |
12 | * as published by the Free Software Foundation. |
13 | * |
14 | * This program is distributed in the hope that it will be useful, |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | * GNU General Public License for more details. |
18 | * |
19 | * You should have received a copy of the GNU General Public License |
20 | * along with this program; if not, write to the Free Software |
21 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
22 | * |
23 | * @category VuFind |
24 | * @package Harvest_Tools |
25 | * @author Demian Katz <demian.katz@villanova.edu> |
26 | * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License |
27 | * @link https://vufind.org/wiki/indexing:oai-pmh Wiki |
28 | */ |
29 | |
30 | namespace VuFindHarvest\OaiPmh; |
31 | |
32 | use Laminas\Http\Client; |
33 | use Symfony\Component\Console\Attribute\AsCommand; |
34 | use Symfony\Component\Console\Command\Command; |
35 | use Symfony\Component\Console\Input\InputArgument; |
36 | use Symfony\Component\Console\Input\InputInterface; |
37 | use Symfony\Component\Console\Input\InputOption; |
38 | use Symfony\Component\Console\Output\OutputInterface; |
39 | use VuFindHarvest\ConsoleOutput\ConsoleWriter; |
40 | use VuFindHarvest\ConsoleOutput\WriterAwareTrait; |
41 | use VuFindHarvest\Exception\OaiException; |
42 | |
43 | /** |
44 | * OAI-PMH Harvest Tool (Symfony Console Command) |
45 | * |
46 | * @category VuFind |
47 | * @package Harvest_Tools |
48 | * @author Demian Katz <demian.katz@villanova.edu> |
49 | * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License |
50 | * @link https://vufind.org/wiki/indexing:oai-pmh Wiki |
51 | */ |
52 | #[AsCommand( |
53 | name: 'harvest/harvest_oai', |
54 | description: 'OAI-PMH harvester' |
55 | )] |
56 | class HarvesterCommand extends Command |
57 | { |
58 | use WriterAwareTrait; |
59 | |
/**
 * The name of the command.
 *
 * Holds the same value as the name given in the AsCommand attribute on this
 * class. NOTE(review): presumably kept for Symfony versions that do not read
 * the attribute -- confirm before removing.
 *
 * @var string
 */
protected static $defaultName = 'harvest/harvest_oai';

/**
 * HTTP client handed to the harvester factory for each repository.
 *
 * @var Client
 */
protected $client;

/**
 * Root directory for harvesting (defaults to the working directory at
 * construction time).
 *
 * @var string
 */
protected $harvestRoot;

/**
 * Factory used to build one harvester per target.
 *
 * @var HarvesterFactory
 */
protected $factory;

/**
 * Silent mode; when true, execute() does not attach a console writer.
 *
 * @var bool
 */
protected $silent;

/**
 * Constructor
 *
 * Every argument is optional; a falsy/omitted value is replaced by a default
 * so the command can be instantiated without any wiring.
 *
 * @param Client                $client      HTTP client (omit for default)
 * @param string                $harvestRoot Root directory for harvesting
 *                                           (omit to use the current working
 *                                           directory)
 * @param HarvesterFactory|null $factory     Harvester factory (omit for
 *                                           default)
 * @param bool                  $silent      Should we suppress output?
 * @param string|null           $name        The name of the command; passing
 *                                           null means it must be set in
 *                                           configure()
 */
public function __construct(
    $client = null,
    $harvestRoot = null,
    ?HarvesterFactory $factory = null,
    $silent = false,
    $name = null
) {
    $this->client = $client ?: new Client();
    $this->harvestRoot = $harvestRoot ?: getcwd();
    // The parameter is declared explicitly nullable (implicit nullability via
    // a null default is deprecated as of PHP 8.4), so a plain null check is
    // sufficient here.
    $this->factory = $factory ?? new HarvesterFactory();
    $this->silent = $silent;
    parent::__construct($name);
}
119 | |
/**
 * Configure the command.
 *
 * Registers the help text, the optional <target> argument and every
 * command-line option. Options are declared in a table and registered in the
 * listed order, so the order below is the order shown in the help output.
 *
 * @return void
 */
protected function configure()
{
    $this
        ->setHelp('Harvests metadata using the OAI-PMH protocol.')
        ->addArgument(
            'target',
            InputArgument::OPTIONAL,
            'the name of a section of the configuration specified by the ini '
            . "option,\nor a directory to harvest into if no .ini file is used. "
            . "If <target> is\nomitted, all .ini sections will be processed."
        );

    $value = InputOption::VALUE_REQUIRED;
    $flag = InputOption::VALUE_NONE;

    // Each entry: [option name, mode, description].
    $options = [
        ['from', $value, 'Harvest start date'],
        ['until', $value, 'Harvest end date'],
        [
            'ini',
            $value,
            '.ini file to load; if you set other more specific options, they'
            . " will\noverride equivalent settings loaded from the .ini file.",
        ],
        ['url', $value, 'Base URL of OAI-PMH server'],
        ['httpUser', $value, 'Username to access url'],
        ['httpPass', $value, 'Password to access url'],
        ['set', $value, 'Set name to harvest'],
        ['metadataPrefix', $value, 'Metadata prefix to harvest'],
        ['timeout', $value, 'HTTP timeout (in seconds)'],
        ['combineRecords', $flag, 'Turn off "one record per file" mode'],
        [
            'combineRecordsTag',
            $value,
            'Specify the XML tag wrapped around multiple records in '
            . "combineRecords\nmode (default = <collection> if this "
            . 'option is omitted)',
        ],
        ['globalSearch', $value, 'Regular expression to replace in raw XML'],
        [
            'globalReplace',
            $value,
            'String to replace globalSearch regex matches',
        ],
        ['injectDate', $value, 'Inject date from header into specified tag'],
        ['injectId', $value, 'Inject ID from header into specified tag'],
        [
            'injectSetName',
            $value,
            'Inject setName from header into specified tag',
        ],
        [
            'injectSetSpec',
            $value,
            'Inject setSpec from header into specified tag',
        ],
        [
            'idSearch',
            $value,
            'Regular expression to replace in ID'
            . ' (only relevant when injectId is on)',
        ],
        ['idReplace', $value, 'String to replace idSearch regex matches'],
        [
            'dateGranularity',
            $value,
            '"YYYY-MM-DDThh:mm:ssZ," "YYYY-MM-DD" or "auto" (default)',
        ],
        [
            'harvestedIdLog',
            $value,
            'Filename (relative to harvest directory)'
            . ' to store log of harvested IDs.',
        ],
        [
            'autosslca',
            $flag,
            'Attempt to autodetect SSL certificate file/path',
        ],
        ['sslcapath', $value, 'Path to SSL certificate authority directory'],
        ['sslcafile', $value, 'Path to SSL certificate authority file'],
        ['nosslverifypeer', $flag, 'Disable SSL verification'],
        ['sanitize', $flag, 'Strip illegal characters from XML'],
        [
            'sanitizeRegex',
            $value,
            'Optional regular expression defining XML characters to remove',
        ],
        [
            'badXMLLog',
            $value,
            'Filename (relative to harvest directory) to log'
            . ' XML fixed by sanitize setting',
        ],
        // stopAfter carries a record count ("first n records"), so it must
        // accept a value rather than act as a bare flag.
        [
            'stopAfter',
            $value,
            'an option to stop harvesting after the first n records of each set.',
        ],
    ];
    foreach ($options as [$name, $mode, $description]) {
        $this->addOption($name, null, $mode, $description);
    }
}

/**
 * Use command-line switches to add/override settings found in the .ini
 * file, if necessary.
 *
 * Value options are copied under their own name whenever they were supplied
 * on the command line (including empty strings and "0", which are valid
 * values for e.g. idReplace or globalReplace). Flag options are translated
 * to a fixed setting name/value pair when present.
 *
 * @param InputInterface $input    Input object
 * @param array          $settings Incoming settings
 *
 * @return array Settings with console overrides applied
 */
protected function updateSettingsWithConsoleOptions(
    InputInterface $input,
    $settings
) {
    // Options whose value is copied verbatim into the setting of the same name.
    $directMapSettings = [
        'url', 'set', 'metadataPrefix', 'timeout', 'combineRecordsTag',
        'globalSearch', 'globalReplace',
        'injectDate', 'injectId', 'injectSetName', 'injectSetSpec',
        'idSearch', 'idReplace', 'dateGranularity', 'harvestedIdLog',
        'badXMLLog', 'httpUser', 'httpPass', 'sslcapath', 'sslcafile',
        'sanitizeRegex', 'stopAfter',
    ];
    foreach ($directMapSettings as $setting) {
        // An option that was not passed yields null; anything else is an
        // explicit user choice and must win over the .ini value.
        $value = $input->getOption($setting);
        if ($value !== null) {
            $settings[$setting] = $value;
        }
    }

    // Flag option => [setting name, value to store when the flag is present].
    $flagSettings = [
        'combineRecords' => ['combineRecords', true],
        'verbose' => ['verbose', true],
        'autosslca' => ['autosslca', true],
        'nosslverifypeer' => ['sslverifypeer', false],
        'sanitize' => ['sanitize', true],
    ];
    foreach ($flagSettings as $in => $details) {
        // hasOption() guards flags (such as "verbose") that this command does
        // not declare itself and which may therefore be absent.
        if ($input->hasOption($in) && $input->getOption($in)) {
            [$out, $val] = $details;
            $settings[$out] = $val;
        }
    }
    return $settings;
}
331 | |
/**
 * Run the command.
 *
 * Loads the settings for every requested target, applies console overrides,
 * harvests each target in turn and prints a summary. A failure in one target
 * is reported and counted but does not stop the remaining targets.
 *
 * @param InputInterface  $input  Input object
 * @param OutputInterface $output Output object
 *
 * @return int 0 for success, 1 when settings are missing/invalid or at least
 * one harvest reported an error
 */
protected function execute(InputInterface $input, OutputInterface $output)
{
    // Only set up output writer if not in silent mode:
    if (!$this->silent) {
        $this->setOutputWriter(new ConsoleWriter($output));
    }

    $allSettings = $this->getSettings($input);
    if (!$allSettings) {
        return 1;
    }

    // Loop through all the settings and perform harvests:
    $processed = $skipped = $errors = 0;
    $stopAfterUsed = false;
    foreach ($allSettings as $target => $baseSettings) {
        $settings = $this->updateSettingsWithConsoleOptions(
            $input,
            $baseSettings
        );
        if (empty($target) || empty($settings)) {
            $skipped++;
            continue;
        }
        // Remember whether ANY harvested target was limited; checking only
        // the last target's settings after the loop would miss earlier ones.
        if (isset($settings['stopAfter'])) {
            $stopAfterUsed = true;
        }
        $this->writeLine("Processing {$target}...");
        try {
            $this->harvestSingleRepository($input, $output, $target, $settings);
        } catch (\Exception $e) {
            // "noRecordsMatch" is a normal outcome (nothing new to harvest),
            // not an error.
            $nothingNew = $e instanceof OaiException
                && strtolower($e->getOaiCode()) === 'norecordsmatch';
            if ($nothingNew) {
                $this->writeLine('No new records found.');
            } else {
                $this->writeLine($e->getMessage());
                $errors++;
            }
        }
        $processed++;
    }

    // All done.
    if ($stopAfterUsed) {
        $this->writeLine(
            'stopAfter option set; '
            . 'all sources may not have been fully harvested.'
        );
    }
    if ($processed == 0 && $skipped > 0) {
        $this->writeLine(
            'No valid settings found; '
            . 'please set url and metadataPrefix at minimum.'
        );
        return 1;
    }
    if ($errors > 0) {
        $this->writeLine(
            "Completed with {$errors} error(s) -- "
            . "{$processed} source(s) processed."
        );
        return 1;
    }
    $this->writeLine(
        "Completed without errors -- {$processed} source(s) processed."
    );
    return 0;
}
405 | |
/**
 * Get the target directory for writing harvested files.
 *
 * Returns the harvest root stored by the constructor; kept as a separate
 * accessor so the value is read through one place.
 *
 * @return string
 */
protected function getHarvestRoot()
{
    return $this->harvestRoot;
}
415 | |
/**
 * Get an HTTP client.
 *
 * Returns the client instance stored by the constructor; the same instance
 * is returned on every call.
 *
 * @return Client
 */
protected function getHttpClient()
{
    return $this->client;
}
425 | |
/**
 * Read harvest settings from an .ini file.
 *
 * The file is parsed with sections enabled. When a section name is given,
 * only that section is returned (still keyed by its name); otherwise every
 * section is returned. A message is written and false is returned when the
 * file yields no settings or the requested section does not exist.
 *
 * @param string      $ini     Configuration file to load
 * @param string|bool $section Section of .ini to load (or false for all)
 *
 * @return array|bool Settings keyed by section name, or false on error
 */
protected function getSettingsFromIni($ini, $section)
{
    // Parse warnings are suppressed; an unreadable or empty file is reported
    // through the message below instead.
    $parsed = @parse_ini_file($ini, true);
    if (empty($parsed)) {
        $this->writeLine("Please add OAI-PMH settings to {$ini}.");
        return false;
    }

    // No specific section requested: hand back everything.
    if (!$section) {
        return $parsed;
    }

    if (isset($parsed[$section])) {
        return [$section => $parsed[$section]];
    }

    $this->writeLine("$section not found in $ini.");
    return false;
}
450 | |
/**
 * Work out which settings to harvest with. Return false on error.
 *
 * With an --ini option the settings come from that file (restricted to the
 * target section when one is given). Without it, the target argument alone
 * yields a single entry with empty settings. If neither is supplied, a usage
 * hint is written and false is returned.
 *
 * @param InputInterface $input Input object
 *
 * @return array|bool Settings keyed by target name, or false on error
 */
protected function getSettings(InputInterface $input)
{
    $iniFile = $input->getOption('ini');
    $target = $input->getArgument('target');

    if ($iniFile) {
        return $this->getSettingsFromIni($iniFile, $target);
    }

    if ($target) {
        return [$target => []];
    }

    $this->writeLine(
        'Please specify an .ini file with the --ini flag'
        . ' or a target directory with the first parameter.'
    );
    return false;
}
473 | |
/**
 * Harvest a single repository.
 *
 * Adds the from/until date bounds from the command line to the settings,
 * asks the factory for a harvester and launches it.
 *
 * @param InputInterface  $input    Input object
 * @param OutputInterface $output   Output object
 * @param string          $target   Name of repo (used for target directory)
 * @param array           $settings Settings for the harvester.
 *
 * @return void
 * @throws \Exception
 */
protected function harvestSingleRepository(
    InputInterface $input,
    OutputInterface $output,
    $target,
    $settings
) {
    // Date bounds always come from the console, replacing any incoming value.
    foreach (['from', 'until'] as $bound) {
        $settings[$bound] = $input->getOption($bound);
    }
    // NOTE(review): this is always false, regardless of the command's own
    // silent flag -- confirm that is intended.
    $settings['silent'] = false;

    $this->factory
        ->getHarvester(
            $target,
            $this->getHarvestRoot(),
            $this->getHttpClient(),
            $settings,
            $output
        )
        ->launch();
}
503 | } |