Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 265 |
|
0.00% |
0 / 9 |
CRAP | |
0.00% |
0 / 1 |
HarvesterCommand | |
0.00% |
0 / 265 |
|
0.00% |
0 / 9 |
1260 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
20 | |||
configure | |
0.00% |
0 / 161 |
|
0.00% |
0 / 1 |
2 | |||
updateSettingsWithConsoleOptions | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
42 | |||
execute | |
0.00% |
0 / 43 |
|
0.00% |
0 / 1 |
182 | |||
getHarvestRoot | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getHttpClient | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getSettingsFromIni | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
20 | |||
getSettings | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
20 | |||
harvestSingleRepository | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | |
3 | /** |
4 | * OAI-PMH Harvest Tool (Symfony Console Command) |
5 | * |
6 | * PHP version 7 |
7 | * |
8 | * Copyright (c) Demian Katz 2016. |
9 | * |
10 | * This program is free software; you can redistribute it and/or modify |
11 | * it under the terms of the GNU General Public License version 2, |
12 | * as published by the Free Software Foundation. |
13 | * |
14 | * This program is distributed in the hope that it will be useful, |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | * GNU General Public License for more details. |
18 | * |
19 | * You should have received a copy of the GNU General Public License |
20 | * along with this program; if not, write to the Free Software |
21 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
22 | * |
23 | * @category VuFind |
24 | * @package Harvest_Tools |
25 | * @author Demian Katz <demian.katz@villanova.edu> |
26 | * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License |
27 | * @link https://vufind.org/wiki/indexing:oai-pmh Wiki |
28 | */ |
29 | |
30 | namespace VuFindHarvest\OaiPmh; |
31 | |
32 | use Laminas\Http\Client; |
33 | use Symfony\Component\Console\Command\Command; |
34 | use Symfony\Component\Console\Input\InputArgument; |
35 | use Symfony\Component\Console\Input\InputInterface; |
36 | use Symfony\Component\Console\Input\InputOption; |
37 | use Symfony\Component\Console\Output\OutputInterface; |
38 | use VuFindHarvest\ConsoleOutput\ConsoleWriter; |
39 | use VuFindHarvest\ConsoleOutput\WriterAwareTrait; |
40 | use VuFindHarvest\Exception\OaiException; |
41 | |
42 | /** |
43 | * OAI-PMH Harvest Tool (Symfony Console Command) |
44 | * |
45 | * @category VuFind |
46 | * @package Harvest_Tools |
47 | * @author Demian Katz <demian.katz@villanova.edu> |
48 | * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License |
49 | * @link https://vufind.org/wiki/indexing:oai-pmh Wiki |
50 | */ |
class HarvesterCommand extends Command
{
    use WriterAwareTrait;

    /**
     * The name of the command
     *
     * @var string
     */
    protected static $defaultName = 'harvest/harvest_oai';

    /**
     * HTTP client
     *
     * @var Client
     */
    protected $client;

    /**
     * Root directory for harvesting
     *
     * @var string
     */
    protected $harvestRoot;

    /**
     * Harvester factory
     *
     * @var HarvesterFactory
     */
    protected $factory;

    /**
     * Silent mode
     *
     * @var bool
     */
    protected $silent;

    /**
     * Constructor
     *
     * Any falsy $client, $harvestRoot or $factory is replaced by a default
     * (a new Client, the current working directory, a new HarvesterFactory).
     *
     * @param Client|null           $client      HTTP client (omit for default)
     * @param string|null           $harvestRoot Root directory for harvesting
     * (omit for default)
     * @param HarvesterFactory|null $factory     Harvester factory (omit for
     * default)
     * @param bool                  $silent      Should we suppress output?
     * @param string|null           $name        The name of the command; passing
     * null means it must be set in configure()
     */
    public function __construct(
        $client = null,
        $harvestRoot = null,
        ?HarvesterFactory $factory = null,
        $silent = false,
        $name = null
    ) {
        $this->client = $client ?: new Client();
        $this->harvestRoot = $harvestRoot ?: getcwd();
        $this->factory = $factory ?: new HarvesterFactory();
        $this->silent = $silent;
        // The parent constructor must run last: it triggers configure().
        parent::__construct($name);
    }

    /**
     * Configure the command: description, help text, the optional <target>
     * argument and every supported command-line option.
     *
     * @return void
     *
     * @SuppressWarnings(PHPMD.ExcessiveMethodLength)
     */
    protected function configure()
    {
        $this
            ->setDescription('OAI-PMH harvester')
            ->setHelp('Harvests metadata using the OAI-PMH protocol.')
            ->addArgument(
                'target',
                InputArgument::OPTIONAL,
                'the name of a section of the configuration specified by the ini '
                . "option,\nor a directory to harvest into if no .ini file is used. "
                . "If <target> is\nomitted, all .ini sections will be processed."
            )->addOption(
                'from',
                null,
                InputOption::VALUE_REQUIRED,
                'Harvest start date'
            )->addOption(
                'until',
                null,
                InputOption::VALUE_REQUIRED,
                'Harvest end date'
            )->addOption(
                'ini',
                null,
                InputOption::VALUE_REQUIRED,
                '.ini file to load; if you set other more specific options, they'
                . " will\noverride equivalent settings loaded from the .ini file."
            )->addOption(
                'url',
                null,
                InputOption::VALUE_REQUIRED,
                'Base URL of OAI-PMH server'
            )->addOption(
                'httpUser',
                null,
                InputOption::VALUE_REQUIRED,
                'Username to access url'
            )->addOption(
                'httpPass',
                null,
                InputOption::VALUE_REQUIRED,
                'Password to access url'
            )->addOption(
                'set',
                null,
                InputOption::VALUE_REQUIRED,
                'Set name to harvest'
            )->addOption(
                'metadataPrefix',
                null,
                InputOption::VALUE_REQUIRED,
                'Metadata prefix to harvest'
            )->addOption(
                'timeout',
                null,
                InputOption::VALUE_REQUIRED,
                'HTTP timeout (in seconds)'
            )->addOption(
                'combineRecords',
                null,
                InputOption::VALUE_NONE,
                'Turn off "one record per file" mode'
            )->addOption(
                'combineRecordsTag',
                null,
                InputOption::VALUE_REQUIRED,
                'Specify the XML tag wrapped around multiple records in '
                . "combineRecords\nmode (default = <collection> if this "
                . 'option is omitted)'
            )->addOption(
                'globalSearch',
                null,
                InputOption::VALUE_REQUIRED,
                'Regular expression to replace in raw XML'
            )->addOption(
                'globalReplace',
                null,
                InputOption::VALUE_REQUIRED,
                'String to replace globalSearch regex matches'
            )->addOption(
                'injectDate',
                null,
                InputOption::VALUE_REQUIRED,
                'Inject date from header into specified tag'
            )->addOption(
                'injectId',
                null,
                InputOption::VALUE_REQUIRED,
                'Inject ID from header into specified tag'
            )->addOption(
                'injectSetName',
                null,
                InputOption::VALUE_REQUIRED,
                'Inject setName from header into specified tag'
            )->addOption(
                'injectSetSpec',
                null,
                InputOption::VALUE_REQUIRED,
                'Inject setSpec from header into specified tag'
            )->addOption(
                'idSearch',
                null,
                InputOption::VALUE_REQUIRED,
                'Regular expression to replace in ID'
                . ' (only relevant when injectId is on)'
            )->addOption(
                'idReplace',
                null,
                InputOption::VALUE_REQUIRED,
                'String to replace idSearch regex matches'
            )->addOption(
                'dateGranularity',
                null,
                InputOption::VALUE_REQUIRED,
                '"YYYY-MM-DDThh:mm:ssZ," "YYYY-MM-DD" or "auto" (default)'
            )->addOption(
                'harvestedIdLog',
                null,
                InputOption::VALUE_REQUIRED,
                'Filename (relative to harvest directory)'
                . ' to store log of harvested IDs.'
            )->addOption(
                'autosslca',
                null,
                InputOption::VALUE_NONE,
                'Attempt to autodetect SSL certificate file/path'
            )->addOption(
                'sslcapath',
                null,
                InputOption::VALUE_REQUIRED,
                'Path to SSL certificate authority directory'
            )->addOption(
                'sslcafile',
                null,
                InputOption::VALUE_REQUIRED,
                'Path to SSL certificate authority file'
            )->addOption(
                'nosslverifypeer',
                null,
                InputOption::VALUE_NONE,
                'Disable SSL verification'
            )->addOption(
                'sanitize',
                null,
                InputOption::VALUE_NONE,
                'Strip illegal characters from XML'
            )->addOption(
                'sanitizeRegex',
                null,
                InputOption::VALUE_REQUIRED,
                'Optional regular expression defining XML characters to remove'
            )->addOption(
                'badXMLLog',
                null,
                InputOption::VALUE_REQUIRED,
                'Filename (relative to harvest directory) to log'
                . ' XML fixed by sanitize setting'
            )->addOption(
                'stopAfter',
                null,
                // The option carries a record count ("first n records"), so it
                // must accept a value rather than act as a bare flag.
                InputOption::VALUE_REQUIRED,
                'an option to stop harvesting after the first n records of each set.'
            );
    }

    /**
     * Use command-line switches to add/override settings found in the .ini
     * file, if necessary.
     *
     * Value options are copied under the same key whenever they were supplied
     * on the command line (including "0" or an empty string); flag options
     * are translated to a fixed setting/value pair when present.
     *
     * @param InputInterface $input    Input object
     * @param array          $settings Incoming settings
     *
     * @return array Settings with console overrides applied
     */
    protected function updateSettingsWithConsoleOptions(
        InputInterface $input,
        $settings
    ) {
        // Options whose value is stored under an identically named setting.
        // NOTE(review): globalSearch/globalReplace/stopAfter are assumed to use
        // the same setting keys as their option names, like every other entry
        // here -- confirm against the harvester factory.
        $directMapSettings = [
            'url', 'set', 'metadataPrefix', 'timeout', 'combineRecordsTag',
            'globalSearch', 'globalReplace',
            'injectDate', 'injectId', 'injectSetName', 'injectSetSpec',
            'idSearch', 'idReplace', 'dateGranularity', 'harvestedIdLog',
            'badXMLLog', 'httpUser', 'httpPass', 'sslcapath', 'sslcafile',
            'sanitizeRegex', 'stopAfter',
        ];
        foreach ($directMapSettings as $setting) {
            // A value option that was not supplied yields null; anything else
            // (even "0" or "") is an explicit user choice and must win.
            $value = $input->getOption($setting);
            if ($value !== null) {
                $settings[$setting] = $value;
            }
        }

        // Flags: option name => [setting name, value to store when flag is set].
        $flagSettings = [
            'combineRecords' => ['combineRecords', true],
            'verbose' => ['verbose', true],
            'autosslca' => ['autosslca', true],
            'nosslverifypeer' => ['sslverifypeer', false],
            'sanitize' => ['sanitize', true],
        ];
        foreach ($flagSettings as $in => $details) {
            // hasOption() guards flags (such as "verbose") that this command
            // does not declare itself.
            if ($input->hasOption($in) && $input->getOption($in)) {
                [$out, $val] = $details;
                $settings[$out] = $val;
            }
        }
        return $settings;
    }

    /**
     * Run the command.
     *
     * Loads the settings for every requested target, harvests each one in
     * turn and prints a summary.
     *
     * @param InputInterface  $input  Input object
     * @param OutputInterface $output Output object
     *
     * @return int 0 for success, 1 if settings could not be loaded, no target
     * was usable, or at least one harvest failed
     */
    protected function execute(InputInterface $input, OutputInterface $output)
    {
        // Only set up output writer if not in silent mode:
        if (!$this->silent) {
            $this->setOutputWriter(new ConsoleWriter($output));
        }

        if (!$allSettings = $this->getSettings($input)) {
            return 1;
        }

        // Loop through all the settings and perform harvests:
        $processed = $skipped = $errors = 0;
        $stopAfterUsed = false;
        foreach ($allSettings as $target => $baseSettings) {
            // A scalar here means a top-level .ini key outside any section;
            // treat it as "no settings" instead of writing offsets on a string.
            $settings = $this->updateSettingsWithConsoleOptions(
                $input,
                is_array($baseSettings) ? $baseSettings : []
            );
            if (empty($target) || empty($settings)) {
                $skipped++;
                continue;
            }
            // Remember the limit for any target, not just the last one.
            if (isset($settings['stopAfter'])) {
                $stopAfterUsed = true;
            }
            $this->writeLine("Processing {$target}...");
            try {
                $this->harvestSingleRepository($input, $output, $target, $settings);
            } catch (\Exception $e) {
                if (
                    $e instanceof OaiException
                    && strtolower($e->getOaiCode()) === 'norecordsmatch'
                ) {
                    // An empty result set is reported but not counted as an error.
                    $this->writeLine('No new records found.');
                } else {
                    $this->writeLine($e->getMessage());
                    $errors++;
                }
            }
            $processed++;
        }

        // All done.
        if ($stopAfterUsed) {
            $this->writeLine(
                'stopAfter option set; '
                . 'all sources may not have been fully harvested.'
            );
        }
        if ($processed == 0 && $skipped > 0) {
            $this->writeLine(
                'No valid settings found; '
                . 'please set url and metadataPrefix at minimum.'
            );
            return 1;
        }
        if ($errors > 0) {
            $this->writeLine(
                "Completed with {$errors} error(s) -- "
                . "{$processed} source(s) processed."
            );
            return 1;
        }
        $this->writeLine(
            "Completed without errors -- {$processed} source(s) processed."
        );
        return 0;
    }

    /**
     * Get the target directory for writing harvested files.
     *
     * @return string
     */
    protected function getHarvestRoot()
    {
        return $this->harvestRoot;
    }

    /**
     * Get an HTTP client.
     *
     * @return Client
     */
    protected function getHttpClient()
    {
        return $this->client;
    }

    /**
     * Load configuration from an .ini file (or return false on error)
     *
     * The file is parsed with sections enabled. An unreadable, unparseable or
     * empty file and a missing requested section are both reported through
     * writeLine() before false is returned.
     *
     * @param string      $ini     Configuration file to load
     * @param string|bool $section Section of .ini to load (or false for all)
     *
     * @return array|bool Section name => settings map, or false on error
     */
    protected function getSettingsFromIni($ini, $section)
    {
        // Warnings are suppressed because failure is reported via the message
        // below rather than through PHP's own diagnostics.
        $oaiSettings = @parse_ini_file($ini, true);
        if (empty($oaiSettings)) {
            $this->writeLine("Please add OAI-PMH settings to {$ini}.");
            return false;
        }
        if ($section) {
            if (!isset($oaiSettings[$section])) {
                $this->writeLine("$section not found in $ini.");
                return false;
            }
            // Narrow the result to the one requested section, keeping its key.
            $oaiSettings = [$section => $oaiSettings[$section]];
        }
        return $oaiSettings;
    }

    /**
     * Load the harvest settings. Return false on error.
     *
     * With --ini, settings come from the file (optionally limited to the
     * <target> section). Without it, <target> is used as the only key, with
     * empty settings to be filled in from command-line options.
     *
     * @param InputInterface $input Input object
     *
     * @return array|bool
     */
    protected function getSettings(InputInterface $input)
    {
        $ini = $input->getOption('ini');
        $section = $input->getArgument('target');
        if (!$ini && !$section) {
            $this->writeLine(
                'Please specify an .ini file with the --ini flag'
                . ' or a target directory with the first parameter.'
            );
            return false;
        }
        return $ini
            ? $this->getSettingsFromIni($ini, $section)
            : [$section => []];
    }

    /**
     * Harvest a single repository.
     *
     * @param InputInterface  $input    Input object
     * @param OutputInterface $output   Output object
     * @param string          $target   Name of repo (used for target directory)
     * @param array           $settings Settings for the harvester.
     *
     * @return void
     * @throws \Exception
     */
    protected function harvestSingleRepository(
        InputInterface $input,
        OutputInterface $output,
        $target,
        $settings
    ) {
        // NOTE(review): from/until always replace any incoming values, with
        // null when the option is absent -- confirm that is intended.
        $settings['from'] = $input->getOption('from');
        $settings['until'] = $input->getOption('until');
        // NOTE(review): hard-coded to false regardless of $this->silent --
        // confirm whether the harvester should honour the command's silent mode.
        $settings['silent'] = false;
        $harvest = $this->factory->getHarvester(
            $target,
            $this->getHarvestRoot(),
            $this->getHttpClient(),
            $settings,
            $output
        );
        $harvest->launch();
    }
}