Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
95.20% covered (success)
95.20%
238 / 250
77.78% covered (warning)
77.78%
7 / 9
CRAP
0.00% covered (danger)
0.00%
0 / 1
HarvesterCommand
95.20% covered (success)
95.20%
238 / 250
77.78% covered (warning)
77.78%
7 / 9
35
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
4
 configure
100.00% covered (success)
100.00%
160 / 160
100.00% covered (success)
100.00%
1 / 1
1
 updateSettingsWithConsoleOptions
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
6
 execute
74.42% covered (warning)
74.42%
32 / 43
0.00% covered (danger)
0.00%
0 / 1
15.83
 getHarvestRoot
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getHttpClient
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getSettingsFromIni
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
4
 getSettings
90.91% covered (success)
90.91%
10 / 11
0.00% covered (danger)
0.00%
0 / 1
4.01
 harvestSingleRepository
100.00% covered (success)
100.00%
11 / 11
100.00% covered (success)
100.00%
1 / 1
1
1<?php
2
3/**
4 * OAI-PMH Harvest Tool (Symfony Console Command)
5 *
6 * PHP version 7
7 *
8 * Copyright (c) Demian Katz 2016.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2,
12 * as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
22 *
23 * @category VuFind
24 * @package  Harvest_Tools
25 * @author   Demian Katz <demian.katz@villanova.edu>
26 * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
27 * @link     https://vufind.org/wiki/indexing:oai-pmh Wiki
28 */
29
30namespace VuFindHarvest\OaiPmh;
31
32use Laminas\Http\Client;
33use Symfony\Component\Console\Attribute\AsCommand;
34use Symfony\Component\Console\Command\Command;
35use Symfony\Component\Console\Input\InputArgument;
36use Symfony\Component\Console\Input\InputInterface;
37use Symfony\Component\Console\Input\InputOption;
38use Symfony\Component\Console\Output\OutputInterface;
39use VuFindHarvest\ConsoleOutput\ConsoleWriter;
40use VuFindHarvest\ConsoleOutput\WriterAwareTrait;
41use VuFindHarvest\Exception\OaiException;
42
43/**
44 * OAI-PMH Harvest Tool (Symfony Console Command)
45 *
46 * @category VuFind
47 * @package  Harvest_Tools
48 * @author   Demian Katz <demian.katz@villanova.edu>
49 * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
50 * @link     https://vufind.org/wiki/indexing:oai-pmh Wiki
51 */
52#[AsCommand(
53    name: 'harvest/harvest_oai',
54    description: 'OAI-PMH harvester'
55)]
56class HarvesterCommand extends Command
57{
58    use WriterAwareTrait;
59
60    /**
61     * The name of the command
62     *
63     * @var string
64     */
65    protected static $defaultName = 'harvest/harvest_oai';
66
67    /**
68     * HTTP client
69     *
70     * @var Client
71     */
72    protected $client;
73
74    /**
75     * Root directory for harvesting
76     *
77     * @var string
78     */
79    protected $harvestRoot;
80
81    /**
82     * Harvester factory
83     *
84     * @var HarvesterFactory
85     */
86    protected $factory;
87
88    /**
89     * Silent mode
90     *
91     * @var bool
92     */
93    protected $silent;
94
95    /**
96     * Constructor
97     *
98     * @param Client           $client      HTTP client (omit for default)
99     * @param string           $harvestRoot Root directory for harvesting (omit for
100     * default)
101     * @param HarvesterFactory $factory     Harvester factory (omit for default)
102     * @param bool             $silent      Should we suppress output?
103     * @param string|null      $name        The name of the command; passing null
104     * means it must be set in configure()
105     */
106    public function __construct(
107        $client = null,
108        $harvestRoot = null,
109        HarvesterFactory $factory = null,
110        $silent = false,
111        $name = null
112    ) {
113        $this->client = $client ?: new Client();
114        $this->harvestRoot = $harvestRoot ?: getcwd();
115        $this->factory = $factory ?: new HarvesterFactory();
116        $this->silent = $silent;
117        parent::__construct($name);
118    }
119
120    /**
121     * Configure the command.
122     *
123     * @return void
124     *
125     * @SuppressWarnings(PHPMD.ExcessiveMethodLength)
126     */
127    protected function configure()
128    {
129        $this
130            ->setHelp('Harvests metadata using the OAI-PMH protocol.')
131            ->addArgument(
132                'target',
133                InputArgument::OPTIONAL,
134                'the name of a section of the configuration specified by the ini '
135                . "option,\nor a directory to harvest into if no .ini file is used. "
136                . "If <target> is\nomitted, all .ini sections will be processed."
137            )->addOption(
138                'from',
139                null,
140                InputOption::VALUE_REQUIRED,
141                'Harvest start date'
142            )->addOption(
143                'until',
144                null,
145                InputOption::VALUE_REQUIRED,
146                'Harvest end date'
147            )->addOption(
148                'ini',
149                null,
150                InputOption::VALUE_REQUIRED,
151                '.ini file to load; if you set other more specific options, they'
152                . " will\noverride equivalent settings loaded from the .ini file."
153            )->addOption(
154                'url',
155                null,
156                InputOption::VALUE_REQUIRED,
157                'Base URL of OAI-PMH server'
158            )->addOption(
159                'httpUser',
160                null,
161                InputOption::VALUE_REQUIRED,
162                'Username to access url'
163            )->addOption(
164                'httpPass',
165                null,
166                InputOption::VALUE_REQUIRED,
167                'Password to access url'
168            )->addOption(
169                'set',
170                null,
171                InputOption::VALUE_REQUIRED,
172                'Set name to harvest'
173            )->addOption(
174                'metadataPrefix',
175                null,
176                InputOption::VALUE_REQUIRED,
177                'Metadata prefix to harvest'
178            )->addOption(
179                'timeout',
180                null,
181                InputOption::VALUE_REQUIRED,
182                'HTTP timeout (in seconds)'
183            )->addOption(
184                'combineRecords',
185                null,
186                InputOption::VALUE_NONE,
187                'Turn off "one record per file" mode'
188            )->addOption(
189                'combineRecordsTag',
190                null,
191                InputOption::VALUE_REQUIRED,
192                'Specify the XML tag wrapped around multiple records in '
193                . "combineRecords\nmode (default = <collection> if this "
194                . 'option is omitted)'
195            )->addOption(
196                'globalSearch',
197                null,
198                InputOption::VALUE_REQUIRED,
199                'Regular expression to replace in raw XML'
200            )->addOption(
201                'globalReplace',
202                null,
203                InputOption::VALUE_REQUIRED,
204                'String to replace globalSearch regex matches'
205            )->addOption(
206                'injectDate',
207                null,
208                InputOption::VALUE_REQUIRED,
209                'Inject date from header into specified tag'
210            )->addOption(
211                'injectId',
212                null,
213                InputOption::VALUE_REQUIRED,
214                'Inject ID from header into specified tag'
215            )->addOption(
216                'injectSetName',
217                null,
218                InputOption::VALUE_REQUIRED,
219                'Inject setName from header into specified tag'
220            )->addOption(
221                'injectSetSpec',
222                null,
223                InputOption::VALUE_REQUIRED,
224                'Inject setSpec from header into specified tag'
225            )->addOption(
226                'idSearch',
227                null,
228                InputOption::VALUE_REQUIRED,
229                'Regular expression to replace in ID'
230                . ' (only relevant when injectId is on)'
231            )->addOption(
232                'idReplace',
233                null,
234                InputOption::VALUE_REQUIRED,
235                'String to replace idSearch regex matches'
236            )->addOption(
237                'dateGranularity',
238                null,
239                InputOption::VALUE_REQUIRED,
240                '"YYYY-MM-DDThh:mm:ssZ," "YYYY-MM-DD" or "auto" (default)'
241            )->addOption(
242                'harvestedIdLog',
243                null,
244                InputOption::VALUE_REQUIRED,
245                'Filename (relative to harvest directory)'
246                . ' to store log of harvested IDs.'
247            )->addOption(
248                'autosslca',
249                null,
250                InputOption::VALUE_NONE,
251                'Attempt to autodetect SSL certificate file/path'
252            )->addOption(
253                'sslcapath',
254                null,
255                InputOption::VALUE_REQUIRED,
256                'Path to SSL certificate authority directory'
257            )->addOption(
258                'sslcafile',
259                null,
260                InputOption::VALUE_REQUIRED,
261                'Path to SSL certificate authority file'
262            )->addOption(
263                'nosslverifypeer',
264                null,
265                InputOption::VALUE_NONE,
266                'Disable SSL verification'
267            )->addOption(
268                'sanitize',
269                null,
270                InputOption::VALUE_NONE,
271                'Strip illegal characters from XML'
272            )->addOption(
273                'sanitizeRegex',
274                null,
275                InputOption::VALUE_REQUIRED,
276                'Optional regular expression defining XML characters to remove'
277            )->addOption(
278                'badXMLLog',
279                null,
280                InputOption::VALUE_REQUIRED,
281                'Filename (relative to harvest directory) to log'
282                . ' XML fixed by sanitize setting'
283            )->addOption(
284                'stopAfter',
285                null,
286                InputOption::VALUE_NONE,
287                'an option to stop harvesting after the first n records of each set.'
288            );
289    }
290
291    /**
292     * Use command-line switches to add/override settings found in the .ini
293     * file, if necessary.
294     *
295     * @param InputInterface $input    Input object
296     * @param array          $settings Incoming settings
297     *
298     * @return array
299     */
300    protected function updateSettingsWithConsoleOptions(
301        InputInterface $input,
302        $settings
303    ) {
304        $directMapSettings = [
305            'url', 'set', 'metadataPrefix', 'timeout', 'combineRecordsTag',
306            'injectDate', 'injectId', 'injectSetName', 'injectSetSpec',
307            'idSearch', 'idReplace', 'dateGranularity', 'harvestedIdLog',
308            'badXMLLog', 'httpUser', 'httpPass', 'sslcapath', 'sslcafile',
309            'sanitizeRegex',
310        ];
311        foreach ($directMapSettings as $setting) {
312            if ($value = $input->getOption($setting)) {
313                $settings[$setting] = $value;
314            }
315        }
316        $flagSettings = [
317            'combineRecords' => ['combineRecords', true],
318            'verbose' => ['verbose', true],
319            'autosslca' => ['autosslca', true],
320            'nosslverifypeer' => ['sslverifypeer', false],
321            'sanitize' => ['sanitize', true],
322        ];
323        foreach ($flagSettings as $in => $details) {
324            if ($input->hasOption($in) && $input->getOption($in)) {
325                [$out, $val] = $details;
326                $settings[$out] = $val;
327            }
328        }
329        return $settings;
330    }
331
332    /**
333     * Run the command.
334     *
335     * @param InputInterface  $input  Input object
336     * @param OutputInterface $output Output object
337     *
338     * @return int 0 for success
339     */
340    protected function execute(InputInterface $input, OutputInterface $output)
341    {
342        // Only set up output writer if not in silent mode:
343        if (!$this->silent) {
344            $this->setOutputWriter(new ConsoleWriter($output));
345        }
346
347        if (!$allSettings = $this->getSettings($input)) {
348            return 1;
349        }
350
351        // Loop through all the settings and perform harvests:
352        $processed = $skipped = $errors = 0;
353        foreach ($allSettings as $target => $baseSettings) {
354            $settings = $this->updateSettingsWithConsoleOptions(
355                $input,
356                $baseSettings
357            );
358            if (empty($target) || empty($settings)) {
359                $skipped++;
360                continue;
361            }
362            $this->writeLine("Processing {$target}...");
363            try {
364                $this->harvestSingleRepository($input, $output, $target, $settings);
365            } catch (\Exception $e) {
366                if (
367                    $e instanceof OaiException
368                    && strtolower($e->getOaiCode()) == 'norecordsmatch'
369                ) {
370                    $this->writeLine('No new records found.');
371                } else {
372                    $this->writeLine($e->getMessage());
373                    $errors++;
374                }
375            }
376            $processed++;
377        }
378
379        // All done.
380        if (isset($settings['stopAfter'])) {
381            $this->writeLine(
382                'stopAfter option set; '
383                . 'all sources may not have been fully harvested.'
384            );
385        }
386        if ($processed == 0 && $skipped > 0) {
387            $this->writeLine(
388                'No valid settings found; '
389                . 'please set url and metadataPrefix at minimum.'
390            );
391            return 1;
392        }
393        if ($errors > 0) {
394            $this->writeLine(
395                "Completed with {$errors} error(s) -- "
396                . "{$processed} source(s) processed."
397            );
398            return 1;
399        }
400        $this->writeLine(
401            "Completed without errors -- {$processed} source(s) processed."
402        );
403        return 0;
404    }
405
406    /**
407     * Get the target directory for writing harvested files.
408     *
409     * @return string
410     */
411    protected function getHarvestRoot()
412    {
413        return $this->harvestRoot;
414    }
415
416    /**
417     * Get an HTTP client.
418     *
419     * @return Client
420     */
421    protected function getHttpClient()
422    {
423        return $this->client;
424    }
425
426    /**
427     * Load configuration from an .ini file (or return false on error)
428     *
429     * @param string      $ini     Configuration file to load
430     * @param string|bool $section Section of .ini to load (or false for all)
431     *
432     * @return array|bool
433     */
434    protected function getSettingsFromIni($ini, $section)
435    {
436        $oaiSettings = @parse_ini_file($ini, true);
437        if (empty($oaiSettings)) {
438            $this->writeLine("Please add OAI-PMH settings to {$ini}.");
439            return false;
440        }
441        if ($section) {
442            if (!isset($oaiSettings[$section])) {
443                $this->writeLine("$section not found in $ini.");
444                return false;
445            }
446            $oaiSettings = [$section => $oaiSettings[$section]];
447        }
448        return $oaiSettings;
449    }
450
451    /**
452     * Load the harvest settings. Return false on error.
453     *
454     * @param InputInterface $input Input object
455     *
456     * @return array|bool
457     */
458    protected function getSettings(InputInterface $input)
459    {
460        $ini = $input->getOption('ini');
461        $section = $input->getArgument('target');
462        if (!$ini && !$section) {
463            $this->writeLine(
464                'Please specify an .ini file with the --ini flag'
465                . ' or a target directory with the first parameter.'
466            );
467            return false;
468        }
469        return $ini
470            ? $this->getSettingsFromIni($ini, $section)
471            : [$section => []];
472    }
473
474    /**
475     * Harvest a single repository.
476     *
477     * @param InputInterface  $input    Input object
478     * @param OutputInterface $output   Output object
479     * @param string          $target   Name of repo (used for target directory)
480     * @param array           $settings Settings for the harvester.
481     *
482     * @return void
483     * @throws \Exception
484     */
485    protected function harvestSingleRepository(
486        InputInterface $input,
487        OutputInterface $output,
488        $target,
489        $settings
490    ) {
491        $settings['from'] = $input->getOption('from');
492        $settings['until'] = $input->getOption('until');
493        $settings['silent'] = false;
494        $harvest = $this->factory->getHarvester(
495            $target,
496            $this->getHarvestRoot(),
497            $this->getHttpClient(),
498            $settings,
499            $output
500        );
501        $harvest->launch();
502    }
503}