Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
0.00% covered (danger)
0.00%
0 / 265
0.00% covered (danger)
0.00%
0 / 9
CRAP
0.00% covered (danger)
0.00%
0 / 1
HarvesterCommand
0.00% covered (danger)
0.00%
0 / 265
0.00% covered (danger)
0.00%
0 / 9
1260
0.00% covered (danger)
0.00%
0 / 1
 __construct
0.00% covered (danger)
0.00%
0 / 5
0.00% covered (danger)
0.00%
0 / 1
20
 configure
0.00% covered (danger)
0.00%
0 / 161
0.00% covered (danger)
0.00%
0 / 1
2
 updateSettingsWithConsoleOptions
0.00% covered (danger)
0.00%
0 / 22
0.00% covered (danger)
0.00%
0 / 1
42
 execute
0.00% covered (danger)
0.00%
0 / 43
0.00% covered (danger)
0.00%
0 / 1
182
 getHarvestRoot
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getHttpClient
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getSettingsFromIni
0.00% covered (danger)
0.00%
0 / 10
0.00% covered (danger)
0.00%
0 / 1
20
 getSettings
0.00% covered (danger)
0.00%
0 / 11
0.00% covered (danger)
0.00%
0 / 1
20
 harvestSingleRepository
0.00% covered (danger)
0.00%
0 / 11
0.00% covered (danger)
0.00%
0 / 1
2
1<?php
2
3/**
4 * OAI-PMH Harvest Tool (Symfony Console Command)
5 *
6 * PHP version 7
7 *
8 * Copyright (c) Demian Katz 2016.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2,
12 * as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
22 *
23 * @category VuFind
24 * @package  Harvest_Tools
25 * @author   Demian Katz <demian.katz@villanova.edu>
26 * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
27 * @link     https://vufind.org/wiki/indexing:oai-pmh Wiki
28 */
29
30namespace VuFindHarvest\OaiPmh;
31
32use Laminas\Http\Client;
33use Symfony\Component\Console\Command\Command;
34use Symfony\Component\Console\Input\InputArgument;
35use Symfony\Component\Console\Input\InputInterface;
36use Symfony\Component\Console\Input\InputOption;
37use Symfony\Component\Console\Output\OutputInterface;
38use VuFindHarvest\ConsoleOutput\ConsoleWriter;
39use VuFindHarvest\ConsoleOutput\WriterAwareTrait;
40use VuFindHarvest\Exception\OaiException;
41
/**
 * OAI-PMH Harvest Tool (Symfony Console Command)
 *
 * Loads harvest settings from an .ini file and/or command-line options, then
 * asks the harvester factory for one harvester per target and launches it.
 *
 * @category VuFind
 * @package  Harvest_Tools
 * @author   Demian Katz <demian.katz@villanova.edu>
 * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
 * @link     https://vufind.org/wiki/indexing:oai-pmh Wiki
 */
class HarvesterCommand extends Command
{
    use WriterAwareTrait;

    /**
     * The name of the command
     *
     * @var string
     */
    protected static $defaultName = 'harvest/harvest_oai';

    /**
     * HTTP client
     *
     * @var Client
     */
    protected $client;

    /**
     * Root directory for harvesting
     *
     * @var string
     */
    protected $harvestRoot;

    /**
     * Harvester factory
     *
     * @var HarvesterFactory
     */
    protected $factory;

    /**
     * Silent mode
     *
     * @var bool
     */
    protected $silent;

    /**
     * Constructor
     *
     * @param Client                $client      HTTP client (omit for default)
     * @param string                $harvestRoot Root directory for harvesting
     * (omit to use the current working directory)
     * @param HarvesterFactory|null $factory     Harvester factory (omit for
     * default)
     * @param bool                  $silent      Should we suppress output?
     * @param string|null           $name        The name of the command; passing
     * null means it must be set in configure()
     */
    public function __construct(
        $client = null,
        $harvestRoot = null,
        ?HarvesterFactory $factory = null,
        $silent = false,
        $name = null
    ) {
        $this->client = $client ?: new Client();
        $this->harvestRoot = $harvestRoot ?: getcwd();
        $this->factory = $factory ?: new HarvesterFactory();
        $this->silent = $silent;
        parent::__construct($name);
    }

    /**
     * Configure the command: description, help text, the optional target
     * argument and every supported option.
     *
     * @return void
     */
    protected function configure()
    {
        $this
            ->setDescription('OAI-PMH harvester')
            ->setHelp('Harvests metadata using the OAI-PMH protocol.')
            ->addArgument(
                'target',
                InputArgument::OPTIONAL,
                'the name of a section of the configuration specified by the ini '
                . "option,\nor a directory to harvest into if no .ini file is used. "
                . "If <target> is\nomitted, all .ini sections will be processed."
            );
        // Options are registered in table order; none of them has a shortcut.
        foreach ($this->getOptionDefinitions() as [$name, $mode, $help]) {
            $this->addOption($name, null, $mode, $help);
        }
    }

    /**
     * Get the definitions of all command-line options, in registration order.
     *
     * @return array[] List of [option name, InputOption mode, description]
     *
     * @SuppressWarnings(PHPMD.ExcessiveMethodLength)
     */
    private function getOptionDefinitions(): array
    {
        $value = InputOption::VALUE_REQUIRED;
        $flag = InputOption::VALUE_NONE;
        return [
            ['from', $value, 'Harvest start date'],
            ['until', $value, 'Harvest end date'],
            [
                'ini',
                $value,
                '.ini file to load; if you set other more specific options, they'
                . " will\noverride equivalent settings loaded from the .ini file.",
            ],
            ['url', $value, 'Base URL of OAI-PMH server'],
            ['httpUser', $value, 'Username to access url'],
            ['httpPass', $value, 'Password to access url'],
            ['set', $value, 'Set name to harvest'],
            ['metadataPrefix', $value, 'Metadata prefix to harvest'],
            ['timeout', $value, 'HTTP timeout (in seconds)'],
            ['combineRecords', $flag, 'Turn off "one record per file" mode'],
            [
                'combineRecordsTag',
                $value,
                'Specify the XML tag wrapped around multiple records in '
                . "combineRecords\nmode (default = <collection> if this "
                . 'option is omitted)',
            ],
            ['globalSearch', $value, 'Regular expression to replace in raw XML'],
            [
                'globalReplace',
                $value,
                'String to replace globalSearch regex matches',
            ],
            ['injectDate', $value, 'Inject date from header into specified tag'],
            ['injectId', $value, 'Inject ID from header into specified tag'],
            [
                'injectSetName',
                $value,
                'Inject setName from header into specified tag',
            ],
            [
                'injectSetSpec',
                $value,
                'Inject setSpec from header into specified tag',
            ],
            [
                'idSearch',
                $value,
                'Regular expression to replace in ID'
                . ' (only relevant when injectId is on)',
            ],
            ['idReplace', $value, 'String to replace idSearch regex matches'],
            [
                'dateGranularity',
                $value,
                '"YYYY-MM-DDThh:mm:ssZ," "YYYY-MM-DD" or "auto" (default)',
            ],
            [
                'harvestedIdLog',
                $value,
                'Filename (relative to harvest directory)'
                . ' to store log of harvested IDs.',
            ],
            [
                'autosslca',
                $flag,
                'Attempt to autodetect SSL certificate file/path',
            ],
            ['sslcapath', $value, 'Path to SSL certificate authority directory'],
            ['sslcafile', $value, 'Path to SSL certificate authority file'],
            ['nosslverifypeer', $flag, 'Disable SSL verification'],
            ['sanitize', $flag, 'Strip illegal characters from XML'],
            [
                'sanitizeRegex',
                $value,
                'Optional regular expression defining XML characters to remove',
            ],
            [
                'badXMLLog',
                $value,
                'Filename (relative to harvest directory) to log'
                . ' XML fixed by sanitize setting',
            ],
            // Takes the record count "n", so it must accept a value.
            [
                'stopAfter',
                $value,
                'an option to stop harvesting after the first n records of each set.',
            ],
        ];
    }

    /**
     * Use command-line switches to add/override settings found in the .ini
     * file, if necessary.
     *
     * Value options are copied under their own name; flag options are
     * translated to a (setting name, value) pair when the flag is present.
     *
     * @param InputInterface $input    Input object
     * @param array          $settings Incoming settings
     *
     * @return array
     */
    protected function updateSettingsWithConsoleOptions(
        InputInterface $input,
        $settings
    ) {
        $directMapSettings = [
            'url', 'set', 'metadataPrefix', 'timeout', 'combineRecordsTag',
            'injectDate', 'injectId', 'injectSetName', 'injectSetSpec',
            'idSearch', 'idReplace', 'dateGranularity', 'harvestedIdLog',
            'badXMLLog', 'httpUser', 'httpPass', 'sslcapath', 'sslcafile',
            'sanitizeRegex', 'globalSearch', 'globalReplace', 'stopAfter',
        ];
        foreach ($directMapSettings as $setting) {
            if (!$input->hasOption($setting)) {
                continue;
            }
            $value = $input->getOption($setting);
            // Ignore missing/empty values, but let an explicit "0" through
            // (a plain truthiness test would silently drop it).
            if ($value || is_numeric($value)) {
                $settings[$setting] = $value;
            }
        }
        // Console flag => [setting name, value to store when the flag is set]
        $flagSettings = [
            'combineRecords' => ['combineRecords', true],
            'verbose' => ['verbose', true],
            'autosslca' => ['autosslca', true],
            'nosslverifypeer' => ['sslverifypeer', false],
            'sanitize' => ['sanitize', true],
        ];
        foreach ($flagSettings as $in => $details) {
            if ($input->hasOption($in) && $input->getOption($in)) {
                [$out, $val] = $details;
                $settings[$out] = $val;
            }
        }
        return $settings;
    }

    /**
     * Run the command.
     *
     * @param InputInterface  $input  Input object
     * @param OutputInterface $output Output object
     *
     * @return int 0 for success, 1 if settings could not be loaded, no target
     * had usable settings, or at least one harvest failed
     */
    protected function execute(InputInterface $input, OutputInterface $output)
    {
        // Only set up output writer if not in silent mode:
        if (!$this->silent) {
            $this->setOutputWriter(new ConsoleWriter($output));
        }

        $allSettings = $this->getSettings($input);
        if (!$allSettings) {
            return 1;
        }

        // Loop through all the settings and perform harvests:
        $processed = $skipped = $errors = 0;
        $stopAfterUsed = false;
        foreach ($allSettings as $target => $baseSettings) {
            // A key outside any .ini section yields a scalar, not a settings
            // array; it cannot describe a harvest target.
            if (!is_array($baseSettings)) {
                $skipped++;
                continue;
            }
            $settings = $this->updateSettingsWithConsoleOptions(
                $input,
                $baseSettings
            );
            if (empty($target) || empty($settings)) {
                $skipped++;
                continue;
            }
            // Remember stopAfter for every processed target, not just the
            // last one in the loop.
            $stopAfterUsed = $stopAfterUsed || isset($settings['stopAfter']);
            $this->writeLine("Processing {$target}...");
            try {
                $this->harvestSingleRepository($input, $output, $target, $settings);
            } catch (\Exception $e) {
                // An OAI "noRecordsMatch" response is not treated as an error.
                if (
                    $e instanceof OaiException
                    && strtolower((string)$e->getOaiCode()) === 'norecordsmatch'
                ) {
                    $this->writeLine('No new records found.');
                } else {
                    $this->writeLine($e->getMessage());
                    $errors++;
                }
            }
            $processed++;
        }

        // All done.
        if ($stopAfterUsed) {
            $this->writeLine(
                'stopAfter option set; '
                . 'all sources may not have been fully harvested.'
            );
        }
        if ($processed === 0 && $skipped > 0) {
            $this->writeLine(
                'No valid settings found; '
                . 'please set url and metadataPrefix at minimum.'
            );
            return 1;
        }
        if ($errors > 0) {
            $this->writeLine(
                "Completed with {$errors} error(s) -- "
                . "{$processed} source(s) processed."
            );
            return 1;
        }
        $this->writeLine(
            "Completed without errors -- {$processed} source(s) processed."
        );
        return 0;
    }

    /**
     * Get the target directory for writing harvested files.
     *
     * @return string
     */
    protected function getHarvestRoot()
    {
        return $this->harvestRoot;
    }

    /**
     * Get an HTTP client.
     *
     * @return Client
     */
    protected function getHttpClient()
    {
        return $this->client;
    }

    /**
     * Load configuration from an .ini file (or return false on error)
     *
     * @param string      $ini     Configuration file to load
     * @param string|bool $section Section of .ini to load (or false for all)
     *
     * @return array|bool Settings keyed by section name, or false when the
     * file yields no settings or the requested section is missing
     */
    protected function getSettingsFromIni($ini, $section)
    {
        // Warnings are suppressed on purpose: a missing or unparsable file
        // is reported through the message below instead.
        $oaiSettings = @parse_ini_file($ini, true);
        if (empty($oaiSettings)) {
            $this->writeLine("Please add OAI-PMH settings to {$ini}.");
            return false;
        }
        if (!$section) {
            return $oaiSettings;
        }
        if (!isset($oaiSettings[$section])) {
            $this->writeLine("$section not found in $ini.");
            return false;
        }
        // Narrow the result to the single requested section.
        return [$section => $oaiSettings[$section]];
    }

    /**
     * Load the harvest settings. Return false on error.
     *
     * With --ini, settings come from the file (optionally narrowed to the
     * target section). Without it, the target gets an empty settings array
     * that the caller fills in from command-line options.
     *
     * @param InputInterface $input Input object
     *
     * @return array|bool
     */
    protected function getSettings(InputInterface $input)
    {
        $ini = $input->getOption('ini');
        $section = $input->getArgument('target');
        if (!$ini && !$section) {
            $this->writeLine(
                'Please specify an .ini file with the --ini flag'
                . ' or a target directory with the first parameter.'
            );
            return false;
        }
        if ($ini) {
            return $this->getSettingsFromIni($ini, $section);
        }
        return [$section => []];
    }

    /**
     * Harvest a single repository.
     *
     * @param InputInterface  $input    Input object
     * @param OutputInterface $output   Output object
     * @param string          $target   Name of repo (used for target directory)
     * @param array           $settings Settings for the harvester.
     *
     * @return void
     * @throws \Exception
     */
    protected function harvestSingleRepository(
        InputInterface $input,
        OutputInterface $output,
        $target,
        $settings
    ) {
        // The date range always comes from the command line (null if absent).
        $settings['from'] = $input->getOption('from');
        $settings['until'] = $input->getOption('until');
        // NOTE(review): always false, even when this command was constructed
        // in silent mode -- confirm whether the harvester should inherit
        // $this->silent before changing it.
        $settings['silent'] = false;
        $harvest = $this->factory->getHarvester(
            $target,
            $this->getHarvestRoot(),
            $this->getHttpClient(),
            $settings,
            $output
        );
        $harvest->launch();
    }
}