Code Coverage |
||||||||||
Classes and Traits |
Functions and Methods |
Lines |
||||||||
Total | |
0.00% |
0 / 1 |
|
88.89% |
8 / 9 |
CRAP | |
96.35% |
211 / 219 |
HarvesterCommand | |
0.00% |
0 / 1 |
|
88.89% |
8 / 9 |
31 | |
96.35% |
211 / 219 |
__construct | |
100.00% |
1 / 1 |
4 | |
100.00% |
6 / 6 |
|||
configure | |
100.00% |
1 / 1 |
1 | |
100.00% |
147 / 147 |
|||
updateSettingsWithConsoleOptions | |
100.00% |
1 / 1 |
6 | |
100.00% |
10 / 10 |
|||
execute | |
0.00% |
0 / 1 |
12.00 | |
66.67% |
16 / 24 |
|||
getHarvestRoot | |
100.00% |
1 / 1 |
1 | |
100.00% |
1 / 1 |
|||
getHttpClient | |
100.00% |
1 / 1 |
1 | |
100.00% |
1 / 1 |
|||
getSettingsFromIni | |
100.00% |
1 / 1 |
4 | |
100.00% |
10 / 10 |
|||
getSettings | |
100.00% |
1 / 1 |
4 | |
100.00% |
9 / 9 |
|||
harvestSingleRepository | |
100.00% |
1 / 1 |
1 | |
100.00% |
11 / 11 |
<?php | |
/** | |
* OAI-PMH Harvest Tool (Symfony Console Command) | |
* | |
* PHP version 7 | |
* | |
* Copyright (c) Demian Katz 2016. | |
* | |
* This program is free software; you can redistribute it and/or modify | |
* it under the terms of the GNU General Public License version 2, | |
* as published by the Free Software Foundation. | |
* | |
* This program is distributed in the hope that it will be useful, | |
* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
* GNU General Public License for more details. | |
* | |
* You should have received a copy of the GNU General Public License | |
* along with this program; if not, write to the Free Software | |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
* | |
* @category VuFind | |
* @package Harvest_Tools | |
* @author Demian Katz <demian.katz@villanova.edu> | |
* @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License | |
* @link https://vufind.org/wiki/indexing:oai-pmh Wiki | |
*/ | |
namespace VuFindHarvest\OaiPmh; | |
use Laminas\Http\Client; | |
use Symfony\Component\Console\Command\Command; | |
use Symfony\Component\Console\Input\InputArgument; | |
use Symfony\Component\Console\Input\InputInterface; | |
use Symfony\Component\Console\Input\InputOption; | |
use Symfony\Component\Console\Output\OutputInterface; | |
use VuFindHarvest\ConsoleOutput\ConsoleWriter; | |
use VuFindHarvest\ConsoleOutput\WriterAwareTrait; | |
/** | |
* OAI-PMH Harvest Tool (Symfony Console Command) | |
* | |
* @category VuFind | |
* @package Harvest_Tools | |
* @author Demian Katz <demian.katz@villanova.edu> | |
* @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License | |
* @link https://vufind.org/wiki/indexing:oai-pmh Wiki | |
*/ | |
class HarvesterCommand extends Command
{
    // NOTE(review): setOutputWriter() and writeLine() are called below but are
    // not defined in this class; presumably this trait supplies them -- confirm.
    use WriterAwareTrait;

    /**
     * The name of the command
     *
     * @var string
     */
    protected static $defaultName = 'harvest/harvest_oai';

    /**
     * HTTP client
     *
     * @var Client
     */
    protected $client;

    /**
     * Root directory for harvesting
     *
     * @var string
     */
    protected $harvestRoot;

    /**
     * Harvester factory
     *
     * @var HarvesterFactory
     */
    protected $factory;

    /**
     * Silent mode (when true, execute() does not attach an output writer)
     *
     * @var bool
     */
    protected $silent;

    /**
     * Constructor
     *
     * Every falsy dependency is replaced by a default: a new HTTP client, the
     * current working directory, and a new HarvesterFactory respectively.
     *
     * @param Client|null           $client      HTTP client (omit for default)
     * @param string|null           $harvestRoot Root directory for harvesting
     * (omit for the current working directory)
     * @param HarvesterFactory|null $factory     Harvester factory (omit for
     * default)
     * @param bool                  $silent      Should we suppress output?
     * @param string|null           $name        The name of the command; passing
     * null means it must be set in configure()
     */
    public function __construct($client = null, $harvestRoot = null,
        ?HarvesterFactory $factory = null, $silent = false, $name = null
    ) {
        $this->client = $client ?: new Client();
        $this->harvestRoot = $harvestRoot ?: getcwd();
        $this->factory = $factory ?: new HarvesterFactory();
        $this->silent = $silent;
        parent::__construct($name);
    }

    /**
     * Configure the command: description, help text, the optional <target>
     * argument and all command-line options.
     *
     * @return void
     *
     * @SuppressWarnings(PHPMD.ExcessiveMethodLength)
     */
    protected function configure()
    {
        $this
            ->setDescription('OAI-PMH harvester')
            ->setHelp('Harvests metadata using the OAI-PMH protocol.')
            ->addArgument(
                'target',
                InputArgument::OPTIONAL,
                'the name of a section of the configuration specified by the ini '
                . "option,\nor a directory to harvest into if no .ini file is "
                . "used. If <target> is\nomitted, all .ini sections will be "
                . 'processed.'
            );

        $value = InputOption::VALUE_REQUIRED;
        $flag = InputOption::VALUE_NONE;

        // One row per option: [name, mode, description]. Rows are registered
        // in the order listed; no option has a shortcut.
        $options = [
            ['from', $value, 'Harvest start date'],
            ['until', $value, 'Harvest end date'],
            [
                'ini',
                $value,
                '.ini file to load; if you set other more specific options, they'
                . " will\noverride equivalent settings loaded from the .ini file.",
            ],
            ['url', $value, 'Base URL of OAI-PMH server'],
            ['httpUser', $value, 'Username to access url'],
            ['httpPass', $value, 'Password to access url'],
            ['set', $value, 'Set name to harvest'],
            ['metadataPrefix', $value, 'Metadata prefix to harvest'],
            ['timeout', $value, 'HTTP timeout (in seconds)'],
            ['combineRecords', $flag, 'Turn off "one record per file" mode'],
            [
                'combineRecordsTag',
                $value,
                'Specify the XML tag wrapped around multiple records in '
                . "combineRecords\nmode (default = <collection> if this "
                . 'option is omitted)',
            ],
            ['globalSearch', $value, 'Regular expression to replace in raw XML'],
            [
                'globalReplace',
                $value,
                'String to replace globalSearch regex matches',
            ],
            ['injectDate', $value, 'Inject date from header into specified tag'],
            ['injectId', $value, 'Inject ID from header into specified tag'],
            [
                'injectSetName',
                $value,
                'Inject setName from header into specified tag',
            ],
            [
                'injectSetSpec',
                $value,
                'Inject setSpec from header into specified tag',
            ],
            [
                'idSearch',
                $value,
                'Regular expression to replace in ID'
                . ' (only relevant when injectId is on)',
            ],
            ['idReplace', $value, 'String to replace idSearch regex matches'],
            [
                'dateGranularity',
                $value,
                '"YYYY-MM-DDThh:mm:ssZ," "YYYY-MM-DD" or "auto" (default)',
            ],
            [
                'harvestedIdLog',
                $value,
                'Filename (relative to harvest directory)'
                . ' to store log of harvested IDs.',
            ],
            [
                'autosslca',
                $flag,
                'Attempt to autodetect SSL certificate file/path',
            ],
            ['sslcapath', $value, 'Path to SSL certificate authority directory'],
            ['sslcafile', $value, 'Path to SSL certificate authority file'],
            ['nosslverifypeer', $flag, 'Disable SSL verification'],
            ['sanitize', $flag, 'Strip illegal characters from XML'],
            [
                'sanitizeRegex',
                $value,
                'Optional regular expression defining XML characters to remove',
            ],
            [
                'badXMLLog',
                $value,
                'Filename (relative to harvest directory) to log'
                . ' XML fixed by sanitize setting',
            ],
        ];
        foreach ($options as [$name, $mode, $description]) {
            $this->addOption($name, null, $mode, $description);
        }
    }

    /**
     * Use command-line switches to add/override settings found in the .ini
     * file, if necessary.
     *
     * Value options are copied under their own name whenever the user supplied
     * a non-empty value (a literal "0" counts as supplied). Flag options only
     * take effect when present; they never reset a setting to its opposite.
     *
     * @param InputInterface $input    Input object
     * @param array          $settings Incoming settings
     *
     * @return array Settings with console overrides applied
     */
    protected function updateSettingsWithConsoleOptions(InputInterface $input,
        $settings
    ) {
        // Options whose name is also the settings key. globalSearch and
        // globalReplace are registered in configure(), so they are mapped here
        // too; otherwise they would be accepted and silently ignored.
        $directMapSettings = [
            'url', 'set', 'metadataPrefix', 'timeout', 'combineRecordsTag',
            'globalSearch', 'globalReplace',
            'injectDate', 'injectId', 'injectSetName', 'injectSetSpec',
            'idSearch', 'idReplace', 'dateGranularity', 'harvestedIdLog',
            'badXMLLog', 'httpUser', 'httpPass', 'sslcapath', 'sslcafile',
            'sanitizeRegex',
        ];
        foreach ($directMapSettings as $setting) {
            $value = $input->getOption($setting);
            // Skip only "not provided" values; a plain truthiness test would
            // also discard a deliberate "0".
            if ($value !== null && $value !== false && $value !== '') {
                $settings[$setting] = $value;
            }
        }

        // Flag name => [settings key, value stored when the flag is present].
        $flagSettings = [
            'combineRecords' => ['combineRecords', true],
            'verbose' => ['verbose', true],
            'autosslca' => ['autosslca', true],
            'nosslverifypeer' => ['sslverifypeer', false],
            'sanitize' => ['sanitize', true],
        ];
        foreach ($flagSettings as $in => $details) {
            // hasOption() guard: 'verbose' is not registered in configure().
            if ($input->hasOption($in) && $input->getOption($in)) {
                [$out, $val] = $details;
                $settings[$out] = $val;
            }
        }
        return $settings;
    }

    /**
     * Run the command.
     *
     * Loads the settings, harvests every target that has a non-empty name and
     * non-empty settings, and stops at the first harvest that throws.
     *
     * @param InputInterface  $input  Input object
     * @param OutputInterface $output Output object
     *
     * @return int 0 for success, 1 on any failure
     */
    protected function execute(InputInterface $input, OutputInterface $output)
    {
        // Only set up output writer if not in silent mode:
        if (!$this->silent) {
            $this->setOutputWriter(new ConsoleWriter($output));
        }

        $allSettings = $this->getSettings($input);
        if (!$allSettings) {
            return 1;
        }

        // Loop through all the settings and perform harvests:
        $processed = 0;
        $skipped = 0;
        foreach ($allSettings as $target => $baseSettings) {
            $settings = $this->updateSettingsWithConsoleOptions(
                $input, $baseSettings
            );
            if (empty($target) || empty($settings)) {
                $skipped++;
                continue;
            }
            $this->writeLine("Processing {$target}...");
            try {
                $this->harvestSingleRepository($input, $output, $target, $settings);
            } catch (\Exception $e) {
                // Abort the whole run; remaining targets are not attempted.
                $this->writeLine($e->getMessage());
                return 1;
            }
            $processed++;
        }

        // All done.
        if ($processed === 0 && $skipped > 0) {
            $this->writeLine(
                'No valid settings found; '
                . 'please set url and metadataPrefix at minimum.'
            );
            return 1;
        }
        $this->writeLine(
            "Completed without errors -- {$processed} source(s) processed."
        );
        return 0;
    }

    /**
     * Get the target directory for writing harvested files.
     *
     * @return string
     */
    protected function getHarvestRoot()
    {
        return $this->harvestRoot;
    }

    /**
     * Get an HTTP client.
     *
     * @return Client
     */
    protected function getHttpClient()
    {
        return $this->client;
    }

    /**
     * Load configuration from an .ini file (or return false on error)
     *
     * @param string      $ini     Configuration file to load
     * @param string|bool $section Section of .ini to load (or false for all)
     *
     * @return array|bool Section name => settings map, or false after writing
     * an error message
     */
    protected function getSettingsFromIni($ini, $section)
    {
        // Warnings are suppressed on purpose: an unreadable or unparsable file
        // yields false, which is reported through the same message as an empty
        // file.
        $oaiSettings = @parse_ini_file($ini, true);
        if (empty($oaiSettings)) {
            $this->writeLine("Please add OAI-PMH settings to {$ini}.");
            return false;
        }
        if (!$section) {
            return $oaiSettings;
        }
        if (!isset($oaiSettings[$section])) {
            $this->writeLine("{$section} not found in {$ini}.");
            return false;
        }
        // Narrow the result to the single requested section.
        return [$section => $oaiSettings[$section]];
    }

    /**
     * Load the harvest settings. Return false on error.
     *
     * With --ini, settings come from the file (optionally limited to the
     * <target> section). Without it, <target> alone produces one entry with
     * empty settings, to be filled from console options by the caller.
     *
     * @param InputInterface $input Input object
     *
     * @return array|bool
     */
    protected function getSettings(InputInterface $input)
    {
        $ini = $input->getOption('ini');
        $section = $input->getArgument('target');
        if (!$ini && !$section) {
            $this->writeLine(
                'Please specify an .ini file with the --ini flag'
                . ' or a target directory with the first parameter.'
            );
            return false;
        }
        if ($ini) {
            return $this->getSettingsFromIni($ini, $section);
        }
        return [$section => []];
    }

    /**
     * Harvest a single repository.
     *
     * @param InputInterface  $input    Input object
     * @param OutputInterface $output   Output object
     * @param string          $target   Name of repo (used for target directory)
     * @param array           $settings Settings for the harvester.
     *
     * @return void
     * @throws \Exception
     */
    protected function harvestSingleRepository(InputInterface $input,
        OutputInterface $output, $target, $settings
    ) {
        // The date range always comes from the console, replacing any
        // from/until values already present in $settings.
        $settings['from'] = $input->getOption('from');
        $settings['until'] = $input->getOption('until');
        // NOTE(review): this is always false, regardless of $this->silent.
        // Confirm whether the harvester should inherit the command's silent
        // mode.
        $settings['silent'] = false;
        $harvest = $this->factory->getHarvester(
            $target,
            $this->getHarvestRoot(),
            $this->getHttpClient(),
            $settings,
            $output
        );
        $harvest->launch();
    }
}