Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
89.38% covered (warning)
89.38%
101 / 113
91.67% covered (success)
91.67%
11 / 12
CRAP
0.00% covered (danger)
0.00%
0 / 1
Harvester
89.38% covered (warning)
89.38%
101 / 113
91.67% covered (success)
91.67%
11 / 12
45.21
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
1
 setEndDate
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 setStartDate
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 launch
77.36% covered (warning)
77.36%
41 / 53
0.00% covered (danger)
0.00%
0 / 1
17.61
 sendRequest
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 checkResponseForErrors
100.00% covered (success)
100.00%
9 / 9
100.00% covered (success)
100.00%
1 / 1
4
 getRecords
100.00% covered (success)
100.00%
13 / 13
100.00% covered (success)
100.00%
1 / 1
4
 getRecordsByDate
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
4
 getRecordsByToken
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getIdentifyResponse
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
3
 storeDateSettings
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
3
 storeMiscSettings
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
5
1<?php
2
3/**
4 * OAI-PMH Harvest Tool
5 *
6 * PHP version 7
7 *
8 * Copyright (c) Demian Katz 2010.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2,
12 * as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
22 *
23 * @category VuFind
24 * @package  Harvest_Tools
25 * @author   Demian Katz <demian.katz@villanova.edu>
26 * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
27 * @link     https://vufind.org/wiki/indexing:oai-pmh Wiki
28 */
29
30namespace VuFindHarvest\OaiPmh;
31
32use VuFindHarvest\ConsoleOutput\WriterAwareTrait;
33use VuFindHarvest\Exception\OaiException;
34
35use function count;
36
37/**
38 * OAI-PMH Harvest Tool
39 *
40 * @category VuFind
41 * @package  Harvest_Tools
42 * @author   Demian Katz <demian.katz@villanova.edu>
43 * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
44 * @link     https://vufind.org/wiki/indexing:oai-pmh Wiki
45 */
class Harvester
{
    use WriterAwareTrait;

    /**
     * Record writer (receives each batch of harvested records)
     *
     * @var RecordWriter
     */
    protected $writer;

    /**
     * Low-level OAI-PMH communicator
     *
     * @var Communicator
     */
    protected $communicator;

    /**
     * State manager (persists resumption state and last harvest date)
     *
     * @var StateManager
     */
    protected $stateManager;

    /**
     * Target set(s) to harvest (null for all records)
     *
     * @var string|array|null
     */
    protected $set = null;

    /**
     * Metadata type to harvest
     *
     * @var string
     */
    protected $metadataPrefix = 'oai_dc';

    /**
     * Harvest end date (null for no specific end)
     *
     * @var ?string
     */
    protected $harvestEndDate;

    /**
     * Harvest start date (null for no specific start)
     *
     * @var ?string
     */
    protected $startDate = null;

    /**
     * Date granularity ('auto' to autodetect)
     *
     * @var string
     */
    protected $granularity = 'auto';

    /**
     * Identify information from OAI host (null until first requested)
     *
     * @var ?\stdClass
     */
    protected $identifyResponse = null;

    /**
     * Flag to limit number of harvested records (null = no limit).
     * Used only for testing.
     *
     * @var ?int
     */
    protected $stopAfter = null;

    /**
     * Count harvested records.
     *
     * @var int
     */
    protected $recordsCount = 0;

    /**
     * Constructor.
     *
     * @param Communicator $communicator Low-level API client
     * @param RecordWriter $writer       Record writer
     * @param StateManager $stateManager State manager
     * @param array        $settings     OAI-PMH settings
     */
    public function __construct(
        Communicator $communicator,
        RecordWriter $writer,
        StateManager $stateManager,
        $settings = []
    ) {
        // Don't time out during harvest!!
        set_time_limit(0);

        // Store dependencies
        $this->communicator = $communicator;
        $this->writer = $writer;
        $this->stateManager = $stateManager;

        // Store other settings (date settings read from the state manager,
        // so dependencies must be assigned first)
        $this->storeDateSettings($settings);
        $this->storeMiscSettings($settings);
    }

    /**
     * Set an end date for the harvest (only harvest records BEFORE this date).
     *
     * @param string $date End date (YYYY-MM-DD format).
     *
     * @return void
     */
    public function setEndDate($date)
    {
        $this->harvestEndDate = $date;
    }

    /**
     * Set a start date for the harvest (only harvest records AFTER this date).
     *
     * @param string $date Start date (YYYY-MM-DD format).
     *
     * @return void
     */
    public function setStartDate($date)
    {
        $this->startDate = $date;
    }

    /**
     * Harvest all available documents.
     *
     * Iterates over every configured set (or once with a null set when none
     * is configured), following resumption tokens until the server stops
     * providing them. State is saved before each token request so that an
     * interrupted harvest can be resumed. When the stopAfter limit is
     * configured and reached, the whole harvest stops (no further sets are
     * requested) and the last harvest date is NOT saved.
     *
     * @return void
     *
     * @throws \Exception
     */
    public function launch()
    {
        // Normalize sets setting to an array:
        $sets = (array)$this->set;
        if (empty($sets)) {
            $sets = [null];
        }

        // The harvestEndDate may be null. Some OAI-PMH hosts may depend on a
        // null value for backwards compatibility and reliability for various
        // edge cases, so we allow a null value to be used during the initial
        // records request. However, we still need to track an explicit end
        // date, based on the current OAI server time, as the basis for future
        // harvest start ranges. Note that this value can also be declared via
        // state data as it should always track the time the harvest was
        // first started.
        // @see https://github.com/vufind-org/vufindharvest/issues/7
        if (empty($this->harvestEndDate)) {
            $explicitHarvestEndDate = $this->getIdentifyResponse()->responseDate;
            // Add support for OAI-PMH hosts that require day granularity by
            // converting the date format if necessary. The null coalescing
            // guard avoids an undefined property notice when the Identify
            // response does not include a granularity element.
            $granularity = $this->granularity == 'auto'
                ? ($this->getIdentifyResponse()->granularity ?? null)
                : $this->granularity;
            if ($granularity == 'YYYY-MM-DD') {
                $explicitHarvestEndDate = substr($explicitHarvestEndDate, 0, 10);
            }
        } else {
            $explicitHarvestEndDate = $this->harvestEndDate;
        }

        // Load last state, if applicable (used to recover from server failure).
        $resumeSet = null;
        $resumeToken = null;
        if ($state = $this->stateManager->loadState()) {
            $this->write("Found saved state; attempting to resume.\n");
            // State data must contain 4 values for reliable resumption.
            if (count($state) !== 4) {
                $this->stateManager->clearState();
                throw new \Exception(
                    'Corrupt or incomplete state data detected; '
                    . 'removing last_state.txt. Please restart harvest.'
                );
            }
            [
                $resumeSet,
                $resumeToken,
                $this->startDate,
                $explicitHarvestEndDate
            ] = $state;
        }

        // Loop through all of the selected sets:
        $limitReached = false;
        foreach ($sets as $set) {
            // If we're resuming and there are multiple sets, find the right one.
            // (Loose comparison is intentional: the saved set may not have the
            // same type as the configured one, e.g. empty string versus null.)
            if (isset($resumeToken) && $resumeSet != $set) {
                continue;
            }

            // Never start requesting another set once the stopAfter limit has
            // been reached; otherwise each remaining set would still fetch
            // (and write) one more page of records.
            if ($this->hasReachedStopAfterLimit()) {
                $limitReached = true;
                break;
            }

            // If we have a token to resume from, pick up there now...
            if (isset($resumeToken)) {
                $token = $resumeToken;
                // Only resume once; later sets start from the requested date.
                $resumeToken = null;
            } else {
                // ...otherwise, start harvesting at the requested date:
                $token = $this->getRecordsByDate(
                    $this->startDate,
                    $set,
                    $this->harvestEndDate
                );
            }

            // Keep harvesting as long as a resumption token is provided:
            while ($token !== false) {
                // If stopAfter is set, stop the entire harvest (not just the
                // current set) after the given limit:
                if ($this->hasReachedStopAfterLimit()) {
                    $limitReached = true;
                    break 2;
                }
                // Save current state in case we need to resume later:
                $this->stateManager->saveState(
                    $set,
                    $token,
                    $this->startDate,
                    $explicitHarvestEndDate
                );
                $token = $this->getRecordsByToken($token);
            }
        }

        if ($limitReached) {
            $this->writeLine(
                'reached limit of records to harvest: ' . $this->stopAfter
            );
            $this->writeLine('stop harvesting.');
        }

        // If we made it this far, all was successful. Save last harvest info and
        // clean up the stored state (unless we have a limit imposed by stopAfter)
        if (empty($this->stopAfter)) {
            $this->stateManager->saveDate($explicitHarvestEndDate);
        }
        $this->stateManager->clearState();
    }

    /**
     * Check whether the optional stopAfter record limit has been reached.
     *
     * @return bool True if a limit is configured and at least that many
     * records have been counted so far.
     */
    protected function hasReachedStopAfterLimit()
    {
        return !empty($this->stopAfter)
            && $this->recordsCount >= $this->stopAfter;
    }

    /**
     * Make an OAI-PMH request. Throws an exception if the response reports an
     * error; returns the response object on success.
     *
     * @param string $verb   OAI-PMH verb to execute.
     * @param array  $params GET parameters for ListRecords method.
     *
     * @return object        SimpleXML-formatted response.
     *
     * @throws \Exception
     * @throws OaiException
     */
    protected function sendRequest($verb, $params = [])
    {
        $response = $this->communicator->request($verb, $params);
        $this->checkResponseForErrors($response);
        return $response;
    }

    /**
     * Check an OAI-PMH response for errors that need to be handled.
     *
     * @param object $result OAI-PMH response (SimpleXML object)
     *
     * @return void
     *
     * @throws \Exception   If a resumption token was rejected while saved
     * state exists (the saved state is cleared first).
     * @throws OaiException For any other error reported in the response.
     */
    protected function checkResponseForErrors($result)
    {
        // Nothing to do when the response carries no error element:
        if (!$result->error) {
            return;
        }

        $attribs = $result->error->attributes();

        // If this is a bad resumption token error and we're trying to
        // restore a prior state, we should clean up. (Loose comparison is
        // required here because the attribute is an XML object, not a string.)
        if (
            $attribs['code'] == 'badResumptionToken'
            && $this->stateManager->loadState()
        ) {
            $this->stateManager->clearState();
            throw new \Exception(
                'Token expired; removing last_state.txt. Please restart harvest.'
            );
        }
        throw new OaiException($attribs['code'], $result->error);
    }

    /**
     * Harvest records using OAI-PMH.
     *
     * Sends a ListRecords request, passes any returned records to the record
     * writer, and updates the running record count.
     *
     * @param array $params GET parameters for ListRecords method.
     *
     * @return mixed        Resumption token if provided, false if finished
     */
    protected function getRecords($params)
    {
        // Make the OAI-PMH request:
        $response = $this->sendRequest('ListRecords', $params);

        // Save the records from the response:
        if ($response->ListRecords->record) {
            $newRecords = count($response->ListRecords->record);
            $this->writeLine(
                '[' . $this->recordsCount . ' records harvested] Processing '
                . $newRecords . ' records...'
            );
            // Keep a running total of harvested records:
            $this->recordsCount += $newRecords;
            $this->writer->write($response->ListRecords->record);
        }

        // If we have a resumption token, keep going; otherwise, we're done.
        // (empty() also covers the case where the element is missing.)
        if (!empty($response->ListRecords->resumptionToken)) {
            return $response->ListRecords->resumptionToken;
        }
        return false;
    }

    /**
     * Harvest records via OAI-PMH using date and set.
     *
     * Empty arguments are omitted from the request entirely.
     *
     * @param string $from  Harvest start date (null for no specific start).
     * @param string $set   Set to harvest (null for all records).
     * @param string $until Harvest end date (null for no specific end).
     *
     * @return mixed        Resumption token if provided, false if finished
     */
    protected function getRecordsByDate($from = null, $set = null, $until = null)
    {
        $params = ['metadataPrefix' => $this->metadataPrefix];
        if (!empty($from)) {
            $params['from'] = $from;
        }
        if (!empty($set)) {
            $params['set'] = $set;
        }
        if (!empty($until)) {
            $params['until'] = $until;
        }
        return $this->getRecords($params);
    }

    /**
     * Harvest records via OAI-PMH using resumption token.
     *
     * @param string $token Resumption token.
     *
     * @return mixed        Resumption token if provided, false if finished
     */
    protected function getRecordsByToken($token)
    {
        // Cast to string because the token may be an XML element object:
        return $this->getRecords(['resumptionToken' => (string)$token]);
    }

    /**
     * Get identify information from OAI-PMH host. Unless $reset = TRUE, this
     * method will only invoke an OAI-PMH call upon its first usage and will
     * return cached data after that.
     *
     * @param bool $reset Whether-or-not to reset identity information
     *                    already fetched during this request.
     *
     * @return \stdClass  An object of response properties as defined by
     *                    http://www.openarchives.org/OAI/openarchivesprotocol.html#Identify
     *                    plus a 'responseDate' property representing the
     *                    datestamp of the identify response in the form
     *                    YYYY-MM-DDThh:mm:ssZ
     *
     * @see http://www.openarchives.org/OAI/openarchivesprotocol.html#Identify
     */
    protected function getIdentifyResponse($reset = false)
    {
        if (empty($this->identifyResponse) || $reset) {
            $response = $this->sendRequest('Identify');
            // Save callers the burden of casting XML elements by preparing a
            // flat list of string properties.
            $this->identifyResponse = (object)(array)$response->Identify;
            $this->identifyResponse->responseDate = (string)$response->responseDate;
        }
        return $this->identifyResponse;
    }

    /**
     * Set date range configuration (support method for constructor).
     *
     * When no 'from' setting is provided, the start date falls back to the
     * date stored by the state manager.
     *
     * @param array $settings Configuration
     *
     * @return void
     */
    protected function storeDateSettings($settings)
    {
        // Set up start/end dates:
        $from = empty($settings['from'])
            ? $this->stateManager->loadDate() : $settings['from'];
        $until = empty($settings['until']) ? null : $settings['until'];
        $this->setStartDate($from);
        $this->setEndDate($until);
    }

    /**
     * Set miscellaneous configuration (support method for constructor).
     *
     * Only keys present in $settings override the property defaults.
     *
     * @param array $settings Configuration
     *
     * @return void
     */
    protected function storeMiscSettings($settings)
    {
        if (isset($settings['set'])) {
            $this->set = $settings['set'];
        }
        if (isset($settings['metadataPrefix'])) {
            $this->metadataPrefix = $settings['metadataPrefix'];
        }
        if (isset($settings['dateGranularity'])) {
            $this->granularity = $settings['dateGranularity'];
        }
        if (isset($settings['stopAfter'])) {
            $this->stopAfter = $settings['stopAfter'];
        }
    }
}