Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
89.38% |
101 / 113 |
|
91.67% |
11 / 12 |
CRAP | |
0.00% |
0 / 1 |
Harvester | |
89.38% |
101 / 113 |
|
91.67% |
11 / 12 |
45.21 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
setEndDate | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
setStartDate | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
launch | |
77.36% |
41 / 53 |
|
0.00% |
0 / 1 |
17.61 | |||
sendRequest | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
checkResponseForErrors | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
4 | |||
getRecords | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
4 | |||
getRecordsByDate | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
4 | |||
getRecordsByToken | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getIdentifyResponse | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
storeDateSettings | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
storeMiscSettings | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
5 |
1 | <?php |
2 | |
3 | /** |
4 | * OAI-PMH Harvest Tool |
5 | * |
6 | * PHP version 7 |
7 | * |
8 | * Copyright (c) Demian Katz 2010. |
9 | * |
10 | * This program is free software; you can redistribute it and/or modify |
11 | * it under the terms of the GNU General Public License version 2, |
12 | * as published by the Free Software Foundation. |
13 | * |
14 | * This program is distributed in the hope that it will be useful, |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | * GNU General Public License for more details. |
18 | * |
19 | * You should have received a copy of the GNU General Public License |
20 | * along with this program; if not, write to the Free Software |
21 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
22 | * |
23 | * @category VuFind |
24 | * @package Harvest_Tools |
25 | * @author Demian Katz <demian.katz@villanova.edu> |
26 | * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License |
27 | * @link https://vufind.org/wiki/indexing:oai-pmh Wiki |
28 | */ |
29 | |
30 | namespace VuFindHarvest\OaiPmh; |
31 | |
32 | use VuFindHarvest\ConsoleOutput\WriterAwareTrait; |
33 | use VuFindHarvest\Exception\OaiException; |
34 | |
35 | use function count; |
36 | |
37 | /** |
38 | * OAI-PMH Harvest Tool |
39 | * |
40 | * @category VuFind |
41 | * @package Harvest_Tools |
42 | * @author Demian Katz <demian.katz@villanova.edu> |
43 | * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License |
44 | * @link https://vufind.org/wiki/indexing:oai-pmh Wiki |
45 | */ |
class Harvester
{
    use WriterAwareTrait;

    /**
     * Record writer (receives every batch of harvested records)
     *
     * @var RecordWriter
     */
    protected $writer;

    /**
     * Low-level OAI-PMH communicator
     *
     * @var Communicator
     */
    protected $communicator;

    /**
     * State manager (persists resumption state and last harvest date)
     *
     * @var StateManager
     */
    protected $stateManager;

    /**
     * Target set(s) to harvest (null for all records)
     *
     * @var string|array|null
     */
    protected $set = null;

    /**
     * Metadata type to harvest
     *
     * @var string
     */
    protected $metadataPrefix = 'oai_dc';

    /**
     * Harvest end date (null for no specific end)
     *
     * @var ?string
     */
    protected $harvestEndDate;

    /**
     * Harvest start date (null for no specific start)
     *
     * @var ?string
     */
    protected $startDate = null;

    /**
     * Date granularity ('auto' to autodetect)
     *
     * @var string
     */
    protected $granularity = 'auto';

    /**
     * Identify information from OAI host (null until first requested)
     *
     * @var ?\stdClass
     */
    protected $identifyResponse = null;

    /**
     * Flag to limit number of harvested records (null = no limit).
     * Used only for testing.
     *
     * @var ?int
     */
    protected $stopAfter = null;

    /**
     * Count of records harvested so far by this instance.
     *
     * @var int
     */
    protected $recordsCount = 0;

    /**
     * Constructor.
     *
     * @param Communicator $communicator Low-level API client
     * @param RecordWriter $writer       Record writer
     * @param StateManager $stateManager State manager
     * @param array        $settings     OAI-PMH settings
     */
    public function __construct(
        Communicator $communicator,
        RecordWriter $writer,
        StateManager $stateManager,
        $settings = []
    ) {
        // Don't time out during harvest!!
        set_time_limit(0);

        // Store dependencies
        $this->communicator = $communicator;
        $this->writer = $writer;
        $this->stateManager = $stateManager;

        // Store other settings (date settings read from the state manager, so
        // the dependencies above must be assigned first)
        $this->storeDateSettings($settings);
        $this->storeMiscSettings($settings);
    }

    /**
     * Set an end date for the harvest (only harvest records BEFORE this date).
     *
     * @param string $date End date (YYYY-MM-DD format).
     *
     * @return void
     */
    public function setEndDate($date)
    {
        $this->harvestEndDate = $date;
    }

    /**
     * Set a start date for the harvest (only harvest records AFTER this date).
     *
     * @param string $date Start date (YYYY-MM-DD format).
     *
     * @return void
     */
    public function setStartDate($date)
    {
        $this->startDate = $date;
    }

    /**
     * Harvest all available documents.
     *
     * Walks through every configured set, following resumption tokens until
     * the server stops providing them. State is saved before each token
     * request so an interrupted harvest can be resumed. When the stopAfter
     * limit is reached, harvesting stops for ALL remaining sets.
     *
     * @return void
     *
     * @throws \Exception
     */
    public function launch()
    {
        // Normalize sets setting to an array:
        $sets = (array)$this->set;
        if (empty($sets)) {
            $sets = [null];
        }

        $explicitHarvestEndDate = $this->getExplicitHarvestEndDate();

        // Load last state, if applicable (used to recover from server failure).
        if ($state = $this->stateManager->loadState()) {
            $this->write("Found saved state; attempting to resume.\n");
            // State data must contain 4 values for reliable resumption.
            if (count($state) !== 4) {
                $this->stateManager->clearState();
                throw new \Exception(
                    'Corrupt or incomplete state data detected; '
                    . 'removing last_state.txt. Please restart harvest.'
                );
            }
            [
                $resumeSet,
                $resumeToken,
                $this->startDate,
                $explicitHarvestEndDate
            ] = $state;
        }

        // Loop through all of the selected sets:
        foreach ($sets as $set) {
            // If we're resuming and there are multiple sets, find the right one.
            // (Loose comparison is intentional: a null set may come back from
            // saved state in a different representation.)
            if (isset($resumeToken) && $resumeSet != $set) {
                continue;
            }

            // An earlier set may already have satisfied the stopAfter limit
            // (possibly in a single response without a resumption token);
            // don't send requests for further sets in that case.
            if ($this->recordLimitReached()) {
                break;
            }

            // If we have a token to resume from, pick up there now...
            if (isset($resumeToken)) {
                $token = $resumeToken;
                unset($resumeToken);
            } else {
                // ...otherwise, start harvesting at the requested date:
                $token = $this->getRecordsByDate(
                    $this->startDate,
                    $set,
                    $this->harvestEndDate
                );
            }

            // Keep harvesting as long as a resumption token is provided:
            while ($token !== false) {
                // If stopAfter is set, stop harvesting (for every remaining
                // set, not just the current one) after the given limit:
                if ($this->recordLimitReached()) {
                    break 2;
                }
                // Save current state in case we need to resume later:
                $this->stateManager->saveState(
                    $set,
                    $token,
                    $this->startDate,
                    $explicitHarvestEndDate
                );
                $token = $this->getRecordsByToken($token);
            }
        }

        // If the resume token was never consumed, the saved state points at a
        // set that is not configured; nothing was harvested, so we must not
        // record this run as a success.
        if (isset($resumeToken)) {
            throw new \Exception(
                'Saved state refers to a set that is not configured for this '
                . 'harvest; fix the set configuration or remove last_state.txt '
                . 'and restart harvest.'
            );
        }

        // If we made it this far, all was successful. Save last harvest info and
        // clean up the stored state (unless we have a limit imposed by stopAfter)
        if (empty($this->stopAfter)) {
            $this->stateManager->saveDate($explicitHarvestEndDate);
        }
        $this->stateManager->clearState();
    }

    /**
     * Determine the explicit end date to record for this harvest.
     *
     * The harvestEndDate may be null. Some OAI-PMH hosts may depend on a
     * null value for backwards compatibility and reliability for various
     * edge cases, so a null value is still used for the initial records
     * request. However, an explicit end date, based on the current OAI server
     * time, is needed as the basis for future harvest start ranges.
     *
     * @return string Configured end date if set; otherwise the response date
     * of the Identify request (truncated to a day when granularity requires)
     *
     * @see https://github.com/vufind-org/vufindharvest/issues/7
     */
    protected function getExplicitHarvestEndDate()
    {
        if (!empty($this->harvestEndDate)) {
            return $this->harvestEndDate;
        }
        $identify = $this->getIdentifyResponse();
        $endDate = $identify->responseDate;
        // Add support for OAI-PMH hosts that require day granularity by
        // converting the date format if necessary.
        $granularity = $this->granularity == 'auto'
            ? $identify->granularity : $this->granularity;
        return $granularity == 'YYYY-MM-DD' ? substr($endDate, 0, 10) : $endDate;
    }

    /**
     * Check whether the stopAfter limit has been reached; if it has, report
     * this through the console writer.
     *
     * @return bool True if harvesting should stop now
     */
    protected function recordLimitReached()
    {
        $reached = !empty($this->stopAfter)
            && $this->recordsCount >= $this->stopAfter;
        if ($reached) {
            $this->writeLine(
                'reached limit of records to harvest: ' . $this->stopAfter
            );
            $this->writeLine('stop harvesting.');
        }
        return $reached;
    }

    /**
     * Make an OAI-PMH request. Throw an exception if there is an error; return
     * a SimpleXML object on success.
     *
     * @param string $verb   OAI-PMH verb to execute.
     * @param array  $params GET parameters for ListRecords method.
     *
     * @return object SimpleXML-formatted response.
     *
     * @throws \Exception
     * @throws OaiException
     */
    protected function sendRequest($verb, $params = [])
    {
        $response = $this->communicator->request($verb, $params);
        $this->checkResponseForErrors($response);
        return $response;
    }

    /**
     * Check an OAI-PMH response for errors that need to be handled.
     *
     * @param object $result OAI-PMH response (SimpleXML object)
     *
     * @return void
     *
     * @throws \Exception   If a resumption token from saved state was rejected
     * @throws OaiException For any other error reported by the server
     */
    protected function checkResponseForErrors($result)
    {
        // Nothing to do unless the response carries an error element:
        if (!$result->error) {
            return;
        }
        $attribs = $result->error->attributes();

        // If this is a bad resumption token error and we're trying to
        // restore a prior state, we should clean up.
        if (
            $attribs['code'] == 'badResumptionToken'
            && $this->stateManager->loadState()
        ) {
            $this->stateManager->clearState();
            throw new \Exception(
                'Token expired; removing last_state.txt. Please restart harvest.'
            );
        }
        throw new OaiException($attribs['code'], $result->error);
    }

    /**
     * Harvest records using OAI-PMH.
     *
     * Sends a ListRecords request, passes any returned records to the record
     * writer and updates the running record count.
     *
     * @param array $params GET parameters for ListRecords method.
     *
     * @return mixed Resumption token if provided, false if finished
     */
    protected function getRecords($params)
    {
        // Make the OAI-PMH request:
        $response = $this->sendRequest('ListRecords', $params);

        // Save the records from the response:
        if ($response->ListRecords->record) {
            $newRecords = count($response->ListRecords->record);
            $this->writeLine(
                '[' . $this->recordsCount . ' records harvested] Processing '
                . $newRecords . ' records...'
            );
            $this->recordsCount += $newRecords;
            $this->writer->write($response->ListRecords->record);
        }

        // If we have a resumption token, keep going; otherwise, we're done.
        // (empty() already covers the "not set" case.)
        if (!empty($response->ListRecords->resumptionToken)) {
            return $response->ListRecords->resumptionToken;
        }
        return false;
    }

    /**
     * Harvest records via OAI-PMH using date and set.
     *
     * Empty arguments are omitted from the request.
     *
     * @param string $from  Harvest start date (null for no specific start).
     * @param string $set   Set to harvest (null for all records).
     * @param string $until Harvest end date (null for no specific end).
     *
     * @return mixed Resumption token if provided, false if finished
     */
    protected function getRecordsByDate($from = null, $set = null, $until = null)
    {
        $params = ['metadataPrefix' => $this->metadataPrefix];
        if (!empty($from)) {
            $params['from'] = $from;
        }
        if (!empty($set)) {
            $params['set'] = $set;
        }
        if (!empty($until)) {
            $params['until'] = $until;
        }
        return $this->getRecords($params);
    }

    /**
     * Harvest records via OAI-PMH using resumption token.
     *
     * @param string $token Resumption token.
     *
     * @return mixed Resumption token if provided, false if finished
     */
    protected function getRecordsByToken($token)
    {
        // The token may be an XML element object; the request needs a string.
        return $this->getRecords(['resumptionToken' => (string)$token]);
    }

    /**
     * Get identify information from OAI-PMH host. Unless $reset = TRUE, this
     * method will only invoke an OAI-PMH call upon its first usage and will
     * return cached data after that.
     *
     * @param boolean $reset Whether-or-not to reset identity information
     *                       already fetched during this request.
     *
     * @return \stdClass An object of response properties as defined by
     *                   http://www.openarchives.org/OAI/openarchivesprotocol.html#Identify
     *                   plus a 'responseDate' property representing the
     *                   datestamp of the identify response in the form
     *                   YYYY-MM-DDThh:mm:ssZ
     *
     * @see http://www.openarchives.org/OAI/openarchivesprotocol.html#Identify
     */
    protected function getIdentifyResponse($reset = false)
    {
        if (empty($this->identifyResponse) || $reset) {
            $response = $this->sendRequest('Identify');
            // Save callers the burden of casting XML elements by preparing a
            // flat list of string properties.
            $this->identifyResponse = (object)(array)$response->Identify;
            $this->identifyResponse->responseDate = (string)$response->responseDate;
        }
        return $this->identifyResponse;
    }

    /**
     * Set date range configuration (support method for constructor).
     *
     * When no 'from' setting is given, the start date is loaded from the
     * state manager instead.
     *
     * @param array $settings Configuration
     *
     * @return void
     */
    protected function storeDateSettings($settings)
    {
        // Set up start/end dates:
        $from = empty($settings['from'])
            ? $this->stateManager->loadDate() : $settings['from'];
        $until = empty($settings['until']) ? null : $settings['until'];
        $this->setStartDate($from);
        $this->setEndDate($until);
    }

    /**
     * Set miscellaneous configuration (support method for constructor).
     *
     * Only settings that are present (and non-null) override the property
     * defaults.
     *
     * @param array $settings Configuration
     *
     * @return void
     */
    protected function storeMiscSettings($settings)
    {
        // Map of configuration key => property name:
        $propertyMap = [
            'set' => 'set',
            'metadataPrefix' => 'metadataPrefix',
            'dateGranularity' => 'granularity',
            'stopAfter' => 'stopAfter',
        ];
        foreach ($propertyMap as $key => $property) {
            if (isset($settings[$key])) {
                $this->$property = $settings[$key];
            }
        }
    }
}