Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
97.89% |
139 / 142 |
|
75.00% |
9 / 12 |
CRAP | |
0.00% |
0 / 1 |
Importer | |
97.89% |
139 / 142 |
|
75.00% |
9 / 12 |
47 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
save | |
95.00% |
19 / 20 |
|
0.00% |
0 / 1 |
5 | |||
adjustEncoding | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
2 | |||
writeData | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
4 | |||
processHeader | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
6 | |||
getConfiguration | |
83.33% |
5 / 6 |
|
0.00% |
0 / 1 |
2.02 | |||
processConfiguration | |
92.86% |
13 / 14 |
|
0.00% |
0 / 1 |
6.01 | |||
injectCallbackDependencies | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
4 | |||
processCallback | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
7 | |||
applyCallbacks | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
2 | |||
processValues | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
2 | |||
collectValuesFromLine | |
100.00% |
31 / 31 |
|
100.00% |
1 / 1 |
6 |
1 | <?php |
2 | |
3 | /** |
4 | * VuFind CSV importer |
5 | * |
6 | * PHP version 8 |
7 | * |
8 | * Copyright (C) Villanova University 2021. |
9 | * |
10 | * This program is free software; you can redistribute it and/or modify |
11 | * it under the terms of the GNU General Public License version 2, |
12 | * as published by the Free Software Foundation. |
13 | * |
14 | * This program is distributed in the hope that it will be useful, |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | * GNU General Public License for more details. |
18 | * |
19 | * You should have received a copy of the GNU General Public License |
20 | * along with this program; if not, write to the Free Software |
21 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
22 | * |
23 | * @category VuFind |
24 | * @package CSV |
25 | * @author Demian Katz <demian.katz@villanova.edu> |
26 | * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License |
27 | * @link https://vufind.org/wiki/ Wiki |
28 | */ |
29 | |
30 | namespace VuFind\CSV; |
31 | |
32 | use Laminas\ServiceManager\ServiceLocatorInterface; |
33 | use VuFindSearch\Backend\Solr\Document\RawJSONDocument; |
34 | |
35 | use function count; |
36 | |
/**
 * VuFind CSV importer
 *
 * Reads a CSV file row by row, maps columns to fields according to an .ini
 * configuration, applies the configured callbacks to the values, and writes
 * the resulting records to Solr in batches (or returns them as JSON when
 * running in test mode).
 *
 * @category VuFind
 * @package  CSV
 * @author   Demian Katz <demian.katz@villanova.edu>
 * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
 * @link     https://vufind.org/wiki/ Wiki
 */
class Importer
{
    /**
     * Service locator
     *
     * @var ServiceLocatorInterface
     */
    protected $serviceLocator;

    /**
     * Base path for loading .ini files
     *
     * @var string
     */
    protected $configBaseDir;

    /**
     * Constructor
     *
     * @param ServiceLocatorInterface $sm      Service manager
     * @param array                   $options Configuration options; the
     * optional 'configBaseDir' key overrides the default base directory
     * ('import') used when resolving .ini files.
     */
    public function __construct(ServiceLocatorInterface $sm, array $options = [])
    {
        $this->serviceLocator = $sm;
        $this->configBaseDir = $options['configBaseDir'] ?? 'import';
    }

    /**
     * Save a CSV file to the Solr index using the specified configuration.
     *
     * @param string $csvFile  CSV file to load.
     * @param string $iniFile  INI file.
     * @param string $index    Solr index to use.
     * @param bool   $testMode Are we in test-only mode?
     *
     * @throws \Exception
     * @return string Output for test mode
     */
    public function save(
        string $csvFile,
        string $iniFile,
        string $index = 'Solr',
        bool $testMode = false
    ): string {
        $in = fopen($csvFile, 'r');
        if (!$in) {
            throw new \Exception("Cannot open CSV file: {$csvFile}.");
        }
        // Everything below may throw (bad configuration, encoding problems,
        // callback failures, write errors); the finally block guarantees that
        // the file handle is released in every case.
        try {
            $config = $this->getConfiguration($iniFile, $in);
            $batchSize = $config->getBatchSize();
            $encoding = $config->getEncoding();
            $data = [];
            $output = '';
            while (($line = fgetcsv($in)) !== false) {
                // fgetcsv() reports a blank line as an array containing a
                // single null; there is nothing to import from it, so skip it
                // instead of generating a record from an empty row.
                if ($line === [null]) {
                    continue;
                }
                $data[] = $this->collectValuesFromLine(
                    $this->adjustEncoding($line, $encoding),
                    $config
                );
                // If we have finished a batch, write it now and start the
                // next one:
                if (count($data) === $batchSize) {
                    $output .= $this->writeData($data, $index, $testMode);
                    $data = [];
                }
            }
            // If there's an incomplete batch in progress, write the
            // remaining data:
            if (!empty($data)) {
                $output .= $this->writeData($data, $index, $testMode);
            }
            return $output;
        } finally {
            fclose($in);
        }
    }

    /**
     * Fix the character encoding of a CSV line (if necessary).
     *
     * @param array  $line     Input from CSV
     * @param string $encoding Encoding of $line
     *
     * @throws \Exception If a value cannot be converted to UTF-8
     * @return array Input re-encoded as UTF-8 (if not already in UTF-8)
     */
    protected function adjustEncoding(array $line, string $encoding): array
    {
        // We want UTF-8, so if that's already the setting (in either of its
        // common spellings), we don't need to do work:
        if (in_array(strtolower(trim($encoding)), ['utf-8', 'utf8'], true)) {
            return $line;
        }
        return array_map(
            function (string $str) use ($encoding): string {
                $converted = iconv($encoding, 'UTF-8', $str);
                // iconv() returns false on failure; without this check the
                // string return type would silently turn that into ''.
                if ($converted === false) {
                    throw new \Exception(
                        "Cannot convert CSV data from {$encoding} to UTF-8."
                    );
                }
                return $converted;
            },
            $line
        );
    }

    /**
     * Write a batch of JSON data to Solr.
     *
     * @param array  $data     Data to write
     * @param string $index    Target Solr index
     * @param bool   $testMode Are we in test mode?
     *
     * @throws \Exception If the data cannot be encoded as JSON
     * @return string Test mode output (if applicable) or empty string
     */
    protected function writeData(array $data, string $index, bool $testMode): string
    {
        // Format the data appropriately (human-readable for test-mode, concise
        // for real Solr writing):
        $flags = $testMode ? JSON_PRETTY_PRINT : 0;
        $json = json_encode($data, $flags);
        if ($json === false) {
            throw new \Exception(json_last_error_msg(), json_last_error());
        }

        // Save the results (or just return them, if in test mode):
        if ($testMode) {
            return $json;
        }
        $solr = $this->serviceLocator->get(\VuFind\Solr\Writer::class);
        $solr->save($index, new RawJSONDocument($json), 'update');
        return ''; // no output when not in test mode!
    }

    /**
     * Process the header row, and generate a configuration.
     *
     * @param ImporterConfig $config Configuration to be updated
     * @param resource       $in     File handle to CSV
     * @param string         $mode   Header processing mode (fields/none/skip)
     *
     * @return void
     */
    protected function processHeader(ImporterConfig $config, $in, string $mode): void
    {
        switch (strtolower(trim($mode))) {
            case 'fields':
                // Load configuration from the header row:
                $row = fgetcsv($in);
                // fgetcsv() returns false when there is no row to read (e.g.
                // an empty file); in that case there is nothing to configure.
                if (!is_array($row)) {
                    break;
                }
                foreach ($row as $i => $field) {
                    // A blank header line is reported as a single null field;
                    // it does not name a field, so ignore it.
                    if ($field === null) {
                        continue;
                    }
                    $config->configureColumn($i, ['field' => $field]);
                }
                break;
            case 'skip':
                // Just skip a row:
                fgetcsv($in);
                break;
            case 'none':
            default:
                // Do nothing.
                break;
        }
    }

    /**
     * Load and set up the configuration object.
     *
     * @param string   $iniFile Name of .ini file to load
     * @param resource $in      File handle to input file
     *
     * @throws \Exception If the .ini file is missing or cannot be parsed
     * @return ImporterConfig
     */
    protected function getConfiguration(string $iniFile, $in): ImporterConfig
    {
        // Load properties file:
        $resolver = $this->serviceLocator->get(\VuFind\Config\PathResolver::class);
        $ini = $resolver->getConfigPath($iniFile, $this->configBaseDir);
        if (!file_exists($ini)) {
            throw new \Exception("Cannot load .ini file: {$ini}.");
        }
        $options = parse_ini_file($ini, true);
        // parse_ini_file() returns false on failure; report that clearly
        // rather than passing a non-array on to processConfiguration().
        if ($options === false) {
            throw new \Exception("Cannot parse .ini file: {$ini}.");
        }
        return $this->processConfiguration($options, $in);
    }

    /**
     * Determine the list of fields that will be loaded.
     *
     * Sections whose names contain a colon are interpreted as "type:details",
     * where type is "column" or "field"; sections without a colon are not
     * processed here beyond the 'General' settings.
     *
     * @param array    $options Configuration
     * @param resource $in      File handle to input file
     *
     * @throws \Exception If a "type:details" section has an unknown type
     * @return ImporterConfig
     */
    protected function processConfiguration(array $options, $in): ImporterConfig
    {
        $config = new ImporterConfig($options['General'] ?? []);
        $this->processHeader($config, $in, $options['General']['header'] ?? 'none');
        foreach ($options as $section => $settings) {
            if (!str_contains($section, ':')) {
                continue;
            }
            // Split on the first colon only, so that details containing a
            // colon are kept intact instead of being silently truncated.
            [$type, $details] = explode(':', $section, 2);
            $details = trim($details);
            switch (strtolower(trim($type))) {
                case 'column':
                    $config->configureColumn($details, $settings);
                    break;
                case 'field':
                    $config->configureField($details, $settings);
                    break;
                default:
                    throw new \Exception("Unexpected config section: $section");
            }
        }
        return $config;
    }

    /**
     * Inject dependencies into the callback, if necessary.
     *
     * @param string $callable Callback function
     *
     * @return void
     */
    protected function injectCallbackDependencies(string $callable): void
    {
        // Use a static property to keep track of which static classes
        // have already had dependencies injected.
        static $alreadyInjected = [];

        // $callable is one of two formats: "function" or "class::method".
        // We only want to proceed if we have a class name.
        $parts = explode('::', $callable);
        if (count($parts) < 2) {
            return;
        }
        $class = $parts[0];

        // If we haven't already injected dependencies, do it now! This makes
        // it possible to use callbacks from the XSLT importer
        // (e.g. \VuFind\XSLT\Import\VuFind::harvestWithParser)
        if (!isset($alreadyInjected[$class])) {
            if (method_exists($class, 'setServiceLocator')) {
                $class::setServiceLocator($this->serviceLocator);
            }
            $alreadyInjected[$class] = true;
        }
    }

    /**
     * Apply a single callback to a single value.
     *
     * The callback string is either a bare callable name (which receives the
     * CSV value as its only argument) or "name(arg1, arg2, ...)". Arguments
     * wrapped in double dollar signs are directives: $$csv$$ is the current
     * value, $$field:name$$ is the array of values collected so far for a
     * field, and $$fieldFirst:name$$ is the first of those values. Any other
     * argument is passed through as a literal string.
     *
     * @param string $callback    Callback string from config
     * @param string $value       Value to process
     * @param array  $fieldValues Field values processed so far
     *
     * @throws \Exception If the callback string or a directive is invalid
     * @return string[]
     */
    protected function processCallback(
        string $callback,
        string $value,
        array $fieldValues
    ): array {
        $matches = [];
        $matched = preg_match('/([^(]+)(\(.*\))?/', $callback, $matches);
        $callable = $matched ? trim($matches[1]) : '';
        if ($callable === '') {
            throw new \Exception("Invalid callback: {$callback}");
        }
        $this->injectCallbackDependencies($callable);

        // The second capture group always starts with "(" and ends with ")";
        // remove exactly that pair so that literal parentheses used as
        // arguments survive. With no argument list, pass the CSV value.
        $rawArgs = isset($matches[2]) ? substr($matches[2], 1, -1) : '$$csv$$';
        $arglist = array_map('trim', explode(',', $rawArgs));

        $argCallback = static function ($arg) use ($value, $fieldValues) {
            if (!str_starts_with($arg, '$$') || !str_ends_with($arg, '$$')) {
                return $arg;
            }
            $parts = explode(':', trim($arg, '$'), 2);
            return match ($parts[0]) {
                'csv' => $value,
                'field' => $fieldValues[$parts[1] ?? ''] ?? [],
                'fieldFirst' => $fieldValues[$parts[1] ?? ''][0] ?? '',
                default => throw new \Exception(
                    'Unknown directive: ' . $parts[0]
                ),
            };
        };
        $result = $callable(...array_map($argCallback, $arglist));
        return (array)$result;
    }

    /**
     * Recursively apply callback functions to a value.
     *
     * Each callback may return several values; every one of them is passed
     * through the remaining callbacks, and all final values are returned as
     * a flat list.
     *
     * @param string   $value       Value to process
     * @param string[] $callbacks   List of callback functions
     * @param array    $fieldValues Field values processed so far
     *
     * @return string[]
     */
    protected function applyCallbacks(
        string $value,
        array $callbacks,
        array $fieldValues
    ): array {
        // No callbacks, no work:
        if (empty($callbacks)) {
            return [$value];
        }

        // Get the next callback, apply it, and then recurse over its
        // return values. Results are appended one by one (rather than
        // spread into array_merge) so that a callback returning an array
        // with string keys cannot be mistaken for named arguments.
        $nextCallback = array_shift($callbacks);
        $result = [];
        $next = $this->processCallback($nextCallback, $value, $fieldValues);
        foreach ($next as $val) {
            $processed = $this->applyCallbacks($val, $callbacks, $fieldValues);
            foreach ($processed as $final) {
                $result[] = $final;
            }
        }
        return $result;
    }

    /**
     * Process the values from a single column of a CSV.
     *
     * @param string[] $values      Values to process
     * @param array    $fieldConfig Configuration to apply to values (only the
     * optional 'callback' entry is used here)
     * @param array    $fieldValues Field values processed so far
     *
     * @return string[]
     */
    protected function processValues(
        array $values,
        array $fieldConfig,
        array $fieldValues
    ): array {
        // The callback list is the same for every value, so build it once:
        $callbacks = (array)($fieldConfig['callback'] ?? []);
        $processed = [];
        foreach ($values as $value) {
            $newValues = $this->applyCallbacks($value, $callbacks, $fieldValues);
            foreach ($newValues as $newValue) {
                $processed[] = $newValue;
            }
        }
        return $processed;
    }

    /**
     * Collect field-specific values from a CSV input line. Returns an array
     * mapping field name to value array.
     *
     * @param array          $line   Line to process.
     * @param ImporterConfig $config Configuration object.
     *
     * @return array
     */
    protected function collectValuesFromLine(
        array $line,
        ImporterConfig $config
    ): array {
        // First get all hard-coded values...
        $fieldValues = $config->getFixedFieldValues();

        // Now add values mapped directly from the CSV columns...
        $allMappedFields = [];
        foreach ($line as $column => $value) {
            $columnConfig = $config->getColumn($column);
            $values = isset($columnConfig['delimiter'])
                ? explode($columnConfig['delimiter'], $value)
                : (array)$value;
            if (!isset($columnConfig['field'])) {
                continue;
            }
            $fieldList = (array)$columnConfig['field'];
            $allMappedFields = array_merge($allMappedFields, $fieldList);
            foreach ($fieldList as $field) {
                $processed = $this->processValues(
                    $values,
                    $config->getField($field),
                    $fieldValues
                );
                // Tolerate fields that have no entry yet instead of failing
                // inside array_merge():
                $fieldValues[$field] = array_merge(
                    $fieldValues[$field] ?? [],
                    $processed
                );
            }
        }

        // Finally, add any values derived from other fields...
        $remainingFields = $config->getOutstandingCallbacks($allMappedFields);
        foreach ($remainingFields as $field) {
            $fieldConfig = $config->getField($field);
            $processed = $this->processValues(
                (array)($fieldConfig['callbackSeed'] ?? []),
                $fieldConfig,
                $fieldValues
            );
            $fieldValues[$field] = array_merge(
                $fieldValues[$field] ?? [],
                $processed
            );
        }

        return $fieldValues;
    }
}