Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
99.29% |
139 / 140 |
|
90.00% |
9 / 10 |
CRAP | |
0.00% |
0 / 1 |
MarcXml | |
99.29% |
139 / 140 |
|
90.00% |
9 / 10 |
40 | |
0.00% |
0 / 1 |
canParse | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
canParseCollection | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
canParseCollectionFile | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
collectionFromString | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
fromString | |
100.00% |
18 / 18 |
|
100.00% |
1 / 1 |
6 | |||
toString | |
100.00% |
48 / 48 |
|
100.00% |
1 / 1 |
7 | |||
loadXML | |
96.00% |
24 / 25 |
|
0.00% |
0 / 1 |
6 | |||
openCollectionFile | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 | |||
rewind | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
getNextRecord | |
100.00% |
27 / 27 |
|
100.00% |
1 / 1 |
10 |
1 | <?php |
2 | |
3 | /** |
4 | * MARCXML format support class. |
5 | * |
6 | * PHP version 7 |
7 | * |
8 | * Copyright (C) The National Library of Finland 2020-2022. |
9 | * |
10 | * This program is free software; you can redistribute it and/or modify |
11 | * it under the terms of the GNU General Public License version 2, |
12 | * as published by the Free Software Foundation. |
13 | * |
14 | * This program is distributed in the hope that it will be useful, |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | * GNU General Public License for more details. |
18 | * |
19 | * You should have received a copy of the GNU General Public License |
20 | * along with this program; if not, write to the Free Software |
21 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
22 | * |
23 | * @category VuFind |
24 | * @package MARC |
25 | * @author Ere Maijala <ere.maijala@helsinki.fi> |
26 | * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License |
27 | * @link https://vufind.org/wiki/development:plugins:record_drivers Wiki |
28 | */ |
29 | |
30 | namespace VuFind\Marc\Serialization; |
31 | |
32 | use function array_slice; |
33 | use function count; |
34 | use function is_string; |
35 | use function strlen; |
36 | |
37 | /** |
38 | * MARCXML format support class. |
39 | * |
40 | * @category VuFind |
41 | * @package MARC |
42 | * @author Ere Maijala <ere.maijala@helsinki.fi> |
43 | * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License |
44 | * @link https://vufind.org/wiki/development:plugins:record_drivers Wiki |
45 | */ |
46 | class MarcXml extends AbstractSerializationFile implements SerializationInterface |
47 | { |
/**
 * Name of the collection file that was last opened; an empty string means
 * that no file has been opened yet
 *
 * @var string
 */
protected $fileName = '';

/**
 * Streaming reader for the current collection file, or null until a file has
 * been opened
 *
 * @var \XMLReader|null
 */
protected $xml;

/**
 * Element names from the document root down to the element most recently
 * visited by the reader
 *
 * @var string[]
 */
protected $currentXmlPath = [];
68 | |
/**
 * Tell whether the given string looks like MARCXML
 *
 * Only the first character after any leading whitespace is inspected: anything
 * that starts with "<" is accepted. This is deliberately simplistic, but it is
 * sufficient for telling the supported serialization formats apart.
 *
 * @param string $marc MARC
 *
 * @return bool
 */
public static function canParse(string $marc): bool
{
    $firstChar = substr(ltrim($marc), 0, 1);
    return '<' === $firstChar;
}
81 | |
/**
 * Tell whether the given string looks like a MARCXML collection
 *
 * As with single records, only the first character after any leading
 * whitespace is inspected and it must be "<". The check is simplistic by
 * design; it only needs to tell the supported formats apart.
 *
 * @param string $marc MARC
 *
 * @return bool
 */
public static function canParseCollection(string $marc): bool
{
    $content = ltrim($marc);
    return ($content[0] ?? '') === '<';
}
94 | |
/**
 * Check if the serialization class can parse the given MARC collection file
 *
 * Reads the file in small chunks until some non-whitespace content is found
 * and then applies the same check as canParseCollection() to that content.
 *
 * @param string $file File name
 *
 * @throws \Exception If the file cannot be opened
 * @return bool
 */
public static function canParseCollectionFile(string $file): bool
{
    if (false === ($f = @fopen($file, 'r'))) {
        throw new \Exception("Cannot open file '$file' for reading");
    }
    $s = '';
    try {
        // Stop as soon as fgets() reports failure. Relying on feof() alone
        // would loop forever if a read fails without reaching end of file.
        while ('' === $s && false !== ($chunk = fgets($f, 10))) {
            $s = ltrim($chunk);
        }
    } finally {
        // Release the handle even if something above throws
        fclose($f);
    }

    return self::canParseCollection($s);
}
114 | |
/**
 * Split a MARCXML collection string into individual record strings
 *
 * The collection is parsed as XML and every "record" element directly below
 * the root element is serialized back to XML, in document order.
 *
 * @param string $collection MARC record collection in the format supported by
 * the serialization class
 *
 * @throws \Exception If the collection cannot be parsed as XML
 * @return array
 */
public static function collectionFromString(string $collection): array
{
    $root = static::loadXML(trim($collection));
    return array_map(
        static function (\SimpleXMLElement $record) {
            return $record->asXML();
        },
        // Keys must be discarded: every element would otherwise share the
        // key "record" and overwrite the previous one
        iterator_to_array($root->record, false)
    );
}
133 | |
/**
 * Convert a MARCXML string into the array representation of a record
 *
 * The result has two keys: "leader" (string, empty when the record has no
 * leader element) and "fields" (a list of single-element arrays keyed by the
 * field tag). A control field maps its tag to a string. A data field maps its
 * tag to an array with "ind1", "ind2" (each padded to at least one character)
 * and "subfields" (a list of single-element arrays mapping code to value).
 * All control fields are listed before the data fields.
 *
 * @param string $marc MARCXML
 *
 * @throws \Exception If the string cannot be parsed as XML
 * @return array
 */
public static function fromString(string $marc): array
{
    $record = static::loadXML(trim($marc));

    // When handed a collection, continue with the record element inside it
    if ($record->record) {
        $record = $record->record;
    }

    $fields = [];

    foreach ($record->controlfield as $control) {
        $tag = (string)$control['tag'];
        $fields[] = [$tag => (string)$control];
    }

    foreach ($record->datafield as $data) {
        $subfields = [];
        foreach ($data->subfield as $sub) {
            $code = (string)$sub['code'];
            $subfields[] = [$code => (string)$sub];
        }
        $tag = (string)$data['tag'];
        $fields[] = [
            $tag => [
                'ind1' => str_pad((string)$data['ind1'], 1),
                'ind2' => str_pad((string)$data['ind2'], 1),
                'subfields' => $subfields,
            ],
        ];
    }

    return [
        'leader' => isset($record->leader) ? (string)$record->leader[0] : '',
        'fields' => $fields,
    ];
}
173 | |
/**
 * Convert record to a MARCXML string
 *
 * The record is written as a single "record" element inside a "collection"
 * element in the MARC21 slim namespace. Fields whose value is a string are
 * written as control fields, all others as data fields. Subfields with empty
 * content are omitted. Characters that are not allowed in XML 1.0 are removed
 * from the result.
 *
 * @param array $record Record data with "leader" and "fields" elements in the
 * same structure that fromString() produces
 *
 * @throws \Exception If the generated XML cannot be processed (for example,
 * when it is not valid UTF-8)
 * @return string
 */
public static function toString(array $record): string
{
    $xml = new \XMLWriter();
    $xml->openMemory();
    $xml->setIndent(true);
    $xml->startDocument('1.0', 'UTF-8');
    $xml->startElementNs(null, 'collection', 'http://www.loc.gov/MARC21/slim');
    $xml->writeAttribute(
        'xmlns:xsi',
        'http://www.w3.org/2001/XMLSchema-instance'
    );
    $xml->writeAttribute(
        'xsi:schemaLocation',
        'http://www.loc.gov/MARC21/slim'
        . ' http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd'
    );
    $xml->startElement('record');
    // A missing or empty leader is simply left out
    if (!empty($record['leader'])) {
        $xml->writeElement('leader', $record['leader']);
    }

    foreach ($record['fields'] ?? [] as $fieldData) {
        // Each field is a single-element array: tag => contents
        $tag = (string)key($fieldData);
        $field = current($fieldData);
        if (is_string($field)) {
            $xml->startElement('controlfield');
            $xml->writeAttribute('tag', $tag);
            $xml->text($field);
            $xml->endElement();
        } else {
            $xml->startElement('datafield');
            $xml->writeAttribute('tag', $tag);
            $xml->writeAttribute('ind1', $field['ind1']);
            $xml->writeAttribute('ind2', $field['ind2']);
            foreach ($field['subfields'] ?? [] as $subfield) {
                // Each subfield is a single-element array: code => contents
                $subfieldData = current($subfield);
                $subfieldCode = (string)key($subfield);
                // Compare as strings so that only truly empty content is
                // skipped, regardless of PHP version comparison rules
                if ('' === (string)$subfieldData) {
                    continue;
                }
                $xml->startElement('subfield');
                $xml->writeAttribute('code', $subfieldCode);
                $xml->text($subfieldData);
                $xml->endElement();
            }
            $xml->endElement();
        }
    }
    $xml->endElement();
    $xml->endDocument();

    // Strip characters that XML 1.0 does not allow. The allowed set is the
    // full Char production, including the supplementary planes
    // (U+10000-U+10FFFF), so that e.g. rare CJK characters are preserved:
    $xmlString = preg_replace(
        '/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}'
        . '\x{10000}-\x{10FFFF}]+/u',
        '',
        $xml->outputMemory(true)
    );
    // preg_replace returns null on failure, e.g. for malformed UTF-8
    if ($xmlString === null) {
        throw new \Exception('Error processing XML');
    }
    return $xmlString;
}
244 | |
/**
 * Load XML into SimpleXMLElement
 *
 * Ensures that the document starts with an XML declaration that specifies an
 * encoding (UTF-8 is assumed when none is given) and parses it. libxml errors
 * are collected internally during parsing; the caller's libxml error handling
 * mode is restored and the error buffer is emptied before returning or
 * throwing.
 *
 * @param string $xml XML
 *
 * @throws \Exception If parsing fails; the message lists the libxml errors,
 * one per line
 * @return \SimpleXMLElement
 */
protected static function loadXML(string $xml): \SimpleXMLElement
{
    // Make sure we have an XML prolog with proper encoding:
    $xmlHead = '<?xml version';
    if (strcasecmp(substr($xml, 0, strlen($xmlHead)), $xmlHead) === 0) {
        $declEnd = strpos($xml, '?>');
        // An unterminated declaration is left untouched so that the parser
        // reports the actual problem
        if (false !== $declEnd) {
            $decl = substr($xml, 0, $declEnd);
            if (strstr($decl, 'encoding') === false) {
                $xml = $decl . ' encoding="utf-8"' . substr($xml, $declEnd);
            }
        }
    } else {
        $xml = '<?xml version="1.0" encoding="utf-8"?>' . "\n\n$xml";
    }

    $saveUseErrors = libxml_use_internal_errors(true);
    try {
        libxml_clear_errors();
        $doc = simplexml_load_string(
            $xml,
            \SimpleXMLElement::class,
            LIBXML_COMPACT
        );
        if (false === $doc) {
            $messageParts = [];
            foreach (libxml_get_errors() as $error) {
                $messageParts[] = '[' . $error->line . ':' . $error->column
                    . '] Error ' . $error->code . ': ' . $error->message;
            }
            throw new \Exception(implode("\n", $messageParts));
        }
        return $doc;
    } finally {
        // Runs for any outcome: do not leave our errors in the global libxml
        // buffer, and hand the error mode back the way we found it
        libxml_clear_errors();
        libxml_use_internal_errors($saveUseErrors);
    }
}
289 | |
/**
 * Open a collection file
 *
 * On success the file becomes the current collection file and the tracked
 * element path is reset, so that reading starts from the beginning of the
 * file. On failure the object is left as it was before the call.
 *
 * @param string $file File name
 *
 * @return void
 *
 * @throws \Exception If the file cannot be opened
 */
public function openCollectionFile(string $file): void
{
    // Open with a local reader first. Storing an unopened reader would let
    // later reads get past the "file not open" check.
    $reader = new \XMLReader();
    if (false === $reader->open($file)) {
        throw new \Exception("Cannot open file '$file' for reading");
    }
    $this->fileName = $file;
    $this->xml = $reader;
    $this->currentXmlPath = [];
}
309 | |
/**
 * Rewind the collection file
 *
 * Re-opens the file that was last passed to openCollectionFile() so that
 * reading starts over from the beginning.
 *
 * @return void
 *
 * @throws \Exception If no collection file has been opened, or if re-opening
 * the file fails
 */
public function rewind(): void
{
    if ($this->fileName !== '') {
        $this->openCollectionFile($this->fileName);
        return;
    }
    throw new \Exception('Collection file not open');
}
324 | |
/**
 * Get next record from the file or an empty string on EOF
 *
 * Advances the reader element by element while tracking the path of the
 * current element. An element at /collection/record that has no namespace or
 * is in the MARC21 slim namespace is returned as an XML string. Any other
 * element directly below the root element is reported with $this->message()
 * as a notice and skipped; elements at other depths are skipped silently.
 *
 * @return string
 *
 * @throws \Exception If no collection file has been opened
 */
public function getNextRecord(): string
{
    if (null === $this->xml) {
        throw new \Exception('Collection file not open');
    }
    while ($this->xml->read()) {
        if (\XMLReader::ELEMENT === $this->xml->nodeType) {
            // Keep the ancestors of this element and append the element itself
            $path = array_slice($this->currentXmlPath, 0, $this->xml->depth);
            $path[] = $this->xml->name;
            $this->currentXmlPath = $path;

            $namespace = $this->xml->namespaceURI;
            $pathString = '/' . implode('/', $path);
            $isRecordPath = $pathString === '/collection/record';
            $isKnownNamespace = !$namespace
                || $namespace === 'http://www.loc.gov/MARC21/slim';

            if ($isRecordPath && $isKnownNamespace) {
                return $this->xml->readOuterXML();
            }

            // Only complain about elements directly below the root element.
            // Here a correct path implies that the namespace was the problem.
            if (2 === count($path)) {
                $notice = $isRecordPath
                    ? 'Unknown namespace "' . $namespace . '" for element "'
                        . $pathString . '"'
                    : 'Unknown element "' . $pathString . '"';
                $this->message($notice, E_NOTICE);
            }
        }
    }
    return '';
}
370 | } |