Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 139 |
|
0.00% |
0 / 10 |
CRAP | |
0.00% |
0 / 1 |
MarcXml | |
0.00% |
0 / 139 |
|
0.00% |
0 / 10 |
1560 | |
0.00% |
0 / 1 |
canParse | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
canParseCollection | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
canParseCollectionFile | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
12 | |||
collectionFromString | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 | |||
fromString | |
0.00% |
0 / 18 |
|
0.00% |
0 / 1 |
42 | |||
toString | |
0.00% |
0 / 46 |
|
0.00% |
0 / 1 |
42 | |||
loadXML | |
0.00% |
0 / 25 |
|
0.00% |
0 / 1 |
42 | |||
openCollectionFile | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
6 | |||
rewind | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
getNextRecord | |
0.00% |
0 / 28 |
|
0.00% |
0 / 1 |
110 |
1 | <?php |
2 | |
3 | /** |
4 | * MARCXML format support class. |
5 | * |
6 | * PHP version 7 |
7 | * |
8 | * Copyright (C) The National Library of Finland 2020-2022. |
9 | * |
10 | * This program is free software; you can redistribute it and/or modify |
11 | * it under the terms of the GNU General Public License version 2, |
12 | * as published by the Free Software Foundation. |
13 | * |
14 | * This program is distributed in the hope that it will be useful, |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | * GNU General Public License for more details. |
18 | * |
19 | * You should have received a copy of the GNU General Public License |
20 | * along with this program; if not, write to the Free Software |
21 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
22 | * |
23 | * @category VuFind |
24 | * @package MARC |
25 | * @author Ere Maijala <ere.maijala@helsinki.fi> |
26 | * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License |
27 | * @link https://vufind.org/wiki/development:plugins:record_drivers Wiki |
28 | */ |
29 | |
30 | namespace VuFind\Marc\Serialization; |
31 | |
32 | use function array_slice; |
33 | use function count; |
34 | use function is_string; |
35 | use function strlen; |
36 | |
/**
 * MARCXML format support class.
 *
 * Converts between MARCXML strings and an array representation consisting of a
 * 'leader' string and a 'fields' list of single-key arrays (tag => contents),
 * and reads records one at a time from a MARCXML collection file.
 *
 * @category VuFind
 * @package  MARC
 * @author   Ere Maijala <ere.maijala@helsinki.fi>
 * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
 * @link     https://vufind.org/wiki/development:plugins:record_drivers Wiki
 */
class MarcXml extends AbstractSerializationFile implements SerializationInterface
{
    /**
     * Current file (empty string when no collection file has been opened)
     *
     * @var string
     */
    protected $fileName = '';

    /**
     * XML Reader for current file (null when no collection file has been opened)
     *
     * @var \XMLReader|null
     */
    protected $xml = null;

    /**
     * Current XML element path (qualified element names from the root down to
     * the element most recently read by getNextRecord)
     *
     * @var array
     */
    protected $currentXmlPath = [];

    /**
     * Check if this class can parse the given MARC string
     *
     * @param string $marc MARC
     *
     * @return bool True if the trimmed string starts with '<'
     */
    public static function canParse(string $marc): bool
    {
        // A pretty naïve check, but it's enough to tell the different formats apart
        return strncmp(trim($marc), '<', 1) === 0;
    }

    /**
     * Check if the serialization class can parse the given MARC collection string
     *
     * @param string $marc MARC
     *
     * @return bool True if the trimmed string starts with '<'
     */
    public static function canParseCollection(string $marc): bool
    {
        // A collection is recognized with exactly the same naïve check as a
        // single record
        return self::canParse($marc);
    }

    /**
     * Check if the serialization class can parse the given MARC collection file
     *
     * Reads the file in small chunks until something other than whitespace is
     * found and checks that content with canParseCollection().
     *
     * @param string $file File name
     *
     * @return bool
     *
     * @throws \Exception If the file cannot be opened
     */
    public static function canParseCollectionFile(string $file): bool
    {
        if (false === ($f = @fopen($file, 'r'))) {
            throw new \Exception("Cannot open file '$file' for reading");
        }
        $s = '';
        try {
            // Stop on the first non-blank chunk, or as soon as fgets() fails.
            // Checking the fgets() result (instead of only feof()) guarantees
            // termination also when reading fails without reaching EOF.
            while ('' === $s && false !== ($chunk = fgets($f, 10))) {
                $s = ltrim($chunk);
            }
        } finally {
            fclose($f);
        }

        return self::canParseCollection($s);
    }

    /**
     * Parse MARC collection from a string into an array
     *
     * @param string $collection MARC record collection in the format supported by
     * the serialization class
     *
     * @throws \Exception If the XML cannot be parsed
     * @return array XML strings, one for each "record" child of the root element
     */
    public static function collectionFromString(string $collection): array
    {
        $xml = static::loadXML(trim($collection));
        $results = [];
        foreach ($xml->record as $record) {
            $results[] = $record->asXML();
        }
        return $results;
    }

    /**
     * Parse MARCXML string
     *
     * @param string $marc MARCXML
     *
     * @throws \Exception If the XML cannot be parsed
     * @return array Array with keys 'leader' (string) and 'fields' (list of
     * single-key arrays: tag => string for control fields, tag => array with
     * 'ind1', 'ind2' and 'subfields' for data fields)
     */
    public static function fromString(string $marc): array
    {
        $xml = static::loadXML(trim($marc));

        // Move to the record element if we were given a collection
        if ($xml->record) {
            $xml = $xml->record;
        }

        $leader = isset($xml->leader) ? (string)$xml->leader[0] : '';
        $fields = [];

        foreach ($xml->controlfield as $controlField) {
            $fields[] = [(string)$controlField['tag'] => (string)$controlField];
        }

        foreach ($xml->datafield as $dataField) {
            $subfields = [];
            foreach ($dataField->subfield as $subfield) {
                $subfields[] = [(string)$subfield['code'] => (string)$subfield];
            }
            $fields[] = [
                (string)$dataField['tag'] => [
                    // A missing or empty indicator becomes a single space
                    'ind1' => str_pad((string)$dataField['ind1'], 1),
                    'ind2' => str_pad((string)$dataField['ind2'], 1),
                    'subfields' => $subfields,
                ],
            ];
        }

        return ['leader' => $leader, 'fields' => $fields];
    }

    /**
     * Convert record to a MARCXML string
     *
     * The record is wrapped in a "collection" element in the MARC21 slim
     * namespace. Missing 'leader', 'fields', indicator or 'subfields' entries are
     * tolerated, and subfields with empty contents are left out.
     *
     * @param array $record Record data
     *
     * @return string
     *
     * @throws \Exception If the generated XML cannot be processed (e.g. because
     * the record data is not valid UTF-8)
     */
    public static function toString(array $record): string
    {
        $xml = new \XMLWriter();
        $xml->openMemory();
        $xml->setIndent(true);
        $xml->startDocument('1.0', 'UTF-8');
        $xml->startElementNs(null, 'collection', 'http://www.loc.gov/MARC21/slim');
        $xml->writeAttribute(
            'xmlns:xsi',
            'http://www.w3.org/2001/XMLSchema-instance'
        );
        $xml->writeAttribute(
            'xsi:schemaLocation',
            'http://www.loc.gov/MARC21/slim'
            . ' http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd'
        );
        $xml->startElement('record');

        $leader = (string)($record['leader'] ?? '');
        if ('' !== $leader) {
            $xml->writeElement('leader', $leader);
        }

        foreach ($record['fields'] ?? [] as $fieldData) {
            // Each field is a single-key array: tag => contents
            $tag = (string)key($fieldData);
            $field = current($fieldData);
            if (is_string($field)) {
                $xml->startElement('controlfield');
                $xml->writeAttribute('tag', $tag);
                $xml->text($field);
                $xml->endElement();
                continue;
            }

            $xml->startElement('datafield');
            $xml->writeAttribute('tag', $tag);
            $xml->writeAttribute('ind1', (string)($field['ind1'] ?? ' '));
            $xml->writeAttribute('ind2', (string)($field['ind2'] ?? ' '));
            foreach ($field['subfields'] ?? [] as $subfield) {
                $subfieldCode = (string)key($subfield);
                $subfieldData = (string)current($subfield);
                // Strict check on the string value so that the result does not
                // depend on PHP's loose comparison rules
                if ('' === $subfieldData) {
                    continue;
                }
                $xml->startElement('subfield');
                $xml->writeAttribute('code', $subfieldCode);
                $xml->text($subfieldData);
                $xml->endElement();
            }
            $xml->endElement();
        }
        $xml->endElement();
        $xml->endDocument();

        // Strip illegal characters from XML. The allowed set is the XML 1.0
        // "Char" production, including the supplementary planes
        // (U+10000 - U+10FFFF) that are valid in XML and must not be removed:
        $result = preg_replace(
            '/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}'
            . '\x{10000}-\x{10FFFF}]+/u',
            '',
            $xml->outputMemory(true)
        );
        if (null === $result) {
            // preg_replace returns null on failure, e.g. for malformed UTF-8
            throw new \Exception(
                'Failed to strip illegal characters from XML (PCRE error code '
                . preg_last_error() . ')'
            );
        }
        return $result;
    }

    /**
     * Load XML into SimpleXMLElement
     *
     * Adds an XML declaration with utf-8 encoding if the string has no
     * declaration, or an encoding attribute if the declaration lacks one. The
     * libxml error handling mode active on entry is restored before returning
     * or throwing.
     *
     * @param string $xml XML
     *
     * @throws \Exception If the XML cannot be parsed; the message lists the
     * libxml errors
     * @return \SimpleXMLElement
     */
    protected static function loadXML(string $xml): \SimpleXMLElement
    {
        // Make sure we have an XML prolog with proper encoding:
        $xmlHead = '<?xml version';
        if (strcasecmp(substr($xml, 0, strlen($xmlHead)), $xmlHead) === 0) {
            $declEnd = strpos($xml, '?>');
            // An unterminated declaration is left as is for the parser to report
            if (false !== $declEnd) {
                $decl = substr($xml, 0, $declEnd);
                if (strpos($decl, 'encoding') === false) {
                    $xml = $decl . ' encoding="utf-8"' . substr($xml, $declEnd);
                }
            }
        } else {
            $xml = '<?xml version="1.0" encoding="utf-8"?>' . "\n\n$xml";
        }

        $saveUseErrors = libxml_use_internal_errors(true);
        try {
            libxml_clear_errors();
            $doc = simplexml_load_string(
                $xml,
                \SimpleXMLElement::class,
                LIBXML_COMPACT
            );
            if (false === $doc) {
                $messageParts = [];
                foreach (libxml_get_errors() as $error) {
                    $messageParts[] = '[' . $error->line . ':' . $error->column
                        . '] Error ' . $error->code . ': ' . $error->message;
                }
                throw new \Exception(
                    $messageParts
                        ? implode("\n", $messageParts)
                        : 'Failed to parse XML'
                );
            }
            return $doc;
        } finally {
            // Do not leave our errors in the global libxml buffer, and restore
            // the caller's error handling mode regardless of the outcome
            libxml_clear_errors();
            libxml_use_internal_errors($saveUseErrors);
        }
    }

    /**
     * Open a collection file
     *
     * The reader state of this object is replaced only after the new file has
     * been opened successfully; a previously opened reader is closed at that
     * point.
     *
     * @param string $file File name
     *
     * @return void
     *
     * @throws \Exception If the file cannot be opened
     */
    public function openCollectionFile(string $file): void
    {
        $reader = new \XMLReader();
        if (false === $reader->open($file)) {
            throw new \Exception("Cannot open file '$file' for reading");
        }
        if (null !== $this->xml) {
            // Release the previous file before switching to the new reader
            $this->xml->close();
        }
        $this->fileName = $file;
        $this->xml = $reader;
        $this->currentXmlPath = [];
    }

    /**
     * Rewind the collection file
     *
     * Reopens the current file so that reading starts again from the beginning.
     *
     * @return void
     *
     * @throws \Exception If no collection file is open or reopening it fails
     */
    public function rewind(): void
    {
        if ('' === $this->fileName) {
            throw new \Exception('Collection file not open');
        }
        $this->openCollectionFile($this->fileName);
    }

    /**
     * Get next record from the file or an empty string on EOF
     *
     * Only "/collection/record" elements that have no namespace or the MARC21
     * slim namespace are returned. Any other element on the second level is
     * reported with an E_NOTICE level message and skipped.
     *
     * @return string Outer XML of the record element or an empty string
     *
     * @throws \Exception If no collection file is open
     */
    public function getNextRecord(): string
    {
        if (null === $this->xml) {
            throw new \Exception('Collection file not open');
        }
        while ($this->xml->read()) {
            if ($this->xml->nodeType !== \XMLReader::ELEMENT) {
                continue;
            }
            // Cut the tracked path back to the depth of the current element
            // before appending the element itself
            $this->currentXmlPath
                = array_slice($this->currentXmlPath, 0, $this->xml->depth);
            $this->currentXmlPath[] = $this->xml->name;
            $ns = $this->xml->namespaceURI;

            $currentPathString = '/' . implode('/', $this->currentXmlPath);
            $pathOk = '/collection/record' === $currentPathString;
            $namespaceOk = !$ns || 'http://www.loc.gov/MARC21/slim' === $ns;
            if ($pathOk && $namespaceOk) {
                return $this->xml->readOuterXML();
            }

            // Only report unexpected elements on the record level; deeper
            // elements (e.g. the contents of a record) are skipped silently
            if (count($this->currentXmlPath) === 2) {
                if (!$pathOk) {
                    $this->message(
                        "Unknown element \"$currentPathString\"",
                        E_NOTICE
                    );
                } else {
                    $this->message(
                        "Unknown namespace \"$ns\" for element \""
                        . $currentPathString . '"',
                        E_NOTICE
                    );
                }
            }
        }
        return '';
    }
}