Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
100.00% |
28 / 28 |
|
100.00% |
5 / 5 |
CRAP | |
100.00% |
1 / 1 |
SimpleXmlResponseProcessor | |
100.00% |
28 / 28 |
|
100.00% |
5 / 5 |
12 | |
100.00% |
1 / 1 |
__construct | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
logBadXML | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
sanitizeXml | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
collectXmlErrors | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
process | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
4 |
1 | <?php |
2 | |
3 | /** |
4 | * Class for processing API responses into SimpleXML objects. |
5 | * |
6 | * PHP version 7 |
7 | * |
8 | * Copyright (c) Demian Katz 2016. |
9 | * |
10 | * This program is free software; you can redistribute it and/or modify |
11 | * it under the terms of the GNU General Public License version 2, |
12 | * as published by the Free Software Foundation. |
13 | * |
14 | * This program is distributed in the hope that it will be useful, |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | * GNU General Public License for more details. |
18 | * |
19 | * You should have received a copy of the GNU General Public License |
20 | * along with this program; if not, write to the Free Software |
21 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
22 | * |
23 | * @category VuFind |
24 | * @package Harvest_Tools |
25 | * @author Demian Katz <demian.katz@villanova.edu> |
26 | * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License |
27 | * @link https://vufind.org/wiki/indexing:oai-pmh Wiki |
28 | */ |
29 | |
30 | namespace VuFindHarvest\ResponseProcessor; |
31 | |
32 | /** |
33 | * Class for processing API responses into SimpleXML objects. |
34 | * |
35 | * @category VuFind |
36 | * @package Harvest_Tools |
37 | * @author Demian Katz <demian.katz@villanova.edu> |
38 | * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License |
39 | * @link https://vufind.org/wiki/indexing:oai-pmh Wiki |
40 | */ |
class SimpleXmlResponseProcessor implements ResponseProcessorInterface
{
    /**
     * Should we sanitize XML before parsing it?
     *
     * @var bool
     */
    protected $sanitize = false;

    /**
     * Filename for logging bad XML responses (false for none)
     *
     * @var string|bool
     */
    protected $badXmlLog = false;

    /**
     * An array of regex strings used to sanitize XML; every match is replaced
     * with a single space.
     *
     * @var array
     */
    protected $sanitizeRegex = [];

    /**
     * Constructor
     *
     * Recognized keys in $settings:
     * - sanitize:      truthy to run sanitizeXml() on every response
     * - badXMLLog:     log filename, appended directly to $basePath
     * - sanitizeRegex: pattern(s) to pass to preg_replace(); defaults to a
     *                  single pattern matching runs of characters outside the
     *                  listed tab/LF/CR/U+0020-U+D7FF/U+E000-U+FFFD ranges
     *
     * @param string $basePath Base path to harvest directory.
     * @param array  $settings OAI-PMH settings from oai.ini.
     */
    public function __construct($basePath, $settings = [])
    {
        $this->sanitize = $settings['sanitize'] ?? false;
        $this->badXmlLog = isset($settings['badXMLLog'])
            ? $basePath . $settings['badXMLLog'] : false;
        $this->sanitizeRegex = $settings['sanitizeRegex']
            ?? ['/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}]+/u'];
    }

    /**
     * Log a bad XML response by appending it (followed by a blank line) to the
     * configured bad XML log file.
     *
     * @param string $xml Bad XML
     *
     * @return void
     *
     * @throws \Exception If the log file cannot be opened or written.
     */
    protected function logBadXML($xml)
    {
        // The warning is suppressed because failure is reported through the
        // exception below instead.
        $file = @fopen($this->badXmlLog, 'a');
        if (!$file) {
            throw new \Exception("Problem opening {$this->badXmlLog}.");
        }
        try {
            if (fwrite($file, $xml . "\n\n") === false) {
                throw new \Exception("Problem writing to {$this->badXmlLog}.");
            }
        } finally {
            // Always release the handle, even when the write fails.
            fclose($file);
        }
    }

    /**
     * Sanitize XML.
     *
     * Re-encodes the input as UTF-8, replaces every match of the configured
     * sanitize pattern(s) with a space, and trims the result. If the result
     * differs from the input and a bad XML log is configured, the original
     * input is logged.
     *
     * @param string $rawXml XML to sanitize
     *
     * @return string
     *
     * @throws \Exception If the sanitize pattern(s) cannot be applied or the
     * bad XML log cannot be written.
     */
    protected function sanitizeXml($rawXml)
    {
        // Make sure the encoding is correct before applying regular expressions:
        $utf8xml = mb_convert_encoding($rawXml, 'UTF-8', 'UTF-8');

        // preg_replace() returns null on failure (e.g. an invalid configured
        // pattern); report that directly rather than letting it degrade into
        // an empty document and a misleading parse error later on.
        $replaced = preg_replace($this->sanitizeRegex, ' ', $utf8xml);
        if ($replaced === null) {
            throw new \Exception(
                'Problem sanitizing XML: PCRE error code ' . preg_last_error()
            );
        }
        $newXml = trim($replaced);

        if ($rawXml !== $newXml && $this->badXmlLog) {
            $this->logBadXML($rawXml);
        }

        return $newXml;
    }

    /**
     * Collect LibXML errors into a single string (trimmed messages joined
     * with "; ").
     *
     * @return string
     */
    protected function collectXmlErrors()
    {
        $callback = static function ($e) {
            return trim($e->message);
        };
        return implode('; ', array_map($callback, libxml_get_errors()));
    }

    /**
     * Process an OAI-PMH response into a SimpleXML object. Throw an exception if
     * an error is detected.
     *
     * @param string $xml Raw XML to process
     *
     * @return \SimpleXMLElement
     *
     * @throws \Exception
     */
    public function process($xml)
    {
        // Sanitize if necessary:
        if ($this->sanitize) {
            $xml = $this->sanitizeXml($xml);
        }

        // Parse the XML (newer versions of LibXML require a special flag for
        // large documents, and responses may be quite large):
        $flags = LIBXML_VERSION >= 20900 ? LIBXML_PARSEHUGE : 0;
        $oldSetting = libxml_use_internal_errors(true);
        try {
            $result = simplexml_load_string($xml, \SimpleXMLElement::class, $flags);
            $errors = $this->collectXmlErrors();
        } finally {
            // Empty the error buffer so errors from this parse do not show up
            // in the message of a later call, then restore the caller's
            // error-handling mode.
            libxml_clear_errors();
            libxml_use_internal_errors($oldSetting);
        }

        // Compare strictly against false: an empty root element such as <a/>
        // is a valid SimpleXMLElement that nevertheless casts to boolean false.
        if ($result === false) {
            throw new \Exception('Problem loading XML: ' . $errors);
        }

        // If we got this far, we have a valid response:
        return $result;
    }
}