* @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
 * @link     https://vufind.org/wiki/indexing:oai-pmh Wiki
 */

namespace VuFindHarvest\ResponseProcessor;

/**
 * Class for processing API responses into SimpleXML objects.
 *
 * Optionally sanitizes the raw XML with configurable regular expressions
 * (logging the original text when it had to be changed) before parsing it.
 *
 * @category VuFind
 * @package  Harvest_Tools
 * @author   Demian Katz
 * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
 * @link     https://vufind.org/wiki/indexing:oai-pmh Wiki
 */
class SimpleXmlResponseProcessor implements ResponseProcessorInterface
{
    /**
     * Should we sanitize XML?
     *
     * @var bool
     */
    protected $sanitize = false;

    /**
     * Filename for logging bad XML responses (false for none)
     *
     * @var string|bool
     */
    protected $badXmlLog = false;

    /**
     * An array of regex strings used to sanitize XML
     *
     * @var array
     */
    protected $sanitizeRegex = [];

    /**
     * Constructor
     *
     * Reads the 'sanitize', 'badXMLLog' and 'sanitizeRegex' keys from $settings.
     * The bad XML log filename is built by appending 'badXMLLog' directly to
     * $basePath (no separator is inserted).
     *
     * @param string $basePath Base path to harvest directory.
     * @param array  $settings OAI-PMH settings from oai.ini.
     */
    public function __construct($basePath, $settings = [])
    {
        $this->sanitize = $settings['sanitize'] ?? false;
        $this->badXmlLog = isset($settings['badXMLLog'])
            ? $basePath . $settings['badXMLLog'] : false;
        // Default pattern: match every run of characters outside the listed
        // ranges (tab, LF, CR, U+0020-U+D7FF, U+E000-U+FFFD).
        $this->sanitizeRegex = $settings['sanitizeRegex']
            ?? ['/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}]+/u'];
    }

    /**
     * Log a bad XML response by appending it (followed by a blank line) to the
     * configured bad XML log file.
     *
     * @param string $xml Bad XML
     *
     * @return void
     *
     * @throws \Exception If the log file cannot be opened for appending.
     */
    protected function logBadXML($xml)
    {
        // Suppress the native warning; failure is reported via the exception.
        $file = @fopen($this->badXmlLog, 'a');
        if (!$file) {
            throw new \Exception("Problem opening {$this->badXmlLog}.");
        }
        fwrite($file, $xml . "\n\n");
        fclose($file);
    }

    /**
     * Sanitize XML.
     *
     * Normalizes the input to valid UTF-8, replaces every match of the
     * configured regular expressions with a single space and trims the result.
     * If the result differs from the input and a bad XML log is configured,
     * the original input is logged.
     *
     * @param string $rawXml XML to sanitize
     *
     * @return string
     *
     * @throws \Exception If the regular expression replacement fails or the
     * bad XML log cannot be opened.
     */
    protected function sanitizeXml($rawXml)
    {
        // Make sure the encoding is correct before applying regular expressions:
        $utf8xml = mb_convert_encoding($rawXml, 'UTF-8', 'UTF-8');

        // Apply the configured sanitization expressions:
        $replaced = preg_replace($this->sanitizeRegex, ' ', $utf8xml);
        if ($replaced === null) {
            // preg_replace() returns null on failure (e.g. an invalid configured
            // pattern or a PCRE limit being hit). Report that explicitly rather
            // than passing null on and failing later with a misleading XML
            // parse error.
            if ($this->badXmlLog) {
                $this->logBadXML($rawXml);
            }
            throw new \Exception(
                'Problem sanitizing XML: PCRE error code ' . preg_last_error() . '.'
            );
        }

        $newXml = trim($replaced);
        if ($rawXml !== $newXml && $this->badXmlLog) {
            $this->logBadXML($rawXml);
        }

        return $newXml;
    }

    /**
     * Collect LibXML errors into a single string.
     *
     * @return string Trimmed error messages joined with '; ' (empty string if
     * there are no errors).
     */
    protected function collectXmlErrors()
    {
        $callback = static function ($e) {
            return trim($e->message);
        };
        return implode('; ', array_map($callback, libxml_get_errors()));
    }

    /**
     * Process an OAI-PMH response into a SimpleXML object. Throw an exception if
     * an error is detected.
     *
     * @param string $xml Raw XML to process
     *
     * @return mixed
     *
     * @throws \Exception
     */
    public function process($xml)
    {
        // Sanitize if necessary:
        if ($this->sanitize) {
            $xml = $this->sanitizeXml($xml);
        }

        // Parse the XML (newer versions of LibXML require a special flag for
        // large documents, and responses may be quite large):
        $flags = LIBXML_VERSION >= 20900 ? LIBXML_PARSEHUGE : 0;
        $oldSetting = libxml_use_internal_errors(true);
        $result = simplexml_load_string($xml, null, $flags);
        $errors = $this->collectXmlErrors();
        libxml_use_internal_errors($oldSetting);

        // Compare strictly against false: a SimpleXMLElement created from an
        // empty element without attributes converts to boolean false, so a
        // truthiness check would reject some well-formed documents.
        if ($result === false) {
            throw new \Exception('Problem loading XML: ' . $errors);
        }

        // If we got this far, we have a valid response:
        return $result;
    }
}