\\"\{\}\|\^\`]/';

    /** @var string|null Input that has been received but not yet tokenized; reset to null after a syntax error */
    private $input;

    /** @var int Line number of the input position currently being tokenized (starts at 1) */
    private $line = 1;

    /** @var mixed Raw value of the `comments` option (null when absent); only its truthiness is used */
    private $comments;

    /** @var bool Whether N3-only tokens (variables, paths, implication arrows, `=`) are recognised */
    private $n3Mode;

    /** @var string|null Type of the most recently emitted token (or the pseudo-type `^^`) */
    private $prevTokenType;

    /** @var callable|null Unrestricted tokenizer, kept when line mode wraps it */
    private $_oldTokenize;

    /** @var callable Active tokenizer; invoked by tokenize() */
    private $_tokenize;

    /**
     * Creates a lexer.
     *
     * Recognised options (all optional):
     *  - `lineMode`: when truthy, only the token types permitted by the line-based
     *    formats (N-Triples / N-Quads) are accepted; anything else is a syntax error.
     *  - `n3`: N3 functionality is enabled unless this is exactly `false`.
     *  - `comments`: when truthy, comment tokens are emitted.
     *
     * @param array $options
     */
    public function __construct($options = [])
    {
        $this->initTokenize();
        $this->escapeReplacements = [
            '\\' => '\\', "'" => "'", '"' => '"',
            'n' => "\n", 'r' => "\r", 't' => "\t", 'f' => "\f", 'b' => \chr(8),
            '_' => '_', '~' => '~', '.' => '.', '-' => '-', '!' => '!', '$' => '$', '&' => '&',
            '(' => '(', ')' => ')', '*' => '*', '+' => '+', ',' => ',', ';' => ';', '=' => '=',
            '/' => '/', '?' => '?', '#' => '#', '@' => '@', '%' => '%',
        ];

        // In line mode (N-Triples or N-Quads), only simple features may be parsed.
        // The option is optional, so guard the lookup instead of reading a missing key.
        if (!empty($options['lineMode'])) {
            // Don't tokenize special literals: these patterns can never match
            $this->tripleQuotedString = '/$0^/';
            $this->number = '/$0^/';
            $this->boolean = '/$0^/';

            // Swap the tokenize method for a restricted version
            $this->_oldTokenize = $this->_tokenize;
            $this->_tokenize = function ($input, $finalize = true) {
                $tokens = \call_user_func($this->_oldTokenize, $input, $finalize);
                foreach ($tokens as $token) {
                    if (!preg_match('/^(?:blank|IRI|prefixed|literal|langcode|type|typeIRI|\.|eof)$/', $token['type'])) {
                        throw $this->syntaxError($token['type'], $token['line']);
                    }
                }

                return $tokens;
            };
        }

        // Enable N3 functionality by default: only an explicit `false` turns it off
        $this->n3Mode = !isset($options['n3']) || false !== $options['n3'];

        // Disable comment tokens by default
        $this->comments = isset($options['comments']) ? $options['comments'] : null;
    }

    // ## Regular expressions
    // Each pattern is preceded, where available, by the JavaScript expression it was ported from.

    //_iri: /^<((?:[^ <>{}\\]|\\[uU])+)>[ \t]*/, // IRI with escape sequences; needs sanity check after unescaping
    private $iri = '/^<((?:[^ <>{}\\\\]|\\\\[uU])+)>[ \\t]*/'; // IRI with escape sequences; needs sanity check after unescaping
    // _unescapedIri: /^<([^\x00-\x20<>\\"\{\}\|\^\`]*)>[ \t]*/, // IRI without escape sequences; no unescaping
    private $unescapedIri = '/^<([^\\x00-\\x20<>\\\\"\\{\\}\\|\\^\\`]*)>[ \\t]*/'; // IRI without escape sequences; no unescaping
    // _unescapedString: /^"[^"\\]+"(?=[^"\\])/, // non-empty string without escape sequences
    private $unescapedString = '/^"[^\\\\"]+"(?=[^\\\\"])/'; // non-empty string without escape sequences
    // _singleQuotedString: /^"[^"\\]*(?:\\.[^"\\]*)*"(?=[^"\\])|^'[^'\\]*(?:\\.[^'\\]*)*'(?=[^'\\])/,
    private $singleQuotedString = '/^"[^"\\\\]*(?:\\\\.[^"\\\\]*)*"(?=[^"\\\\])|^\'[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*\'(?=[^\'\\\\])/';
    // _tripleQuotedString: /^""("[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*")""|^''('[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*')''/,
    private $tripleQuotedString = '/^""("[^\\\\"]*(?:(?:\\\\.|"(?!""))[^\\\\"]*)*")""|^\'\'(\'[^\\\\\']*(?:(?:\\\\.|\'(?!\'\'))[^\\\\\']*)*\')\'\'/';
    private $langcode = '/^@([a-z]+(?:-[a-z0-9]+)*)(?=[^a-z0-9\\-])/i';
    private $prefix = '/^((?:[A-Za-z\\xc0-\\xd6\\xd8-\\xf6])(?:\\.?[\\-0-9A-Z_a-z\\xb7\\xc0-\\xd6\\xd8-\\xf6])*)?:(?=[#\\s<])/';
    private $prefixed = "/^((?:[A-Za-z\\xc0-\\xd6\\xd8-\\xf6\\xf8-\\x{02ff}\\x{0370}-\\x{037d}\\x{037f}-\\x{1fff}\\x{200c}\\x{200d}\\x{2070}-\\x{218f}\\x{2c00}-\\x{2fef}\\x{3001}-\\x{d7ff}\\x{f900}-\\x{fdcf}\\x{fdf0}-\\x{fffd}])(?:\\.?[\\-0-9A-Z_a-z\\xb7\\xc0-\\xd6\\xd8-\\xf6\\xf8-\\x{037d}\\x{037f}-\\x{1fff}\\x{200c}\\x{200d}\\x{203f}\\x{2040}\\x{2070}-\\x{218f}\\x{2c00}-\\x{2fef}\\x{3001}-\\x{d7ff}\\x{f900}-\\x{fdcf}\\x{fdf0}-\\x{fffd}])*)?:((?:(?:[0-:A-Z_a-z\\xc0-\\xd6\\xd8-\\xf6\\xf8-\\x{02ff}\\x{0370}-\\x{037d}\\x{037f}-\\x{1fff}\\x{200c}\\x{200d}\\x{2070}-\\x{218f}\\x{2c00}-\\x{2fef}\\x{3001}-\\x{d7ff}\\x{f900}-\\x{fdcf}\\x{fdf0}-\\x{fffd}]|%[0-9a-fA-F]{2}|\\\\[!#-\\/;=?\\-@_~])(?:(?:[\\.\\-0-:A-Z_a-z\\xb7\\xc0-\\xd6\\xd8-\\xf6\\xf8-\\x{037d}\\x{037f}-\\x{1fff}\\x{200c}\\x{200d}\\x{203f}\\x{2040}\\x{2070}-\\x{218f}\\x{2c00}-\\x{2fef}\\x{3001}-\\x{d7ff}\\x{f900}-\\x{fdcf}\\x{fdf0}-\\x{fffd}]|%[0-9a-fA-F]{2}|\\\\[!#-\\/;=?\\-@_~])*(?:[\\-0-:A-Z_a-z\\xb7\\xc0-\\xd6\\xd8-\\xf6\\xf8-\\x{037d}\\x{037f}-\\x{1fff}\\x{200c}\\x{200d}\\x{203f}\\x{2040}\\x{2070}-\\x{218f}\\x{2c00}-\\x{2fef}\\x{3001}-\\x{d7ff}\\x{f900}-\\x{fdcf}\\x{fdf0}-\\x{fffd}]|%[0-9a-fA-F]{2}|\\\\[!#-\\/;=?\\-@_~]))?)?)(?:[ \\t]+|(?=\\.?[,;!\\^\\s#()\\[\\]\\{\\}\"'<]))/u";
    private $variable = '/^\\?(?:(?:[A-Z_a-z\\xc0-\\xd6\\xd8-\\xf6])(?:[\\-0-:A-Z_a-z\\xb7\\xc0-\\xd6\\xd8-\\xf6])*)(?=[.,;!\\^\\s#()\\[\\]\\{\\}"\'<])/';
    private $blank = '/^_:((?:[0-9A-Z_a-z\\xc0-\\xd6\\xd8-\\xf6])(?:\\.?[\\-0-9A-Z_a-z\\xb7\\xc0-\\xd6\\xd8-\\xf6])*)(?:[ \\t]+|(?=\\.?[,;:\\s#()\\[\\]\\{\\}"\'<]))/';
    private $number = "/^[\\-+]?(?:\\d+\\.?\\d*([eE](?:[\\-\\+])?\\d+)|\\d*\\.?\\d+)(?=[.,;:\\s#()\\[\\]\\{\\}\"'<])/";
    private $boolean = '/^(?:true|false)(?=[.,;\\s#()\\[\\]\\{\\}"\'<])/';
    private $keyword = '/^@[a-z]+(?=[\\s#<])/i';
    private $sparqlKeyword = '/^(?:PREFIX|BASE|GRAPH)(?=[\\s#<])/i';
    private $shortPredicates = '/^a(?=\\s+|<)/';
    private $newline = '/^[ \\t]*(?:#[^\\n\\r]*)?(?:\\r\\n|\\n|\\r)[ \\t]*/';
    private $comment = '/#([^\\n\\r]*)/';
    private $whitespace = '/^[ \\t]+/';
    private $endOfFile = '/^(?:#[^\\n\\r]*)?$/';

    /**
     * Tokenizes the buffered input as far as possible, emitting tokens through the callback.
     *
     * The callback is invoked as `$callback($error, $token)`: with `(null, array)` for each
     * token (keys `line`, `type`, `value`, `prefix`) and with `(\Exception, null)` on a syntax error.
     *
     * @param callable $callback      receives every token or the syntax error
     * @param bool     $inputFinished true when no more input will follow; an `eof` token is then
     *                                emitted and anything that cannot be tokenized is an error
     *
     * @return string|null the input that is left over for the next call; null once the input
     *                     was finished or a syntax error was reported
     */
    private function tokenizeToEnd($callback, $inputFinished)
    {
        // Continue parsing as far as possible; the loop will return eventually
        $input = $this->input;

        // Signals the syntax error through the callback, quoting the offending run of non-space characters
        $reportSyntaxError = function ($self) use ($callback, &$input) {
            preg_match('/^\S*/', $input, $match);
            $callback($self->syntaxError($match[0], $self->line), null);
        };

        $outputComments = $this->comments;

        while (true) {
            // Count and skip whitespace lines
            $whiteSpaceMatch = null;
            $comment = null;
            while (preg_match($this->newline, $input, $whiteSpaceMatch)) {
                // NOTE: comments on these skipped lines are not emitted as tokens, even when the
                // `comments` option is set; only a trailing comment at the very end is (see #29).

                // Advance the input
                $input = substr($input, \strlen($whiteSpaceMatch[0]));
                ++$this->line;
            }

            // Skip whitespace on current line
            if (preg_match($this->whitespace, $input, $whiteSpaceMatch)) {
                $input = substr($input, \strlen($whiteSpaceMatch[0]));
            }

            // Stop for now if we're at the end
            if (preg_match($this->endOfFile, $input)) {
                // If the input is finished, emit EOF
                if ($inputFinished) {
                    // Try to find a final comment
                    if ($outputComments && preg_match($this->comment, $input, $comment)) {
                        $callback(null, ['line' => $this->line, 'type' => 'comment', 'value' => $comment[1], 'prefix' => '']);
                    }
                    $input = null;
                    $callback(null, ['line' => $this->line, 'type' => 'eof', 'value' => '', 'prefix' => '']);
                }
                $this->input = $input;

                return $input;
            }

            // Look for specific token types based on the first character
            $line = $this->line;
            $type = '';
            $value = '';
            $prefix = '';
            $firstChar = $input[0];
            $match = null;
            $matchLength = 0;
            $unescaped = null;
            $inconclusive = false;

            switch ($firstChar) {
                case '^':
                    // We need at least 3 tokens lookahead to distinguish ^^<IRI> and ^^pre:fixed
                    if (\strlen($input) < 3) {
                        break;
                    }
                    // Try to match a type
                    elseif ('^' === $input[1]) {
                        $this->prevTokenType = '^^';
                        // Move to type IRI or prefixed name
                        $input = substr($input, 2);
                        if ('<' !== $input[0]) {
                            $inconclusive = true;
                            break;
                        }
                    }
                    // If no type, it must be a path expression
                    else {
                        if ($this->n3Mode) {
                            $matchLength = 1;
                            $type = '^';
                        }
                        break;
                    }
                    // Fall through in case the type is an IRI
                    // no break
                case '<':
                    // Try to find a full IRI without escape sequences
                    if (preg_match($this->unescapedIri, $input, $match)) {
                        $type = 'IRI';
                        $value = $match[1];
                    }
                    // Try to find a full IRI with escape sequences
                    elseif (preg_match($this->iri, $input, $match)) {
                        $unescaped = $this->unescape($match[1]);
                        if (null === $unescaped || preg_match($this->illegalIriChars, $unescaped)) {
                            return $reportSyntaxError($this);
                        }
                        $type = 'IRI';
                        $value = $unescaped;
                    }
                    // Try to find a backwards implication arrow
                    elseif ($this->n3Mode && \strlen($input) > 1 && '=' === $input[1]) {
                        $type = 'inverse';
                        $matchLength = 2;
                        $value = 'http://www.w3.org/2000/10/swap/log#implies';
                    }
                    break;

                case '_':
                    // Try to find a blank node. Since it can contain (but not end with) a dot,
                    // we always need a non-dot character before deciding it is a prefixed name.
                    // Therefore, try inserting a space if we're at the end of the input.
                    if ((preg_match($this->blank, $input, $match)) || $inputFinished && (preg_match($this->blank, $input.' ', $match))) {
                        $type = 'blank';
                        $prefix = '_';
                        $value = $match[1];
                    }
                    break;

                case '"':
                case "'":
                    // Try to find a non-empty double-quoted literal without escape sequences
                    if (preg_match($this->unescapedString, $input, $match)) {
                        $type = 'literal';
                        $value = $match[0];
                    }
                    // Try to find any other literal wrapped in a pair of single or double quotes
                    elseif (preg_match($this->singleQuotedString, $input, $match)) {
                        $unescaped = $this->unescape($match[0]);
                        if (null === $unescaped) {
                            return $reportSyntaxError($this);
                        }
                        $type = 'literal';
                        $value = preg_replace('/^\'|\'$/', '"', $unescaped);
                    }
                    // Try to find a literal wrapped in three pairs of single or double quotes
                    elseif (preg_match($this->tripleQuotedString, $input, $match)) {
                        // preg_match reports a non-participating group that precedes a participating
                        // one as '', so isset() alone cannot tell the two alternatives apart: when the
                        // '''…''' alternative matched, group 1 is the empty string and group 2 holds
                        // the literal. A participating group 1 always contains at least its two quotes.
                        $unescaped = (isset($match[1]) && '' !== $match[1]) ? $match[1] : $match[2];
                        // Count the newlines and advance line counter
                        $this->line += \count(preg_split('/\r\n|\r|\n/', $unescaped)) - 1;
                        $unescaped = $this->unescape($unescaped);
                        if (null === $unescaped) {
                            return $reportSyntaxError($this);
                        }
                        $type = 'literal';
                        $value = preg_replace("/^'|'$/", '"', $unescaped);
                    }
                    break;

                case '?':
                    // Try to find a variable
                    if ($this->n3Mode && (preg_match($this->variable, $input, $match))) {
                        $type = 'var';
                        $value = $match[0];
                    }
                    break;

                case '@':
                    // Try to find a language code
                    if ('literal' === $this->prevTokenType && preg_match($this->langcode, $input, $match)) {
                        $type = 'langcode';
                        $value = $match[1];
                    }
                    // Try to find a keyword
                    elseif (preg_match($this->keyword, $input, $match)) {
                        $type = $match[0];
                    }
                    break;

                case '.':
                    // Try to find a dot as punctuation
                    if (1 === \strlen($input) ? $inputFinished : ($input[1] < '0' || $input[1] > '9')) {
                        $type = '.';
                        $matchLength = 1;
                        break;
                    }
                    // Fall through to numerical case (could be a decimal dot)
                    // no break
                case '0':
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7':
                case '8':
                case '9':
                case '+':
                case '-':
                    // Try to find a number
                    if (preg_match($this->number, $input, $match)) {
                        // Group 1 (the exponent) only exists for doubles
                        if (isset($match[1])) {
                            $numberType = 'double';
                        } elseif (preg_match('/^[+\-]?\d+$/', $match[0])) {
                            $numberType = 'integer';
                        } else {
                            $numberType = 'decimal';
                        }
                        $type = 'literal';
                        $value = '"'.$match[0].'"^^http://www.w3.org/2001/XMLSchema#'.$numberType;
                    }
                    break;

                case 'B':
                case 'b':
                case 'p':
                case 'P':
                case 'G':
                case 'g':
                    // Try to find a SPARQL-style keyword
                    if (preg_match($this->sparqlKeyword, $input, $match)) {
                        $type = strtoupper($match[0]);
                    } else {
                        $inconclusive = true;
                    }
                    break;

                case 'f':
                case 't':
                    // Try to match a boolean
                    if (preg_match($this->boolean, $input, $match)) {
                        $type = 'literal';
                        $value = '"'.$match[0].'"^^http://www.w3.org/2001/XMLSchema#boolean';
                    } else {
                        $inconclusive = true;
                    }
                    break;

                case 'a':
                    // Try to find an abbreviated predicate
                    if (preg_match($this->shortPredicates, $input, $match)) {
                        $type = 'abbreviation';
                        $value = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type';
                    } else {
                        $inconclusive = true;
                    }
                    break;

                case '=':
                    // Try to find an implication arrow or equals sign
                    if ($this->n3Mode && \strlen($input) > 1) {
                        $type = 'abbreviation';
                        if ('>' !== $input[1]) {
                            $matchLength = 1;
                            $value = 'http://www.w3.org/2002/07/owl#sameAs';
                        } else {
                            $matchLength = 2;
                            $value = 'http://www.w3.org/2000/10/swap/log#implies';
                        }
                    }
                    break;

                case '!':
                    if (!$this->n3Mode) {
                        break;
                    }
                    // no break
                case ',':
                case ';':
                case '[':
                case ']':
                case '(':
                case ')':
                case '{':
                case '}':
                    // The next token is punctuation
                    $matchLength = 1;
                    $type = $firstChar;
                    break;

                default:
                    $inconclusive = true;
            }

            // Some first characters do not allow an immediate decision, so inspect more
            if ($inconclusive) {
                // Try to find a prefix
                if (('@prefix' === $this->prevTokenType || 'PREFIX' === $this->prevTokenType) && preg_match($this->prefix, $input, $match)) {
                    $type = 'prefix';
                    $value = isset($match[1]) ? $match[1] : '';
                }
                // Try to find a prefixed name. Since it can contain (but not end with) a dot,
                // we always need a non-dot character before deciding it is a prefixed name.
                // Therefore, try inserting a space if we're at the end of the input.
                elseif (preg_match($this->prefixed, $input, $match) || $inputFinished && (preg_match($this->prefixed, $input.' ', $match))) {
                    $type = 'prefixed';
                    $prefix = isset($match[1]) ? $match[1] : '';
                    $value = $this->unescape($match[2]);
                    // Same handling as the other unescape() callers
                    if (null === $value) {
                        return $reportSyntaxError($this);
                    }
                }
            }

            // A type token is special: it can only be emitted after an IRI or prefixed name is read
            if ('^^' === $this->prevTokenType) {
                switch ($type) {
                    case 'prefixed':
                        $type = 'type';
                        break;
                    case 'IRI':
                        $type = 'typeIRI';
                        break;
                    default:
                        $type = '';
                }
            }

            // What if nothing of the above was found?
            if (!$type) {
                // We could be in streaming mode, and then we just wait for more input to arrive.
                // Otherwise, a syntax error has occurred in the input.
                // One exception: error on an unaccounted linebreak (= not inside a triple-quoted literal).
                if ($inputFinished || (!preg_match('/^\'\'\'|^"""/', $input) && preg_match('/\\n|\\r/', $input))) {
                    return $reportSyntaxError($this);
                }

                $this->input = $input;

                return $input;
            }

            // Emit the parsed token
            $callback(null, ['line' => $line, 'type' => $type, 'value' => $value, 'prefix' => $prefix]);
            $this->prevTokenType = $type;

            // Advance to next part to tokenize: fixed-length tokens set $matchLength,
            // all others consume the full regex match
            $input = substr($input, $matchLength > 0 ? $matchLength : \strlen($match[0]));
        }
    }

    /**
     * Replaces N3 escape codes by their corresponding characters.
     *
     * @param string $item text that may contain escape sequences
     *
     * @return string|null the unescaped text, or null when it contains an escape sequence
     *                     that has no replacement (callers report this as a syntax error)
     */
    private function unescape($item)
    {
        try {
            return preg_replace_callback($this->escapeSequence, function ($match) {
                // $match[0] contains the whole sequence
                $unicode4 = isset($match[1]) ? $match[1] : null;
                $unicode8 = isset($match[2]) ? $match[2] : null;
                $escapedChar = isset($match[3]) ? $match[3] : null;

                if ($unicode4) {
                    $charCode = \intval($unicode4, 16);

                    return mb_convert_encoding('&#'.(int) $charCode.';', 'UTF-8', 'HTML-ENTITIES');
                }
                if ($unicode8) {
                    $charCode = \intval($unicode8, 16);

                    return mb_convert_encoding('&#'.(int) $charCode.';', 'UTF-8', 'HTML-ENTITIES');
                }
                if (null === $escapedChar || !isset($this->escapeReplacements[$escapedChar])) {
                    // Unknown escape: abort the whole replacement
                    throw new \Exception();
                }

                return $this->escapeReplacements[$escapedChar];
            }, $item);
        } catch (\Exception $error) {
            // Callers test for null to turn an invalid escape into a positioned syntax error
            return null;
        }
    }

    /**
     * Creates a syntax error for the given issue and discards the buffered input.
     *
     * @param string $issue the unexpected text or token type
     * @param int    $line  line on which it occurred
     *
     * @return \Exception the error; it is returned, not thrown
     */
    private function syntaxError($issue, $line = 0)
    {
        $this->input = null;

        return new \Exception('Unexpected "'.$issue.'" on line '.$line.'.');
    }

    /**
     * Installs the default tokenizer.
     *
     * The tokenizer is held in a variable so that its functionality can be hot-swapped
     * when dealing with various serializations (the constructor does so for line mode).
     */
    private function initTokenize()
    {
        $this->_tokenize = function ($input, $finalize) {
            // Append the new chunk to whatever is still buffered
            if (!isset($this->input)) {
                $this->input = '';
            }
            $this->input .= $input;

            $tokens = [];
            $error = null;
            // Collect tokens until the end of the buffer; remember a reported error
            $this->input = $this->tokenizeToEnd(function ($e, $t) use (&$tokens, &$error) {
                if (isset($e)) {
                    $error = $e;
                }
                $tokens[] = $t;
            }, $finalize);

            if ($error) {
                throw $error;
            }

            return $tokens;
        };
    }

    // ## Public methods

    /**
     * Transforms (a chunk of) an N3 document into an array of tokens.
     *
     * @param string $input    text to append to the buffer and tokenize
     * @param bool   $finalize true when this is the last chunk; an `eof` token is then appended
     *
     * @return array list of tokens, each an array with keys `line`, `type`, `value`, `prefix`
     *
     * @throws \Exception on a syntax error
     */
    public function tokenize($input, $finalize = true)
    {
        return \call_user_func($this->_tokenize, $input, $finalize);
    }

    /**
     * Adds the data chunk to the buffer and parses as far as possible.
     *
     * @param string $input
     *
     * @return array tokens that could be completed so far
     *
     * @throws \Exception on a syntax error
     */
    public function tokenizeChunk($input)
    {
        return $this->tokenize($input, false);
    }

    /**
     * Signals the end of the input and parses the rest of the buffer.
     *
     * @return array the remaining tokens, ending with the `eof` token
     *
     * @throws \Exception on a syntax error
     */
    public function end()
    {
        // Finalise through the regular tokenizer so that a real callback is supplied
        // and any line-mode restriction still applies
        return $this->tokenize('', true);
    }
}