diff --git a/src/Parser.php b/src/Parser.php index ee22657..3a23be6 100644 --- a/src/Parser.php +++ b/src/Parser.php @@ -38,52 +38,35 @@ public function parse(Iterator $tokens) /** @var Token $token */ $token = $tokens->current(); while (!is_null($token)) { - switch (true) { - case $token->getType() == Token::T_ESCAPE: - $tokens->next(); - $token = $tokens->current(); - if (is_null($token)) { - throw new RuntimeException( - "Invalid CSV: The csv strings final character is an escape character" - ); - } - $value->addContent($token->getContent()); - break; - - case ($value->isInQuotes() && $token->getType() == Token::T_DOUBLE_QUOTE): - $value->addContent(substr($token->getContent(), 0, $token->getLength() / 2)); - break; - - case $token->getType() == Token::T_QUOTE: + switch ($token->getType()) { + case Token::T_QUOTE: $value->setInQuotes(!$value->isInQuotes()); break; - - case $value->isInQuotes(): + case Token::T_CONTENT: $value->addContent($token->getContent()); break; - - case ($value->isEmpty() - && !$value->isInQuotes() - && !$value->wasQuoted() - && $token->getType() == Token::T_NULL): - $value->addContent($token->getContent()); - $value->setIsNull(); + case Token::T_DOUBLE_QUOTE: + $value->addContent(substr($token->getContent(), 0, $token->getLength() / 2)); break; - - case (!$value->isInQuotes() && $token->getType() == Token::T_DELIMITER): + case Token::T_NULL: + if ($value->isEmpty() && !$value->isInQuotes() && !$value->wasQuoted()) { + $value->addContent($token->getContent()); + $value->setIsNull(); + } else { + $value->addContent($token->getContent()); + } + break; + case Token::T_DELIMITER: $row->append($value->getValue()); $value->reset(); break; - - case (!$value->isInQuotes() && $token->getType() == Token::T_NEW_LINE): + case Token::T_NEW_LINE: $row->append($value->getValue()); $value->reset(); yield $row; $row = new ArrayIterator(); break; - default: - $value->addContent($token->getContent()); break; } @@ -92,6 +75,9 @@ public function parse(Iterator $tokens) } if (!$value->isEmpty()) { + if ($value->isInQuotes()) { + throw new RuntimeException("Unmatched quote at the end of the csv data"); + } $row->append($value->getValue()); } diff --git a/src/Tokeniser/State.php b/src/Tokeniser/State.php new file mode 100644 index 0000000..821c17a --- /dev/null +++ b/src/Tokeniser/State.php @@ -0,0 +1,75 @@ +types = $types; + } + + /** + * @param int $token + * + * @return State|null + */ + public function getNextState($token) + { + foreach ($this->states as $mask => $state) { + if ($mask & $token) { + return $state; + } + } + + throw new RuntimeException("The supplied token: {$token} has no target state"); + } + + /** + * @param int $tokenMask + * @param State $target + */ + public function addStateTarget($tokenMask, State $target) + { + $this->states[$tokenMask] = $target; + } + + /** + * @param int $position + * @param string $buffer + * + * @return Token + */ + public function match($position, $buffer) + { + foreach ($this->types as $search => $tokenType) { + if (substr($buffer, 0, strlen($search)) == $search) { + return new Token($tokenType, $search, $position); + } + } + + return new Token(Token::T_CONTENT, $buffer[0], $position); + } +} diff --git a/src/Tokeniser/StateBuilder.php b/src/Tokeniser/StateBuilder.php new file mode 100644 index 0000000..d742697 --- /dev/null +++ b/src/Tokeniser/StateBuilder.php @@ -0,0 +1,40 @@ +addStateTarget(Token::T_ANY & ~Token::T_QUOTE & ~Token::T_ESCAPE, $any); + $any->addStateTarget(Token::T_QUOTE, $inQuote); + $any->addStateTarget(Token::T_ESCAPE, $inEscape); + + $inQuote->addStateTarget(Token::T_CONTENT | Token::T_DOUBLE_QUOTE, $inQuote); + $inQuote->addStateTarget(Token::T_QUOTE, $any); + $inQuote->addStateTarget(Token::T_ESCAPE, $inQuoteEscape); + + $inEscape->addStateTarget(Token::T_CONTENT, $any); + + $inQuoteEscape->addStateTarget(Token::T_CONTENT, $inQuote); + + return $any; + } +} diff --git a/src/Tokeniser/StreamTokeniser.php b/src/Tokeniser/StreamTokeniser.php index f22c6a6..12563e6 100644 --- a/src/Tokeniser/StreamTokeniser.php +++ b/src/Tokeniser/StreamTokeniser.php @@ -9,13 +9,14 @@ class StreamTokeniser implements TokeniserInterface { use TypeBuilder; + use StateBuilder; - /** @var int[] */ - private $types; /** @var int */ private $maxTypeLength; /** @var StreamInterface */ private $stream; + /** @var State */ + private $state; /** * Tokeniser constructor. @@ -25,13 +26,9 @@ class StreamTokeniser implements TokeniserInterface */ public function __construct(CsvConfigurationInterface $config, StreamInterface $stream) { - $this->types = $this->getTypes($config); - - // sort by reverse key length - uksort($this->types, function ($first, $second) { - return strlen($second) - strlen($first); - }); - $this->maxTypeLength = count($this->types) > 0 ? strlen(array_keys($this->types)[0]) : 1; + $types = $this->getTypes($config); + $this->state = $this->buildStates($types); + $this->maxTypeLength = count($types) > 0 ? strlen(array_keys($types)[0]) : 1; $this->stream = $stream; } @@ -51,7 +48,9 @@ public function getTokens() $last = null; while (strlen($buffer) > 0) { - $token = $this->match($position, $buffer); + $token = $this->state->match($position, $buffer); + $this->state = $this->state->getNextState($token->getType()); + $len = $token->getLength(); // merge tokens together to condense T_CONTENT tokens @@ -75,21 +74,4 @@ public function getTokens() $this->stream->close(); } - - /** - * @param int $position - * @param string $buffer - * - * @return Token - */ - private function match($position, $buffer) - { - foreach ($this->types as $search => $tokenType) { - if (substr($buffer, 0, strlen($search)) == $search) { - return new Token($tokenType, $search, $position); - } - } - - return new Token(Token::T_CONTENT, $buffer[0], $position); - } } diff --git a/src/Tokeniser/Token.php b/src/Tokeniser/Token.php index b4076fb..9722761 100644 --- a/src/Tokeniser/Token.php +++ b/src/Tokeniser/Token.php @@ -6,11 +6,12 @@ class Token { const T_CONTENT = 1; const T_DELIMITER = 2; - const T_NEW_LINE = 3; - const T_QUOTE = 4; - const T_NULL = 5; - const T_ESCAPE = 6; - const T_DOUBLE_QUOTE = 7; + const T_NEW_LINE = 4; + const T_QUOTE = 8; + const T_NULL = 16; + const T_ESCAPE = 32; + const T_DOUBLE_QUOTE = 128; + const T_ANY = 255; /** @var int */ private $type; diff --git a/src/Tokeniser/TypeBuilder.php b/src/Tokeniser/TypeBuilder.php index c971ee1..cf1a15f 100644 --- a/src/Tokeniser/TypeBuilder.php +++ b/src/Tokeniser/TypeBuilder.php @@ -9,7 +9,7 @@ trait TypeBuilder /** * @param CsvConfigurationInterface $config * - * @return int[] + * @return int[] Sorted in order of precedence */ protected function getTypes(CsvConfigurationInterface $config) { @@ -20,7 +20,7 @@ protected function getTypes(CsvConfigurationInterface $config) ]; if ($config->useDoubleQuotes()) { - $types[$config->getQuote() . $config->getQuote()] = Token::T_DOUBLE_QUOTE; + $types[str_repeat($config->getQuote(), 2)] = Token::T_DOUBLE_QUOTE; } $newLines = $config->getNewLine(); if (!is_array($newLines)) { @@ -33,6 +33,11 @@ protected function getTypes(CsvConfigurationInterface $config) $types[$config->getNullValue()] = Token::T_NULL; } + // sort by reverse key length + uksort($types, function ($first, $second) { + return strlen($second) - strlen($first); + }); + return $types; } } diff --git a/tests/integration/ParserTest.php b/tests/integration/ParserTest.php index e568423..644bf99 100644 --- a/tests/integration/ParserTest.php +++ b/tests/integration/ParserTest.php @@ -41,12 +41,20 @@ public function parseData() return [ [ new CsvConfiguration(), - '"some",\\N,"new' . "\n" . 'line",with\\' . "\n" . 'escaped,"in\\' . "\n" . 'quotes"', + '"some",\\N,"new' . "\n" . 'line",with\\' . "\n" . 'escaped,"in\\' . "\n" . 'quotes","\\\\"', [], [ - ['some', null, "new\nline", "with\nescaped", "in\nquotes"], + ['some', null, "new\nline", "with\nescaped", "in\nquotes", '\\'], ], ], + [ + new CsvConfiguration([ + CsvConfiguration::OPTION_DOUBLE_QUOTE => true, + ]), + '"end""","""start","""both""","",""""', + [], + [['end"', '"start', '"both"', '', '"']], + ], [ new CsvConfiguration([ CsvConfiguration::OPTION_DELIMITER => '|', @@ -87,7 +95,7 @@ public function parseData() 'text\\Nthing,\\Nstart,end\\N,\\N,"\\N"', [], [ - ['text\\Nthing', '\\Nstart', 'end\\N', null, '\\N'], + ['text\\Nthing', '\\Nstart', 'end\\N', null, 'N'], ], ], [ @@ -124,8 +132,8 @@ public function testParseExceptions($csv, $exception) public function parseExceptionsData() { return [ - ['"string",\\', RuntimeException::class], - ['"string"stuff,things', RuntimeException::class], + ['"string"stuff,things', RuntimeException::class], // extra text after a closing quote + ['"string', RuntimeException::class], // no closing quote ]; } } diff --git a/tests/unit/ParserTest.php b/tests/unit/ParserTest.php index 18f513e..63d101a 100644 --- a/tests/unit/ParserTest.php +++ b/tests/unit/ParserTest.php @@ -47,6 +47,7 @@ public function testParserIteratesSoNothingShouldHappenIfThereIsNoRequestForData { $parser = new Parser(); + /** @var Iterator $tokens */ $tokens = m::mock(Iterator::class); $output = $parser->parse($tokens); diff --git a/tests/unit/Tokeniser/StateTest.php b/tests/unit/Tokeniser/StateTest.php new file mode 100644 index 0000000..13388bb --- /dev/null +++ b/tests/unit/Tokeniser/StateTest.php @@ -0,0 +1,22 @@ +addStateTarget(Token::T_CONTENT, $state); + + static::assertSame($state, $state->getNextState(Token::T_CONTENT)); + + static::expectException(RuntimeException::class); + $state->getNextState(Token::T_ESCAPE); + } +} diff --git a/tests/unit/Tokeniser/StreamTokeniserTest.php b/tests/unit/Tokeniser/StreamTokeniserTest.php index b8ba81d..60248c5 100644 --- a/tests/unit/Tokeniser/StreamTokeniserTest.php +++ b/tests/unit/Tokeniser/StreamTokeniserTest.php @@ -87,9 +87,45 @@ public function tokeniserTestData() [Token::T_CONTENT, 'some'], ], ], + [ + new CsvConfiguration([ + CsvConfiguration::OPTION_DOUBLE_QUOTE => true, + ]), + '"end""","""start","""both""","","""",""""""""', + [ + [Token::T_QUOTE, '"'], + [Token::T_CONTENT, 'end'], + [Token::T_DOUBLE_QUOTE, '""'], + [Token::T_QUOTE, '"'], + [Token::T_DELIMITER, ','], + [Token::T_QUOTE, '"'], + [Token::T_DOUBLE_QUOTE, '""'], + [Token::T_CONTENT, 'start'], + [Token::T_QUOTE, '"'], + [Token::T_DELIMITER, ','], + [Token::T_QUOTE, '"'], + [Token::T_DOUBLE_QUOTE, '""'], + [Token::T_CONTENT, 'both'], + [Token::T_DOUBLE_QUOTE, '""'], + [Token::T_QUOTE, '"'], + [Token::T_DELIMITER, ','], + [Token::T_QUOTE, '"'], + [Token::T_QUOTE, '"'], + [Token::T_DELIMITER, ','], + [Token::T_QUOTE, '"'], + [Token::T_DOUBLE_QUOTE, '""'], + [Token::T_QUOTE, '"'], + [Token::T_DELIMITER, ','], + [Token::T_QUOTE, '"'], + [Token::T_DOUBLE_QUOTE, '""'], + [Token::T_DOUBLE_QUOTE, '""'], + [Token::T_DOUBLE_QUOTE, '""'], + [Token::T_QUOTE, '"'], + ], + ], [ new CsvConfiguration(), - '"some",test,"with \" escape"', + '"some",test,"with \" escape","\\\\"', [ [Token::T_QUOTE, '"'], [Token::T_CONTENT, 'some'], @@ -100,8 +136,12 @@ public function tokeniserTestData() [Token::T_QUOTE, '"'], [Token::T_CONTENT, 'with '], [Token::T_ESCAPE, '\\'], + [Token::T_CONTENT, '" escape'], [Token::T_QUOTE, '"'], - [Token::T_CONTENT, ' escape'], + [Token::T_DELIMITER, ','], + [Token::T_QUOTE, '"'], + [Token::T_ESCAPE, '\\'], + [Token::T_CONTENT, '\\'], [Token::T_QUOTE, '"'], ], ], @@ -124,8 +164,7 @@ public function tokeniserTestData() [Token::T_DELIMITER, '|'], [Token::T_QUOTE, "'"], [Token::T_ESCAPE, '\\'], - [Token::T_QUOTE, "'"], - [Token::T_CONTENT, "here"], + [Token::T_CONTENT, "'here"], [Token::T_QUOTE, "'"], [Token::T_DELIMITER, '|'], [Token::T_NULL, '\\N'], @@ -154,21 +193,17 @@ public function tokeniserTestData() [Token::T_QUOTE, '"'], [Token::T_DELIMITER, ','], [Token::T_QUOTE, '"'], - [Token::T_CONTENT, 'new'], - [Token::T_NEW_LINE, "\n"], - [Token::T_CONTENT, 'line'], + [Token::T_CONTENT, 'new' . "\n" . 'line'], [Token::T_QUOTE, '"'], [Token::T_DELIMITER, ','], [Token::T_CONTENT, 'with'], [Token::T_ESCAPE, '\\'], - [Token::T_NEW_LINE, "\n"], - [Token::T_CONTENT, 'escaped'], + [Token::T_CONTENT, "\n" . 'escaped'], [Token::T_DELIMITER, ','], [Token::T_QUOTE, '"'], [Token::T_CONTENT, 'in'], [Token::T_ESCAPE, '\\'], - [Token::T_NEW_LINE, "\n"], - [Token::T_CONTENT, 'quotes'], + [Token::T_CONTENT, "\n" . 'quotes'], [Token::T_QUOTE, '"'], ], ], @@ -188,6 +223,28 @@ public function tokeniserTestData() [Token::T_CONTENT, '한국말'], ], ], + [ + new CsvConfiguration([]), + 'text\\Nthing,\\Nstart,end\\N,\\N,"\\N"', + [ + [Token::T_CONTENT, 'text'], + [Token::T_NULL, '\\N'], + [Token::T_CONTENT, 'thing'], + [Token::T_DELIMITER, ','], + [Token::T_NULL, '\N'], + [Token::T_CONTENT, 'start'], + [Token::T_DELIMITER, ','], + [Token::T_CONTENT, 'end'], + [Token::T_NULL, '\\N'], + [Token::T_DELIMITER, ','], + [Token::T_NULL, '\\N'], + [Token::T_DELIMITER, ','], + [Token::T_QUOTE, '"'], + [Token::T_ESCAPE, '\\'], + [Token::T_CONTENT, 'N'], + [Token::T_QUOTE, '"'], + ], + ], ]; } }