Skip to content

Commit

Permalink
Merge pull request #2 from graze/tokeniser-states
Browse files Browse the repository at this point in the history
Tokeniser states
  • Loading branch information
Harry Bragg committed May 18, 2016
2 parents b6cc577 + b14f4d3 commit 0339346
Show file tree
Hide file tree
Showing 10 changed files with 259 additions and 82 deletions.
50 changes: 18 additions & 32 deletions src/Parser.php
Original file line number Diff line number Diff line change
Expand Up @@ -38,52 +38,35 @@ public function parse(Iterator $tokens)
/** @var Token $token */
$token = $tokens->current();
while (!is_null($token)) {
switch (true) {
case $token->getType() == Token::T_ESCAPE:
$tokens->next();
$token = $tokens->current();
if (is_null($token)) {
throw new RuntimeException(
"Invalid CSV: The csv strings final character is an escape character"
);
}
$value->addContent($token->getContent());
break;

case ($value->isInQuotes() && $token->getType() == Token::T_DOUBLE_QUOTE):
$value->addContent(substr($token->getContent(), 0, $token->getLength() / 2));
break;

case $token->getType() == Token::T_QUOTE:
switch ($token->getType()) {
case Token::T_QUOTE:
$value->setInQuotes(!$value->isInQuotes());
break;

case $value->isInQuotes():
case Token::T_CONTENT:
$value->addContent($token->getContent());
break;

case ($value->isEmpty()
&& !$value->isInQuotes()
&& !$value->wasQuoted()
&& $token->getType() == Token::T_NULL):
$value->addContent($token->getContent());
$value->setIsNull();
case Token::T_DOUBLE_QUOTE:
$value->addContent(substr($token->getContent(), 0, $token->getLength() / 2));
break;

case (!$value->isInQuotes() && $token->getType() == Token::T_DELIMITER):
case Token::T_NULL:
if ($value->isEmpty() && !$value->isInQuotes() && !$value->wasQuoted()) {
$value->addContent($token->getContent());
$value->setIsNull();
} else {
$value->addContent($token->getContent());
}
break;
case Token::T_DELIMITER:
$row->append($value->getValue());
$value->reset();
break;

case (!$value->isInQuotes() && $token->getType() == Token::T_NEW_LINE):
case Token::T_NEW_LINE:
$row->append($value->getValue());
$value->reset();
yield $row;
$row = new ArrayIterator();
break;

default:
$value->addContent($token->getContent());
break;
}

Expand All @@ -92,6 +75,9 @@ public function parse(Iterator $tokens)
}

if (!$value->isEmpty()) {
if ($value->isInQuotes()) {
throw new RuntimeException("Unmatched quote at the end of the csv data");
}
$row->append($value->getValue());
}

Expand Down
75 changes: 75 additions & 0 deletions src/Tokeniser/State.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
<?php

namespace Graze\CsvToken\Tokeniser;

use RuntimeException;

/**
 * A single state of the tokeniser's state machine.
 *
 * A state knows which search strings it may recognise (each mapped to a token type) and,
 * for every token type it can produce, which state the tokeniser should move to next.
 */
class State
{
    const S_ANY             = 0;
    const S_IN_QUOTE        = 1;
    const S_IN_ESCAPE       = 2;
    const S_IN_QUOTE_ESCAPE = 4;

    // Bitmasks of Token::T_* types that may be recognised while in the corresponding state.
    const S_ANY_TOKENS             = Token::T_ANY & ~Token::T_DOUBLE_QUOTE;
    const S_IN_QUOTE_TOKENS        = Token::T_CONTENT | Token::T_QUOTE | Token::T_DOUBLE_QUOTE | Token::T_ESCAPE;
    const S_IN_ESCAPE_TOKENS       = Token::T_CONTENT;
    const S_IN_QUOTE_ESCAPE_TOKENS = Token::T_CONTENT;

    /** @var array Search string => token type, iterated in the order supplied */
    private $types;
    /**
     * Token mask => state to move to when a token matching that mask is produced.
     *
     * Starts as an empty array so getNextState() can be called safely before any
     * target has been registered.
     *
     * @var State[]
     */
    private $states = [];

    /**
     * State constructor.
     *
     * @param array $types Search string => token type; entries are tried in array order, so
     *                     the caller decides precedence by how it orders them
     */
    public function __construct(array $types)
    {
        $this->types = $types;
    }

    /**
     * Find the state that follows a token of the supplied type.
     *
     * Registered masks are checked in the order they were added; the first mask sharing
     * a bit with $token wins.
     *
     * @param int $token Token type (one of the Token::T_* bit values)
     *
     * @return State
     *
     * @throws RuntimeException When no registered mask matches the token type
     */
    public function getNextState($token)
    {
        foreach ($this->states as $mask => $state) {
            if ($mask & $token) {
                return $state;
            }
        }

        throw new RuntimeException("The supplied token: {$token} has no target state");
    }

    /**
     * Register the state to move to after any token type contained in $tokenMask.
     *
     * Registering the same mask twice replaces the earlier target.
     *
     * @param int   $tokenMask Bitmask of Token::T_* types
     * @param State $target
     */
    public function addStateTarget($tokenMask, State $target)
    {
        $this->states[$tokenMask] = $target;
    }

    /**
     * Produce the token found at the start of the buffer.
     *
     * Each known search string is compared against the start of the buffer; the first
     * one that matches yields a token of its type. When none match, the first byte of
     * the buffer is returned as a T_CONTENT token, so $buffer is expected to be non-empty.
     *
     * @param int    $position Position passed through to the created token
     * @param string $buffer   Text to match against
     *
     * @return Token
     */
    public function match($position, $buffer)
    {
        foreach ($this->types as $search => $tokenType) {
            // PHP converts integer-like array keys ("1", "42") to ints; restore the string
            // form so the comparison below and the token content are always strings.
            $search = (string) $search;

            // Strict comparison: a loose == treats numeric-looking strings such as
            // "0e1" and "0e2" as equal, which would produce false matches.
            if (substr($buffer, 0, strlen($search)) === $search) {
                return new Token($tokenType, $search, $position);
            }
        }

        return new Token(Token::T_CONTENT, $buffer[0], $position);
    }
}
40 changes: 40 additions & 0 deletions src/Tokeniser/StateBuilder.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
<?php

namespace Graze\CsvToken\Tokeniser;

/**
 * Assembles the State objects used by a tokeniser and wires up the transitions between them.
 */
trait StateBuilder
{
    /**
     * Create the four tokeniser states and link them together.
     *
     * Each state only receives the entries of $types whose token type is part of that
     * state's State::S_*_TOKENS mask; key order of $types is preserved.
     *
     * @param array $types Search string => token type
     *
     * @return State The default starting state
     */
    public function buildStates(array $types)
    {
        // Keep only the entries whose type shares a bit with the mask (keys and order retained)
        $typesFor = function ($mask) use ($types) {
            $allowed = [];
            foreach ($types as $search => $type) {
                if ($type & $mask) {
                    $allowed[$search] = $type;
                }
            }

            return $allowed;
        };

        $default       = new State($typesFor(State::S_ANY_TOKENS));
        $quoted        = new State($typesFor(State::S_IN_QUOTE_TOKENS));
        $escaped       = new State($typesFor(State::S_IN_ESCAPE_TOKENS));
        $quotedEscaped = new State($typesFor(State::S_IN_QUOTE_ESCAPE_TOKENS));

        // Default state: a quote or an escape changes state, every other token stays put
        $stayInDefault = Token::T_ANY & ~Token::T_QUOTE & ~Token::T_ESCAPE;
        $default->addStateTarget($stayInDefault, $default);
        $default->addStateTarget(Token::T_QUOTE, $quoted);
        $default->addStateTarget(Token::T_ESCAPE, $escaped);

        // Quoted state: a quote returns to the default state, an escape moves to the
        // quoted-escape state, content and double quotes stay put
        $stayInQuoted = Token::T_CONTENT | Token::T_DOUBLE_QUOTE;
        $quoted->addStateTarget($stayInQuoted, $quoted);
        $quoted->addStateTarget(Token::T_QUOTE, $default);
        $quoted->addStateTarget(Token::T_ESCAPE, $quotedEscaped);

        // Escape states: after one content token, go back to the state the escape came from
        $escaped->addStateTarget(Token::T_CONTENT, $default);
        $quotedEscaped->addStateTarget(Token::T_CONTENT, $quoted);

        return $default;
    }
}
36 changes: 9 additions & 27 deletions src/Tokeniser/StreamTokeniser.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,14 @@
class StreamTokeniser implements TokeniserInterface
{
use TypeBuilder;
use StateBuilder;

/** @var int[] */
private $types;
/** @var int */
private $maxTypeLength;
/** @var StreamInterface */
private $stream;
/** @var State */
private $state;

/**
* Tokeniser constructor.
Expand All @@ -25,13 +26,9 @@ class StreamTokeniser implements TokeniserInterface
*/
public function __construct(CsvConfigurationInterface $config, StreamInterface $stream)
{
$this->types = $this->getTypes($config);

// sort by reverse key length
uksort($this->types, function ($first, $second) {
return strlen($second) - strlen($first);
});
$this->maxTypeLength = count($this->types) > 0 ? strlen(array_keys($this->types)[0]) : 1;
$types = $this->getTypes($config);
$this->state = $this->buildStates($types);
$this->maxTypeLength = count($types) > 0 ? strlen(array_keys($types)[0]) : 1;
$this->stream = $stream;
}

Expand All @@ -51,7 +48,9 @@ public function getTokens()
$last = null;

while (strlen($buffer) > 0) {
$token = $this->match($position, $buffer);
$token = $this->state->match($position, $buffer);
$this->state = $this->state->getNextState($token->getType());

$len = $token->getLength();

// merge tokens together to condense T_CONTENT tokens
Expand All @@ -75,21 +74,4 @@ public function getTokens()

$this->stream->close();
}

/**
* @param int $position
* @param string $buffer
*
* @return Token
*/
private function match($position, $buffer)
{
foreach ($this->types as $search => $tokenType) {
if (substr($buffer, 0, strlen($search)) == $search) {
return new Token($tokenType, $search, $position);
}
}

return new Token(Token::T_CONTENT, $buffer[0], $position);
}
}
11 changes: 6 additions & 5 deletions src/Tokeniser/Token.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@ class Token
{
const T_CONTENT = 1;
const T_DELIMITER = 2;
const T_NEW_LINE = 3;
const T_QUOTE = 4;
const T_NULL = 5;
const T_ESCAPE = 6;
const T_DOUBLE_QUOTE = 7;
const T_NEW_LINE = 4;
const T_QUOTE = 8;
const T_NULL = 16;
const T_ESCAPE = 32;
const T_DOUBLE_QUOTE = 128;
const T_ANY = 255;

/** @var int */
private $type;
Expand Down
9 changes: 7 additions & 2 deletions src/Tokeniser/TypeBuilder.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ trait TypeBuilder
/**
* @param CsvConfigurationInterface $config
*
* @return int[]
* @return int[] Sorted in order of precedence
*/
protected function getTypes(CsvConfigurationInterface $config)
{
Expand All @@ -20,7 +20,7 @@ protected function getTypes(CsvConfigurationInterface $config)
];

if ($config->useDoubleQuotes()) {
$types[$config->getQuote() . $config->getQuote()] = Token::T_DOUBLE_QUOTE;
$types[str_repeat($config->getQuote(), 2)] = Token::T_DOUBLE_QUOTE;
}
$newLines = $config->getNewLine();
if (!is_array($newLines)) {
Expand All @@ -33,6 +33,11 @@ protected function getTypes(CsvConfigurationInterface $config)
$types[$config->getNullValue()] = Token::T_NULL;
}

// sort by reverse key length
uksort($types, function ($first, $second) {
return strlen($second) - strlen($first);
});

return $types;
}
}
18 changes: 13 additions & 5 deletions tests/integration/ParserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,20 @@ public function parseData()
return [
[
new CsvConfiguration(),
'"some",\\N,"new' . "\n" . 'line",with\\' . "\n" . 'escaped,"in\\' . "\n" . 'quotes"',
'"some",\\N,"new' . "\n" . 'line",with\\' . "\n" . 'escaped,"in\\' . "\n" . 'quotes","\\\\"',
[],
[
['some', null, "new\nline", "with\nescaped", "in\nquotes"],
['some', null, "new\nline", "with\nescaped", "in\nquotes", '\\'],
],
],
[
new CsvConfiguration([
CsvConfiguration::OPTION_DOUBLE_QUOTE => true,
]),
'"end""","""start","""both""","",""""',
[],
[['end"', '"start', '"both"', '', '"']],
],
[
new CsvConfiguration([
CsvConfiguration::OPTION_DELIMITER => '|',
Expand Down Expand Up @@ -87,7 +95,7 @@ public function parseData()
'text\\Nthing,\\Nstart,end\\N,\\N,"\\N"',
[],
[
['text\\Nthing', '\\Nstart', 'end\\N', null, '\\N'],
['text\\Nthing', '\\Nstart', 'end\\N', null, 'N'],
],
],
[
Expand Down Expand Up @@ -124,8 +132,8 @@ public function testParseExceptions($csv, $exception)
public function parseExceptionsData()
{
return [
['"string",\\', RuntimeException::class],
['"string"stuff,things', RuntimeException::class],
['"string"stuff,things', RuntimeException::class], // extra text after a closing quote
['"string', RuntimeException::class], // no closing quote
];
}
}
1 change: 1 addition & 0 deletions tests/unit/ParserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ public function testParserIteratesSoNothingShouldHappenIfThereIsNoRequestForData
{
$parser = new Parser();

/** @var Iterator $tokens */
$tokens = m::mock(Iterator::class);

$output = $parser->parse($tokens);
Expand Down
22 changes: 22 additions & 0 deletions tests/unit/Tokeniser/StateTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<?php

namespace Graze\CsvToken\Test\Unit\Tokeniser;

use Graze\CsvToken\Test\TestCase;
use Graze\CsvToken\Tokeniser\State;
use Graze\CsvToken\Tokeniser\Token;
use RuntimeException;

class StateTest extends TestCase
{
    /**
     * A state returns the target registered for a matching token type and throws a
     * RuntimeException for a token type that has no registered target.
     */
    public function testCallGetNextStateWithAnInvalidTokenWillThrowAnException()
    {
        $contentOnly = new State([]);
        $contentOnly->addStateTarget(Token::T_CONTENT, $contentOnly);

        // A registered token type resolves to its target (here the state itself)
        $next = $contentOnly->getNextState(Token::T_CONTENT);
        $this->assertSame($contentOnly, $next);

        // The expectation must be declared before the call that throws
        $this->expectException(RuntimeException::class);
        $contentOnly->getNextState(Token::T_ESCAPE);
    }
}
Loading

0 comments on commit 0339346

Please sign in to comment.