Skip to content

Commit

Permalink
Use WP_XML_Reader for EPubs, support simple DOCTYPE declarations in XML
Browse files Browse the repository at this point in the history
  • Loading branch information
adamziel committed Dec 17, 2024
1 parent e01dec8 commit d293c22
Show file tree
Hide file tree
Showing 8 changed files with 178 additions and 36 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -28,36 +28,45 @@ class WP_HTML_To_Blocks implements WP_Block_Markup_Converter {

private $state = self::STATE_READY;
private $block_stack = array();
private $html;
private $markup_processor;
private $ignore_text = false;
private $in_ephemeral_paragraph = false;
private $block_markup = '';
private $metadata = array();
private $last_error = null;

/**
 * Stores the markup processor that convert() will read tokens from.
 *
 * The processor is injected rather than built here so the same converter
 * can consume different token-based processors.
 *
 * @param object $markup_processor Processor instance. This class calls
 *                                 next_token(), get_token_type(),
 *                                 get_modifiable_text(), get_tag(),
 *                                 is_tag_closer(), get_breadcrumbs() and
 *                                 get_last_error() on it.
 */
public function __construct( $markup_processor ) {
	$this->markup_processor = $markup_processor;
}

/**
 * Converts the markup exposed by the injected processor into block markup.
 *
 * Iterates over every token reported by the markup processor: text nodes
 * are escaped with htmlspecialchars() and appended as rich text (unless
 * text is currently being ignored), and tags are delegated to handle_tag().
 * Once the processor runs out of tokens, any error it reported is recorded
 * and the conversion is aborted; otherwise the trailing ephemeral paragraph
 * is closed.
 *
 * @return bool True on success. False when the converter is not in the
 *              ready state, or when the markup processor reported an error
 *              (retrievable via get_last_error()).
 */
public function convert() {
	if ( self::STATE_READY !== $this->state ) {
		return false;
	}

	while ( $this->markup_processor->next_token() ) {
		switch ( $this->markup_processor->get_token_type() ) {
			case '#text':
				if ( $this->ignore_text ) {
					break;
				}
				$this->append_rich_text( htmlspecialchars( $this->markup_processor->get_modifiable_text() ) );
				break;
			case '#tag':
				$this->handle_tag();
				break;
		}
	}

	// A processor error means the token stream ended prematurely, so the
	// markup collected so far must not be reported as a successful result.
	$processor_error = $this->markup_processor->get_last_error();
	if ( $processor_error ) {
		$this->last_error = $processor_error;
		return false;
	}

	$this->close_ephemeral_paragraph();

	return true;
}

Expand All @@ -77,8 +86,8 @@ public function get_block_markup() {
}

private function handle_tag() {
$html = $this->html;
$tag = $html->get_tag();
$html = $this->markup_processor;
$tag = strtoupper( $html->get_tag() );
$tag_lowercase = strtolower( $tag );

$is_tag_opener = ! $html->is_tag_closer();
Expand Down Expand Up @@ -304,7 +313,7 @@ private function should_preserve_tag_in_rich_text( $tag ) {
}

private function is_at_inline_code_element() {
$breadcrumbs = $this->html->get_breadcrumbs();
$breadcrumbs = $this->markup_processor->get_breadcrumbs();
foreach ( $breadcrumbs as $tag ) {
switch ( $tag ) {
case 'A':
Expand Down Expand Up @@ -392,4 +401,8 @@ private function close_ephemeral_paragraph() {
$this->in_ephemeral_paragraph = false;
}
}

/**
 * Returns the error recorded by convert(), if any.
 *
 * @return mixed The value the markup processor reported through its own
 *               get_last_error(), or null when no error was recorded.
 */
public function get_last_error() {
	return $this->last_error;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,17 @@ class WP_EPub_Entity_Reader extends WP_Entity_Reader {
protected $current_post_id;
protected $remaining_html_files;
protected $current_html_reader;

protected $last_error;
/**
 * Sets up the reader with the archive to read and the first post ID to use.
 *
 * @param WP_Zip_Filesystem $zip           Zip filesystem the EPub files are read from.
 * @param int               $first_post_id Post ID handed to the first HTML reader;
 *                                         incremented for each subsequent file.
 */
public function __construct( WP_Zip_Filesystem $zip, $first_post_id = 1 ) {
	$this->current_post_id = $first_post_id;
	$this->zip             = $zip;
}

public function next_entity() {
// If we're finished, we're finished.
if ( $this->last_error ) {
return false;
}

if ( $this->finished ) {
return false;
}
Expand Down Expand Up @@ -92,16 +95,14 @@ public function next_entity() {

$html_file = array_shift( $this->remaining_html_files );
$html = $this->zip->read_file( $html_file );
/**
* @TODO: Don't just assume that WP_HTML_Entity_Reader can
* handle an XHTML file. We might run into XML-specific
* subtleties that will derail the process.
* Let's consider using WP_XML_Processor instead.
*/
$this->current_html_reader = new \WP_HTML_Entity_Reader(
$html,
$this->current_html_reader = new WP_HTML_Entity_Reader(
WP_XML_Processor::create_from_string( $html ),
$this->current_post_id
);
if ( $this->current_html_reader->get_last_error() ) {
$this->last_error = $this->current_html_reader->get_last_error();
return false;
}
++$this->current_post_id;
}

Expand All @@ -117,6 +118,6 @@ public function is_finished(): bool {
}

/**
 * Returns the error that made this reader stop, if any.
 *
 * next_entity() refuses to continue once this value is set.
 *
 * @return string|null The recorded error, or null when none occurred.
 */
public function get_last_error(): ?string {
	return $this->last_error;
}
}
Original file line number Diff line number Diff line change
@@ -1,22 +1,21 @@
<?php

use WordPress\Data_Liberation\Block_Markup\WP_HTML_To_Blocks;

/**
* Converts a single HTML file into a stream of WordPress entities.
*
* @TODO: Support post meta.
*/
class WP_HTML_Entity_Reader extends WP_Entity_Reader {

protected $html;
protected $html_processor;
protected $entities;
protected $finished = false;
protected $post_id;
protected $last_error;

/**
 * Stores the markup processor and the post ID for later conversion.
 *
 * @param object $html_processor Markup processor instance that next_entity()
 *                               hands to WP_HTML_To_Blocks.
 * @param mixed  $post_id        Post ID stored for this document.
 */
public function __construct( $html_processor, $post_id ) {
	$this->html_processor = $html_processor;
	$this->post_id        = $post_id;
}

public function next_entity() {
Expand All @@ -36,8 +35,9 @@ public function next_entity() {
}

// We did not read any entities yet. Let's convert the HTML document into entities.
$converter = new WP_HTML_To_Blocks( $this->html );
$converter = new WP_HTML_To_Blocks( $this->html_processor );
if ( false === $converter->convert() ) {
$this->last_error = $converter->get_last_error();
return false;
}

Expand Down Expand Up @@ -90,6 +90,6 @@ public function is_finished(): bool {
}

/**
 * Returns the error reported by the block converter, if any.
 *
 * The value is copied from WP_HTML_To_Blocks::get_last_error() when
 * the conversion in next_entity() fails.
 *
 * @return string|null The recorded error, or null when none occurred.
 */
public function get_last_error(): ?string {
	return $this->last_error;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -1558,7 +1558,6 @@ private function parse_next_tag() {
* See https://www.w3.org/TR/xml11.xml/#sec-cdata-sect
*/
if (
! $this->is_closing_tag &&
$doc_length > $this->token_starts_at + 8 &&
'[' === $xml[ $this->token_starts_at + 2 ] &&
'C' === $xml[ $this->token_starts_at + 3 ] &&
Expand All @@ -1583,6 +1582,59 @@ private function parse_next_tag() {
return true;
}

/*
* Identify DOCTYPE nodes.
*
* See https://www.w3.org/TR/xml11.html/#dtd
*/
if (
$doc_length > $this->token_starts_at + 8 &&
'D' === $xml[ $at + 2 ] &&
'O' === $xml[ $at + 3 ] &&
'C' === $xml[ $at + 4 ] &&
'T' === $xml[ $at + 5 ] &&
'Y' === $xml[ $at + 6 ] &&
'P' === $xml[ $at + 7 ] &&
'E' === $xml[ $at + 8 ]
) {
$at += 9;
// Skip whitespace.
$at += strspn( $this->xml, " \t\f\r\n", $at );

if ( $doc_length <= $at ) {
$this->mark_incomplete_input( 'Unclosed DOCTYPE declaration.' );

return false;
}

// @TODO: Expose the "name" value instead of skipping it like that
$at += $this->parse_name( $at );

// Skip whitespace.
$at += strspn( $this->xml, " \t\f\r\n", $at );

if ( $doc_length <= $at ) {
$this->mark_incomplete_input( 'Unclosed DOCTYPE declaration.' );
return false;
}

if ( $this->xml[ $at ] !== '>' ) {
$this->last_error = self::ERROR_SYNTAX;
_doing_it_wrong(
__METHOD__,
__( 'Unsupported DOCTYPE syntax. Only a simple <!DOCTYPE name> is supported.' ),
'WP_VERSION'
);
return false;
}

$closer_at = $at;
$this->parser_state = self::STATE_DOCTYPE_NODE;
$this->token_length = $closer_at + 1 - $this->token_starts_at;
$this->bytes_already_parsed = $closer_at + 1;
return true;
}

/*
* Anything else here is either unsupported at this point or invalid
* syntax. See the class-level @TODO annotations for more information.
Expand All @@ -1592,6 +1644,7 @@ private function parse_next_tag() {
return false;
}


/*
* An `<?xml` token at the beginning of the document marks a start of an
* xml declaration.
Expand Down Expand Up @@ -2471,6 +2524,22 @@ public function get_tag() {
return null;
}

/**
 * Tells whether the currently matched tag still needs a matching closer.
 *
 * Only a plain tag opener (<div>) yields true. Empty elements (<img />),
 * tag closers (</div>), and any parser state other than a matched tag
 * yield false.
 *
 * Provided so this class offers the same interface as WP_HTML_Processor.
 *
 * @return bool Whether the tag is expected to be closed.
 */
public function expects_closer() {
	$is_at_tag = self::STATE_MATCHED_TAG === $this->parser_state;

	return $is_at_tag && ! ( $this->is_empty_element() || $this->is_closing_tag );
}

/**
* Indicates if the currently matched tag is an empty element tag.
*
Expand Down Expand Up @@ -2604,6 +2673,9 @@ public function get_token_name() {
case self::STATE_CDATA_NODE:
return '#cdata-section';

case self::STATE_DOCTYPE_NODE:
return '#doctype';

case self::STATE_XML_DECLARATION:
return '#xml-declaration';

Expand Down Expand Up @@ -3030,10 +3102,11 @@ private function step_in_prolog( $node_to_process = self::PROCESS_NEXT_NODE ) {
$this->last_error = self::ERROR_SYNTAX;
_doing_it_wrong( __METHOD__, 'Unexpected token type in prolog stage.', 'WP_VERSION' );
}

return $this->step();
case '#xml-declaration':
// @TODO: Fail if there's more than one <!DOCTYPE> or if <!DOCTYPE> was found before the XML declaration token.
case '#doctype':
case '#comment':
case '#xml-declaration':
case '#processing-instructions':
return true;
case '#tag':
Expand Down Expand Up @@ -3393,6 +3466,18 @@ private function mark_incomplete_input(
*/
const STATE_CDATA_NODE = 'STATE_CDATA_NODE';

/**
* Parser DOCTYPE Node State.
*
* Indicates that the parser has found a DOCTYPE declaration and it's possible
* to read and modify its modifiable text.
*
* @since WP_VERSION
*
* @access private
*/
const STATE_DOCTYPE_NODE = 'STATE_DOCTYPE_NODE';

/**
* Indicates that the parser has found an XML processing instruction.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ public function test_entity_reader( $reader ) {
'data' => $data,
];
}
$this->assertNull( $reader->get_last_error() );
$this->assertEquals( 3, count($entities) );
$this->assertEquals( 117, strlen($entities[0]['data']['content']) );
$this->assertGreaterThan( 1000, strlen($entities[1]['data']['content']) );
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ public function test_entity_reader() {
<h1>It is our pleasure to announce that WordPress 6.8 was released</h1>
<p>Last week, WordPress 6.8 was released.</p>
HTML;
$reader = new WP_HTML_Entity_Reader( $html, 1 );
$reader = new WP_HTML_Entity_Reader( new WP_HTML_Processor( $html ), 1 );
$entities = [];
while ( $reader->next_entity() ) {
$data = $reader->get_entity()->get_data();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ public function test_metadata_extraction() {
<h1>WordPress 6.8 was released</h1>
<p>Last week, WordPress 6.8 was released. This release includes a new default theme, a new block editor experience, and a new block library. It also includes a new block editor experience, and a new block library.</p>
HTML;
$converter = new WP_HTML_To_Blocks( $html );
$converter = new WP_HTML_To_Blocks( new WP_HTML_Processor( $html ) );
$converter->convert( $html );
$metadata = $converter->get_all_metadata();
$expected_metadata = [
Expand All @@ -35,7 +35,7 @@ public function test_metadata_extraction() {
* @dataProvider provider_test_conversion
*/
public function test_html_to_blocks_conversion( $html, $expected ) {
$converter = new WP_HTML_To_Blocks( $html );
$converter = new WP_HTML_To_Blocks( new WP_HTML_Processor( $html ) );
$converter->convert( $html );
$blocks = $converter->get_block_markup();

Expand Down Expand Up @@ -136,7 +136,7 @@ public function provider_test_conversion() {

public function test_html_to_blocks_excerpt() {
$input = file_get_contents( __DIR__ . '/fixtures/html-to-blocks/excerpt.input.html' );
$converter = new WP_HTML_To_Blocks( $input );
$converter = new WP_HTML_To_Blocks( new WP_HTML_Processor( $input ) );
$converter->convert( $input );
$blocks = $converter->get_block_markup();

Expand Down
Loading

0 comments on commit d293c22

Please sign in to comment.