diff --git a/packages/playground/data-liberation/src/block-markup/WP_HTML_To_Blocks.php b/packages/playground/data-liberation/src/block-markup/WP_HTML_To_Blocks.php
index 5e82314ca9..d4f2118e85 100644
--- a/packages/playground/data-liberation/src/block-markup/WP_HTML_To_Blocks.php
+++ b/packages/playground/data-liberation/src/block-markup/WP_HTML_To_Blocks.php
@@ -28,14 +28,15 @@ class WP_HTML_To_Blocks implements WP_Block_Markup_Converter {
 
 	private $state = self::STATE_READY;
 	private $block_stack = array();
-	private $html;
+	private $markup_processor;
 	private $ignore_text = false;
 	private $in_ephemeral_paragraph = false;
 	private $block_markup = '';
 	private $metadata = array();
+	private $last_error = null;
 
-	public function __construct( $html ) {
-		$this->html = new \WP_HTML_Processor( $html );
+	public function __construct( $markup_processor ) {
+		$this->markup_processor = $markup_processor;
 	}
 
 	public function convert() {
@@ -43,21 +44,28 @@ public function convert() {
 			return false;
 		}
 
-		while ( $this->html->next_token() ) {
-			switch ( $this->html->get_token_type() ) {
+		while ( $this->markup_processor->next_token() ) {
+			switch ( $this->markup_processor->get_token_type() ) {
 				case '#text':
 					if ( $this->ignore_text ) {
 						break;
 					}
-					$this->append_rich_text( htmlspecialchars( $this->html->get_modifiable_text() ) );
+					$this->append_rich_text( htmlspecialchars( $this->markup_processor->get_modifiable_text() ) );
 					break;
 				case '#tag':
 					$this->handle_tag();
 					break;
 			}
 		}
+
+		// Record the processor's error, if any, and abort the conversion.
+		if ( $this->markup_processor->get_last_error() ) {
+			$this->last_error = $this->markup_processor->get_last_error();
+			return false;
+		}
 
 		$this->close_ephemeral_paragraph();
+
 		return true;
 	}
 
@@ -77,8 +85,8 @@ public function get_block_markup() {
 	}
 
 	private function handle_tag() {
-		$html = $this->html;
-		$tag = $html->get_tag();
+		$html = $this->markup_processor;
+		$tag = strtoupper( $html->get_tag() );
 		$tag_lowercase = strtolower( $tag );
 		$is_tag_opener = ! $html->is_tag_closer();
 
@@ -304,7 +312,7 @@ private function should_preserve_tag_in_rich_text( $tag ) {
 	}
 
 	private function is_at_inline_code_element() {
-		$breadcrumbs = $this->html->get_breadcrumbs();
+		$breadcrumbs = $this->markup_processor->get_breadcrumbs();
 		foreach ( $breadcrumbs as $tag ) {
 			switch ( $tag ) {
 				case 'A':
@@ -392,4 +400,11 @@ private function close_ephemeral_paragraph() {
 			$this->in_ephemeral_paragraph = false;
 		}
 	}
+
+	/**
+	 * Returns the error recorded by convert(), or null when none was recorded.
+	 */
+	public function get_last_error() {
+		return $this->last_error;
+	}
 }
diff --git a/packages/playground/data-liberation/src/entity-readers/WP_EPub_Entity_Reader.php b/packages/playground/data-liberation/src/entity-readers/WP_EPub_Entity_Reader.php
index d4bfb35320..e3a6c2a06b 100644
--- a/packages/playground/data-liberation/src/entity-readers/WP_EPub_Entity_Reader.php
+++ b/packages/playground/data-liberation/src/entity-readers/WP_EPub_Entity_Reader.php
@@ -26,14 +26,17 @@ class WP_EPub_Entity_Reader extends WP_Entity_Reader {
 	protected $current_post_id;
 	protected $remaining_html_files;
 	protected $current_html_reader;
-
+	protected $last_error;
 	public function __construct( WP_Zip_Filesystem $zip, $first_post_id = 1 ) {
 		$this->zip = $zip;
 		$this->current_post_id = $first_post_id;
 	}
 
 	public function next_entity() {
-		// If we're finished, we're finished.
+		if ( $this->last_error ) {
+			return false;
+		}
+
 		if ( $this->finished ) {
 			return false;
 		}
@@ -92,16 +95,14 @@ public function next_entity() {
 
 			$html_file = array_shift( $this->remaining_html_files );
 			$html = $this->zip->read_file( $html_file );
-			/**
-			 * @TODO: Don't just assume that WP_HTML_Entity_Reader can
-			 *        handle an XHTML file. We might run into XML-specific
-			 *        subtleties that will derail the process.
-			 *        Let's consider using WP_XML_Processor instead.
- */ - $this->current_html_reader = new \WP_HTML_Entity_Reader( - $html, + $this->current_html_reader = new WP_HTML_Entity_Reader( + WP_XML_Processor::create_from_string( $html ), $this->current_post_id ); + if ( $this->current_html_reader->get_last_error() ) { + $this->last_error = $this->current_html_reader->get_last_error(); + return false; + } ++$this->current_post_id; } @@ -117,6 +118,6 @@ public function is_finished(): bool { } public function get_last_error(): ?string { - return null; + return $this->last_error; } } diff --git a/packages/playground/data-liberation/src/entity-readers/WP_HTML_Entity_Reader.php b/packages/playground/data-liberation/src/entity-readers/WP_HTML_Entity_Reader.php index b01bd0c875..67ceadad7d 100644 --- a/packages/playground/data-liberation/src/entity-readers/WP_HTML_Entity_Reader.php +++ b/packages/playground/data-liberation/src/entity-readers/WP_HTML_Entity_Reader.php @@ -1,7 +1,5 @@ html = $html; - $this->post_id = $post_id; + public function __construct( $html_processor, $post_id ) { + $this->html_processor = $html_processor; + $this->post_id = $post_id; } public function next_entity() { @@ -36,8 +35,9 @@ public function next_entity() { } // We did not read any entities yet. Let's convert the HTML document into entities. 
- $converter = new WP_HTML_To_Blocks( $this->html ); + $converter = new WP_HTML_To_Blocks( $this->html_processor ); if ( false === $converter->convert() ) { + $this->last_error = $converter->get_last_error(); return false; } @@ -90,6 +90,6 @@ public function is_finished(): bool { } public function get_last_error(): ?string { - return null; + return $this->last_error; } } diff --git a/packages/playground/data-liberation/src/xml-api/WP_XML_Processor.php b/packages/playground/data-liberation/src/xml-api/WP_XML_Processor.php index 881e689020..22e3039818 100644 --- a/packages/playground/data-liberation/src/xml-api/WP_XML_Processor.php +++ b/packages/playground/data-liberation/src/xml-api/WP_XML_Processor.php @@ -1558,7 +1558,6 @@ private function parse_next_tag() { * See https://www.w3.org/TR/xml11.xml/#sec-cdata-sect */ if ( - ! $this->is_closing_tag && $doc_length > $this->token_starts_at + 8 && '[' === $xml[ $this->token_starts_at + 2 ] && 'C' === $xml[ $this->token_starts_at + 3 ] && @@ -1583,6 +1582,59 @@ private function parse_next_tag() { return true; } + /* + * Identify DOCTYPE nodes. + * + * See https://www.w3.org/TR/xml11.html/#dtd + */ + if ( + $doc_length > $this->token_starts_at + 8 && + 'D' === $xml[ $at + 2 ] && + 'O' === $xml[ $at + 3 ] && + 'C' === $xml[ $at + 4 ] && + 'T' === $xml[ $at + 5 ] && + 'Y' === $xml[ $at + 6 ] && + 'P' === $xml[ $at + 7 ] && + 'E' === $xml[ $at + 8 ] + ) { + $at += 9; + // Skip whitespace. + $at += strspn( $this->xml, " \t\f\r\n", $at ); + + if ( $doc_length <= $at ) { + $this->mark_incomplete_input( 'Unclosed DOCTYPE declaration.' ); + + return false; + } + + // @TODO: Expose the "name" value instead of skipping it like that + $at += $this->parse_name( $at ); + + // Skip whitespace. + $at += strspn( $this->xml, " \t\f\r\n", $at ); + + if ( $doc_length <= $at ) { + $this->mark_incomplete_input( 'Unclosed DOCTYPE declaration.' 
); + return false; + } + + if ( $this->xml[ $at ] !== '>' ) { + $this->last_error = self::ERROR_SYNTAX; + _doing_it_wrong( + __METHOD__, + __( 'Unsupported DOCTYPE syntax. Only a simple is supported.' ), + 'WP_VERSION' + ); + return false; + } + + $closer_at = $at; + $this->parser_state = self::STATE_DOCTYPE_NODE; + $this->token_length = $closer_at + 1 - $this->token_starts_at; + $this->bytes_already_parsed = $closer_at + 1; + return true; + } + /* * Anything else here is either unsupported at this point or invalid * syntax. See the class-level @TODO annotations for more information. @@ -1592,6 +1644,7 @@ private function parse_next_tag() { return false; } + /* * An `) and false for empty elements () and tag closers (). + * + * This method exists to provide a consistent interface with WP_HTML_Processor. + * + * @return bool Whether the tag is expected to be closed. + */ + public function expects_closer() { + if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { + return false; + } + + return ! $this->is_empty_element() && ! $this->is_closing_tag; + } + /** * Indicates if the currently matched tag is an empty element tag. * @@ -2604,6 +2673,9 @@ public function get_token_name() { case self::STATE_CDATA_NODE: return '#cdata-section'; + case self::STATE_DOCTYPE_NODE: + return '#doctype'; + case self::STATE_XML_DECLARATION: return '#xml-declaration'; @@ -3030,10 +3102,11 @@ private function step_in_prolog( $node_to_process = self::PROCESS_NEXT_NODE ) { $this->last_error = self::ERROR_SYNTAX; _doing_it_wrong( __METHOD__, 'Unexpected token type in prolog stage.', 'WP_VERSION' ); } - return $this->step(); - case '#xml-declaration': + // @TODO: Fail if there's more than one or if was found before the XML declaration token. 
+ case '#doctype': case '#comment': + case '#xml-declaration': case '#processing-instructions': return true; case '#tag': @@ -3393,6 +3466,18 @@ private function mark_incomplete_input( */ const STATE_CDATA_NODE = 'STATE_CDATA_NODE'; + /** + * Parser DOCTYPE Node State. + * + * Indicates that the parser has found a DOCTYPE declaration and it's possible + * to read and modify its modifiable text. + * + * @since WP_VERSION + * + * @access private + */ + const STATE_DOCTYPE_NODE = 'STATE_DOCTYPE_NODE'; + /** * Indicates that the parser has found an XML processing instruction. * diff --git a/packages/playground/data-liberation/tests/WPEPubEntityReaderTests.php b/packages/playground/data-liberation/tests/WPEPubEntityReaderTests.php index 4a6c7a2324..f6a4c205f4 100644 --- a/packages/playground/data-liberation/tests/WPEPubEntityReaderTests.php +++ b/packages/playground/data-liberation/tests/WPEPubEntityReaderTests.php @@ -21,6 +21,7 @@ public function test_entity_reader( $reader ) { 'data' => $data, ]; } + $this->assertNull( $reader->get_last_error() ); $this->assertEquals( 3, count($entities) ); $this->assertEquals( 117, strlen($entities[0]['data']['content']) ); $this->assertGreaterThan( 1000, strlen($entities[1]['data']['content']) ); diff --git a/packages/playground/data-liberation/tests/WPHTMLEntityReaderTests.php b/packages/playground/data-liberation/tests/WPHTMLEntityReaderTests.php index be233599fa..f8d65c0357 100644 --- a/packages/playground/data-liberation/tests/WPHTMLEntityReaderTests.php +++ b/packages/playground/data-liberation/tests/WPHTMLEntityReaderTests.php @@ -13,7 +13,7 @@ public function test_entity_reader() {

It is our pleasure to announce that WordPress 6.8 was released

Last week, WordPress 6.8 was released.

HTML; - $reader = new WP_HTML_Entity_Reader( $html, 1 ); + $reader = new WP_HTML_Entity_Reader( new WP_HTML_Processor( $html ), 1 ); $entities = []; while ( $reader->next_entity() ) { $data = $reader->get_entity()->get_data(); diff --git a/packages/playground/data-liberation/tests/WPHTMLToBlocksTests.php b/packages/playground/data-liberation/tests/WPHTMLToBlocksTests.php index 66cae9670e..66a1a64306 100644 --- a/packages/playground/data-liberation/tests/WPHTMLToBlocksTests.php +++ b/packages/playground/data-liberation/tests/WPHTMLToBlocksTests.php @@ -16,7 +16,7 @@ public function test_metadata_extraction() {

WordPress 6.8 was released

Last week, WordPress 6.8 was released. This release includes a new default theme, a new block editor experience, and a new block library. It also includes a new block editor experience, and a new block library.

HTML; - $converter = new WP_HTML_To_Blocks( $html ); + $converter = new WP_HTML_To_Blocks( new WP_HTML_Processor( $html ) ); $converter->convert( $html ); $metadata = $converter->get_all_metadata(); $expected_metadata = [ @@ -35,7 +35,7 @@ public function test_metadata_extraction() { * @dataProvider provider_test_conversion */ public function test_html_to_blocks_conversion( $html, $expected ) { - $converter = new WP_HTML_To_Blocks( $html ); + $converter = new WP_HTML_To_Blocks( new WP_HTML_Processor( $html ) ); $converter->convert( $html ); $blocks = $converter->get_block_markup(); @@ -136,7 +136,7 @@ public function provider_test_conversion() { public function test_html_to_blocks_excerpt() { $input = file_get_contents( __DIR__ . '/fixtures/html-to-blocks/excerpt.input.html' ); - $converter = new WP_HTML_To_Blocks( $input ); + $converter = new WP_HTML_To_Blocks( new WP_HTML_Processor( $input ) ); $converter->convert( $input ); $blocks = $converter->get_block_markup(); diff --git a/packages/playground/data-liberation/tests/WPXMLProcessorTests.php b/packages/playground/data-liberation/tests/WPXMLProcessorTests.php index 2c3646dada..0e1dbf1ec4 100644 --- a/packages/playground/data-liberation/tests/WPXMLProcessorTests.php +++ b/packages/playground/data-liberation/tests/WPXMLProcessorTests.php @@ -1749,4 +1749,46 @@ public function test_pause_and_resume() { $this->assertEquals( 'Hello there', $resumed->get_modifiable_text(), 'Did not find the expected text.' 
); } -} \ No newline at end of file + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::next_token + */ + public function test_doctype_parsing() { + $processor = WP_XML_Processor::create_from_string( + 'Content' + ); + + $this->assertTrue( $processor->next_token(), 'Did not find DOCTYPE node' ); + $this->assertEquals( '#doctype', $processor->get_token_type(), 'Did not find DOCTYPE node' ); + $this->assertTrue( $processor->next_token(), 'Did not find root tag' ); + $this->assertEquals( 'root', $processor->get_tag(), 'Did not find root tag' ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::next_token + */ + public function test_unsupported_doctype_parsing() { + $processor = WP_XML_Processor::create_from_string( + 'Content' + ); + + $this->assertFalse( $processor->next_token(), 'Did not reject complex DOCTYPE' ); + $this->assertEquals( 'syntax', $processor->get_last_error(), 'Did not set syntax error' ); + } + + public function test_doctype_in_tag_content_is_syntax_error() { + $processor = WP_XML_Processor::create_from_string( + 'Content' + ); + + $processor->next_token(); + $processor->next_token(); + + $this->assertFalse( $processor->next_token(), 'Did not reject DOCTYPE in tag content' ); + $this->assertEquals( 'syntax', $processor->get_last_error(), 'Did not set syntax error' ); + } + +}