From 58def6c915eb50c7e45df21674e530156767e250 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 17 Dec 2024 16:02:56 +0100 Subject: [PATCH] Parse EPubs as XHTML --- .../src/block-markup/WP_HTML_To_Blocks.php | 8 +++---- .../entity-readers/WP_EPub_Entity_Reader.php | 4 ++-- .../src/xml-api/WP_XML_Processor.php | 7 +++--- .../tests/WPEPubEntityReaderTests.php | 22 ++--------------- .../tests/WPHTMLToBlocksTests.php | 24 ++++++++++++++++++- 5 files changed, 33 insertions(+), 32 deletions(-) diff --git a/packages/playground/data-liberation/src/block-markup/WP_HTML_To_Blocks.php b/packages/playground/data-liberation/src/block-markup/WP_HTML_To_Blocks.php index d4f2118e85..0d36c5629e 100644 --- a/packages/playground/data-liberation/src/block-markup/WP_HTML_To_Blocks.php +++ b/packages/playground/data-liberation/src/block-markup/WP_HTML_To_Blocks.php @@ -45,7 +45,6 @@ public function convert() { } while ( $this->markup_processor->next_token() ) { - var_dump( $this->markup_processor->get_token_type() ); switch ( $this->markup_processor->get_token_type() ) { case '#text': if ( $this->ignore_text ) { @@ -58,7 +57,6 @@ public function convert() { break; } } - var_dump( $this->markup_processor->get_last_error() ); if ( $this->markup_processor->get_last_error() ) { $this->last_error = $this->markup_processor->get_last_error(); @@ -90,8 +88,8 @@ private function handle_tag() { $tag = strtoupper( $html->get_tag() ); $tag_lowercase = strtolower( $tag ); - $is_tag_opener = ! $html->is_tag_closer(); - if ( ! $html->expects_closer() ) { + $is_void_tag = ! $html->expects_closer() && ! $html->is_tag_closer(); + if ( $is_void_tag ) { switch ( $tag ) { case 'META': $key = $html->get_attribute( 'name' ); @@ -119,7 +117,7 @@ private function handle_tag() { // Just insert an HTML block or what? break; } - } elseif ( $is_tag_opener ) { + } elseif ( ! $html->is_tag_closer() ) { switch ( $tag ) { // Block elements case 'SCRIPT': diff --git a/packages/playground/data-liberation/src/entity-readers/WP_EPub_Entity_Reader.php b/packages/playground/data-liberation/src/entity-readers/WP_EPub_Entity_Reader.php index e3a6c2a06b..db7b8b9df3 100644 --- a/packages/playground/data-liberation/src/entity-readers/WP_EPub_Entity_Reader.php +++ b/packages/playground/data-liberation/src/entity-readers/WP_EPub_Entity_Reader.php @@ -93,8 +93,8 @@ public function next_entity() { return false; } - $html_file = array_shift( $this->remaining_html_files ); - $html = $this->zip->read_file( $html_file ); + $html_file = array_shift( $this->remaining_html_files ); + $html = $this->zip->read_file( $html_file ); $this->current_html_reader = new WP_HTML_Entity_Reader( WP_XML_Processor::create_from_string( $html ), $this->current_post_id diff --git a/packages/playground/data-liberation/src/xml-api/WP_XML_Processor.php b/packages/playground/data-liberation/src/xml-api/WP_XML_Processor.php index 22e3039818..b6b2a7669e 100644 --- a/packages/playground/data-liberation/src/xml-api/WP_XML_Processor.php +++ b/packages/playground/data-liberation/src/xml-api/WP_XML_Processor.php @@ -1628,13 +1628,13 @@ private function parse_next_tag() { return false; } - $closer_at = $at; + $closer_at = $at; $this->parser_state = self::STATE_DOCTYPE_NODE; $this->token_length = $closer_at + 1 - $this->token_starts_at; $this->bytes_already_parsed = $closer_at + 1; return true; } - + /* * Anything else here is either unsupported at this point or invalid * syntax. See the class-level @TODO annotations for more information. @@ -1644,7 +1644,6 @@ private function parse_next_tag() { return false; } - /* * An `is_empty_element() && ! $this->is_closing_tag; + return $this->is_tag_opener() && ! $this->is_empty_element(); } /** diff --git a/packages/playground/data-liberation/tests/WPEPubEntityReaderTests.php b/packages/playground/data-liberation/tests/WPEPubEntityReaderTests.php index f6a4c205f4..c6bf17248c 100644 --- a/packages/playground/data-liberation/tests/WPEPubEntityReaderTests.php +++ b/packages/playground/data-liberation/tests/WPEPubEntityReaderTests.php @@ -13,9 +13,6 @@ public function test_entity_reader( $reader ) { $entities = []; while ( $reader->next_entity() ) { $data = $reader->get_entity()->get_data(); - if(isset($data['content'])) { - $data['content'] = $this->normalize_markup( $data['content'] ); - } $entities[] = [ 'type' => $reader->get_entity()->get_type(), 'data' => $data, @@ -23,9 +20,10 @@ public function test_entity_reader( $reader ) { } $this->assertNull( $reader->get_last_error() ); $this->assertEquals( 3, count($entities) ); - $this->assertEquals( 117, strlen($entities[0]['data']['content']) ); + $this->assertGreaterThan( 100, strlen($entities[0]['data']['content']) ); $this->assertGreaterThan( 1000, strlen($entities[1]['data']['content']) ); $this->assertGreaterThan( 1000, strlen($entities[2]['data']['content']) ); + echo $entities[2]['data']['content']; } public function epub_byte_reader_data_provider() { @@ -39,20 +37,4 @@ public function epub_byte_reader_data_provider() { ]; } - private function normalize_markup( $markup ) { - $processor = new WP_HTML_Processor( $markup ); - $serialized = $processor->serialize(); - // Naively remove parts of the HTML that serialize() - // adds that we don't want. - $serialized = str_replace( - [ - '', - '', - ], - '', - $serialized - ); - return $serialized; - } - } diff --git a/packages/playground/data-liberation/tests/WPHTMLToBlocksTests.php b/packages/playground/data-liberation/tests/WPHTMLToBlocksTests.php index 66a1a64306..91359b9e47 100644 --- a/packages/playground/data-liberation/tests/WPHTMLToBlocksTests.php +++ b/packages/playground/data-liberation/tests/WPHTMLToBlocksTests.php @@ -146,7 +146,29 @@ public function test_html_to_blocks_excerpt() { } $this->assertEquals( file_get_contents( $output_file ), $blocks ); - + } + + public function test_xhtml_to_blocks_conversion() { + $input = << + + + +

Hello, world!

+

And some content

+ + +XML; + $converter = new WP_HTML_To_Blocks( WP_XML_Processor::create_from_string( $input ) ); + $converter->convert( $input ); + $blocks = $converter->get_block_markup(); + $expected = <<

Hello, world!

And some content

+HTML; + $this->assertEquals( + $this->normalize_markup( $expected ), + $this->normalize_markup( $blocks ) + ); } }