diff --git a/packages/playground/data-liberation/bootstrap.php b/packages/playground/data-liberation/bootstrap.php
index 091dcc4ec0..dcfab2623f 100644
--- a/packages/playground/data-liberation/bootstrap.php
+++ b/packages/playground/data-liberation/bootstrap.php
@@ -61,6 +61,7 @@
 require_once __DIR__ . '/src/entity-readers/WP_Entity_Reader.php';
 require_once __DIR__ . '/src/entity-readers/WP_HTML_Entity_Reader.php';
+require_once __DIR__ . '/src/entity-readers/WP_EPub_Entity_Reader.php';
 require_once __DIR__ . '/src/entity-readers/WP_WXR_Entity_Reader.php';
 require_once __DIR__ . '/src/entity-readers/WP_Directory_Tree_Entity_Reader.php';
diff --git a/packages/playground/data-liberation/src/entity-readers/WP_EPub_Entity_Reader.php b/packages/playground/data-liberation/src/entity-readers/WP_EPub_Entity_Reader.php
new file mode 100644
index 0000000000..d4bfb35320
--- /dev/null
+++ b/packages/playground/data-liberation/src/entity-readers/WP_EPub_Entity_Reader.php
@@ -0,0 +1,136 @@
+zip = $zip;
+		$this->current_post_id = $first_post_id;
+	}
+
+	/**
+	 * Advances to the next entity in the EPUB archive.
+	 *
+	 * On the first call, looks for an `/OEBPS` or `/EPUB` directory in the
+	 * archive and queues every `.xhtml` / `.html` entry returned by listing
+	 * that directory. Each queued file is then read and handed to a fresh
+	 * WP_HTML_Entity_Reader, which is drained before the next file is opened.
+	 *
+	 * @return bool True when the current HTML reader advanced to an entity,
+	 *              false when the reader is finished or an error was reported.
+	 */
+	public function next_entity() {
+		// If we're finished, we're finished.
+		if ( $this->finished ) {
+			return false;
+		}
+
+		if ( null === $this->remaining_html_files ) {
+			$path = false;
+			foreach ( array( '/OEBPS', '/EPUB' ) as $path_candidate ) {
+				if ( $this->zip->is_dir( $path_candidate ) ) {
+					$path = $path_candidate;
+					break;
+				}
+			}
+			if ( false === $path ) {
+				_doing_it_wrong( __METHOD__, 'The EPUB file did not contain any HTML files.', '1.0.0' );
+				$this->finished = true;
+				return false;
+			}
+
+			$files = $this->zip->ls( $path );
+			if ( false === $files ) {
+				// Listing failed, which is a different problem than an
+				// archive without HTML files, so say so.
+				_doing_it_wrong(
+					__METHOD__,
+					sprintf( 'Could not list the files in the %s directory of the EPUB file.', $path ),
+					'1.0.0'
+				);
+				$this->finished = true;
+				return false;
+			}
+			$this->remaining_html_files = array();
+			foreach ( $files as $file ) {
+				if ( str_ends_with( $file, '.xhtml' ) || str_ends_with( $file, '.html' ) ) {
+					$this->remaining_html_files[] = $path . '/' . $file;
+				}
+			}
+		}
+
+		while ( true ) {
+			if ( null !== $this->current_html_reader ) {
+				if (
+					! $this->current_html_reader->is_finished() &&
+					$this->current_html_reader->next_entity()
+				) {
+					return true;
+				}
+				$error = $this->current_html_reader->get_last_error();
+				if ( $error ) {
+					// Surface the nested reader's failure instead of
+					// claiming that the archive had no HTML files.
+					_doing_it_wrong(
+						__METHOD__,
+						sprintf(
+							'Failed to read entities from an HTML file in the EPUB file: %s',
+							is_string( $error ) ? $error : 'unknown error'
+						),
+						'1.0.0'
+					);
+					$this->finished = true;
+					return false;
+				}
+			}
+
+			if ( count( $this->remaining_html_files ) === 0 ) {
+				$this->finished = true;
+				return false;
+			}
+
+			$html_file = array_shift( $this->remaining_html_files );
+			$html = $this->zip->read_file( $html_file );
+			/**
+			 * @TODO: Don't just assume that WP_HTML_Entity_Reader can
+			 *        handle an XHTML file. We might run into XML-specific
+			 *        subtleties that will derail the process.
+			 *        Let's consider using WP_XML_Processor instead.
+			 */
+			$this->current_html_reader = new \WP_HTML_Entity_Reader(
+				$html,
+				$this->current_post_id
+			);
+			++$this->current_post_id;
+		}
+	}
+
+	/**
+	 * Returns the entity the current HTML reader is positioned on.
+	 *
+	 * @return mixed Whatever the current WP_HTML_Entity_Reader returns from
+	 *               get_entity(), or false when no HTML file has been opened yet.
+	 */
+	public function get_entity() {
+		// Until next_entity() has opened an HTML file there is no nested
+		// reader to delegate to; return false instead of calling a method on null.
+		if ( null === $this->current_html_reader ) {
+			return false;
+		}
+		return $this->current_html_reader->get_entity();
+	}
+
+	/**
+	 * Tells whether the reader has stopped, either because every queued HTML
+	 * file was consumed or because an error was reported.
+	 *
+	 * @return bool
+	 */
+	public function is_finished(): bool {
+		return $this->finished;
+	}
+
+	/**
+	 * Always returns null: this reader reports problems through
+	 * _doing_it_wrong() and does not record an error message of its own.
+	 *
+	 * @return string|null
+	 */
+	public function get_last_error(): ?string {
+		return null;
+	}
+}
diff --git a/packages/playground/data-liberation/tests/WPEPubEntityReaderTests.php b/packages/playground/data-liberation/tests/WPEPubEntityReaderTests.php
new file mode 100644
index 0000000000..4a6c7a2324
--- /dev/null
+++ b/packages/playground/data-liberation/tests/WPEPubEntityReaderTests.php
@@ -0,0 +1,53 @@
+next_entity() ) {
+			$data = $reader->get_entity()->get_data();
+			if(isset($data['content'])) {
+				$data['content'] = $this->normalize_markup( $data['content'] );
+			}
+			$entities[] = [
+				'type' => $reader->get_entity()->get_type(),
+				'data' => $data,
+			];
+		}
+		// assertCount() reports the actual array size on failure.
+		$this->assertCount( 3, $entities );
+		$this->assertEquals( 117, strlen($entities[0]['data']['content']) );
+		$this->assertGreaterThan( 1000, strlen($entities[1]['data']['content']) );
+		$this->assertGreaterThan( 1000, strlen($entities[2]['data']['content']) );
+	}
+
+	/**
+	 * Supplies two data sets: a byte reader over the local fixture file and
+	 * a ranged byte reader over a remote childrens-literature.epub URL.
+	 */
+	public function epub_byte_reader_data_provider() {
+		return [
+			'Local file' => [
+				\WordPress\ByteReader\WP_File_Reader::create( __DIR__ . '/fixtures/epub-entity-reader/childrens-literature.epub' )
+			],
+			'Remote file' => [
+				\WordPress\ByteReader\WP_Remote_File_Ranged_Reader::create( 'https://github.com/IDPF/epub3-samples/releases/download/20230704/childrens-literature.epub' )
+			],
+		];
+	}
+
+	/**
+	 * Serializes the markup with WP_HTML_Processor and removes the search
+	 * strings listed below from the serialized output.
+	 */
+	private function normalize_markup( $markup ) {
+		$processor = new WP_HTML_Processor( $markup );
+		$serialized = $processor->serialize();
+		// Naively remove parts of the HTML that serialize()
+		// adds that we don't want.
+		$serialized = str_replace(
+			[
+				'',
+				'',
+			],
+			'',
+			$serialized
+		);
+		return $serialized;
+	}
+
+}
diff --git a/packages/playground/data-liberation/tests/fixtures/epub-entity-reader/childrens-literature.epub b/packages/playground/data-liberation/tests/fixtures/epub-entity-reader/childrens-literature.epub
new file mode 100644
index 0000000000..ba84a64399
Binary files /dev/null and b/packages/playground/data-liberation/tests/fixtures/epub-entity-reader/childrens-literature.epub differ