-
Notifications
You must be signed in to change notification settings - Fork 274
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Description TBD
- Loading branch information
Showing
4 changed files
with
180 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
122 changes: 122 additions & 0 deletions
122
packages/playground/data-liberation/src/entity-readers/WP_EPub_Entity_Reader.php
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
<?php | ||
|
||
use WordPress\Zip\WP_Zip_Filesystem; | ||
|
||
/** | ||
* https://www.w3.org/AudioVideo/ebook/ | ||
* | ||
* An EPUB Publication is transported as a single file (a "portable document") that contains: | ||
* * a Package Document (OPF file) which specifies all the Publication's constituent content documents and their required resources, defines a reading order and associates Publication-level metadata and navigation information. | ||
* * A metadata element including and/or referencing metadata applicable to the entire Publication and particular resources within it. | ||
* * A manifest element: identifies (via IRI) and describes (via MIME media type) the set of resources that constitute the EPUB Publication. | ||
* * A spine element: defines the default reading order of the Publication (an ordered list of Publication Resources, i.e. EPUB Content Documents). | ||
* * A Bindings element defines a set of custom handlers for media types not supported by EPUB3. If the Reading System cannot support the specific media type, it could use scripting fallback if supported. | ||
* * all Content Documents | ||
* * all other required resources for processing the Publication. | ||
* | ||
* The OCF Container is packaged into a physical single ZIP file containing: | ||
* * Mime Type file: application/epub+zip. | ||
* * META-INF folder: the container file (which points to the location of the .opf file), plus the signatures, encryption, and rights files; all of them are XML files. | ||
* * OEBPS folder: stores the book content (opf, ncx, html, svg, png, css, etc. files). | ||
*/ | ||
class WP_EPub_Entity_Reader extends WP_Entity_Reader {

	/**
	 * Filesystem view of the EPUB archive.
	 *
	 * @var WP_Zip_Filesystem
	 */
	protected $zip;

	/**
	 * True once the reader has run out of entities or stopped on an error.
	 *
	 * @var bool
	 */
	protected $finished = false;

	/**
	 * Post ID passed to the next HTML reader that gets created.
	 *
	 * @var int
	 */
	protected $current_post_id;

	/**
	 * Paths of the content documents that have not been opened yet.
	 * Null until the archive has been scanned by next_entity().
	 *
	 * @var string[]|null
	 */
	protected $remaining_html_files;

	/**
	 * Reader for the content document currently being processed.
	 * Null until the first content document has been opened.
	 *
	 * @var WP_HTML_Entity_Reader|null
	 */
	protected $current_html_reader;

	/**
	 * Description of the failure that stopped the reader, or null if
	 * no failure has happened.
	 *
	 * @var string|null
	 */
	protected $last_error;

	/**
	 * @param WP_Zip_Filesystem $zip           Filesystem view of the EPUB archive.
	 * @param int               $first_post_id Post ID passed to the reader of the first
	 *                                         content document; incremented by one for
	 *                                         every following document.
	 */
	public function __construct( WP_Zip_Filesystem $zip, $first_post_id = 1 ) {
		$this->zip             = $zip;
		$this->current_post_id = $first_post_id;
	}

	/**
	 * Advances to the next entity found in the EPUB content documents.
	 *
	 * The first call scans the archive for .xhtml and .html files. Each file
	 * is then handed to a WP_HTML_Entity_Reader, and the entities of that
	 * reader are exposed one by one before moving on to the next file.
	 *
	 * @return bool True when a new entity is available via get_entity(),
	 *              false when there are no more entities or an error occurred.
	 *              Use get_last_error() to tell the two apart.
	 */
	public function next_entity() {
		if ( $this->finished ) {
			return false;
		}

		if ( null === $this->remaining_html_files && ! $this->discover_html_files() ) {
			return false;
		}

		while ( true ) {
			if ( null !== $this->current_html_reader ) {
				if (
					! $this->current_html_reader->is_finished() &&
					$this->current_html_reader->next_entity()
				) {
					return true;
				}

				// The nested reader stopped: either it ran out of entities
				// (move on to the next file) or it failed (surface the reason).
				$reader_error = $this->current_html_reader->get_last_error();
				if ( $reader_error ) {
					return $this->bail(
						__METHOD__,
						'Failed to read an HTML file from the EPUB file: ' . $reader_error
					);
				}
			}

			if ( count( $this->remaining_html_files ) === 0 ) {
				$this->finished = true;
				return false;
			}

			$html_file = array_shift( $this->remaining_html_files );
			$html      = $this->zip->read_file( $html_file );
			// NOTE(review): assumes read_file() signals failure with false,
			// like ls() does – confirm against WP_Zip_Filesystem.
			if ( false === $html ) {
				return $this->bail(
					__METHOD__,
					'Could not read the file "' . $html_file . '" from the EPUB file.'
				);
			}

			/**
			 * @TODO: Don't just assume that WP_HTML_Entity_Reader can
			 *        handle an XHTML file. We might run into XML-specific
			 *        subtleties that will derail the process.
			 *        Let's consider using WP_XML_Processor instead.
			 */
			$this->current_html_reader = new \WP_HTML_Entity_Reader(
				$html,
				$this->current_post_id
			);
			++$this->current_post_id;
		}
	}

	/**
	 * Returns the entity the reader currently points at.
	 *
	 * @return mixed Whatever the current WP_HTML_Entity_Reader returns from
	 *               get_entity(), or false when no content document has been
	 *               opened yet.
	 */
	public function get_entity() {
		// Guard against calls made before the first successful next_entity().
		// NOTE(review): false mirrors next_entity()'s "nothing available"
		// value – confirm against the WP_Entity_Reader contract.
		if ( null === $this->current_html_reader ) {
			return false;
		}
		return $this->current_html_reader->get_entity();
	}

	/**
	 * @return bool True once all entities were consumed or an error stopped the reader.
	 */
	public function is_finished(): bool {
		return $this->finished;
	}

	/**
	 * @return string|null Description of the failure that stopped the reader,
	 *                     or null if no failure has happened.
	 */
	public function get_last_error(): ?string {
		return $this->last_error;
	}

	/**
	 * Finds the content directory and collects the HTML files stored in it.
	 *
	 * Only the first existing directory out of /OEBPS and /EPUB is scanned,
	 * and only its direct .xhtml / .html children are collected, in the order
	 * returned by the filesystem.
	 *
	 * @return bool True on success, false if the reader was stopped with an error.
	 */
	private function discover_html_files() {
		$path = false;
		foreach ( array( '/OEBPS', '/EPUB' ) as $path_candidate ) {
			if ( $this->zip->is_dir( $path_candidate ) ) {
				$path = $path_candidate;
				break;
			}
		}
		if ( false === $path ) {
			return $this->bail( __METHOD__, 'The EPUB file did not contain any HTML files.' );
		}

		$files = $this->zip->ls( $path );
		if ( false === $files ) {
			return $this->bail( __METHOD__, 'The EPUB file did not contain any HTML files.' );
		}

		$this->remaining_html_files = array();
		foreach ( $files as $file ) {
			if ( str_ends_with( $file, '.xhtml' ) || str_ends_with( $file, '.html' ) ) {
				$this->remaining_html_files[] = $path . '/' . $file;
			}
		}
		return true;
	}

	/**
	 * Records a failure, reports it and marks the reader as finished.
	 *
	 * @param string $method  Name of the method reporting the failure.
	 * @param string $message Human-readable description of the failure.
	 * @return bool Always false, so callers can `return $this->bail( ... )`.
	 */
	private function bail( $method, $message ) {
		_doing_it_wrong( $method, $message, '1.0.0' );
		$this->last_error = $message;
		$this->finished   = true;
		return false;
	}
}
57 changes: 57 additions & 0 deletions
57
packages/playground/data-liberation/tests/WPEPubEntityReaderTests.php
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
<?php | ||
|
||
use PHPUnit\Framework\TestCase; | ||
|
||
class WPEPubEntityReaderTests extends TestCase {

	/**
	 * Reads the sample EPUB through the given byte reader and checks that
	 * three entities with the expected content sizes are produced.
	 *
	 * @dataProvider epub_byte_reader_data_provider
	 *
	 * @param mixed $byte_reader Byte reader giving access to the EPUB archive.
	 */
	public function test_entity_reader( $byte_reader ) {
		$zip           = new \WordPress\Zip\WP_Zip_Filesystem( $byte_reader );
		$entity_reader = new \WP_EPub_Entity_Reader( $zip );

		$entities = [];
		while ( $entity_reader->next_entity() ) {
			$entity = $entity_reader->get_entity();
			$data   = $entity->get_data();
			if ( isset( $data['content'] ) ) {
				$data['content'] = $this->normalize_markup( $data['content'] );
			}
			$entities[] = [
				'type' => $entity->get_type(),
				'data' => $data,
			];
		}

		$this->assertCount( 3, $entities );
		$this->assertSame( 117, strlen( $entities[0]['data']['content'] ) );
		$this->assertGreaterThan( 1000, strlen( $entities[1]['data']['content'] ) );
		$this->assertGreaterThan( 1000, strlen( $entities[2]['data']['content'] ) );
	}

	/**
	 * Provides byte readers for the same sample EPUB, once from the local
	 * fixture and once from its remote download URL.
	 *
	 * Declared static because PHPUnit 10 deprecates non-static data providers;
	 * static providers also work with earlier PHPUnit versions.
	 *
	 * NOTE(review): the "Remote file" dataset is built from a remote URL, so
	 * this test presumably depends on network access – confirm this is
	 * acceptable for CI.
	 *
	 * @return array[] Named datasets, each holding a single byte reader.
	 */
	public static function epub_byte_reader_data_provider() {
		return [
			'Local file'  => [
				\WordPress\ByteReader\WP_File_Reader::create( __DIR__ . '/fixtures/epub-entity-reader/childrens-literature.epub' ),
			],
			'Remote file' => [
				\WordPress\ByteReader\WP_Remote_File_Ranged_Reader::create( 'https://github.com/IDPF/epub3-samples/releases/download/20230704/childrens-literature.epub' ),
			],
		];
	}

	/**
	 * Re-serializes the markup with WP_HTML_Processor so that content
	 * lengths are compared on a normalized form.
	 *
	 * @param string $markup Markup to normalize.
	 * @return string Serialized markup without the document wrapper tags.
	 */
	private function normalize_markup( $markup ) {
		$processor  = new WP_HTML_Processor( $markup );
		$serialized = $processor->serialize();

		// Naively remove parts of the HTML that serialize()
		// adds that we don't want.
		return str_replace(
			[
				'<html><head></head><body>',
				'</body></html>',
			],
			'',
			$serialized
		);
	}

}
Binary file added
BIN
+158 KB
...es/playground/data-liberation/tests/fixtures/epub-entity-reader/childrens-literature.epub
Binary file not shown.