Skip to content

Commit

Permalink
[Data Liberation] Add Epub importer
Browse files Browse the repository at this point in the history
Description TBD
  • Loading branch information
adamziel committed Dec 17, 2024
1 parent 55c9e4e commit e01dec8
Show file tree
Hide file tree
Showing 4 changed files with 180 additions and 0 deletions.
1 change: 1 addition & 0 deletions packages/playground/data-liberation/bootstrap.php
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@

require_once __DIR__ . '/src/entity-readers/WP_Entity_Reader.php';
require_once __DIR__ . '/src/entity-readers/WP_HTML_Entity_Reader.php';
require_once __DIR__ . '/src/entity-readers/WP_EPub_Entity_Reader.php';
require_once __DIR__ . '/src/entity-readers/WP_WXR_Entity_Reader.php';
require_once __DIR__ . '/src/entity-readers/WP_Directory_Tree_Entity_Reader.php';

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
<?php

use WordPress\Zip\WP_Zip_Filesystem;

/**
* https://www.w3.org/AudioVideo/ebook/
*
* An EPUB Publication is transported as a single file (a "portable document") that contains:
* * a Package Document (OPF file) which specifies all the Publication's constituent content documents and their required resources, defines a reading order and associates Publication-level metadata and navigation information.
* * A metadata element including and/or referencing metadata applicable to the entire Publication and particular resources within it.
* * A manifest element: identifies (via IRI) and describes (via MIME media type) the set of resources that constitute the EPUB Publication.
* * A spine element: defines the default reading order of the Publication (an ordered list of Publication Resources, i.e. EPUB Content Documents).
* * A Bindings element defines a set of custom handlers for media types not supported by EPUB3. If the Reading System cannot support the specific media type, it could use scripting fallback if supported.
* * all Content Documents
* * all other required resources for processing the Publication.
*
* The OCF Container is packaged into a physical single ZIP file containing:
* * Mime Type file: application/epub+zip.
* * META-INF folder: holds the container file (which points to the location of the .opf file) as well as the signatures, encryption, and rights information, all of which are XML files.
* * OEBPS folder: stores the book content (.opf, .ncx, .html, .svg, .png, .css, etc. files).
*/
class WP_EPub_Entity_Reader extends WP_Entity_Reader {

	/**
	 * The EPUB archive, exposed as a filesystem.
	 *
	 * @var WP_Zip_Filesystem
	 */
	protected $zip;

	/**
	 * True once every content document was consumed or an error stopped the reader.
	 *
	 * @var bool
	 */
	protected $finished = false;

	/**
	 * Post ID passed to the HTML reader created for the next content document.
	 * Incremented once per document.
	 *
	 * @var int
	 */
	protected $current_post_id;

	/**
	 * Paths of the content documents that were not read yet.
	 * Stays null until the archive was scanned by the first next_entity() call.
	 *
	 * @var string[]|null
	 */
	protected $remaining_html_files;

	/**
	 * Reader emitting the entities of the content document being processed.
	 *
	 * @var WP_HTML_Entity_Reader|null
	 */
	protected $current_html_reader;

	/**
	 * Description of the failure that stopped the reader, or null if there was none.
	 *
	 * @var string|null
	 */
	protected $last_error;

	/**
	 * @param WP_Zip_Filesystem $zip           The EPUB archive to read.
	 * @param int               $first_post_id Post ID given to the first content document;
	 *                                         each following document gets the next integer.
	 */
	public function __construct( WP_Zip_Filesystem $zip, $first_post_id = 1 ) {
		$this->zip             = $zip;
		$this->current_post_id = $first_post_id;
	}

	/**
	 * Advances to the next entity found in the EPUB content documents.
	 *
	 * The first call scans the archive for .xhtml/.html files. Each file is then
	 * delegated to a WP_HTML_Entity_Reader; once that reader runs out of entities,
	 * the next file is opened.
	 *
	 * @return bool True when an entity is available via get_entity(), false when
	 *              the reader is finished. After a failure, get_last_error()
	 *              describes what went wrong.
	 */
	public function next_entity() {
		if ( $this->finished ) {
			return false;
		}

		if ( null === $this->remaining_html_files ) {
			$html_files = $this->find_html_files();
			if ( false === $html_files ) {
				return false;
			}
			$this->remaining_html_files = $html_files;
		}

		while ( true ) {
			if ( null !== $this->current_html_reader ) {
				if (
					! $this->current_html_reader->is_finished() &&
					$this->current_html_reader->next_entity()
				) {
					return true;
				}

				// The per-file reader stopped. Tell apart "no more entities"
				// from "something went wrong" before moving to the next file.
				$reader_error = $this->current_html_reader->get_last_error();
				if ( $reader_error ) {
					return $this->fail(
						'Failed to read an HTML file from the EPUB file: ' . $reader_error
					);
				}
			}

			if ( 0 === count( $this->remaining_html_files ) ) {
				$this->finished = true;
				return false;
			}

			$html_file = array_shift( $this->remaining_html_files );
			$html      = $this->zip->read_file( $html_file );
			// NOTE(review): assumes read_file() signals failure with false, like
			// ls() does in this class – confirm against WP_Zip_Filesystem.
			if ( false === $html ) {
				return $this->fail(
					'Could not read "' . $html_file . '" from the EPUB file.'
				);
			}

			/**
			 * @TODO: Don't just assume that WP_HTML_Entity_Reader can
			 *        handle an XHTML file. We might run into XML-specific
			 *        subtleties that will derail the process.
			 *        Let's consider using WP_XML_Processor instead.
			 */
			$this->current_html_reader = new \WP_HTML_Entity_Reader(
				$html,
				$this->current_post_id
			);
			++$this->current_post_id;
		}
	}

	/**
	 * Returns the entity the reader currently points at.
	 *
	 * @return mixed Whatever the current per-file reader returns from get_entity(),
	 *               or false when no content document has been opened yet.
	 */
	public function get_entity() {
		// Guard against a fatal "call on null" when get_entity() is used
		// before the first successful next_entity() call.
		if ( null === $this->current_html_reader ) {
			return false;
		}
		return $this->current_html_reader->get_entity();
	}

	/**
	 * @return bool Whether the reader has no more entities to emit.
	 */
	public function is_finished(): bool {
		return $this->finished;
	}

	/**
	 * @return string|null The failure that stopped the reader, or null if none occurred.
	 */
	public function get_last_error(): ?string {
		return $this->last_error;
	}

	/**
	 * Lists the HTML content documents stored directly in the content directory.
	 *
	 * Only the conventional /OEBPS and /EPUB directories are considered and
	 * the listing is not recursive.
	 *
	 * @TODO: Resolve the content documents and their reading order from the
	 *        package document referenced by META-INF/container.xml instead
	 *        of relying on directory naming conventions.
	 *
	 * @return string[]|false Paths of the .xhtml/.html files, or false on failure.
	 */
	private function find_html_files() {
		$content_dir = false;
		foreach ( array( '/OEBPS', '/EPUB' ) as $candidate ) {
			if ( $this->zip->is_dir( $candidate ) ) {
				$content_dir = $candidate;
				break;
			}
		}
		if ( false === $content_dir ) {
			return $this->fail( 'The EPUB file did not contain an /OEBPS or /EPUB directory.' );
		}

		$files = $this->zip->ls( $content_dir );
		if ( false === $files ) {
			return $this->fail(
				'Could not list the files in the ' . $content_dir . ' directory of the EPUB file.'
			);
		}

		$html_files = array();
		foreach ( $files as $file ) {
			if ( str_ends_with( $file, '.xhtml' ) || str_ends_with( $file, '.html' ) ) {
				$html_files[] = $content_dir . '/' . $file;
			}
		}
		return $html_files;
	}

	/**
	 * Records a failure, reports it, and marks the reader as finished.
	 *
	 * @param string $message Description of what went wrong.
	 * @return false Always false, so callers can `return $this->fail( ... )`.
	 */
	private function fail( $message ) {
		$this->last_error = $message;
		$this->finished   = true;
		// Report under the public entry point rather than this private helper.
		_doing_it_wrong( __CLASS__ . '::next_entity', $message, '1.0.0' );
		return false;
	}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
<?php

use PHPUnit\Framework\TestCase;

class WPEPubEntityReaderTests extends TestCase {

	/**
	 * Reads the sample EPUB through the given byte reader and checks that
	 * three entities with the expected content sizes are produced.
	 *
	 * @dataProvider epub_byte_reader_data_provider
	 *
	 * @param mixed $reader Byte reader giving access to the EPUB file.
	 */
	public function test_entity_reader( $reader ) {
		$zip         = new \WordPress\Zip\WP_Zip_Filesystem( $reader );
		$epub_reader = new \WP_EPub_Entity_Reader( $zip );

		$entities = [];
		while ( $epub_reader->next_entity() ) {
			$entity = $epub_reader->get_entity();
			$data   = $entity->get_data();
			if ( isset( $data['content'] ) ) {
				$data['content'] = $this->normalize_markup( $data['content'] );
			}
			$entities[] = [
				'type' => $entity->get_type(),
				'data' => $data,
			];
		}

		$this->assertCount( 3, $entities );
		$this->assertSame( 117, strlen( $entities[0]['data']['content'] ) );
		$this->assertGreaterThan( 1000, strlen( $entities[1]['data']['content'] ) );
		$this->assertGreaterThan( 1000, strlen( $entities[2]['data']['content'] ) );
	}

	/**
	 * Provides the same EPUB file through different byte readers.
	 *
	 * Declared static because newer PHPUnit versions require static data providers.
	 * Note that the "Remote file" dataset needs network access to github.com.
	 *
	 * @return array[] Named datasets, each holding a single byte reader.
	 */
	public static function epub_byte_reader_data_provider() {
		return [
			'Local file' => [
				\WordPress\ByteReader\WP_File_Reader::create( __DIR__ . '/fixtures/epub-entity-reader/childrens-literature.epub' ),
			],
			'Remote file' => [
				\WordPress\ByteReader\WP_Remote_File_Ranged_Reader::create( 'https://github.com/IDPF/epub3-samples/releases/download/20230704/childrens-literature.epub' ),
			],
		];
	}

	/**
	 * Re-serializes the markup with WP_HTML_Processor and strips the document
	 * wrapper that serialize() adds around it.
	 *
	 * @param string $markup HTML markup to normalize.
	 * @return string The serialized markup without the html/head/body wrapper.
	 */
	private function normalize_markup( $markup ) {
		$processor  = new WP_HTML_Processor( $markup );
		$serialized = $processor->serialize();
		// Naively remove parts of the HTML that serialize()
		// adds that we don't want.
		return str_replace(
			[
				'<html><head></head><body>',
				'</body></html>',
			],
			'',
			$serialized
		);
	}

}
Binary file not shown.

0 comments on commit e01dec8

Please sign in to comment.