[Data Liberation] Add Epub importer

Description TBD
WordPress · Dec 17, 2024 · e01dec8 · e01dec8
1 parent 55c9e4e
commit e01dec8
Show file tree

Hide file tree

Showing 4 changed files with 180 additions and 0 deletions.
diff --git a/packages/playground/data-liberation/bootstrap.php b/packages/playground/data-liberation/bootstrap.php
@@ -61,6 +61,7 @@
 
 require_once __DIR__ . '/src/entity-readers/WP_Entity_Reader.php';
 require_once __DIR__ . '/src/entity-readers/WP_HTML_Entity_Reader.php';
+require_once __DIR__ . '/src/entity-readers/WP_EPub_Entity_Reader.php';
 require_once __DIR__ . '/src/entity-readers/WP_WXR_Entity_Reader.php';
 require_once __DIR__ . '/src/entity-readers/WP_Directory_Tree_Entity_Reader.php';
 

diff --git a/packages/playground/data-liberation/src/entity-readers/WP_EPub_Entity_Reader.php b/packages/playground/data-liberation/src/entity-readers/WP_EPub_Entity_Reader.php
@@ -0,0 +1,122 @@
+<?php
+
+use WordPress\Zip\WP_Zip_Filesystem;
+
+/**
+ * Reads the HTML content documents stored in an EPUB file and exposes
+ * the entities found in each of them.
+ *
+ * Background, per https://www.w3.org/AudioVideo/ebook/
+ *
+ * An EPUB Publication is transported as a single file (a "portable document") that contains:
+ * * A Package Document (OPF file) which specifies all the Publication's constituent content documents and their required resources, defines a reading order, and associates Publication-level metadata and navigation information. It consists of:
+ *    * A metadata element including and/or referencing metadata applicable to the entire Publication and particular resources within it.
+ *    * A manifest element: identifies (via IRI) and describes (via MIME media type) the set of resources that constitute the EPUB Publication.
+ *    * A spine element: defines the default reading order of the Publication (an ordered list of Publication Resources, i.e. EPUB Content Documents).
+ *    * A bindings element: defines a set of custom handlers for media types not supported by EPUB3. If the Reading System cannot support the specific media type, it could use a scripting fallback if supported.
+ * * All Content Documents.
+ * * All other required resources for processing the Publication.
+ *
+ * The OCF Container is packaged into a single physical ZIP file containing:
+ * * A mimetype file: application/epub+zip.
+ * * A META-INF folder with XML files: the container file (which points to the location of the .opf file), signatures, encryption, and rights.
+ * * An OEBPS folder that stores the book content (opf, ncx, html, svg, png, css, etc. files).
+ *
+ * This reader does not parse the Package Document. It looks for an `/OEBPS`
+ * or `/EPUB` directory in the archive, keeps the `.xhtml` and `.html` entries
+ * that `ls()` returns for that directory, and hands them one by one to
+ * WP_HTML_Entity_Reader in the order they were listed.
+ */
+class WP_EPub_Entity_Reader extends WP_Entity_Reader {
+
+	/**
+	 * The ZIP filesystem wrapping the EPUB archive.
+	 *
+	 * @var WP_Zip_Filesystem
+	 */
+	protected $zip;
+
+	/**
+	 * Whether the reader has stopped, either because every file was
+	 * processed or because an error occurred.
+	 *
+	 * @var bool
+	 */
+	protected $finished = false;
+
+	/**
+	 * The post ID handed to the next WP_HTML_Entity_Reader.
+	 * Incremented once per HTML file.
+	 *
+	 * @var int
+	 */
+	protected $current_post_id;
+
+	/**
+	 * Paths of the HTML files that have not been opened yet.
+	 * Null until the archive has been scanned.
+	 *
+	 * @var string[]|null
+	 */
+	protected $remaining_html_files;
+
+	/**
+	 * The reader processing the HTML file currently being consumed.
+	 * Null until the first HTML file has been opened.
+	 *
+	 * @var WP_HTML_Entity_Reader|null
+	 */
+	protected $current_html_reader;
+
+	/**
+	 * Description of the error that stopped the reader, if any.
+	 *
+	 * @var string|null
+	 */
+	protected $last_error;
+
+	/**
+	 * @param WP_Zip_Filesystem $zip           The EPUB archive to read from.
+	 * @param int               $first_post_id The post ID assigned to the first HTML file.
+	 *                                         Every subsequent file gets the next integer.
+	 */
+	public function __construct( WP_Zip_Filesystem $zip, $first_post_id = 1 ) {
+		$this->zip             = $zip;
+		$this->current_post_id = $first_post_id;
+	}
+
+	/**
+	 * Advances to the next entity found in the EPUB's HTML files.
+	 *
+	 * Entities are produced by WP_HTML_Entity_Reader instances, one per
+	 * HTML file. When the current file has no more entities, the next
+	 * file is opened.
+	 *
+	 * @return bool True when a new entity is available via get_entity(),
+	 *              false when there are no more entities or an error
+	 *              occurred (see get_last_error()).
+	 */
+	public function next_entity() {
+		// Once stopped, the reader never resumes.
+		if ( $this->finished ) {
+			return false;
+		}
+
+		// Scan the archive lazily, on the first call only.
+		if ( null === $this->remaining_html_files ) {
+			$html_files = $this->list_html_files();
+			if ( false === $html_files ) {
+				return false;
+			}
+			$this->remaining_html_files = $html_files;
+		}
+
+		while ( true ) {
+			if ( null !== $this->current_html_reader ) {
+				if (
+					! $this->current_html_reader->is_finished() &&
+					$this->current_html_reader->next_entity()
+				) {
+					return true;
+				}
+				$reader_error = $this->current_html_reader->get_last_error();
+				if ( $reader_error ) {
+					return $this->stop_with_error(
+						'The HTML reader failed while processing an EPUB content document: ' . $reader_error
+					);
+				}
+			}
+
+			// The current file is exhausted – move on to the next one, if any.
+			if ( count( $this->remaining_html_files ) === 0 ) {
+				$this->finished = true;
+				return false;
+			}
+
+			$html_file = array_shift( $this->remaining_html_files );
+			$html      = $this->zip->read_file( $html_file );
+			if ( ! is_string( $html ) ) {
+				return $this->stop_with_error(
+					sprintf( 'Could not read the "%s" file from the EPUB archive.', $html_file )
+				);
+			}
+			/**
+			 * @TODO: Don't just assume that WP_HTML_Entity_Reader can
+			 *        handle an XHTML file. We might run into XML-specific
+			 *        subtleties that will derail the process.
+			 *        Let's consider using WP_XML_Processor instead.
+			 */
+			$this->current_html_reader = new \WP_HTML_Entity_Reader(
+				$html,
+				$this->current_post_id
+			);
+			++$this->current_post_id;
+		}
+	}
+
+	/**
+	 * Returns the entity the reader currently points to.
+	 *
+	 * @return mixed Whatever the underlying WP_HTML_Entity_Reader returns,
+	 *               or false when no HTML file has been opened yet.
+	 */
+	public function get_entity() {
+		if ( null === $this->current_html_reader ) {
+			return false;
+		}
+		return $this->current_html_reader->get_entity();
+	}
+
+	/**
+	 * @return bool Whether the reader has stopped producing entities.
+	 */
+	public function is_finished(): bool {
+		return $this->finished;
+	}
+
+	/**
+	 * @return string|null Description of the error that stopped the reader,
+	 *                     or null when no error occurred.
+	 */
+	public function get_last_error(): ?string {
+		return $this->last_error;
+	}
+
+	/**
+	 * Lists the HTML files stored in the EPUB content directory.
+	 *
+	 * Only the entries returned by `ls()` for the first existing directory
+	 * among `/OEBPS` and `/EPUB` are considered.
+	 *
+	 * @return string[]|false Paths of the `.xhtml` and `.html` files, or false
+	 *                        when the content directory is missing or cannot be listed.
+	 */
+	protected function list_html_files() {
+		$content_dir = false;
+		foreach ( array( '/OEBPS', '/EPUB' ) as $candidate ) {
+			if ( $this->zip->is_dir( $candidate ) ) {
+				$content_dir = $candidate;
+				break;
+			}
+		}
+		if ( false === $content_dir ) {
+			return $this->stop_with_error(
+				'The EPUB file did not contain an /OEBPS or /EPUB directory.'
+			);
+		}
+
+		$entries = $this->zip->ls( $content_dir );
+		if ( false === $entries ) {
+			return $this->stop_with_error(
+				sprintf( 'Could not list the contents of the %s directory in the EPUB file.', $content_dir )
+			);
+		}
+
+		$html_files = array();
+		foreach ( $entries as $entry ) {
+			if ( str_ends_with( $entry, '.xhtml' ) || str_ends_with( $entry, '.html' ) ) {
+				$html_files[] = $content_dir . '/' . $entry;
+			}
+		}
+		return $html_files;
+	}
+
+	/**
+	 * Records an error, reports it, and marks the reader as finished.
+	 *
+	 * @param string $message Description of what went wrong.
+	 * @return false Always false, so callers can `return $this->stop_with_error( ... )`.
+	 */
+	protected function stop_with_error( $message ) {
+		$this->last_error = $message;
+		$this->finished   = true;
+		// Report under next_entity(), the public method all failures originate from.
+		_doing_it_wrong( __CLASS__ . '::next_entity', $message, '1.0.0' );
+		return false;
+	}
+}
diff --git a/packages/playground/data-liberation/tests/WPEPubEntityReaderTests.php b/packages/playground/data-liberation/tests/WPEPubEntityReaderTests.php
@@ -0,0 +1,57 @@
+<?php
+
+use PHPUnit\Framework\TestCase;
+
+class WPEPubEntityReaderTests extends TestCase {
+
+    /**
+     * Reads every entity from the sample EPUB and verifies how many
+     * entities were found and how long their normalized content is.
+     *
+     * @dataProvider epub_byte_reader_data_provider
+     *
+     * @param mixed $reader Byte reader giving access to the EPUB file,
+     *                      as supplied by the data provider.
+     */
+    public function test_entity_reader( $reader ) {
+        $zip         = new \WordPress\Zip\WP_Zip_Filesystem( $reader );
+        // Use a dedicated variable instead of overwriting the $reader argument.
+        $epub_reader = new \WP_EPub_Entity_Reader( $zip );
+        $entities    = [];
+        while ( $epub_reader->next_entity() ) {
+            $entity = $epub_reader->get_entity();
+            $data   = $entity->get_data();
+            if ( isset( $data['content'] ) ) {
+                $data['content'] = $this->normalize_markup( $data['content'] );
+            }
+            $entities[] = [
+                'type' => $entity->get_type(),
+                'data' => $data,
+            ];
+        }
+        $this->assertCount( 3, $entities );
+        $this->assertSame( 117, strlen( $entities[0]['data']['content'] ) );
+        $this->assertGreaterThan( 1000, strlen( $entities[1]['data']['content'] ) );
+        $this->assertGreaterThan( 1000, strlen( $entities[2]['data']['content'] ) );
+    }
+
+    /**
+     * Provides byte readers for the same EPUB stored locally and remotely.
+     *
+     * NOTE: the "Remote file" data set points at an https:// URL, so it
+     * needs network access.
+     *
+     * @return array[] Data sets keyed by a human-readable label.
+     */
+    public function epub_byte_reader_data_provider() {
+        return [
+            'Local file' => [
+                \WordPress\ByteReader\WP_File_Reader::create( __DIR__ . '/fixtures/epub-entity-reader/childrens-literature.epub' )
+            ],
+            'Remote file' => [
+                \WordPress\ByteReader\WP_Remote_File_Ranged_Reader::create( 'https://github.com/IDPF/epub3-samples/releases/download/20230704/childrens-literature.epub' )
+            ],
+        ];
+    }
+
+    /**
+     * Re-serializes the markup with WP_HTML_Processor and strips the
+     * document wrapper so that only the content itself is compared.
+     *
+     * @param string $markup The markup to normalize.
+     * @return string The normalized markup.
+     */
+    private function normalize_markup( $markup ) {
+        $processor  = new WP_HTML_Processor( $markup );
+        $serialized = $processor->serialize();
+        // Naively remove parts of the HTML that serialize()
+        // adds that we don't want.
+        return str_replace(
+            [
+                '<html><head></head><body>',
+                '</body></html>',
+            ],
+            '',
+            $serialized
+        );
+    }
+
+}
diff --git a/...es/playground/data-liberation/tests/fixtures/epub-entity-reader/childrens-literature.epub b/...es/playground/data-liberation/tests/fixtures/epub-entity-reader/childrens-literature.epub