diff --git a/packages/playground/data-liberation/blueprints-library b/packages/playground/data-liberation/blueprints-library index b52a93ce17..b1362cbe3c 160000 --- a/packages/playground/data-liberation/blueprints-library +++ b/packages/playground/data-liberation/blueprints-library @@ -1 +1 @@ -Subproject commit b52a93ce17562a1964fb27df770792fe165b217b +Subproject commit b1362cbe3ca0956a36cc00dd769698bde0ba2ec7 diff --git a/packages/playground/data-liberation/bootstrap.php b/packages/playground/data-liberation/bootstrap.php index 91038c1ae3..dcfab2623f 100644 --- a/packages/playground/data-liberation/bootstrap.php +++ b/packages/playground/data-liberation/bootstrap.php @@ -11,11 +11,19 @@ require_once __DIR__ . '/blueprints-library/src/WordPress/AsyncHttp/Connection.php'; require_once __DIR__ . '/blueprints-library/src/WordPress/AsyncHttp/Client.php'; -require_once __DIR__ . '/src/byte-readers/WP_Byte_Reader.php'; -require_once __DIR__ . '/src/byte-readers/WP_File_Reader.php'; -require_once __DIR__ . '/src/byte-readers/WP_GZ_File_Reader.php'; -require_once __DIR__ . '/src/byte-readers/WP_Remote_File_Reader.php'; -require_once __DIR__ . '/src/byte-readers/WP_Remote_File_Ranged_Reader.php'; +require_once __DIR__ . '/blueprints-library/src/WordPress/Filesystem/WP_Abstract_Filesystem.php'; +require_once __DIR__ . '/blueprints-library/src/WordPress/Filesystem/WP_Filesystem.php'; +require_once __DIR__ . '/blueprints-library/src/WordPress/Filesystem/WP_File_Visitor_Event.php'; +require_once __DIR__ . '/blueprints-library/src/WordPress/Filesystem/WP_Filesystem_Visitor.php'; + +require_once __DIR__ . '/blueprints-library/src/WordPress/ByteReader/WP_Byte_Reader.php'; +require_once __DIR__ . '/blueprints-library/src/WordPress/ByteReader/WP_File_Reader.php'; +require_once __DIR__ . '/blueprints-library/src/WordPress/ByteReader/WP_GZ_File_Reader.php'; +require_once __DIR__ . '/blueprints-library/src/WordPress/ByteReader/WP_Remote_File_Reader.php'; +require_once __DIR__ . '/blueprints-library/src/WordPress/ByteReader/WP_Remote_File_Ranged_Reader.php'; + +require_once __DIR__ . '/blueprints-library/src/WordPress/Zip/ZipStreamReader.php'; +require_once __DIR__ . '/blueprints-library/src/WordPress/Zip/WP_Zip_Filesystem.php'; if ( ! class_exists( 'WP_HTML_Tag_Processor' ) && @@ -48,17 +56,19 @@ require_once __DIR__ . '/src/block-markup/WP_Block_Markup_Processor.php'; require_once __DIR__ . '/src/block-markup/WP_Block_Markup_Url_Processor.php'; require_once __DIR__ . '/src/block-markup/WP_URL_In_Text_Processor.php'; -require_once __DIR__ . '/src/block-markup/WP_HTML_To_Blocks.php'; require_once __DIR__ . '/src/block-markup/WP_URL.php'; +require_once __DIR__ . '/src/block-markup/WP_HTML_To_Blocks.php'; + +require_once __DIR__ . '/src/entity-readers/WP_Entity_Reader.php'; +require_once __DIR__ . '/src/entity-readers/WP_HTML_Entity_Reader.php'; +require_once __DIR__ . '/src/entity-readers/WP_EPub_Entity_Reader.php'; +require_once __DIR__ . '/src/entity-readers/WP_WXR_Entity_Reader.php'; +require_once __DIR__ . '/src/entity-readers/WP_Directory_Tree_Entity_Reader.php'; require_once __DIR__ . '/src/xml-api/WP_XML_Decoder.php'; require_once __DIR__ . '/src/xml-api/WP_XML_Processor.php'; -require_once __DIR__ . '/src/wxr/WP_WXR_Reader.php'; -require_once __DIR__ . '/src/import/WP_Import_Utils.php'; require_once __DIR__ . '/src/import/WP_Block_Object.php'; require_once __DIR__ . '/src/import/WP_Entity_Importer.php'; -require_once __DIR__ . '/src/import/WP_File_Visitor.php'; -require_once __DIR__ . '/src/import/WP_File_Visitor_Event.php'; require_once __DIR__ . '/src/import/WP_Imported_Entity.php'; require_once __DIR__ . '/src/import/WP_Attachment_Downloader.php'; require_once __DIR__ . '/src/import/WP_Attachment_Downloader_Event.php'; @@ -66,9 +76,8 @@ require_once __DIR__ . '/src/import/WP_Stream_Importer.php'; require_once __DIR__ . '/src/import/WP_Entity_Iterator_Chain.php'; require_once __DIR__ . '/src/import/WP_Retry_Frontloading_Iterator.php'; -require_once __DIR__ . '/src/entity-readers/WP_Entity_Reader.php'; -require_once __DIR__ . '/src/entity-readers/WP_HTML_Entity_Reader.php'; - +require_once __DIR__ . '/src/import/WP_Import_HTML_Processor.php'; +require_once __DIR__ . '/src/import/WP_Import_Utils.php'; require_once __DIR__ . '/src/utf8_decoder.php'; // When running in Playground, the composer autoloader script sees CLI SAPI and diff --git a/packages/playground/data-liberation/dist/data-liberation-core.phar.gz b/packages/playground/data-liberation/dist/data-liberation-core.phar.gz index ed86d01db1..5762f9c3fb 100755 Binary files a/packages/playground/data-liberation/dist/data-liberation-core.phar.gz and b/packages/playground/data-liberation/dist/data-liberation-core.phar.gz differ diff --git a/packages/playground/data-liberation/phpunit.xml b/packages/playground/data-liberation/phpunit.xml index 9646f33205..cd97c0ec90 100644 --- a/packages/playground/data-liberation/phpunit.xml +++ b/packages/playground/data-liberation/phpunit.xml @@ -2,10 +2,11 @@ - tests/WPHTMLEntityReaderTests.php - tests/WPHTMLToBlocksTests.php tests/WPWXRReaderTests.php tests/WPRewriteUrlsTests.php + tests/WPHTMLToBlocksTests.php + tests/WPHTMLEntityReaderTests.php + tests/WPEPubEntityReaderTests.php tests/WPURLInTextProcessorTests.php tests/WPBlockMarkupProcessorTests.php tests/WPBlockMarkupUrlProcessorTests.php diff --git a/packages/playground/data-liberation/project.json b/packages/playground/data-liberation/project.json index f2712fef71..c2e51df4ac 100644 --- a/packages/playground/data-liberation/project.json +++ b/packages/playground/data-liberation/project.json @@ -33,7 +33,7 @@ "options": { "cwd": "packages/playground/data-liberation", "commands": [ - "./vendor/bin/phpcs --standard=./phpcs.xml -s ./src ./*.php" + "./vendor/bin/phpcs --standard=./phpcs.xml -s ./src ./*.php " ], "parallel": false } @@ -43,7 +43,7 @@ "options": { "cwd": "packages/playground/data-liberation", "commands": [ - "./vendor/bin/phpcbf --standard=./phpcs.xml ./src ./*.php" + "./vendor/bin/phpcbf --standard=./phpcs.xml ./src ./*.php " ], "parallel": false } diff --git a/packages/playground/data-liberation/src/byte-readers/WP_Byte_Reader.php b/packages/playground/data-liberation/src/byte-readers/WP_Byte_Reader.php deleted file mode 100644 index 4671eca2cb..0000000000 --- a/packages/playground/data-liberation/src/byte-readers/WP_Byte_Reader.php +++ /dev/null @@ -1,10 +0,0 @@ -file_path = $file_path; - $this->chunk_size = $chunk_size; - } - - public function tell(): int { - // Save the previous offset, not the current one. - // This way, after resuming, the next read will yield the same $output_bytes - // as we have now. - return $this->offset_in_file - $this->last_chunk_size; - } - - public function seek( $offset_in_file ): bool { - if ( ! is_int( $offset_in_file ) ) { - _doing_it_wrong( __METHOD__, 'Cannot set a file reader cursor to a non-integer offset.', '1.0.0' ); - return false; - } - if ( $this->file_pointer ) { - _doing_it_wrong( __METHOD__, 'Cannot set a file reader cursor on a file reader that is already initialized.', '1.0.0' ); - return false; - } - $this->offset_in_file = $offset_in_file; - $this->last_chunk_size = 0; - return true; - } - - public function is_finished(): bool { - return ! $this->output_bytes && $this->state === static::STATE_FINISHED; - } - - public function get_bytes(): string { - return $this->output_bytes; - } - - public function get_last_error(): ?string { - return $this->last_error; - } - - public function next_bytes(): bool { - $this->output_bytes = ''; - $this->last_chunk_size = 0; - if ( $this->last_error || $this->is_finished() ) { - return false; - } - if ( ! $this->file_pointer ) { - $this->file_pointer = fopen( $this->file_path, 'r' ); - if ( $this->offset_in_file ) { - fseek( $this->file_pointer, $this->offset_in_file ); - } - } - $bytes = fread( $this->file_pointer, $this->chunk_size ); - if ( ! $bytes && feof( $this->file_pointer ) ) { - fclose( $this->file_pointer ); - $this->state = static::STATE_FINISHED; - return false; - } - $this->last_chunk_size = strlen( $bytes ); - $this->offset_in_file += $this->last_chunk_size; - $this->output_bytes .= $bytes; - return true; - } -} diff --git a/packages/playground/data-liberation/src/byte-readers/WP_GZ_File_Reader.php b/packages/playground/data-liberation/src/byte-readers/WP_GZ_File_Reader.php deleted file mode 100644 index 1216e7f7cd..0000000000 --- a/packages/playground/data-liberation/src/byte-readers/WP_GZ_File_Reader.php +++ /dev/null @@ -1,26 +0,0 @@ -output_bytes = ''; - if ( $this->last_error || $this->is_finished() ) { - return false; - } - if ( ! $this->file_pointer ) { - $this->file_pointer = gzopen( $this->file_path, 'r' ); - if ( $this->offset_in_file ) { - gzseek( $this->file_pointer, $this->offset_in_file ); - } - } - $bytes = gzread( $this->file_pointer, $this->chunk_size ); - if ( ! $bytes && gzeof( $this->file_pointer ) ) { - gzclose( $this->file_pointer ); - $this->state->finish(); - return false; - } - $this->offset_in_file += strlen( $bytes ); - $this->output_bytes .= $bytes; - return true; - } -} diff --git a/packages/playground/data-liberation/src/byte-readers/WP_Remote_File_Ranged_Reader.php b/packages/playground/data-liberation/src/byte-readers/WP_Remote_File_Ranged_Reader.php deleted file mode 100644 index 34ac703b18..0000000000 --- a/packages/playground/data-liberation/src/byte-readers/WP_Remote_File_Ranged_Reader.php +++ /dev/null @@ -1,187 +0,0 @@ -seek(0); - * $file->request_bytes(100); - * while($file->next_chunk()) { - * var_dump($file->get_bytes()); - * } - * $file->seek(600); - * $file->request_bytes(40); - * while($file->next_chunk()) { - * var_dump($file->get_bytes()); - * } - * - * @TODO: Verify that the remote server supports range requests. - * @TODO: Support requesting multiple ranges in a single request. - * @TODO: Abort in-progress requests when seeking to a new offset. - */ -class WP_Remote_File_Ranged_Reader { - - /** - * @var WordPress\AsyncHttp\Client - */ - private $client; - private $url; - private $remote_file_length; - - private $current_request; - private $offset_in_remote_file = 0; - private $offset_in_current_chunk = 0; - private $current_chunk; - private $expected_chunk_size; - - public function __construct( $url, $options = array() ) { - $this->client = new WordPress\AsyncHttp\Client(); - $this->url = $url; - } - - public function request_bytes( $bytes ) { - if ( null === $this->remote_file_length ) { - $content_length = $this->resolve_content_length(); - if ( false === $content_length ) { - // The remote server won't tell us what the content length is - // @TODO: What should we do in this case? Content-length is critical for - // stream-decompressing remote zip files, but we may not need it - // for other use-cases. - return false; - } - $this->remote_file_length = $content_length; - } - - if ( $this->offset_in_remote_file < 0 || $this->offset_in_remote_file + $bytes > $this->remote_file_length ) { - // TODO: Think through error handling - return false; - } - - $this->seek( $this->offset_in_remote_file ); - - $this->current_request = new WordPress\AsyncHttp\Request( - $this->url, - array( - 'headers' => array( - 'Range' => 'bytes=' . $this->offset_in_remote_file . '-' . ( $this->offset_in_remote_file + $bytes - 1 ), - ), - ) - ); - $this->expected_chunk_size = $bytes; - $this->offset_in_current_chunk = 0; - if ( false === $this->client->enqueue( $this->current_request ) ) { - // TODO: Think through error handling - return false; - } - return true; - } - - public function seek( $offset ) { - $this->offset_in_remote_file = $offset; - // @TODO cancel any pending requests - $this->current_request = null; - } - - public function tell() { - return $this->offset_in_remote_file; - } - - public function resolve_content_length() { - if ( null !== $this->remote_file_length ) { - return $this->remote_file_length; - } - - $request = new WordPress\AsyncHttp\Request( - $this->url, - array( 'method' => 'HEAD' ) - ); - if ( false === $this->client->enqueue( $request ) ) { - // TODO: Think through error handling - return false; - } - while ( $this->client->await_next_event() ) { - switch ( $this->client->get_event() ) { - case WordPress\AsyncHttp\Client::EVENT_GOT_HEADERS: - $response = $request->response; - if ( false === $response ) { - return false; - } - $content_length = $response->get_header( 'Content-Length' ); - if ( false === $content_length ) { - return false; - } - return (int) $content_length; - } - } - return false; - } - - public function next_chunk() { - while ( $this->client->await_next_event() ) { - /** - * Only process events related to the most recent request. - * @TODO: Support redirects. - * @TODO: Cleanup resources for stale requests. - */ - if ( $this->current_request->id !== $this->client->get_request()->id ) { - continue; - } - - if ( $this->offset_in_current_chunk >= $this->expected_chunk_size ) { - // The remote server doesn't support range requests and sent us a chunk larger than expected. - // @TODO: Handle this case. Should we stream the entire file, or give up? - // Should we cache the download locally, or request the entire file again every - // time we need to seek()? - return false; - } - - switch ( $this->client->get_event() ) { - case WordPress\AsyncHttp\Client::EVENT_GOT_HEADERS: - $request = $this->client->get_request(); - if ( ! $request ) { - return false; - } - $response = $request->response; - if ( false === $response ) { - return false; - } - if ( - $response->status_code !== 206 || - false === $response->get_header( 'Range' ) - ) { - // The remote server doesn't support range requests - // @TODO: Handle this case. Should we stream the entire file, or give up? - // Should we cache the download locally, or request the entire file again every - // time we need to seek()? - return false; - } - break; - case WordPress\AsyncHttp\Client::EVENT_BODY_CHUNK_AVAILABLE: - $chunk = $this->client->get_response_body_chunk(); - if ( ! is_string( $chunk ) ) { - // TODO: Think through error handling - return false; - } - $this->current_chunk = $chunk; - $this->offset_in_remote_file += strlen( $chunk ); - $this->offset_in_current_chunk += strlen( $chunk ); - - return true; - case WordPress\AsyncHttp\Client::EVENT_FAILED: - // TODO: Think through error handling. Errors are expected when working with - // the network. Should we auto retry? Make it easy for the caller to retry? - // Something else? - return false; - case WordPress\AsyncHttp\Client::EVENT_FINISHED: - // TODO: Think through error handling - return false; - } - } - } - - public function get_bytes() { - return $this->current_chunk; - } -} diff --git a/packages/playground/data-liberation/src/byte-readers/WP_Remote_File_Reader.php b/packages/playground/data-liberation/src/byte-readers/WP_Remote_File_Reader.php deleted file mode 100644 index d55846f7b8..0000000000 --- a/packages/playground/data-liberation/src/byte-readers/WP_Remote_File_Reader.php +++ /dev/null @@ -1,109 +0,0 @@ -client = new WordPress\AsyncHttp\Client(); - $this->url = $url; - } - - public function tell(): int { - return $this->bytes_already_read + $this->skip_bytes; - } - - public function seek( $offset_in_file ): bool { - if ( $this->request ) { - _doing_it_wrong( __METHOD__, 'Cannot set a remote file reader cursor on a remote file reader that is already initialized.', '1.0.0' ); - return false; - } - $this->skip_bytes = $offset_in_file; - return true; - } - - public function next_bytes(): bool { - if ( null === $this->request ) { - $this->request = new WordPress\AsyncHttp\Request( - $this->url - ); - if ( false === $this->client->enqueue( $this->request ) ) { - // TODO: Think through error handling - return false; - } - } - - $this->after_chunk(); - - while ( $this->client->await_next_event() ) { - switch ( $this->client->get_event() ) { - case WordPress\AsyncHttp\Client::EVENT_BODY_CHUNK_AVAILABLE: - $chunk = $this->client->get_response_body_chunk(); - if ( ! is_string( $chunk ) ) { - // TODO: Think through error handling - return false; - } - $this->current_chunk = $chunk; - - /** - * Naive seek() implementation – redownload the file from the start - * and ignore bytes until we reach the desired offset. - * - * @TODO: Use the range requests instead when the server supports them. - */ - if ( $this->skip_bytes > 0 ) { - if ( $this->skip_bytes < strlen( $chunk ) ) { - $this->current_chunk = substr( $chunk, $this->skip_bytes ); - $this->bytes_already_read += $this->skip_bytes; - $this->skip_bytes = 0; - } else { - $this->skip_bytes -= strlen( $chunk ); - continue 2; - } - } - return true; - case WordPress\AsyncHttp\Client::EVENT_FAILED: - // TODO: Think through error handling. Errors are expected when working with - // the network. Should we auto retry? Make it easy for the caller to retry? - // Something else? - $this->last_error = $this->client->get_request()->error; - return false; - case WordPress\AsyncHttp\Client::EVENT_FINISHED: - $this->is_finished = true; - return false; - } - } - } - - private function after_chunk() { - if ( $this->current_chunk ) { - $this->bytes_already_read += strlen( $this->current_chunk ); - } - $this->current_chunk = null; - } - - public function get_last_error(): ?string { - return $this->last_error; - } - - public function get_bytes(): ?string { - return $this->current_chunk; - } - - public function is_finished(): bool { - return $this->is_finished; - } -} diff --git a/packages/playground/data-liberation/src/entity-readers/WP_Directory_Tree_Entity_Reader.php b/packages/playground/data-liberation/src/entity-readers/WP_Directory_Tree_Entity_Reader.php new file mode 100644 index 0000000000..fcbcd70133 --- /dev/null +++ b/packages/playground/data-liberation/src/entity-readers/WP_Directory_Tree_Entity_Reader.php @@ -0,0 +1,342 @@ +file_visitor = new \WordPress\Filesystem\WP_Filesystem_Visitor( $filesystem, $options['root_dir'] ); + $this->filesystem = $filesystem; + $this->next_post_id = $options['first_post_id']; + $this->allowed_extensions = $options['allowed_extensions']; + $this->index_file_patterns = $options['index_file_patterns']; + $this->markup_converter_factory = $options['markup_converter_factory']; + } + + public function next_entity() { + while ( true ) { + if ( null !== $this->pending_directory_index ) { + $dir = $this->file_visitor->get_event()->dir; + $depth = $this->file_visitor->get_current_depth(); + $parent_id = $this->parent_ids[ $depth - 1 ] ?? null; + + if ( null === $parent_id && $depth > 1 ) { + // There's no parent ID even though we're a few levels deep. + // This is a scenario where `next_file()` skipped a few levels + // of directories with no relevant content in them: + // + // - /docs/ + // - /foo/ + // - /bar/ + // - /baz.md + // + // In this case, we need to backtrack and create the missing + // parent pages for /bar/ and /foo/. + + // Find the topmost missing parent ID + $missing_parent_id_depth = 1; + while ( isset( $this->parent_ids[ $missing_parent_id_depth ] ) ) { + ++$missing_parent_id_depth; + } + + // Move up to the corresponding directory + $missing_parent_path = $dir; + for ( $i = $missing_parent_id_depth; $i < $depth; $i++ ) { + $missing_parent_path = dirname( $missing_parent_path ); + } + + $this->parent_ids[ $missing_parent_id_depth ] = $this->emit_post_entity( + array( + 'content' => '', + 'source_path' => $missing_parent_path, + 'parent_id' => $this->parent_ids[ $missing_parent_id_depth - 1 ], + 'title_fallback' => WP_Import_Utils::slug_to_title( basename( $missing_parent_path ) ), + ) + ); + } elseif ( false === $this->pending_directory_index ) { + // No directory index candidate – let's create a fake page + // just to have something in the page tree. + $this->parent_ids[ $depth ] = $this->emit_post_entity( + array( + 'content' => '', + 'source_path' => $dir, + 'parent_id' => $parent_id, + 'title_fallback' => WP_Import_Utils::slug_to_title( basename( $dir ) ), + ) + ); + // We're no longer looking for a directory index. + $this->pending_directory_index = null; + } else { + $file_path = $this->pending_directory_index; + $this->parent_ids[ $depth ] = $this->emit_post_entity( + array( + 'content' => $this->filesystem->read_file( $file_path ), + 'source_path' => $file_path, + 'parent_id' => $parent_id, + 'title_fallback' => WP_Import_Utils::slug_to_title( basename( $file_path ) ), + ) + ); + // We're no longer looking for a directory index. + $this->pending_directory_index = null; + } + return true; + } + + while ( count( $this->pending_files ) ) { + $parent_id = $this->parent_ids[ $this->file_visitor->get_current_depth() ] ?? null; + $file_path = array_shift( $this->pending_files ); + $this->emit_post_entity( + array( + 'content' => $this->filesystem->read_file( $file_path ), + 'source_path' => $file_path, + 'parent_id' => $parent_id, + 'title_fallback' => WP_Import_Utils::slug_to_title( basename( $file_path ) ), + ) + ); + return true; + } + + if ( false === $this->next_file() ) { + break; + } + } + $this->is_finished = true; + return false; + } + + public function get_entity(): ?\WP_Imported_Entity { + return $this->entity; + } + + protected function emit_post_entity( $options ) { + $factory = $this->markup_converter_factory; + $converter = $factory( $options['content'] ); + $converter->convert(); + $block_markup = $converter->get_block_markup(); + + $post_title = null; + if ( ! $post_title ) { + $removed_title = WP_Import_Utils::remove_first_h1_block_from_block_markup( $block_markup ); + if ( false !== $removed_title ) { + $post_title = $removed_title['title']; + $block_markup = $removed_title['remaining_html']; + } + } + if ( ! $post_title ) { + // In Markdown, the frontmatter title can be a worse title candidate than + // the first H1 block. In block markup exports, it will be the opposite. + // + // @TODO: Enable the API consumer to customize the title resolution. + $post_title = $converter->get_meta_value( 'post_title' ); + } + if ( ! $post_title ) { + $post_title = $options['title_fallback']; + } + + $entity_data = array( + 'post_id' => $this->next_post_id, + 'post_type' => 'page', + 'guid' => $options['source_path'], + 'post_title' => $post_title, + 'post_content' => $block_markup, + 'post_excerpt' => $converter->get_meta_value( 'post_excerpt' ) ?? '', + 'post_status' => 'publish', + ); + + /** + * Technically `source_path` isn't a part of the WordPress post object, + * but we need it to resolve relative URLs in the imported content. + * + * This path is relative to the root directory traversed by this class. + */ + if ( ! empty( $options['source_path'] ) ) { + $source_path = $options['source_path']; + $root_dir = $this->file_visitor->get_root_dir(); + if ( str_starts_with( $source_path, $root_dir ) ) { + $source_path = substr( $source_path, strlen( $root_dir ) ); + } + $source_path = ltrim( $source_path, '/' ); + $entity_data['source_path'] = $source_path; + } + + if ( $converter->get_meta_value( 'slug' ) ) { + $slug = $converter->get_meta_value( 'slug' ); + $last_segment = substr( $slug, strrpos( $slug, '/' ) + 1 ); + $entity_data['post_name'] = $last_segment; + } + + if ( $converter->get_meta_value( 'post_order' ) ) { + $entity_data['post_order'] = $converter->get_meta_value( 'post_order' ); + } + + if ( $options['parent_id'] ) { + $entity_data['post_parent'] = $options['parent_id']; + } + + $this->entity = new \WP_Imported_Entity( 'post', $entity_data ); + ++$this->next_post_id; + ++$this->entities_read_so_far; + return $entity_data['post_id']; + } + + private function next_file() { + $this->pending_files = array(); + $this->entity = null; + while ( $this->file_visitor->next() ) { + $event = $this->file_visitor->get_event(); + + if ( $event->is_exiting() ) { + // Clean up stale IDs to save some memory when processing + // large directory trees. + unset( $this->parent_ids[ $event->dir ] ); + continue; + } + + if ( $event->is_entering() ) { + $abs_paths = array(); + foreach ( $event->files as $filename ) { + $abs_paths[] = $event->dir . '/' . $filename; + } + $this->pending_files = $this->choose_relevant_files( $abs_paths ); + if ( ! count( $this->pending_files ) ) { + // Only consider directories with relevant files in them. + // Otherwise we'll create fake pages for media directories + // and other directories that don't contain any content. + // + // One corner case is when there's a few levels of directories + // with a single relevant file at the bottom: + // + // - /docs/ + // - /foo/ + // - /bar/ + // - /baz.md + // + // In this case, `next_entity()` will backtrack at baz.md and + // create the missing parent pages. + continue; + } + $directory_index_idx = $this->choose_directory_index( $this->pending_files ); + if ( -1 === $directory_index_idx ) { + $this->pending_directory_index = false; + } else { + $this->pending_directory_index = $this->pending_files[ $directory_index_idx ]; + unset( $this->pending_files[ $directory_index_idx ] ); + } + return true; + } + + return false; + } + return false; + } + + protected function choose_directory_index( $files ) { + foreach ( $files as $idx => $file ) { + if ( $this->looks_like_directory_index( $file ) ) { + return $idx; + } + } + return -1; + } + + protected function looks_like_directory_index( $path ) { + $filename = basename( $path ); + foreach ( $this->index_file_patterns as $pattern ) { + if ( preg_match( $pattern, $filename ) ) { + return true; + } + } + return false; + } + + protected function choose_relevant_files( $paths ) { + return array_filter( $paths, array( $this, 'is_valid_file' ) ); + } + + protected function is_valid_file( $path ) { + $extension = pathinfo( $path, PATHINFO_EXTENSION ); + return in_array( $extension, $this->allowed_extensions, true ); + } + + /** + * @TODO: Either implement this method, or introduce a concept of + * reentrant and non-reentrant entity readers. + */ + public function get_reentrancy_cursor() { + return ''; + } + + public function current(): mixed { + if ( null === $this->entity && ! $this->is_finished ) { + $this->next(); + } + return $this->get_entity(); + } + + public function next(): void { + $this->next_entity(); + } + + public function key(): int { + return $this->entities_read_so_far - 1; + } + + public function valid(): bool { + return ! $this->is_finished; + } + + public function rewind(): void { + // @TODO: Either implement this method, or formalize the fact that + // entity readers are not rewindable. + } +} diff --git a/packages/playground/data-liberation/src/entity-readers/WP_HTML_Entity_Reader.php b/packages/playground/data-liberation/src/entity-readers/WP_HTML_Entity_Reader.php index 531534a73f..b01bd0c875 100644 --- a/packages/playground/data-liberation/src/entity-readers/WP_HTML_Entity_Reader.php +++ b/packages/playground/data-liberation/src/entity-readers/WP_HTML_Entity_Reader.php @@ -1,7 +1,11 @@ post_id = $post_id; } - /** - * Advances to the next entity. - * - * @return bool Whether the next entity was found. - */ public function next_entity() { // If we're finished, we're finished. if ( $this->finished ) { @@ -79,11 +78,6 @@ public function next_entity() { return true; } - /** - * Returns the current entity. - * - * @return WP_Imported_Entity|false The current entity, or false if there are no entities left. - */ public function get_entity() { if ( $this->is_finished() ) { return false; @@ -91,20 +85,10 @@ public function get_entity() { return $this->entities[0]; } - /** - * Checks if this reader has finished yet. - * - * @return bool Whether the reader has finished. - */ public function is_finished(): bool { return $this->finished; } - /** - * Returns the last error that occurred when processing the HTML. - * - * @return string|null The last error, or null if there was no error. - */ public function get_last_error(): ?string { return null; } diff --git a/packages/playground/data-liberation/src/wxr/WP_WXR_Reader.php b/packages/playground/data-liberation/src/entity-readers/WP_WXR_Entity_Reader.php similarity index 93% rename from packages/playground/data-liberation/src/wxr/WP_WXR_Reader.php rename to packages/playground/data-liberation/src/entity-readers/WP_WXR_Entity_Reader.php index 25c21ff608..398983c370 100644 --- a/packages/playground/data-liberation/src/wxr/WP_WXR_Reader.php +++ b/packages/playground/data-liberation/src/entity-readers/WP_WXR_Entity_Reader.php @@ -1,6 +1,9 @@ channel > item` and comments are * stored in `rss > channel > item > `wp:comment`. @@ -33,7 +36,7 @@ * * Example: * - * $reader = WP_WXR_Reader::create_for_streaming(); + * $reader = WP_WXR_Entity_Reader::create_for_streaming(); * * // Add data as it becomes available * $reader->append_bytes( fread( $file_handle, 8192 ) ); @@ -64,24 +67,24 @@ * } * * The next_entity() -> fread -> break usage pattern may seem a bit tedious. This is expected. Even - * if the WXR parsing part of the WP_WXR_Reader offers a high-level API, working with byte streams + * if the WXR parsing part of the WP_WXR_Entity_Reader offers a high-level API, working with byte streams * requires reasoning on a much lower level. The StreamChain class shipped in this repository will * make the API consumption easier with its transformation–oriented API for chaining data processors. * - * Similarly to `WP_XML_Processor`, the `WP_WXR_Reader` enters a paused state when it doesn't + * Similarly to `WP_XML_Processor`, the `WP_WXR_Entity_Reader` enters a paused state when it doesn't * have enough XML bytes to parse the entire entity. * * ## Caveats * * ### Extensibility * - * `WP_WXR_Reader` ignores any XML elements it doesn't recognize. The WXR format is extensible + * `WP_WXR_Entity_Reader` ignores any XML elements it doesn't recognize. The WXR format is extensible * so in the future the reader may start supporting registration of custom handlers for unknown * tags in the future. * * ### Nested entities intertwined with data * - * `WP_WXR_Reader` flushes the current entity whenever another entity starts. The upside is + * `WP_WXR_Entity_Reader` flushes the current entity whenever another entity starts. The upside is * simplicity and a tiny memory footprint. The downside is that it's possible to craft a WXR * document where some information would be lost. For example: * @@ -101,7 +104,7 @@ * * ``` * - * `WP_WXR_Reader` would accumulate post data until the `wp:post_meta` tag. Then it would emit a + * `WP_WXR_Entity_Reader` would accumulate post data until the `wp:post_meta` tag. Then it would emit a * `post` entity and accumulate the meta information until the `` closer. Then it * would advance to `` and **ignore it**. * @@ -122,7 +125,7 @@ * * @since WP_VERSION */ -class WP_WXR_Reader implements Iterator { +class WP_WXR_Entity_Reader extends WP_Entity_Reader { /** * The XML processor used to parse the WXR file. @@ -355,7 +358,7 @@ public static function create( WP_Byte_Reader $upstream = null, $cursor = null ) if ( false === $cursor ) { _doing_it_wrong( __METHOD__, - 'Invalid cursor provided for WP_WXR_Reader::create().', + 'Invalid cursor provided for WP_WXR_Entity_Reader::create().', null ); return false; @@ -364,7 +367,7 @@ public static function create( WP_Byte_Reader $upstream = null, $cursor = null ) } $xml = WP_XML_Processor::create_for_streaming( '', $xml_cursor ); - $reader = new WP_WXR_Reader( $xml ); + $reader = new WP_WXR_Entity_Reader( $xml ); if ( null !== $cursor ) { $reader->last_post_id = $cursor['last_post_id']; $reader->last_comment_id = $cursor['last_comment_id']; @@ -627,7 +630,7 @@ private function read_next_entity() { /** * Custom adjustment: the Accessibility WXR file uses a non-standard * wp:wp_author tag. - * @TODO: Should WP_WXR_Reader care about such non-standard tags when + * @TODO: Should WP_WXR_Entity_Reader care about such non-standard tags when * the regular WXR importer would ignore them? Perhaps a warning * and an upstream PR would be a better solution. */ @@ -672,7 +675,7 @@ private function read_next_entity() { * Long time ago... * * - * The semantics of such a structure is not clear. The WP_WXR_Reader will + * The semantics of such a structure is not clear. The WP_WXR_Entity_Reader will * enter an error state when it encounters such a structure. * * Such nesting wasn't found in any WXR files analyzed when building @@ -901,38 +904,4 @@ private function after_entity() { $this->text_buffer = ''; $this->last_opener_attributes = array(); } - - public function current(): object { - if ( null === $this->entity_data && ! $this->is_finished() && ! $this->get_last_error() ) { - $this->next(); - } - return $this->get_entity(); - } - - private $last_next_result = null; - public function next(): void { - // @TODO: Don't keep track of this. Just make sure the next_entity() - // call will make the is_finished() true. - $this->last_next_result = $this->next_entity(); - } - - public function key(): string { - return $this->get_reentrancy_cursor(); - } - - public function valid(): bool { - return false !== $this->last_next_result && ! $this->is_finished() && ! $this->get_last_error(); - } - - public function rewind(): void { - // Haven't started yet. - if ( null === $this->last_next_result ) { - return; - } - _doing_it_wrong( - __METHOD__, - 'WP_WXR_Reader does not support rewinding.', - null - ); - } } diff --git a/packages/playground/data-liberation/src/import/WP_Import_HTML_Processor.php b/packages/playground/data-liberation/src/import/WP_Import_HTML_Processor.php new file mode 100644 index 0000000000..eced41d5f9 --- /dev/null +++ b/packages/playground/data-liberation/src/import/WP_Import_HTML_Processor.php @@ -0,0 +1,20 @@ +set_bookmark( $name ); + $bookmark = $this->bookmarks[ '_' . $name ]; + $this->release_bookmark( $name ); + return $bookmark->start + $bookmark->length; + } +} diff --git a/packages/playground/data-liberation/src/import/WP_Import_Session.php b/packages/playground/data-liberation/src/import/WP_Import_Session.php index a731f3f9fc..931dbd1b70 100644 --- a/packages/playground/data-liberation/src/import/WP_Import_Session.php +++ b/packages/playground/data-liberation/src/import/WP_Import_Session.php @@ -510,6 +510,7 @@ public function bump_frontloading_progress( $frontloading_progress, $events = ar $attempts = get_post_meta( $placeholder->ID, 'attempts', true ); $new_attempts = $attempts; + $new_status = $placeholder->post_status; switch ( $event->type ) { case WP_Attachment_Downloader_Event::SUCCESS: $new_status = self::FRONTLOAD_STATUS_SUCCEEDED; diff --git a/packages/playground/data-liberation/src/import/WP_Stream_Importer.php b/packages/playground/data-liberation/src/import/WP_Stream_Importer.php index 41f08a4e7c..77ebd09c00 100644 --- a/packages/playground/data-liberation/src/import/WP_Stream_Importer.php +++ b/packages/playground/data-liberation/src/import/WP_Stream_Importer.php @@ -2,6 +2,9 @@ use WordPress\AsyncHTTP\Client; use WordPress\AsyncHTTP\Request; +use WordPress\ByteReader\WP_File_Reader; +use WordPress\ByteReader\WP_Remote_File_Reader; +use WordPress\Filesystem\WP_Byte_Reader; /** * Idea: @@ -35,12 +38,12 @@ class WP_Stream_Importer { /** * Populated from the WXR file's tag. */ - private $source_site_url; + protected $source_site_url; /** * A list of [original_url, migrated_url] pairs for rewriting the URLs * in the imported content. */ - private $site_url_mapping = array(); + protected $site_url_mapping = array(); /** * A list of candidate base URLs that have been spotted in the WXR file. * @@ -66,8 +69,8 @@ class WP_Stream_Importer { * Once the API consumer decides on the mapping, it can call * add_site_url_mapping() to tell the importer what to map that domain to. */ - private $site_url_mapping_candidates = array(); - private $entity_iterator_factory; + protected $site_url_mapping_candidates = array(); + protected $entity_iterator_factory; /** * @param array|string|null $query { * @type string $uploads_path The directory to download the media attachments to. @@ -76,7 +79,7 @@ class WP_Stream_Importer { * after the import. E.g. http://127.0.0.1:9400/wp-content/uploads/ * } */ - private $options; + protected $options; const STAGE_INITIAL = '#initial'; const STAGE_INDEX_ENTITIES = '#index_entities'; @@ -98,7 +101,7 @@ class WP_Stream_Importer { * The current state of the import process. * @var string */ - private $stage = self::STAGE_INITIAL; + protected $stage = self::STAGE_INITIAL; /** * The next stage of the import process. An explicit call to * next_stage() is required to advance the importer. @@ -108,13 +111,13 @@ class WP_Stream_Importer { * failed to download. * @var string */ - private $next_stage; + protected $next_stage; /** * Iterator that streams entities to import. */ - private $entity_iterator; - private $resume_at_entity; + protected $entity_iterator; + protected $resume_at_entity; /** * A map of currently downloaded resources for each entity in * the following format: @@ -123,13 +126,13 @@ class WP_Stream_Importer { * * @var array> */ - private $active_downloads = array(); - private $downloader; + protected $active_downloads = array(); + protected $downloader; public static function create_for_wxr_file( $wxr_path, $options = array(), $cursor = null ) { return static::create( function ( $cursor = null ) use ( $wxr_path ) { - return WP_WXR_Reader::create( new WP_File_Reader( $wxr_path ), $cursor ); + return WP_WXR_Entity_Reader::create( WP_File_Reader::create( $wxr_path ), $cursor ); }, $options, $cursor @@ -139,7 +142,7 @@ function ( $cursor = null ) use ( $wxr_path ) { public static function create_for_wxr_url( $wxr_url, $options = array(), $cursor = null ) { return static::create( function ( $cursor = null ) use ( $wxr_url ) { - return WP_WXR_Reader::create( new WP_Remote_File_Reader( $wxr_url ), $cursor ); + return WP_WXR_Entity_Reader::create( new WP_Remote_File_Reader( $wxr_url ), $cursor ); }, $options, $cursor @@ -152,14 +155,14 @@ public static function create( $cursor = null ) { $options = static::parse_options( $options ); - $importer = new WP_Stream_Importer( $entity_iterator_factory, $options ); + $importer = new static( $entity_iterator_factory, $options ); if ( null !== $cursor && true !== $importer->initialize_from_cursor( $cursor ) ) { return false; } return $importer; } - private function initialize_from_cursor( $cursor ) { + protected function initialize_from_cursor( $cursor ) { $cursor = json_decode( $cursor, true ); if ( ! is_array( $cursor ) ) { _doing_it_wrong( __METHOD__, 'Cannot resume an importer with a non-array cursor.', '1.0.0' ); @@ -182,7 +185,7 @@ private function initialize_from_cursor( $cursor ) { return true; } - private function set_source_site_url( $source_site_url ) { + protected function set_source_site_url( $source_site_url ) { $this->source_site_url = $source_site_url; // -1 is a well-known index for the source site URL. // Every subsequent call to set_source_site_url() will @@ -235,7 +238,7 @@ public function get_reentrancy_cursor() { ); } - private static function parse_options( $options ) { + protected static function parse_options( $options ) { if ( ! isset( $options['new_site_url'] ) ) { $options['new_site_url'] = get_site_url(); } @@ -255,7 +258,7 @@ private static function parse_options( $options ) { return $options; } - private function __construct( + protected function __construct( $entity_iterator_factory, $options = array() ) { @@ -266,7 +269,7 @@ private function __construct( } } - private $frontloading_retries_iterator; + protected $frontloading_retries_iterator; public function set_frontloading_retries_iterator( $frontloading_retries_iterator ) { $this->frontloading_retries_iterator = $frontloading_retries_iterator; } @@ -277,7 +280,7 @@ public function set_frontloading_retries_iterator( $frontloading_retries_iterato * * @var WP_Entity_Importer */ - private $importer; + protected $importer; public function next_step() { switch ( $this->stage ) { @@ -328,10 +331,10 @@ public function advance_to_next_stage() { return true; } - private $indexed_entities_counts = array(); - private $indexed_assets_urls = array(); + protected $indexed_entities_counts = array(); + protected $indexed_assets_urls = array(); - private function index_next_entities( $count = 10000 ) { + protected function index_next_entities( $count = 10000 ) { if ( null !== $this->next_stage ) { return false; } @@ -454,7 +457,7 @@ public function get_indexed_assets_urls() { return $this->indexed_assets_urls; } - private $frontloading_events = array(); + protected $frontloading_events = array(); public function get_frontloading_events() { return $this->frontloading_events; } @@ -475,7 +478,7 @@ public function get_frontloading_progress() { * downloader will enqueue B for download and will skip C and D since * the relevant files already exist in the filesystem. */ - private function frontloading_advance_reentrancy_cursor() { + protected function frontloading_advance_reentrancy_cursor() { while ( $this->downloader->next_event() ) { $event = $this->downloader->get_event(); switch ( $event->type ) { @@ -510,7 +513,7 @@ private function frontloading_advance_reentrancy_cursor() { * before import_entities() so that every inserted post already has * all its attachments downloaded. */ - private function frontload_next_entity() { + protected function frontload_next_entity() { if ( null === $this->entity_iterator ) { $this->entity_iterator = new WP_Entity_Iterator_Chain(); if ( null !== $this->frontloading_retries_iterator ) { @@ -615,7 +618,7 @@ private function frontload_next_entity() { * large datasets, but maybe it could be a choice for * the API consumer? */ - private function import_next_entity() { + protected function import_next_entity() { if ( null !== $this->next_stage ) { return false; } @@ -717,8 +720,8 @@ private function import_next_entity() { return true; } - private $imported_entities_counts = array(); - private function count_imported_entity( $type ) { + protected $imported_entities_counts = array(); + protected function count_imported_entity( $type ) { if ( ! array_key_exists( $type, $this->imported_entities_counts ) ) { $this->imported_entities_counts[ $type ] = 0; } @@ -728,7 +731,7 @@ public function get_imported_entities_counts() { return $this->imported_entities_counts; } - private function enqueue_attachment_download( string $raw_url, $options = array() ) { + protected function enqueue_attachment_download( string $raw_url, $options = array() ) { $output_filename = $this->new_asset_filename( $options['original_url'] ?? $raw_url, $options['context_path'] ?? null @@ -776,7 +779,7 @@ private function enqueue_attachment_download( string $raw_url, $options = array( * different permissions. Just because Bob deletes his copy, doesn't * mean we should delete Alice's copy. */ - private function new_asset_filename( string $raw_asset_url, $context_path = null ) { + protected function new_asset_filename( string $raw_asset_url, $context_path = null ) { $raw_asset_url = $this->rewrite_attachment_url( $raw_asset_url, $context_path @@ -798,7 +801,7 @@ private function new_asset_filename( string $raw_asset_url, $context_path = null return $filename; } - private function rewrite_attachment_url( string $raw_url, $context_path = null ) { + protected function rewrite_attachment_url( string $raw_url, $context_path = null ) { if ( WP_URL::can_parse( $raw_url ) ) { // Absolute URL, nothing to do. return $raw_url; @@ -821,7 +824,7 @@ private function rewrite_attachment_url( string $raw_url, $context_path = null ) * @TODO: How can we process the videos? * @TODO: What other asset types are there? */ - private function url_processor_matched_asset_url( WP_Block_Markup_Url_Processor $p ) { + protected function url_processor_matched_asset_url( WP_Block_Markup_Url_Processor $p ) { return ( $p->get_tag() === 'IMG' && $p->get_inspected_attribute_name() === 'src' && @@ -829,11 +832,11 @@ private function url_processor_matched_asset_url( WP_Block_Markup_Url_Processor ); } - private function is_child_of_a_mapped_url( $url ) { + protected function is_child_of_a_mapped_url( $url ) { return $this->get_url_mapping_target( $url ) !== false; } - private function get_url_mapping_target( $source_url ) { + protected function get_url_mapping_target( $source_url ) { $url = WP_URL::parse( $source_url ); foreach ( $this->site_url_mapping as $pair ) { $parsed_base_url = $pair[0]; @@ -844,8 +847,8 @@ private function get_url_mapping_target( $source_url ) { return false; } - private $first_iterator = true; - private function create_entity_iterator() { + protected $first_iterator = true; + protected function create_entity_iterator() { $factory = $this->entity_iterator_factory; if ( $this->first_iterator ) { $this->first_iterator = false; diff --git a/packages/playground/data-liberation/src/wxr/WXR_Import_Info.php b/packages/playground/data-liberation/src/wxr/WXR_Import_Info.php deleted file mode 100644 index 544305ba68..0000000000 --- a/packages/playground/data-liberation/src/wxr/WXR_Import_Info.php +++ /dev/null @@ -1,29 +0,0 @@ -append_bytes(file_get_contents($path)); $wxr->input_finished(); @@ -25,7 +27,7 @@ public function test_does_not_crash_when_parsing_preexisting_wxr_files_as_string */ public function test_does_not_crash_when_parsing_preexisting_wxr_files_as_stream($path, $expected_entitys) { $stream = fopen($path, 'r'); - $wxr = WP_WXR_Reader::create(); + $wxr = WP_WXR_Entity_Reader::create(); $found_entities = 0; while(true) { $chunk = fread($stream, 100); @@ -64,7 +66,7 @@ public function preexisting_wxr_files_provider() { public function test_simple_wxr() { - $importer = WP_WXR_Reader::create(); + $importer = WP_WXR_Entity_Reader::create(); $importer->append_bytes(file_get_contents(__DIR__ . '/fixtures/wxr-simple.xml')); $importer->input_finished(); $this->assertTrue( $importer->next_entity() ); @@ -182,7 +184,7 @@ public function test_simple_wxr() { } public function test_attachments() { - $importer = WP_WXR_Reader::create(); + $importer = WP_WXR_Entity_Reader::create(); $importer->append_bytes(<< @@ -265,7 +267,7 @@ public function test_attachments() { } public function test_terms() { - $importer = WP_WXR_Reader::create(); + $importer = WP_WXR_Entity_Reader::create(); $importer->append_bytes(<< @@ -300,7 +302,7 @@ public function test_terms() { } public function test_category() { - $importer = WP_WXR_Reader::create(); + $importer = WP_WXR_Entity_Reader::create(); $importer->append_bytes(<< @@ -331,7 +333,7 @@ public function test_category() { } public function test_tag_string() { - $importer = WP_WXR_Reader::create(); + $importer = WP_WXR_Entity_Reader::create(); $importer->append_bytes(<< @@ -379,7 +381,7 @@ public function test_tag_streaming() { XML; $chunks = str_split($wxr, 10); - $wxr = WP_WXR_Reader::create(); + $wxr = WP_WXR_Entity_Reader::create(); while(true) { if(true === $wxr->next_entity()) { break; @@ -411,7 +413,7 @@ public function test_tag_streaming() { } public function test_parse_comment() { - $wxr = WP_WXR_Reader::create(); + $wxr = WP_WXR_Entity_Reader::create(); $wxr->append_bytes(<< @@ -494,7 +496,7 @@ public function test_parse_comment() { } public function test_retains_last_ids() { - $wxr = WP_WXR_Reader::create(); + $wxr = WP_WXR_Entity_Reader::create(); $wxr->append_bytes(<< @@ -565,8 +567,8 @@ public function test_scan_entities_without_reentrancy() { "post_meta" ]; - $wxr = WP_WXR_Reader::create( - new WP_File_Reader( $xml_path ) + $wxr = WP_WXR_Entity_Reader::create( + WP_File_Reader::create( $xml_path ) ); for($i = 0; $i < 11; $i++) { @@ -595,8 +597,8 @@ public function test_scan_entities_with_reentrancy() { "post_meta" ]; - $wxr = WP_WXR_Reader::create( - new WP_File_Reader( $xml_path ) + $wxr = WP_WXR_Entity_Reader::create( + WP_File_Reader::create( $xml_path ) ); for($i = 0; $i < 11; $i++) { @@ -606,8 +608,8 @@ public function test_scan_entities_with_reentrancy() { $wxr->get_entity()->get_type() ); $cursor = $wxr->get_reentrancy_cursor(); - $wxr = WP_WXR_Reader::create( - new WP_File_Reader( $xml_path ), + $wxr = WP_WXR_Entity_Reader::create( + WP_File_Reader::create( $xml_path ), $cursor ); $this->assertTrue( $wxr->next_entity() );