Skip to content

Commit

Permalink
Parse EPubs as XHTML
Browse files Browse the repository at this point in the history
  • Loading branch information
adamziel committed Dec 17, 2024
1 parent d293c22 commit 58def6c
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 32 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ public function convert() {
}

while ( $this->markup_processor->next_token() ) {
var_dump( $this->markup_processor->get_token_type() );
switch ( $this->markup_processor->get_token_type() ) {
case '#text':
if ( $this->ignore_text ) {
Expand All @@ -58,7 +57,6 @@ public function convert() {
break;
}
}
var_dump( $this->markup_processor->get_last_error() );

if ( $this->markup_processor->get_last_error() ) {
$this->last_error = $this->markup_processor->get_last_error();
Expand Down Expand Up @@ -90,8 +88,8 @@ private function handle_tag() {
$tag = strtoupper( $html->get_tag() );
$tag_lowercase = strtolower( $tag );

$is_tag_opener = ! $html->is_tag_closer();
if ( ! $html->expects_closer() ) {
$is_void_tag = ! $html->expects_closer() && ! $html->is_tag_closer();
if ( $is_void_tag ) {
switch ( $tag ) {
case 'META':
$key = $html->get_attribute( 'name' );
Expand Down Expand Up @@ -119,7 +117,7 @@ private function handle_tag() {
// Just insert an HTML block or what?
break;
}
} elseif ( $is_tag_opener ) {
} elseif ( ! $html->is_tag_closer() ) {
switch ( $tag ) {
// Block elements
case 'SCRIPT':
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,8 @@ public function next_entity() {
return false;
}

$html_file = array_shift( $this->remaining_html_files );
$html = $this->zip->read_file( $html_file );
$html_file = array_shift( $this->remaining_html_files );
$html = $this->zip->read_file( $html_file );
$this->current_html_reader = new WP_HTML_Entity_Reader(
WP_XML_Processor::create_from_string( $html ),
$this->current_post_id
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1628,13 +1628,13 @@ private function parse_next_tag() {
return false;
}

$closer_at = $at;
$closer_at = $at;
$this->parser_state = self::STATE_DOCTYPE_NODE;
$this->token_length = $closer_at + 1 - $this->token_starts_at;
$this->bytes_already_parsed = $closer_at + 1;
return true;
}

/*
* Anything else here is either unsupported at this point or invalid
* syntax. See the class-level @TODO annotations for more information.
Expand All @@ -1644,7 +1644,6 @@ private function parse_next_tag() {
return false;
}


/*
* An `<?xml` token at the beginning of the document marks a start of an
* xml declaration.
Expand Down Expand Up @@ -2537,7 +2536,7 @@ public function expects_closer() {
return false;
}

return ! $this->is_empty_element() && ! $this->is_closing_tag;
return $this->is_tag_opener() && ! $this->is_empty_element();
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,17 @@ public function test_entity_reader( $reader ) {
$entities = [];
while ( $reader->next_entity() ) {
$data = $reader->get_entity()->get_data();
if(isset($data['content'])) {
$data['content'] = $this->normalize_markup( $data['content'] );
}
$entities[] = [
'type' => $reader->get_entity()->get_type(),
'data' => $data,
];
}
$this->assertNull( $reader->get_last_error() );
$this->assertEquals( 3, count($entities) );
$this->assertEquals( 117, strlen($entities[0]['data']['content']) );
$this->assertGreaterThan( 100, strlen($entities[0]['data']['content']) );
$this->assertGreaterThan( 1000, strlen($entities[1]['data']['content']) );
$this->assertGreaterThan( 1000, strlen($entities[2]['data']['content']) );
echo $entities[2]['data']['content'];
}

public function epub_byte_reader_data_provider() {
Expand All @@ -39,20 +37,4 @@ public function epub_byte_reader_data_provider() {
];
}

/**
 * Normalizes markup by round-tripping it through WP_HTML_Processor
 * and removing the document wrapper from the serialized result.
 *
 * @param string $markup Markup to normalize.
 * @return string The serialized markup without the wrapper tags.
 */
private function normalize_markup( $markup ) {
	// Parts of the HTML that serialize() adds and that we don't want.
	$document_wrappers = [
		'<html><head></head><body>',
		'</body></html>',
	];

	$html_processor = new WP_HTML_Processor( $markup );

	// Naive removal: every occurrence of either wrapper string is
	// dropped, wherever it appears in the serialized output.
	return str_replace( $document_wrappers, '', $html_processor->serialize() );
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,29 @@ public function test_html_to_blocks_excerpt() {
}

$this->assertEquals( file_get_contents( $output_file ), $blocks );

}

/**
 * Verifies that an XHTML document tokenized by WP_XML_Processor is
 * converted by WP_HTML_To_Blocks into heading and paragraph block markup.
 */
public function test_xhtml_to_blocks_conversion() {
	$input = <<<XML
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html>
<html>
<body>
<h1>Hello, world!</h1>
<p>And some content</p>
</body>
</html>
XML;
	// The markup reaches the converter through the processor handed to the
	// constructor, so convert() is called without arguments.
	$converter = new WP_HTML_To_Blocks( WP_XML_Processor::create_from_string( $input ) );
	$converter->convert();
	$blocks = $converter->get_block_markup();
	$expected = <<<HTML
<!-- wp:heading {"level":1} --><h1>Hello, world! </h1><!-- /wp:heading --><!-- wp:paragraph --><p>And some content </p><!-- /wp:paragraph -->
HTML;
	// Compare the normalized forms of the expected and produced markup
	// rather than the raw strings.
	$this->assertEquals(
		$this->normalize_markup( $expected ),
		$this->normalize_markup( $blocks )
	);
}

}

0 comments on commit 58def6c

Please sign in to comment.