diff --git a/guide/src/misc/contributors.md b/guide/src/misc/contributors.md index 362a21fe4f..ff3549091f 100644 --- a/guide/src/misc/contributors.md +++ b/guide/src/misc/contributors.md @@ -20,5 +20,6 @@ shout-out to them! - Vivek Akupatni ([apatniv](https://github.com/apatniv)) - Eric Huss ([ehuss](https://github.com/ehuss)) - Josh Rotenberg ([joshrotenberg](https://github.com/joshrotenberg)) +- Songlin Jiang ([HollowMan6](https://github.com/HollowMan6)) If you feel you're missing from this list, feel free to add yourself in a PR. diff --git a/src/renderer/html_handlebars/hbs_renderer.rs b/src/renderer/html_handlebars/hbs_renderer.rs index d0149fb523..2d75c751e2 100644 --- a/src/renderer/html_handlebars/hbs_renderer.rs +++ b/src/renderer/html_handlebars/hbs_renderer.rs @@ -56,10 +56,11 @@ impl HtmlHandlebars { let content = utils::render_markdown(&ch.content, ctx.html_config.smart_punctuation()); - let fixed_content = utils::render_markdown_with_path( + let printed_item = utils::render_markdown_with_path_and_redirects( &ch.content, ctx.html_config.smart_punctuation(), Some(path), + &ctx.html_config.redirect, ); if !ctx.is_index && ctx.html_config.print.page_break { // Add page break between chapters @@ -68,7 +69,25 @@ impl HtmlHandlebars { print_content .push_str(r#"
"#); } - print_content.push_str(&fixed_content); + let print_page_id = { + let mut base = path.display().to_string(); + if base.ends_with(".md") { + base.truncate(base.len() - 3); + } + &base + .replace("/", "-") + .replace("\\", "-") + .to_ascii_lowercase() + }; + + // We have to build header links in advance so that we can know the ranges + // for the headers in one page. + // Insert a dummy div to make sure that we can locate the specific page. + print_content.push_str(&(format!(r#"
"#))); + print_content.push_str(&build_header_links( + &build_print_element_id(&printed_item, &print_page_id), + Some(print_page_id), + )); // Update the context with data for this file let ctx_path = path @@ -214,7 +233,23 @@ impl HtmlHandlebars { code_config: &Code, edition: Option, ) -> String { - let rendered = build_header_links(&rendered); + let rendered = build_header_links(&rendered, None); + let rendered = self.post_process_common(rendered, &playground_config, code_config, edition); + + rendered + } + + /// Applies some post-processing to the HTML to apply some adjustments. + /// + /// This common function is used for both normal chapters (via + /// `post_process`) and the combined print page. + fn post_process_common( + &self, + rendered: String, + playground_config: &Playground, + code_config: &Code, + edition: Option, + ) -> String { let rendered = fix_code_blocks(&rendered); let rendered = add_playground_pre(&rendered, playground_config, edition); let rendered = hide_lines(&rendered, code_config); @@ -577,7 +612,7 @@ impl Renderer for HtmlHandlebars { debug!("Render template"); let rendered = handlebars.render("index", &data)?; - let rendered = self.post_process( + let rendered = self.post_process_common( rendered, &html_config.playground, &html_config.code, @@ -800,9 +835,34 @@ fn make_data( Ok(data) } +/// Go through the rendered print page HTML, +/// add path id prefix to all the elements id as well as footnote links. +fn build_print_element_id(html: &str, print_page_id: &str) -> String { + static ALL_ID: Lazy = Lazy::new(|| Regex::new(r#"(<[^>]*?id=")([^"]+?)""#).unwrap()); + static FOOTNOTE_ID: Lazy = Lazy::new(|| { + Regex::new( + r##"(]*?class="footnote-reference"[^>]*?>[^<]*?]*?href="#)([^"]+?)""##, + ) + .unwrap() + }); + + let temp_html = ALL_ID.replace_all(html, |caps: &Captures<'_>| { + format!("{}{}-{}\"", &caps[1], print_page_id, &caps[2]) + }); + + FOOTNOTE_ID + .replace_all(&temp_html, |caps: &Captures<'_>| { + format!("{}{}-{}\"", &caps[1], print_page_id, &caps[2]) + }) + .into_owned() +} + /// Goes through the rendered HTML, making sure all header tags have /// an anchor respectively so people can link to sections directly. -fn build_header_links(html: &str) -> String { +/// +/// `print_page_id` should be set to the print page ID prefix when adjusting the +/// print page. +fn build_header_links(html: &str, print_page_id: Option<&str>) -> String { static BUILD_HEADER_LINKS: Lazy = Lazy::new(|| { Regex::new(r#"(.*?)"#).unwrap() }); @@ -831,6 +891,7 @@ fn build_header_links(html: &str) -> String { caps.get(2).map(|x| x.as_str().to_string()), caps.get(3).map(|x| x.as_str().to_string()), &mut id_counter, + print_page_id, ) }) .into_owned() @@ -838,14 +899,26 @@ fn build_header_links(html: &str) -> String { /// Insert a sinle link into a header, making sure each link gets its own /// unique ID by appending an auto-incremented number (if necessary). +/// +/// For `print.html`, we will add a path id prefix. fn insert_link_into_header( level: usize, content: &str, id: Option, classes: Option, id_counter: &mut HashMap, + print_page_id: Option<&str>, ) -> String { - let id = id.unwrap_or_else(|| utils::unique_id_from_content(content, id_counter)); + let id = if let Some(print_page_id) = print_page_id { + let content_id = { + #[allow(deprecated)] + utils::id_from_content(content) + }; + let with_prefix = format!("{} {}", print_page_id, content_id); + id.unwrap_or_else(|| utils::unique_id_from_content(&with_prefix, id_counter)) + } else { + id.unwrap_or_else(|| utils::unique_id_from_content(content, id_counter)) + }; let classes = classes .map(|s| format!(" class=\"{s}\"")) .unwrap_or_default(); @@ -1125,7 +1198,7 @@ mod tests { ]; for (src, should_be) in inputs { - let got = build_header_links(src); + let got = build_header_links(src, None); assert_eq!(got, should_be); } } diff --git a/src/utils/mod.rs b/src/utils/mod.rs index a53f79c0e9..767cf6e4f9 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -6,13 +6,13 @@ pub(crate) mod toml_ext; use crate::errors::Error; use log::error; use once_cell::sync::Lazy; -use pulldown_cmark::{html, CodeBlockKind, CowStr, Event, Options, Parser, Tag, TagEnd}; +use pulldown_cmark::{html, CodeBlockKind, CowStr, Event, LinkType, Options, Parser, Tag, TagEnd}; use regex::Regex; use std::borrow::Cow; use std::collections::HashMap; use std::fmt::Write; -use std::path::Path; +use std::path::{Component, Path, PathBuf}; pub use self::string::{ take_anchored_lines, take_lines, take_rustdoc_include_anchored_lines, @@ -83,63 +83,232 @@ pub fn unique_id_from_content(content: &str, id_counter: &mut HashMap>(path: P) -> String { + let ends_with_slash = path.as_ref().to_str().map_or(false, |s| s.ends_with('/')); + let mut normalized = PathBuf::new(); + for component in path.as_ref().components() { + match &component { + Component::ParentDir => { + if !normalized.pop() { + normalized.push(component); + } + } + Component::CurDir => {} + _ => { + normalized.push(component); + } + } + } + if ends_with_slash { + normalized.push(""); + } + normalized.to_str().unwrap().replace("\\", "/").to_string() +} + +/// Converts a relative URL path to a reference ID for the print page. +fn normalize_print_page_id(mut path: String) -> String { + path = path + .replace("/", "-") + .replace(".html#", "-") + .replace("#", "-") + .to_ascii_lowercase(); + if path.ends_with(".html") { + path.truncate(path.len() - 5); + } + path +} + /// Fix links to the correct location. /// /// This adjusts links, such as turning `.md` extensions to `.html`. /// -/// `path` is the path to the page being rendered relative to the root of the -/// book. This is used for the `print.html` page so that links on the print -/// page go to the original location. Normal page rendering sets `path` to -/// None. Ideally, print page links would link to anchors on the print page, -/// but that is very difficult. -fn adjust_links<'a>(event: Event<'a>, path: Option<&Path>) -> Event<'a> { +/// See [`render_markdown_with_path_and_redirects`] for a description of +/// `path` and `redirects`. +fn adjust_links<'a>( + event: Event<'a>, + path: Option<&Path>, + redirects: &HashMap, +) -> Event<'a> { static SCHEME_LINK: Lazy = Lazy::new(|| Regex::new(r"^[a-z][a-z0-9+.-]*:").unwrap()); - static MD_LINK: Lazy = - Lazy::new(|| Regex::new(r"(?P.*)\.md(?P#.*)?").unwrap()); - - fn fix<'a>(dest: CowStr<'a>, path: Option<&Path>) -> CowStr<'a> { - if dest.starts_with('#') { - // Fragment-only link. - if let Some(path) = path { - let mut base = path.display().to_string(); - if base.ends_with(".md") { - base.replace_range(base.len() - 3.., ".html"); - } - return format!("{base}{dest}").into(); - } else { - return dest; + static HTML_MD_LINK: Lazy = + Lazy::new(|| Regex::new(r"(?P.*)\.(html|md)(?P#.*)?").unwrap()); + + fn add_base(path: Option<&Path>) -> String { + let mut fixed_link = String::new(); + if let Some(path) = path { + let base = path + .parent() + .expect("path can't be empty") + .to_str() + .expect("utf-8 paths only"); + if !base.is_empty() { + write!(fixed_link, "{base}/").unwrap(); } } - // Don't modify links with schemes like `https`. - if !SCHEME_LINK.is_match(&dest) { - // This is a relative link, adjust it as necessary. - let mut fixed_link = String::new(); - if let Some(path) = path { - let base = path + fixed_link.to_string() + } + + fn fix_print_page_link<'a>( + mut normalized_path: String, + redirects: &HashMap, + ) -> CowStr<'a> { + // Fix redirect links + let (path_no_fragment, fragment) = match normalized_path.split_once('#') { + Some((a, b)) => (a, Some(b)), + None => (normalized_path.as_str(), None), + }; + for (original, redirect) in redirects { + if !normalize_path(original.trim_start_matches('/')) + .eq_ignore_ascii_case(&normalized_path) + && !normalize_path(original.trim_start_matches('/')) + .eq_ignore_ascii_case(&path_no_fragment) + { + continue; + } + + let mut unnormalized_path = String::new(); + if SCHEME_LINK.is_match(&redirect) { + unnormalized_path = redirect.to_string(); + } else { + let base = PathBuf::from(path_no_fragment) .parent() .expect("path can't be empty") .to_str() - .expect("utf-8 paths only"); - if !base.is_empty() { - write!(fixed_link, "{base}/").unwrap(); + .expect("utf-8 paths only") + .to_owned(); + + let normalized_base = normalize_path(base).trim_matches('/').to_owned(); + if !normalized_base.is_empty() { + write!(unnormalized_path, "{normalized_base}/{redirect}").unwrap(); + } else { + unnormalized_path = redirect.to_string().trim_start_matches('/').to_string(); } } - if let Some(caps) = MD_LINK.captures(&dest) { - fixed_link.push_str(&caps["link"]); - fixed_link.push_str(".html"); - if let Some(anchor) = caps.name("anchor") { - fixed_link.push_str(anchor.as_str()); + // original without anchors, need to append link anchors + if !original.contains("#") { + if let Some(fragment) = fragment { + if !unnormalized_path.contains("#") { + unnormalized_path.push('#'); + } else { + unnormalized_path.push('-'); + } + unnormalized_path.push_str(fragment); } + } + + if SCHEME_LINK.is_match(&redirect) { + return CowStr::from(unnormalized_path); } else { - fixed_link.push_str(&dest); + normalized_path = normalize_path(unnormalized_path); + } + break; + } + + // Check again to make sure anchors are the html links inside the book. + if normalized_path.starts_with("../") || normalized_path.contains("/../") { + return CowStr::from(normalized_path); + } + + let mut fixed_anchor_for_print = String::new(); + fixed_anchor_for_print.push_str("#"); + fixed_anchor_for_print.push_str(&normalize_print_page_id(normalized_path)); + CowStr::from(fixed_anchor_for_print) + } + + /// Fix resource links like img to the correct location. + fn fix_resource_links<'a>(dest: CowStr<'a>, path: Option<&Path>) -> CowStr<'a> { + // Don't modify links with schemes like `https`. + if SCHEME_LINK.is_match(&dest) { + return dest; + } + + // This is a relative link, adjust it as necessary. + let mut fixed_link = add_base(path); + fixed_link.push_str(&dest); + CowStr::from(fixed_link) + } + + fn fix_a_links_with_type<'a>( + dest: CowStr<'a>, + path: Option<&Path>, + redirects: &HashMap, + link_type: LinkType, + ) -> CowStr<'a> { + if link_type == LinkType::Email { + return dest; + } + fix_a_links(dest, path, redirects) + } + + /// Adjust markdown file to correct point in the html file. + fn fix_a_links<'a>( + dest: CowStr<'a>, + path: Option<&Path>, + redirects: &HashMap, + ) -> CowStr<'a> { + if dest.starts_with('#') { + // Fragment-only link. + return match path { + Some(path) => { + let mut base = path.display().to_string(); + if base.ends_with(".md") { + base.truncate(base.len() - 3); + } + format!( + "#{}{}", + normalize_print_page_id(normalize_path(base)), + dest.replace("#", "-") + ) + .into() + } + None => dest, }; - return CowStr::from(fixed_link); } - dest + + // Don't modify links with schemes like `https`. + if SCHEME_LINK.is_match(&dest) { + return dest; + } + + // This is a relative link, adjust it as necessary. + let mut fixed_link = add_base(path); + + if let Some(caps) = HTML_MD_LINK.captures(&dest) { + fixed_link.push_str(&caps["link"]); + fixed_link.push_str(".html"); + if let Some(anchor) = caps.name("anchor") { + fixed_link.push_str(anchor.as_str()); + } + } else { + fixed_link.push_str(&dest); + }; + + let normalized_path = normalize_path(&fixed_link); + + // Judge if the html link is inside the book. + if !normalized_path.starts_with("../") && !normalized_path.contains("/../") { + // In `print.html`, print page links would all link to anchors on the print page. + return match path { + Some(_) => fix_print_page_link(normalized_path, redirects), + None => CowStr::from(fixed_link), + }; + } + // In normal page rendering, links to anchors on another page. + CowStr::from(fixed_link) } - fn fix_html<'a>(html: CowStr<'a>, path: Option<&Path>) -> CowStr<'a> { + fn fix_html<'a>( + html: CowStr<'a>, + path: Option<&Path>, + redirects: &HashMap, + ) -> CowStr<'a> { // This is a terrible hack, but should be reasonably reliable. Nobody // should ever parse a tag with a regex. However, there isn't anything // in Rust that I know of that is suitable for handling partial html @@ -148,12 +317,45 @@ fn adjust_links<'a>(event: Event<'a>, path: Option<&Path>) -> Event<'a> { // There are dozens of HTML tags/attributes that contain paths, so // feel free to add more tags if desired; these are the only ones I // care about right now. - static HTML_LINK: Lazy = - Lazy::new(|| Regex::new(r#"(<(?:a|img) [^>]*?(?:src|href)=")([^"]+?)""#).unwrap()); + static A_LINK: Lazy = + Lazy::new(|| Regex::new(r#"(]*?href=")([^"]+?)""#).unwrap()); + static A_NAME: Lazy = + Lazy::new(|| Regex::new(r#"(]*?name=")([^"]+?)""#).unwrap()); + static IMG_LINK: Lazy = + Lazy::new(|| Regex::new(r#"(]*?src=")([^"]+?)""#).unwrap()); + + let img_link_fixed_html = IMG_LINK.replace_all(&html, |caps: ®ex::Captures<'_>| { + let fixed = fix_resource_links(caps[2].into(), path); + format!("{}{}\"", &caps[1], fixed) + }); - HTML_LINK - .replace_all(&html, |caps: ®ex::Captures<'_>| { - let fixed = fix(caps[2].into(), path); + let a_name_fixed_html = + A_NAME.replace_all(&img_link_fixed_html, |caps: ®ex::Captures<'_>| { + // This is a relative link, adjust it as necessary. + let origin_name = &caps[2].to_string(); + format!( + "{}{}\"", + &caps[1], + CowStr::from(match path { + Some(path) => { + let mut base = path.display().to_string(); + if base.ends_with(".md") { + base.truncate(base.len() - 3); + } + format!( + "{}-{}", + normalize_print_page_id(normalize_path(base)), + origin_name.to_string() + ) + } + None => origin_name.to_string(), + }) + ) + }); + + A_LINK + .replace_all(&a_name_fixed_html, |caps: ®ex::Captures<'_>| { + let fixed = fix_a_links(caps[2].into(), path, &redirects); format!("{}{}\"", &caps[1], fixed) }) .into_owned() @@ -168,7 +370,7 @@ fn adjust_links<'a>(event: Event<'a>, path: Option<&Path>) -> Event<'a> { id, }) => Event::Start(Tag::Link { link_type, - dest_url: fix(dest_url, path), + dest_url: fix_a_links_with_type(dest_url, path, redirects, link_type), title, id, }), @@ -179,12 +381,12 @@ fn adjust_links<'a>(event: Event<'a>, path: Option<&Path>) -> Event<'a> { id, }) => Event::Start(Tag::Image { link_type, - dest_url: fix(dest_url, path), + dest_url: fix_resource_links(dest_url, path), title, id, }), - Event::Html(html) => Event::Html(fix_html(html, path)), - Event::InlineHtml(html) => Event::InlineHtml(fix_html(html, path)), + Event::Html(html) => Event::Html(fix_html(html, path, redirects)), + Event::InlineHtml(html) => Event::InlineHtml(fix_html(html, path, redirects)), _ => event, } } @@ -194,6 +396,15 @@ pub fn render_markdown(text: &str, smart_punctuation: bool) -> String { render_markdown_with_path(text, smart_punctuation, None) } +/// Wrapper around for API compatibility. +pub fn render_markdown_with_path( + text: &str, + smart_punctuation: bool, + path: Option<&Path>, +) -> String { + render_markdown_with_path_and_redirects(text, smart_punctuation, path, &HashMap::new()) +} + pub fn new_cmark_parser(text: &str, smart_punctuation: bool) -> Parser<'_> { let mut opts = Options::empty(); opts.insert(Options::ENABLE_TABLES); @@ -207,16 +418,26 @@ pub fn new_cmark_parser(text: &str, smart_punctuation: bool) -> Parser<'_> { Parser::new_ext(text, opts) } -pub fn render_markdown_with_path( +/// Renders markdown to HTML. +/// +/// `path` is the path to the page being rendered relative to the root of the +/// book. This is used for the `print.html` page so that links on the print +/// page go to the anchors that has a path id prefix. Normal page rendering +/// sets `path` to None. +/// +/// `redirects` is also only for the print page. It's for adjusting links to +/// a redirected location to go to the correct spot on the `print.html` page. +pub(crate) fn render_markdown_with_path_and_redirects( text: &str, smart_punctuation: bool, path: Option<&Path>, + redirects: &HashMap, ) -> String { let mut s = String::with_capacity(text.len() * 3 / 2); let p = new_cmark_parser(text, smart_punctuation); let events = p .map(clean_codeblock_headers) - .map(|event| adjust_links(event, path)) + .map(|event| adjust_links(event, path, &redirects)) .flat_map(|event| { let (a, b) = wrap_tables(event); a.into_iter().chain(b) diff --git a/tests/rendered_output.rs b/tests/rendered_output.rs index 707b997db6..1c42926a9e 100644 --- a/tests/rendered_output.rs +++ b/tests/rendered_output.rs @@ -91,12 +91,14 @@ fn check_correct_relative_links_in_print_page() { assert_contains_strings( first.join("print.html"), &[ - r##"the first section,"##, + r##"the first section,"##, r##"outside"##, r##"Some image"##, - r##"fragment link"##, - r##"HTML Link"##, + r##"fragment link"##, + r##"HTML Link"##, r##"raw html"##, + r##"1"##, + r##"2"##, ], ); }