From c98b360d611c37315c9c5330089a1d91dbb2021c Mon Sep 17 00:00:00 2001 From: A Farzat Date: Sat, 7 Mar 2026 09:04:00 +0300 Subject: Fix chapter contents to proper xml Change resource URLs using attribute matching (only img src for now, should add more later). Add respective stylesheets to chapters. --- src/xml.rs | 137 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 src/xml.rs (limited to 'src/xml.rs') diff --git a/src/xml.rs b/src/xml.rs new file mode 100644 index 0000000..9b11e9a --- /dev/null +++ b/src/xml.rs @@ -0,0 +1,137 @@ +use anyhow::Result; +use quick_xml::events::{BytesStart, Event}; +use quick_xml::{Reader, Writer}; +use relative_path::{RelativePath, RelativePathBuf}; +use std::collections::HashMap; +use std::io::Cursor; + +use crate::models::{Chapter, EpubResponse}; + +/// Checks if a tag is a standard HTML void element that shouldn't have a closing tag. +fn is_html_void_tag(name: &[u8]) -> bool { + matches!( + name, + b"area" + | b"base" + | b"br" + | b"col" + | b"embed" + | b"hr" + | b"img" + | b"input" + | b"link" + | b"meta" + | b"param" + | b"source" + | b"track" + | b"wbr" + ) +} + +/// Processes the fragment and outputs a complete, EPUB-ready XHTML document. +pub fn build_epub_chapter( + epub_data: &EpubResponse, + chapter: &Chapter, + chapter_dir: &RelativePath, + fragment: &str, + stylesheet_path: &str, + url_path_to_local: &HashMap<&str, &RelativePathBuf>, +) -> Result { + // Setup the XML Reader and Writer. + let mut reader = Reader::from_str(fragment); + // Preserve spacing for EPUB text formatting. + reader.config_mut().trim_text(false); + // Fragments could have unmatched tags - tell the parser not to panic if so. + reader.config_mut().check_end_names = false; + let mut writer = Writer::new(Cursor::new(Vec::new())); + + // Loop through the XML events and rewrite tags. + loop { + match reader.read_event() { + Ok(Event::Start(tag_data)) => { + // If it is a void tag, convert it to a self-closing XML tag. + let tag_type = if is_html_void_tag(tag_data.name().as_ref()) { + Event::Empty + } else { + Event::Start + }; + writer.write_event(tag_type(rewrite_attributes( + &tag_data, + url_path_to_local, + chapter_dir, + )))?; + } + Ok(Event::Empty(tag_data)) => { + // If tags are already empty, leave them as-is. + writer.write_event(Event::Empty(rewrite_attributes( + &tag_data, + url_path_to_local, + chapter_dir, + )))?; + } + Ok(Event::End(tag_data)) => { + // Silently drop closing tags for void elements if they exist (e.g. ). + if !is_html_void_tag(tag_data.name().as_ref()) { + writer.write_event(Event::End(tag_data))?; + } + } + Ok(Event::Eof) => break, + Ok(tag_data) => writer.write_event(tag_data)?, // Pass through text, comments, etc. unmodified. + Err(e) => anyhow::bail!(e), + } + } + + // Extract the modified fragment + let processed_fragment = String::from_utf8(writer.into_inner().into_inner())?; + + // Wrap in EPUB XHTML Boilerplate. + // EPUBs strictly require the w3 and idpf namespaces to validate properly. + let full_xhtml = format!( + r#" + + + {title} + {css} + + +{content} + +"#, + lang = epub_data.language, + title = chapter.title, + css = stylesheet_path, + content = processed_fragment, + ); + + Ok(full_xhtml) +} + +/// Helper function to inspect tags and rewrite the elements' attributes. +fn rewrite_attributes<'a>( + tag_data: &BytesStart<'a>, + url_path_to_local: &HashMap<&str, &RelativePathBuf>, + chapter_dir: &RelativePath, +) -> BytesStart<'static> { + let name = String::from_utf8_lossy(tag_data.name().as_ref()).into_owned(); + let mut new_elem = BytesStart::new(name); + + for attr in tag_data.attributes().filter_map(Result::ok) { + let key = attr.key.as_ref(); + + // Intercept tags with a "src" attribute. + if tag_data.name().as_ref() == b"img" && key == b"src" { + let url = String::from_utf8_lossy(&attr.value); + + // If we have a local path, inject it instead of the absolute URL. + if let Some(local_path) = url_path_to_local.get(url.as_ref()) { + new_elem.push_attribute(("src", chapter_dir.relative(local_path).as_str())); + continue; + } + } + + // Keep all other attributes intact. + new_elem.push_attribute(attr); + } + + new_elem +} -- cgit v1.2.3-70-g09d2