From c98b360d611c37315c9c5330089a1d91dbb2021c Mon Sep 17 00:00:00 2001 From: A Farzat Date: Sat, 7 Mar 2026 09:04:00 +0300 Subject: Fix chapter contents to proper xml Change resource URLs using attribute matching (only img src for now, should add more later). Add respective stylesheets to chapters. --- Cargo.lock | 10 +++++ Cargo.toml | 1 + src/epub.rs | 33 ++++++++++---- src/main.rs | 3 +- src/models.rs | 2 + src/xml.rs | 137 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 176 insertions(+), 10 deletions(-) create mode 100644 src/xml.rs diff --git a/Cargo.lock b/Cargo.lock index 9417be2..6e403d0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -855,6 +855,7 @@ version = "0.1.0" dependencies = [ "anyhow", "clap", + "quick-xml", "relative-path", "reqwest", "serde", @@ -954,6 +955,15 @@ dependencies = [ "psl-types", ] +[[package]] +name = "quick-xml" +version = "0.39.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "958f21e8e7ceb5a1aa7fa87fab28e7c75976e0bfe7e23ff069e0a260f894067d" +dependencies = [ + "memchr", +] + [[package]] name = "quinn" version = "0.11.9" diff --git a/Cargo.toml b/Cargo.toml index dbe5793..50927f1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,6 +6,7 @@ edition = "2024" [dependencies] anyhow = "1.0.102" clap = { version = "4.5.60", features = ["derive"] } +quick-xml = "0.39.2" relative-path = { version = "2.0.1", features = ["serde"] } reqwest = { version = "0.13.2", features = ["cookies", "json"] } serde = { version = "1.0.228", features = ["derive"] } diff --git a/src/epub.rs b/src/epub.rs index 946c9ee..5383251 100644 --- a/src/epub.rs +++ b/src/epub.rs @@ -1,4 +1,7 @@ -use crate::models::{Chapter, FileEntry}; +use crate::{ + models::{Chapter, EpubResponse, FileEntry}, + xml::build_epub_chapter, +}; use anyhow::{Context, Result}; use relative_path::{RelativePath, RelativePathBuf}; use reqwest::Client; @@ -65,6 +68,7 @@ pub async fn download_all_files( /// Creates the EPUB archive (creates 
zip and includes all files in it). pub fn create_epub_archive( + epub_data: &EpubResponse, epub_root: &Path, output_epub: &Path, file_entries: &[FileEntry], @@ -106,18 +110,29 @@ pub fn create_epub_archive( let mut buffer = Vec::new(); src_file.read_to_end(&mut buffer)?; if let Some(chapter) = chapters.get(&entry.ourn) { - let stylesheet_entries = chapter + let chapter_dir = entry.full_path.parent().unwrap_or(RelativePath::new("")); + let stylesheet_links = chapter .related_assets .stylesheets .iter() .filter_map(|u| url_to_file.get(u)) - .collect::<Vec<_>>(); - let mut html = String::from_utf8(buffer)?; - let chapter_dir = entry.full_path.parent().unwrap_or(RelativePath::new("")); - for (url_path, local_path) in &url_path_to_local { - let rel_path = chapter_dir.relative(local_path); - html = html.replace(url_path, rel_path.as_str()); - } + .map(|e| { + format!( + "<link rel=\"stylesheet\" type=\"{}\" href=\"{}\" />\n", + e.media_type, + chapter_dir.relative(&e.full_path) + ) + }) + .collect::<String>(); + let html = String::from_utf8(buffer)?; + let html = build_epub_chapter( + epub_data, + chapter, + chapter_dir, + &html, + &stylesheet_links, + &url_path_to_local, + )?; zip.write_all(html.as_bytes())?; } else { zip.write_all(&buffer)?; diff --git a/src/main.rs b/src/main.rs index 80f81e4..00e38a5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,6 +1,7 @@ mod epub; mod http_client; mod models; +mod xml; use std::collections::HashMap; use std::path::Path; @@ -119,7 +120,7 @@ async fn main() -> Result<()> { download_all_files(&client, &file_entries, dest_root).await?; let epub_path = format!("Books/{0}/{0}.epub", args.bookid); let epub_path = Path::new(&epub_path); - create_epub_archive(dest_root, epub_path, &file_entries, &chapters)?; + create_epub_archive(&epub_data, dest_root, epub_path, &file_entries, &chapters)?; Ok(()) } diff --git a/src/models.rs b/src/models.rs index 704ad84..991224c 100644 --- a/src/models.rs +++ b/src/models.rs @@ -18,12 +18,14 @@ pub struct EpubResponse { pub files: String, // This is a URL to the
resource files pub spine: String, // This is a URL to the spine list pub table_of_contents: String, // This is a URL to the table of contents + pub language: String, } /// Model for chapters API. #[derive(Debug, Deserialize)] pub struct Chapter { pub ourn: String, + pub title: String, pub is_skippable: bool, pub related_assets: ChapRelAssets, } diff --git a/src/xml.rs b/src/xml.rs new file mode 100644 index 0000000..9b11e9a --- /dev/null +++ b/src/xml.rs @@ -0,0 +1,137 @@ +use anyhow::Result; +use quick_xml::events::{BytesStart, Event}; +use quick_xml::{Reader, Writer}; +use relative_path::{RelativePath, RelativePathBuf}; +use std::collections::HashMap; +use std::io::Cursor; + +use crate::models::{Chapter, EpubResponse}; + +/// Checks if a tag is a standard HTML void element that shouldn't have a closing tag. +fn is_html_void_tag(name: &[u8]) -> bool { + matches!( + name, + b"area" + | b"base" + | b"br" + | b"col" + | b"embed" + | b"hr" + | b"img" + | b"input" + | b"link" + | b"meta" + | b"param" + | b"source" + | b"track" + | b"wbr" + ) +} + +/// Processes the fragment and outputs a complete, EPUB-ready XHTML document. +pub fn build_epub_chapter( + epub_data: &EpubResponse, + chapter: &Chapter, + chapter_dir: &RelativePath, + fragment: &str, + stylesheet_path: &str, + url_path_to_local: &HashMap<&str, &RelativePathBuf>, +) -> Result<String> { + // Setup the XML Reader and Writer. + let mut reader = Reader::from_str(fragment); + // Preserve spacing for EPUB text formatting. + reader.config_mut().trim_text(false); + // Fragments could have unmatched tags - tell the parser not to panic if so. + reader.config_mut().check_end_names = false; + let mut writer = Writer::new(Cursor::new(Vec::new())); + + // Loop through the XML events and rewrite tags. + loop { + match reader.read_event() { + Ok(Event::Start(tag_data)) => { + // If it is a void tag, convert it to a self-closing XML tag.
+ let tag_type = if is_html_void_tag(tag_data.name().as_ref()) { + Event::Empty + } else { + Event::Start + }; + writer.write_event(tag_type(rewrite_attributes( + &tag_data, + url_path_to_local, + chapter_dir, + )))?; + } + Ok(Event::Empty(tag_data)) => { + // If tags are already empty, leave them as-is. + writer.write_event(Event::Empty(rewrite_attributes( + &tag_data, + url_path_to_local, + chapter_dir, + )))?; + } + Ok(Event::End(tag_data)) => { + // Silently drop closing tags for void elements if they exist (e.g. </br>). + if !is_html_void_tag(tag_data.name().as_ref()) { + writer.write_event(Event::End(tag_data))?; + } + } + Ok(Event::Eof) => break, + Ok(tag_data) => writer.write_event(tag_data)?, // Pass through text, comments, etc. unmodified. + Err(e) => anyhow::bail!(e), + } + } + + // Extract the modified fragment + let processed_fragment = String::from_utf8(writer.into_inner().into_inner())?; + + // Wrap in EPUB XHTML Boilerplate. + // EPUBs strictly require the w3 and idpf namespaces to validate properly. + let full_xhtml = format!( + r#"<?xml version="1.0" encoding="UTF-8"?> +<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" lang="{lang}" xml:lang="{lang}"> +<head> + <title>{title}</title> + {css} +</head> +<body> +{content} +</body> +</html>"#, + lang = epub_data.language, + title = chapter.title, + css = stylesheet_path, + content = processed_fragment, + ); + + Ok(full_xhtml) +} + +/// Helper function to inspect tags and rewrite the elements' attributes. +fn rewrite_attributes<'a>( + tag_data: &BytesStart<'a>, + url_path_to_local: &HashMap<&str, &RelativePathBuf>, + chapter_dir: &RelativePath, +) -> BytesStart<'static> { + let name = String::from_utf8_lossy(tag_data.name().as_ref()).into_owned(); + let mut new_elem = BytesStart::new(name); + + for attr in tag_data.attributes().filter_map(Result::ok) { + let key = attr.key.as_ref(); + + // Intercept <img> tags with a "src" attribute. + if tag_data.name().as_ref() == b"img" && key == b"src" { + let url = String::from_utf8_lossy(&attr.value); + + // If we have a local path, inject it instead of the absolute URL.
+ if let Some(local_path) = url_path_to_local.get(url.as_ref()) { + new_elem.push_attribute(("src", chapter_dir.relative(local_path).as_str())); + continue; + } + } + + // Keep all other attributes intact. + new_elem.push_attribute(attr); + } + + new_elem +} -- cgit v1.2.3-70-g09d2