Convert chapter contents to proper XML

Rewrite resource URLs by matching element attributes (only `img` `src` for now; more attributes should be added later). Add each chapter's stylesheets to its generated document.
author: A Farzat <a@farzat.xyz> 2026-03-07 09:04:00 +0300
committer: A Farzat <a@farzat.xyz> 2026-03-07 09:07:38 +0300
commit: c98b360d611c37315c9c5330089a1d91dbb2021c (patch)
tree: d190740bb24df7aaafe09e914aa2f1314dd70988 /src
parent: 7e24d0cd5b671d06383466baf89c340023421d86 (diff)
download: oreilly-epub-c98b360d611c37315c9c5330089a1d91dbb2021c.tar.gz
oreilly-epub-c98b360d611c37315c9c5330089a1d91dbb2021c.zip
4 files changed, 165 insertions, 10 deletions
diff --git a/src/epub.rs b/src/epub.rs
index 946c9ee..5383251 100644
--- a/src/epub.rs
+++ b/src/epub.rs
@@ -1,4 +1,7 @@
-use crate::models::{Chapter, FileEntry};
+use crate::{
+    models::{Chapter, EpubResponse, FileEntry},
+    xml::build_epub_chapter,
+};
 use anyhow::{Context, Result};
 use relative_path::{RelativePath, RelativePathBuf};
 use reqwest::Client;
@@ -65,6 +68,7 @@ pub async fn download_all_files(
 
 /// Creates the EPUB archive (creates zip and includes all files in it).
 pub fn create_epub_archive(
+    epub_data: &EpubResponse,
     epub_root: &Path,
     output_epub: &Path,
     file_entries: &[FileEntry],
@@ -106,18 +110,29 @@ pub fn create_epub_archive(
         let mut buffer = Vec::new();
         src_file.read_to_end(&mut buffer)?;
         if let Some(chapter) = chapters.get(&entry.ourn) {
-            let stylesheet_entries = chapter
+            let chapter_dir = entry.full_path.parent().unwrap_or(RelativePath::new(""));
+            let stylesheet_links = chapter
                 .related_assets
                 .stylesheets
                 .iter()
                 .filter_map(|u| url_to_file.get(u))
-                .collect::<Vec<_>>();
-            let mut html = String::from_utf8(buffer)?;
-            let chapter_dir = entry.full_path.parent().unwrap_or(RelativePath::new(""));
-            for (url_path, local_path) in &url_path_to_local {
-                let rel_path = chapter_dir.relative(local_path);
-                html = html.replace(url_path, rel_path.as_str());
-            }
+                .map(|e| {
+                    format!(
+                        "<link rel=\"stylesheet\" type=\"{}\" href=\"{}\" />\n",
+                        e.media_type,
+                        chapter_dir.relative(&e.full_path)
+                    )
+                })
+                .collect::<String>();
+            let html = String::from_utf8(buffer)?;
+            let html = build_epub_chapter(
+                epub_data,
+                chapter,
+                chapter_dir,
+                &html,
+                &stylesheet_links,
+                &url_path_to_local,
+            )?;
             zip.write_all(html.as_bytes())?;
         } else {
             zip.write_all(&buffer)?;
diff --git a/src/main.rs b/src/main.rs
index 80f81e4..00e38a5 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,6 +1,7 @@
 mod epub;
 mod http_client;
 mod models;
+mod xml;
 
 use std::collections::HashMap;
 use std::path::Path;
@@ -119,7 +120,7 @@ async fn main() -> Result<()> {
     download_all_files(&client, &file_entries, dest_root).await?;
     let epub_path = format!("Books/{0}/{0}.epub", args.bookid);
     let epub_path = Path::new(&epub_path);
-    create_epub_archive(dest_root, epub_path, &file_entries, &chapters)?;
+    create_epub_archive(&epub_data, dest_root, epub_path, &file_entries, &chapters)?;
 
     Ok(())
 }
diff --git a/src/models.rs b/src/models.rs
index 704ad84..991224c 100644
--- a/src/models.rs
+++ b/src/models.rs
@@ -18,12 +18,14 @@ pub struct EpubResponse {
     pub files: String,             // This is a URL to the resource files
     pub spine: String,             // This is a URL to the spine list
     pub table_of_contents: String, // This is a URL to the table of contents
+    pub language: String,
 }
 
 /// Model for chapters API.
 #[derive(Debug, Deserialize)]
 pub struct Chapter {
     pub ourn: String,
+    pub title: String,
     pub is_skippable: bool,
     pub related_assets: ChapRelAssets,
 }
diff --git a/src/xml.rs b/src/xml.rs
new file mode 100644
index 0000000..9b11e9a
--- /dev/null
+++ b/src/xml.rs
@@ -0,0 +1,137 @@
+use anyhow::Result;
+use quick_xml::events::{BytesStart, Event};
+use quick_xml::{Reader, Writer};
+use relative_path::{RelativePath, RelativePathBuf};
+use std::collections::HashMap;
+use std::io::Cursor;
+
+use crate::models::{Chapter, EpubResponse};
+
+/// Returns `true` if `name` is byte-for-byte one of the HTML void element names listed below (elements that take no closing tag).
+fn is_html_void_tag(name: &[u8]) -> bool {
+    matches!(
+        name,
+        b"area"
+            | b"base"
+            | b"br"
+            | b"col"
+            | b"embed"
+            | b"hr"
+            | b"img"
+            | b"input"
+            | b"link"
+            | b"meta"
+            | b"param"
+            | b"source"
+            | b"track"
+            | b"wbr"
+    )
+}
+
+/// Rewrites the tags of `fragment` and wraps the result in a full XHTML document (XML declaration, `<html>`, `<head>`, `<body>`); returns an error on read, write or UTF-8 failures.
+pub fn build_epub_chapter(
+    epub_data: &EpubResponse,
+    chapter: &Chapter,
+    chapter_dir: &RelativePath,
+    fragment: &str,
+    stylesheet_path: &str,
+    url_path_to_local: &HashMap<&str, &RelativePathBuf>,
+) -> Result<String> {
+    // Set up the XML reader over the fragment; the writer created below buffers its output in memory.
+    let mut reader = Reader::from_str(fragment);
+    // Do not trim whitespace in text nodes, so the fragment's original spacing is preserved.
+    reader.config_mut().trim_text(false);
+    // Fragments could have mismatched tags - disable end-name checking so the reader does not return an error for them.
+    reader.config_mut().check_end_names = false;
+    let mut writer = Writer::new(Cursor::new(Vec::new()));
+
+    // Walk the XML events, rewriting start/empty tags and copying everything else.
+    loop {
+        match reader.read_event() {
+            Ok(Event::Start(tag_data)) => {
+                // A void tag opened with a start tag is emitted as a self-closing (empty) XML tag instead.
+                let tag_type = if is_html_void_tag(tag_data.name().as_ref()) {
+                    Event::Empty
+                } else {
+                    Event::Start
+                };
+                writer.write_event(tag_type(rewrite_attributes(
+                    &tag_data,
+                    url_path_to_local,
+                    chapter_dir,
+                )))?;
+            }
+            Ok(Event::Empty(tag_data)) => {
+                // Already self-closing tags stay self-closing; only their attributes are rewritten.
+                writer.write_event(Event::Empty(rewrite_attributes(
+                    &tag_data,
+                    url_path_to_local,
+                    chapter_dir,
+                )))?;
+            }
+            Ok(Event::End(tag_data)) => {
+                // Silently drop closing tags for void elements if they exist (e.g. <img></img>).
+                if !is_html_void_tag(tag_data.name().as_ref()) {
+                    writer.write_event(Event::End(tag_data))?;
+                }
+            }
+            Ok(Event::Eof) => break,
+            Ok(tag_data) => writer.write_event(tag_data)?, // Any other event (text, comments, etc.) is written through unchanged.
+            Err(e) => anyhow::bail!(e),
+        }
+    }
+
+    // Take the written bytes out of the writer's cursor and decode them as UTF-8.
+    let processed_fragment = String::from_utf8(writer.into_inner().into_inner())?;
+
+    // Wrap in XHTML boilerplate; the w3 (XHTML) and idpf (epub) namespaces are declared on <html>.
+    // NOTE(review): `lang`, `title` and `css` are inserted verbatim (no XML escaping); `css` appears to be ready-made <link> markup from the caller - confirm `title`/`lang` never contain `<` or `&`.
+    let full_xhtml = format!(
+        r#"<?xml version="1.0" encoding="utf-8"?>
+<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" lang="{lang}" xml:lang="{lang}">
+<head>
+    <title>{title}</title>
+    {css}
+</head>
+<body>
+{content}
+</body>
+</html>"#,
+        lang = epub_data.language,
+        title = chapter.title,
+        css = stylesheet_path,
+        content = processed_fragment,
+    );
+
+    Ok(full_xhtml)
+}
+
+/// Builds an owned copy of `tag_data`, replacing an `<img>` tag's `src` with a path relative to `chapter_dir` when the URL is a key of `url_path_to_local`; attributes that fail to parse are skipped.
+fn rewrite_attributes<'a>(
+    tag_data: &BytesStart<'a>,
+    url_path_to_local: &HashMap<&str, &RelativePathBuf>,
+    chapter_dir: &RelativePath,
+) -> BytesStart<'static> {
+    let name = String::from_utf8_lossy(tag_data.name().as_ref()).into_owned();
+    let mut new_elem = BytesStart::new(name);
+
+    for attr in tag_data.attributes().filter_map(Result::ok) {
+        let key = attr.key.as_ref();
+
+        // Only the "src" attribute of <img> tags is a candidate for rewriting.
+        if tag_data.name().as_ref() == b"img" && key == b"src" {
+            let url = String::from_utf8_lossy(&attr.value);
+
+            // If the URL maps to a local file, emit its path relative to the chapter directory instead.
+            if let Some(local_path) = url_path_to_local.get(url.as_ref()) {
+                new_elem.push_attribute(("src", chapter_dir.relative(local_path).as_str()));
+                continue;
+            }
+        }
+
+        // Everything else (including unmapped img src values) is copied over unchanged.
+        new_elem.push_attribute(attr);
+    }
+
+    new_elem
+}
author	A Farzat <a@farzat.xyz>	2026-03-07 09:04:00 +0300
committer	A Farzat <a@farzat.xyz>	2026-03-07 09:07:38 +0300
commit	c98b360d611c37315c9c5330089a1d91dbb2021c (patch)
tree	d190740bb24df7aaafe09e914aa2f1314dd70988 /src
parent	7e24d0cd5b671d06383466baf89c340023421d86 (diff)
download	oreilly-epub-c98b360d611c37315c9c5330089a1d91dbb2021c.tar.gz oreilly-epub-c98b360d611c37315c9c5330089a1d91dbb2021c.zip