aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorA Farzat <a@farzat.xyz>2026-03-07 09:04:00 +0300
committerA Farzat <a@farzat.xyz>2026-03-07 09:07:38 +0300
commitc98b360d611c37315c9c5330089a1d91dbb2021c (patch)
treed190740bb24df7aaafe09e914aa2f1314dd70988
parent7e24d0cd5b671d06383466baf89c340023421d86 (diff)
downloadoreilly-epub-c98b360d611c37315c9c5330089a1d91dbb2021c.tar.gz
oreilly-epub-c98b360d611c37315c9c5330089a1d91dbb2021c.zip
Convert chapter contents to proper XML
Change resource URLs using attribute matching (only img src for now, should add more later). Add respective stylesheets to chapters.
-rw-r--r--Cargo.lock10
-rw-r--r--Cargo.toml1
-rw-r--r--src/epub.rs33
-rw-r--r--src/main.rs3
-rw-r--r--src/models.rs2
-rw-r--r--src/xml.rs137
6 files changed, 176 insertions, 10 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 9417be2..6e403d0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -855,6 +855,7 @@ version = "0.1.0"
dependencies = [
"anyhow",
"clap",
+ "quick-xml",
"relative-path",
"reqwest",
"serde",
@@ -955,6 +956,15 @@ dependencies = [
]
[[package]]
+name = "quick-xml"
+version = "0.39.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "958f21e8e7ceb5a1aa7fa87fab28e7c75976e0bfe7e23ff069e0a260f894067d"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
name = "quinn"
version = "0.11.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/Cargo.toml b/Cargo.toml
index dbe5793..50927f1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,6 +6,7 @@ edition = "2024"
[dependencies]
anyhow = "1.0.102"
clap = { version = "4.5.60", features = ["derive"] }
+quick-xml = "0.39.2"
relative-path = { version = "2.0.1", features = ["serde"] }
reqwest = { version = "0.13.2", features = ["cookies", "json"] }
serde = { version = "1.0.228", features = ["derive"] }
diff --git a/src/epub.rs b/src/epub.rs
index 946c9ee..5383251 100644
--- a/src/epub.rs
+++ b/src/epub.rs
@@ -1,4 +1,7 @@
-use crate::models::{Chapter, FileEntry};
+use crate::{
+ models::{Chapter, EpubResponse, FileEntry},
+ xml::build_epub_chapter,
+};
use anyhow::{Context, Result};
use relative_path::{RelativePath, RelativePathBuf};
use reqwest::Client;
@@ -65,6 +68,7 @@ pub async fn download_all_files(
/// Creates the EPUB archive (creates zip and includes all files in it).
pub fn create_epub_archive(
+ epub_data: &EpubResponse,
epub_root: &Path,
output_epub: &Path,
file_entries: &[FileEntry],
@@ -106,18 +110,29 @@ pub fn create_epub_archive(
let mut buffer = Vec::new();
src_file.read_to_end(&mut buffer)?;
if let Some(chapter) = chapters.get(&entry.ourn) {
- let stylesheet_entries = chapter
+ let chapter_dir = entry.full_path.parent().unwrap_or(RelativePath::new(""));
+ let stylesheet_links = chapter
.related_assets
.stylesheets
.iter()
.filter_map(|u| url_to_file.get(u))
- .collect::<Vec<_>>();
- let mut html = String::from_utf8(buffer)?;
- let chapter_dir = entry.full_path.parent().unwrap_or(RelativePath::new(""));
- for (url_path, local_path) in &url_path_to_local {
- let rel_path = chapter_dir.relative(local_path);
- html = html.replace(url_path, rel_path.as_str());
- }
+ .map(|e| {
+ format!(
+ "<link rel=\"stylesheet\" type=\"{}\" href=\"{}\" />\n",
+ e.media_type,
+ chapter_dir.relative(&e.full_path)
+ )
+ })
+ .collect::<String>();
+ let html = String::from_utf8(buffer)?;
+ let html = build_epub_chapter(
+ epub_data,
+ chapter,
+ chapter_dir,
+ &html,
+ &stylesheet_links,
+ &url_path_to_local,
+ )?;
zip.write_all(html.as_bytes())?;
} else {
zip.write_all(&buffer)?;
diff --git a/src/main.rs b/src/main.rs
index 80f81e4..00e38a5 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,6 +1,7 @@
mod epub;
mod http_client;
mod models;
+mod xml;
use std::collections::HashMap;
use std::path::Path;
@@ -119,7 +120,7 @@ async fn main() -> Result<()> {
download_all_files(&client, &file_entries, dest_root).await?;
let epub_path = format!("Books/{0}/{0}.epub", args.bookid);
let epub_path = Path::new(&epub_path);
- create_epub_archive(dest_root, epub_path, &file_entries, &chapters)?;
+ create_epub_archive(&epub_data, dest_root, epub_path, &file_entries, &chapters)?;
Ok(())
}
diff --git a/src/models.rs b/src/models.rs
index 704ad84..991224c 100644
--- a/src/models.rs
+++ b/src/models.rs
@@ -18,12 +18,14 @@ pub struct EpubResponse {
pub files: String, // This is a URL to the resource files
pub spine: String, // This is a URL to the spine list
pub table_of_contents: String, // This is a URL to the table of contents
+ pub language: String,
}
/// Model for chapters API.
#[derive(Debug, Deserialize)]
pub struct Chapter {
pub ourn: String,
+ pub title: String,
pub is_skippable: bool,
pub related_assets: ChapRelAssets,
}
diff --git a/src/xml.rs b/src/xml.rs
new file mode 100644
index 0000000..9b11e9a
--- /dev/null
+++ b/src/xml.rs
@@ -0,0 +1,137 @@
+use anyhow::Result;
+use quick_xml::events::{BytesStart, Event};
+use quick_xml::{Reader, Writer};
+use relative_path::{RelativePath, RelativePathBuf};
+use std::collections::HashMap;
+use std::io::Cursor;
+
+use crate::models::{Chapter, EpubResponse};
+
+/// Reports whether `name` is an HTML void element, i.e. one that never takes a closing tag.
+///
+/// The comparison is byte-for-byte, so only the lowercase spellings listed below match.
+fn is_html_void_tag(name: &[u8]) -> bool {
+    // Tag names recognised as void elements.
+    const VOID_TAGS: &[&[u8]] = &[
+        b"area", b"base", b"br", b"col", b"embed", b"hr", b"img", b"input", b"link", b"meta",
+        b"param", b"source", b"track", b"wbr",
+    ];
+
+    VOID_TAGS.contains(&name)
+}
+
+/// Rewrites a chapter's HTML fragment and wraps it in a complete XHTML document.
+///
+/// The fragment is streamed through `quick_xml` one event at a time:
+/// - start tags of HTML void elements (see `is_html_void_tag`) are emitted as
+///   self-closing tags, and any closing tags for them are dropped;
+/// - every start/empty tag has its attributes passed through `rewrite_attributes`;
+/// - all other events (text, comments, ...) are written back unchanged.
+///
+/// The rewritten fragment becomes the `<body>` of an XHTML skeleton whose
+/// `lang`/`xml:lang` come from `epub_data.language` and whose `<title>` comes from
+/// `chapter.title`. Both values are XML-escaped before insertion.
+///
+/// Despite its name, `stylesheet_path` is inserted verbatim into `<head>`, so it must
+/// already be markup (the caller passes pre-rendered `<link>` elements).
+///
+/// # Errors
+///
+/// Returns an error if the reader fails to parse the fragment, if writing an event
+/// fails, or if the rewritten bytes are not valid UTF-8.
+pub fn build_epub_chapter(
+    epub_data: &EpubResponse,
+    chapter: &Chapter,
+    chapter_dir: &RelativePath,
+    fragment: &str,
+    stylesheet_path: &str,
+    url_path_to_local: &HashMap<&str, &RelativePathBuf>,
+) -> Result<String> {
+    let mut reader = Reader::from_str(fragment);
+    // Do not trim whitespace from text events; spacing matters for the rendered text.
+    reader.config_mut().trim_text(false);
+    // Fragments may contain mismatched closing tags; do not report those as errors.
+    reader.config_mut().check_end_names = false;
+    let mut writer = Writer::new(Cursor::new(Vec::new()));
+
+    // Walk the events, rewriting tags and passing everything else through.
+    loop {
+        match reader.read_event()? {
+            Event::Start(tag_data) => {
+                let rewritten = rewrite_attributes(&tag_data, url_path_to_local, chapter_dir);
+                // A void element opened with a plain start tag becomes self-closing.
+                let event = if is_html_void_tag(tag_data.name().as_ref()) {
+                    Event::Empty(rewritten)
+                } else {
+                    Event::Start(rewritten)
+                };
+                writer.write_event(event)?;
+            }
+            Event::Empty(tag_data) => {
+                // Already self-closing: keep the form, only rewrite the attributes.
+                let rewritten = rewrite_attributes(&tag_data, url_path_to_local, chapter_dir);
+                writer.write_event(Event::Empty(rewritten))?;
+            }
+            Event::End(tag_data) => {
+                // Drop closing tags of void elements (e.g. `<img></img>`); their start
+                // tag has already been written as self-closing.
+                if !is_html_void_tag(tag_data.name().as_ref()) {
+                    writer.write_event(Event::End(tag_data))?;
+                }
+            }
+            Event::Eof => break,
+            // Text, comments, etc. are passed through unmodified.
+            other => writer.write_event(other)?,
+        }
+    }
+
+    // Extract the modified fragment.
+    let processed_fragment = String::from_utf8(writer.into_inner().into_inner())?;
+
+    // The title and language are free-form strings, not markup. Escape them so that
+    // characters such as `&`, `<` or `"` cannot break the generated document.
+    let lang = quick_xml::escape::escape(epub_data.language.as_str());
+    let title = quick_xml::escape::escape(chapter.title.as_str());
+
+    // Wrap in the XHTML boilerplate, declaring the XHTML and EPUB (`epub:`) namespaces.
+    Ok(format!(
+        r#"<?xml version="1.0" encoding="utf-8"?>
+<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" lang="{lang}" xml:lang="{lang}">
+<head>
+ <title>{title}</title>
+ {css}
+</head>
+<body>
+{content}
+</body>
+</html>"#,
+        css = stylesheet_path,
+        content = processed_fragment,
+    ))
+}
+
+/// Builds an owned copy of a tag, pointing `<img src>` at its local file when one is known.
+///
+/// The tag name and every attribute that parses successfully are copied over;
+/// attributes that fail to parse are skipped. For `img` tags, a `src` whose raw value
+/// is a key of `url_path_to_local` is replaced by the mapped path expressed relative
+/// to `chapter_dir`. Every other attribute is pushed unchanged.
+fn rewrite_attributes<'a>(
+    tag_data: &BytesStart<'a>,
+    url_path_to_local: &HashMap<&str, &RelativePathBuf>,
+    chapter_dir: &RelativePath,
+) -> BytesStart<'static> {
+    let tag_name = tag_data.name();
+    // Only <img> tags are candidates for URL rewriting; decide that once per tag.
+    let is_img = tag_name.as_ref() == b"img";
+    let mut rewritten = BytesStart::new(String::from_utf8_lossy(tag_name.as_ref()).into_owned());
+
+    for attr in tag_data.attributes().flatten() {
+        // Resolve the replacement path, if this is an <img src> with a known local copy.
+        let local_src = if is_img && attr.key.as_ref() == b"src" {
+            let url = String::from_utf8_lossy(&attr.value);
+            url_path_to_local
+                .get(url.as_ref())
+                .map(|local_path| chapter_dir.relative(local_path))
+        } else {
+            None
+        };
+
+        match local_src {
+            // Inject the local path instead of the original URL.
+            Some(relative) => rewritten.push_attribute(("src", relative.as_str())),
+            // Keep all other attributes intact.
+            None => rewritten.push_attribute(attr),
+        }
+    }
+
+    rewritten
+}