aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorA Farzat <a@farzat.xyz>2026-03-07 09:04:00 +0300
committerA Farzat <a@farzat.xyz>2026-03-07 09:07:38 +0300
commitc98b360d611c37315c9c5330089a1d91dbb2021c (patch)
treed190740bb24df7aaafe09e914aa2f1314dd70988 /src
parent7e24d0cd5b671d06383466baf89c340023421d86 (diff)
downloadoreilly-epub-c98b360d611c37315c9c5330089a1d91dbb2021c.tar.gz
oreilly-epub-c98b360d611c37315c9c5330089a1d91dbb2021c.zip
Convert chapter contents to proper XML
Rewrite resource URLs using attribute matching (only img src for now; more attributes should be added later). Add the respective stylesheets to each chapter.
Diffstat (limited to 'src')
-rw-r--r--src/epub.rs33
-rw-r--r--src/main.rs3
-rw-r--r--src/models.rs2
-rw-r--r--src/xml.rs137
4 files changed, 165 insertions, 10 deletions
diff --git a/src/epub.rs b/src/epub.rs
index 946c9ee..5383251 100644
--- a/src/epub.rs
+++ b/src/epub.rs
@@ -1,4 +1,7 @@
-use crate::models::{Chapter, FileEntry};
+use crate::{
+ models::{Chapter, EpubResponse, FileEntry},
+ xml::build_epub_chapter,
+};
use anyhow::{Context, Result};
use relative_path::{RelativePath, RelativePathBuf};
use reqwest::Client;
@@ -65,6 +68,7 @@ pub async fn download_all_files(
/// Creates the EPUB archive (creates zip and includes all files in it).
pub fn create_epub_archive(
+ epub_data: &EpubResponse,
epub_root: &Path,
output_epub: &Path,
file_entries: &[FileEntry],
@@ -106,18 +110,29 @@ pub fn create_epub_archive(
let mut buffer = Vec::new();
src_file.read_to_end(&mut buffer)?;
if let Some(chapter) = chapters.get(&entry.ourn) {
- let stylesheet_entries = chapter
+ let chapter_dir = entry.full_path.parent().unwrap_or(RelativePath::new(""));
+ let stylesheet_links = chapter
.related_assets
.stylesheets
.iter()
.filter_map(|u| url_to_file.get(u))
- .collect::<Vec<_>>();
- let mut html = String::from_utf8(buffer)?;
- let chapter_dir = entry.full_path.parent().unwrap_or(RelativePath::new(""));
- for (url_path, local_path) in &url_path_to_local {
- let rel_path = chapter_dir.relative(local_path);
- html = html.replace(url_path, rel_path.as_str());
- }
+ .map(|e| {
+ format!(
+ "<link rel=\"stylesheet\" type=\"{}\" href=\"{}\" />\n",
+ e.media_type,
+ chapter_dir.relative(&e.full_path)
+ )
+ })
+ .collect::<String>();
+ let html = String::from_utf8(buffer)?;
+ let html = build_epub_chapter(
+ epub_data,
+ chapter,
+ chapter_dir,
+ &html,
+ &stylesheet_links,
+ &url_path_to_local,
+ )?;
zip.write_all(html.as_bytes())?;
} else {
zip.write_all(&buffer)?;
diff --git a/src/main.rs b/src/main.rs
index 80f81e4..00e38a5 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,6 +1,7 @@
mod epub;
mod http_client;
mod models;
+mod xml;
use std::collections::HashMap;
use std::path::Path;
@@ -119,7 +120,7 @@ async fn main() -> Result<()> {
download_all_files(&client, &file_entries, dest_root).await?;
let epub_path = format!("Books/{0}/{0}.epub", args.bookid);
let epub_path = Path::new(&epub_path);
- create_epub_archive(dest_root, epub_path, &file_entries, &chapters)?;
+ create_epub_archive(&epub_data, dest_root, epub_path, &file_entries, &chapters)?;
Ok(())
}
diff --git a/src/models.rs b/src/models.rs
index 704ad84..991224c 100644
--- a/src/models.rs
+++ b/src/models.rs
@@ -18,12 +18,14 @@ pub struct EpubResponse {
pub files: String, // This is a URL to the resource files
pub spine: String, // This is a URL to the spine list
pub table_of_contents: String, // This is a URL to the table of contents
+ pub language: String,
}
/// Model for chapters API.
#[derive(Debug, Deserialize)]
pub struct Chapter {
pub ourn: String,
+ pub title: String,
pub is_skippable: bool,
pub related_assets: ChapRelAssets,
}
diff --git a/src/xml.rs b/src/xml.rs
new file mode 100644
index 0000000..9b11e9a
--- /dev/null
+++ b/src/xml.rs
@@ -0,0 +1,137 @@
+use anyhow::Result;
+use quick_xml::events::{BytesStart, Event};
+use quick_xml::{Reader, Writer};
+use relative_path::{RelativePath, RelativePathBuf};
+use std::collections::HashMap;
+use std::io::Cursor;
+
+use crate::models::{Chapter, EpubResponse};
+
+/// Reports whether `name` is an HTML void element, i.e. one that never takes a closing tag.
+///
+/// The check is an exact, case-sensitive byte comparison against the lowercase tag names
+/// listed below; any other input (including an empty name) yields `false`.
+fn is_html_void_tag(name: &[u8]) -> bool {
+    // Void elements recognised by this module.
+    const VOID_TAGS: &[&[u8]] = &[
+        b"area", b"base", b"br", b"col", b"embed", b"hr", b"img", b"input", b"link", b"meta",
+        b"param", b"source", b"track", b"wbr",
+    ];
+
+    VOID_TAGS.iter().any(|tag| *tag == name)
+}
+
+/// Processes the fragment and outputs a complete, EPUB-ready XHTML document.
+///
+/// The fragment is streamed through a `quick_xml` reader/writer pair. Along the way:
+/// - opening tags of HTML void elements (see `is_html_void_tag`) are emitted as self-closing tags,
+/// - closing tags of void elements are dropped,
+/// - every opening/empty tag has its attributes passed through `rewrite_attributes`,
+/// - all other events (text, comments, ...) are written back unchanged.
+///
+/// The rewritten fragment is then wrapped in XHTML boilerplate.
+///
+/// # Parameters
+/// - `epub_data`: supplies `language`, written to the `lang` and `xml:lang` attributes.
+/// - `chapter`: supplies `title`, written to the `<title>` element.
+/// - `chapter_dir`: directory that rewritten resource paths are made relative to.
+/// - `fragment`: the chapter markup to process.
+/// - `stylesheet_path`: inserted verbatim into `<head>`; despite its name it must already be
+///   well-formed markup (the visible caller passes pre-built `<link>` elements), so it is NOT
+///   escaped here.
+/// - `url_path_to_local`: lookup table forwarded to `rewrite_attributes`.
+///
+/// # Errors
+/// Returns an error if the reader reports a parse error, if writing an event fails, or if the
+/// rewritten fragment is not valid UTF-8.
+pub fn build_epub_chapter(
+    epub_data: &EpubResponse,
+    chapter: &Chapter,
+    chapter_dir: &RelativePath,
+    fragment: &str,
+    stylesheet_path: &str,
+    url_path_to_local: &HashMap<&str, &RelativePathBuf>,
+) -> Result<String> {
+    // Setup the XML Reader and Writer.
+    let mut reader = Reader::from_str(fragment);
+    // Preserve spacing for EPUB text formatting.
+    reader.config_mut().trim_text(false);
+    // Fragments could have unmatched tags - tell the parser not to report an error if so.
+    reader.config_mut().check_end_names = false;
+    let mut writer = Writer::new(Cursor::new(Vec::new()));
+
+    // Loop through the XML events and rewrite tags.
+    loop {
+        match reader.read_event() {
+            Ok(Event::Start(tag_data)) => {
+                let rewritten = rewrite_attributes(&tag_data, url_path_to_local, chapter_dir);
+                // If it is a void tag, convert it to a self-closing XML tag.
+                let event = if is_html_void_tag(tag_data.name().as_ref()) {
+                    Event::Empty(rewritten)
+                } else {
+                    Event::Start(rewritten)
+                };
+                writer.write_event(event)?;
+            }
+            Ok(Event::Empty(tag_data)) => {
+                // If tags are already empty, keep them empty (attributes are still rewritten).
+                writer.write_event(Event::Empty(rewrite_attributes(
+                    &tag_data,
+                    url_path_to_local,
+                    chapter_dir,
+                )))?;
+            }
+            Ok(Event::End(tag_data)) => {
+                // Silently drop closing tags for void elements if they exist (e.g. <img></img>).
+                if !is_html_void_tag(tag_data.name().as_ref()) {
+                    writer.write_event(Event::End(tag_data))?;
+                }
+            }
+            Ok(Event::Eof) => break,
+            // Pass through text, comments, etc. unmodified.
+            Ok(other) => writer.write_event(other)?,
+            Err(e) => anyhow::bail!(e),
+        }
+    }
+
+    // Extract the modified fragment.
+    let processed_fragment = String::from_utf8(writer.into_inner().into_inner())?;
+
+    // The language and title are interpolated into markup, so characters such as `&`, `<` or `"`
+    // must be escaped or the resulting document is not well-formed XML.
+    // NOTE(review): assumes both fields hold plain text rather than pre-escaped markup - confirm
+    // against the API payload.
+    let lang = quick_xml::escape::escape(epub_data.language.as_str());
+    let title = quick_xml::escape::escape(chapter.title.as_str());
+
+    // Wrap in EPUB XHTML Boilerplate.
+    // EPUBs strictly require the w3 and idpf namespaces to validate properly.
+    let full_xhtml = format!(
+        r#"<?xml version="1.0" encoding="utf-8"?>
+<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" lang="{lang}" xml:lang="{lang}">
+<head>
+ <title>{title}</title>
+ {css}
+</head>
+<body>
+{content}
+</body>
+</html>"#,
+        lang = lang,
+        title = title,
+        css = stylesheet_path,
+        content = processed_fragment,
+    );
+
+    Ok(full_xhtml)
+}
+
+/// Helper function to inspect tags and rewrite the elements' attributes.
+///
+/// Builds a new, owned start tag with the same name as `tag_data`. For `<img>` elements, a `src`
+/// attribute whose (unescaped) value is a key of `url_path_to_local` is replaced by the mapped
+/// local path expressed relative to `chapter_dir`. Every other attribute is copied as-is.
+///
+/// Attributes that the parser reports as malformed are skipped rather than treated as errors,
+/// because this helper has no error channel.
+fn rewrite_attributes<'a>(
+    tag_data: &BytesStart<'a>,
+    url_path_to_local: &HashMap<&str, &RelativePathBuf>,
+    chapter_dir: &RelativePath,
+) -> BytesStart<'static> {
+    let name = String::from_utf8_lossy(tag_data.name().as_ref()).into_owned();
+    let mut new_elem = BytesStart::new(name);
+    // Only <img> elements are rewritten for now; decide once instead of once per attribute.
+    let is_img = tag_data.name().as_ref() == b"img";
+
+    for attr in tag_data.attributes().filter_map(Result::ok) {
+        // Intercept the "src" attribute of <img> tags.
+        if is_img && attr.key.as_ref() == b"src" {
+            let raw_url = String::from_utf8_lossy(&attr.value);
+            // The raw attribute value is still XML-escaped (e.g. `&amp;`), so unescape it before
+            // the lookup; if unescaping fails, fall back to the raw text.
+            let unescaped = quick_xml::escape::unescape(&raw_url);
+            let url: &str = unescaped.as_deref().unwrap_or(&raw_url);
+
+            // If we have a local path, inject it instead of the absolute URL.
+            if let Some(local_path) = url_path_to_local.get(url) {
+                new_elem.push_attribute(("src", chapter_dir.relative(local_path).as_str()));
+                continue;
+            }
+        }
+
+        // Keep all other attributes intact.
+        new_elem.push_attribute(attr);
+    }
+
+    new_elem
+}