aboutsummaryrefslogtreecommitdiff
path: root/src/xml.rs
diff options
context:
space:
mode:
authorA Farzat <a@farzat.xyz>2026-03-07 09:04:00 +0300
committerA Farzat <a@farzat.xyz>2026-03-07 09:07:38 +0300
commitc98b360d611c37315c9c5330089a1d91dbb2021c (patch)
treed190740bb24df7aaafe09e914aa2f1314dd70988 /src/xml.rs
parent7e24d0cd5b671d06383466baf89c340023421d86 (diff)
downloadoreilly-epub-c98b360d611c37315c9c5330089a1d91dbb2021c.tar.gz
oreilly-epub-c98b360d611c37315c9c5330089a1d91dbb2021c.zip
Fix chapter contents to proper xml
Change resource URLs using attribute matching (only img src for now, should add more later). Add respective stylesheets to chapters.
Diffstat (limited to 'src/xml.rs')
-rw-r--r--src/xml.rs137
1 files changed, 137 insertions, 0 deletions
diff --git a/src/xml.rs b/src/xml.rs
new file mode 100644
index 0000000..9b11e9a
--- /dev/null
+++ b/src/xml.rs
@@ -0,0 +1,137 @@
+use anyhow::Result;
+use quick_xml::events::{BytesStart, Event};
+use quick_xml::{Reader, Writer};
+use relative_path::{RelativePath, RelativePathBuf};
+use std::collections::HashMap;
+use std::io::Cursor;
+
+use crate::models::{Chapter, EpubResponse};
+
/// Reports whether `name` is an HTML void element, i.e. one that never has a closing tag.
///
/// HTML tag names are case-insensitive, so the comparison ignores ASCII case:
/// `br`, `BR` and `Br` are all recognised. Without this, an upper-case void tag
/// coming from an HTML fragment would be written out unclosed and produce
/// malformed XML.
fn is_html_void_tag(name: &[u8]) -> bool {
    /// Void elements accepted by this module (lower-case canonical spelling).
    const VOID_TAGS: [&[u8]; 14] = [
        b"area", b"base", b"br", b"col", b"embed", b"hr", b"img", b"input", b"link", b"meta",
        b"param", b"source", b"track", b"wbr",
    ];

    VOID_TAGS
        .iter()
        .any(|tag| name.eq_ignore_ascii_case(tag))
}
+
+/// Processes the fragment and outputs a complete, EPUB-ready XHTML document.
+pub fn build_epub_chapter(
+ epub_data: &EpubResponse,
+ chapter: &Chapter,
+ chapter_dir: &RelativePath,
+ fragment: &str,
+ stylesheet_path: &str,
+ url_path_to_local: &HashMap<&str, &RelativePathBuf>,
+) -> Result<String> {
+ // Setup the XML Reader and Writer.
+ let mut reader = Reader::from_str(fragment);
+ // Preserve spacing for EPUB text formatting.
+ reader.config_mut().trim_text(false);
+ // Fragments could have unmatched tags - tell the parser not to panic if so.
+ reader.config_mut().check_end_names = false;
+ let mut writer = Writer::new(Cursor::new(Vec::new()));
+
+ // Loop through the XML events and rewrite tags.
+ loop {
+ match reader.read_event() {
+ Ok(Event::Start(tag_data)) => {
+ // If it is a void tag, convert it to a self-closing XML tag.
+ let tag_type = if is_html_void_tag(tag_data.name().as_ref()) {
+ Event::Empty
+ } else {
+ Event::Start
+ };
+ writer.write_event(tag_type(rewrite_attributes(
+ &tag_data,
+ url_path_to_local,
+ chapter_dir,
+ )))?;
+ }
+ Ok(Event::Empty(tag_data)) => {
+ // If tags are already empty, leave them as-is.
+ writer.write_event(Event::Empty(rewrite_attributes(
+ &tag_data,
+ url_path_to_local,
+ chapter_dir,
+ )))?;
+ }
+ Ok(Event::End(tag_data)) => {
+ // Silently drop closing tags for void elements if they exist (e.g. <img></img>).
+ if !is_html_void_tag(tag_data.name().as_ref()) {
+ writer.write_event(Event::End(tag_data))?;
+ }
+ }
+ Ok(Event::Eof) => break,
+ Ok(tag_data) => writer.write_event(tag_data)?, // Pass through text, comments, etc. unmodified.
+ Err(e) => anyhow::bail!(e),
+ }
+ }
+
+ // Extract the modified fragment
+ let processed_fragment = String::from_utf8(writer.into_inner().into_inner())?;
+
+ // Wrap in EPUB XHTML Boilerplate.
+ // EPUBs strictly require the w3 and idpf namespaces to validate properly.
+ let full_xhtml = format!(
+ r#"<?xml version="1.0" encoding="utf-8"?>
+<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" lang="{lang}" xml:lang="{lang}">
+<head>
+ <title>{title}</title>
+ {css}
+</head>
+<body>
+{content}
+</body>
+</html>"#,
+ lang = epub_data.language,
+ title = chapter.title,
+ css = stylesheet_path,
+ content = processed_fragment,
+ );
+
+ Ok(full_xhtml)
+}
+
+/// Helper function to inspect tags and rewrite the elements' attributes.
+fn rewrite_attributes<'a>(
+ tag_data: &BytesStart<'a>,
+ url_path_to_local: &HashMap<&str, &RelativePathBuf>,
+ chapter_dir: &RelativePath,
+) -> BytesStart<'static> {
+ let name = String::from_utf8_lossy(tag_data.name().as_ref()).into_owned();
+ let mut new_elem = BytesStart::new(name);
+
+ for attr in tag_data.attributes().filter_map(Result::ok) {
+ let key = attr.key.as_ref();
+
+ // Intercept <img> tags with a "src" attribute.
+ if tag_data.name().as_ref() == b"img" && key == b"src" {
+ let url = String::from_utf8_lossy(&attr.value);
+
+ // If we have a local path, inject it instead of the absolute URL.
+ if let Some(local_path) = url_path_to_local.get(url.as_ref()) {
+ new_elem.push_attribute(("src", chapter_dir.relative(local_path).as_str()));
+ continue;
+ }
+ }
+
+ // Keep all other attributes intact.
+ new_elem.push_attribute(attr);
+ }
+
+ new_elem
+}