# Copyright (C) 2023 github.com/ping
#
# This file is part of odmpy.
#
# odmpy is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# odmpy is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with odmpy. If not, see <http://www.gnu.org/licenses/>.
#

import argparse
import base64
import datetime
import json
import logging
import os
import re
import shutil
import xml.etree.ElementTree as ET
import zipfile
from functools import cmp_to_key
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import urlparse, urljoin

import bs4.element
import requests
from bs4 import BeautifulSoup, Doctype, Tag
from termcolor import colored
from tqdm import tqdm

from .shared import (
    generate_names,
    build_opf_package,
    extract_isbn,
    extract_authors_from_openbook,
)
from ..errors import OdmpyRuntimeError
from ..libby import USER_AGENT, LibbyClient, LibbyFormats, LibbyMediaTypes
from ..overdrive import OverDriveClient
from ..utils import slugify, is_windows, guess_mimetype

#
# Main processing logic for libby direct ebook and magazine loans
#

NAV_XHTMLTEMPLATE = """
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
<head><title></title>
<style>
    #toc { list-style-type: none; padding-left: 0; }
    #toc > li { margin-top: 0.5rem; }
</style>
</head>
<body>
<nav epub:type="toc">
<h1>Contents</h1>
<ol id="toc"></ol>
</nav>
</body>
</html>
"""


def _sort_toc(toc: List[Dict]) -> List:
    """
    Sorts the flat ToC list from the openbook into a hierarchical structure,
    grouping consecutive items that share a sectionName.

    :param toc:
    :return:
    """
    hierarchical_toc = []
    current_section = {}  # type: Dict
    for i, item in enumerate(toc, start=1):
        if not item.get("sectionName"):
            hierarchical_toc.append(item)
            continue
        if item["sectionName"] not in current_section or i == len(toc):
            # new section or last item
            if i == len(toc):
                current_section.setdefault(item["sectionName"], []).append(item)
            section_names = list(current_section.keys())
            for section_name in section_names:
                hierarchical_toc.append(
                    {
                        "sectionName": section_name,
                        "items": current_section[section_name],
                    }
                )
                del current_section[section_name]
        if i < len(toc):
            current_section.setdefault(item["sectionName"], []).append(item)

    return hierarchical_toc
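
# Illustration (hypothetical data) of how _sort_toc() folds a flat ToC:
# entries without a "sectionName" pass through unchanged, while consecutive
# entries sharing one are grouped under a single section node, e.g.
#
#   _sort_toc([
#       {"title": "Cover", "path": "cover.xhtml"},
#       {"title": "News", "path": "news.xhtml", "sectionName": "Features"},
#       {"title": "Tech", "path": "tech.xhtml", "sectionName": "Features"},
#   ])
#
# returns
#
#   [
#       {"title": "Cover", "path": "cover.xhtml"},
#       {"sectionName": "Features", "items": [<News entry>, <Tech entry>]},
#   ]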


def _build_ncx(media_info: Dict, openbook: Dict, nav_page: str) -> ET.Element:
    """
    Build the ncx from the openbook

    :param media_info:
    :param openbook:
    :param nav_page:
    :return:
    """

    # References:
    # Version 2: https://idpf.org/epub/20/spec/OPF_2.0_final_spec.html#Section2.0
    # Version 3: https://www.w3.org/TR/epub-33/#sec-package-doc

    publication_identifier = (
        extract_isbn(
            media_info["formats"],
            [LibbyFormats.EBookOverdrive, LibbyFormats.MagazineOverDrive],
        )
        or media_info["id"]
    )

    ET.register_namespace("opf", "http://www.idpf.org/2007/opf")
    ET.register_namespace("dc", "http://purl.org/dc/elements/1.1/")
    ncx = ET.Element(
        "ncx",
        attrib={
            "version": "2005-1",
            "xmlns": "http://www.daisy.org/z3986/2005/ncx/",
            "xml:lang": "en",
        },
    )

    head = ET.SubElement(ncx, "head")
    ET.SubElement(
        head, "meta", attrib={"content": publication_identifier, "name": "dtb:uid"}
    )
    doc_title = ET.SubElement(ncx, "docTitle")
    doc_title_text = ET.SubElement(doc_title, "text")
    doc_title_text.text = openbook["title"]["main"]

    doc_author = ET.SubElement(ncx, "docAuthor")
    doc_author_text = ET.SubElement(doc_author, "text")
    doc_author_text.text = openbook["creator"][0]["name"]

    nav_map = ET.SubElement(ncx, "navMap")
    hierarchical_toc = _sort_toc(openbook["nav"]["toc"])
    nav_point_counter = 0
    for item in hierarchical_toc:
        nav_point_counter += 1
        if not item.get("sectionName"):
            nav_point = ET.SubElement(
                nav_map, "navPoint", attrib={"id": f"navPoint{nav_point_counter}"}
            )
            nav_label = ET.SubElement(nav_point, "navLabel")
            nav_label_text = ET.SubElement(nav_label, "text")
            nav_label_text.text = item["title"]
            ET.SubElement(nav_point, "content", attrib={"src": item["path"]})

            if nav_point_counter == 1 and nav_page:
                nav_point_counter += 1
                nav_point = ET.SubElement(
                    nav_map,
                    "navPoint",
                    attrib={"id": f"navPoint{nav_point_counter}"},
                )
                nav_label = ET.SubElement(nav_point, "navLabel")
                nav_label_text = ET.SubElement(nav_label, "text")
                nav_label_text.text = "Contents"
                ET.SubElement(nav_point, "content", attrib={"src": nav_page})
            continue

        nav_point = ET.SubElement(
            nav_map, "navPoint", attrib={"id": f"navPoint{nav_point_counter}"}
        )
        nav_label = ET.SubElement(nav_point, "navLabel")
        nav_label_text = ET.SubElement(nav_label, "text")
        nav_label_text.text = item["sectionName"]
        # since we don't have a section content page, link section to first article path
        ET.SubElement(nav_point, "content", attrib={"src": item["items"][0]["path"]})
        for section_item in item["items"]:
            nav_point_counter += 1
            section_item_nav_point = ET.SubElement(
                nav_point, "navPoint", attrib={"id": f"navPoint{nav_point_counter}"}
            )
            section_item_nav_label = ET.SubElement(section_item_nav_point, "navLabel")
            section_item_nav_label_text = ET.SubElement(section_item_nav_label, "text")
            section_item_nav_label_text.text = section_item["title"]
            ET.SubElement(
                section_item_nav_point, "content", attrib={"src": section_item["path"]}
            )
    return ncx


def _sanitise_opf_id(string_id: str) -> str:
    """
    OPF IDs cannot start with a number

    :param string_id:
    :return:
    """
    string_id = slugify(string_id)
    if string_id[0].isdigit():
        return f"id_{string_id}"
    return string_id
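
# Illustration (assuming slugify() folds path separators and dots into
# dashes): a roster href such as "01/cover.xhtml" would yield a slug that
# starts with a digit, which is not a valid OPF/XML ID, so it gets prefixed,
# e.g. _sanitise_opf_id("01/cover.xhtml") -> "id_01-cover-xhtml".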


def _cleanup_soup(soup: BeautifulSoup, version: str = "2.0") -> None:
    """
    Tries to fix up book content pages to be epub-version compliant.

    :param soup:
    :param version:
    :return:
    """
    if version == "2.0":
        # v2 is a lot pickier about the acceptable elements and attributes
        modified_doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"'
        for item in soup.contents:
            if isinstance(item, Doctype):
                item.replace_with(Doctype(modified_doctype))
                break
        remove_attributes = [
            # this list will not be complete, but we try
            "aria-label",
            "data-loc",
            "data-epub-type",
            "data-document-status",
            "data-xml-lang",
            "lang",
            "role",
            "epub:type",
            "epub:prefix",
        ]
        for attribute in remove_attributes:
            for tag in soup.find_all(attrs={attribute: True}):
                del tag[attribute]
        convert_tags = ["nav", "section"]  # this list will not be complete, but we try
        for tag in convert_tags:
            for invalid_tag in soup.find_all(tag):
                invalid_tag.name = "div"

    # known issues, this will not be complete
    for svg in soup.find_all("svg"):
        if not svg.get("xmlns"):
            svg["xmlns"] = "http://www.w3.org/2000/svg"
        if not svg.get("xmlns:xlink"):
            svg["xmlns:xlink"] = "http://www.w3.org/1999/xlink"
    convert_tags = ["figcaption"]
    for tag in convert_tags:
        for invalid_tag in soup.find_all(tag):
            invalid_tag.name = "div"
    remove_tags = ["base"]
    for tag in remove_tags:
        for remove_tag in soup.find_all(tag):
            remove_tag.decompose()

    html_tag = soup.find("html")
    if html_tag and isinstance(html_tag, bs4.element.Tag) and not html_tag.get("xmlns"):
        html_tag["xmlns"] = "http://www.w3.org/1999/xhtml"
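
# Illustration (hypothetical markup) of the version="2.0" cleanup pass: an
# EPUB3-flavoured fragment like
#
#   <section role="doc-chapter" epub:type="chapter"><p>...</p></section>
#
# comes out as
#
#   <div><p>...</p></div>
#
# since the EPUB3-only attributes are stripped and the unsupported tag is
# downgraded to a plain <div>.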


def _sort_spine_entries(a: Dict, b: Dict, toc_pages: List[str]) -> int:
    """
    Sort the spine according to the ToC. For magazines, the sequence
    laid out in the spine sometimes does not align with the ToC,
    e.g. Mother Jones. If left unsorted, the page-through sequence
    does not match the actual ToC.

    :param a:
    :param b:
    :param toc_pages:
    :return:
    """
    try:
        a_index = toc_pages.index(a["-odread-original-path"])
    except ValueError:
        a_index = 999
    try:
        b_index = toc_pages.index(b["-odread-original-path"])
    except ValueError:
        b_index = 999

    if a_index != b_index:
        # sort order found via toc
        return -1 if a_index < b_index else 1

    return -1 if a["-odread-spine-position"] < b["-odread-spine-position"] else 1
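
# Usage sketch: this comparator is adapted with functools.cmp_to_key,
# mirroring how process_ebook_loan() sorts the spine further down:
#
#   sorted(
#       spine_entries,
#       key=cmp_to_key(lambda a, b: _sort_spine_entries(a, b, toc_pages)),
#   )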


def _sort_title_contents(a: Dict, b: Dict) -> int:
    """
    Sort the title contents roster so that html pages get processed first.
    This is a precautionary measure for getting high-res cover images
    since we must parse the html for the image src.

    :param a:
    :param b:
    :return:
    """
    extensions_rank = [
        ".xhtml",
        ".html",
        ".htm",
        ".jpg",
        ".jpeg",
        ".png",
        ".gif",
        ".ttf",  # download fonts before css so that we can check if font is available
        ".otf",
        ".css",
    ]
    a_parsed_url = urlparse(a["url"])
    b_parsed_url = urlparse(b["url"])
    a_ext = Path(a_parsed_url.path).suffix
    b_ext = Path(b_parsed_url.path).suffix
    try:
        a_index = extensions_rank.index(a_ext)
    except ValueError:
        a_index = 999
    try:
        b_index = extensions_rank.index(b_ext)
    except ValueError:
        b_index = 999

    if a_index != b_index:
        # sort order found via extensions rank
        return -1 if a_index < b_index else 1

    if a_ext != b_ext:
        return -1 if a_ext < b_ext else 1

    return -1 if a_parsed_url.path < b_parsed_url.path else 1
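
# Illustration (hypothetical roster URLs): with the extension ranking above,
#   ["https://x/css/mag.css", "https://x/news.xhtml", "https://x/fonts/a.ttf"]
# is processed as news.xhtml first (html is parsed, e.g. for the cover src),
# then a.ttf (fonts ahead of css so their availability can be checked),
# then mag.css last.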


def _filter_content(entry: Dict, media_info: Dict, toc_pages: List[str]) -> bool:
    """
    Filter out title content entries that are not needed.

    :param entry:
    :param media_info:
    :param toc_pages:
    :return:
    """
    parsed_entry_url = urlparse(entry["url"])
    media_type = guess_mimetype(parsed_entry_url.path[1:])

    if media_info["type"]["id"] == LibbyMediaTypes.Magazine and media_type:
        if media_type.startswith("image/") and (
            parsed_entry_url.path.startswith("/pages/")
            or parsed_entry_url.path.startswith("/thumbnails/")
        ):
            return False
        if (
            media_type in ("application/xhtml+xml", "text/html")
            and parsed_entry_url.path[1:] not in toc_pages
        ):
            return False

    if parsed_entry_url.path.startswith("/_d/"):  # ebooks
        return False

    return True
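
# Illustration (hypothetical magazine roster entries): full-page scan images
# under "/pages/" or "/thumbnails/" are dropped, as are html articles not
# listed in the ToC, e.g.
#
#   _filter_content({"url": "https://x/pages/0001.jpg"}, magazine_info, toc_pages)
#   -> False
#   _filter_content({"url": "https://x/news.xhtml"}, magazine_info, ["news.xhtml"])
#   -> True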


def process_ebook_loan(
    loan: Dict,
    cover_path: Optional[Path],
    openbook: Dict,
    rosters: List[Dict],
    libby_client: LibbyClient,
    args: argparse.Namespace,
    logger: logging.Logger,
) -> None:
    """
    Generates an epub directly from a Libby ebook or magazine loan.

    :param loan:
    :param cover_path:
    :param openbook:
    :param rosters:
    :param libby_client:
    :param args:
    :param logger:
    :return:
    """
    book_folder, book_file_name = generate_names(
        title=loan["title"],
        series=loan.get("series") or "",
        series_reading_order=loan.get("detailedSeries", {}).get("readingOrder", ""),
        authors=extract_authors_from_openbook(openbook),
        edition=loan.get("edition") or "",
        title_id=loan["id"],
        args=args,
        logger=logger,
    )
    epub_file_path = book_file_name.with_suffix(".epub")
    epub_version = "3.0"

    book_meta_name = "META-INF"
    book_content_name = "OEBPS"
    book_meta_folder = book_folder.joinpath(book_meta_name)
    book_content_folder = book_folder.joinpath(book_content_name)
    for d in (book_meta_folder, book_content_folder):
        if not d.exists():
            d.mkdir(parents=True, exist_ok=True)

    od_client = OverDriveClient(
        user_agent=USER_AGENT, timeout=args.timeout, retry=args.retries
    )
    media_info = od_client.media(loan["id"])

    if args.is_debug_mode:
        with book_folder.joinpath("media.json").open("w", encoding="utf-8") as f:
            json.dump(media_info, f, indent=2)

        with book_folder.joinpath("loan.json").open("w", encoding="utf-8") as f:
            json.dump(loan, f, indent=2)

        with book_folder.joinpath("rosters.json").open("w", encoding="utf-8") as f:
            json.dump(rosters, f, indent=2)

        with book_folder.joinpath("openbook.json").open("w", encoding="utf-8") as f:
            json.dump(openbook, f, indent=2)

    title_contents: Dict = next(
        iter([r for r in rosters if r["group"] == "title-content"]), {}
    )
    headers = libby_client.default_headers()
    headers["Accept"] = "*/*"
    contents_re = re.compile(r"parent\.__bif_cfc0\(self,'(?P<base64_text>.+)'\)")
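
    # Illustration (hypothetical payload): each roster content page embeds its
    # real markup in a javascript call such as
    #   parent.__bif_cfc0(self,'PGJvZHk+SGVsbG88L2JvZHk+')
    # and base64-decoding the captured group yields the actual body, here
    # "<body>Hello</body>"; the decode happens in the download loop below.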

    openbook_toc = openbook["nav"]["toc"]
    if len(openbook_toc) <= 1 and loan["type"]["id"] == LibbyMediaTypes.Magazine:
        raise OdmpyRuntimeError("Unsupported fixed-layout (pre-paginated) format.")

    # for finding the cover image for magazines
    cover_toc_item = next(
        iter(
            [
                item
                for item in openbook_toc
                if item.get("pageRange", "") == "Cover" and item.get("featureImage")
            ]
        ),
        None,
    )
    # for finding the cover image for ebooks
    cover_page_landmark = next(
        iter(
            [
                item
                for item in openbook.get("nav", {}).get("landmarks", [])
                if item["type"] == "cover"
            ]
        ),
        None,
    )
    toc_pages = [item["path"].split("#")[0] for item in openbook_toc]
    manifest_entries: List[Dict] = []

    title_content_entries = list(
        filter(
            lambda e: _filter_content(e, media_info, toc_pages),
            title_contents["entries"],
        )
    )
    # Ignoring mypy error below because of https://github.com/python/mypy/issues/9372
    title_content_entries = sorted(
        title_content_entries, key=cmp_to_key(_sort_title_contents)  # type: ignore[misc]
    )
    progress_bar = tqdm(title_content_entries, disable=args.hide_progress)
    has_ncx = False
    has_nav = False

    # Used to patch magazine css that causes paged mode in the calibre viewer
    # to not work. This expression is used to strip `overflow-x: hidden` from
    # the css definition for `#article-body`.
    patch_magazine_css_overflow_re = re.compile(
        r"(#article-body\s*\{[^{}]+?)overflow-x:\s*hidden;([^{}]+?})"
    )
    # This expression is used to strip `padding: Xem Xem;` from the css
    # definition for `#article-body` to remove the extraneous padding.
    patch_magazine_css_padding_re = re.compile(
        r"(#article-body\s*\{[^{}]+?)padding:\s*[^;]+;([^{}]+?})"
    )
    # This expression is used to patch the missing fonts specified in magazine css
    patch_magazine_css_font_re = re.compile(r"(font-family: '[^']+(Sans|Serif)[^']+';)")
    # This expression is used to strip the missing font src in magazine css
    patch_magazine_css_font_src_re = re.compile(
        r"@font-face\s*\{[^{}]+?(src:\s*url\('(fonts/.+\.ttf)'\).+?;)[^{}]+?}"
    )
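
    # Illustration (hypothetical css) of the two patches above: a rule like
    #   "#article-body { color: #000; overflow-x: hidden; padding: 1em 2em; }"
    # becomes, after patch_magazine_css_overflow_re.sub(r"\1\2", ...) and then
    # patch_magazine_css_padding_re.sub(r"\1\2", ...), effectively
    #   "#article-body { color: #000; }" (modulo leftover whitespace).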

    # holds the manifest item ID for the image identified as the cover
    cover_img_manifest_id = None

    for entry in progress_bar:
        entry_url = entry["url"]
        parsed_entry_url = urlparse(entry_url)
        title_content_path = Path(parsed_entry_url.path[1:])
        media_type = guess_mimetype(title_content_path.name)
        if not media_type:
            logger.warning("Skipped roster entry: %s", title_content_path.name)
            continue
        asset_folder = book_content_folder.joinpath(title_content_path.parent)
        if media_type == "application/x-dtbncx+xml":
            has_ncx = True
        manifest_entry = {
            "href": parsed_entry_url.path[1:],
            "id": "ncx"
            if media_type == "application/x-dtbncx+xml"
            else _sanitise_opf_id(parsed_entry_url.path[1:]),
            "media-type": media_type,
        }

        # try to find the cover image for magazines
        if cover_toc_item and manifest_entry["id"] == _sanitise_opf_id(
            cover_toc_item["featureImage"]
        ):
            # we assign it here to ensure that the image referenced in the
            # toc actually exists
            cover_img_manifest_id = manifest_entry["id"]

        if not asset_folder.exists():
            asset_folder.mkdir(parents=True, exist_ok=True)
        asset_file_path = asset_folder.joinpath(Path(parsed_entry_url.path).name)

        soup = None
        if asset_file_path.exists():
            progress_bar.set_description(f"Already saved {asset_file_path.name}")
            if media_type in ("application/xhtml+xml", "text/html"):
                with asset_file_path.open("r", encoding="utf-8") as f_asset:
                    soup = BeautifulSoup(f_asset, features="html.parser")
        else:
            progress_bar.set_description(f"Downloading {asset_file_path.name}")
            # use the libby client session because the required
            # auth cookies are set there
            res: requests.Response = libby_client.make_request(
                entry_url, headers=headers, authenticated=False, return_res=True
            )

            # patch magazine css to fix various rendering problems
            if (
                media_info["type"]["id"] == LibbyMediaTypes.Magazine
                and media_type == "text/css"
            ):
                css_content = patch_magazine_css_overflow_re.sub(r"\1\2", res.text)
                css_content = patch_magazine_css_padding_re.sub(r"\1\2", css_content)
                if "#article-body" in css_content:
                    # patch font-family declarations
                    # libby declares these font-faces but does not supply them in
                    # the roster, nor are they actually available when viewed
                    # online (http 403)
                    font_families = list(
                        set(patch_magazine_css_font_re.findall(css_content))
                    )
                    for font_family, _ in font_families:
                        new_font_css = font_family[:-1]
                        if "Serif" in font_family:
                            new_font_css += ',Charter,"Bitstream Charter","Sitka Text",Cambria,serif'
                        elif "Sans" in font_family:
                            new_font_css += ",system-ui,sans-serif"
                        new_font_css += ";"
                        if "-Bold" in font_family:
                            new_font_css += " font-weight: 700;"
                        elif "-SemiBold" in font_family:
                            new_font_css += " font-weight: 600;"
                        elif "-Light" in font_family:
                            new_font_css += " font-weight: 300;"
                        css_content = css_content.replace(font_family, new_font_css)
                else:
                    # patch font url declarations
                    # since ttf/otf files are downloaded ahead of css, we can
                    # verify if the font files are actually available
                    try:
                        font_sources = patch_magazine_css_font_src_re.findall(
                            css_content
                        )
                        for src_match, font_src in font_sources:
                            asset_font_path = Path(
                                urljoin(str(asset_file_path), font_src)
                            )
                            if not asset_font_path.exists():
                                css_content = css_content.replace(src_match, "")
                    except (
                        Exception  # noqa, pylint: disable=broad-exception-caught
                    ) as patch_err:
                        logger.warning(
                            "Error while patching font sources: %s", patch_err
                        )
                with open(asset_file_path, "w", encoding="utf-8") as f_out:
                    f_out.write(css_content)
            elif media_type in ("application/xhtml+xml", "text/html"):
                soup = BeautifulSoup(res.text, features="html.parser")
                script_ele = soup.find("script", attrs={"type": "text/javascript"})
                if script_ele and hasattr(script_ele, "string"):
                    mobj = contents_re.search(script_ele.string or "")
                    if not mobj:
                        logger.warning(
                            "Unable to extract content string for %s",
                            parsed_entry_url.path,
                        )
                    else:
                        new_soup = BeautifulSoup(
                            base64.b64decode(mobj.group("base64_text")),
                            features="html.parser",
                        )
                        soup.body.replace_with(new_soup.body)  # type: ignore[arg-type,union-attr]
                _cleanup_soup(soup, version=epub_version)
                if (
                    cover_toc_item
                    and cover_toc_item.get("featureImage")
                    and manifest_entry["id"] == _sanitise_opf_id(cover_toc_item["path"])
                ):
                    img_src = os.path.relpath(
                        book_content_folder.joinpath(cover_toc_item["featureImage"]),
                        start=asset_folder,
                    )
                    if is_windows():
                        img_src = Path(img_src).as_posix()
                    # patch the svg-based cover for magazines
                    cover_svg = soup.find("svg")
                    if cover_svg:
                        # replace the svg ele with a simple image tag
                        cover_svg.decompose()  # type: ignore[union-attr]
                        for c in soup.body.find_all(recursive=False):  # type: ignore[union-attr]
                            c.decompose()
                        soup.body.append(  # type: ignore[union-attr]
                            soup.new_tag("img", attrs={"src": img_src, "alt": "Cover"})
                        )
                        style_ele = soup.new_tag("style")
                        style_ele.append(
                            "img { max-width: 100%; margin-left: auto; margin-right: auto; }"
                        )
                        soup.head.append(style_ele)  # type: ignore[union-attr]

                with open(asset_file_path, "w", encoding="utf-8") as f_out:
                    f_out.write(str(soup))
            else:
                with open(asset_file_path, "wb") as f_out:
                    f_out.write(res.content)

        if soup:
            # try to minimise soup searches where possible
            if (
                (not cover_img_manifest_id)
                and cover_page_landmark
                and cover_page_landmark["path"] == parsed_entry_url.path[1:]
            ):
                # try to find the book cover image from the cover html content
                cover_image = soup.find("img", attrs={"src": True})
                if cover_image:
                    cover_img_manifest_id = _sanitise_opf_id(
                        urljoin(cover_page_landmark["path"], cover_image["src"])  # type: ignore[index]
                    )
            elif (not has_nav) and soup.find(attrs={"epub:type": "toc"}):
                # identify nav page
                manifest_entry["properties"] = "nav"
                has_nav = True
            elif soup.find("svg"):
                # page has svg
                manifest_entry["properties"] = "svg"

        if cover_img_manifest_id == manifest_entry["id"]:
            manifest_entry["properties"] = "cover-image"
        manifest_entries.append(manifest_entry)
        if manifest_entry.get("properties") == "cover-image" and cover_path:
            # replace the cover image already downloaded via the OD api,
            # in case it is to be kept
            shutil.copyfile(asset_file_path, cover_path)

    if not has_nav:
        # generate nav - needed for magazines

        # we give the nav an id-stamped file name to avoid accidentally
        # overwriting an existing file name
        nav_file_name = f'nav_{loan["id"]}.xhtml'

        nav_soup = BeautifulSoup(NAV_XHTMLTEMPLATE, features="html.parser")
        nav_soup.find("title").append(loan["title"])  # type: ignore[union-attr]
        toc_ele = nav_soup.find(id="toc")

        # sort toc into hierarchical sections
        hierarchical_toc = _sort_toc(openbook_toc)
        for item in hierarchical_toc:
            li_ele = nav_soup.new_tag("li")
            if not item.get("sectionName"):
                a_ele = nav_soup.new_tag("a", attrs={"href": item["path"]})
                a_ele.append(item["title"])
                li_ele.append(a_ele)
                toc_ele.append(li_ele)  # type: ignore[union-attr]
                continue
            # since we don't have a section content page, and this can cause
            # problems, link the section to the first article path
            a_ele = nav_soup.new_tag("a", attrs={"href": item["items"][0]["path"]})
            a_ele.append(item["sectionName"])
            li_ele.append(a_ele)
            ol_ele = nav_soup.new_tag("ol", attrs={"type": "1"})
            for section_item in item.get("items", []):
                section_li_ele = nav_soup.new_tag("li")
                section_item_a_ele = nav_soup.new_tag(
                    "a", attrs={"href": section_item["path"]}
                )
                section_item_a_ele.append(section_item["title"])
                section_li_ele.append(section_item_a_ele)
                ol_ele.append(section_li_ele)
            li_ele.append(ol_ele)
            toc_ele.append(li_ele)  # type: ignore[union-attr]

        with book_content_folder.joinpath(nav_file_name).open(
            "w", encoding="utf-8"
        ) as f_nav:
            f_nav.write(str(nav_soup).strip())
        manifest_entries.append(
            {
                "href": nav_file_name,
                "id": _sanitise_opf_id(nav_file_name),
                "media-type": "application/xhtml+xml",
                "properties": "nav",
            }
        )

    if not has_ncx:
        # generate ncx for backward compatibility
        ncx = _build_ncx(media_info, openbook, nav_file_name if not has_nav else "")
        # we give the ncx an id-stamped file name to avoid accidentally
        # overwriting an existing file name
        toc_ncx_name = f'toc_{loan["id"]}.ncx'
        tree = ET.ElementTree(ncx)
        tree.write(
            book_content_folder.joinpath(toc_ncx_name),
            xml_declaration=True,
            encoding="utf-8",
        )
        manifest_entries.append(
            {
                "href": toc_ncx_name,
                "id": "ncx",
                "media-type": "application/x-dtbncx+xml",
            }
        )
        has_ncx = True
    else:
        # EPUB3 compliance: ensure that the identifier in the ncx matches the
        # one in the OPF. A mismatch can happen when the toc.ncx is supplied
        # by the publisher.
        ncx_manifest_entry = next(
            iter([m for m in manifest_entries if m["id"] == "ncx"]), None
        )
        if ncx_manifest_entry:
            expected_book_identifier = (
                extract_isbn(
                    media_info["formats"],
                    format_types=[
                        LibbyFormats.MagazineOverDrive
                        if loan["type"]["id"] == LibbyMediaTypes.Magazine
                        else LibbyFormats.EBookOverdrive
                    ],
                )
                or media_info["id"]
            )  # this is the summarised logic from build_opf_package
            ncx_path = book_content_folder.joinpath(ncx_manifest_entry["href"])
            new_ncx_contents = None
            with ncx_path.open("r", encoding="utf-8") as ncx_f:
                ncx_soup = BeautifulSoup(ncx_f, features="xml")
                meta_id = ncx_soup.find("meta", attrs={"name": "dtb:uid"})
                if (
                    meta_id
                    and isinstance(meta_id, Tag)
                    and meta_id.get("content")
                    and meta_id["content"] != expected_book_identifier
                ):
                    logger.debug(
                        'Replacing identifier in %s: "%s" -> "%s"',
                        ncx_path.name,
                        meta_id["content"],
                        expected_book_identifier,
                    )
                    meta_id["content"] = expected_book_identifier
                    new_ncx_contents = str(ncx_soup)
            if new_ncx_contents:
                with ncx_path.open("w", encoding="utf-8") as ncx_f:
                    ncx_f.write(new_ncx_contents)

    # create epub OPF
    opf_file_name = "package.opf"
    opf_file_path = book_content_folder.joinpath(opf_file_name)
    package = build_opf_package(
        media_info,
        version=epub_version,
        loan_format=LibbyFormats.MagazineOverDrive
        if loan["type"]["id"] == LibbyMediaTypes.Magazine
        else LibbyFormats.EBookOverdrive,
    )
    if args.generate_opf:
        # save the opf before the manifest and spine elements get added
        # because those elements are meaningless outside an epub
        export_opf_file = epub_file_path.with_suffix(".opf")
        ET.ElementTree(package).write(
            export_opf_file, xml_declaration=True, encoding="utf-8"
        )
        logger.info('Saved "%s"', colored(str(export_opf_file), "magenta"))

    # add manifest
    manifest = ET.SubElement(package, "manifest")
    for entry in manifest_entries:
        ET.SubElement(manifest, "item", attrib=entry)

    cover_manifest_entry = next(
        iter(
            [
                entry
                for entry in manifest_entries
                if entry.get("properties", "") == "cover-image"
            ]
        ),
        None,
    )
    if not cover_manifest_entry:
        cover_img_manifest_id = None
    if cover_path and not cover_manifest_entry:
        # add the cover image separately since we can't identify which item
        # is the cover; we give the cover a timestamped file name to avoid
        # accidentally overwriting an existing file name
        cover_image_name = f"cover_{int(datetime.datetime.now().timestamp())}.jpg"
        shutil.copyfile(cover_path, book_content_folder.joinpath(cover_image_name))
        cover_img_manifest_id = "coverimage"
        ET.SubElement(
            manifest,
            "item",
            attrib={
                "id": cover_img_manifest_id,
                "href": cover_image_name,
                "media-type": "image/jpeg",
                "properties": "cover-image",
            },
        )

    if cover_img_manifest_id:
        metadata = package.find("metadata")
        if metadata:
            _ = ET.SubElement(
                metadata,
                "meta",
                attrib={"name": "cover", "content": cover_img_manifest_id},
            )

    # add spine
    spine = ET.SubElement(package, "spine")
    if has_ncx:
        spine.set("toc", "ncx")
    spine_entries = list(
        filter(
            lambda s: not (
                media_info["type"]["id"] == LibbyMediaTypes.Magazine
                and s["-odread-original-path"] not in toc_pages
            ),
            openbook["spine"],
        )
    )

    # Ignoring mypy error below because of https://github.com/python/mypy/issues/9372
    spine_entries = sorted(
        spine_entries, key=cmp_to_key(lambda a, b: _sort_spine_entries(a, b, toc_pages))  # type: ignore[misc]
    )
    for spine_idx, entry in enumerate(spine_entries):
        if (
            media_info["type"]["id"] == LibbyMediaTypes.Magazine
            and entry["-odread-original-path"] not in toc_pages
        ):
            continue
        item_ref = ET.SubElement(spine, "itemref")
        item_ref.set("idref", _sanitise_opf_id(entry["-odread-original-path"]))
        if spine_idx == 0 and not has_nav:
            item_ref = ET.SubElement(spine, "itemref")
            item_ref.set("idref", _sanitise_opf_id(nav_file_name))

    # add guide
    if openbook.get("nav", {}).get("landmarks"):
        guide = ET.SubElement(package, "guide")
        for landmark in openbook["nav"]["landmarks"]:
            _ = ET.SubElement(
                guide,
                "reference",
                attrib={
                    "href": landmark["path"],
                    "title": landmark["title"],
                    "type": landmark["type"],
                },
            )

    if args.is_debug_mode:
        from xml.dom import minidom

        with opf_file_path.open("w", encoding="utf-8") as f:
            f.write(
                minidom.parseString(ET.tostring(package, "utf-8")).toprettyxml(
                    indent="\t"
                )
            )
    else:
        tree = ET.ElementTree(package)
        tree.write(opf_file_path, xml_declaration=True, encoding="utf-8")
    logger.debug('Saved "%s"', opf_file_path)

    # create container.xml
    container_file_path = book_meta_folder.joinpath("container.xml")
    container = ET.Element(
        "container",
        attrib={
            "version": "1.0",
            "xmlns": "urn:oasis:names:tc:opendocument:xmlns:container",
        },
    )
    root_files = ET.SubElement(container, "rootfiles")
    _ = ET.SubElement(
        root_files,
        "rootfile",
        attrib={
            # use a posix path because ZipFile requires "/" separators
            "full-path": Path(book_content_name, opf_file_name).as_posix(),
            "media-type": "application/oebps-package+xml",
        },
    )
    tree = ET.ElementTree(container)
    tree.write(container_file_path, xml_declaration=True, encoding="utf-8")
    logger.debug('Saved "%s"', container_file_path)

    # create the epub zip
    with zipfile.ZipFile(
        epub_file_path, mode="w", compression=zipfile.ZIP_DEFLATED
    ) as epub_zip:
        epub_zip.writestr(
            "mimetype", "application/epub+zip", compress_type=zipfile.ZIP_STORED
        )
        for root_start in (book_meta_folder, book_content_folder):
            for p in root_start.glob("**/*"):
                if p.is_dir():
                    continue
                zip_archive_file = p.relative_to(book_folder)
                # using a posix path because zipfile requires "/" separators
                # and may break on Windows otherwise
                zip_archive_name = zip_archive_file.as_posix()
                zip_target_file = book_folder.joinpath(zip_archive_file)
                epub_zip.write(zip_target_file, zip_archive_name)
                logger.debug(
                    'epub: Added "%s" as "%s"', zip_target_file, zip_archive_name
                )
    logger.info('Saved "%s"', colored(str(epub_file_path), "magenta", attrs=["bold"]))

    # clean up
    if not args.is_debug_mode:
        for file_name in (
            "mimetype",
            "media.json",
            "openbook.json",
            "loan.json",
            "rosters.json",
        ):
            target = book_folder.joinpath(file_name)
            if target.exists():
                target.unlink()
        for folder in (book_content_folder, book_meta_folder):
            shutil.rmtree(folder, ignore_errors=True)