Coverage for odmpy/processing/ebook.py: 87.1%

428 statements  

coverage.py v7.3.1, created at 2023-09-14 08:51 +0000

# Copyright (C) 2023 github.com/ping
#
# This file is part of odmpy.
#
# odmpy is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# odmpy is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with odmpy. If not, see <http://www.gnu.org/licenses/>.
#

import argparse
import base64
import datetime
import json
import logging
import os
import re
import shutil
import xml.etree.ElementTree as ET
import zipfile
from functools import cmp_to_key
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import urlparse, urljoin

import bs4.element
import requests
from bs4 import BeautifulSoup, Doctype, Tag
from termcolor import colored
from tqdm import tqdm

from .shared import (
    generate_names,
    build_opf_package,
    extract_isbn,
    extract_authors_from_openbook,
)
from ..errors import OdmpyRuntimeError
from ..libby import USER_AGENT, LibbyClient, LibbyFormats, LibbyMediaTypes
from ..overdrive import OverDriveClient
from ..utils import slugify, is_windows, guess_mimetype

#
# Main processing logic for libby direct ebook and magazine loans
#

NAV_XHTMLTEMPLATE = """
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
<head><title></title>
<style>
    #toc { list-style-type: none; padding-left: 0; }
    #toc > li { margin-top: 0.5rem; }
</style>
</head>
<body>
<nav epub:type="toc">
<h1>Contents</h1>
<ol id="toc"></ol>
</nav>
</body>
</html>
"""


def _sort_toc(toc: List[Dict]) -> List:
    """
    Sorts the flat ToC list from the openbook into a hierarchical structure
    by grouping together consecutive entries that share a "sectionName".

    :param toc: the "toc" list from openbook["nav"]
    :return: list of ToC entries, with sectioned entries nested under their section
    """
    hierarchical_toc = []
    current_section = {}  # type: Dict
    for i, item in enumerate(toc, start=1):
        if not item.get("sectionName"):
            hierarchical_toc.append(item)
            continue
        if item["sectionName"] not in current_section or i == len(toc):
            # new section or last item
            if i == len(toc):
                current_section.setdefault(item["sectionName"], []).append(item)
            section_names = list(current_section.keys())
            for section_name in section_names:
                hierarchical_toc.append(
                    {
                        "sectionName": section_name,
                        "items": current_section[section_name],
                    }
                )
                del current_section[section_name]
        if i < len(toc):
            current_section.setdefault(item["sectionName"], []).append(item)

    return hierarchical_toc
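
# Illustrative example (not part of the module): _sort_toc groups consecutive
# entries sharing a "sectionName" under a single section entry, e.g.
#   _sort_toc([
#       {"title": "Cover", "path": "cover.xhtml"},
#       {"title": "News", "path": "news.xhtml", "sectionName": "Features"},
#       {"title": "Tech", "path": "tech.xhtml", "sectionName": "Features"},
#   ])
# returns
#   [
#       {"title": "Cover", "path": "cover.xhtml"},
#       {"sectionName": "Features", "items": [<News entry>, <Tech entry>]},
#   ]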


def _build_ncx(media_info: Dict, openbook: Dict, nav_page: str) -> ET.Element:
    """
    Builds the ncx from the openbook.

    :param media_info: media JSON for the title from the OverDrive API
    :param openbook: the loan's openbook JSON
    :param nav_page: file name of the generated nav page, if any
    :return: the root "ncx" element
    """

    # References:
    # Version 2: https://idpf.org/epub/20/spec/OPF_2.0_final_spec.html#Section2.0
    # Version 3: https://www.w3.org/TR/epub-33/#sec-package-doc

    publication_identifier = (
        extract_isbn(
            media_info["formats"],
            [LibbyFormats.EBookOverdrive, LibbyFormats.MagazineOverDrive],
        )
        or media_info["id"]
    )

    ET.register_namespace("opf", "http://www.idpf.org/2007/opf")
    ET.register_namespace("dc", "http://purl.org/dc/elements/1.1/")
    ncx = ET.Element(
        "ncx",
        attrib={
            "version": "2005-1",
            "xmlns": "http://www.daisy.org/z3986/2005/ncx/",
            "xml:lang": "en",
        },
    )

    head = ET.SubElement(ncx, "head")
    ET.SubElement(
        head, "meta", attrib={"content": publication_identifier, "name": "dtb:uid"}
    )
    doc_title = ET.SubElement(ncx, "docTitle")
    doc_title_text = ET.SubElement(doc_title, "text")
    doc_title_text.text = openbook["title"]["main"]

    doc_author = ET.SubElement(ncx, "docAuthor")
    doc_author_text = ET.SubElement(doc_author, "text")
    doc_author_text.text = openbook["creator"][0]["name"]

    nav_map = ET.SubElement(ncx, "navMap")
    hierarchical_toc = _sort_toc(openbook["nav"]["toc"])
    nav_point_counter = 0
    for item in hierarchical_toc:
        nav_point_counter += 1
        if not item.get("sectionName"):
            nav_point = ET.SubElement(
                nav_map, "navPoint", attrib={"id": f"navPoint{nav_point_counter}"}
            )
            nav_label = ET.SubElement(nav_point, "navLabel")
            nav_label_text = ET.SubElement(nav_label, "text")
            nav_label_text.text = item["title"]
            ET.SubElement(nav_point, "content", attrib={"src": item["path"]})

            # insert a navPoint for the generated nav page right after the first entry
            if nav_point_counter == 1 and nav_page:
                nav_point_counter += 1
                nav_point = ET.SubElement(
                    nav_map, "navPoint", attrib={"id": f"navPoint{nav_point_counter}"}
                )
                nav_label = ET.SubElement(nav_point, "navLabel")
                nav_label_text = ET.SubElement(nav_label, "text")
                nav_label_text.text = "Contents"
                ET.SubElement(nav_point, "content", attrib={"src": nav_page})
            continue

        nav_point = ET.SubElement(
            nav_map, "navPoint", attrib={"id": f"navPoint{nav_point_counter}"}
        )
        nav_label = ET.SubElement(nav_point, "navLabel")
        nav_label_text = ET.SubElement(nav_label, "text")
        nav_label_text.text = item["sectionName"]
        # since we don't have a section content page, link section to first article path
        ET.SubElement(nav_point, "content", attrib={"src": item["items"][0]["path"]})
        for section_item in item["items"]:
            nav_point_counter += 1
            section_item_nav_point = ET.SubElement(
                nav_point, "navPoint", attrib={"id": f"navPoint{nav_point_counter}"}
            )
            section_item_nav_label = ET.SubElement(section_item_nav_point, "navLabel")
            section_item_nav_label_text = ET.SubElement(section_item_nav_label, "text")
            section_item_nav_label_text.text = section_item["title"]
            ET.SubElement(
                section_item_nav_point, "content", attrib={"src": section_item["path"]}
            )
    return ncx
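
# Illustrative shape of the generated NCX (abridged; values depend on the loan):
#   <ncx version="2005-1" xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="en">
#     <head><meta content="{isbn or title id}" name="dtb:uid"/></head>
#     <docTitle><text>{title}</text></docTitle>
#     <docAuthor><text>{first creator}</text></docAuthor>
#     <navMap>
#       <navPoint id="navPoint1">
#         <navLabel><text>{entry title}</text></navLabel>
#         <content src="{entry path}"/>
#       </navPoint>
#     </navMap>
#   </ncx>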


def _sanitise_opf_id(string_id: str) -> str:
    """
    Makes a string usable as an OPF ID: OPF IDs cannot start with a number.

    :param string_id: raw id string, e.g. a content file path
    :return: a slugified id, prefixed with "id_" if it would start with a digit
    """
    string_id = slugify(string_id)
    if string_id and string_id[0].isdigit():
        return f"id_{string_id}"
    return string_id
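
# Illustrative behaviour (the exact output depends on odmpy.utils.slugify, so
# only the digit-prefix rule is shown): an id that slugifies to something
# starting with a digit, e.g. "01-cover-xhtml", comes back as
# "id_01-cover-xhtml", while ids starting with a letter are returned unchanged.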


def _cleanup_soup(soup: BeautifulSoup, version: str = "2.0") -> None:
    """
    Tries to fix up book content pages to be epub-version compliant.

    :param soup: parsed book content page
    :param version: target epub version, e.g. "2.0" or "3.0"
    :return:
    """
    if version == "2.0":
        # v2 is a lot pickier about the acceptable elements and attributes
        modified_doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"'
        for item in soup.contents:
            if isinstance(item, Doctype):
                item.replace_with(Doctype(modified_doctype))
                break
        remove_attributes = [
            # this list will not be complete, but we try
            "aria-label",
            "data-loc",
            "data-epub-type",
            "data-document-status",
            "data-xml-lang",
            "lang",
            "role",
            "epub:type",
            "epub:prefix",
        ]
        for attribute in remove_attributes:
            for tag in soup.find_all(attrs={attribute: True}):
                del tag[attribute]
        convert_tags = ["nav", "section"]  # this list will not be complete, but we try
        for tag in convert_tags:
            for invalid_tag in soup.find_all(tag):
                invalid_tag.name = "div"

    # known issues, this will not be complete
    for svg in soup.find_all("svg"):
        if not svg.get("xmlns"):
            svg["xmlns"] = "http://www.w3.org/2000/svg"
        if not svg.get("xmlns:xlink"):
            svg["xmlns:xlink"] = "http://www.w3.org/1999/xlink"
    convert_tags = ["figcaption"]
    for tag in convert_tags:
        for invalid_tag in soup.find_all(tag):
            invalid_tag.name = "div"
    remove_tags = ["base"]
    for tag in remove_tags:
        for remove_tag in soup.find_all(tag):
            remove_tag.decompose()

    html_tag = soup.find("html")
    if html_tag and isinstance(html_tag, bs4.element.Tag) and not html_tag.get("xmlns"):
        html_tag["xmlns"] = "http://www.w3.org/1999/xhtml"
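
# Illustrative effect for version="2.0" (assumed input): a tag such as
#   <section epub:type="chapter" role="doc-chapter" lang="en">...</section>
# is rewritten to a plain <div>...</div> with the offending attributes dropped.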


def _sort_spine_entries(a: Dict, b: Dict, toc_pages: List[str]) -> int:
    """
    Sort the spine according to the ToC. For magazines, the sequence laid
    out in the spine sometimes does not align with the ToC, e.g. Mother
    Jones. If left unsorted, the page-through sequence does not match the
    actual ToC.

    :param a: spine entry
    :param b: spine entry
    :param toc_pages: paths of pages referenced in the ToC
    :return: -1 or 1, suitable for functools.cmp_to_key
    """
    try:
        a_index = toc_pages.index(a["-odread-original-path"])
    except ValueError:
        a_index = 999
    try:
        b_index = toc_pages.index(b["-odread-original-path"])
    except ValueError:
        b_index = 999

    if a_index != b_index:
        # sort order found via toc
        return -1 if a_index < b_index else 1

    return -1 if a["-odread-spine-position"] < b["-odread-spine-position"] else 1
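
# Illustrative usage (mirrors the call in process_ebook_loan below):
#   sorted(spine, key=cmp_to_key(lambda a, b: _sort_spine_entries(a, b, toc_pages)))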


def _sort_title_contents(a: Dict, b: Dict) -> int:
    """
    Sort the title contents roster so that pages get processed first.
    This is a precautionary measure for getting high-res cover images,
    since we must parse the html for the image src.

    :param a: roster entry
    :param b: roster entry
    :return: -1 or 1, suitable for functools.cmp_to_key
    """
    extensions_rank = [
        ".xhtml",
        ".html",
        ".htm",
        ".jpg",
        ".jpeg",
        ".png",
        ".gif",
        ".ttf",  # download fonts before css so that we can check if a font is available
        ".otf",
        ".css",
    ]
    a_parsed_url = urlparse(a["url"])
    b_parsed_url = urlparse(b["url"])
    a_ext = Path(a_parsed_url.path).suffix
    b_ext = Path(b_parsed_url.path).suffix
    try:
        a_index = extensions_rank.index(a_ext)
    except ValueError:
        a_index = 999
    try:
        b_index = extensions_rank.index(b_ext)
    except ValueError:
        b_index = 999

    if a_index != b_index:
        # sort order found via extension rank
        return -1 if a_index < b_index else 1

    if a_ext != b_ext:
        return -1 if a_ext < b_ext else 1

    return -1 if a_parsed_url.path < b_parsed_url.path else 1
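
# Illustrative usage: sorted(entries, key=cmp_to_key(_sort_title_contents))
# processes html pages first (needed for cover detection), then images,
# then fonts, and css last so that font availability can be checked.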


def _filter_content(entry: Dict, media_info: Dict, toc_pages: List[str]) -> bool:
    """
    Filters out title contents that are not needed.

    :param entry: roster entry
    :param media_info: media JSON for the title from the OverDrive API
    :param toc_pages: paths of pages referenced in the ToC
    :return: True if the entry should be kept
    """
    parsed_entry_url = urlparse(entry["url"])
    media_type = guess_mimetype(parsed_entry_url.path[1:])

    if media_info["type"]["id"] == LibbyMediaTypes.Magazine and media_type:
        if media_type.startswith("image/") and (
            parsed_entry_url.path.startswith("/pages/")
            or parsed_entry_url.path.startswith("/thumbnails/")
        ):
            return False
        if (
            media_type in ("application/xhtml+xml", "text/html")
            and parsed_entry_url.path[1:] not in toc_pages
        ):
            return False

    if parsed_entry_url.path.startswith("/_d/"):  # ebooks
        return False

    return True
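
# Illustrative examples of entries dropped for magazines (assumed paths):
# "/pages/0001.jpg" and "/thumbnails/0001.jpg" page-turner images, and any
# html page not referenced in the ToC; "/_d/..." entries are dropped for ebooks.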


def process_ebook_loan(
    loan: Dict,
    cover_path: Optional[Path],
    openbook: Dict,
    rosters: List[Dict],
    libby_client: LibbyClient,
    args: argparse.Namespace,
    logger: logging.Logger,
) -> None:
    """
    Generates an epub file directly from a Libby ebook or magazine loan.

    :param loan: the Libby loan JSON
    :param cover_path: path of the cover image already downloaded via the OverDrive API, if any
    :param openbook: the loan's openbook JSON
    :param rosters: the loan's roster JSONs
    :param libby_client: client holding the authenticated Libby session
    :param args: parsed command-line arguments
    :param logger: the logger to use
    :return:
    """
    book_folder, book_file_name = generate_names(
        title=loan["title"],
        series=loan.get("series") or "",
        series_reading_order=loan.get("detailedSeries", {}).get("readingOrder", ""),
        authors=extract_authors_from_openbook(openbook),
        edition=loan.get("edition") or "",
        title_id=loan["id"],
        args=args,
        logger=logger,
    )
    epub_file_path = book_file_name.with_suffix(".epub")
    epub_version = "3.0"

    book_meta_name = "META-INF"
    book_content_name = "OEBPS"
    book_meta_folder = book_folder.joinpath(book_meta_name)
    book_content_folder = book_folder.joinpath(book_content_name)
    for d in (book_meta_folder, book_content_folder):
        if not d.exists():
            d.mkdir(parents=True, exist_ok=True)

    od_client = OverDriveClient(
        user_agent=USER_AGENT, timeout=args.timeout, retry=args.retries
    )
    media_info = od_client.media(loan["id"])

    if args.is_debug_mode:
        with book_folder.joinpath("media.json").open("w", encoding="utf-8") as f:
            json.dump(media_info, f, indent=2)

        with book_folder.joinpath("loan.json").open("w", encoding="utf-8") as f:
            json.dump(loan, f, indent=2)

        with book_folder.joinpath("rosters.json").open("w", encoding="utf-8") as f:
            json.dump(rosters, f, indent=2)

        with book_folder.joinpath("openbook.json").open("w", encoding="utf-8") as f:
            json.dump(openbook, f, indent=2)

    title_contents: Dict = next(
        iter([r for r in rosters if r["group"] == "title-content"]), {}
    )
    headers = libby_client.default_headers()
    headers["Accept"] = "*/*"
    contents_re = re.compile(r"parent\.__bif_cfc0\(self,'(?P<base64_text>.+)'\)")
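
    # Illustrative script matched by contents_re (assumed shape): page content
    # is shipped base64-encoded inside an inline call such as
    #   <script type="text/javascript">parent.__bif_cfc0(self,'PGJvZHk+...')</script>
    # and the decoded markup replaces the page's <body> further below.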

    openbook_toc = openbook["nav"]["toc"]
    if len(openbook_toc) <= 1 and loan["type"]["id"] == LibbyMediaTypes.Magazine:
        raise OdmpyRuntimeError("Unsupported fixed-layout (pre-paginated) format.")

    # for finding the cover image for magazines
    cover_toc_item = next(
        iter(
            [
                item
                for item in openbook_toc
                if item.get("pageRange", "") == "Cover" and item.get("featureImage")
            ]
        ),
        None,
    )
    # for finding the cover image for ebooks
    cover_page_landmark = next(
        iter(
            [
                item
                for item in openbook.get("nav", {}).get("landmarks", [])
                if item["type"] == "cover"
            ]
        ),
        None,
    )
    toc_pages = [item["path"].split("#")[0] for item in openbook_toc]
    manifest_entries: List[Dict] = []

    title_content_entries = list(
        filter(
            lambda e: _filter_content(e, media_info, toc_pages),
            title_contents["entries"],
        )
    )
    # Ignoring mypy error below because of https://github.com/python/mypy/issues/9372
    title_content_entries = sorted(
        title_content_entries, key=cmp_to_key(_sort_title_contents)  # type: ignore[misc]
    )
    progress_bar = tqdm(title_content_entries, disable=args.hide_progress)
    has_ncx = False
    has_nav = False

    # Used to patch magazine css that causes paged mode in the calibre viewer to not work.
    # This expression is used to strip `overflow-x: hidden` from the css definition
    # for `#article-body`.
    patch_magazine_css_overflow_re = re.compile(
        r"(#article-body\s*\{[^{}]+?)overflow-x:\s*hidden;([^{}]+?})"
    )
    # This expression is used to strip `padding: Xem Xem;` from the css definition
    # for `#article-body` to remove the extraneous padding
    patch_magazine_css_padding_re = re.compile(
        r"(#article-body\s*\{[^{}]+?)padding:\s*[^;]+;([^{}]+?})"
    )
    # This expression is used to patch the font-family declarations for fonts
    # that are missing from the magazine css
    patch_magazine_css_font_re = re.compile(r"(font-family: '[^']+(Sans|Serif)[^']+';)")
    # This expression is used to strip the missing font src in the magazine css
    patch_magazine_css_font_src_re = re.compile(
        r"@font-face\s*\{[^{}]+?(src:\s*url\('(fonts/.+\.ttf)'\).+?;)[^{}]+?}"
    )
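
    # Illustrative effect of the overflow/padding patches (assumed css input):
    #   #article-body { margin: 0; overflow-x: hidden; padding: 1em 2em; }
    # becomes (modulo whitespace)
    #   #article-body { margin: 0; }
    # so that calibre's paged mode can lay the article out correctly.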

    # holds the manifest item ID for the image identified as the cover
    cover_img_manifest_id = None

    for entry in progress_bar:
        entry_url = entry["url"]
        parsed_entry_url = urlparse(entry_url)
        title_content_path = Path(parsed_entry_url.path[1:])
        media_type = guess_mimetype(title_content_path.name)
        if not media_type:
            logger.warning("Skipped roster entry: %s", title_content_path.name)
            continue
        asset_folder = book_content_folder.joinpath(title_content_path.parent)
        if media_type == "application/x-dtbncx+xml":
            has_ncx = True
        manifest_entry = {
            "href": parsed_entry_url.path[1:],
            "id": "ncx"
            if media_type == "application/x-dtbncx+xml"
            else _sanitise_opf_id(parsed_entry_url.path[1:]),
            "media-type": media_type,
        }

        # try to find the cover image for magazines
        if cover_toc_item and manifest_entry["id"] == _sanitise_opf_id(
            cover_toc_item["featureImage"]
        ):
            # we assign it here to ensure that the image referenced in the
            # toc actually exists
            cover_img_manifest_id = manifest_entry["id"]

        if not asset_folder.exists():
            asset_folder.mkdir(parents=True, exist_ok=True)
        asset_file_path = asset_folder.joinpath(Path(parsed_entry_url.path).name)

        soup = None
        if asset_file_path.exists():
            progress_bar.set_description(f"Already saved {asset_file_path.name}")
            if media_type in ("application/xhtml+xml", "text/html"):
                with asset_file_path.open("r", encoding="utf-8") as f_asset:
                    soup = BeautifulSoup(f_asset, features="html.parser")
        else:
            progress_bar.set_description(f"Downloading {asset_file_path.name}")
            # use the libby client session because the required
            # auth cookies are set there
            res: requests.Response = libby_client.make_request(
                entry_url, headers=headers, authenticated=False, return_res=True
            )

            # patch magazine css to fix various rendering problems
            if (
                media_info["type"]["id"] == LibbyMediaTypes.Magazine
                and media_type == "text/css"
            ):
                css_content = patch_magazine_css_overflow_re.sub(r"\1\2", res.text)
                css_content = patch_magazine_css_padding_re.sub(r"\1\2", css_content)
                if "#article-body" in css_content:
                    # patch font-family declarations
                    # libby declares these font-faces but does not supply them in the roster,
                    # nor are they actually available when viewed online (http 403)
                    font_families = list(
                        set(patch_magazine_css_font_re.findall(css_content))
                    )
                    for font_family, _ in font_families:
                        new_font_css = font_family[:-1]
                        if "Serif" in font_family:
                            new_font_css += ',Charter,"Bitstream Charter","Sitka Text",Cambria,serif'
                        elif "Sans" in font_family:
                            new_font_css += ",system-ui,sans-serif"
                        new_font_css += ";"
                        if "-Bold" in font_family:
                            new_font_css += " font-weight: 700;"
                        elif "-SemiBold" in font_family:
                            new_font_css += " font-weight: 600;"
                        elif "-Light" in font_family:
                            new_font_css += " font-weight: 300;"
                        css_content = css_content.replace(font_family, new_font_css)
                else:
                    # patch font url declarations
                    # since ttf/otf files are downloaded ahead of css, we can verify
                    # whether the font files are actually available
                    try:
                        font_sources = patch_magazine_css_font_src_re.findall(
                            css_content
                        )
                        for src_match, font_src in font_sources:
                            asset_font_path = Path(
                                urljoin(str(asset_file_path), font_src)
                            )
                            if not asset_font_path.exists():
                                css_content = css_content.replace(src_match, "")
                    except (
                        Exception  # noqa, pylint: disable=broad-exception-caught
                    ) as patch_err:
                        logger.warning(
                            "Error while patching font sources: %s", patch_err
                        )
                with open(asset_file_path, "w", encoding="utf-8") as f_out:
                    f_out.write(css_content)
            elif media_type in ("application/xhtml+xml", "text/html"):
                soup = BeautifulSoup(res.text, features="html.parser")
                script_ele = soup.find("script", attrs={"type": "text/javascript"})
                if script_ele and hasattr(script_ele, "string"):
                    mobj = contents_re.search(script_ele.string or "")
                    if not mobj:
                        logger.warning(
                            "Unable to extract content string for %s",
                            parsed_entry_url.path,
                        )
                    else:
                        new_soup = BeautifulSoup(
                            base64.b64decode(mobj.group("base64_text")),
                            features="html.parser",
                        )
                        soup.body.replace_with(new_soup.body)  # type: ignore[arg-type,union-attr]
                _cleanup_soup(soup, version=epub_version)
                if (
                    cover_toc_item
                    and cover_toc_item.get("featureImage")
                    and manifest_entry["id"] == _sanitise_opf_id(cover_toc_item["path"])
                ):
                    img_src = os.path.relpath(
                        book_content_folder.joinpath(cover_toc_item["featureImage"]),
                        start=asset_folder,
                    )
                    if is_windows():
                        img_src = Path(img_src).as_posix()
                    # patch the svg-based cover for magazines
                    cover_svg = soup.find("svg")
                    if cover_svg:
                        # replace the svg ele with a simple image tag
                        cover_svg.decompose()  # type: ignore[union-attr]
                        for c in soup.body.find_all(recursive=False):  # type: ignore[union-attr]
                            c.decompose()
                        soup.body.append(  # type: ignore[union-attr]
                            soup.new_tag("img", attrs={"src": img_src, "alt": "Cover"})
                        )
                        style_ele = soup.new_tag("style")
                        style_ele.append(
                            "img { max-width: 100%; margin-left: auto; margin-right: auto; }"
                        )
                        soup.head.append(style_ele)  # type: ignore[union-attr]

                with open(asset_file_path, "w", encoding="utf-8") as f_out:
                    f_out.write(str(soup))
            else:
                with open(asset_file_path, "wb") as f_out:
                    f_out.write(res.content)

        if soup:
            # try to minimise soup searches where possible
            if (
                (not cover_img_manifest_id)
                and cover_page_landmark
                and cover_page_landmark["path"] == parsed_entry_url.path[1:]
            ):
                # try to find the cover image for the book from the cover html content
                cover_image = soup.find("img", attrs={"src": True})
                if cover_image:
                    cover_img_manifest_id = _sanitise_opf_id(
                        urljoin(cover_page_landmark["path"], cover_image["src"])  # type: ignore[index]
                    )
            elif (not has_nav) and soup.find(attrs={"epub:type": "toc"}):
                # identify the nav page
                manifest_entry["properties"] = "nav"
                has_nav = True
            elif soup.find("svg"):
                # page has svg
                manifest_entry["properties"] = "svg"

        if cover_img_manifest_id == manifest_entry["id"]:
            manifest_entry["properties"] = "cover-image"
        manifest_entries.append(manifest_entry)
        if manifest_entry.get("properties") == "cover-image" and cover_path:
            # overwrite the cover image already downloaded via the OverDrive api,
            # in case it is to be kept
            shutil.copyfile(asset_file_path, cover_path)

    if not has_nav:
        # Generate nav - needed for magazines

        # we give the nav an id-stamped file name to avoid accidentally
        # overwriting an existing file name
        nav_file_name = f'nav_{loan["id"]}.xhtml'

        nav_soup = BeautifulSoup(NAV_XHTMLTEMPLATE, features="html.parser")
        nav_soup.find("title").append(loan["title"])  # type: ignore[union-attr]
        toc_ele = nav_soup.find(id="toc")

        # sort toc into hierarchical sections
        hierarchical_toc = _sort_toc(openbook_toc)
        for item in hierarchical_toc:
            li_ele = nav_soup.new_tag("li")
            if not item.get("sectionName"):
                a_ele = nav_soup.new_tag("a", attrs={"href": item["path"]})
                a_ele.append(item["title"])
                li_ele.append(a_ele)
                toc_ele.append(li_ele)  # type: ignore[union-attr]
                continue
            # since we don't have a section content page, and this can cause problems,
            # link the section to the first article path
            a_ele = nav_soup.new_tag("a", attrs={"href": item["items"][0]["path"]})
            a_ele.append(item["sectionName"])
            li_ele.append(a_ele)
            ol_ele = nav_soup.new_tag("ol", attrs={"type": "1"})
            for section_item in item.get("items", []):
                section_li_ele = nav_soup.new_tag("li")
                section_item_a_ele = nav_soup.new_tag(
                    "a", attrs={"href": section_item["path"]}
                )
                section_item_a_ele.append(section_item["title"])
                section_li_ele.append(section_item_a_ele)
                ol_ele.append(section_li_ele)
            li_ele.append(ol_ele)
            toc_ele.append(li_ele)  # type: ignore[union-attr]

        with book_content_folder.joinpath(nav_file_name).open(
            "w", encoding="utf-8"
        ) as f_nav:
            f_nav.write(str(nav_soup).strip())
        manifest_entries.append(
            {
                "href": nav_file_name,
                "id": _sanitise_opf_id(nav_file_name),
                "media-type": "application/xhtml+xml",
                "properties": "nav",
            }
        )

    if not has_ncx:
        # generate the ncx for backward compatibility
        ncx = _build_ncx(media_info, openbook, nav_file_name if not has_nav else "")
        # we give the ncx an id-stamped file name to avoid accidentally
        # overwriting an existing file name
        toc_ncx_name = f'toc_{loan["id"]}.ncx'
        tree = ET.ElementTree(ncx)
        tree.write(
            book_content_folder.joinpath(toc_ncx_name),
            xml_declaration=True,
            encoding="utf-8",
        )
        manifest_entries.append(
            {
                "href": toc_ncx_name,
                "id": "ncx",
                "media-type": "application/x-dtbncx+xml",
            }
        )
        has_ncx = True
    else:
        # EPUB3 compliance: ensure that the identifier in the ncx matches the one
        # in the OPF. A mismatch can occur because the toc.ncx is supplied by the publisher.
        ncx_manifest_entry = next(
            iter([m for m in manifest_entries if m["id"] == "ncx"]), None
        )
        if ncx_manifest_entry:
            expected_book_identifier = (
                extract_isbn(
                    media_info["formats"],
                    format_types=[
                        LibbyFormats.MagazineOverDrive
                        if loan["type"]["id"] == LibbyMediaTypes.Magazine
                        else LibbyFormats.EBookOverdrive
                    ],
                )
                or media_info["id"]
            )  # this is the summarised logic from build_opf_package
            ncx_path = book_content_folder.joinpath(ncx_manifest_entry["href"])
            new_ncx_contents = None
            with ncx_path.open("r", encoding="utf-8") as ncx_f:
                ncx_soup = BeautifulSoup(ncx_f, features="xml")
                meta_id = ncx_soup.find("meta", attrs={"name": "dtb:uid"})
                if (
                    meta_id
                    and isinstance(meta_id, Tag)
                    and meta_id.get("content")
                    and meta_id["content"] != expected_book_identifier
                ):
                    logger.debug(
                        'Replacing identifier in %s: "%s" -> "%s"',
                        ncx_path.name,
                        meta_id["content"],
                        expected_book_identifier,
                    )
                    meta_id["content"] = expected_book_identifier
                    new_ncx_contents = str(ncx_soup)
            if new_ncx_contents:
                with ncx_path.open("w", encoding="utf-8") as ncx_f:
                    ncx_f.write(new_ncx_contents)

    # create epub OPF
    opf_file_name = "package.opf"
    opf_file_path = book_content_folder.joinpath(opf_file_name)
    package = build_opf_package(
        media_info,
        version=epub_version,
        loan_format=LibbyFormats.MagazineOverDrive
        if loan["type"]["id"] == LibbyMediaTypes.Magazine
        else LibbyFormats.EBookOverdrive,
    )
    if args.generate_opf:
        # save the opf before the manifest and spine elements get added,
        # because those elements are meaningless outside an epub
        export_opf_file = epub_file_path.with_suffix(".opf")
        ET.ElementTree(package).write(
            export_opf_file, xml_declaration=True, encoding="utf-8"
        )
        logger.info('Saved "%s"', colored(str(export_opf_file), "magenta"))

    # add manifest
    manifest = ET.SubElement(package, "manifest")
    for entry in manifest_entries:
        ET.SubElement(manifest, "item", attrib=entry)

    cover_manifest_entry = next(
        iter(
            [
                entry
                for entry in manifest_entries
                if entry.get("properties", "") == "cover-image"
            ]
        ),
        None,
    )
    if not cover_manifest_entry:
        cover_img_manifest_id = None
    if cover_path and not cover_manifest_entry:
        # add the cover image separately since we can't identify which item is the cover
        # we give the cover a timestamped file name to avoid accidentally
        # overwriting an existing file name
        cover_image_name = f"cover_{int(datetime.datetime.now().timestamp())}.jpg"
        shutil.copyfile(cover_path, book_content_folder.joinpath(cover_image_name))
        cover_img_manifest_id = "coverimage"
        ET.SubElement(
            manifest,
            "item",
            attrib={
                "id": cover_img_manifest_id,
                "href": cover_image_name,
                "media-type": "image/jpeg",
                "properties": "cover-image",
            },
        )

    if cover_img_manifest_id:
        metadata = package.find("metadata")
        if metadata is not None:  # an Element with no children is falsy, so test for None
            _ = ET.SubElement(
                metadata,
                "meta",
                attrib={"name": "cover", "content": cover_img_manifest_id},
            )

848 # add spine 

849 spine = ET.SubElement(package, "spine") 

850 if has_ncx: 

851 spine.set("toc", "ncx") 

852 spine_entries = list( 

853 filter( 

854 lambda s: not ( 

855 media_info["type"]["id"] == LibbyMediaTypes.Magazine 

856 and s["-odread-original-path"] not in toc_pages 

857 ), 

858 openbook["spine"], 

859 ) 

860 ) 

861 

862 # Ignoring mypy error below because of https://github.com/python/mypy/issues/9372 

863 spine_entries = sorted( 

864 spine_entries, key=cmp_to_key(lambda a, b: _sort_spine_entries(a, b, toc_pages)) # type: ignore[misc] 

865 ) 

866 for spine_idx, entry in enumerate(spine_entries): 

867 if ( 

868 media_info["type"]["id"] == LibbyMediaTypes.Magazine 

869 and entry["-odread-original-path"] not in toc_pages 

870 ): 

871 continue 

872 item_ref = ET.SubElement(spine, "itemref") 

873 item_ref.set("idref", _sanitise_opf_id(entry["-odread-original-path"])) 

874 if spine_idx == 0 and not has_nav: 

875 item_ref = ET.SubElement(spine, "itemref") 

876 item_ref.set("idref", _sanitise_opf_id(nav_file_name)) 

    # add guide
    if openbook.get("nav", {}).get("landmarks"):
        guide = ET.SubElement(package, "guide")
        for landmark in openbook["nav"]["landmarks"]:
            _ = ET.SubElement(
                guide,
                "reference",
                attrib={
                    "href": landmark["path"],
                    "title": landmark["title"],
                    "type": landmark["type"],
                },
            )

892 if args.is_debug_mode: 

893 from xml.dom import minidom 

894 

895 with opf_file_path.open("w", encoding="utf-8") as f: 

896 f.write( 

897 minidom.parseString(ET.tostring(package, "utf-8")).toprettyxml( 

898 indent="\t" 

899 ) 

900 ) 

901 else: 

902 tree = ET.ElementTree(package) 

903 tree.write(opf_file_path, xml_declaration=True, encoding="utf-8") 

904 logger.debug('Saved "%s"', opf_file_path) 

    # create container.xml
    container_file_path = book_meta_folder.joinpath("container.xml")
    container = ET.Element(
        "container",
        attrib={
            "version": "1.0",
            "xmlns": "urn:oasis:names:tc:opendocument:xmlns:container",
        },
    )
    root_files = ET.SubElement(container, "rootfiles")
    _ = ET.SubElement(
        root_files,
        "rootfile",
        attrib={
            # use a posix path because zipfile requires "/" separators
            "full-path": Path(book_content_name, opf_file_name).as_posix(),
            "media-type": "application/oebps-package+xml",
        },
    )
    tree = ET.ElementTree(container)
    tree.write(container_file_path, xml_declaration=True, encoding="utf-8")
    logger.debug('Saved "%s"', container_file_path)
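
    # Illustrative container.xml written above (pretty-printed for readability;
    # ElementTree emits it on a single line):
    #   <?xml version='1.0' encoding='utf-8'?>
    #   <container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
    #     <rootfiles>
    #       <rootfile full-path="OEBPS/package.opf" media-type="application/oebps-package+xml"/>
    #     </rootfiles>
    #   </container>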

    # create the epub zip
    with zipfile.ZipFile(
        epub_file_path, mode="w", compression=zipfile.ZIP_DEFLATED
    ) as epub_zip:
        # the mimetype entry must come first and must be stored uncompressed
        epub_zip.writestr(
            "mimetype", "application/epub+zip", compress_type=zipfile.ZIP_STORED
        )
        for root_start in (book_meta_folder, book_content_folder):
            for p in root_start.glob("**/*"):
                if p.is_dir():
                    continue
                zip_archive_file = p.relative_to(book_folder)
                # using posix path because zipfile requires "/" separators
                # and may break on Windows otherwise
                zip_archive_name = zip_archive_file.as_posix()
                zip_target_file = book_folder.joinpath(zip_archive_file)
                epub_zip.write(zip_target_file, zip_archive_name)
                logger.debug(
                    'epub: Added "%s" as "%s"', zip_target_file, zip_archive_name
                )
    logger.info('Saved "%s"', colored(str(epub_file_path), "magenta", attrs=["bold"]))

    # clean up
    if not args.is_debug_mode:
        for file_name in (
            "mimetype",
            "media.json",
            "openbook.json",
            "loan.json",
            "rosters.json",
        ):
            target = book_folder.joinpath(file_name)
            if target.exists():
                target.unlink()
        for folder in (book_content_folder, book_meta_folder):
            shutil.rmtree(folder, ignore_errors=True)