Coverage for odmpy/processing/ebook.py: 87.1%

428 statements  

coverage.py v7.3.1, created at 2023-09-14 08:51 +0000

# Copyright (C) 2023 github.com/ping
#
# This file is part of odmpy.
#
# odmpy is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# odmpy is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with odmpy. If not, see <http://www.gnu.org/licenses/>.
#

import argparse
import base64
import datetime
import json
import logging
import os
import re
import shutil
import xml.etree.ElementTree as ET
import zipfile
from functools import cmp_to_key
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import urlparse, urljoin

import bs4.element
import requests
from bs4 import BeautifulSoup, Doctype, Tag
from termcolor import colored
from tqdm import tqdm

from .shared import (
    generate_names,
    build_opf_package,
    extract_isbn,
    extract_authors_from_openbook,
)
from ..errors import OdmpyRuntimeError
from ..libby import USER_AGENT, LibbyClient, LibbyFormats, LibbyMediaTypes
from ..overdrive import OverDriveClient
from ..utils import slugify, is_windows, guess_mimetype

#
# Main processing logic for libby direct ebook and magazine loans
#

NAV_XHTMLTEMPLATE = """
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
<head><title></title>
<style>
    #toc { list-style-type: none; padding-left: 0; }
    #toc > li { margin-top: 0.5rem; }
</style>
</head>
<body>
<nav epub:type="toc">
<h1>Contents</h1>
<ol id="toc"></ol>
</nav>
</body>
</html>
"""


def _sort_toc(toc: List[Dict]) -> List:
    """
    Sorts the flat ToC list from the openbook into a hierarchical structure
    by grouping together consecutive entries that share a "sectionName".

    :param toc: the "toc" list from openbook["nav"]
    :return: list of ToC entries, with sectioned entries nested under their section
    """
    hierarchical_toc = []
    current_section = {}  # type: Dict
    for i, item in enumerate(toc, start=1):
        if not item.get("sectionName"):
            hierarchical_toc.append(item)
            continue
        if item["sectionName"] not in current_section or i == len(toc):
            # new section or last item
            if i == len(toc):
                current_section.setdefault(item["sectionName"], []).append(item)
            section_names = list(current_section.keys())
            for section_name in section_names:
                hierarchical_toc.append(
                    {
                        "sectionName": section_name,
                        "items": current_section[section_name],
                    }
                )
                del current_section[section_name]
        if i < len(toc):
            current_section.setdefault(item["sectionName"], []).append(item)

    return hierarchical_toc
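
# Illustrative example (not part of the module): _sort_toc groups consecutive
# entries sharing a "sectionName" under a single section entry, e.g.
#   _sort_toc([
#       {"title": "Cover", "path": "cover.xhtml"},
#       {"title": "News", "path": "news.xhtml", "sectionName": "Features"},
#       {"title": "Tech", "path": "tech.xhtml", "sectionName": "Features"},
#   ])
# returns
#   [
#       {"title": "Cover", "path": "cover.xhtml"},
#       {"sectionName": "Features", "items": [<News entry>, <Tech entry>]},
#   ]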


def _build_ncx(media_info: Dict, openbook: Dict, nav_page: str) -> ET.Element:
    """
    Builds the ncx from the openbook.

    :param media_info: media JSON for the title from the OverDrive API
    :param openbook: the loan's openbook JSON
    :param nav_page: file name of the generated nav page, if any
    :return: the root "ncx" element
    """

    # References:
    # Version 2: https://idpf.org/epub/20/spec/OPF_2.0_final_spec.html#Section2.0
    # Version 3: https://www.w3.org/TR/epub-33/#sec-package-doc

    publication_identifier = (
        extract_isbn(
            media_info["formats"],
            [LibbyFormats.EBookOverdrive, LibbyFormats.MagazineOverDrive],
        )
        or media_info["id"]
    )

    ET.register_namespace("opf", "http://www.idpf.org/2007/opf")
    ET.register_namespace("dc", "http://purl.org/dc/elements/1.1/")
    ncx = ET.Element(
        "ncx",
        attrib={
            "version": "2005-1",
            "xmlns": "http://www.daisy.org/z3986/2005/ncx/",
            "xml:lang": "en",
        },
    )

    head = ET.SubElement(ncx, "head")
    ET.SubElement(
        head, "meta", attrib={"content": publication_identifier, "name": "dtb:uid"}
    )
    doc_title = ET.SubElement(ncx, "docTitle")
    doc_title_text = ET.SubElement(doc_title, "text")
    doc_title_text.text = openbook["title"]["main"]

    doc_author = ET.SubElement(ncx, "docAuthor")
    doc_author_text = ET.SubElement(doc_author, "text")
    doc_author_text.text = openbook["creator"][0]["name"]

    nav_map = ET.SubElement(ncx, "navMap")
    hierarchical_toc = _sort_toc(openbook["nav"]["toc"])
    nav_point_counter = 0
    for item in hierarchical_toc:
        nav_point_counter += 1
        if not item.get("sectionName"):
            nav_point = ET.SubElement(
                nav_map, "navPoint", attrib={"id": f"navPoint{nav_point_counter}"}
            )
            nav_label = ET.SubElement(nav_point, "navLabel")
            nav_label_text = ET.SubElement(nav_label, "text")
            nav_label_text.text = item["title"]
            ET.SubElement(nav_point, "content", attrib={"src": item["path"]})

            # insert a navPoint for the generated nav page right after the first entry
            if nav_point_counter == 1 and nav_page:
                nav_point_counter += 1
                nav_point = ET.SubElement(
                    nav_map, "navPoint", attrib={"id": f"navPoint{nav_point_counter}"}
                )
                nav_label = ET.SubElement(nav_point, "navLabel")
                nav_label_text = ET.SubElement(nav_label, "text")
                nav_label_text.text = "Contents"
                ET.SubElement(nav_point, "content", attrib={"src": nav_page})
            continue

        nav_point = ET.SubElement(
            nav_map, "navPoint", attrib={"id": f"navPoint{nav_point_counter}"}
        )
        nav_label = ET.SubElement(nav_point, "navLabel")
        nav_label_text = ET.SubElement(nav_label, "text")
        nav_label_text.text = item["sectionName"]
        # since we don't have a section content page, link section to first article path
        ET.SubElement(nav_point, "content", attrib={"src": item["items"][0]["path"]})
        for section_item in item["items"]:
            nav_point_counter += 1
            section_item_nav_point = ET.SubElement(
                nav_point, "navPoint", attrib={"id": f"navPoint{nav_point_counter}"}
            )
            section_item_nav_label = ET.SubElement(section_item_nav_point, "navLabel")
            section_item_nav_label_text = ET.SubElement(section_item_nav_label, "text")
            section_item_nav_label_text.text = section_item["title"]
            ET.SubElement(
                section_item_nav_point, "content", attrib={"src": section_item["path"]}
            )
    return ncx
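
# Illustrative shape of the generated NCX (abridged; values depend on the loan):
#   <ncx version="2005-1" xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="en">
#     <head><meta content="{isbn or title id}" name="dtb:uid"/></head>
#     <docTitle><text>{title}</text></docTitle>
#     <docAuthor><text>{first creator}</text></docAuthor>
#     <navMap>
#       <navPoint id="navPoint1">
#         <navLabel><text>{entry title}</text></navLabel>
#         <content src="{entry path}"/>
#       </navPoint>
#     </navMap>
#   </ncx>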


def _sanitise_opf_id(string_id: str) -> str:
    """
    Makes a string usable as an OPF ID: OPF IDs cannot start with a number.

    :param string_id: raw id string, e.g. a content file path
    :return: a slugified id, prefixed with "id_" if it would start with a digit
    """
    string_id = slugify(string_id)
    if string_id and string_id[0].isdigit():
        return f"id_{string_id}"
    return string_id
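
# Illustrative behaviour (the exact output depends on odmpy.utils.slugify, so
# only the digit-prefix rule is shown): an id that slugifies to something
# starting with a digit, e.g. "01-cover-xhtml", comes back as
# "id_01-cover-xhtml", while ids starting with a letter are returned unchanged.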


def _cleanup_soup(soup: BeautifulSoup, version: str = "2.0") -> None:
    """
    Tries to fix up book content pages to be epub-version compliant.

    :param soup: parsed book content page
    :param version: target epub version, e.g. "2.0" or "3.0"
    :return:
    """
    if version == "2.0":
        # v2 is a lot pickier about the acceptable elements and attributes
        modified_doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"'
        for item in soup.contents:
            if isinstance(item, Doctype):
                item.replace_with(Doctype(modified_doctype))
                break
        remove_attributes = [
            # this list will not be complete, but we try
            "aria-label",
            "data-loc",
            "data-epub-type",
            "data-document-status",
            "data-xml-lang",
            "lang",
            "role",
            "epub:type",
            "epub:prefix",
        ]
        for attribute in remove_attributes:
            for tag in soup.find_all(attrs={attribute: True}):
                del tag[attribute]
        convert_tags = ["nav", "section"]  # this list will not be complete, but we try
        for tag in convert_tags:
            for invalid_tag in soup.find_all(tag):
                invalid_tag.name = "div"

    # known issues, this will not be complete
    for svg in soup.find_all("svg"):
        if not svg.get("xmlns"):
            svg["xmlns"] = "http://www.w3.org/2000/svg"
        if not svg.get("xmlns:xlink"):
            svg["xmlns:xlink"] = "http://www.w3.org/1999/xlink"
    convert_tags = ["figcaption"]
    for tag in convert_tags:
        for invalid_tag in soup.find_all(tag):
            invalid_tag.name = "div"
    remove_tags = ["base"]
    for tag in remove_tags:
        for remove_tag in soup.find_all(tag):
            remove_tag.decompose()

    html_tag = soup.find("html")
    if html_tag and isinstance(html_tag, bs4.element.Tag) and not html_tag.get("xmlns"):
        html_tag["xmlns"] = "http://www.w3.org/1999/xhtml"
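
# Illustrative effect for version="2.0" (assumed input): a tag such as
#   <section epub:type="chapter" role="doc-chapter" lang="en">...</section>
# is rewritten to a plain <div>...</div> with the offending attributes dropped.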


def _sort_spine_entries(a: Dict, b: Dict, toc_pages: List[str]) -> int:
    """
    Sort the spine according to the ToC. For magazines, the sequence laid
    out in the spine sometimes does not align with the ToC, e.g. Mother
    Jones. If left unsorted, the page-through sequence does not match the
    actual ToC.

    :param a: spine entry
    :param b: spine entry
    :param toc_pages: paths of pages referenced in the ToC
    :return: -1 or 1, suitable for functools.cmp_to_key
    """
    try:
        a_index = toc_pages.index(a["-odread-original-path"])
    except ValueError:
        a_index = 999
    try:
        b_index = toc_pages.index(b["-odread-original-path"])
    except ValueError:
        b_index = 999

    if a_index != b_index:
        # sort order found via toc
        return -1 if a_index < b_index else 1

    return -1 if a["-odread-spine-position"] < b["-odread-spine-position"] else 1
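
# Illustrative usage (mirrors the call in process_ebook_loan below):
#   sorted(spine, key=cmp_to_key(lambda a, b: _sort_spine_entries(a, b, toc_pages)))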


def _sort_title_contents(a: Dict, b: Dict) -> int:
    """
    Sort the title contents roster so that pages get processed first.
    This is a precautionary measure for getting high-res cover images,
    since we must parse the html for the image src.

    :param a: roster entry
    :param b: roster entry
    :return: -1 or 1, suitable for functools.cmp_to_key
    """
    extensions_rank = [
        ".xhtml",
        ".html",
        ".htm",
        ".jpg",
        ".jpeg",
        ".png",
        ".gif",
        ".ttf",  # download fonts before css so that we can check if a font is available
        ".otf",
        ".css",
    ]
    a_parsed_url = urlparse(a["url"])
    b_parsed_url = urlparse(b["url"])
    a_ext = Path(a_parsed_url.path).suffix
    b_ext = Path(b_parsed_url.path).suffix
    try:
        a_index = extensions_rank.index(a_ext)
    except ValueError:
        a_index = 999
    try:
        b_index = extensions_rank.index(b_ext)
    except ValueError:
        b_index = 999

    if a_index != b_index:
        # sort order found via extension rank
        return -1 if a_index < b_index else 1

    if a_ext != b_ext:
        return -1 if a_ext < b_ext else 1

    return -1 if a_parsed_url.path < b_parsed_url.path else 1
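
# Illustrative usage: sorted(entries, key=cmp_to_key(_sort_title_contents))
# processes html pages first (needed for cover detection), then images,
# then fonts, and css last so that font availability can be checked.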


def _filter_content(entry: Dict, media_info: Dict, toc_pages: List[str]) -> bool:
    """
    Filters out title contents that are not needed.

    :param entry: roster entry
    :param media_info: media JSON for the title from the OverDrive API
    :param toc_pages: paths of pages referenced in the ToC
    :return: True if the entry should be kept
    """
    parsed_entry_url = urlparse(entry["url"])
    media_type = guess_mimetype(parsed_entry_url.path[1:])

    if media_info["type"]["id"] == LibbyMediaTypes.Magazine and media_type:
        if media_type.startswith("image/") and (
            parsed_entry_url.path.startswith("/pages/")
            or parsed_entry_url.path.startswith("/thumbnails/")
        ):
            return False
        if (
            media_type in ("application/xhtml+xml", "text/html")
            and parsed_entry_url.path[1:] not in toc_pages
        ):
            return False

    if parsed_entry_url.path.startswith("/_d/"):  # ebooks
        return False

    return True
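
# Illustrative examples of entries dropped for magazines (assumed paths):
# "/pages/0001.jpg" and "/thumbnails/0001.jpg" page-turner images, and any
# html page not referenced in the ToC; "/_d/..." entries are dropped for ebooks.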


def process_ebook_loan(
    loan: Dict,
    cover_path: Optional[Path],
    openbook: Dict,
    rosters: List[Dict],
    libby_client: LibbyClient,
    args: argparse.Namespace,
    logger: logging.Logger,
) -> None:
    """
    Generates an epub file directly from a Libby ebook or magazine loan.

    :param loan: the Libby loan JSON
    :param cover_path: path of the cover image already downloaded via the OverDrive API, if any
    :param openbook: the loan's openbook JSON
    :param rosters: the loan's roster JSONs
    :param libby_client: client holding the authenticated Libby session
    :param args: parsed command-line arguments
    :param logger: the logger to use
    :return:
    """
    book_folder, book_file_name = generate_names(
        title=loan["title"],
        series=loan.get("series") or "",
        series_reading_order=loan.get("detailedSeries", {}).get("readingOrder", ""),
        authors=extract_authors_from_openbook(openbook),
        edition=loan.get("edition") or "",
        title_id=loan["id"],
        args=args,
        logger=logger,
    )
    epub_file_path = book_file_name.with_suffix(".epub")
    epub_version = "3.0"

    book_meta_name = "META-INF"
    book_content_name = "OEBPS"
    book_meta_folder = book_folder.joinpath(book_meta_name)
    book_content_folder = book_folder.joinpath(book_content_name)
    for d in (book_meta_folder, book_content_folder):
        if not d.exists():
            d.mkdir(parents=True, exist_ok=True)

    od_client = OverDriveClient(
        user_agent=USER_AGENT, timeout=args.timeout, retry=args.retries
    )
    media_info = od_client.media(loan["id"])

    if args.is_debug_mode:
        with book_folder.joinpath("media.json").open("w", encoding="utf-8") as f:
            json.dump(media_info, f, indent=2)

        with book_folder.joinpath("loan.json").open("w", encoding="utf-8") as f:
            json.dump(loan, f, indent=2)

        with book_folder.joinpath("rosters.json").open("w", encoding="utf-8") as f:
            json.dump(rosters, f, indent=2)

        with book_folder.joinpath("openbook.json").open("w", encoding="utf-8") as f:
            json.dump(openbook, f, indent=2)

    title_contents: Dict = next(
        iter([r for r in rosters if r["group"] == "title-content"]), {}
    )
    headers = libby_client.default_headers()
    headers["Accept"] = "*/*"
    contents_re = re.compile(r"parent\.__bif_cfc0\(self,'(?P<base64_text>.+)'\)")
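
    # Illustrative script matched by contents_re (assumed shape): page content
    # is shipped base64-encoded inside an inline call such as
    #   <script type="text/javascript">parent.__bif_cfc0(self,'PGJvZHk+...')</script>
    # and the decoded markup replaces the page's <body> further below.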

    openbook_toc = openbook["nav"]["toc"]
    if len(openbook_toc) <= 1 and loan["type"]["id"] == LibbyMediaTypes.Magazine:
        raise OdmpyRuntimeError("Unsupported fixed-layout (pre-paginated) format.")

    # for finding the cover image for magazines
    cover_toc_item = next(
        iter(
            [
                item
                for item in openbook_toc
                if item.get("pageRange", "") == "Cover" and item.get("featureImage")
            ]
        ),
        None,
    )
    # for finding the cover image for ebooks
    cover_page_landmark = next(
        iter(
            [
                item
                for item in openbook.get("nav", {}).get("landmarks", [])
                if item["type"] == "cover"
            ]
        ),
        None,
    )
    toc_pages = [item["path"].split("#")[0] for item in openbook_toc]
    manifest_entries: List[Dict] = []

    title_content_entries = list(
        filter(
            lambda e: _filter_content(e, media_info, toc_pages),
            title_contents["entries"],
        )
    )
    # Ignoring mypy error below because of https://github.com/python/mypy/issues/9372
    title_content_entries = sorted(
        title_content_entries, key=cmp_to_key(_sort_title_contents)  # type: ignore[misc]
    )
    progress_bar = tqdm(title_content_entries, disable=args.hide_progress)
    has_ncx = False
    has_nav = False

    # Used to patch magazine css that causes paged mode in the calibre viewer to not work.
    # This expression is used to strip `overflow-x: hidden` from the css definition
    # for `#article-body`.
    patch_magazine_css_overflow_re = re.compile(
        r"(#article-body\s*\{[^{}]+?)overflow-x:\s*hidden;([^{}]+?})"
    )
    # This expression is used to strip `padding: Xem Xem;` from the css definition
    # for `#article-body` to remove the extraneous padding
    patch_magazine_css_padding_re = re.compile(
        r"(#article-body\s*\{[^{}]+?)padding:\s*[^;]+;([^{}]+?})"
    )
    # This expression is used to patch the font-family declarations for fonts
    # that are missing from the magazine css
    patch_magazine_css_font_re = re.compile(r"(font-family: '[^']+(Sans|Serif)[^']+';)")
    # This expression is used to strip the missing font src in the magazine css
    patch_magazine_css_font_src_re = re.compile(
        r"@font-face\s*\{[^{}]+?(src:\s*url\('(fonts/.+\.ttf)'\).+?;)[^{}]+?}"
    )
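
    # Illustrative effect of the overflow/padding patches (assumed css input):
    #   #article-body { margin: 0; overflow-x: hidden; padding: 1em 2em; }
    # becomes (modulo whitespace)
    #   #article-body { margin: 0; }
    # so that calibre's paged mode can lay the article out correctly.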

    # holds the manifest item ID for the image identified as the cover
    cover_img_manifest_id = None

    for entry in progress_bar:
        entry_url = entry["url"]
        parsed_entry_url = urlparse(entry_url)
        title_content_path = Path(parsed_entry_url.path[1:])
        media_type = guess_mimetype(title_content_path.name)
        if not media_type:
            logger.warning("Skipped roster entry: %s", title_content_path.name)
            continue
        asset_folder = book_content_folder.joinpath(title_content_path.parent)
        if media_type == "application/x-dtbncx+xml":
            has_ncx = True
        manifest_entry = {
            "href": parsed_entry_url.path[1:],
            "id": "ncx"
            if media_type == "application/x-dtbncx+xml"
            else _sanitise_opf_id(parsed_entry_url.path[1:]),
            "media-type": media_type,
        }

        # try to find the cover image for magazines
        if cover_toc_item and manifest_entry["id"] == _sanitise_opf_id(
            cover_toc_item["featureImage"]
        ):
            # we assign it here to ensure that the image referenced in the
            # toc actually exists
            cover_img_manifest_id = manifest_entry["id"]

        if not asset_folder.exists():
            asset_folder.mkdir(parents=True, exist_ok=True)
        asset_file_path = asset_folder.joinpath(Path(parsed_entry_url.path).name)

        soup = None
        if asset_file_path.exists():
            progress_bar.set_description(f"Already saved {asset_file_path.name}")
            if media_type in ("application/xhtml+xml", "text/html"):
                with asset_file_path.open("r", encoding="utf-8") as f_asset:
                    soup = BeautifulSoup(f_asset, features="html.parser")
        else:
            progress_bar.set_description(f"Downloading {asset_file_path.name}")
            # use the libby client session because the required
            # auth cookies are set there
            res: requests.Response = libby_client.make_request(
                entry_url, headers=headers, authenticated=False, return_res=True
            )

            # patch magazine css to fix various rendering problems
            if (
                media_info["type"]["id"] == LibbyMediaTypes.Magazine
                and media_type == "text/css"
            ):
                css_content = patch_magazine_css_overflow_re.sub(r"\1\2", res.text)
                css_content = patch_magazine_css_padding_re.sub(r"\1\2", css_content)
                if "#article-body" in css_content:
                    # patch font-family declarations
                    # libby declares these font-faces but does not supply them in the roster,
                    # nor are they actually available when viewed online (http 403)
                    font_families = list(
                        set(patch_magazine_css_font_re.findall(css_content))
                    )
                    for font_family, _ in font_families:
                        new_font_css = font_family[:-1]
                        if "Serif" in font_family:
                            new_font_css += ',Charter,"Bitstream Charter","Sitka Text",Cambria,serif'
                        elif "Sans" in font_family:
                            new_font_css += ",system-ui,sans-serif"
                        new_font_css += ";"
                        if "-Bold" in font_family:
                            new_font_css += " font-weight: 700;"
                        elif "-SemiBold" in font_family:
                            new_font_css += " font-weight: 600;"
                        elif "-Light" in font_family:
                            new_font_css += " font-weight: 300;"
                        css_content = css_content.replace(font_family, new_font_css)
                else:
                    # patch font url declarations
                    # since ttf/otf files are downloaded ahead of css, we can verify
                    # whether the font files are actually available
                    try:
                        font_sources = patch_magazine_css_font_src_re.findall(
                            css_content
                        )
                        for src_match, font_src in font_sources:
                            asset_font_path = Path(
                                urljoin(str(asset_file_path), font_src)
                            )
                            if not asset_font_path.exists():
                                css_content = css_content.replace(src_match, "")
                    except (
                        Exception  # noqa, pylint: disable=broad-exception-caught
                    ) as patch_err:
                        logger.warning(
                            "Error while patching font sources: %s", patch_err
                        )
                with open(asset_file_path, "w", encoding="utf-8") as f_out:
                    f_out.write(css_content)
            elif media_type in ("application/xhtml+xml", "text/html"):
                soup = BeautifulSoup(res.text, features="html.parser")
                script_ele = soup.find("script", attrs={"type": "text/javascript"})
                if script_ele and hasattr(script_ele, "string"):
                    mobj = contents_re.search(script_ele.string or "")
                    if not mobj:
                        logger.warning(
                            "Unable to extract content string for %s",
                            parsed_entry_url.path,
                        )
                    else:
                        new_soup = BeautifulSoup(
                            base64.b64decode(mobj.group("base64_text")),
                            features="html.parser",
                        )
                        soup.body.replace_with(new_soup.body)  # type: ignore[arg-type,union-attr]
                _cleanup_soup(soup, version=epub_version)
                if (
                    cover_toc_item
                    and cover_toc_item.get("featureImage")
                    and manifest_entry["id"] == _sanitise_opf_id(cover_toc_item["path"])
                ):
                    img_src = os.path.relpath(
                        book_content_folder.joinpath(cover_toc_item["featureImage"]),
                        start=asset_folder,
                    )
                    if is_windows():
                        img_src = Path(img_src).as_posix()
                    # patch the svg-based cover for magazines
                    cover_svg = soup.find("svg")
                    if cover_svg:
                        # replace the svg ele with a simple image tag
                        cover_svg.decompose()  # type: ignore[union-attr]
                        for c in soup.body.find_all(recursive=False):  # type: ignore[union-attr]
                            c.decompose()
                        soup.body.append(  # type: ignore[union-attr]
                            soup.new_tag("img", attrs={"src": img_src, "alt": "Cover"})
                        )
                        style_ele = soup.new_tag("style")
                        style_ele.append(
                            "img { max-width: 100%; margin-left: auto; margin-right: auto; }"
                        )
                        soup.head.append(style_ele)  # type: ignore[union-attr]

                with open(asset_file_path, "w", encoding="utf-8") as f_out:
                    f_out.write(str(soup))
            else:
                with open(asset_file_path, "wb") as f_out:
                    f_out.write(res.content)

        if soup:
            # try to minimise soup searches where possible
            if (
                (not cover_img_manifest_id)
                and cover_page_landmark
                and cover_page_landmark["path"] == parsed_entry_url.path[1:]
            ):
                # try to find the cover image for the book from the cover html content
                cover_image = soup.find("img", attrs={"src": True})
                if cover_image:
                    cover_img_manifest_id = _sanitise_opf_id(
                        urljoin(cover_page_landmark["path"], cover_image["src"])  # type: ignore[index]
                    )
            elif (not has_nav) and soup.find(attrs={"epub:type": "toc"}):
                # identify the nav page
                manifest_entry["properties"] = "nav"
                has_nav = True
            elif soup.find("svg"):
                # page has svg
                manifest_entry["properties"] = "svg"

        if cover_img_manifest_id == manifest_entry["id"]:
            manifest_entry["properties"] = "cover-image"
        manifest_entries.append(manifest_entry)
        if manifest_entry.get("properties") == "cover-image" and cover_path:
            # overwrite the cover image already downloaded via the OverDrive api,
            # in case it is to be kept
            shutil.copyfile(asset_file_path, cover_path)

    if not has_nav:
        # Generate nav - needed for magazines

        # we give the nav an id-stamped file name to avoid accidentally
        # overwriting an existing file name
        nav_file_name = f'nav_{loan["id"]}.xhtml'

        nav_soup = BeautifulSoup(NAV_XHTMLTEMPLATE, features="html.parser")
        nav_soup.find("title").append(loan["title"])  # type: ignore[union-attr]
        toc_ele = nav_soup.find(id="toc")

        # sort toc into hierarchical sections
        hierarchical_toc = _sort_toc(openbook_toc)
        for item in hierarchical_toc:
            li_ele = nav_soup.new_tag("li")
            if not item.get("sectionName"):
                a_ele = nav_soup.new_tag("a", attrs={"href": item["path"]})
                a_ele.append(item["title"])
                li_ele.append(a_ele)
                toc_ele.append(li_ele)  # type: ignore[union-attr]
                continue
            # since we don't have a section content page, and this can cause problems,
            # link the section to the first article path
            a_ele = nav_soup.new_tag("a", attrs={"href": item["items"][0]["path"]})
            a_ele.append(item["sectionName"])
            li_ele.append(a_ele)
            ol_ele = nav_soup.new_tag("ol", attrs={"type": "1"})
            for section_item in item.get("items", []):
                section_li_ele = nav_soup.new_tag("li")
                section_item_a_ele = nav_soup.new_tag(
                    "a", attrs={"href": section_item["path"]}
                )
                section_item_a_ele.append(section_item["title"])
                section_li_ele.append(section_item_a_ele)
                ol_ele.append(section_li_ele)
            li_ele.append(ol_ele)
            toc_ele.append(li_ele)  # type: ignore[union-attr]

        with book_content_folder.joinpath(nav_file_name).open(
            "w", encoding="utf-8"
        ) as f_nav:
            f_nav.write(str(nav_soup).strip())
        manifest_entries.append(
            {
                "href": nav_file_name,
                "id": _sanitise_opf_id(nav_file_name),
                "media-type": "application/xhtml+xml",
                "properties": "nav",
            }
        )

    if not has_ncx:
        # generate the ncx for backward compatibility
        ncx = _build_ncx(media_info, openbook, nav_file_name if not has_nav else "")
        # we give the ncx an id-stamped file name to avoid accidentally
        # overwriting an existing file name
        toc_ncx_name = f'toc_{loan["id"]}.ncx'
        tree = ET.ElementTree(ncx)
        tree.write(
            book_content_folder.joinpath(toc_ncx_name),
            xml_declaration=True,
            encoding="utf-8",
        )
        manifest_entries.append(
            {
                "href": toc_ncx_name,
                "id": "ncx",
                "media-type": "application/x-dtbncx+xml",
            }
        )
        has_ncx = True
    else:
        # EPUB3 compliance: ensure that the identifier in the ncx matches the one
        # in the OPF. A mismatch can occur because the toc.ncx is supplied by the publisher.
        ncx_manifest_entry = next(
            iter([m for m in manifest_entries if m["id"] == "ncx"]), None
        )
        if ncx_manifest_entry:
            expected_book_identifier = (
                extract_isbn(
                    media_info["formats"],
                    format_types=[
                        LibbyFormats.MagazineOverDrive
                        if loan["type"]["id"] == LibbyMediaTypes.Magazine
                        else LibbyFormats.EBookOverdrive
                    ],
                )
                or media_info["id"]
            )  # this is the summarised logic from build_opf_package
            ncx_path = book_content_folder.joinpath(ncx_manifest_entry["href"])
            new_ncx_contents = None
            with ncx_path.open("r", encoding="utf-8") as ncx_f:
                ncx_soup = BeautifulSoup(ncx_f, features="xml")
                meta_id = ncx_soup.find("meta", attrs={"name": "dtb:uid"})
                if (
                    meta_id
                    and isinstance(meta_id, Tag)
                    and meta_id.get("content")
                    and meta_id["content"] != expected_book_identifier
                ):
                    logger.debug(
                        'Replacing identifier in %s: "%s" -> "%s"',
                        ncx_path.name,
                        meta_id["content"],
                        expected_book_identifier,
                    )
                    meta_id["content"] = expected_book_identifier
                    new_ncx_contents = str(ncx_soup)
            if new_ncx_contents:
                with ncx_path.open("w", encoding="utf-8") as ncx_f:
                    ncx_f.write(new_ncx_contents)

    # create epub OPF
    opf_file_name = "package.opf"
    opf_file_path = book_content_folder.joinpath(opf_file_name)
    package = build_opf_package(
        media_info,
        version=epub_version,
        loan_format=LibbyFormats.MagazineOverDrive
        if loan["type"]["id"] == LibbyMediaTypes.Magazine
        else LibbyFormats.EBookOverdrive,
    )
    if args.generate_opf:
        # save the opf before the manifest and spine elements get added,
        # because those elements are meaningless outside an epub
        export_opf_file = epub_file_path.with_suffix(".opf")
        ET.ElementTree(package).write(
            export_opf_file, xml_declaration=True, encoding="utf-8"
        )
        logger.info('Saved "%s"', colored(str(export_opf_file), "magenta"))

    # add manifest
    manifest = ET.SubElement(package, "manifest")
    for entry in manifest_entries:
        ET.SubElement(manifest, "item", attrib=entry)

    cover_manifest_entry = next(
        iter(
            [
                entry
                for entry in manifest_entries
                if entry.get("properties", "") == "cover-image"
            ]
        ),
        None,
    )
    if not cover_manifest_entry:
        cover_img_manifest_id = None
    if cover_path and not cover_manifest_entry:
        # add the cover image separately since we can't identify which item is the cover
        # we give the cover a timestamped file name to avoid accidentally
        # overwriting an existing file name
        cover_image_name = f"cover_{int(datetime.datetime.now().timestamp())}.jpg"
        shutil.copyfile(cover_path, book_content_folder.joinpath(cover_image_name))
        cover_img_manifest_id = "coverimage"
        ET.SubElement(
            manifest,
            "item",
            attrib={
                "id": cover_img_manifest_id,
                "href": cover_image_name,
                "media-type": "image/jpeg",
                "properties": "cover-image",
            },
        )

    if cover_img_manifest_id:
        metadata = package.find("metadata")
        if metadata is not None:  # an Element with no children is falsy, so test for None
            _ = ET.SubElement(
                metadata,
                "meta",
                attrib={"name": "cover", "content": cover_img_manifest_id},
            )

848 # add spine 

849 spine = ET.SubElement(package, "spine") 

850 if has_ncx: 

851 spine.set("toc", "ncx") 

852 spine_entries = list( 

853 filter( 

854 lambda s: not ( 

855 media_info["type"]["id"] == LibbyMediaTypes.Magazine 

856 and s["-odread-original-path"] not in toc_pages 

857 ), 

858 openbook["spine"], 

859 ) 

860 ) 

861 

862 # Ignoring mypy error below because of https://github.com/python/mypy/issues/9372 

863 spine_entries = sorted( 

864 spine_entries, key=cmp_to_key(lambda a, b: _sort_spine_entries(a, b, toc_pages)) # type: ignore[misc] 

865 ) 

866 for spine_idx, entry in enumerate(spine_entries): 

867 if ( 

868 media_info["type"]["id"] == LibbyMediaTypes.Magazine 

869 and entry["-odread-original-path"] not in toc_pages 

870 ): 

871 continue 

872 item_ref = ET.SubElement(spine, "itemref") 

873 item_ref.set("idref", _sanitise_opf_id(entry["-odread-original-path"])) 

874 if spine_idx == 0 and not has_nav: 

875 item_ref = ET.SubElement(spine, "itemref") 

876 item_ref.set("idref", _sanitise_opf_id(nav_file_name)) 

    # add guide
    if openbook.get("nav", {}).get("landmarks"):
        guide = ET.SubElement(package, "guide")
        for landmark in openbook["nav"]["landmarks"]:
            _ = ET.SubElement(
                guide,
                "reference",
                attrib={
                    "href": landmark["path"],
                    "title": landmark["title"],
                    "type": landmark["type"],
                },
            )

892 if args.is_debug_mode: 

893 from xml.dom import minidom 

894 

895 with opf_file_path.open("w", encoding="utf-8") as f: 

896 f.write( 

897 minidom.parseString(ET.tostring(package, "utf-8")).toprettyxml( 

898 indent="\t" 

899 ) 

900 ) 

901 else: 

902 tree = ET.ElementTree(package) 

903 tree.write(opf_file_path, xml_declaration=True, encoding="utf-8") 

904 logger.debug('Saved "%s"', opf_file_path) 

    # create container.xml
    container_file_path = book_meta_folder.joinpath("container.xml")
    container = ET.Element(
        "container",
        attrib={
            "version": "1.0",
            "xmlns": "urn:oasis:names:tc:opendocument:xmlns:container",
        },
    )
    root_files = ET.SubElement(container, "rootfiles")
    _ = ET.SubElement(
        root_files,
        "rootfile",
        attrib={
            # use a posix path because zipfile requires "/" separators
            "full-path": Path(book_content_name, opf_file_name).as_posix(),
            "media-type": "application/oebps-package+xml",
        },
    )
    tree = ET.ElementTree(container)
    tree.write(container_file_path, xml_declaration=True, encoding="utf-8")
    logger.debug('Saved "%s"', container_file_path)
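
    # Illustrative container.xml written above (pretty-printed for readability;
    # ElementTree emits it on a single line):
    #   <?xml version='1.0' encoding='utf-8'?>
    #   <container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
    #     <rootfiles>
    #       <rootfile full-path="OEBPS/package.opf" media-type="application/oebps-package+xml"/>
    #     </rootfiles>
    #   </container>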

    # create the epub zip
    with zipfile.ZipFile(
        epub_file_path, mode="w", compression=zipfile.ZIP_DEFLATED
    ) as epub_zip:
        # the mimetype entry must come first and must be stored uncompressed
        epub_zip.writestr(
            "mimetype", "application/epub+zip", compress_type=zipfile.ZIP_STORED
        )
        for root_start in (book_meta_folder, book_content_folder):
            for p in root_start.glob("**/*"):
                if p.is_dir():
                    continue
                zip_archive_file = p.relative_to(book_folder)
                # using posix path because zipfile requires "/" separators
                # and may break on Windows otherwise
                zip_archive_name = zip_archive_file.as_posix()
                zip_target_file = book_folder.joinpath(zip_archive_file)
                epub_zip.write(zip_target_file, zip_archive_name)
                logger.debug(
                    'epub: Added "%s" as "%s"', zip_target_file, zip_archive_name
                )
    logger.info('Saved "%s"', colored(str(epub_file_path), "magenta", attrs=["bold"]))

    # clean up
    if not args.is_debug_mode:
        for file_name in (
            "mimetype",
            "media.json",
            "openbook.json",
            "loan.json",
            "rosters.json",
        ):
            target = book_folder.joinpath(file_name)
            if target.exists():
                target.unlink()
        for folder in (book_content_folder, book_meta_folder):
            shutil.rmtree(folder, ignore_errors=True)