pdf_generator_enhanced_v2.py 57 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. 增强版PDF生成器 V2
  5. 基于HTML文件解析和Markdown内容提取的PDF生成系统
  6. 支持中英文双语版本,自动生成目录和正文
  7. """
  8. import os
  9. import sys
  10. import json
  11. import re
  12. import tempfile
  13. import shutil
  14. import subprocess
  15. from pathlib import Path
  16. from typing import List, Dict, Optional, Tuple
  17. import argparse
  18. from datetime import datetime
  19. from bs4 import BeautifulSoup
  20. import markdown
  21. from markdown.extensions import codehilite, tables, toc
  22. class HTMLParser:
  23. """HTML文件解析器"""
  24. def __init__(self, html_file: Path):
  25. self.html_file = html_file
  26. self.soup = None
  27. self._parse_html()
  28. def _parse_html(self):
  29. """解析HTML文件"""
  30. try:
  31. with open(self.html_file, 'r', encoding='utf-8') as f:
  32. content = f.read()
  33. self.soup = BeautifulSoup(content, 'html.parser')
  34. except Exception as e:
  35. print(f"警告: 无法解析HTML文件 {self.html_file}: {e}")
  36. self.soup = None
  37. def extract_title(self) -> str:
  38. """提取页面标题"""
  39. if not self.soup:
  40. return "未知标题"
  41. # 尝试多种方式获取标题
  42. title_selectors = [
  43. 'h1',
  44. '.rst-content h1',
  45. 'title',
  46. '.document h1'
  47. ]
  48. for selector in title_selectors:
  49. element = self.soup.select_one(selector)
  50. if element:
  51. title = element.get_text().strip()
  52. if title and title != "Titan-Board SDK 1.0.0 文档":
  53. return title
  54. return "未知标题"
  55. def extract_content(self) -> str:
  56. """提取主要内容(保留HTML结构,并将资源链接改为绝对路径)"""
  57. if not self.soup:
  58. return ""
  59. # 查找主要内容区域
  60. content_selectors = [
  61. '.rst-content',
  62. '.document',
  63. '.body',
  64. 'main',
  65. 'article'
  66. ]
  67. content_element = None
  68. for selector in content_selectors:
  69. content_element = self.soup.select_one(selector)
  70. if content_element:
  71. break
  72. if not content_element:
  73. # 如果没有找到特定容器,使用body
  74. content_element = self.soup.find('body')
  75. if not content_element:
  76. return ""
  77. # 移除不需要的元素(脚本、样式、侧栏、版本菜单、编辑/PDF按钮、搜索框等)
  78. remove_selectors = [
  79. 'script', 'style', 'nav',
  80. '.wy-nav-side', '.rst-versions', '.version-menu',
  81. '.edit-button', '.pdf-button', '[role="search"]',
  82. 'a.headerlink'
  83. ]
  84. for sel in remove_selectors:
  85. for node in content_element.select(sel):
  86. node.decompose()
  87. # 将相对资源路径转换为绝对 file:// 路径
  88. base_dir = self.html_file.parent
  89. def _abs_url(url: str) -> str:
  90. if not url:
  91. return url
  92. u = url.strip()
  93. if u.startswith(('http://', 'https://', 'data:', 'mailto:', '#')):
  94. return u
  95. try:
  96. return (base_dir / u).resolve().as_uri()
  97. except Exception:
  98. return u
  99. for tag in content_element.select('[src]'):
  100. tag['src'] = _abs_url(tag.get('src'))
  101. for tag in content_element.select('a[href]'):
  102. href = tag.get('href')
  103. # 锚点保持不变
  104. if href and not href.startswith('#'):
  105. tag['href'] = _abs_url(href)
  106. # 返回主要内容的内部HTML,保留原有结构与样式类名
  107. return content_element.decode_contents()
  108. def extract_toc(self) -> List[Dict]:
  109. """提取目录结构"""
  110. if not self.soup:
  111. return []
  112. toc_items = []
  113. # 查找目录元素
  114. toc_selectors = [
  115. '.wy-menu .toctree-l1',
  116. '.toctree-wrapper .toctree-l1',
  117. '.rst-content .toctree-l1'
  118. ]
  119. for selector in toc_selectors:
  120. elements = self.soup.select(selector)
  121. if elements:
  122. for element in elements:
  123. link = element.find('a')
  124. if link:
  125. title = link.get_text().strip()
  126. href = link.get('href', '')
  127. if title and href:
  128. toc_items.append({
  129. 'title': title,
  130. 'href': href,
  131. 'level': 1
  132. })
  133. break
  134. return toc_items
  135. class MarkdownProcessor:
  136. """Markdown处理器"""
  137. def __init__(self):
  138. self.md = markdown.Markdown(
  139. extensions=[
  140. 'codehilite',
  141. 'tables',
  142. 'toc',
  143. 'fenced_code',
  144. 'attr_list',
  145. 'nl2br',
  146. 'sane_lists',
  147. 'footnotes'
  148. ],
  149. extension_configs={
  150. 'codehilite': {
  151. 'css_class': 'highlight',
  152. 'use_pygments': False
  153. },
  154. 'toc': {
  155. 'permalink': False,
  156. 'baselevel': 1
  157. }
  158. }
  159. )
  160. def process_markdown(self, md_content: str) -> str:
  161. """处理Markdown内容"""
  162. try:
  163. # Markdown 实例是有状态的,转换前需重置以避免交叉污染
  164. self.md.reset()
  165. html = self.md.convert(md_content)
  166. return html
  167. except Exception as e:
  168. print(f"警告: Markdown处理失败: {e}")
  169. return md_content
  170. class DocumentScanner:
  171. """文档扫描器(从原始Markdown项目目录读取)"""
  172. def __init__(self, html_dir: Path, projects_root: Path):
  173. self.html_dir = html_dir
  174. self.projects_root = projects_root
  175. # 分类与顺序:从 config.yaml 的 generation.output_structure 读取,如果缺省则按默认顺序
  176. self.categories = {}
  177. self.category_name_map = {}
  178. self.category_order: List[str] = []
  179. try:
  180. cfg_path = Path(__file__).parent / 'config.yaml'
  181. if cfg_path.exists():
  182. import yaml
  183. with open(cfg_path, 'r', encoding='utf-8') as f:
  184. cfg = yaml.safe_load(f) or {}
  185. cfg_cats = (cfg.get('categories') or {})
  186. out_struct = ((cfg.get('generation', {}) or {}).get('output_structure', []) or [])
  187. if out_struct:
  188. self.category_order = list(out_struct)
  189. else:
  190. self.category_order = ['start', 'basic', 'driver', 'component', 'multimedia', 'multcore']
  191. # 名称映射 + 填充 categories 字典用于扫描
  192. for key in self.category_order:
  193. node = cfg_cats.get(key) or {}
  194. name_cn = node.get('name') or key
  195. name_en = node.get('name_en') or name_cn
  196. self.category_name_map[key] = {'name': name_cn, 'name_en': name_en}
  197. # categories 值用中文默认名作占位
  198. self.categories[key] = name_cn
  199. except Exception:
  200. # 回退默认
  201. self.category_order = ['start', 'basic', 'driver', 'component', 'multimedia', 'multcore']
  202. for key, default_cn in {
  203. 'start': '快速上手',
  204. 'basic': '基础篇',
  205. 'driver': '驱动篇',
  206. 'component': '组件篇',
  207. 'multimedia': '多媒体显示篇',
  208. 'multcore': '多核通信篇'
  209. }.items():
  210. self.category_name_map[key] = {'name': default_cn, 'name_en': default_cn}
  211. self.categories[key] = default_cn
  212. def scan_documents(self, language: str = 'zh') -> Dict[str, List[Dict]]:
  213. """扫描文档结构(只使用 Markdown 文件)"""
  214. documents = {}
  215. for category, category_name in self.categories.items():
  216. # 用 html_dir 列出该分类下有哪些项目目录(递归),以确定项目名
  217. category_dir = self.html_dir / category
  218. if not category_dir.exists():
  219. continue
  220. category_docs = []
  221. # 递归查找该分类下的所有项目目录:判定标准为目录中存在任意 HTML(README*.html 等)
  222. try:
  223. candidates = []
  224. for html_file in category_dir.rglob('*.html'):
  225. rel = html_file.parent.relative_to(category_dir) # 支持多级路径
  226. if str(rel) == '.':
  227. continue
  228. candidates.append(rel)
  229. except Exception:
  230. candidates = []
  231. # 去重并按路径稳定排序
  232. seen = set()
  233. uniq_candidates = []
  234. for rel in sorted(candidates, key=lambda p: str(p).lower()):
  235. key = str(rel).replace('\\','/')
  236. if key in seen:
  237. continue
  238. seen.add(key)
  239. uniq_candidates.append(rel)
  240. for rel_path in uniq_candidates:
  241. project_dir = self.projects_root / rel_path
  242. if not project_dir.exists():
  243. # 回退到最后一段匹配
  244. project_dir = self.projects_root / rel_path.name
  245. if not project_dir.exists():
  246. continue
  247. readme_file = project_dir / ('README_zh.md' if language == 'zh' else 'README.md')
  248. if not readme_file.exists():
  249. # 若指定语言缺失,尝试另一种
  250. alt_readme = project_dir / ('README.md' if language == 'zh' else 'README_zh.md')
  251. if alt_readme.exists():
  252. readme_file = alt_readme
  253. else:
  254. continue
  255. title = self._extract_markdown_title(readme_file)
  256. category_docs.append({
  257. 'title': title,
  258. 'file': readme_file,
  259. 'project_name': str(rel_path).replace('\\','/'),
  260. 'project_dir': project_dir,
  261. 'category': category,
  262. 'category_name': self.category_name_map.get(category, {}).get('name', category),
  263. 'category_name_en': self.category_name_map.get(category, {}).get('name_en', self.category_name_map.get(category, {}).get('name', category))
  264. })
  265. if category_docs:
  266. documents[category] = category_docs
  267. return documents
  268. def _extract_markdown_title(self, md_file: Path) -> str:
  269. """从Markdown文件提取标题"""
  270. try:
  271. with open(md_file, 'r', encoding='utf-8') as f:
  272. content = f.read()
  273. # 查找第一个一级标题
  274. lines = content.split('\n')
  275. for line in lines:
  276. line = line.strip()
  277. if line.startswith('# '):
  278. title = line[2:].strip()
  279. # 清理标题中的Markdown格式
  280. title = re.sub(r'\*\*(.*?)\*\*', r'\1', title) # 移除粗体
  281. title = re.sub(r'\*(.*?)\*', r'\1', title) # 移除斜体
  282. title = re.sub(r'`(.*?)`', r'\1', title) # 移除代码格式
  283. return title
  284. # 如果没有找到一级标题,尝试二级标题
  285. for line in lines:
  286. line = line.strip()
  287. if line.startswith('## '):
  288. title = line[3:].strip()
  289. title = re.sub(r'\*\*(.*?)\*\*', r'\1', title)
  290. title = re.sub(r'\*(.*?)\*', r'\1', title)
  291. title = re.sub(r'`(.*?)`', r'\1', title)
  292. return title
  293. return md_file.stem
  294. except Exception as e:
  295. print(f"警告: 无法读取Markdown文件 {md_file}: {e}")
  296. return md_file.stem
  297. class PDFGeneratorV2:
  298. """增强版PDF生成器V2"""
  299. def __init__(self, html_dir: Path, output_dir: Path, keep_temp: bool = False, browser_path: Optional[str] = None):
  300. self.html_dir = html_dir
  301. self.output_dir = output_dir
  302. self.temp_dir = Path(tempfile.mkdtemp())
  303. self.keep_temp = keep_temp
  304. self.browser_path = browser_path
  305. self.scanner = DocumentScanner(html_dir, self._derive_projects_root())
  306. self.md_processor = MarkdownProcessor()
  307. self.toc_entries = [] # [{'level':1,'title':'1. Title','anchor':'id'}]
  308. self.assets_dir: Optional[Path] = None
  309. def __del__(self):
  310. """清理临时文件"""
  311. try:
  312. if not getattr(self, 'keep_temp', False) and self.temp_dir.exists():
  313. shutil.rmtree(self.temp_dir, ignore_errors=True)
  314. except Exception:
  315. pass
  316. def generate_pdf(self, title: str = "SDK文档", language: str = "zh") -> bool:
  317. """生成PDF的主流程"""
  318. print("=" * 60)
  319. print(f"开始生成PDF文档 V2 - 语言: {language}")
  320. print("=" * 60)
  321. # 为每次生成创建独立的临时目录,避免跨语言复用导致清理困难
  322. try:
  323. import tempfile as _tempfile
  324. self.temp_dir = Path(_tempfile.mkdtemp())
  325. # 重置目录收集,避免多语言生成时相互污染
  326. self.toc_entries = []
  327. # 0. 加载项目信息(版本、版权等)
  328. self.project_meta = self._load_project_meta()
  329. # 1. 扫描文档结构
  330. print("1. 扫描文档结构...")
  331. documents = self.scanner.scan_documents(language)
  332. if not documents:
  333. print("✗ 未找到文档文件")
  334. return False
  335. total_docs = sum(len(docs) for docs in documents.values())
  336. print(f"✓ 找到 {total_docs} 个文档文件")
  337. # 准备资源输出目录(用于相对路径拷贝)
  338. self.assets_dir = self.temp_dir / 'assets'
  339. self.assets_dir.mkdir(exist_ok=True)
  340. # 记录章节结构来源(动态/硬编码)
  341. order = getattr(self.scanner, 'category_order', None)
  342. if order:
  343. print("✓ 章节结构: 动态 (来自 config.yaml:generation.output_structure)")
  344. print(" 顺序: " + ", ".join(order))
  345. else:
  346. print("✓ 章节结构: 硬编码回退 (未在 config.yaml 中找到 output_structure)")
  347. # 2. 生成正文内容(先生成正文以便收集目录项)
  348. print("2. 生成正文内容...")
  349. content_html = self._generate_content(documents, language)
  350. # 3. 生成目录(依赖已收集的 toc_entries)
  351. print("3. 生成目录结构...")
  352. toc_html = self._generate_toc(documents, language)
  353. # 4. 创建完整的HTML文件
  354. print("4. 创建完整HTML文件...")
  355. full_html = self._create_full_html(title, toc_html, content_html, language)
  356. # 5. 生成PDF
  357. print("5. 生成PDF文件...")
  358. success = self._generate_pdf_from_html(full_html, title, language)
  359. if success:
  360. print("=" * 60)
  361. print("PDF生成完成!")
  362. print(f"📁 输出位置: {self.output_dir}")
  363. print("=" * 60)
  364. if getattr(self.scanner, 'category_order', None):
  365. print("总结: 本次 PDF 章节名称与顺序根据 docs/source/config.yaml 动态生成")
  366. else:
  367. print("总结: 本次 PDF 章节名称与顺序使用硬编码回退顺序")
  368. if getattr(self, 'keep_temp', False):
  369. try:
  370. print(f"临时目录保留: {self.temp_dir}")
  371. except Exception:
  372. pass
  373. return success
  374. except Exception as e:
  375. print(f"✗ PDF生成过程出错: {e}")
  376. return False
  377. finally:
  378. # 结束后清理本次生成的临时目录(除非要求保留)
  379. try:
  380. if not getattr(self, 'keep_temp', False) and getattr(self, 'temp_dir', None) and self.temp_dir.exists():
  381. shutil.rmtree(self.temp_dir, ignore_errors=True)
  382. except Exception:
  383. pass
  384. def _generate_toc(self, documents: Dict[str, List[Dict]], language: str) -> str:
  385. """根据已收集的多级标题生成目录HTML(带点线引导)"""
  386. import re
  387. list_items = []
  388. for entry in self.toc_entries:
  389. indent = (entry['level'] - 1) * 18
  390. title_text = entry["title"]
  391. # 清洗标题中的前导列表符号或编号后的星号:"* ", "- ", "• "、以及诸如 "1.2. * Title"
  392. title_text = re.sub(r'^[\s\*\-\u2022]+', '', title_text)
  393. title_text = re.sub(r'^(\d+(?:\.\d+)*)\.\s*[\*\-\u2022]+\s*', r'\1. ', title_text)
  394. # 再次去除可能残留的单独星号包围
  395. title_text = re.sub(r'\s+[\*\u2022]+\s+', ' ', title_text)
  396. list_items.append(
  397. f'<li class="toc-item level-{entry["level"]}" style="margin-left:{indent}px">'
  398. f'<a href="#{entry["anchor"]}"><span class="toc-text">{title_text}</span><span class="toc-dots"></span><span class="toc-page"></span></a>'
  399. f'</li>'
  400. )
  401. toc_title = '目录' if language == 'zh' else 'Contents'
  402. return (
  403. '<div class="toc">'
  404. f'<h2>{toc_title}</h2>'
  405. '<ul>' + ''.join(list_items) + '</ul>'
  406. '</div>'
  407. )
  408. def _generate_content(self, documents: Dict[str, List[Dict]], language: str) -> str:
  409. """生成正文内容HTML(分类、文档二级编号 + 文档内多级编号)"""
  410. content_sections = []
  411. category_index = 0
  412. # 按配置顺序渲染分类
  413. for category in getattr(self, 'category_order', []) or documents.keys():
  414. docs = documents.get(category, [])
  415. if not docs:
  416. continue
  417. # 本地化分类名
  418. # 根据语言选择分类名
  419. category_name = docs[0]['category_name'] if language == 'zh' else docs[0].get('category_name_en', docs[0]['category_name'])
  420. if language == 'en':
  421. cn_to_en = {
  422. '快速上手': 'Getting Started',
  423. '基础篇': 'Basics',
  424. '驱动篇': 'Drivers',
  425. '组件篇': 'Components',
  426. '多媒体显示篇': 'Multimedia Display',
  427. '多核通信篇': 'Multicore Communication',
  428. }
  429. category_name = cn_to_en.get(category_name, category_name)
  430. category_anchor = f'cat-{category}'
  431. category_index += 1
  432. category_number_text = f'{category_index}. '
  433. content_sections.append(f'<h1 id="{category_anchor}" class="category-title">{category_number_text}{category_name}</h1>')
  434. self.toc_entries.append({'level': 1, 'title': f'{category_number_text}{category_name}', 'anchor': category_anchor})
  435. doc_counter = 0
  436. for doc in docs:
  437. doc_id = f"doc-{self._slugify(doc['project_name'])}"
  438. content_sections.append(f'<section id="{doc_id}" class="document-section">')
  439. # 文档标题(二级)
  440. doc_counter += 1
  441. doc_anchor = f'{doc_id}-title'
  442. doc_number_text = f'{category_index}.{doc_counter}. '
  443. content_sections.append(f'<h2 id="{doc_anchor}" class="document-title">{doc_number_text}{doc["title"]}</h2>')
  444. self.toc_entries.append({'level': 2, 'title': f'{doc_number_text}{doc["title"]}', 'anchor': doc_anchor})
  445. # 文档内容与内部编号(三级及以下)
  446. doc_content = self._extract_document_content(doc, language)
  447. if doc_content:
  448. numbered_html = self._auto_number_and_collect_toc(doc_content, base_numbers=[category_index, doc_counter], doc_title=doc["title"])
  449. content_sections.append(numbered_html)
  450. else:
  451. content_sections.append('<p>文档内容加载失败</p>')
  452. content_sections.append('</section>')
  453. return '\n'.join(content_sections)
  454. def _extract_document_content(self, doc_meta: Dict, language: str) -> str:
  455. """提取文档内容,并修复资源路径"""
  456. try:
  457. doc_file: Path = doc_meta['file']
  458. # 仅从Markdown文件提取内容
  459. with open(doc_file, 'r', encoding='utf-8') as f:
  460. md_content = f.read()
  461. # 预处理Markdown内容
  462. md_content = self._preprocess_markdown(md_content)
  463. # 处理Markdown
  464. html_content = self.md_processor.process_markdown(md_content)
  465. # 修复图片/链接等资源路径
  466. html_content = self._rewrite_resource_paths(html_content, doc_meta)
  467. return html_content
  468. except Exception as e:
  469. print(f"警告: 无法提取文档内容 {doc_file}: {e}")
  470. return ""
  471. def _text_to_html(self, text: str) -> str:
  472. """将纯文本转换为HTML格式"""
  473. if not text:
  474. return ""
  475. # 简单的文本到HTML转换
  476. lines = text.split('\n')
  477. html_lines = []
  478. for line in lines:
  479. line = line.strip()
  480. if not line:
  481. html_lines.append('<br>')
  482. elif line.startswith('#'):
  483. # 标题
  484. level = len(line) - len(line.lstrip('#'))
  485. title_text = line.lstrip('# ').strip()
  486. html_lines.append(f'<h{level}>{title_text}</h{level}>')
  487. elif line.startswith('- ') or line.startswith('* '):
  488. # 列表项
  489. html_lines.append(f'<li>{line[2:].strip()}</li>')
  490. elif line.startswith('```'):
  491. # 代码块
  492. html_lines.append('<pre><code>')
  493. elif line.endswith('```'):
  494. html_lines.append('</code></pre>')
  495. else:
  496. # 普通段落
  497. html_lines.append(f'<p>{line}</p>')
  498. return '\n'.join(html_lines)
  499. def _derive_projects_root(self) -> Path:
  500. """从 docs/source/config.yaml 读取项目根目录,找不到则回退到 ../../project"""
  501. try:
  502. cfg_path = Path(__file__).parent / 'config.yaml'
  503. if cfg_path.exists():
  504. import yaml
  505. with open(cfg_path, 'r', encoding='utf-8') as f:
  506. cfg = yaml.safe_load(f) or {}
  507. pdir = ((cfg.get('repository', {}) or {}).get('projects_dir', '../../project') or '../../project')
  508. return (Path(__file__).parent / pdir).resolve()
  509. except Exception:
  510. pass
  511. return (Path(__file__).parent / '../../project').resolve()
  512. def _slugify(self, text: str) -> str:
  513. slug = re.sub(r'[^\w\-\.]+', '-', text, flags=re.UNICODE).strip('-').lower()
  514. return slug or 'section'
  515. def _rewrite_resource_paths(self, html: str, doc_meta: Dict) -> str:
  516. """将文档内相对资源路径改写为相对于合并HTML的相对路径。
  517. 策略:把源图片复制到临时目录 temp/assets/<category>/<project>/,并将链接改为相对路径 assets/...。
  518. """
  519. try:
  520. soup = BeautifulSoup(html, 'html.parser')
  521. project_dir: Path = doc_meta.get('project_dir')
  522. category: str = doc_meta.get('category', '')
  523. project_name: str = doc_meta.get('project_name', '')
  524. docs_source_root = Path(__file__).parent
  525. # 目标相对目录(相对于合并HTML)
  526. rel_target_dir = Path('assets') / category / project_name
  527. abs_target_dir = (self.assets_dir or (self.temp_dir / 'assets')) / category / project_name
  528. abs_target_dir.mkdir(parents=True, exist_ok=True)
  529. def resolve_url(u: str) -> str:
  530. if not u:
  531. return u
  532. s = u.strip()
  533. if s.startswith(('http://', 'https://', 'data:', 'mailto:', '#')):
  534. return s
  535. # 尝试项目源码目录
  536. if project_dir is not None:
  537. p1 = (project_dir / s)
  538. try:
  539. if p1.exists():
  540. # 复制到目标 assets 目录并返回相对路径
  541. dest = abs_target_dir / p1.name
  542. try:
  543. if p1.is_file():
  544. import shutil
  545. shutil.copy2(p1, dest)
  546. return str((rel_target_dir / p1.name).as_posix())
  547. except Exception:
  548. pass
  549. except Exception:
  550. pass
  551. # 尝试 docs/source/<category>/<project_name>/
  552. p2 = (docs_source_root / category / project_name / s)
  553. try:
  554. if p2.exists():
  555. dest = abs_target_dir / p2.name
  556. try:
  557. if p2.is_file():
  558. import shutil
  559. shutil.copy2(p2, dest)
  560. return str((rel_target_dir / p2.name).as_posix())
  561. except Exception:
  562. pass
  563. except Exception:
  564. pass
  565. return s
  566. for tag in soup.select('img[src]'):
  567. tag['src'] = resolve_url(tag.get('src'))
  568. for tag in soup.select('a[href]'):
  569. href = tag.get('href')
  570. if href and not href.startswith('#'):
  571. tag['href'] = resolve_url(href)
  572. return str(soup)
  573. except Exception:
  574. return html
  575. def _auto_number_and_collect_toc(self, html: str, base_numbers: List[int], doc_title: Optional[str] = None) -> str:
  576. """为文档内部标题自动编号并收集目录(最大深度到第3级)。
  577. 规则:目录深度仅到 3 级:分类(1) -> 文档(2) -> 文档内首级标题(3)。
  578. base_numbers: 分类号与文档号作为前缀,例如 [2,1] -> 2.1.x
  579. """
  580. try:
  581. soup = BeautifulSoup(html, 'html.parser')
  582. # 避免 Sphinx headerlink 图标导致的方框,移除
  583. for node in soup.select('a.headerlink'):
  584. node.decompose()
  585. # 规范化:如果文档首个标题不是 h1,则将其作为 h1 起始
  586. headers = soup.find_all([f'h{i}' for i in range(1, 4)])
  587. first_level = None
  588. for h in headers:
  589. first_level = int(h.name[1])
  590. break
  591. if first_level and first_level > 1:
  592. for h in headers:
  593. lvl = int(h.name[1])
  594. h.name = f'h{max(1, lvl - first_level + 1)}'
  595. # 如果文档首个 h1 与外层文档标题相同,则删除该 h1,避免目录重复(例如 1.1 与 1.1.1 重复)
  596. if doc_title:
  597. # 取第一个 h1
  598. first_h1 = soup.find('h1')
  599. if first_h1:
  600. def _normalize(text: str) -> str:
  601. import re as _re
  602. t = (text or '').strip()
  603. t = _re.sub(r'[\s\*\u2022]+', ' ', t)
  604. t = _re.sub(r'\s+', ' ', t)
  605. return t
  606. if _normalize(first_h1.get_text()) == _normalize(doc_title):
  607. # 删除该 h1,不计入目录
  608. first_h1.decompose()
  609. # 仅对文档内首级标题(h1)编号,保证总深度不超过 3
  610. local_counter = 0
  611. for header in soup.find_all('h1'):
  612. local_counter += 1
  613. parts = [str(n) for n in base_numbers] + [str(local_counter)]
  614. number_text = '.'.join(parts) + '. '
  615. # 原标题文本
  616. original_text = header.get_text(strip=True)
  617. # 写回:前置编号
  618. # 清空子节点,仅保留纯文本(保留强调等会复杂,这里简化)
  619. header.clear()
  620. header.string = number_text + original_text
  621. # 设置锚点 id
  622. anchor = header.get('id') or self._slugify(original_text)
  623. header['id'] = anchor
  624. # 收集目录项:层级等于数字段数
  625. toc_level = len(parts)
  626. self.toc_entries.append({'level': toc_level, 'title': number_text + original_text, 'anchor': anchor})
  627. return str(soup)
  628. except Exception:
  629. return html
  630. def _preprocess_markdown(self, md_content: str) -> str:
  631. """预处理Markdown内容"""
  632. if not md_content:
  633. return ""
  634. lines = md_content.split('\n')
  635. processed_lines = []
  636. in_code_block = False
  637. code_block_language = ""
  638. for line in lines:
  639. # 处理代码块
  640. if line.strip().startswith('```'):
  641. if not in_code_block:
  642. # 开始代码块
  643. in_code_block = True
  644. code_block_language = line.strip()[3:].strip()
  645. processed_lines.append(f'```{code_block_language}')
  646. else:
  647. # 结束代码块
  648. in_code_block = False
  649. code_block_language = ""
  650. processed_lines.append('```')
  651. continue
  652. # 在代码块内,保持原样
  653. if in_code_block:
  654. processed_lines.append(line)
  655. continue
  656. # 处理标题
  657. if line.strip().startswith('#'):
  658. # 确保标题前后有空行
  659. if processed_lines and processed_lines[-1].strip():
  660. processed_lines.append('')
  661. processed_lines.append(line)
  662. processed_lines.append('')
  663. continue
  664. # 处理列表项
  665. if line.strip().startswith(('- ', '* ', '+ ')):
  666. # 确保列表前有空行
  667. if processed_lines and processed_lines[-1].strip() and not processed_lines[-1].strip().startswith(('- ', '* ', '+ ')):
  668. processed_lines.append('')
  669. processed_lines.append(line)
  670. continue
  671. # 处理数字列表
  672. if re.match(r'^\s*\d+\.\s+', line):
  673. # 确保列表前有空行
  674. if processed_lines and processed_lines[-1].strip() and not re.match(r'^\s*\d+\.\s+', processed_lines[-1]):
  675. processed_lines.append('')
  676. processed_lines.append(line)
  677. continue
  678. # 处理表格
  679. if '|' in line and line.strip():
  680. # 确保表格前有空行
  681. if processed_lines and processed_lines[-1].strip() and '|' not in processed_lines[-1]:
  682. processed_lines.append('')
  683. processed_lines.append(line)
  684. continue
  685. # 处理空行
  686. if not line.strip():
  687. # 避免连续的空行
  688. if processed_lines and processed_lines[-1].strip():
  689. processed_lines.append('')
  690. continue
  691. # 处理普通段落
  692. processed_lines.append(line)
  693. # 清理末尾的空行
  694. while processed_lines and not processed_lines[-1].strip():
  695. processed_lines.pop()
  696. return '\n'.join(processed_lines)
  697. def _create_full_html(self, title: str, toc_html: str, content_html: str, language: str) -> Path:
  698. """创建完整的HTML文件"""
  699. html_file = self.temp_dir / f"merged_{language}.html"
  700. # 根据语言设置字体
  701. if language == 'zh':
  702. # 扩充中文字体回退链,兼容 Linux CI(Noto/WenQuanYi/Droid)
  703. font_family = (
  704. '"Microsoft YaHei", "SimSun", '
  705. '"Noto Sans CJK SC", "Noto Sans CJK", "Noto Sans SC", '
  706. '"WenQuanYi Zen Hei", "Droid Sans Fallback", sans-serif'
  707. )
  708. lang_attr = "zh-CN"
  709. else:
  710. font_family = '"Arial", "Helvetica", sans-serif'
  711. lang_attr = "en"
  712. cover_subtitle = ('开发文档' if language == 'zh' else 'Documentation')
  713. label_version = ('版本' if language == 'zh' else 'Version')
  714. label_language = ('语言' if language == 'zh' else 'Language')
  715. label_generated = ('生成时间' if language == 'zh' else 'Generated on')
  716. # 获取项目描述
  717. project_description = ''
  718. if language == 'zh':
  719. project_description = self.project_meta.get('description', '')
  720. else:
  721. project_description = self.project_meta.get('description_en', '') or self.project_meta.get('description', '')
  722. # 封面信息(徽章 + 详情行)
  723. badge_lang = ('中文' if language == 'zh' else 'English')
  724. badge_version = (self.project_meta.get('version') or '1.0.0')
  725. badge_date = datetime.now().strftime('%Y年%m月%d日' if language == 'zh' else '%B %d, %Y')
  726. meta_badges_html = (
  727. '<div class="meta-badges">'
  728. f'<span class="pill">Version {badge_version}</span>'
  729. '<span class="sep">|</span>'
  730. f'<span class="pill">{badge_lang}</span>'
  731. '<span class="sep">|</span>'
  732. f'<span class="pill">{badge_date}</span>'
  733. '</div>'
  734. )
  735. detail_items = []
  736. author = (self.project_meta.get('author') or '').strip()
  737. website = (self.project_meta.get('website') or '').strip()
  738. copyright_txt = (self.project_meta.get('copyright') or '').strip()
  739. if author:
  740. detail_items.append(( '作者' if language == 'zh' else 'Author', author ))
  741. if website:
  742. website_display = website.replace('官网: ', '').replace('Website: ', '')
  743. detail_items.append(('', website_display)) # 空键名,不显示标签
  744. if copyright_txt:
  745. copyright_display = copyright_txt.replace('版权: ', '').replace('Copyright: ', '')
  746. detail_items.append(('', copyright_display)) # 空键名,不显示标签
  747. if detail_items:
  748. lines = []
  749. for key, val in detail_items:
  750. if not key: # 如果键名为空,只显示值
  751. if val.startswith(('http://', 'https://')):
  752. lines.append(f'<div class="meta-line"><a href="{val}">{val}</a></div>')
  753. else:
  754. lines.append(f'<div class="meta-line">{val}</div>')
  755. else:
  756. if key.lower().startswith(('官网','website')) and (val.startswith('http://') or val.startswith('https://')):
  757. lines.append(f'<div class="meta-line"><span class="k">{key}:</span> <a href="{val}">{val}</a></div>')
  758. else:
  759. lines.append(f'<div class="meta-line"><span class="k">{key}:</span> {val}</div>')
  760. meta_details_html = '<div class="meta-details">' + ''.join(lines) + '</div>'
  761. else:
  762. meta_details_html = ''
  763. # 生成项目描述HTML
  764. project_description_html = ''
  765. if project_description:
  766. project_description_html = f'<div class="cover-description">{project_description}</div>'
  767. html_template = f'''<!DOCTYPE html>
  768. <html lang="{lang_attr}">
  769. <head>
  770. <meta charset="UTF-8">
  771. <meta name="viewport" content="width=device-width, initial-scale=1.0">
  772. <title>{title}</title>
  773. <style>
  774. @page {{
  775. size: A4;
  776. margin-top: 1.50cm;
  777. margin-bottom: 1.75cm;
  778. margin-left: 1.10cm;
  779. margin-right: 1.10cm;
  780. }}
  781. body {{
  782. font-family: {font_family};
  783. line-height: 1.5;
  784. margin: 0;
  785. padding: 0;
  786. color: #222;
  787. font-size: 12pt;
  788. }}
  789. /* 封面样式 */
  790. .cover-page {{
  791. /* 避免封面后出现空白页 */
  792. page-break-after: avoid;
  793. text-align: center;
  794. padding: 2.2cm 2cm 1.2cm 2cm;
  795. height: auto;
  796. min-height: calc(100vh - 3.0cm);
  797. display: flex;
  798. flex-direction: column;
  799. justify-content: flex-start;
  800. align-items: center;
  801. margin-bottom: 0.8cm;
  802. }}
  803. .cover-title {{
  804. font-size: 2.5em;
  805. font-weight: bold;
  806. color: #2c3e50;
  807. margin-bottom: 1em;
  808. border-bottom: 3px solid #3498db;
  809. padding-bottom: 0.5em;
  810. }}
  811. .cover-subtitle {{
  812. font-size: 1.2em;
  813. color: #7f8c8d;
  814. margin-top: 0.6em;
  815. margin-bottom: 1.6em;
  816. }}
  817. .cover-description {{
  818. font-size: 1em;
  819. color: #5a6c7d;
  820. margin: 0 auto 1.5em auto;
  821. max-width: 80%;
  822. text-align: center;
  823. line-height: 1.6;
  824. font-style: italic;
  825. display: block;
  826. width: 100%;
  827. }}
  828. .cover-footer {{
  829. position: absolute;
  830. left: 0;
  831. right: 0;
  832. bottom: 1.8cm;
  833. text-align: center;
  834. }}
  835. .meta-badges {{
  836. display: inline-flex;
  837. align-items: center;
  838. gap: 10px;
  839. margin-bottom: 0.6em;
  840. }}
  841. .meta-badges .pill {{
  842. display: inline-block;
  843. padding: 4px 10px;
  844. border-radius: 9999px;
  845. font-size: 0.9em;
  846. color: #2c3e50;
  847. background: #f3f6fa;
  848. border: 1px solid #dde5ee;
  849. }}
  850. .meta-badges .sep {{
  851. color: #95a5a6;
  852. font-size: 0.95em;
  853. }}
  854. .meta-details {{
  855. font-size: 0.95em;
  856. color: #6b7280;
  857. }}
  858. .meta-details .meta-line {{ margin: 4px 0; }}
  859. .meta-details .k {{
  860. color: #374151;
  861. font-weight: 600;
  862. }}
  863. .meta-details a {{
  864. color: #2563eb;
  865. text-decoration: none;
  866. }}
  867. .meta-details a:hover {{
  868. text-decoration: underline;
  869. }}
  870. .cover-info {{
  871. font-size: 1em;
  872. color: #95a5a6;
  873. margin-top: auto;
  874. }}
  875. /* 目录样式 */
  876. .toc {{
  877. page-break-after: always;
  878. padding: 1.3cm 2cm 1.6cm 2cm;
  879. }}
  880. .toc h2 {{
  881. font-size: 1.8em;
  882. color: #2c3e50;
  883. text-align: left;
  884. margin: 0 0 0.8em 0;
  885. border-left: 4px solid #3498db;
  886. padding-left: 0.6em;
  887. }}
  888. .toc ul {{
  889. list-style: none;
  890. padding: 0;
  891. margin: 0;
  892. max-width: 16cm;
  893. }}
  894. .toc-category {{
  895. margin: 0.6em 0 0.35em 0;
  896. font-size: 1.04em;
  897. color: #334155;
  898. border-bottom: 1px solid #e2e8f0;
  899. padding-bottom: 0.25em;
  900. letter-spacing: 0.2px;
  901. }}
  902. .toc-item {{
  903. margin: 0.12em 0;
  904. padding: 0.12em 0;
  905. line-height: 1.38;
  906. }}
  907. .toc a {{
  908. text-decoration: none;
  909. color: #2c3e50;
  910. font-size: 0.95em;
  911. display: block;
  912. padding: 0.1em 0;
  913. }}
  914. .toc a {{ display: grid; grid-template-columns: auto 1fr auto; align-items: baseline; column-gap: 8px; }}
  915. .toc a .toc-text {{ white-space: nowrap; overflow: hidden; text-overflow: ellipsis; }}
  916. .toc a .toc-dots {{ border-bottom: 1px dotted #cbd5e1; height: 0; margin-top: 0.55em; }}
  917. .toc a .toc-page {{ color: #475569; font-variant-numeric: tabular-nums; padding-left: 4px; }}
  918. .toc a:hover {{
  919. color: #2563eb;
  920. background: #f8fafc;
  921. padding-left: 0.4em;
  922. transition: all 0.2s ease;
  923. }}
  924. /* 正文样式 */
  925. .content {{
  926. padding: 2cm;
  927. }}
  928. /* 脚注样式 */
  929. .footnote-ref {{
  930. vertical-align: super;
  931. font-size: 0.8em;
  932. }}
  933. .footnotes {{
  934. font-size: 0.9em;
  935. color: #555;
  936. }}
  937. .footnotes hr {{
  938. border: none;
  939. border-top: 1px solid #ddd;
  940. margin: 1em 0;
  941. }}
  942. .footnotes ol {{
  943. padding-left: 1.2em;
  944. }}
  945. .footnotes li {{
  946. margin: 0.4em 0;
  947. }}
  948. .category-title {{
  949. color: #2c3e50;
  950. border-bottom: 2px solid #3498db;
  951. padding-bottom: 10px;
  952. margin-top: 2em;
  953. margin-bottom: 1em;
  954. font-size: 1.8em;
  955. page-break-before: always;
  956. }}
  957. .category-title:first-child {{
  958. page-break-before: auto;
  959. }}
  960. .document-section {{
  961. margin-bottom: 1.2em;
  962. page-break-inside: avoid;
  963. }}
  964. .document-title {{
  965. color: #2c3e50;
  966. border-bottom: 1px solid #bdc3c7;
  967. padding-bottom: 8px;
  968. margin-top: 1.5em;
  969. margin-bottom: 1em;
  970. font-size: 1.4em;
  971. }}
  972. h1, h2, h3, h4, h5, h6 {{
  973. color: #2c3e50;
  974. margin-top: 1.5em;
  975. margin-bottom: 0.8em;
  976. }}
  977. h3 {{
  978. font-size: 1.2em;
  979. color: #7f8c8d;
  980. }}
  981. p {{
  982. margin: 0.6em 0;
  983. text-align: justify;
  984. }}
  985. code {{
  986. background: #f8f9fa;
  987. padding: 2px 6px;
  988. border-radius: 3px;
  989. font-family: "Consolas", "Monaco", monospace;
  990. border: 1px solid #e9ecef;
  991. font-size: 0.9em;
  992. color: #e83e8c;
  993. }}
  994. pre {{
  995. background: #f8f9fa;
  996. padding: 0.8em;
  997. border-radius: 5px;
  998. overflow-x: auto;
  999. margin: 0.6em 0;
  1000. border: 1px solid #e9ecef;
  1001. font-family: "Consolas", "Monaco", monospace;
  1002. font-size: 0.9em;
  1003. line-height: 1.4;
  1004. }}
  1005. pre code {{
  1006. background: none;
  1007. padding: 0;
  1008. border: none;
  1009. color: #333;
  1010. font-size: inherit;
  1011. }}
  1012. ul, ol {{
  1013. margin: 0.6em 0;
  1014. padding-left: 2em;
  1015. }}
  1016. li {{
  1017. margin: 0.5em 0;
  1018. line-height: 1.6;
  1019. }}
  1020. ul ul, ol ol, ul ol, ol ul {{
  1021. margin: 0.5em 0;
  1022. }}
  1023. /* 改进列表样式 */
  1024. ul li {{
  1025. list-style-type: disc;
  1026. }}
  1027. ul ul li {{
  1028. list-style-type: circle;
  1029. }}
  1030. ul ul ul li {{
  1031. list-style-type: square;
  1032. }}
  1033. ol li {{
  1034. list-style-type: decimal;
  1035. }}
  1036. ol ol li {{
  1037. list-style-type: lower-alpha;
  1038. }}
  1039. ol ol ol li {{
  1040. list-style-type: lower-roman;
  1041. }}
  1042. img {{
  1043. max-width: 100%;
  1044. height: auto;
  1045. margin: 1em 0;
  1046. border: 1px solid #ddd;
  1047. border-radius: 3px;
  1048. }}
  1049. table {{
  1050. border-collapse: collapse;
  1051. width: 100%;
  1052. margin: 1.5em 0;
  1053. border: 1px solid #ddd;
  1054. }}
  1055. th, td {{
  1056. border: 1px solid #ddd;
  1057. padding: 0.8em 1em;
  1058. text-align: left;
  1059. }}
  1060. th {{
  1061. background: #f8f9fa;
  1062. font-weight: bold;
  1063. color: #2c3e50;
  1064. }}
  1065. blockquote {{
  1066. border-left: 4px solid #e74c3c;
  1067. margin: 1.5em 0;
  1068. padding: 1em 1.5em;
  1069. background: #fdf2f2;
  1070. font-style: italic;
  1071. }}
  1072. /* 改进表格样式 */
  1073. table {{
  1074. border-collapse: collapse;
  1075. width: 100%;
  1076. margin: 1.5em 0;
  1077. border: 1px solid #ddd;
  1078. font-size: 0.9em;
  1079. }}
  1080. th, td {{
  1081. border: 1px solid #ddd;
  1082. padding: 0.8em 1em;
  1083. text-align: left;
  1084. vertical-align: top;
  1085. }}
  1086. th {{
  1087. background: #f8f9fa;
  1088. font-weight: bold;
  1089. color: #2c3e50;
  1090. }}
  1091. tr:nth-child(even) {{
  1092. background: #f9f9f9;
  1093. }}
  1094. /* 改进链接样式 */
  1095. a {{
  1096. color: #3498db;
  1097. text-decoration: none;
  1098. }}
  1099. a:hover {{
  1100. text-decoration: underline;
  1101. }}
  1102. /* 改进强调样式 */
  1103. strong, b {{
  1104. font-weight: bold;
  1105. color: #2c3e50;
  1106. }}
  1107. em, i {{
  1108. font-style: italic;
  1109. color: #7f8c8d;
  1110. }}
  1111. /* 改进水平线样式 */
  1112. hr {{
  1113. border: none;
  1114. border-top: 2px solid #bdc3c7;
  1115. margin: 2em 0;
  1116. }}
  1117. /* 改进引用块样式 */
  1118. .highlight {{
  1119. background: #f8f9fa;
  1120. border: 1px solid #e9ecef;
  1121. border-radius: 5px;
  1122. padding: 1em;
  1123. margin: 1em 0;
  1124. }}
  1125. /* 改进警告框样式 */
  1126. .admonition {{
  1127. border: 1px solid #ddd;
  1128. border-radius: 5px;
  1129. padding: 1em;
  1130. margin: 1em 0;
  1131. background: #f9f9f9;
  1132. }}
  1133. .admonition-title {{
  1134. font-weight: bold;
  1135. margin-bottom: 0.5em;
  1136. color: #2c3e50;
  1137. }}
  1138. /* 打印样式 */
  1139. @media print {{
  1140. .cover-page {{
  1141. page-break-after: avoid;
  1142. }}
  1143. .toc {{
  1144. page-break-after: always;
  1145. }}
  1146. .category-title {{
  1147. page-break-before: always;
  1148. }}
  1149. .document-section {{
  1150. page-break-inside: avoid;
  1151. }}
  1152. h1, h2, h3 {{
  1153. page-break-after: avoid;
  1154. }}
  1155. body {{
  1156. margin: 0;
  1157. padding: 0;
  1158. }}
  1159. }}
  1160. </style>
  1161. </head>
  1162. <body>
  1163. <!-- 封面页 -->
  1164. <div class="cover-page">
  1165. <div class="cover-title">{title}</div>
  1166. <div class="cover-subtitle">{cover_subtitle}</div>
  1167. <div class="cover-footer">
  1168. {project_description_html}
  1169. {meta_badges_html}
  1170. {meta_details_html}
  1171. </div>
  1172. </div>
  1173. <!-- 目录页 -->
  1174. {toc_html}
  1175. <!-- 正文内容 -->
  1176. <div class="content">
  1177. {content_html}
  1178. </div>
  1179. </body>
  1180. </html>'''
  1181. with open(html_file, 'w', encoding='utf-8') as f:
  1182. f.write(html_template)
  1183. print(f"✓ 已创建完整HTML文件: {html_file}")
  1184. return html_file
  1185. def _load_project_meta(self) -> Dict[str, str]:
  1186. """从 docs/source/config.yaml 读取项目信息(名称、版本、版权等)"""
  1187. meta = {"name": "", "version": "", "copyright": "", "website": "", "description": "", "description_en": ""}
  1188. try:
  1189. cfg_path = Path(__file__).parent / 'config.yaml'
  1190. if cfg_path.exists():
  1191. import yaml
  1192. with open(cfg_path, 'r', encoding='utf-8') as f:
  1193. cfg = yaml.safe_load(f) or {}
  1194. proj = (cfg.get('project') or {})
  1195. meta["name"] = proj.get('name', '')
  1196. meta["version"] = proj.get('version', '')
  1197. meta["website"] = proj.get('website', '')
  1198. meta["description"] = proj.get('description', '')
  1199. meta["description_en"] = proj.get('description_en', '')
  1200. meta["copyright"] = (proj.get('copyright')
  1201. or (cfg.get('project', {}).get('copyright'))
  1202. or '')
  1203. except Exception:
  1204. pass
  1205. return meta
  1206. def _generate_pdf_from_html(self, html_file: Path, title: str, language: str) -> bool:
  1207. """从HTML文件生成PDF"""
  1208. try:
  1209. # 确保输出目录存在
  1210. self.output_dir.mkdir(parents=True, exist_ok=True)
  1211. # 生成PDF文件名
  1212. if language == 'zh':
  1213. pdf_filename = f"{title}.pdf"
  1214. else:
  1215. # 英文版:将空格替换为下划线,再追加 _EN
  1216. safe_title = (title or '').replace(' ', '_')
  1217. pdf_filename = f"{safe_title}_EN.pdf"
  1218. output_pdf = self.output_dir / pdf_filename
  1219. # 尝试使用Chrome/Edge的headless模式生成PDF
  1220. if self._try_chrome_pdf(html_file, output_pdf):
  1221. print(f"✓ PDF生成成功: {output_pdf}")
  1222. return True
  1223. # 回退到手动方式
  1224. print("⚠️ 自动PDF生成失败,回退到手动方式...")
  1225. return self._generate_pdf_manual(html_file, output_pdf)
  1226. except Exception as e:
  1227. print(f"✗ PDF生成失败: {e}")
  1228. return False
  1229. def _try_chrome_pdf(self, html_file: Path, output_pdf: Path) -> bool:
  1230. """尝试使用Chrome生成PDF"""
  1231. try:
  1232. # 优先使用用户提供的浏览器路径
  1233. if getattr(self, 'browser_path', None):
  1234. if os.path.exists(self.browser_path):
  1235. browser_cmd = self.browser_path
  1236. else:
  1237. browser_cmd = None
  1238. else:
  1239. browser_cmd = None
  1240. # 优先使用环境变量指定的浏览器路径
  1241. env_browser = os.environ.get('CHROME_PATH') or os.environ.get('BROWSER')
  1242. browser_paths = [
  1243. r"C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
  1244. r"C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
  1245. r"C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe",
  1246. r"C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe",
  1247. ]
  1248. if env_browser:
  1249. browser_paths.insert(0, env_browser)
  1250. # 也尝试通过 PATH 查找
  1251. try:
  1252. import shutil as _shutil
  1253. for candidate in [
  1254. "chrome.exe", "msedge.exe", "chrome", "msedge",
  1255. "google-chrome", "google-chrome-stable", "chromium", "chromium-browser"
  1256. ]:
  1257. p = _shutil.which(candidate)
  1258. if p and os.path.exists(p):
  1259. browser_paths.insert(0, p)
  1260. break
  1261. except Exception:
  1262. pass
  1263. if not browser_cmd:
  1264. for path in browser_paths:
  1265. if os.path.exists(path):
  1266. browser_cmd = path
  1267. break
  1268. if not browser_cmd:
  1269. return False
  1270. # 构建命令
  1271. cmd = [
  1272. browser_cmd,
  1273. "--headless=new",
  1274. "--disable-gpu",
  1275. "--no-sandbox",
  1276. "--disable-dev-shm-usage",
  1277. "--no-first-run",
  1278. "--disable-extensions",
  1279. "--disable-background-networking",
  1280. "--disable-default-apps",
  1281. "--disable-sync",
  1282. "--disable-translate",
  1283. "--disable-features=PrintingPdfHeaderFooter,TranslateUI",
  1284. f"--print-to-pdf={str(output_pdf.resolve().as_posix())}",
  1285. "--print-to-pdf-no-header",
  1286. "--no-pdf-header-footer",
  1287. "--disable-print-preview",
  1288. "--run-all-compositor-stages-before-draw",
  1289. "--virtual-time-budget=20000",
  1290. f"file:///{html_file.resolve().as_posix()}"
  1291. ]
  1292. # 执行命令(以二进制模式捕获输出,避免控制台编码问题)
  1293. result = subprocess.run(cmd, capture_output=True, text=False, timeout=60)
  1294. ok = result.returncode == 0 and output_pdf.exists()
  1295. if not ok:
  1296. try:
  1297. stderr_text = (result.stderr or b"").decode(errors="ignore")
  1298. stdout_text = (result.stdout or b"").decode(errors="ignore")
  1299. print("Chrome/Edge 无头打印失败: ")
  1300. if stdout_text:
  1301. print(stdout_text[:800])
  1302. if stderr_text:
  1303. print(stderr_text[:800])
  1304. except Exception:
  1305. pass
  1306. return ok
  1307. except Exception as e:
  1308. print(f"Chrome PDF生成失败: {e}")
  1309. return False
  def _generate_pdf_manual(self, html_file: Path, output_pdf: Path) -> bool:
      """Fallback: open the HTML in a browser and let the user print it to PDF.

      Opens ``html_file`` with the system default browser, prints step-by-step
      instructions (including where to save ``output_pdf``) and waits for the
      user to press Enter. In a non-interactive run, where reading stdin
      raises ``EOFError``, the wait is skipped.

      Returns:
          True once the instructions were shown. The existence of
          ``output_pdf`` is not checked. False if any step raises.
      """
      try:
          import webbrowser

          # as_uri() builds a valid, percent-encoded file:// URL on every
          # platform; concatenating "file:///" with a native path produces
          # backslashes on Windows and an extra slash on POSIX.
          url = html_file.resolve().as_uri()
          webbrowser.open(url)

          print("\n" + "="*60)
          print("📄 PDF生成说明:")
          print("1. 浏览器已自动打开HTML页面")
          print("2. 使用浏览器打印(Ctrl+P),目标选择“保存为 PDF”")
          print("3. 重要:取消勾选“页眉和页脚”(Headers and footers)以移除浏览器默认的时间、URL、页码")
          print("4. 如需保持版式,建议边距使用默认或无")
          print(f"5. 保存到: {output_pdf}")
          print("6. 完成后按回车键继续...")
          print("="*60)

          try:
              input("按回车键继续...")
          except EOFError:
              # No stdin in non-interactive environments: just continue.
              pass
          return True
      except Exception as e:
          print(f"手动PDF生成失败: {e}")
          return False
  def main():
      """Command-line entry point: parse options and generate the PDF(s).

      Exits with status 1 when the HTML directory does not exist or any
      requested PDF could not be generated, otherwise with status 0.
      """
      cli = argparse.ArgumentParser(description="增强版PDF生成器V2")
      option_specs = (
          ('--html-dir', dict(type=str, default='source_build/html/latest',
                              help='HTML文件目录 (默认: source_build/html/latest)')),
          ('--output-dir', dict(type=str, default='source_build/html/latest/_static',
                                help='PDF输出目录 (默认: source_build/html/latest/_static)')),
          ('--title', dict(type=str, default='Titan-Board SDK文档',
                           help='PDF标题 (默认: Titan-Board SDK文档)')),
          ('--language', dict(type=str, default='zh', choices=['zh', 'en'],
                              help='文档语言 (默认: zh)')),
          ('--both', dict(action='store_true',
                          help='同时生成中英文版本')),
          ('--keep-temp', dict(action='store_true',
                               help='保留临时目录以便调试(输出merged_*.html路径)')),
          ('--browser', dict(type=str, default='',
                             help='指定 Chrome/Edge 浏览器可执行文件路径')),
      )
      for flag, spec in option_specs:
          cli.add_argument(flag, **spec)
      options = cli.parse_args()

      # Bail out early when there is nothing to convert.
      source_dir = Path(options.html_dir)
      if not source_dir.exists():
          print(f"✗ HTML目录不存在: {source_dir}")
          sys.exit(1)

      generator = PDFGeneratorV2(
          source_dir,
          Path(options.output_dir),
          keep_temp=options.keep_temp,
          browser_path=(options.browser or None),
      )

      # --both builds Chinese first, then English; every requested language
      # is attempted even if an earlier one failed.
      languages = ("zh", "en") if options.both else (options.language,)
      outcomes = [generator.generate_pdf(options.title, lang) for lang in languages]
      sys.exit(0 if all(outcomes) else 1)


  if __name__ == "__main__":
      main()