""" 论文生成编排器。 组装 配置 + 解析 + 模板渲染 + 正文注入 的完整流水线。 """ from collections import defaultdict from pathlib import Path from docxtpl import DocxTemplate from docx import Document from .config import load_config, ThesisConfig from .parser import parse_markdown from .body import body_to_paragraphs, replace_placeholder, link_body_citations from .references import references_to_paragraphs # 解析器可能产生的字段(用于填充报告) _PARSER_FIELDS = [ "title", "abstact_cn_context", "abstract_cn_keywords", "abstract_en_context", "abstract_en_keywords", "acknowledgement", "reference", "appendix", "body_md", ] def generate_thesis( template_path: str | Path, data_path: str | Path, config_path: str | Path | None = None, output_path: str | Path = "output.docx", ) -> dict: """执行从数据到 Word 的完整论文生成流程。 Parameters ---------- template_path : str | Path docxtpl 模板文件路径(.docx)。 data_path : str | Path Markdown 论文正文文件路径(.md)。 config_path : str | Path | None TOML 配置文件路径。为 ``None`` 时尝试自动查找。 output_path : str | Path 输出 Word 文件路径。 """ data_path = Path(data_path) # 1. 加载配置 if config_path is None: candidates = [ Path("thesis_config.toml"), data_path.with_suffix(".toml"), ] config_path = next((p for p in candidates if p.exists()), None) config: ThesisConfig | None = None if config_path and Path(config_path).exists(): config = load_config(config_path) print(f"[配置] 配置文件: {config_path}") else: config = ThesisConfig() print("[配置] 未找到配置文件,使用默认值。") # 2. 解析 Markdown with open(data_path, "r", encoding="utf-8") as f: md_text = f.read() context = parse_markdown( md_text, body_start_kw=config.body_start_keywords, body_end_kw=config.body_end_keywords, ) # 3. 合并配置 → 上下文(配置填充解析器未产生的空白) for k, v in config.to_dict().items(): if k == "title" and config.title_from_md and context.get("title"): continue # 以 markdown 标题为准 context.setdefault(k, v) # 4. 用 defaultdict 兜底缺失键 ctx = defaultdict(lambda: "", context) # 5. 解析正文为段落列表 body_md = ctx.get("body_md", "") body_paragraphs = ( body_to_paragraphs( body_md, level_offset=config.level_offset, body_style=config.body_style, base_dir=data_path.parent, ) if body_md else [] ) # 6. 解析参考文献为段落列表 ref_text = ctx.get("reference", "") ref_paragraphs = references_to_paragraphs(ref_text, ref_style=config.reference_style) # 7. 占位符(替代模板变量,后处理时替换) ctx["body_placeholder"] = "__CONTEXT_PLACEHOLDER__" ctx["reference"] = "__REFERENCE_PLACEHOLDER__" # 7. 渲染模板 doc = DocxTemplate(str(template_path)) doc.render(ctx) # 8. 保存临时文件,再做后处理 temp_path = Path(output_path).with_suffix(".tmp") doc.save(str(temp_path)) # 9. 正文注入+参考文献注入 final_doc = Document(str(temp_path)) replace_placeholder( final_doc, "__CONTEXT_PLACEHOLDER__", body_paragraphs, default_body_style=config.body_style, ) # 将正文中的 [N] 引用替换为超链接 link_body_citations(final_doc) replace_placeholder( final_doc, "__REFERENCE_PLACEHOLDER__", ref_paragraphs, default_body_style=config.reference_style, ) final_doc.save(str(output_path)) temp_path.unlink(missing_ok=True) print(f"[完成] 论文生成完成: {output_path}") # 10. 字段填充报告(动态收集所有模板与解析字段) report_fields = list(dict.fromkeys([*config.metadata.keys(), *_PARSER_FIELDS])) print("\n--- 字段填充情况 ---") for key in report_fields: val = ctx.get(key, "") if val == "": print(f" [缺失] {key}") else: preview = str(val)[:60].replace("\n", " ") print(f" [OK] {key}: {preview}...") missing = [k for k in report_fields if ctx.get(k, "") == ""] if missing: print("\n[警告] 以下字段缺失,已填充 '':") for f in missing: print(f" - {f}") return dict(ctx)