# md2word/transit/renderer.py

"""
论文生成编排器。

组装 配置 + 解析 + 模板渲染 + 正文注入 的完整流水线。
"""

from collections import defaultdict
from pathlib import Path
from docxtpl import DocxTemplate
from docx import Document

from .config import load_config, ThesisConfig
from .parser import parse_markdown
from .body import body_to_paragraphs, replace_placeholder, link_body_citations
from .references import references_to_paragraphs


# Fields the parser may produce (used for the field-fill report).
# NOTE(review): "abstact_cn_context" is spelled differently from the other
# "abstract_*" keys — presumably it matches the key emitted by the parser and
# used in the template; confirm before renaming.
_PARSER_FIELDS = [
    "title",
    "abstact_cn_context",
    "abstract_cn_keywords",
    "abstract_en_context",
    "abstract_en_keywords",
    "acknowledgement",
    "reference",
    "appendix",
    "body_md",
]


def _print_field_report(report_fields: list[str], values: dict) -> None:
    """Print which report fields are filled and which are missing.

    A field counts as missing when it is absent from *values* or equal to the
    ``"<None>"`` sentinel.
    """
    missing = []
    print("\n--- 字段填充情况 ---")
    for key in report_fields:
        val = values.get(key, "<None>")
        if val == "<None>":
            missing.append(key)
            print(f"  [缺失] {key}")
        else:
            preview = str(val)[:60].replace("\n", " ")
            print(f"  [OK] {key}: {preview}...")

    if missing:
        print("\n[警告] 以下字段缺失，已填充 '<None>'：")
        for key in missing:
            print(f"  - {key}")


def generate_thesis(
    template_path: str | Path,
    data_path: str | Path,
    config_path: str | Path | None = None,
    output_path: str | Path = "output.docx",
) -> dict:
    """Run the full data-to-Word thesis generation pipeline.

    The pipeline loads the configuration, parses the Markdown source, renders
    the docxtpl template with placeholder markers, and then replaces those
    markers with the body and reference paragraphs. Progress messages and a
    field-fill report are printed to stdout.

    Parameters
    ----------
    template_path : str | Path
        Path to the docxtpl template file (.docx).
    data_path : str | Path
        Path to the Markdown thesis source file (.md).
    config_path : str | Path | None
        Path to the TOML configuration file. When ``None``,
        ``thesis_config.toml`` in the working directory and then
        ``data_path`` with a ``.toml`` suffix are tried. If no existing file
        is found, a default ``ThesisConfig()`` is used.
    output_path : str | Path
        Path of the Word file to write.

    Returns
    -------
    dict
        A plain-dict copy of the context passed to the template. Note that
        ``"body_placeholder"`` and ``"reference"`` hold the placeholder
        markers used during rendering, not the parsed text.
    """
    data_path = Path(data_path)

    # 1. Load configuration (auto-discover when no path was given).
    if config_path is None:
        candidates = [
            Path("thesis_config.toml"),
            data_path.with_suffix(".toml"),
        ]
        config_path = next((p for p in candidates if p.exists()), None)

    config: ThesisConfig
    if config_path and Path(config_path).exists():
        config = load_config(config_path)
        print(f"[配置] 配置文件: {config_path}")
    else:
        config = ThesisConfig()
        print("[配置] 未找到配置文件，使用默认值。")

    # 2. Parse the Markdown source.
    md_text = data_path.read_text(encoding="utf-8")
    context = parse_markdown(
        md_text,
        body_start_kw=config.body_start_keywords,
        body_end_kw=config.body_end_keywords,
    )

    # 3. Merge config into the context; config only fills keys the parser did
    #    not produce (setdefault never overrides an existing key).
    # NOTE(review): because setdefault never overrides, a title parsed from
    # Markdown is kept even when config.title_from_md is false — confirm
    # whether the config title should win in that case.
    for key, value in config.to_dict().items():
        if key == "title" and config.title_from_md and context.get("title"):
            continue  # the Markdown title takes precedence
        context.setdefault(key, value)

    # 4. Wrap in a defaultdict so missing keys fall back to "<None>".
    # NOTE(review): the template engine may copy the context into a plain
    # dict, in which case this fallback would not apply to template lookups —
    # confirm against docxtpl/Jinja2 behavior.
    ctx = defaultdict(lambda: "<None>", context)

    # Snapshot the real values before placeholders overwrite "reference", so
    # the field report reflects what was actually parsed/configured.
    report_values = dict(ctx)

    # 5. Convert the body Markdown into a paragraph list.
    body_md = ctx.get("body_md", "")
    body_paragraphs = (
        body_to_paragraphs(
            body_md,
            level_offset=config.level_offset,
            body_style=config.body_style,
            base_dir=data_path.parent,
        )
        if body_md else []
    )

    # 6. Convert the reference text into a paragraph list.
    ref_text = ctx.get("reference", "")
    ref_paragraphs = references_to_paragraphs(ref_text, ref_style=config.reference_style)

    # 7. Placeholders rendered into the template and replaced afterwards.
    ctx["body_placeholder"] = "__CONTEXT_PLACEHOLDER__"
    ctx["reference"] = "__REFERENCE_PLACEHOLDER__"

    # 8. Render the template.
    doc = DocxTemplate(str(template_path))
    doc.render(ctx)

    # 9. Save to a temporary file, then post-process it. The temp name appends
    #    ".tmp" to the output name so it can never coincide with output_path.
    out_path = Path(output_path)
    temp_path = out_path.with_name(out_path.name + ".tmp")
    try:
        doc.save(str(temp_path))

        # 10. Inject the body, link citations, then inject the references.
        final_doc = Document(str(temp_path))
        replace_placeholder(
            final_doc, "__CONTEXT_PLACEHOLDER__", body_paragraphs,
            default_body_style=config.body_style,
        )

        # Turn [N] citations in the body into hyperlinks.
        link_body_citations(final_doc)

        replace_placeholder(
            final_doc, "__REFERENCE_PLACEHOLDER__", ref_paragraphs,
            default_body_style=config.reference_style,
        )
        final_doc.save(str(output_path))
    finally:
        # Always remove the temp file, even when post-processing fails.
        temp_path.unlink(missing_ok=True)

    print(f"[完成] 论文生成完成: {output_path}")

    # 11. Field-fill report over config metadata keys plus parser fields
    #     (dict.fromkeys de-duplicates while preserving order).
    report_fields = list(dict.fromkeys([*config.metadata.keys(), *_PARSER_FIELDS]))
    _print_field_report(report_fields, report_values)

    return dict(ctx)