Files
md2word/transit/renderer.py
zzy 4e39a4f2ac feat(transit): 添加正文引用标记到书签超链接功能
- 新增 CITE_PATTERN 正则表达式匹配 [N] 引用格式
- 添加 base_dir 参数支持相对图片路径解析
- 实现书签创建和超链接替换功能
- 添加 link_body_citations 函数处理正文引用链接
- 在参考文献段落中添加书签标识
- 支持将 [N] 引用替换为指向参考文献的超链接
2026-05-10 15:07:20 +08:00

153 lines
4.6 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
论文生成编排器。
组装 配置 + 解析 + 模板渲染 + 正文注入 的完整流水线。
"""
from collections import defaultdict
from pathlib import Path
from docxtpl import DocxTemplate
from docx import Document
from .config import load_config, ThesisConfig
from .parser import parse_markdown
from .body import body_to_paragraphs, replace_placeholder, link_body_citations
from .references import references_to_paragraphs
# 解析器可能产生的字段(用于填充报告)
_PARSER_FIELDS = [
"title",
"abstact_cn_context",
"abstract_cn_keywords",
"abstract_en_context",
"abstract_en_keywords",
"acknowledgement",
"reference",
"appendix",
"body_md",
]
def _load_thesis_config(config_path, data_path) -> "ThesisConfig":
    """Load the thesis configuration, falling back to defaults when none is found."""
    if config_path is None:
        # Auto-discovery: a project-wide file in the working directory first,
        # then a TOML file sitting next to the Markdown source.
        candidates = [
            Path("thesis_config.toml"),
            data_path.with_suffix(".toml"),
        ]
        config_path = next((p for p in candidates if p.exists()), None)

    if config_path and Path(config_path).exists():
        config = load_config(config_path)
        print(f"[配置] 配置文件: {config_path}")
        return config

    config = ThesisConfig()
    print("[配置] 未找到配置文件,使用默认值。")
    return config


def _print_field_report(values, report_fields) -> None:
    """Print, for every key in *report_fields*, whether *values* holds a value for it."""
    print("\n--- 字段填充情况 ---")
    missing = []
    for key in report_fields:
        val = values.get(key, "<None>")
        if val == "<None>":
            print(f" [缺失] {key}")
            missing.append(key)
        else:
            preview = str(val)[:60].replace("\n", " ")
            print(f" [OK] {key}: {preview}...")

    if missing:
        print("\n[警告] 以下字段缺失,已填充 '<None>'")
        for name in missing:
            print(f" - {name}")


def generate_thesis(
    template_path: str | Path,
    data_path: str | Path,
    config_path: str | Path | None = None,
    output_path: str | Path = "output.docx",
) -> dict:
    """Run the full Markdown-to-Word thesis generation pipeline.

    The pipeline loads the configuration, parses the Markdown source, renders
    the docxtpl template with placeholder markers for the body and the
    reference list, then replaces those markers with the generated paragraphs
    and links in-text citations.

    Parameters
    ----------
    template_path : str | Path
        Path to the docxtpl template file (.docx).
    data_path : str | Path
        Path to the Markdown thesis source (.md).
    config_path : str | Path | None
        Path to the TOML configuration file. When ``None``,
        ``thesis_config.toml`` in the working directory and then a ``.toml``
        file next to *data_path* are tried; defaults are used if neither exists.
    output_path : str | Path
        Path of the Word file to write.

    Returns
    -------
    dict
        The context used to render the template, as a plain ``dict``. Its
        ``"reference"`` and ``"body_placeholder"`` entries hold the placeholder
        markers, not the original text.
    """
    data_path = Path(data_path)

    # 1. Load the configuration.
    config = _load_thesis_config(config_path, data_path)

    # 2. Parse the Markdown source.
    md_text = data_path.read_text(encoding="utf-8")
    context = parse_markdown(
        md_text,
        body_start_kw=config.body_start_keywords,
        body_end_kw=config.body_end_keywords,
    )

    # 3. Merge configuration into the context; configuration only fills keys
    #    the parser did not produce.
    for k, v in config.to_dict().items():
        if k == "title" and config.title_from_md and context.get("title"):
            continue  # the Markdown title takes precedence
        context.setdefault(k, v)

    # 4. Fall back to "<None>" for keys that are looked up but missing.
    # NOTE(review): Jinja2's Template.render appears to copy its context into a
    # plain dict, so this fallback may not reach the template itself — confirm.
    ctx = defaultdict(lambda: "<None>", context)

    # 5. Convert the body Markdown into a paragraph list.
    body_md = ctx.get("body_md", "")
    body_paragraphs = (
        body_to_paragraphs(
            body_md,
            level_offset=config.level_offset,
            body_style=config.body_style,
            base_dir=data_path.parent,
        )
        if body_md
        else []
    )

    # 6. Convert the reference text into a paragraph list.
    ref_text = ctx.get("reference", "")
    ref_paragraphs = references_to_paragraphs(ref_text, ref_style=config.reference_style)

    # 7. Swap in placeholder markers that are replaced during post-processing.
    #    Remember the real reference value first so the report below describes
    #    the parsed data rather than the marker.
    original_reference = ctx.get("reference", "<None>")
    ctx["body_placeholder"] = "__CONTEXT_PLACEHOLDER__"
    ctx["reference"] = "__REFERENCE_PLACEHOLDER__"

    # 8. Render the template.
    doc = DocxTemplate(str(template_path))
    doc.render(ctx)

    # 9. Save to a temporary file, reopen it and inject body and references.
    #    Appending ".tmp" to the full file name keeps the temporary path
    #    distinct from the output path even when the output ends in ".tmp".
    out_path = Path(output_path)
    temp_path = out_path.with_name(out_path.name + ".tmp")
    try:
        doc.save(str(temp_path))
        final_doc = Document(str(temp_path))
        replace_placeholder(
            final_doc, "__CONTEXT_PLACEHOLDER__", body_paragraphs,
            default_body_style=config.body_style,
        )
        # Turn in-text [N] citations into hyperlinks.
        link_body_citations(final_doc)
        replace_placeholder(
            final_doc, "__REFERENCE_PLACEHOLDER__", ref_paragraphs,
            default_body_style=config.reference_style,
        )
        final_doc.save(str(output_path))
    finally:
        # Remove the temporary file even when a step above raised.
        temp_path.unlink(missing_ok=True)
    print(f"[完成] 论文生成完成: {output_path}")

    # 10. Field-fill report over the configured metadata keys plus parser keys.
    report_fields = list(dict.fromkeys([*config.metadata.keys(), *_PARSER_FIELDS]))
    report_values = dict(ctx)
    report_values["reference"] = original_reference
    _print_field_report(report_values, report_fields)

    return dict(ctx)