Compare commits

...

4 Commits

Author SHA1 Message Date
zzy
4e39a4f2ac feat(transit): 添加正文引用标记到书签超链接功能
- 新增 CITE_PATTERN 正则表达式匹配 [N] 引用格式
- 添加 base_dir 参数支持相对图片路径解析
- 实现书签创建和超链接替换功能
- 添加 link_body_citations 函数处理正文引用链接
- 在参考文献段落中添加书签标识
- 支持将 [N] 引用替换为指向参考文献的超链接
2026-05-10 15:07:20 +08:00
zzy
0b10e97e0c feat(transit): 支持Markdown图片转Word图片段落
- 新增 transit/images.py 模块处理图片解析和插入逻辑
- 实现 `<img>` 标签解析为独立图片段落
- 添加图片尺寸检测和自适应缩放功能(超出页面宽度时自动缩放)
- 支持图标题居中显示和图片居中对齐
- 优化 body_to_paragraphs 函数,添加图片处理逻辑
- 更新 README.md 使用说明,添加模板文件要求说明

BREAKING CHANGE: 图片处理方式变更,需要包含 sample.docx 模板文件
2026-05-09 16:17:51 +08:00
zzy
fc6afdea9d refactor(config): 重构配置类以支持动态元数据字段
配置类 ThesisConfig 现在使用 metadata 字典直接透传 TOML 配置,
无需为每个变量单独声明字段。新增模板变量只需修改 TOML 文件,
无需修改 Python 代码。

BREAKING CHANGE: 配置文件结构发生改变,从单独字段改为统一的
metadata 节点。
2026-05-08 23:07:23 +08:00
zzy
74d28ea2d8 feat(transit): 添加参考文献解析功能并修复段落编号继承问题
新增了参考文献处理模块,支持按照 GB 7714 《文后参考文献著录规则》顺序编码制
解析和格式化参考文献。同时修复了段落替换过程中自动编号丢失的问题。

- 新增 transit/references.py 模块,提供参考文献解析和格式化功能
- 在 body.py 的 replace_placeholder 函数中实现段落编号属性的正确继承
- 修改 transit/__init__.py 导入新的参考文献处理函数
- 更新 transit/config.py 添加参考文献样式配置项
- 修改 transit/renderer.py 集成参考文献处理流程
2026-05-08 22:14:51 +08:00
8 changed files with 511 additions and 72 deletions

4
.gitignore vendored
View File

@@ -20,3 +20,7 @@ wheels/
*.docx *.docx
*.doc *.doc
*.txt *.txt
.vscode/
.tmp/
*.toml

View File

@@ -1,4 +1,5 @@
使用方法 使用方法
```shell ```shell
py .\test.py .\毕业论文初稿.md # 需要 sample.docx 文件 且 该文件有 {{xxx}} 模板引擎的内容
python .\test.py .\毕业论文初稿.md
``` ```

View File

@@ -4,6 +4,7 @@ from .parser import parse_markdown
from .body import body_to_paragraphs, replace_placeholder from .body import body_to_paragraphs, replace_placeholder
from .renderer import generate_thesis from .renderer import generate_thesis
from .config import load_config, ThesisConfig from .config import load_config, ThesisConfig
from .references import references_to_paragraphs, format_gb7714
__all__ = [ __all__ = [
"parse_markdown", "parse_markdown",
@@ -12,4 +13,6 @@ __all__ = [
"generate_thesis", "generate_thesis",
"load_config", "load_config",
"ThesisConfig", "ThesisConfig",
"references_to_paragraphs",
"format_gb7714",
] ]

View File

@@ -6,16 +6,26 @@ Markdown 正文 → Word 段落转换。
""" """
import re import re
from copy import deepcopy
from pathlib import Path
from docx import Document from docx import Document
from docx.oxml.ns import qn
from .images import make_image_paragraph, is_figure_caption, insert_image_paragraphs
_PAT_HEADING = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE) _PAT_HEADING = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE)
# Matches in-text citation markers such as [1] or [1,2,3]: one or more integers
# separated by commas and/or whitespace, wrapped in square brackets.
_CITE_PATTERN = re.compile(r"\[(\d+(?:[,\s]*\d+)*)\]")
def body_to_paragraphs( def body_to_paragraphs(
md_text: str, md_text: str,
*, *,
level_offset: int = 0, level_offset: int = 0,
body_style: str = "Body Text Indent", body_style: str = "Body Text Indent",
base_dir: str | Path | None = None,
) -> list[dict]: ) -> list[dict]:
"""将 Markdown 正文按标题和段落拆分为结构化列表。 """将 Markdown 正文按标题和段落拆分为结构化列表。
@@ -27,21 +37,34 @@ def body_to_paragraphs(
标题级别偏移量(正文从 ``##`` 开始时传 ``-1``,使其输出为 ``Heading 1``)。 标题级别偏移量(正文从 ``##`` 开始时传 ``-1``,使其输出为 ``Heading 1``)。
body_style : str body_style : str
正文段落的 Word 样式名。 正文段落的 Word 样式名。
base_dir : str | Path | None
Markdown 文件所在目录,用于解析图片相对路径。
""" """
paragraphs: list[dict] = [] paragraphs: list[dict] = []
last_end = 0 last_end = 0
def _add_block(block: str) -> None:
block = block.strip()
if not block:
return
# 图片段落
img = make_image_paragraph(block, base_dir)
if img:
paragraphs.append(img)
return
# 跳过紧跟在图片后的重复图标题
if paragraphs and paragraphs[-1].get("type") == "image" and is_figure_caption(block):
return
# 普通正文段落
paragraphs.append({"text": block, "level": 0, "style": body_style})
for m in _PAT_HEADING.finditer(md_text): for m in _PAT_HEADING.finditer(md_text):
# 标题前的普通文本 # 标题前的普通文本
if m.start() > last_end: if m.start() > last_end:
pre = md_text[last_end : m.start()].strip() pre = md_text[last_end : m.start()].strip()
if pre: if pre:
for block in re.split(r"\n\s*\n", pre): for block in re.split(r"\n\s*\n", pre):
block = block.strip() _add_block(block)
if block:
paragraphs.append(
{"text": block, "level": 0, "style": body_style}
)
level = len(m.group(1)) + level_offset level = len(m.group(1)) + level_offset
heading_text = m.group(2).strip() heading_text = m.group(2).strip()
@@ -54,11 +77,7 @@ def body_to_paragraphs(
tail = md_text[last_end:].strip() tail = md_text[last_end:].strip()
if tail: if tail:
for block in re.split(r"\n\s*\n", tail): for block in re.split(r"\n\s*\n", tail):
block = block.strip() _add_block(block)
if block:
paragraphs.append(
{"text": block, "level": 0, "style": body_style}
)
return paragraphs return paragraphs
@@ -84,31 +103,164 @@ def replace_placeholder(
placeholder_style = para.style.name if para.style else None placeholder_style = para.style.name if para.style else None
parent = para._element.getparent() parent = para._element.getparent()
idx = list(parent).index(para._element) idx = list(parent).index(para._element)
# 保存原段落的编号属性numPr用于继承自动编号
orig_pPr = para._element.find(qn("w:pPr"))
numPr = orig_pPr.find(qn("w:numPr")) if orig_pPr is not None else None
parent.remove(para._element) parent.remove(para._element)
# 为参考文献段落准备书签 ID
bm_id = _max_bookmark_id(doc) + 1
for pd_data in reversed(paragraphs): for pd_data in reversed(paragraphs):
new_p = doc.add_paragraph(pd_data["text"]) if pd_data.get("type") == "image":
style_name = pd_data["style"] insert_image_paragraphs(
doc, [pd_data], idx=idx, parent=parent
)
else:
new_p = doc.add_paragraph(pd_data["text"])
style_name = pd_data["style"]
# 尝试应用样式,逐步降级 # 尝试应用样式,逐步降级
applied = _apply_style(new_p, doc, style_name) applied = _apply_style(new_p, doc, style_name)
if not applied and style_name.startswith("Heading"): if not applied and style_name.startswith("Heading"):
# 标题样式找不到 new_p.style = doc.styles["Normal"]
new_p.style = doc.styles["Normal"] elif not applied:
elif not applied: if placeholder_style:
# 正文样式找不到 → 尝试占位符自身的样式 _apply_style(new_p, doc, placeholder_style)
if placeholder_style: if new_p.style.name == "Normal" and placeholder_style:
_apply_style(new_p, doc, placeholder_style) new_p.style = doc.styles[placeholder_style]
if new_p.style.name == "Normal" and placeholder_style:
new_p.style = doc.styles[placeholder_style]
parent.insert(idx, new_p._element) # 继承原段落的编号属性(自动编号)
if numPr is not None:
new_pPr = new_p._element.find(qn("w:pPr"))
if new_pPr is None:
new_pPr = new_p._element.makeelement(qn("w:pPr"), {})
new_p._element.insert(0, new_pPr)
existing = new_pPr.find(qn("w:numPr"))
if existing is not None:
new_pPr.remove(existing)
new_pPr.append(deepcopy(numPr))
parent.insert(idx, new_p._element)
# 为参考文献条目添加书签
ref_id = pd_data.get("ref_id")
if ref_id is not None:
_add_bookmark(new_p, f"ref-{ref_id}", bm_id)
bm_id += 1
break break
if not placeholder_found: if not placeholder_found:
print(f"警告:未找到占位符 '{placeholder}',正文段落未注入。") print(f"警告:未找到占位符 '{placeholder}',正文段落未注入。")
def link_body_citations(doc: Document):
    """Turn ``[N]`` citation markers in the document's paragraphs into hyperlinks.

    Every paragraph in ``doc.paragraphs`` is visited. A paragraph that already has a
    ``w:hyperlink`` element as a direct child (for example a table-of-contents entry)
    is left untouched; every other paragraph is handed to ``_link_paragraph``.

    Parameters
    ----------
    doc : Document
        The document whose paragraphs are rewritten in place.
    """
    hyperlink_tag = qn("w:hyperlink")
    for paragraph in doc.paragraphs:
        already_linked = bool(paragraph._element.findall(hyperlink_tag))
        if not already_linked:
            _link_paragraph(paragraph)
def _max_bookmark_id(doc: Document) -> int:
    """Return the largest numeric bookmark ID found in the document body.

    The whole body element is walked, not just the top-level paragraphs, so
    ``w:bookmarkStart`` elements nested in tables or placed directly under the body are
    counted as well. Callers use ``result + 1`` as the next free ID, so missing any
    existing bookmark could produce a duplicate ID.

    Parameters
    ----------
    doc : Document
        The document to scan.

    Returns
    -------
    int
        The highest ``w:id`` value seen, or ``0`` when there is no bookmark with a
        numeric ID.
    """
    id_attr = qn("w:id")
    max_id = 0
    for bm in doc.element.body.iter(qn("w:bookmarkStart")):
        try:
            max_id = max(max_id, int(bm.get(id_attr)))
        except (ValueError, TypeError):
            # Missing or non-numeric ID: it cannot clash with the integers we hand out.
            continue
    return max_id
def _add_bookmark(paragraph, name: str, bm_id: int):
    """Wrap the content of *paragraph* in a bookmark.

    A ``w:bookmarkStart`` carrying ``w:id`` and ``w:name`` is inserted at the front of the
    paragraph element (at index 1 when a ``w:pPr`` child exists, otherwise at index 0),
    and a ``w:bookmarkEnd`` with the same ``w:id`` is appended as the last child.

    Parameters
    ----------
    paragraph
        Paragraph object exposing the underlying XML element as ``_element``.
    name : str
        Bookmark name, used as the hyperlink anchor target.
    bm_id : int
        Bookmark ID written to both the start and end markers.
    """
    p_elem = paragraph._element
    id_text = str(bm_id)

    start_marker = p_elem.makeelement(qn("w:bookmarkStart"), {})
    start_marker.set(qn("w:id"), id_text)
    start_marker.set(qn("w:name"), name)

    end_marker = p_elem.makeelement(qn("w:bookmarkEnd"), {})
    end_marker.set(qn("w:id"), id_text)

    # Index 1 places the start marker right after w:pPr — assumes w:pPr, when present,
    # is the paragraph's first child.
    insert_at = 0 if p_elem.find(qn("w:pPr")) is None else 1
    p_elem.insert(insert_at, start_marker)
    p_elem.append(end_marker)
def _link_paragraph(para):
    """Replace every ``[N]`` marker in one paragraph with a ``w:hyperlink`` element.

    The text of the paragraph's direct ``w:r`` children is concatenated (only the first
    ``w:t`` of each run is read). When the text contains at least one citation marker,
    all of those runs are removed and the paragraph is rebuilt from plain-text runs and
    hyperlinks appended to the end of the paragraph element.

    Notes
    -----
    * Rebuilt text runs all receive a copy of the *first* original run's ``w:rPr``, so
      per-run formatting differences are not preserved.
    * Any run content other than the first ``w:t`` is dropped when the runs are removed.
    * For a grouped marker such as ``[1,2,3]`` the hyperlink targets only the first
      number (anchor ``ref-1``).
    * Paragraphs without runs or without a citation marker are left unchanged.
    """
    p_elem = para._element
    run_elems = list(p_elem.findall(qn("w:r")))
    if not run_elems:
        return

    pieces = []
    for run in run_elems:
        text_elem = run.find(qn("w:t"))
        if text_elem is not None and text_elem.text:
            pieces.append(text_elem.text)
    full_text = "".join(pieces)

    citations = list(_CITE_PATTERN.finditer(full_text))
    if not citations:
        return

    base_rpr = run_elems[0].find(qn("w:rPr"))
    for run in run_elems:
        p_elem.remove(run)

    cursor = 0
    for cite in citations:
        # Plain text between the previous marker and this one.
        if cite.start() > cursor:
            _add_run(p_elem, full_text[cursor : cite.start()], base_rpr)
        first_number = re.findall(r"\d+", cite.group(1))[0]
        _add_hlink(p_elem, f"ref-{first_number}", cite.group())
        cursor = cite.end()

    # Trailing text after the last marker.
    if cursor < len(full_text):
        _add_run(p_elem, full_text[cursor:], base_rpr)
def _add_run(parent, text: str, rPr):
    """Append a new ``w:r`` containing *text* to *parent*.

    Parameters
    ----------
    parent
        XML element (a paragraph element in this module) that receives the run.
    text : str
        Text placed in the run's ``w:t`` element.
    rPr
        Run-properties element to copy into the new run, or ``None`` for no properties.
        A deep copy is used so the source element is never re-parented.
    """
    r = parent.makeelement(qn("w:r"), {})
    if rPr is not None:
        r.append(deepcopy(rPr))
    t = r.makeelement(qn("w:t"), {})
    t.text = text
    # Text split around a citation usually starts or ends with a space ("see " / " for").
    # Without xml:space="preserve" that edge whitespace is not kept by consumers.
    if text != text.strip():
        t.set("{http://www.w3.org/XML/1998/namespace}space", "preserve")
    r.append(t)
    parent.append(r)
def _add_hlink(parent, anchor: str, text: str):
    """Append an internal hyperlink to *parent*.

    The result is a ``w:hyperlink`` whose ``w:anchor`` is *anchor* and which contains a
    single run showing *text*. The run's properties reference the ``Hyperlink``
    character style and set ``w:vertAlign`` to ``superscript``.

    Parameters
    ----------
    parent
        XML element that receives the hyperlink as its last child.
    anchor : str
        Name of the bookmark the link points to.
    text : str
        Visible link text.
    """
    link = parent.makeelement(qn("w:hyperlink"), {})
    link.set(qn("w:anchor"), anchor)

    run = parent.makeelement(qn("w:r"), {})
    props = run.makeelement(qn("w:rPr"), {})

    style_ref = props.makeelement(qn("w:rStyle"), {})
    style_ref.set(qn("w:val"), "Hyperlink")
    props.append(style_ref)

    raise_text = props.makeelement(qn("w:vertAlign"), {})
    raise_text.set(qn("w:val"), "superscript")
    props.append(raise_text)

    run.append(props)

    text_elem = run.makeelement(qn("w:t"), {})
    text_elem.text = text
    run.append(text_elem)

    link.append(run)
    parent.append(link)
def _apply_style(paragraph, doc, style_name: str) -> bool: def _apply_style(paragraph, doc, style_name: str) -> bool:
"""尝试给段落应用样式,成功返回 ``True``。""" """尝试给段落应用样式,成功返回 ``True``。"""
try: try:

View File

@@ -6,17 +6,15 @@ import tomllib
@dataclass @dataclass
class ThesisConfig: class ThesisConfig:
"""论文配置数据(学生信息、元数据等,不包含正文内容)。""" """论文配置数据
student_name: str = "<None>" ``metadata`` 直接透传 TOML 的 ``[metadata]`` 节,不再为每个变量声明字段。
student_id: str = "<None>" 新增模板变量只需改 TOML无需修改 Python。
college: str = "<None>" """
major: str = "<None>"
class_: str = "<None>"
advisor: str = "<None>"
advisor_title: str = "<None>"
title: str = "<None>"
metadata: dict = field(default_factory=dict)
# 以下字段仍有业务逻辑,保留为显式属性
title_from_md: bool = True title_from_md: bool = True
body_start_keywords: list[str] = field(default_factory=lambda: ["绪论", "引言"]) body_start_keywords: list[str] = field(default_factory=lambda: ["绪论", "引言"])
body_end_keywords: list[str] = field( body_end_keywords: list[str] = field(
@@ -24,19 +22,11 @@ class ThesisConfig:
) )
body_style: str = "Body Text Indent" body_style: str = "Body Text Indent"
level_offset: int = -1 level_offset: int = -1
reference_style: str = "列出段落1"
def to_dict(self) -> dict: def to_dict(self) -> dict:
"""转成模板渲染用的扁平字典,排除 options 命名空间""" """透传 metadata模板变量来源"""
return { return self.metadata
"student_name": self.student_name,
"student_id": self.student_id,
"college": self.college,
"major": self.major,
"class": self.class_,
"advisor": self.advisor,
"advisor_title": self.advisor_title,
"title": self.title,
}
def load_config(path: str | Path) -> ThesisConfig: def load_config(path: str | Path) -> ThesisConfig:
@@ -49,14 +39,7 @@ def load_config(path: str | Path) -> ThesisConfig:
opts = raw.get("options", {}) opts = raw.get("options", {})
return ThesisConfig( return ThesisConfig(
student_name=meta.get("student_name", "<None>"), metadata=meta,
student_id=meta.get("student_id", "<None>"),
college=meta.get("college", "<None>"),
major=meta.get("major", "<None>"),
class_=meta.get("class", "<None>"),
advisor=meta.get("advisor", "<None>"),
advisor_title=meta.get("advisor_title", "<None>"),
title=meta.get("title", "<None>"),
title_from_md=opts.get("title_from_md", True), title_from_md=opts.get("title_from_md", True),
body_start_keywords=opts.get("body_start_keywords", ["绪论", "引言"]), body_start_keywords=opts.get("body_start_keywords", ["绪论", "引言"]),
body_end_keywords=opts.get( body_end_keywords=opts.get(
@@ -64,4 +47,5 @@ def load_config(path: str | Path) -> ThesisConfig:
), ),
body_style=opts.get("body_style", "Body Text Indent"), body_style=opts.get("body_style", "Body Text Indent"),
level_offset=opts.get("level_offset", -1), level_offset=opts.get("level_offset", -1),
reference_style=opts.get("reference_style", "列出段落1"),
) )

163
transit/images.py Normal file
View File

@@ -0,0 +1,163 @@
"""
图片处理模块。
将 Markdown 中的 ``<img>`` 标签解析为独立段落,
并在 Word 文档中插入图片及居中图标题。
"""
import struct
import re
from pathlib import Path
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
# 匹配 <img src="..." alt="...">
_IMG_TAG = re.compile(
r'<img\s+([^>]*?)src=["\']([^"\']+)["\']([^>]*?)>', re.IGNORECASE
)
# 从标签属性块中提取名值对
_ATTR = re.compile(r'(\w+)\s*=\s*["\']([^"\']*)["\']')
# 匹配 **图X ...** 重复图标题(注入时跳过)
_FIG_CAPTION = re.compile(
r'^\*\*(?:图|Table|Figure|Fig\.?)\s*\d+.*\*\*$', re.IGNORECASE
)
def make_image_paragraph(block: str, base_dir: str | Path | None = None) -> dict | None:
"""若 *block* 包含 ``<img>`` 标签,返回图片段落字典;否则返回 ``None``。
Parameters
----------
block : str
文本块。
base_dir : str | Path | None
Markdown 文件所在目录,用于解析图片相对路径。
"""
m = _IMG_TAG.search(block)
if not m:
return None
attrs = dict(_ATTR.findall(block))
src = attrs.get("src", m.group(2))
if base_dir and not Path(src).is_absolute():
src = str(Path(base_dir) / src)
return {
"type": "image",
"src": src,
"alt": attrs.get("alt", ""),
}
def is_figure_caption(block: str) -> bool:
    """Tell whether *block*, after stripping surrounding whitespace, matches ``_FIG_CAPTION``.

    That pattern describes a bold caption line such as ``**图1 ...**``.
    """
    candidate = block.strip()
    return _FIG_CAPTION.match(candidate) is not None
def _get_image_dimensions(image_path: str) -> tuple[int, int] | None:
"""读取图片文件头返回 ``(width_px, height_px)``,不支持格式返回 ``None``。
仅读取文件头,不依赖第三方库。支持 PNG / JPEG / GIF / BMP。
"""
try:
with open(image_path, "rb") as f:
header = f.read(32)
except Exception:
return None
# PNG: 8-byte signature, then IHDR chunk
if header[:8] == b"\x89PNG\r\n\x1a\n":
w, h = struct.unpack_from(">II", header, 16)
return w, h
# JPEG: starts with FF D8, scan for SOF marker
if header[:2] == b"\xff\xd8":
pos = 2
while pos < len(header):
if header[pos] != 0xFF:
return None
marker = header[pos + 1]
if marker in (0xC0, 0xC1, 0xC2):
h, w = struct.unpack_from(">HH", header, pos + 5)
return w, h
seg_len = struct.unpack_from(">H", header, pos + 2)[0]
pos += 2 + seg_len
return None
# GIF: "GIF87a" or "GIF89a"
if header[:6] in (b"GIF87a", b"GIF89a"):
w, h = struct.unpack_from("<HH", header, 6)
return w, h
# BMP: "BM" signature
if header[:2] == b"BM":
w, h = struct.unpack_from("<ii", header, 18)
return w, abs(h)
return None
def _get_native_emu(image_path: str) -> int | None:
    """Return the image's pixel width converted to EMU, or ``None`` if the size is unknown.

    The conversion uses a fixed 12700 EMU per pixel (914400 EMU per inch / 72), i.e. it
    treats every image as 72 DPI regardless of any resolution stored in the file.
    """
    size = _get_image_dimensions(image_path)
    return None if size is None else size[0] * 12700
def _constrain_width(image_path: str, page_text_width: int) -> int | None:
    """Return the width (EMU) to render the image at, capped at the page text width.

    Parameters
    ----------
    image_path : str
        Path of the image file.
    page_text_width : int
        Width of the page's text area in EMU (page width minus left and right margins).

    Returns
    -------
    int | None
        The image's native width when it fits, *page_text_width* when it is wider, or
        ``None`` when the native width cannot be determined.
    """
    native_width = _get_native_emu(image_path)
    if native_width is not None:
        return page_text_width if native_width > page_text_width else native_width
    return None
def insert_image_paragraphs(
    doc: Document,
    paragraphs: list[dict],
    *,
    idx: int,
    parent,
):
    """Insert image paragraphs into *doc* at a given position.

    Each entry of *paragraphs* produces two Word paragraphs:

    1. a centred picture (shrunk to the page text width when wider, otherwise inserted
       at the width computed by ``_constrain_width``);
    2. a centred caption taken from the entry's ``alt`` value.

    The entries end up in their original order, each picture directly above its caption.
    When a picture cannot be added, a warning is printed and a textual placeholder is
    written into the picture paragraph instead, so document generation continues.

    Parameters
    ----------
    doc : Document
        Document used to create the paragraphs; the first section supplies the page
        width and margins.
    paragraphs : list[dict]
        Image entries with a ``src`` key and an optional ``alt`` key.
    idx : int
        Child index in *parent* at which the new elements are inserted.
    parent
        XML element that receives the new paragraph elements.
    """
    section = doc.sections[0]
    page_text_width = section.page_width - section.left_margin - section.right_margin

    # Everything is inserted at the same index, so iterate backwards and add each
    # caption before its picture: later insertions push earlier ones down.
    for pd_data in reversed(paragraphs):
        cap_p = doc.add_paragraph(pd_data.get("alt", ""))
        cap_p.alignment = WD_ALIGN_PARAGRAPH.CENTER
        parent.insert(idx, cap_p._element)

        img_p = doc.add_paragraph()
        img_p.alignment = WD_ALIGN_PARAGRAPH.CENTER
        img_run = img_p.add_run()
        try:
            img_width = _constrain_width(pd_data["src"], page_text_width)
            img_run.add_picture(pd_data["src"], width=img_width)
        except Exception as exc:
            # Deliberately broad: one unreadable image must not abort the whole document.
            # The path and the error text are separated so the message stays readable.
            print(f"警告:图片加载失败 {pd_data['src']}: {exc}")
            img_run.add_text(f"[图片加载失败: {pd_data['src']}]")
        parent.insert(idx, img_p._element)

118
transit/references.py Normal file
View File

@@ -0,0 +1,118 @@
"""
GB 7714 《文后参考文献著录规则》顺序编码制 — 参考文献解析与格式化。
将 Markdown 中 ``[N] ... [TYPE] ...`` 格式的参考文献逐条解析,
按文献类型(书 M、期刊 J、会议 C、学位论文 D、标准 S、电子资源 EB/OL 等)
重新编排为符合 GB 7714 规范的格式,并输出为独立段落。
"""
import re
from typing import Optional
# A reference line: starts with [N], followed by the rest of the entry.
_RE_LINE = re.compile(r"^\[(\d+)\]\s*(.*)$")

# Document-type marker such as [M] [J] [C] [D] [S] [EB/OL] [P] [N]: one or two ASCII
# letters with an optional two-letter carrier code. Restricting the marker to ASCII
# letters keeps bracketed text such as a nationality prefix ("[美]") or a year
# ("[2020]") from being mistaken for the document type.
_RE_TYPE = re.compile(r"\[([A-Za-z]{1,2}(?:/[A-Za-z]{2})?)\]")

# Human-readable labels for GB 7714 document types (informational only; formatting
# concatenates the parsed pieces directly).
TYPE_LABELS: dict[str, str] = {
    "M": "专著",
    "J": "期刊文章",
    "C": "会议论文",
    "D": "学位论文",
    "S": "标准",
    "EB/OL": "电子资源",
    "P": "专利",
    "N": "报纸文章",
}


def parse_reference_line(line: str) -> Optional[dict]:
    """Parse one reference line into its number, pre-type text, type and source text.

    Expected format::

        [N] Authors. Title[TYPE]. Source info.

    Parameters
    ----------
    line : str
        A single line of the reference list.

    Returns
    -------
    Optional[dict]
        ``{"number": int, "before_type": str, "doc_type": str, "after_type": str}``,
        or ``None`` when the line does not start with ``[N]`` or has no document-type
        marker.
    """
    line = line.strip()
    m = _RE_LINE.match(line)
    if not m:
        return None

    number = int(m.group(1))
    rest = m.group(2).strip()

    # The first document-type marker splits the entry into "Authors. Title" and source.
    tm = _RE_TYPE.search(rest)
    if not tm:
        return None

    before_type = rest[: tm.start()].strip().rstrip(".")
    doc_type = tm.group(1)
    after_type = rest[tm.end() :].strip().lstrip(".").strip()

    return {
        "number": number,
        "before_type": before_type,  # "Authors. Title"
        "doc_type": doc_type,  # "M", "J", "EB/OL", ...
        "after_type": after_type,  # source information
    }
def _normalize_period(text: str) -> str:
    """Strip trailing whitespace and make non-empty text end with an ASCII period.

    Text that is empty after stripping is returned as an empty string; text already
    ending in ``.`` is returned without an extra period.
    """
    trimmed = text.rstrip()
    if not trimmed or trimmed.endswith("."):
        return trimmed
    return trimmed + "."
def format_gb7714(ref: dict) -> str:
    """Re-assemble a parsed reference as ``Authors. Title[TYPE]. Source.``.

    The numeric prefix is not included in the output. The result is passed through
    ``_normalize_period`` before being returned.

    Parameters
    ----------
    ref : dict
        Mapping with ``before_type``, ``doc_type`` and ``after_type`` keys, as produced
        by ``parse_reference_line``.
    """
    head = ref["before_type"]
    kind = ref["doc_type"]
    tail = ref["after_type"]
    return _normalize_period("{}[{}]. {}".format(head, kind, tail))
def references_to_paragraphs(
    ref_text: str,
    ref_style: str = "列出段落1",
) -> list[dict]:
    """Convert raw reference text into a list of formatted paragraph dicts.

    Each non-blank line becomes one paragraph::

        {"text": str, "level": 0, "style": ref_style, "ref_id": int | None}

    Lines that ``parse_reference_line`` understands are re-formatted with
    ``format_gb7714``. Other lines have a leading ``[N]`` prefix removed and a final
    period enforced. In both cases ``ref_id`` is the number from the ``[N]`` prefix, so
    an entry stays addressable even when its document type could not be parsed;
    ``ref_id`` is ``None`` only for lines without such a prefix.

    Parameters
    ----------
    ref_text : str
        The reference section as plain text, one entry per line.
    ref_style : str
        Word style name assigned to every produced paragraph.

    Returns
    -------
    list[dict]
        The paragraph dicts. For empty input or the literal ``"<None>"`` a single
        placeholder paragraph (without a ``ref_id`` key) is returned.
    """
    if not ref_text or ref_text == "<None>":
        return [{"text": "<None>", "level": 0, "style": ref_style}]

    paragraphs: list[dict] = []
    for raw_line in ref_text.strip().split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        ref = parse_reference_line(line)
        if ref:
            formatted = format_gb7714(ref)
            ref_id = ref["number"]
        else:
            # Not fully parseable: drop the [N] prefix but keep its number as the ID.
            prefix = re.match(r"^\[(\d+)\]\s*", line)
            ref_id = int(prefix.group(1)) if prefix else None
            formatted = _normalize_period(line[prefix.end() :] if prefix else line)

        paragraphs.append(
            {"text": formatted, "level": 0, "style": ref_style, "ref_id": ref_id}
        )

    return paragraphs

View File

@@ -11,10 +11,12 @@ from docx import Document
from .config import load_config, ThesisConfig from .config import load_config, ThesisConfig
from .parser import parse_markdown from .parser import parse_markdown
from .body import body_to_paragraphs, replace_placeholder from .body import body_to_paragraphs, replace_placeholder, link_body_citations
from .references import references_to_paragraphs
_TEXT_FIELDS = [ # 解析器可能产生的字段(用于填充报告)
_PARSER_FIELDS = [
"title", "title",
"abstact_cn_context", "abstact_cn_context",
"abstract_cn_keywords", "abstract_cn_keywords",
@@ -23,13 +25,7 @@ _TEXT_FIELDS = [
"acknowledgement", "acknowledgement",
"reference", "reference",
"appendix", "appendix",
"student_name", "body_md",
"student_id",
"college",
"major",
"class",
"advisor",
"advisor_title",
] ]
@@ -80,12 +76,11 @@ def generate_thesis(
body_end_kw=config.body_end_keywords, body_end_kw=config.body_end_keywords,
) )
# 3. 合并配置 → 上下文(配置优先 # 3. 合并配置 → 上下文(配置填充解析器未产生的空白
for k, v in config.to_dict().items(): for k, v in config.to_dict().items():
if k == "title" and config.title_from_md and context.get("title"): if k == "title" and config.title_from_md and context.get("title"):
continue # 以 markdown 标题为准 continue # 以 markdown 标题为准
if v != "<None>": context.setdefault(k, v)
context[k] = v
# 4. 用 defaultdict 兜底缺失键 # 4. 用 defaultdict 兜底缺失键
ctx = defaultdict(lambda: "<None>", context) ctx = defaultdict(lambda: "<None>", context)
@@ -93,12 +88,22 @@ def generate_thesis(
# 5. 解析正文为段落列表 # 5. 解析正文为段落列表
body_md = ctx.get("body_md", "") body_md = ctx.get("body_md", "")
body_paragraphs = ( body_paragraphs = (
body_to_paragraphs(body_md, level_offset=config.level_offset, body_style=config.body_style) body_to_paragraphs(
body_md,
level_offset=config.level_offset,
body_style=config.body_style,
base_dir=data_path.parent,
)
if body_md else [] if body_md else []
) )
# 6. 占位符 # 6. 解析参考文献为段落列表
ref_text = ctx.get("reference", "")
ref_paragraphs = references_to_paragraphs(ref_text, ref_style=config.reference_style)
# 7. 占位符(替代模板变量,后处理时替换)
ctx["body_placeholder"] = "__CONTEXT_PLACEHOLDER__" ctx["body_placeholder"] = "__CONTEXT_PLACEHOLDER__"
ctx["reference"] = "__REFERENCE_PLACEHOLDER__"
# 7. 渲染模板 # 7. 渲染模板
doc = DocxTemplate(str(template_path)) doc = DocxTemplate(str(template_path))
@@ -108,28 +113,37 @@ def generate_thesis(
temp_path = Path(output_path).with_suffix(".tmp") temp_path = Path(output_path).with_suffix(".tmp")
doc.save(str(temp_path)) doc.save(str(temp_path))
# 9. 正文注入 # 9. 正文注入+参考文献注入
final_doc = Document(str(temp_path)) final_doc = Document(str(temp_path))
replace_placeholder( replace_placeholder(
final_doc, "__CONTEXT_PLACEHOLDER__", body_paragraphs, final_doc, "__CONTEXT_PLACEHOLDER__", body_paragraphs,
default_body_style=config.body_style, default_body_style=config.body_style,
) )
# 将正文中的 [N] 引用替换为超链接
link_body_citations(final_doc)
replace_placeholder(
final_doc, "__REFERENCE_PLACEHOLDER__", ref_paragraphs,
default_body_style=config.reference_style,
)
final_doc.save(str(output_path)) final_doc.save(str(output_path))
temp_path.unlink(missing_ok=True) temp_path.unlink(missing_ok=True)
print(f"[完成] 论文生成完成: {output_path}") print(f"[完成] 论文生成完成: {output_path}")
# 10. 字段填充报告 # 10. 字段填充报告(动态收集所有模板与解析字段)
report_fields = list(dict.fromkeys([*config.metadata.keys(), *_PARSER_FIELDS]))
print("\n--- 字段填充情况 ---") print("\n--- 字段填充情况 ---")
for key in _TEXT_FIELDS: for key in report_fields:
val = ctx[key] val = ctx.get(key, "<None>")
if val == "<None>": if val == "<None>":
print(f" [缺失] {key}") print(f" [缺失] {key}")
else: else:
preview = str(val)[:60].replace("\n", " ") preview = str(val)[:60].replace("\n", " ")
print(f" [OK] {key}: {preview}...") print(f" [OK] {key}: {preview}...")
missing = [k for k in _TEXT_FIELDS if ctx[k] == "<None>"] missing = [k for k in report_fields if ctx.get(k, "<None>") == "<None>"]
if missing: if missing:
print("\n[警告] 以下字段缺失,已填充 '<None>'") print("\n[警告] 以下字段缺失,已填充 '<None>'")
for f in missing: for f in missing: