Files
md2word/transit/body.py
zzy 74d28ea2d8 feat(transit): 添加参考文献解析功能并修复段落编号继承问题
新增了参考文献处理模块,支持按照 GB 7714 《文后参考文献著录规则》顺序编码制
解析和格式化参考文献。同时修复了段落替换过程中自动编号丢失的问题。

- 新增 transit/references.py 模块,提供参考文献解析和格式化功能
- 在 body.py 的 replace_placeholder 函数中实现段落编号属性的正确继承
- 修改 transit/__init__.py 导入新的参考文献处理函数
- 更新 transit/config.py 添加参考文献样式配置项
- 修改 transit/renderer.py 集成参考文献处理流程
2026-05-08 22:14:51 +08:00

141 lines
4.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Markdown 正文 → Word 段落转换。
将正文 Markdown 按标题层级拆分为带样式的段落序列,
再注入到渲染后 docx 文档的占位符位置。
"""
import re
from copy import deepcopy
from docx import Document
from docx.oxml.ns import qn
# ATX heading: 1-6 '#' characters, horizontal whitespace, then non-empty text.
# ``[ \t]+`` (rather than ``\s+``) keeps the separator on a single line, so a
# bare ``#`` line can never absorb the following line as its heading text.
_PAT_HEADING = re.compile(r"^(#{1,6})[ \t]+(\S.*)$", re.MULTILINE)

# One or more blank (whitespace-only) lines separating body paragraphs.
_PAT_BLANK_LINES = re.compile(r"\n\s*\n")

# Heading levels are clamped to this range so the generated style name is
# always of the form "Heading 1" .. "Heading 9".
_MIN_HEADING_LEVEL = 1
_MAX_HEADING_LEVEL = 9


def _body_blocks(text: str, body_style: str) -> list[dict]:
    """Split *text* on blank lines into body-paragraph records (level 0)."""
    return [
        {"text": block, "level": 0, "style": body_style}
        for block in (raw.strip() for raw in _PAT_BLANK_LINES.split(text.strip()))
        if block
    ]


def body_to_paragraphs(
    md_text: str,
    *,
    level_offset: int = 0,
    body_style: str = "Body Text Indent",
) -> list[dict]:
    """Split Markdown body text into a flat list of heading/paragraph records.

    Parameters
    ----------
    md_text : str
        The Markdown body text.
    level_offset : int
        Offset added to the number of ``#`` characters of each heading
        (pass ``-1`` when the body starts at ``##`` so that it is emitted as
        ``Heading 1``).  The resulting level is clamped to 1..9, so an offset
        can never produce ``Heading 0`` or collide with the body marker
        ``level == 0``.
    body_style : str
        Word style name recorded for non-heading paragraphs.

    Returns
    -------
    list[dict]
        Records in document order, each with the keys ``text``, ``level``
        (``0`` for body paragraphs, 1..9 for headings) and ``style``
        (``body_style`` or ``"Heading <level>"``).
    """
    paragraphs: list[dict] = []
    last_end = 0

    for m in _PAT_HEADING.finditer(md_text):
        # Body text located between the previous heading and this one.
        paragraphs.extend(_body_blocks(md_text[last_end : m.start()], body_style))

        level = len(m.group(1)) + level_offset
        level = max(_MIN_HEADING_LEVEL, min(_MAX_HEADING_LEVEL, level))
        paragraphs.append(
            {"text": m.group(2).strip(), "level": level, "style": f"Heading {level}"}
        )
        last_end = m.end()

    # Trailing text after the last heading (or the whole input when there
    # are no headings at all).
    paragraphs.extend(_body_blocks(md_text[last_end:], body_style))
    return paragraphs
def replace_placeholder(
    doc: Document,
    placeholder: str,
    paragraphs: list[dict],
    *,
    default_body_style: str | None = None,
):
    """Replace the first paragraph containing *placeholder* with *paragraphs*.

    Iterates ``doc.paragraphs``, and for the first paragraph whose text
    contains *placeholder*, removes it and inserts one new paragraph per
    entry of *paragraphs* at the same position, preserving their order.
    If no paragraph matches, a warning is printed and *doc* is left unchanged.

    Style resolution for each inserted paragraph:

    1. The style named by the entry's ``style`` key.
    2. If that style is missing and the name starts with ``"Heading"``:
       ``Normal``.
    3. If that style is missing for any other entry: *default_body_style*
       (when given), then the placeholder paragraph's own style.
    4. Otherwise the paragraph keeps the style ``doc.add_paragraph`` gave it.

    Parameters
    ----------
    doc : Document
        The document to modify in place.
    placeholder : str
        Text marking the paragraph to replace (substring match).
    paragraphs : list[dict]
        Records with at least the keys ``text`` and ``style``.
    default_body_style : str | None
        Optional fallback style name for non-heading paragraphs whose own
        style cannot be found in *doc*.
    """
    target = next((p for p in doc.paragraphs if placeholder in p.text), None)
    if target is None:
        print(f"警告:未找到占位符 '{placeholder}',正文段落未注入。")
        return

    placeholder_style = target.style.name if target.style else None
    target_el = target._element
    parent = target_el.getparent()
    idx = list(parent).index(target_el)

    # Keep the placeholder's numbering properties (w:numPr) so that the
    # inserted paragraphs inherit its automatic numbering.
    orig_pPr = target_el.find(qn("w:pPr"))
    numPr = orig_pPr.find(qn("w:numPr")) if orig_pPr is not None else None

    parent.remove(target_el)

    # Every new paragraph is inserted at the same index, so walking the list
    # backwards leaves the paragraphs in their original order.
    for pd_data in reversed(paragraphs):
        new_p = doc.add_paragraph(pd_data["text"])
        style_name = pd_data["style"]

        if not _apply_style(new_p, doc, style_name):
            if style_name.startswith("Heading"):
                fallbacks = ("Normal",)
            else:
                fallbacks = (default_body_style, placeholder_style)
            # Going through _apply_style means a missing fallback style is
            # skipped instead of raising KeyError.
            for fallback in fallbacks:
                if fallback and _apply_style(new_p, doc, fallback):
                    break

        if numPr is not None:
            new_pPr = new_p._element.find(qn("w:pPr"))
            if new_pPr is None:
                new_pPr = new_p._element.makeelement(qn("w:pPr"), {})
                new_p._element.insert(0, new_pPr)
            # Replace, rather than duplicate, any numbering already present.
            existing = new_pPr.find(qn("w:numPr"))
            if existing is not None:
                new_pPr.remove(existing)
            new_pPr.append(deepcopy(numPr))

        # add_paragraph() attached the element to the document already;
        # inserting it into *parent* moves it to the placeholder's position.
        parent.insert(idx, new_p._element)
def _apply_style(paragraph, doc, style_name: str) -> bool:
    """Assign the style called *style_name* to *paragraph* if *doc* has it.

    An exact lookup in ``doc.styles`` is tried first; if that raises
    ``KeyError``, the styles are scanned for a case-insensitive name match.
    Styles whose ``name`` is ``None`` are skipped during the scan.

    Returns ``True`` when a style was assigned, ``False`` otherwise (in which
    case *paragraph* is left untouched).
    """
    try:
        paragraph.style = doc.styles[style_name]
    except KeyError:
        pass
    else:
        return True

    # Case-insensitive fallback; lower-case the target once, not per style.
    wanted = style_name.lower()
    for candidate in doc.styles:
        name = candidate.name
        if name is not None and name.lower() == wanted:
            paragraph.style = candidate
            return True
    return False