feat(transit): 添加正文引用标记到书签超链接功能

- 新增 CITE_PATTERN 正则表达式匹配 [N] 引用格式
- 添加 base_dir 参数支持相对图片路径解析
- 实现书签创建和超链接替换功能
- 添加 link_body_citations 函数处理正文引用链接
- 在参考文献段落中添加书签标识
- 支持将 [N] 引用替换为指向参考文献的超链接
This commit is contained in:
zzy
2026-05-10 15:07:20 +08:00
parent 0b10e97e0c
commit 4e39a4f2ac
4 changed files with 153 additions and 7 deletions

View File

@@ -7,6 +7,7 @@ Markdown 正文 → Word 段落转换。
import re import re
from copy import deepcopy from copy import deepcopy
from pathlib import Path
from docx import Document from docx import Document
from docx.oxml.ns import qn from docx.oxml.ns import qn
@@ -15,12 +16,16 @@ from .images import make_image_paragraph, is_figure_caption, insert_image_paragr
_PAT_HEADING = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE) _PAT_HEADING = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE)
# 匹配正文中的引用标记 [1] / [1,2,3]
_CITE_PATTERN = re.compile(r"\[(\d+(?:[,\s]*\d+)*)\]")
def body_to_paragraphs( def body_to_paragraphs(
md_text: str, md_text: str,
*, *,
level_offset: int = 0, level_offset: int = 0,
body_style: str = "Body Text Indent", body_style: str = "Body Text Indent",
base_dir: str | Path | None = None,
) -> list[dict]: ) -> list[dict]:
"""将 Markdown 正文按标题和段落拆分为结构化列表。 """将 Markdown 正文按标题和段落拆分为结构化列表。
@@ -32,6 +37,8 @@ def body_to_paragraphs(
标题级别偏移量(正文从 ``##`` 开始时传 ``-1``,使其输出为 ``Heading 1``)。 标题级别偏移量(正文从 ``##`` 开始时传 ``-1``,使其输出为 ``Heading 1``)。
body_style : str body_style : str
正文段落的 Word 样式名。 正文段落的 Word 样式名。
base_dir : str | Path | None
Markdown 文件所在目录,用于解析图片相对路径。
""" """
paragraphs: list[dict] = [] paragraphs: list[dict] = []
last_end = 0 last_end = 0
@@ -41,7 +48,7 @@ def body_to_paragraphs(
if not block: if not block:
return return
# 图片段落 # 图片段落
img = make_image_paragraph(block) img = make_image_paragraph(block, base_dir)
if img: if img:
paragraphs.append(img) paragraphs.append(img)
return return
@@ -103,6 +110,9 @@ def replace_placeholder(
parent.remove(para._element) parent.remove(para._element)
# 为参考文献段落准备书签 ID
bm_id = _max_bookmark_id(doc) + 1
for pd_data in reversed(paragraphs): for pd_data in reversed(paragraphs):
if pd_data.get("type") == "image": if pd_data.get("type") == "image":
insert_image_paragraphs( insert_image_paragraphs(
@@ -134,12 +144,123 @@ def replace_placeholder(
new_pPr.append(deepcopy(numPr)) new_pPr.append(deepcopy(numPr))
parent.insert(idx, new_p._element) parent.insert(idx, new_p._element)
# 为参考文献条目添加书签
ref_id = pd_data.get("ref_id")
if ref_id is not None:
_add_bookmark(new_p, f"ref-{ref_id}", bm_id)
bm_id += 1
break break
if not placeholder_found: if not placeholder_found:
print(f"警告:未找到占位符 '{placeholder}',正文段落未注入。") print(f"警告:未找到占位符 '{placeholder}',正文段落未注入。")
def link_body_citations(doc: Document):
    """Replace ``[N]`` citation markers in the document's paragraphs with hyperlinks.

    Every paragraph in ``doc.paragraphs`` is handed to ``_link_paragraph``,
    except paragraphs that already contain a ``w:hyperlink`` child.
    """
    hyperlink_tag = qn("w:hyperlink")
    for paragraph in doc.paragraphs:
        # Paragraphs that already hold a hyperlink (e.g. a table-of-contents
        # page) are left untouched.
        if paragraph._element.find(hyperlink_tag) is None:
            _link_paragraph(paragraph)
def _max_bookmark_id(doc: Document) -> int:
    """Return the largest numeric bookmark ID found in the document, or 0.

    The whole main document element is scanned, not just ``doc.paragraphs``,
    so bookmarks inside tables or placed directly between paragraphs are seen
    too; otherwise a newly allocated ID could collide with one of them.
    Only the element held by *doc* is searched; anything stored outside it is
    not visited.

    ``w:bookmarkStart`` elements whose ``w:id`` is missing or not an integer
    are ignored.
    """
    id_attr = qn("w:id")
    max_id = 0
    for bm in doc._element.iter(qn("w:bookmarkStart")):
        try:
            max_id = max(max_id, int(bm.get(id_attr)))
        except (ValueError, TypeError):
            # Missing attribute (None) or non-numeric value: skip this bookmark.
            continue
    return max_id
def _add_bookmark(paragraph, name: str, bm_id: int):
    """Wrap the content of *paragraph* in a bookmark called *name*.

    A ``w:bookmarkStart`` (carrying ``w:id`` and ``w:name``) is inserted at the
    front of the paragraph and a matching ``w:bookmarkEnd`` is appended at the
    end. *bm_id* is written as the ``w:id`` of both elements.
    """
    p = paragraph._element
    id_text = str(bm_id)

    start = p.makeelement(qn("w:bookmarkStart"), {})
    start.set(qn("w:id"), id_text)
    start.set(qn("w:name"), name)

    end = p.makeelement(qn("w:bookmarkEnd"), {})
    end.set(qn("w:id"), id_text)

    # When the paragraph has a w:pPr, the start marker goes at index 1
    # (the code assumes w:pPr is the first child); otherwise at index 0.
    start_index = 0 if p.find(qn("w:pPr")) is None else 1
    p.insert(start_index, start)
    p.append(end)
def _link_paragraph(para):
    """Replace ``[N]`` markers in one paragraph with ``w:hyperlink`` elements.

    The text of the paragraph's direct ``w:r`` children is concatenated and
    searched with ``_CITE_PATTERN``. If at least one marker is found, the runs
    are removed and rebuilt as plain runs interleaved with hyperlinks whose
    anchor is ``ref-<N>``.

    Behaviour worth knowing:

    * A paragraph is left untouched when any of its runs holds something other
      than ``w:rPr`` / ``w:t`` (drawing, tab, break, field character, ...).
      Such content cannot be rebuilt from text, and rewriting the runs would
      silently delete it.
    * Every ``w:t`` of a run contributes to the text, not only the first one.
    * Plain text keeps the ``w:rPr`` of the run it came from, so mixed
      formatting inside the paragraph is preserved.
    * Rebuilt nodes are put back where the first original run was, so siblings
      that followed the runs (e.g. ``w:bookmarkEnd``) stay behind them.
    * A grouped marker such as ``[1,2,3]`` becomes a single link that targets
      the first number only.
    """
    p = para._element
    runs = list(p.findall(qn("w:r")))
    if not runs:
        return

    rpr_tag = qn("w:rPr")
    text_tag = qn("w:t")

    # (start, end, rPr) for each run, as offsets into the concatenated text.
    spans = []
    parts = []
    offset = 0
    for run in runs:
        run_text = ""
        for child in run:
            if child.tag == text_tag:
                run_text += child.text or ""
            elif child.tag != rpr_tag:
                # Non-text run content would be lost by the rebuild: bail out
                # before anything has been modified.
                return
        spans.append((offset, offset + len(run_text), run.find(rpr_tag)))
        parts.append(run_text)
        offset += len(run_text)

    full_text = "".join(parts)
    matches = list(_CITE_PATTERN.finditer(full_text))
    if not matches:
        return

    insert_at = p.index(runs[0])
    for run in runs:
        p.remove(run)
    rebuilt_from = len(p)

    def add_plain(lo, hi):
        # Emit full_text[lo:hi], split at the original run boundaries so each
        # piece keeps the formatting of the run it came from.
        for start, end, rPr in spans:
            a, b = max(lo, start), min(hi, end)
            if a < b:
                _add_run(p, full_text[a:b], rPr)

    pos = 0
    for m in matches:
        add_plain(pos, m.start())
        nums = re.findall(r"\d+", m.group(1))
        _add_hlink(p, f"ref-{nums[0]}", m.group())
        pos = m.end()
    add_plain(pos, len(full_text))

    # The helpers append at the end of the paragraph. Move the new nodes back
    # to the slot of the first original run; python-docx elements are lxml
    # elements, where inserting an already-attached element moves it.
    for shift, node in enumerate(list(p)[rebuilt_from:]):
        target = insert_at + shift
        if p[target] is not node:
            p.insert(target, node)
def _add_run(parent, text: str, rPr):
    """Append a ``w:r`` run containing *text* to *parent*.

    Parameters
    ----------
    parent
        Element the new run is appended to.
    text : str
        Text placed in the run's ``w:t`` element.
    rPr
        Run-properties element to copy into the new run, or ``None`` for no
        explicit formatting. It is deep-copied, so the caller's element is
        not moved or modified.
    """
    run = parent.makeelement(qn("w:r"), {})
    if rPr is not None:
        run.append(deepcopy(rPr))
    t = run.makeelement(qn("w:t"), {})
    t.text = text
    if text != text.strip():
        # Without xml:space="preserve" Word discards leading/trailing
        # whitespace of a w:t, which would glue words to adjacent citations.
        t.set("{http://www.w3.org/XML/1998/namespace}space", "preserve")
    run.append(t)
    parent.append(run)
def _add_hlink(parent, anchor: str, text: str):
    """Append a ``w:hyperlink`` pointing at bookmark *anchor* to *parent*.

    The link holds a single run showing *text*; its run properties are the
    ``Hyperlink`` character style and superscript vertical alignment.
    """
    link = parent.makeelement(qn("w:hyperlink"), {})
    link.set(qn("w:anchor"), anchor)

    run = parent.makeelement(qn("w:r"), {})
    props = run.makeelement(qn("w:rPr"), {})
    # Each property is an empty element carrying a single w:val attribute.
    for tag, value in (("w:rStyle", "Hyperlink"), ("w:vertAlign", "superscript")):
        prop = props.makeelement(qn(tag), {})
        prop.set(qn("w:val"), value)
        props.append(prop)
    run.append(props)

    label = run.makeelement(qn("w:t"), {})
    label.text = text
    run.append(label)

    link.append(run)
    parent.append(link)
def _apply_style(paragraph, doc, style_name: str) -> bool: def _apply_style(paragraph, doc, style_name: str) -> bool:
"""尝试给段落应用样式,成功返回 ``True``。""" """尝试给段落应用样式,成功返回 ``True``。"""
try: try:

View File

@@ -7,6 +7,7 @@
import struct import struct
import re import re
from pathlib import Path
from docx import Document from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH from docx.enum.text import WD_ALIGN_PARAGRAPH
@@ -25,15 +26,26 @@ _FIG_CAPTION = re.compile(
) )
def make_image_paragraph(block: str, base_dir: str | Path | None = None) -> dict | None:
    """Return an image-paragraph dict if *block* contains an ``<img>`` tag, else ``None``.

    Parameters
    ----------
    block : str
        Text block to inspect.
    base_dir : str | Path | None
        Directory of the Markdown file, used to resolve a relative image path.
        When falsy, or when the image path is already absolute, the path is
        used as written.

    Returns
    -------
    dict | None
        ``{"type": "image", "src": ..., "alt": ...}``, or ``None`` when
        ``_IMG_TAG`` does not match *block*.
    """
    tag_match = _IMG_TAG.search(block)
    if not tag_match:
        return None

    attributes = dict(_ATTR.findall(block))
    # Prefer an explicit src attribute; otherwise use group 2 of the tag match.
    image_src = attributes.get("src", tag_match.group(2))
    if base_dir and not Path(image_src).is_absolute():
        image_src = str(Path(base_dir) / image_src)

    image_paragraph = {
        "type": "image",
        "src": image_src,
        "alt": attributes.get("alt", ""),
    }
    return image_paragraph

View File

@@ -104,11 +104,15 @@ def references_to_paragraphs(
ref = parse_reference_line(line) ref = parse_reference_line(line)
if ref: if ref:
formatted = format_gb7714(ref) formatted = format_gb7714(ref)
ref_id = ref["number"]
else: else:
# 无法解析时,至少去掉 [N] 前缀 # 无法解析时,至少去掉 [N] 前缀
fallback = re.sub(r"^\[\d+\]\s*", "", line) fallback = re.sub(r"^\[\d+\]\s*", "", line)
formatted = _normalize_period(fallback) formatted = _normalize_period(fallback)
ref_id = None
paragraphs.append({"text": formatted, "level": 0, "style": ref_style}) paragraphs.append(
{"text": formatted, "level": 0, "style": ref_style, "ref_id": ref_id}
)
return paragraphs return paragraphs

View File

@@ -11,7 +11,7 @@ from docx import Document
from .config import load_config, ThesisConfig from .config import load_config, ThesisConfig
from .parser import parse_markdown from .parser import parse_markdown
from .body import body_to_paragraphs, replace_placeholder from .body import body_to_paragraphs, replace_placeholder, link_body_citations
from .references import references_to_paragraphs from .references import references_to_paragraphs
@@ -88,7 +88,12 @@ def generate_thesis(
# 5. 解析正文为段落列表 # 5. 解析正文为段落列表
body_md = ctx.get("body_md", "") body_md = ctx.get("body_md", "")
body_paragraphs = ( body_paragraphs = (
body_to_paragraphs(body_md, level_offset=config.level_offset, body_style=config.body_style) body_to_paragraphs(
body_md,
level_offset=config.level_offset,
body_style=config.body_style,
base_dir=data_path.parent,
)
if body_md else [] if body_md else []
) )
@@ -114,6 +119,10 @@ def generate_thesis(
final_doc, "__CONTEXT_PLACEHOLDER__", body_paragraphs, final_doc, "__CONTEXT_PLACEHOLDER__", body_paragraphs,
default_body_style=config.body_style, default_body_style=config.body_style,
) )
# 将正文中的 [N] 引用替换为超链接
link_body_citations(final_doc)
replace_placeholder( replace_placeholder(
final_doc, "__REFERENCE_PLACEHOLDER__", ref_paragraphs, final_doc, "__REFERENCE_PLACEHOLDER__", ref_paragraphs,
default_body_style=config.reference_style, default_body_style=config.reference_style,