""" Markdown 正文 → Word 段落转换。 将正文 Markdown 按标题层级拆分为带样式的段落序列, 再注入到渲染后 docx 文档的占位符位置。 """ import re from copy import deepcopy from pathlib import Path from docx import Document from docx.oxml.ns import qn from .images import make_image_paragraph, is_figure_caption, insert_image_paragraphs _PAT_HEADING = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE) # 匹配正文中的引用标记 [1] / [1,2,3] _CITE_PATTERN = re.compile(r"\[(\d+(?:[,,\s]*\d+)*)\]") def body_to_paragraphs( md_text: str, *, level_offset: int = 0, body_style: str = "Body Text Indent", base_dir: str | Path | None = None, ) -> list[dict]: """将 Markdown 正文按标题和段落拆分为结构化列表。 Parameters ---------- md_text : str 正文 Markdown。 level_offset : int 标题级别偏移量(正文从 ``##`` 开始时传 ``-1``,使其输出为 ``Heading 1``)。 body_style : str 正文段落的 Word 样式名。 base_dir : str | Path | None Markdown 文件所在目录,用于解析图片相对路径。 """ paragraphs: list[dict] = [] last_end = 0 def _add_block(block: str) -> None: block = block.strip() if not block: return # 图片段落 img = make_image_paragraph(block, base_dir) if img: paragraphs.append(img) return # 跳过紧跟在图片后的重复图标题 if paragraphs and paragraphs[-1].get("type") == "image" and is_figure_caption(block): return # 普通正文段落 paragraphs.append({"text": block, "level": 0, "style": body_style}) for m in _PAT_HEADING.finditer(md_text): # 标题前的普通文本 if m.start() > last_end: pre = md_text[last_end : m.start()].strip() if pre: for block in re.split(r"\n\s*\n", pre): _add_block(block) level = len(m.group(1)) + level_offset heading_text = m.group(2).strip() paragraphs.append( {"text": heading_text, "level": level, "style": f"Heading {level}"} ) last_end = m.end() # 最后一段 / 尾部文本 tail = md_text[last_end:].strip() if tail: for block in re.split(r"\n\s*\n", tail): _add_block(block) return paragraphs def replace_placeholder( doc: Document, placeholder: str, paragraphs: list[dict], *, default_body_style: str | None = None, ): """在 *doc* 中找到包含 *placeholder* 的段落,替换为 *paragraphs* 列表。 正文段落的样式优先级: 1. ``style`` 字段指定的样式名(来自 ``body_to_paragraphs`` 的 ``body_style``) 2. 占位符段落自身的样式(模板中已设好的样式) 3. ``Normal`` """ placeholder_found = False for para in doc.paragraphs: if placeholder in para.text: placeholder_found = True placeholder_style = para.style.name if para.style else None parent = para._element.getparent() idx = list(parent).index(para._element) # 保存原段落的编号属性(numPr),用于继承自动编号 orig_pPr = para._element.find(qn("w:pPr")) numPr = orig_pPr.find(qn("w:numPr")) if orig_pPr is not None else None parent.remove(para._element) # 为参考文献段落准备书签 ID bm_id = _max_bookmark_id(doc) + 1 for pd_data in reversed(paragraphs): if pd_data.get("type") == "image": insert_image_paragraphs( doc, [pd_data], idx=idx, parent=parent ) else: new_p = doc.add_paragraph(pd_data["text"]) style_name = pd_data["style"] # 尝试应用样式,逐步降级 applied = _apply_style(new_p, doc, style_name) if not applied and style_name.startswith("Heading"): new_p.style = doc.styles["Normal"] elif not applied: if placeholder_style: _apply_style(new_p, doc, placeholder_style) if new_p.style.name == "Normal" and placeholder_style: new_p.style = doc.styles[placeholder_style] # 继承原段落的编号属性(自动编号) if numPr is not None: new_pPr = new_p._element.find(qn("w:pPr")) if new_pPr is None: new_pPr = new_p._element.makeelement(qn("w:pPr"), {}) new_p._element.insert(0, new_pPr) existing = new_pPr.find(qn("w:numPr")) if existing is not None: new_pPr.remove(existing) new_pPr.append(deepcopy(numPr)) parent.insert(idx, new_p._element) # 为参考文献条目添加书签 ref_id = pd_data.get("ref_id") if ref_id is not None: _add_bookmark(new_p, f"ref-{ref_id}", bm_id) bm_id += 1 break if not placeholder_found: print(f"警告:未找到占位符 '{placeholder}',正文段落未注入。") def link_body_citations(doc: Document): """将文档中正文段落的 ``[N]`` 引用替换为指向对应书签的超链接。""" for para in doc.paragraphs: # 跳过已有超链接的段落(如目录页) if para._element.findall(qn("w:hyperlink")): continue _link_paragraph(para) def _max_bookmark_id(doc: Document) -> int: """扫描文档返回最大书签 ID。""" max_id = 0 for para in doc.paragraphs: for bm in para._element.iter(qn("w:bookmarkStart")): try: max_id = max(max_id, int(bm.get(qn("w:id")))) except (ValueError, TypeError): pass return max_id def _add_bookmark(paragraph, name: str, bm_id: int): """为段落添加书签。""" bm_start = paragraph._element.makeelement(qn("w:bookmarkStart"), {}) bm_start.set(qn("w:id"), str(bm_id)) bm_start.set(qn("w:name"), name) bm_end = paragraph._element.makeelement(qn("w:bookmarkEnd"), {}) bm_end.set(qn("w:id"), str(bm_id)) pPr = paragraph._element.find(qn("w:pPr")) if pPr is not None: paragraph._element.insert(1, bm_start) else: paragraph._element.insert(0, bm_start) paragraph._element.append(bm_end) def _link_paragraph(para): """将单个段落中的 ``[N]`` 替换为 HYPERLINK 域。""" runs = list(para._element.findall(qn("w:r"))) if not runs: return full_text = "" for r in runs: t = r.find(qn("w:t")) if t is not None and t.text: full_text += t.text matches = list(_CITE_PATTERN.finditer(full_text)) if not matches: return first_rPr = runs[0].find(qn("w:rPr")) for r in runs: para._element.remove(r) pos = 0 for m in matches: before = full_text[pos : m.start()] if before: _add_run(para._element, before, first_rPr) nums = re.findall(r"\d+", m.group(1)) _add_hlink(para._element, f"ref-{nums[0]}", m.group()) pos = m.end() after = full_text[pos:] if after: _add_run(para._element, after, first_rPr) def _add_run(parent, text: str, rPr): r = parent.makeelement(qn("w:r"), {}) if rPr is not None: r.append(deepcopy(rPr)) t = r.makeelement(qn("w:t"), {}) t.text = text r.append(t) parent.append(r) def _add_hlink(parent, anchor: str, text: str): hl = parent.makeelement(qn("w:hyperlink"), {}) hl.set(qn("w:anchor"), anchor) r = parent.makeelement(qn("w:r"), {}) rPr = r.makeelement(qn("w:rPr"), {}) rStyle = rPr.makeelement(qn("w:rStyle"), {}) rStyle.set(qn("w:val"), "Hyperlink") rPr.append(rStyle) vertAlign = rPr.makeelement(qn("w:vertAlign"), {}) vertAlign.set(qn("w:val"), "superscript") rPr.append(vertAlign) r.append(rPr) t = r.makeelement(qn("w:t"), {}) t.text = text r.append(t) hl.append(r) parent.append(hl) def _apply_style(paragraph, doc, style_name: str) -> bool: """尝试给段落应用样式,成功返回 ``True``。""" try: paragraph.style = doc.styles[style_name] return True except KeyError: pass # 大小写不敏感匹配 for s in doc.styles: if s.name.lower() == style_name.lower(): paragraph.style = s return True return False