feat(transit): 支持Markdown图片转Word图片段落

- 新增 transit/images.py 模块处理图片解析和插入逻辑
- 实现 `<img>` 标签解析为独立图片段落
- 添加图片尺寸检测和自适应缩放功能(超出页面宽度时自动缩放)
- 支持图标题居中显示和图片居中对齐
- 优化 body_to_paragraphs 函数,添加图片处理逻辑
- 更新 README.md 使用说明,添加模板文件要求说明

BREAKING CHANGE: 图片处理方式变更,需要包含 sample.docx 模板文件
This commit is contained in:
zzy
2026-05-09 16:17:51 +08:00
parent fc6afdea9d
commit 0b10e97e0c
3 changed files with 200 additions and 33 deletions

View File

@@ -1,4 +1,5 @@
使用方法 使用方法
```shell ```shell
py .\test.py .\毕业论文初稿.md # 需要 sample.docx 模板文件,且该文件中须包含 {{xxx}} 形式的模板引擎占位内容
python .\test.py .\毕业论文初稿.md
``` ```

View File

@@ -7,9 +7,12 @@ Markdown 正文 → Word 段落转换。
import re import re
from copy import deepcopy from copy import deepcopy
from docx import Document from docx import Document
from docx.oxml.ns import qn from docx.oxml.ns import qn
from .images import make_image_paragraph, is_figure_caption, insert_image_paragraphs
_PAT_HEADING = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE) _PAT_HEADING = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE)
@@ -33,17 +36,28 @@ def body_to_paragraphs(
paragraphs: list[dict] = [] paragraphs: list[dict] = []
last_end = 0 last_end = 0
def _add_block(block: str) -> None:
    """Classify one blank-line-separated block and append it to ``paragraphs``.

    This is a closure: it appends to the enclosing ``paragraphs`` list and
    reads the enclosing ``body_style``.

    * Whitespace-only blocks are ignored.
    * A block for which ``make_image_paragraph`` returns a truthy dict is
      appended as that dict (an image paragraph).
    * A block recognised by ``is_figure_caption`` is dropped when the
      previously appended entry is an image, so the caption is not emitted
      twice.
    * Anything else becomes a level-0 body paragraph using ``body_style``.
    """
    text = block.strip()
    if not text:
        return

    # Image paragraph: the image dict replaces the whole block.
    image = make_image_paragraph(text)
    if image:
        paragraphs.append(image)
        return

    # Duplicate caption directly after an image: skip it.
    follows_image = bool(paragraphs) and paragraphs[-1].get("type") == "image"
    if follows_image and is_figure_caption(text):
        return

    # Ordinary body paragraph.
    paragraphs.append({"text": text, "level": 0, "style": body_style})
for m in _PAT_HEADING.finditer(md_text): for m in _PAT_HEADING.finditer(md_text):
# 标题前的普通文本 # 标题前的普通文本
if m.start() > last_end: if m.start() > last_end:
pre = md_text[last_end : m.start()].strip() pre = md_text[last_end : m.start()].strip()
if pre: if pre:
for block in re.split(r"\n\s*\n", pre): for block in re.split(r"\n\s*\n", pre):
block = block.strip() _add_block(block)
if block:
paragraphs.append(
{"text": block, "level": 0, "style": body_style}
)
level = len(m.group(1)) + level_offset level = len(m.group(1)) + level_offset
heading_text = m.group(2).strip() heading_text = m.group(2).strip()
@@ -56,11 +70,7 @@ def body_to_paragraphs(
tail = md_text[last_end:].strip() tail = md_text[last_end:].strip()
if tail: if tail:
for block in re.split(r"\n\s*\n", tail): for block in re.split(r"\n\s*\n", tail):
block = block.strip() _add_block(block)
if block:
paragraphs.append(
{"text": block, "level": 0, "style": body_style}
)
return paragraphs return paragraphs
@@ -94,31 +104,36 @@ def replace_placeholder(
parent.remove(para._element) parent.remove(para._element)
for pd_data in reversed(paragraphs): for pd_data in reversed(paragraphs):
new_p = doc.add_paragraph(pd_data["text"]) if pd_data.get("type") == "image":
style_name = pd_data["style"] insert_image_paragraphs(
doc, [pd_data], idx=idx, parent=parent
)
else:
new_p = doc.add_paragraph(pd_data["text"])
style_name = pd_data["style"]
# 尝试应用样式,逐步降级 # 尝试应用样式,逐步降级
applied = _apply_style(new_p, doc, style_name) applied = _apply_style(new_p, doc, style_name)
if not applied and style_name.startswith("Heading"): if not applied and style_name.startswith("Heading"):
new_p.style = doc.styles["Normal"] new_p.style = doc.styles["Normal"]
elif not applied: elif not applied:
if placeholder_style: if placeholder_style:
_apply_style(new_p, doc, placeholder_style) _apply_style(new_p, doc, placeholder_style)
if new_p.style.name == "Normal" and placeholder_style: if new_p.style.name == "Normal" and placeholder_style:
new_p.style = doc.styles[placeholder_style] new_p.style = doc.styles[placeholder_style]
# 继承原段落的编号属性(自动编号) # 继承原段落的编号属性(自动编号)
if numPr is not None: if numPr is not None:
new_pPr = new_p._element.find(qn("w:pPr")) new_pPr = new_p._element.find(qn("w:pPr"))
if new_pPr is None: if new_pPr is None:
new_pPr = new_p._element.makeelement(qn("w:pPr"), {}) new_pPr = new_p._element.makeelement(qn("w:pPr"), {})
new_p._element.insert(0, new_pPr) new_p._element.insert(0, new_pPr)
existing = new_pPr.find(qn("w:numPr")) existing = new_pPr.find(qn("w:numPr"))
if existing is not None: if existing is not None:
new_pPr.remove(existing) new_pPr.remove(existing)
new_pPr.append(deepcopy(numPr)) new_pPr.append(deepcopy(numPr))
parent.insert(idx, new_p._element) parent.insert(idx, new_p._element)
break break
if not placeholder_found: if not placeholder_found:

151
transit/images.py Normal file
View File

@@ -0,0 +1,151 @@
"""
图片处理模块。
将 Markdown 中的 ``<img>`` 标签解析为独立段落,
并在 Word 文档中插入图片及居中图标题。
"""
import struct
import re
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
# Matches an <img ...> tag that has a quoted src attribute (case-insensitive).
# Group 2 is the src value; groups 1 and 3 are the raw attribute text found
# before and after the src attribute inside the tag.
_IMG_TAG = re.compile(
r'<img\s+([^>]*?)src=["\']([^"\']+)["\']([^>]*?)>', re.IGNORECASE
)
# Extracts (name, value) pairs of the form name="value" or name='value';
# unquoted attribute values are not matched.
_ATTR = re.compile(r'(\w+)\s*=\s*["\']([^"\']*)["\']')
# Matches a whole-line bold caption such as **图1 ...**, **Figure 2 ...**,
# **Fig. 3 ...** or **Table 4 ...** (case-insensitive). Used to recognise
# captions that duplicate an image's caption so they can be skipped.
_FIG_CAPTION = re.compile(
r'^\*\*(?:图|Table|Figure|Fig\.?)\s*\d+.*\*\*$', re.IGNORECASE
)
def make_image_paragraph(block: str) -> dict | None:
    """Return an image-paragraph dict if *block* contains an ``<img>`` tag.

    The result has the keys ``"type"`` (always ``"image"``), ``"src"`` and
    ``"alt"`` (empty string when no ``alt`` attribute is found). ``None`` is
    returned when no ``<img ... src="...">`` tag is present.

    NOTE(review): attribute pairs are collected from the *entire* block, not
    only from the matched tag, and later pairs override earlier ones. With
    several tags in one block the returned ``src``/``alt`` therefore come
    from the last occurrence of each attribute name.
    """
    tag = _IMG_TAG.search(block)
    if tag is None:
        return None

    found = {name: value for name, value in _ATTR.findall(block)}
    # Fall back to the src captured by the tag pattern itself.
    src = found["src"] if "src" in found else tag.group(2)
    return {"type": "image", "src": src, "alt": found.get("alt", "")}
def is_figure_caption(block: str) -> bool:
    """Tell whether *block* is a bold ``**图X ...**``-style caption line.

    Surrounding whitespace is ignored; the decision is made by the
    module-level ``_FIG_CAPTION`` pattern.
    """
    candidate = block.strip()
    return _FIG_CAPTION.match(candidate) is not None
def _get_image_dimensions(image_path: str) -> tuple[int, int] | None:
"""读取图片文件头返回 ``(width_px, height_px)``,不支持格式返回 ``None``。
仅读取文件头,不依赖第三方库。支持 PNG / JPEG / GIF / BMP。
"""
try:
with open(image_path, "rb") as f:
header = f.read(32)
except Exception:
return None
# PNG: 8-byte signature, then IHDR chunk
if header[:8] == b"\x89PNG\r\n\x1a\n":
w, h = struct.unpack_from(">II", header, 16)
return w, h
# JPEG: starts with FF D8, scan for SOF marker
if header[:2] == b"\xff\xd8":
pos = 2
while pos < len(header):
if header[pos] != 0xFF:
return None
marker = header[pos + 1]
if marker in (0xC0, 0xC1, 0xC2):
h, w = struct.unpack_from(">HH", header, pos + 5)
return w, h
seg_len = struct.unpack_from(">H", header, pos + 2)[0]
pos += 2 + seg_len
return None
# GIF: "GIF87a" or "GIF89a"
if header[:6] in (b"GIF87a", b"GIF89a"):
w, h = struct.unpack_from("<HH", header, 6)
return w, h
# BMP: "BM" signature
if header[:2] == b"BM":
w, h = struct.unpack_from("<ii", header, 18)
return w, abs(h)
return None
def _get_native_emu(image_path: str) -> int | None:
    """Return the image's pixel width converted to EMU, or ``None`` on failure.

    The conversion uses a fixed 12700 EMU per pixel, i.e. 914400 EMU per
    inch divided by 72 pixels per inch. The 72 DPI figure is this module's
    assumption; any DPI metadata stored in the image is not consulted.
    """
    size = _get_image_dimensions(image_path)
    if size is not None:
        return size[0] * 12700
    return None
def _constrain_width(image_path: str, page_text_width: int) -> int | None:
    """Return the width (EMU) to render the image at, capped at the text width.

    Parameters
    ----------
    image_path : str
        Path of the image file.
    page_text_width : int
        Width of the page's text area in EMU (page width minus margins).

    Returns
    -------
    The image's native width when it fits, *page_text_width* when the image
    is wider, or ``None`` when the native width cannot be determined.
    """
    native_width = _get_native_emu(image_path)
    if native_width is None:
        return None
    return page_text_width if native_width > page_text_width else native_width
def insert_image_paragraphs(
    doc: Document,
    paragraphs: list[dict],
    *,
    idx: int,
    parent,
):
    """Insert a sequence of image paragraphs into *doc* at position *idx*.

    Each entry of *paragraphs* (a dict with ``"src"`` and optional ``"alt"``)
    produces two centred Word paragraphs:

    1. the picture, shrunk to the text width when it is wider than the page's
       text area and otherwise kept at its native size;
    2. the caption, taken from ``alt``.

    Entries are processed in reverse and every element is inserted at the
    same *idx* of *parent*, so the final document order matches the order of
    *paragraphs*, with each caption below its picture.

    If the picture cannot be loaded, a warning is printed and a placeholder
    text is written into the picture paragraph instead. A failure while
    merely probing the image size does not prevent the picture from being
    inserted: the picture is added at its native size and clamped to the
    text width afterwards.
    """
    section = doc.sections[0]
    page_text_width = section.page_width - section.left_margin - section.right_margin

    for pd_data in reversed(paragraphs):
        # Caption first: inserting at the same idx afterwards pushes it below
        # the picture paragraph.
        cap_p = doc.add_paragraph(pd_data.get("alt", ""))
        cap_p.alignment = WD_ALIGN_PARAGRAPH.CENTER
        parent.insert(idx, cap_p._element)

        # Picture paragraph (max-width behaviour).
        img_p = doc.add_paragraph()
        img_p.alignment = WD_ALIGN_PARAGRAPH.CENTER
        img_run = img_p.add_run()
        try:
            try:
                img_width = _constrain_width(pd_data["src"], page_text_width)
            except (OSError, IndexError, struct.error):
                # Size probing is best-effort; fall back to clamping below.
                img_width = None
            picture = img_run.add_picture(pd_data["src"], width=img_width)
            if img_width is None and picture.width > page_text_width:
                # Unknown size up front: scale the inserted picture down to
                # the text width, keeping its aspect ratio.
                scaled_height = picture.height * page_text_width // picture.width
                picture.width = page_text_width
                picture.height = scaled_height
        except Exception as exc:
            print(f"警告:图片加载失败 {pd_data['src']}{exc}")
            img_run.add_text(f"[图片加载失败: {pd_data['src']}]")
        parent.insert(idx, img_p._element)