From 0b10e97e0c2623b45c1c0efdaad6c3f12f54b498 Mon Sep 17 00:00:00 2001
From: zzy <2450266535@qq.com>
Date: Sat, 9 May 2026 16:17:51 +0800
Subject: [PATCH] =?UTF-8?q?feat(transit):=20=E6=94=AF=E6=8C=81Markdown?=
=?UTF-8?q?=E5=9B=BE=E7=89=87=E8=BD=ACWord=E5=9B=BE=E7=89=87=E6=AE=B5?=
=?UTF-8?q?=E8=90=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- 新增 transit/images.py 模块处理图片解析和插入逻辑
- 实现 `<img>` 标签解析为独立图片段落
- 添加图片尺寸检测和自适应缩放功能(超出页面宽度时自动缩放)
- 支持图标题居中显示和图片居中对齐
- 优化 body_to_paragraphs 函数,添加图片处理逻辑
- 更新 README.md 使用说明,添加模板文件要求说明
BREAKING CHANGE: 图片处理方式变更,需要包含 sample.docx 模板文件
---
README.md | 3 +-
transit/body.py | 79 ++++++++++++++----------
transit/images.py | 151 ++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 200 insertions(+), 33 deletions(-)
create mode 100644 transit/images.py
diff --git a/README.md b/README.md
index 887931d..c0ca4a5 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,5 @@
使用方法
```shell
-py .\test.py .\毕业论文初稿.md
+# 需要 sample.docx 文件 且 该文件有 {{xxx}} 模板引擎的内容
+python .\test.py .\毕业论文初稿.md
```
diff --git a/transit/body.py b/transit/body.py
index e0c35a7..d372879 100644
--- a/transit/body.py
+++ b/transit/body.py
@@ -7,9 +7,12 @@ Markdown 正文 → Word 段落转换。
import re
from copy import deepcopy
+
from docx import Document
from docx.oxml.ns import qn
+from .images import make_image_paragraph, is_figure_caption, insert_image_paragraphs
+
_PAT_HEADING = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE)
@@ -33,17 +36,28 @@ def body_to_paragraphs(
paragraphs: list[dict] = []
last_end = 0
+ def _add_block(block: str) -> None:
+ """Classify one text block and append the resulting entry to ``paragraphs``."""
+ block = block.strip()
+ if not block:
+ return
+ # Image paragraph: a block for which make_image_paragraph returns a dict
+ img = make_image_paragraph(block)
+ if img:
+ paragraphs.append(img)
+ return
+ # Skip a figure caption that directly follows an image entry
+ if paragraphs and paragraphs[-1].get("type") == "image" and is_figure_caption(block):
+ return
+ # Ordinary body paragraph
+ paragraphs.append({"text": block, "level": 0, "style": body_style})
+
for m in _PAT_HEADING.finditer(md_text):
# 标题前的普通文本
if m.start() > last_end:
pre = md_text[last_end : m.start()].strip()
if pre:
for block in re.split(r"\n\s*\n", pre):
- block = block.strip()
- if block:
- paragraphs.append(
- {"text": block, "level": 0, "style": body_style}
- )
+ _add_block(block)
level = len(m.group(1)) + level_offset
heading_text = m.group(2).strip()
@@ -56,11 +70,7 @@ def body_to_paragraphs(
tail = md_text[last_end:].strip()
if tail:
for block in re.split(r"\n\s*\n", tail):
- block = block.strip()
- if block:
- paragraphs.append(
- {"text": block, "level": 0, "style": body_style}
- )
+ _add_block(block)
return paragraphs
@@ -94,31 +104,36 @@ def replace_placeholder(
parent.remove(para._element)
for pd_data in reversed(paragraphs):
- new_p = doc.add_paragraph(pd_data["text"])
- style_name = pd_data["style"]
+ if pd_data.get("type") == "image":
+ insert_image_paragraphs(
+ doc, [pd_data], idx=idx, parent=parent
+ )
+ else:
+ new_p = doc.add_paragraph(pd_data["text"])
+ style_name = pd_data["style"]
- # 尝试应用样式,逐步降级
- applied = _apply_style(new_p, doc, style_name)
- if not applied and style_name.startswith("Heading"):
- new_p.style = doc.styles["Normal"]
- elif not applied:
- if placeholder_style:
- _apply_style(new_p, doc, placeholder_style)
- if new_p.style.name == "Normal" and placeholder_style:
- new_p.style = doc.styles[placeholder_style]
+ # 尝试应用样式,逐步降级
+ applied = _apply_style(new_p, doc, style_name)
+ if not applied and style_name.startswith("Heading"):
+ new_p.style = doc.styles["Normal"]
+ elif not applied:
+ if placeholder_style:
+ _apply_style(new_p, doc, placeholder_style)
+ if new_p.style.name == "Normal" and placeholder_style:
+ new_p.style = doc.styles[placeholder_style]
- # 继承原段落的编号属性(自动编号)
- if numPr is not None:
- new_pPr = new_p._element.find(qn("w:pPr"))
- if new_pPr is None:
- new_pPr = new_p._element.makeelement(qn("w:pPr"), {})
- new_p._element.insert(0, new_pPr)
- existing = new_pPr.find(qn("w:numPr"))
- if existing is not None:
- new_pPr.remove(existing)
- new_pPr.append(deepcopy(numPr))
+ # 继承原段落的编号属性(自动编号)
+ if numPr is not None:
+ new_pPr = new_p._element.find(qn("w:pPr"))
+ if new_pPr is None:
+ new_pPr = new_p._element.makeelement(qn("w:pPr"), {})
+ new_p._element.insert(0, new_pPr)
+ existing = new_pPr.find(qn("w:numPr"))
+ if existing is not None:
+ new_pPr.remove(existing)
+ new_pPr.append(deepcopy(numPr))
- parent.insert(idx, new_p._element)
+ parent.insert(idx, new_p._element)
break
if not placeholder_found:
diff --git a/transit/images.py b/transit/images.py
new file mode 100644
index 0000000..6c41a3b
--- /dev/null
+++ b/transit/images.py
@@ -0,0 +1,151 @@
+"""
+图片处理模块。
+
+将 Markdown 中的 ``<img>`` 标签解析为独立段落,
+并在 Word 文档中插入图片及居中图标题。
+"""
+
+import struct
+import re
+
+from docx import Document
+from docx.enum.text import WD_ALIGN_PARAGRAPH
+
+# Matches an HTML <img ...> tag. Group 1 is the attribute text before
+# ``src``, group 2 the ``src`` value, group 3 the attribute text after it.
+# NOTE(review): the opening ``<img\s+([^>`` part of this pattern was restored
+# after being lost to tag stripping — confirm against the original commit.
+_IMG_TAG = re.compile(
+    r'<img\s+([^>]*?)src=["\']([^"\']+)["\']([^>]*?)>', re.IGNORECASE
+)
+
+# Extracts quoted name="value" pairs from attribute text.
+_ATTR = re.compile(r'(\w+)\s*=\s*["\']([^"\']*)["\']')
+
+# Matches a bold figure caption such as **图1 ...** (skipped on injection
+# when it directly follows an image).
+_FIG_CAPTION = re.compile(
+    r'^\*\*(?:图|Table|Figure|Fig\.?)\s*\d+.*\*\*$', re.IGNORECASE
+)
+
+
+def make_image_paragraph(block: str) -> dict | None:
+    """Return an image-paragraph dict if *block* contains an ``<img>`` tag.
+
+    Only the first ``<img>`` tag found in *block* is used; any other text in
+    the block is not carried into the result.
+
+    Returns ``None`` when no tag is found. Otherwise returns a dict with the
+    keys ``"type"`` (always ``"image"``), ``"src"`` and ``"alt"`` (an empty
+    string when the tag has no ``alt`` attribute).
+    """
+    m = _IMG_TAG.search(block)
+    if not m:
+        return None
+    # Parse attributes from the matched tag only, so quoted name="value" text
+    # elsewhere in the block (or a second tag) cannot override src/alt.
+    attrs = dict(_ATTR.findall(m.group(0)))
+    return {
+        "type": "image",
+        "src": attrs.get("src", m.group(2)),
+        "alt": attrs.get("alt", ""),
+    }
+
+
+def is_figure_caption(block: str) -> bool:
+    """Tell whether *block*, once stripped, is a bold ``**图X ...**`` caption."""
+    stripped = block.strip()
+    return _FIG_CAPTION.match(stripped) is not None
+
+
+# JPEG start-of-frame markers: 0xC0-0xCF except DHT (0xC4), JPG (0xC8), DAC (0xCC).
+_JPEG_SOF_MARKERS = frozenset(range(0xC0, 0xD0)) - {0xC4, 0xC8, 0xCC}
+
+
+def _read_jpeg_dimensions(f) -> tuple[int, int] | None:
+    """Walk the marker segments of open binary file *f* until a frame header.
+
+    Returns ``(width_px, height_px)``, or ``None`` if the stream ends or is
+    malformed before a start-of-frame segment is found.
+    """
+    f.seek(2)  # skip the SOI marker (FF D8)
+    while True:
+        if f.read(1) != b"\xff":
+            return None  # EOF, or not positioned on a marker
+        marker = f.read(1)
+        while marker == b"\xff":  # fill bytes may precede the marker code
+            marker = f.read(1)
+        if not marker:
+            return None
+        code = marker[0]
+        if code == 0x01 or 0xD0 <= code <= 0xD7:
+            continue  # TEM / RSTn markers carry no length field
+        if code in (0xD9, 0xDA):
+            return None  # EOI or start of scan reached without a frame header
+        raw_len = f.read(2)
+        if len(raw_len) < 2:
+            return None
+        (seg_len,) = struct.unpack(">H", raw_len)
+        if seg_len < 2:
+            return None
+        if code in _JPEG_SOF_MARKERS:
+            frame = f.read(5)  # precision (1 byte), height (2), width (2)
+            if len(frame) < 5:
+                return None
+            h, w = struct.unpack_from(">HH", frame, 1)
+            return w, h
+        f.seek(seg_len - 2, 1)  # skip the remainder of this segment
+
+
+def _get_image_dimensions(image_path: str) -> tuple[int, int] | None:
+    """Return ``(width_px, height_px)`` read from the image file's header.
+
+    Uses only the standard library. Recognises PNG, JPEG, GIF and BMP by
+    their signatures; returns ``None`` for any other format, for an
+    unreadable file, or for a truncated header.
+    """
+    try:
+        with open(image_path, "rb") as f:
+            header = f.read(32)
+            # JPEG: the frame header normally lies well beyond the first
+            # 32 bytes, so the segments are scanned from the file itself.
+            if header[:2] == b"\xff\xd8":
+                return _read_jpeg_dimensions(f)
+    except (OSError, ValueError):
+        return None
+
+    # PNG: 8-byte signature, then the IHDR chunk (width/height at offset 16).
+    if header[:8] == b"\x89PNG\r\n\x1a\n":
+        if len(header) < 24:
+            return None
+        w, h = struct.unpack_from(">II", header, 16)
+        return w, h
+
+    # GIF: "GIF87a" or "GIF89a", then little-endian 16-bit width and height.
+    if header[:6] in (b"GIF87a", b"GIF89a"):
+        if len(header) < 10:
+            return None
+        w, h = struct.unpack_from("<HH", header, 6)
+        return w, h
+
+    # BMP: "BM", 14-byte file header, then the DIB header.
+    # NOTE(review): this branch was reconstructed (the original text was lost
+    # to tag stripping) — confirm against the original commit.
+    if header[:2] == b"BM":
+        if len(header) < 26:
+            return None
+        (dib_size,) = struct.unpack_from("<I", header, 14)
+        if dib_size == 12:  # BITMAPCOREHEADER stores 16-bit dimensions
+            w, h = struct.unpack_from("<HH", header, 18)
+            return w, h
+        w, h = struct.unpack_from("<ii", header, 18)
+        return w, abs(h)  # a negative height marks a top-down bitmap
+
+    return None
+
+
+def _get_native_emu(image_path: str) -> int | None:
+    """Return the image's pixel width converted to EMU, or ``None`` on failure.
+
+    The conversion assumes 72 DPI: 1 px = 914400 / 72 = 12700 EMU.
+    NOTE(review): python-docx derives the native size from the DPI stored in
+    the image and only falls back to 72, so this can differ for images that
+    embed another DPI — confirm.
+    """
+    dims = _get_image_dimensions(image_path)
+    if dims is None:
+        return None
+    w_px, _ = dims
+    return w_px * 12700
+
+
+def _constrain_width(image_path: str, page_text_width: int) -> int | None:
+    """Return the image width in EMU, capped at the page text width.
+
+    Parameters
+    ----------
+    image_path : str
+        Path of the image file.
+    page_text_width : int
+        Width of the page text area in EMU (``section.page_width`` minus
+        the margins).
+
+    Returns ``None`` when the native width cannot be determined.
+    """
+    native_width = _get_native_emu(image_path)
+    if native_width is None:
+        return None
+    if native_width <= page_text_width:
+        return native_width
+    return page_text_width
+
+
+def insert_image_paragraphs(
+    doc: Document,
+    paragraphs: list[dict],
+    *,
+    idx: int,
+    parent,
+):
+    """Insert a sequence of image paragraphs into *doc* at position *idx*.
+
+    Each entry produces two Word paragraphs:
+      1. a centred picture (shrunk to the page text width when wider,
+         otherwise left at its own size);
+      2. a centred caption taken from the entry's ``alt`` text.
+
+    Entries are processed in reverse and every element is inserted at the
+    same *idx*, so the final order matches the order of *paragraphs*.
+    A picture that cannot be loaded is reported with ``print`` and replaced
+    by a placeholder text run; no exception is raised for it.
+    """
+    section = doc.sections[0]
+    page_text_width = section.page_width - section.left_margin - section.right_margin
+
+    for pd_data in reversed(paragraphs):
+        # Caption first: inserting the picture at the same idx afterwards
+        # leaves the caption below the picture.
+        cap_p = doc.add_paragraph(pd_data.get("alt", ""))
+        cap_p.alignment = WD_ALIGN_PARAGRAPH.CENTER
+        parent.insert(idx, cap_p._element)
+
+        img_p = doc.add_paragraph()
+        img_p.alignment = WD_ALIGN_PARAGRAPH.CENTER
+        img_run = img_p.add_run()
+        try:
+            img_width = _constrain_width(pd_data["src"], page_text_width)
+            picture = img_run.add_picture(pd_data["src"], width=img_width)
+            # When the header probe could not determine a width, the picture
+            # is added at its own size; cap it here, keeping the aspect ratio.
+            if picture.width > page_text_width:
+                picture.height = int(
+                    picture.height * page_text_width / picture.width
+                )
+                picture.width = page_text_width
+        except Exception as exc:
+            # Best effort: one bad image must not abort the whole document.
+            print(f"警告:图片加载失败 {pd_data['src']} — {exc}")
+            img_run.add_text(f"[图片加载失败: {pd_data['src']}]")
+        parent.insert(idx, img_p._element)