From 0b10e97e0c2623b45c1c0efdaad6c3f12f54b498 Mon Sep 17 00:00:00 2001 From: zzy <2450266535@qq.com> Date: Sat, 9 May 2026 16:17:51 +0800 Subject: [PATCH] =?UTF-8?q?feat(transit):=20=E6=94=AF=E6=8C=81Markdown?= =?UTF-8?q?=E5=9B=BE=E7=89=87=E8=BD=ACWord=E5=9B=BE=E7=89=87=E6=AE=B5?= =?UTF-8?q?=E8=90=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增 transit/images.py 模块处理图片解析和插入逻辑 - 实现 `` 标签解析为独立图片段落 - 添加图片尺寸检测和自适应缩放功能(超出页面宽度时自动缩放) - 支持图标题居中显示和图片居中对齐 - 优化 body_to_paragraphs 函数,添加图片处理逻辑 - 更新 README.md 使用说明,添加模板文件要求说明 BREAKING CHANGE: 图片处理方式变更,需要包含 sample.docx 模板文件 --- README.md | 3 +- transit/body.py | 79 ++++++++++++++---------- transit/images.py | 151 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 200 insertions(+), 33 deletions(-) create mode 100644 transit/images.py diff --git a/README.md b/README.md index 887931d..c0ca4a5 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ 使用方法 ```shell -py .\test.py .\毕业论文初稿.md +# 需要 sample.docx 文件 且 该文件有 {{xxx}} 模板引擎的内容 +python .\test.py .\毕业论文初稿.md ``` diff --git a/transit/body.py b/transit/body.py index e0c35a7..d372879 100644 --- a/transit/body.py +++ b/transit/body.py @@ -7,9 +7,12 @@ Markdown 正文 → Word 段落转换。 import re from copy import deepcopy + from docx import Document from docx.oxml.ns import qn +from .images import make_image_paragraph, is_figure_caption, insert_image_paragraphs + _PAT_HEADING = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE) @@ -33,17 +36,28 @@ def body_to_paragraphs( paragraphs: list[dict] = [] last_end = 0 + def _add_block(block: str) -> None: + block = block.strip() + if not block: + return + # 图片段落 + img = make_image_paragraph(block) + if img: + paragraphs.append(img) + return + # 跳过紧跟在图片后的重复图标题 + if paragraphs and paragraphs[-1].get("type") == "image" and is_figure_caption(block): + return + # 普通正文段落 + paragraphs.append({"text": block, "level": 0, "style": body_style}) + for m in _PAT_HEADING.finditer(md_text): # 标题前的普通文本 if m.start() > last_end: pre = md_text[last_end : m.start()].strip() if pre: for block in re.split(r"\n\s*\n", pre): - block = block.strip() - if block: - paragraphs.append( - {"text": block, "level": 0, "style": body_style} - ) + _add_block(block) level = len(m.group(1)) + level_offset heading_text = m.group(2).strip() @@ -56,11 +70,7 @@ def body_to_paragraphs( tail = md_text[last_end:].strip() if tail: for block in re.split(r"\n\s*\n", tail): - block = block.strip() - if block: - paragraphs.append( - {"text": block, "level": 0, "style": body_style} - ) + _add_block(block) return paragraphs @@ -94,31 +104,36 @@ def replace_placeholder( parent.remove(para._element) for pd_data in reversed(paragraphs): - new_p = doc.add_paragraph(pd_data["text"]) - style_name = pd_data["style"] + if pd_data.get("type") == "image": + insert_image_paragraphs( + doc, [pd_data], idx=idx, parent=parent + ) + else: + new_p = doc.add_paragraph(pd_data["text"]) + style_name = pd_data["style"] - # 尝试应用样式,逐步降级 - applied = _apply_style(new_p, doc, style_name) - if not applied and style_name.startswith("Heading"): - new_p.style = doc.styles["Normal"] - elif not applied: - if placeholder_style: - _apply_style(new_p, doc, placeholder_style) - if new_p.style.name == "Normal" and placeholder_style: - new_p.style = doc.styles[placeholder_style] + # 尝试应用样式,逐步降级 + applied = _apply_style(new_p, doc, style_name) + if not applied and style_name.startswith("Heading"): + new_p.style = doc.styles["Normal"] + elif not applied: + if placeholder_style: + _apply_style(new_p, doc, placeholder_style) + if new_p.style.name == "Normal" and placeholder_style: + new_p.style = doc.styles[placeholder_style] - # 继承原段落的编号属性(自动编号) - if numPr is not None: - new_pPr = new_p._element.find(qn("w:pPr")) - if new_pPr is None: - new_pPr = new_p._element.makeelement(qn("w:pPr"), {}) - new_p._element.insert(0, new_pPr) - existing = new_pPr.find(qn("w:numPr")) - if existing is not None: - new_pPr.remove(existing) - new_pPr.append(deepcopy(numPr)) + # 继承原段落的编号属性(自动编号) + if numPr is not None: + new_pPr = new_p._element.find(qn("w:pPr")) + if new_pPr is None: + new_pPr = new_p._element.makeelement(qn("w:pPr"), {}) + new_p._element.insert(0, new_pPr) + existing = new_pPr.find(qn("w:numPr")) + if existing is not None: + new_pPr.remove(existing) + new_pPr.append(deepcopy(numPr)) - parent.insert(idx, new_p._element) + parent.insert(idx, new_p._element) break if not placeholder_found: diff --git a/transit/images.py b/transit/images.py new file mode 100644 index 0000000..6c41a3b --- /dev/null +++ b/transit/images.py @@ -0,0 +1,151 @@ +""" +图片处理模块。 + +将 Markdown 中的 ```` 标签解析为独立段落, +并在 Word 文档中插入图片及居中图标题。 +""" + +import struct +import re + +from docx import Document +from docx.enum.text import WD_ALIGN_PARAGRAPH + +# 匹配 ... +_IMG_TAG = re.compile( + r']*?)src=["\']([^"\']+)["\']([^>]*?)>', re.IGNORECASE +) + +# 从标签属性块中提取名值对 +_ATTR = re.compile(r'(\w+)\s*=\s*["\']([^"\']*)["\']') + +# 匹配 **图X ...** 重复图标题(注入时跳过) +_FIG_CAPTION = re.compile( + r'^\*\*(?:图|Table|Figure|Fig\.?)\s*\d+.*\*\*$', re.IGNORECASE +) + + +def make_image_paragraph(block: str) -> dict | None: + """若 *block* 包含 ```` 标签,返回图片段落字典;否则返回 ``None``。""" + m = _IMG_TAG.search(block) + if not m: + return None + attrs = dict(_ATTR.findall(block)) + return { + "type": "image", + "src": attrs.get("src", m.group(2)), + "alt": attrs.get("alt", ""), + } + + +def is_figure_caption(block: str) -> bool: + """检查 *block* 是否为 ``**图X ...**`` 格式的重复图标题。""" + return bool(_FIG_CAPTION.match(block.strip())) + + +def _get_image_dimensions(image_path: str) -> tuple[int, int] | None: + """读取图片文件头返回 ``(width_px, height_px)``,不支持格式返回 ``None``。 + + 仅读取文件头,不依赖第三方库。支持 PNG / JPEG / GIF / BMP。 + """ + try: + with open(image_path, "rb") as f: + header = f.read(32) + except Exception: + return None + + # PNG: 8-byte signature, then IHDR chunk + if header[:8] == b"\x89PNG\r\n\x1a\n": + w, h = struct.unpack_from(">II", header, 16) + return w, h + + # JPEG: starts with FF D8, scan for SOF marker + if header[:2] == b"\xff\xd8": + pos = 2 + while pos < len(header): + if header[pos] != 0xFF: + return None + marker = header[pos + 1] + if marker in (0xC0, 0xC1, 0xC2): + h, w = struct.unpack_from(">HH", header, pos + 5) + return w, h + seg_len = struct.unpack_from(">H", header, pos + 2)[0] + pos += 2 + seg_len + return None + + # GIF: "GIF87a" or "GIF89a" + if header[:6] in (b"GIF87a", b"GIF89a"): + w, h = struct.unpack_from(" int | None: + """读取图片的原生宽度(EMU),失败返回 ``None``。 + + Word 默认以 72 DPI 渲染图片,1 px = 914400 / 72 = 12700 EMU。 + """ + dims = _get_image_dimensions(image_path) + if dims is None: + return None + w_px, _ = dims + return w_px * 12700 + + +def _constrain_width(image_path: str, page_text_width: int) -> int | None: + """返回图片宽度(EMU),超出页宽时缩至页宽。 + + Parameters + ---------- + image_path : str + 图片路径。 + page_text_width : int + 页面正文区宽度(EMU),来自 ``section.page_width - margins``。 + """ + native = _get_native_emu(image_path) + if native is None: + return None + return min(native, page_text_width) + + +def insert_image_paragraphs( + doc: Document, + paragraphs: list[dict], + *, + idx: int, + parent, +): + """在 *doc* 的指定位置插入图片段落序列。 + + 每条图片段落生成两个 Word 段落: + 1. 居中图片(超出页宽时自动缩放至页宽,否则保持原尺寸) + 2. 居中图标题(从 ``alt`` 提取) + + 插入顺序保持 ``paragraphs`` 的原有顺序。 + """ + section = doc.sections[0] + page_text_width = section.page_width - section.left_margin - section.right_margin + + for pd_data in reversed(paragraphs): + # 图标题(在 reversed 中先插入,最终位于图片下方) + cap_p = doc.add_paragraph(pd_data.get("alt", "")) + cap_p.alignment = WD_ALIGN_PARAGRAPH.CENTER + parent.insert(idx, cap_p._element) + + # 图片段落(max-width 行为:超出页宽时压缩,否则原尺寸) + img_p = doc.add_paragraph() + img_p.alignment = WD_ALIGN_PARAGRAPH.CENTER + img_run = img_p.add_run() + try: + img_width = _constrain_width(pd_data["src"], page_text_width) + img_run.add_picture(pd_data["src"], width=img_width) + except Exception as exc: + print(f"警告:图片加载失败 {pd_data['src']} — {exc}") + img_run.add_text(f"[图片加载失败: {pd_data['src']}]") + parent.insert(idx, img_p._element)